From 8b4000f13b303cc154136abc74c55670673e2a96 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Mon, 8 May 2017 17:13:54 +0000 Subject: Vendor import of lldb trunk r302418: https://llvm.org/svn/llvm-project/lldb/trunk@302418 --- include/lldb/API/SBAddress.h | 4 + include/lldb/API/SBInstruction.h | 2 + include/lldb/API/SBInstructionList.h | 9 + include/lldb/Core/Disassembler.h | 2 + include/lldb/Expression/Expression.h | 10 + include/lldb/Host/MainLoop.h | 7 + include/lldb/Host/common/UDPSocket.h | 4 +- include/lldb/Target/ThreadPlanCallFunction.h | 2 +- include/lldb/Target/ThreadPlanCallUserExpression.h | 3 + include/lldb/Utility/TaskPool.h | 108 +---------- .../multiline/TestMultilineExpressions.py | 28 +++ .../step_over_breakpoint/TestStepOverBreakpoint.py | 9 +- .../return-value/TestReturnValue.py | 50 +++-- .../tools/lldb-server/TestGdbRemoteHostInfo.py | 1 + scripts/interface/SBInstruction.i | 3 + scripts/interface/SBInstructionList.i | 3 + source/API/SBAddress.cpp | 6 + source/API/SBInstruction.cpp | 7 + source/API/SBInstructionList.cpp | 26 +++ source/API/SBProcess.cpp | 16 +- source/Core/Disassembler.cpp | 4 + source/Host/common/Editline.cpp | 2 +- source/Host/common/MainLoop.cpp | 206 ++++++++++----------- source/Host/common/UDPSocket.cpp | 82 ++++---- source/Plugins/ABI/SysV-arm64/ABISysV_arm64.cpp | 34 ++-- .../MacOSX-DYLD/DynamicLoaderMacOS.cpp | 17 +- .../Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp | 73 +++----- source/Target/ThreadPlanCallUserExpression.cpp | 11 ++ source/Utility/TaskPool.cpp | 23 +++ unittests/Host/CMakeLists.txt | 1 + unittests/Host/MainLoopTest.cpp | 120 ++++++++++++ unittests/Utility/TaskPoolTest.cpp | 31 +--- www/lldb-gdb.html | 31 ++++ 33 files changed, 566 insertions(+), 369 deletions(-) create mode 100644 unittests/Host/MainLoopTest.cpp diff --git a/include/lldb/API/SBAddress.h b/include/lldb/API/SBAddress.h index ddbe5a742786..9e697beffdd1 100644 --- a/include/lldb/API/SBAddress.h +++ b/include/lldb/API/SBAddress.h @@ -103,6 +103,8 @@ protected: const lldb_private::Address *operator->() const; + friend bool operator==(const SBAddress &lhs, const SBAddress &rhs); + lldb_private::Address *get(); lldb_private::Address &ref(); @@ -117,6 +119,8 @@ private: std::unique_ptr m_opaque_ap; }; +bool operator==(const SBAddress &lhs, const SBAddress &rhs); + } // namespace lldb #endif // LLDB_SBAddress_h_ diff --git a/include/lldb/API/SBInstruction.h b/include/lldb/API/SBInstruction.h index 0fc12eb61cba..23daf1c56637 100644 --- a/include/lldb/API/SBInstruction.h +++ b/include/lldb/API/SBInstruction.h @@ -53,6 +53,8 @@ public: bool HasDelaySlot(); + bool CanSetBreakpoint(); + void Print(FILE *out); bool GetDescription(lldb::SBStream &description); diff --git a/include/lldb/API/SBInstructionList.h b/include/lldb/API/SBInstructionList.h index 29baef5790eb..0323a3c80c05 100644 --- a/include/lldb/API/SBInstructionList.h +++ b/include/lldb/API/SBInstructionList.h @@ -32,6 +32,15 @@ public: lldb::SBInstruction GetInstructionAtIndex(uint32_t idx); + // ---------------------------------------------------------------------- + // Returns the number of instructions between the start and end address. + // If canSetBreakpoint is true then the count will be the number of + // instructions on which a breakpoint can be set. 
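+ // Both start and end must be addresses of instructions contained in
+ // this list; otherwise the returned count is not meaningful.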
+ // ---------------------------------------------------------------------- + size_t GetInstructionsCount(const SBAddress &start, + const SBAddress &end, + bool canSetBreakpoint = false); + void Clear(); void AppendInstruction(lldb::SBInstruction inst); diff --git a/include/lldb/Core/Disassembler.h b/include/lldb/Core/Disassembler.h index 929b668c092b..addc83ad5e9d 100644 --- a/include/lldb/Core/Disassembler.h +++ b/include/lldb/Core/Disassembler.h @@ -173,6 +173,8 @@ public: virtual bool HasDelaySlot(); + bool CanSetBreakpoint (); + virtual size_t Decode(const Disassembler &disassembler, const DataExtractor &data, lldb::offset_t data_offset) = 0; diff --git a/include/lldb/Expression/Expression.h b/include/lldb/Expression/Expression.h index f48a7992227d..860444e9c2c2 100644 --- a/include/lldb/Expression/Expression.h +++ b/include/lldb/Expression/Expression.h @@ -99,6 +99,16 @@ public: //------------------------------------------------------------------ lldb::addr_t StartAddress() { return m_jit_start_addr; } + //------------------------------------------------------------------ + /// Called to notify the expression that it is about to be executed. + //------------------------------------------------------------------ + virtual void WillStartExecuting() {} + + //------------------------------------------------------------------ + /// Called to notify the expression that its execution has finished. + //------------------------------------------------------------------ + virtual void DidFinishExecuting() {} + virtual ExpressionTypeSystemHelper *GetTypeSystemHelper() { return nullptr; } protected: diff --git a/include/lldb/Host/MainLoop.h b/include/lldb/Host/MainLoop.h index 79370bf8461f..f5d906e98a7b 100644 --- a/include/lldb/Host/MainLoop.h +++ b/include/lldb/Host/MainLoop.h @@ -42,6 +42,7 @@ private: public: typedef std::unique_ptr SignalHandleUP; + MainLoop(); ~MainLoop() override; ReadHandleUP RegisterReadObject(const lldb::IOObjectSP &object_sp, @@ -71,6 +72,9 @@ protected: void UnregisterSignal(int signo); private: + void ProcessReadObject(IOObject::WaitableHandle handle); + void ProcessSignal(int signo); + class SignalHandle { public: ~SignalHandle() { m_mainloop.UnregisterSignal(m_signo); } @@ -97,6 +101,9 @@ private: llvm::DenseMap m_read_fds; llvm::DenseMap m_signals; +#if HAVE_SYS_EVENT_H + int m_kqueue; +#endif bool m_terminate_request : 1; }; diff --git a/include/lldb/Host/common/UDPSocket.h b/include/lldb/Host/common/UDPSocket.h index 38524fa8f62b..977ce151e4ff 100644 --- a/include/lldb/Host/common/UDPSocket.h +++ b/include/lldb/Host/common/UDPSocket.h @@ -21,15 +21,13 @@ public: Socket *&socket); private: - UDPSocket(NativeSocket socket, const UDPSocket &listen_socket); + UDPSocket(NativeSocket socket); size_t Send(const void *buf, const size_t num_bytes) override; Error Connect(llvm::StringRef name) override; Error Listen(llvm::StringRef name, int backlog) override; Error Accept(Socket *&socket) override; - Error CreateSocket(); - SocketAddress m_sockaddr; }; } diff --git a/include/lldb/Target/ThreadPlanCallFunction.h b/include/lldb/Target/ThreadPlanCallFunction.h index 3d43491af9af..1c75b0a3645c 100644 --- a/include/lldb/Target/ThreadPlanCallFunction.h +++ b/include/lldb/Target/ThreadPlanCallFunction.h @@ -117,7 +117,7 @@ protected: lldb::addr_t &start_load_addr, lldb::addr_t &function_load_addr); - void DoTakedown(bool success); + virtual void DoTakedown(bool success); void SetBreakpoints(); diff --git a/include/lldb/Target/ThreadPlanCallUserExpression.h 
b/include/lldb/Target/ThreadPlanCallUserExpression.h index f1425b2f97e1..5fe80927ca21 100644 --- a/include/lldb/Target/ThreadPlanCallUserExpression.h +++ b/include/lldb/Target/ThreadPlanCallUserExpression.h @@ -35,6 +35,8 @@ public: void GetDescription(Stream *s, lldb::DescriptionLevel level) override; + void DidPush() override; + void WillPop() override; lldb::StopInfoSP GetRealStopInfo() override; @@ -48,6 +50,7 @@ public: } protected: + void DoTakedown(bool success) override; private: lldb::UserExpressionSP m_user_expression_sp; // This is currently just used to ensure the diff --git a/include/lldb/Utility/TaskPool.h b/include/lldb/Utility/TaskPool.h index fb936bbb739a..87b8824f9226 100644 --- a/include/lldb/Utility/TaskPool.h +++ b/include/lldb/Utility/TaskPool.h @@ -53,50 +53,6 @@ private: static void AddTaskImpl(std::function &&task_fn); }; -// Wrapper class around the global TaskPool implementation to make it possible -// to create a set of -// tasks and then wait for the tasks to be completed by the -// WaitForNextCompletedTask call. This -// class should be used when WaitForNextCompletedTask is needed because this -// class add no other -// extra functionality to the TaskPool class and it have a very minor -// performance overhead. -template // The return type of the tasks what will be added to this - // task runner - class TaskRunner { -public: - // Add a task to the task runner what will also add the task to the global - // TaskPool. The - // function doesn't return the std::future for the task because it will be - // supplied by the - // WaitForNextCompletedTask after the task is completed. - template void AddTask(F &&f, Args &&... args); - - // Wait for the next task in this task runner to finish and then return the - // std::future what - // belongs to the finished task. If there is no task in this task runner - // (neither pending nor - // comleted) then this function will return an invalid future. Usually this - // function should be - // called in a loop processing the results of the tasks until it returns an - // invalid std::future - // what means that all task in this task runner is completed. - std::future WaitForNextCompletedTask(); - - // Convenience method to wait for all task in this TaskRunner to finish. Do - // NOT use this class - // just because of this method. Use TaskPool instead and wait for each - // std::future returned by - // AddTask in a loop. - void WaitForAllTasks(); - -private: - std::list> m_ready; - std::list> m_pending; - std::mutex m_mutex; - std::condition_variable m_cv; -}; - template std::future::type> TaskPool::AddTask(F &&f, Args &&... args) { @@ -126,64 +82,10 @@ template <> struct TaskPool::RunTaskImpl<> { static void Run() {} }; -template -template -void TaskRunner::AddTask(F &&f, Args &&... args) { - std::unique_lock lock(m_mutex); - auto it = m_pending.emplace(m_pending.end()); - *it = std::move(TaskPool::AddTask( - [this, it](F f, Args... args) { - T &&r = f(std::forward(args)...); - - std::unique_lock lock(this->m_mutex); - this->m_ready.splice(this->m_ready.end(), this->m_pending, it); - lock.unlock(); - - this->m_cv.notify_one(); - return r; - }, - std::forward(f), std::forward(args)...)); -} - -template <> -template -void TaskRunner::AddTask(F &&f, Args &&... args) { - std::unique_lock lock(m_mutex); - auto it = m_pending.emplace(m_pending.end()); - *it = std::move(TaskPool::AddTask( - [this, it](F f, Args... 
args) { - f(std::forward(args)...); - - std::unique_lock lock(this->m_mutex); - this->m_ready.emplace_back(std::move(*it)); - this->m_pending.erase(it); - lock.unlock(); - - this->m_cv.notify_one(); - }, - std::forward(f), std::forward(args)...)); -} - -template std::future TaskRunner::WaitForNextCompletedTask() { - std::unique_lock lock(m_mutex); - if (m_ready.empty() && m_pending.empty()) - return std::future(); // No more tasks - - if (m_ready.empty()) - m_cv.wait(lock, [this]() { return !this->m_ready.empty(); }); - - std::future res = std::move(m_ready.front()); - m_ready.pop_front(); - - lock.unlock(); - res.wait(); - - return std::move(res); -} - -template void TaskRunner::WaitForAllTasks() { - while (WaitForNextCompletedTask().valid()) - ; -} +// Run 'func' on every value from begin .. end-1. Each worker will grab +// 'batch_size' numbers at a time to work on, so for very fast functions, batch +// should be large enough to avoid too much cache line contention. +void TaskMapOverInt(size_t begin, size_t end, + std::function const &func); #endif // #ifndef utility_TaskPool_h_ diff --git a/packages/Python/lldbsuite/test/expression_command/multiline/TestMultilineExpressions.py b/packages/Python/lldbsuite/test/expression_command/multiline/TestMultilineExpressions.py index b1b5cbe677c4..aa369ebeff87 100644 --- a/packages/Python/lldbsuite/test/expression_command/multiline/TestMultilineExpressions.py +++ b/packages/Python/lldbsuite/test/expression_command/multiline/TestMultilineExpressions.py @@ -12,6 +12,7 @@ from lldbsuite.test import lldbutil class MultilineExpressionsTestCase(TestBase): mydir = TestBase.compute_mydir(__file__) + NO_DEBUG_INFO_TESTCASE = True def setUp(self): # Call super's setUp(). @@ -60,3 +61,30 @@ class MultilineExpressionsTestCase(TestBase): child.expect_exact(prompt) self.expect(child.before, exe=False, patterns=['= 5']) + + @skipIfRemote + @expectedFailureAll( + oslist=["windows"], + bugnumber="llvm.org/pr22274: need a pexpect replacement for windows") + def test_empty_list(self): + """Test printing an empty list of expressions""" + import pexpect + prompt = "(lldb) " + + # So that the child gets torn down after the test + self.child = pexpect.spawn( + "%s %s" % + (lldbtest_config.lldbExec, self.lldbOption)) + child = self.child + + # Turn on logging for what the child sends back. + if self.TraceOn(): + child.logfile_read = sys.stdout + + # We expect a prompt, then send "print" to start a list of expressions, + # then an empty line. We expect a prompt back. 
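+ # ("print" with no arguments enters multiline expression mode; "1:" is
+ # the prompt for the first expression line.)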
+ child.expect_exact(prompt) + child.sendline("print") + child.expect_exact('1:') + child.sendline("") + child.expect_exact(prompt) diff --git a/packages/Python/lldbsuite/test/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py b/packages/Python/lldbsuite/test/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py index 00ddc628607c..4dfeae3f5e19 100644 --- a/packages/Python/lldbsuite/test/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py +++ b/packages/Python/lldbsuite/test/functionalities/breakpoint/step_over_breakpoint/TestStepOverBreakpoint.py @@ -62,12 +62,11 @@ class StepOverBreakpointsTestCase(TestBase): instructions = function.GetInstructions(self.target) addr_1 = self.breakpoint1.GetLocationAtIndex(0).GetAddress() addr_4 = self.breakpoint4.GetLocationAtIndex(0).GetAddress() - for i in range(instructions.GetSize()) : - addr = instructions.GetInstructionAtIndex(i).GetAddress() - if (addr == addr_1) : index_1 = i - if (addr == addr_4) : index_4 = i - steps_expected = index_4 - index_1 + # if third argument is true then the count will be the number of + # instructions on which a breakpoint can be set. + # start = addr_1, end = addr_4, canSetBreakpoint = True + steps_expected = instructions.GetInstructionsCount(addr_1, addr_4, True) step_count = 0 # Step from breakpoint_1 to breakpoint_4 while True: diff --git a/packages/Python/lldbsuite/test/functionalities/return-value/TestReturnValue.py b/packages/Python/lldbsuite/test/functionalities/return-value/TestReturnValue.py index 778c098a38ee..90562f52a4b2 100644 --- a/packages/Python/lldbsuite/test/functionalities/return-value/TestReturnValue.py +++ b/packages/Python/lldbsuite/test/functionalities/return-value/TestReturnValue.py @@ -171,17 +171,45 @@ class ReturnValueTestCase(TestBase): #self.return_and_test_struct_value ("return_one_int_one_double_packed") self.return_and_test_struct_value("return_one_int_one_long") - # icc and gcc don't support this extension. 
- if self.getCompiler().endswith('clang'): - self.return_and_test_struct_value("return_vector_size_float32_8") - self.return_and_test_struct_value("return_vector_size_float32_16") - self.return_and_test_struct_value("return_vector_size_float32_32") - self.return_and_test_struct_value( - "return_ext_vector_size_float32_2") - self.return_and_test_struct_value( - "return_ext_vector_size_float32_4") - self.return_and_test_struct_value( - "return_ext_vector_size_float32_8") + @expectedFailureAll(oslist=["freebsd"], archs=["i386"]) + @expectedFailureAll(oslist=["macosx"], archs=["i386"], bugnumber="") + @expectedFailureAll( + oslist=["linux"], + compiler="clang", + compiler_version=[ + "<=", + "3.6"], + archs=["i386"]) + @expectedFailureAll( + bugnumber="llvm.org/pr25785", + hostoslist=["windows"], + compiler="gcc", + archs=["i386"], + triple='.*-android') + @expectedFailureAll(compiler=["gcc"], archs=["x86_64", "i386"]) + @expectedFailureAll(oslist=["windows"], bugnumber="llvm.org/pr24778") + def test_vector_values(self): + self.build() + exe = os.path.join(os.getcwd(), "a.out") + error = lldb.SBError() + + self.target = self.dbg.CreateTarget(exe) + self.assertTrue(self.target, VALID_TARGET) + + main_bktp = self.target.BreakpointCreateByName("main", exe) + self.assertTrue(main_bktp, VALID_BREAKPOINT) + + self.process = self.target.LaunchSimple( + None, None, self.get_process_working_directory()) + self.assertEqual(len(lldbutil.get_threads_stopped_at_breakpoint( + self.process, main_bktp)), 1) + + self.return_and_test_struct_value("return_vector_size_float32_8") + self.return_and_test_struct_value("return_vector_size_float32_16") + self.return_and_test_struct_value("return_vector_size_float32_32") + self.return_and_test_struct_value("return_ext_vector_size_float32_2") + self.return_and_test_struct_value("return_ext_vector_size_float32_4") + self.return_and_test_struct_value("return_ext_vector_size_float32_8") def return_and_test_struct_value(self, func_name): """Pass in the name of the function to return from - takes in value, returns value.""" diff --git a/packages/Python/lldbsuite/test/tools/lldb-server/TestGdbRemoteHostInfo.py b/packages/Python/lldbsuite/test/tools/lldb-server/TestGdbRemoteHostInfo.py index 5089ee85773f..d84511d54273 100644 --- a/packages/Python/lldbsuite/test/tools/lldb-server/TestGdbRemoteHostInfo.py +++ b/packages/Python/lldbsuite/test/tools/lldb-server/TestGdbRemoteHostInfo.py @@ -14,6 +14,7 @@ class TestGdbRemoteHostInfo(GdbRemoteTestCaseBase): mydir = TestBase.compute_mydir(__file__) KNOWN_HOST_INFO_KEYS = set([ + "arch", "cputype", "cpusubtype", "distribution_id", diff --git a/scripts/interface/SBInstruction.i b/scripts/interface/SBInstruction.i index d5b60201e95e..c78799c6fe69 100644 --- a/scripts/interface/SBInstruction.i +++ b/scripts/interface/SBInstruction.i @@ -54,6 +54,9 @@ public: bool HasDelaySlot (); + bool + CanSetBreakpoint (); + void Print (FILE *out); diff --git a/scripts/interface/SBInstructionList.i b/scripts/interface/SBInstructionList.i index 32603be5cc1e..f4b572c341cd 100644 --- a/scripts/interface/SBInstructionList.i +++ b/scripts/interface/SBInstructionList.i @@ -44,6 +44,9 @@ public: lldb::SBInstruction GetInstructionAtIndex (uint32_t idx); + size_t GetInstructionsCount(const SBAddress &start, const SBAddress &end, + bool canSetBreakpoint); + void Clear (); diff --git a/source/API/SBAddress.cpp b/source/API/SBAddress.cpp index b452ce327ab7..a3493d7c743f 100644 --- a/source/API/SBAddress.cpp +++ b/source/API/SBAddress.cpp @@ -55,6 +55,12 @@ const 
SBAddress &SBAddress::operator=(const SBAddress &rhs) { return *this; } +bool lldb::operator==(const SBAddress &lhs, const SBAddress &rhs) { + if (lhs.IsValid() && rhs.IsValid()) + return lhs.ref() == rhs.ref(); + return false; +} + bool SBAddress::IsValid() const { return m_opaque_ap.get() != NULL && m_opaque_ap->IsValid(); } diff --git a/source/API/SBInstruction.cpp b/source/API/SBInstruction.cpp index c47307c733a8..8b7deb7011be 100644 --- a/source/API/SBInstruction.cpp +++ b/source/API/SBInstruction.cpp @@ -176,6 +176,13 @@ bool SBInstruction::HasDelaySlot() { return false; } +bool SBInstruction::CanSetBreakpoint () { + lldb::InstructionSP inst_sp(GetOpaque()); + if (inst_sp) + return inst_sp->CanSetBreakpoint(); + return false; +} + lldb::InstructionSP SBInstruction::GetOpaque() { if (m_opaque_sp) return m_opaque_sp->GetSP(); diff --git a/source/API/SBInstructionList.cpp b/source/API/SBInstructionList.cpp index 04c37f50c2d7..3edb9eae98c1 100644 --- a/source/API/SBInstructionList.cpp +++ b/source/API/SBInstructionList.cpp @@ -9,6 +9,7 @@ #include "lldb/API/SBInstructionList.h" #include "lldb/API/SBInstruction.h" +#include "lldb/API/SBAddress.h" #include "lldb/API/SBStream.h" #include "lldb/Core/Disassembler.h" #include "lldb/Core/Module.h" @@ -49,6 +50,31 @@ SBInstruction SBInstructionList::GetInstructionAtIndex(uint32_t idx) { return inst; } +size_t SBInstructionList::GetInstructionsCount(const SBAddress &start, + const SBAddress &end, + bool canSetBreakpoint) { + size_t num_instructions = GetSize(); + size_t i = 0; + SBAddress addr; + size_t lower_index = 0; + size_t upper_index = 0; + size_t instructions_to_skip = 0; + for (i = 0; i < num_instructions; ++i) { + addr = GetInstructionAtIndex(i).GetAddress(); + if (start == addr) + lower_index = i; + if (end == addr) + upper_index = i; + } + if (canSetBreakpoint) + for (i = lower_index; i <= upper_index; ++i) { + SBInstruction insn = GetInstructionAtIndex(i); + if (!insn.CanSetBreakpoint()) + ++instructions_to_skip; + } + return upper_index - lower_index - instructions_to_skip; +} + void SBInstructionList::Clear() { m_opaque_sp.reset(); } void SBInstructionList::AppendInstruction(SBInstruction insn) {} diff --git a/source/API/SBProcess.cpp b/source/API/SBProcess.cpp index 5614cb468a69..0348113a9873 100644 --- a/source/API/SBProcess.cpp +++ b/source/API/SBProcess.cpp @@ -1157,22 +1157,34 @@ uint32_t SBProcess::LoadImage(lldb::SBFileSpec &sb_remote_image_spec, uint32_t SBProcess::LoadImage(const lldb::SBFileSpec &sb_local_image_spec, const lldb::SBFileSpec &sb_remote_image_spec, lldb::SBError &sb_error) { + Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_API)); ProcessSP process_sp(GetSP()); if (process_sp) { Process::StopLocker stop_locker; if (stop_locker.TryLock(&process_sp->GetRunLock())) { + if (log) + log->Printf("SBProcess(%p)::LoadImage() => calling Platform::LoadImage" + "for: %s", + static_cast(process_sp.get()), + sb_local_image_spec.GetFilename()); + std::lock_guard guard( - process_sp->GetTarget().GetAPIMutex()); + process_sp->GetTarget().GetAPIMutex()); PlatformSP platform_sp = process_sp->GetTarget().GetPlatform(); return platform_sp->LoadImage(process_sp.get(), *sb_local_image_spec, *sb_remote_image_spec, sb_error.ref()); } else { - Log *log(lldb_private::GetLogIfAllCategoriesSet(LIBLLDB_LOG_API)); if (log) log->Printf("SBProcess(%p)::LoadImage() => error: process is running", static_cast(process_sp.get())); sb_error.SetErrorString("process is running"); } + } else { + if (log) + 
log->Printf("SBProcess(%p)::LoadImage() => error: called with invalid" + " process", + static_cast(process_sp.get())); + sb_error.SetErrorString("process is invalid"); } return LLDB_INVALID_IMAGE_TOKEN; } diff --git a/source/Core/Disassembler.cpp b/source/Core/Disassembler.cpp index 3880bfd16ecc..51d93d9acdbb 100644 --- a/source/Core/Disassembler.cpp +++ b/source/Core/Disassembler.cpp @@ -759,6 +759,10 @@ bool Instruction::DumpEmulation(const ArchSpec &arch) { return false; } +bool Instruction::CanSetBreakpoint () { + return !HasDelaySlot(); +} + bool Instruction::HasDelaySlot() { // Default is false. return false; diff --git a/source/Host/common/Editline.cpp b/source/Host/common/Editline.cpp index b157cdb7c110..851287e76331 100644 --- a/source/Host/common/Editline.cpp +++ b/source/Host/common/Editline.cpp @@ -367,7 +367,7 @@ void Editline::MoveCursor(CursorLocation from, CursorLocation to) { if (to == CursorLocation::EditingCursor) { toColumn = editline_cursor_position - (editline_cursor_row * m_terminal_width) + 1; - } else if (to == CursorLocation::BlockEnd) { + } else if (to == CursorLocation::BlockEnd && !m_input_lines.empty()) { toColumn = ((m_input_lines[m_input_lines.size() - 1].length() + GetPromptWidth()) % 80) + diff --git a/source/Host/common/MainLoop.cpp b/source/Host/common/MainLoop.cpp index 8a9d4f020d5f..abd52f7f46fb 100644 --- a/source/Host/common/MainLoop.cpp +++ b/source/Host/common/MainLoop.cpp @@ -18,6 +18,11 @@ #include #include +// Multiplexing is implemented using kqueue on systems that support it (BSD +// variants including OSX). On linux we use ppoll, while android uses pselect +// (ppoll is present but not implemented properly). On windows we use WSApoll +// (which does not support signals). + #if HAVE_SYS_EVENT_H #include #elif defined(LLVM_ON_WIN32) @@ -65,92 +70,72 @@ static void SignalHandler(int signo, siginfo_t *info, void *) { class MainLoop::RunImpl { public: - // TODO: Use llvm::Expected - static std::unique_ptr Create(MainLoop &loop, Error &error); - ~RunImpl(); + RunImpl(MainLoop &loop); + ~RunImpl() = default; Error Poll(); - - template void ForEachReadFD(F &&f); - template void ForEachSignal(F &&f); + void ProcessEvents(); private: MainLoop &loop; #if HAVE_SYS_EVENT_H - int queue_id; std::vector in_events; struct kevent out_events[4]; int num_events = -1; - RunImpl(MainLoop &loop, int queue_id) : loop(loop), queue_id(queue_id) { - in_events.reserve(loop.m_read_fds.size() + loop.m_signals.size()); - } #else - std::vector signals; #ifdef FORCE_PSELECT fd_set read_fd_set; #else std::vector read_fds; #endif - RunImpl(MainLoop &loop) : loop(loop) { - signals.reserve(loop.m_signals.size()); - } - sigset_t get_sigmask(); #endif }; #if HAVE_SYS_EVENT_H -MainLoop::RunImpl::~RunImpl() { - int r = close(queue_id); - assert(r == 0); - (void)r; -} -std::unique_ptr MainLoop::RunImpl::Create(MainLoop &loop, Error &error) -{ - error.Clear(); - int queue_id = kqueue(); - if(queue_id < 0) { - error = Error(errno, eErrorTypePOSIX); - return nullptr; - } - return std::unique_ptr(new RunImpl(loop, queue_id)); +MainLoop::RunImpl::RunImpl(MainLoop &loop) : loop(loop) { + in_events.reserve(loop.m_read_fds.size()); } Error MainLoop::RunImpl::Poll() { - in_events.resize(loop.m_read_fds.size() + loop.m_signals.size()); + in_events.resize(loop.m_read_fds.size()); unsigned i = 0; for (auto &fd : loop.m_read_fds) EV_SET(&in_events[i++], fd.first, EVFILT_READ, EV_ADD, 0, 0, 0); - for (const auto &sig : loop.m_signals) - EV_SET(&in_events[i++], sig.first, EVFILT_SIGNAL, EV_ADD, 0, 
0, 0); - - num_events = kevent(queue_id, in_events.data(), in_events.size(), out_events, - llvm::array_lengthof(out_events), nullptr); + num_events = kevent(loop.m_kqueue, in_events.data(), in_events.size(), + out_events, llvm::array_lengthof(out_events), nullptr); if (num_events < 0) return Error("kevent() failed with error %d\n", num_events); return Error(); } -template void MainLoop::RunImpl::ForEachReadFD(F &&f) { +void MainLoop::RunImpl::ProcessEvents() { assert(num_events >= 0); for (int i = 0; i < num_events; ++i) { - f(out_events[i].ident); if (loop.m_terminate_request) return; + switch (out_events[i].filter) { + case EVFILT_READ: + loop.ProcessReadObject(out_events[i].ident); + break; + case EVFILT_SIGNAL: + loop.ProcessSignal(out_events[i].ident); + break; + default: + llvm_unreachable("Unknown event"); + } } } -template void MainLoop::RunImpl::ForEachSignal(F && f) {} #else -MainLoop::RunImpl::~RunImpl() {} -std::unique_ptr MainLoop::RunImpl::Create(MainLoop &loop, Error &error) -{ - error.Clear(); - return std::unique_ptr(new RunImpl(loop)); +MainLoop::RunImpl::RunImpl(MainLoop &loop) : loop(loop) { +#ifndef FORCE_PSELECT + read_fds.reserve(loop.m_read_fds.size()); +#endif } sigset_t MainLoop::RunImpl::get_sigmask() { @@ -162,18 +147,14 @@ sigset_t MainLoop::RunImpl::get_sigmask() { assert(ret == 0); (void) ret; - for (const auto &sig : loop.m_signals) { - signals.push_back(sig.first); + for (const auto &sig : loop.m_signals) sigdelset(&sigmask, sig.first); - } return sigmask; #endif } #ifdef FORCE_PSELECT Error MainLoop::RunImpl::Poll() { - signals.clear(); - FD_ZERO(&read_fd_set); int nfds = 0; for (const auto &fd : loop.m_read_fds) { @@ -188,20 +169,8 @@ Error MainLoop::RunImpl::Poll() { return Error(); } - -template void MainLoop::RunImpl::ForEachReadFD(F &&f) { - for (const auto &fd : loop.m_read_fds) { - if(!FD_ISSET(fd.first, &read_fd_set)) - continue; - - f(fd.first); - if (loop.m_terminate_request) - return; - } -} #else Error MainLoop::RunImpl::Poll() { - signals.clear(); read_fds.clear(); sigset_t sigmask = get_sigmask(); @@ -220,33 +189,47 @@ Error MainLoop::RunImpl::Poll() { return Error(); } +#endif -template void MainLoop::RunImpl::ForEachReadFD(F &&f) { +void MainLoop::RunImpl::ProcessEvents() { +#ifdef FORCE_PSELECT + for (const auto &fd : loop.m_read_fds) { + if (!FD_ISSET(fd.first, &read_fd_set)) + continue; + IOObject::WaitableHandle handle = fd.first; +#else for (const auto &fd : read_fds) { if ((fd.revents & POLLIN) == 0) continue; - - f(fd.fd); + IOObject::WaitableHandle handle = fd.fd; +#endif if (loop.m_terminate_request) return; - } -} -#endif -template void MainLoop::RunImpl::ForEachSignal(F &&f) { - for (int sig : signals) { - if (g_signal_flags[sig] == 0) - continue; // No signal - g_signal_flags[sig] = 0; - f(sig); + loop.ProcessReadObject(handle); + } + for (const auto &entry : loop.m_signals) { if (loop.m_terminate_request) return; + if (g_signal_flags[entry.first] == 0) + continue; // No signal + g_signal_flags[entry.first] = 0; + loop.ProcessSignal(entry.first); } } #endif +MainLoop::MainLoop() { +#if HAVE_SYS_EVENT_H + m_kqueue = kqueue(); + assert(m_kqueue >= 0); +#endif +} MainLoop::~MainLoop() { +#if HAVE_SYS_EVENT_H + close(m_kqueue); +#endif assert(m_read_fds.size() == 0); assert(m_signals.size() == 0); } @@ -298,24 +281,30 @@ MainLoop::RegisterSignal(int signo, const Callback &callback, new_action.sa_flags = SA_SIGINFO; sigemptyset(&new_action.sa_mask); sigaddset(&new_action.sa_mask, signo); - sigset_t old_set; - if (int ret = 
pthread_sigmask(SIG_BLOCK, &new_action.sa_mask, &old_set)) { - error.SetErrorStringWithFormat("pthread_sigmask failed with error %d\n", - ret); - return nullptr; - } - info.was_blocked = sigismember(&old_set, signo); - if (sigaction(signo, &new_action, &info.old_action) == -1) { - error.SetErrorToErrno(); - if (!info.was_blocked) - pthread_sigmask(SIG_UNBLOCK, &new_action.sa_mask, nullptr); - return nullptr; - } + g_signal_flags[signo] = 0; + + // Even if using kqueue, the signal handler will still be invoked, so it's + // important to replace it with our "benign" handler. + int ret = sigaction(signo, &new_action, &info.old_action); + assert(ret == 0 && "sigaction failed"); +#if HAVE_SYS_EVENT_H + struct kevent ev; + EV_SET(&ev, signo, EVFILT_SIGNAL, EV_ADD, 0, 0, 0); + ret = kevent(m_kqueue, &ev, 1, nullptr, 0, nullptr); + assert(ret == 0); +#endif + + // If we're using kqueue, the signal needs to be unblocked in order to receive + // it. If using pselect/ppoll, we need to block it, and later unblock it as a + // part of the system call. + ret = pthread_sigmask(HAVE_SYS_EVENT_H ? SIG_UNBLOCK : SIG_BLOCK, + &new_action.sa_mask, &old_set); + assert(ret == 0 && "pthread_sigmask failed"); + info.was_blocked = sigismember(&old_set, signo); m_signals.insert({signo, info}); - g_signal_flags[signo] = 0; return SignalHandleUP(new SignalHandle(*this, signo)); #endif @@ -331,7 +320,6 @@ void MainLoop::UnregisterSignal(int signo) { #if SIGNAL_POLLING_UNSUPPORTED Error("Signal polling is not supported on this platform."); #else - // We undo the actions of RegisterSignal on a best-effort basis. auto it = m_signals.find(signo); assert(it != m_signals.end()); sigset_t set; sigemptyset(&set); sigaddset(&set, signo); - pthread_sigmask(it->second.was_blocked ? SIG_BLOCK : SIG_UNBLOCK, &set, - nullptr); + int ret = pthread_sigmask(it->second.was_blocked ? 
SIG_BLOCK : SIG_UNBLOCK, + &set, nullptr); + assert(ret == 0); + (void)ret; + +#if HAVE_SYS_EVENT_H + struct kevent ev; + EV_SET(&ev, signo, EVFILT_SIGNAL, EV_DELETE, 0, 0, 0); + ret = kevent(m_kqueue, &ev, 1, nullptr, 0, nullptr); + assert(ret == 0); +#endif m_signals.erase(it); #endif @@ -351,32 +348,31 @@ Error MainLoop::Run() { m_terminate_request = false; Error error; - auto impl = RunImpl::Create(*this, error); - if (!impl) - return error; + RunImpl impl(*this); // run until termination or until we run out of things to listen to while (!m_terminate_request && (!m_read_fds.empty() || !m_signals.empty())) { - error = impl->Poll(); + error = impl.Poll(); if (error.Fail()) return error; - impl->ForEachSignal([&](int sig) { - auto it = m_signals.find(sig); - if (it != m_signals.end()) - it->second.callback(*this); // Do the work - }); - if (m_terminate_request) - return Error(); + impl.ProcessEvents(); - impl->ForEachReadFD([&](int fd) { - auto it = m_read_fds.find(fd); - if (it != m_read_fds.end()) - it->second(*this); // Do the work - }); if (m_terminate_request) return Error(); } return Error(); } + +void MainLoop::ProcessSignal(int signo) { + auto it = m_signals.find(signo); + if (it != m_signals.end()) + it->second.callback(*this); // Do the work +} + +void MainLoop::ProcessReadObject(IOObject::WaitableHandle handle) { + auto it = m_read_fds.find(handle); + if (it != m_read_fds.end()) + it->second(*this); // Do the work +} diff --git a/source/Host/common/UDPSocket.cpp b/source/Host/common/UDPSocket.cpp index a32657aab0a6..ce8d90891b2b 100644 --- a/source/Host/common/UDPSocket.cpp +++ b/source/Host/common/UDPSocket.cpp @@ -28,31 +28,41 @@ const int kDomain = AF_INET; const int kType = SOCK_DGRAM; static const char *g_not_supported_error = "Not supported"; -} // namespace - -UDPSocket::UDPSocket(bool should_close, bool child_processes_inherit) - : Socket(ProtocolUdp, should_close, child_processes_inherit) {} +} -UDPSocket::UDPSocket(NativeSocket socket, const UDPSocket &listen_socket) - : Socket(ProtocolUdp, listen_socket.m_should_close_fd, - listen_socket.m_child_processes_inherit) { +UDPSocket::UDPSocket(NativeSocket socket) : Socket(ProtocolUdp, true, true) { m_socket = socket; } +UDPSocket::UDPSocket(bool should_close, bool child_processes_inherit) + : Socket(ProtocolUdp, should_close, child_processes_inherit) {} + size_t UDPSocket::Send(const void *buf, const size_t num_bytes) { return ::sendto(m_socket, static_cast(buf), num_bytes, 0, m_sockaddr, m_sockaddr.GetLength()); } Error UDPSocket::Connect(llvm::StringRef name) { + return Error("%s", g_not_supported_error); +} + +Error UDPSocket::Listen(llvm::StringRef name, int backlog) { + return Error("%s", g_not_supported_error); +} + +Error UDPSocket::Accept(Socket *&socket) { + return Error("%s", g_not_supported_error); +} + +Error UDPSocket::Connect(llvm::StringRef name, bool child_processes_inherit, + Socket *&socket) { + std::unique_ptr final_socket; + Log *log(lldb_private::GetLogIfAnyCategoriesSet(LIBLLDB_LOG_CONNECTION)); if (log) log->Printf("UDPSocket::%s (host/port = %s)", __FUNCTION__, name.data()); Error error; - if (error.Fail()) - return error; - std::string host_str; std::string port_str; int32_t port = INT32_MIN; @@ -84,11 +94,12 @@ Error UDPSocket::Connect(llvm::StringRef name) { for (struct addrinfo *service_info_ptr = service_info_list; service_info_ptr != nullptr; service_info_ptr = service_info_ptr->ai_next) { - m_socket = Socket::CreateSocket( + auto send_fd = CreateSocket( service_info_ptr->ai_family, 
service_info_ptr->ai_socktype, - service_info_ptr->ai_protocol, m_child_processes_inherit, error); + service_info_ptr->ai_protocol, child_processes_inherit, error); if (error.Success()) { - m_sockaddr = service_info_ptr; + final_socket.reset(new UDPSocket(send_fd)); + final_socket->m_sockaddr = service_info_ptr; break; } else continue; @@ -96,17 +107,16 @@ Error UDPSocket::Connect(llvm::StringRef name) { ::freeaddrinfo(service_info_list); - if (IsValid()) + if (!final_socket) return error; SocketAddress bind_addr; // Only bind to the loopback address if we are expecting a connection from // localhost to avoid any firewall issues. - const bool bind_addr_success = - (host_str == "127.0.0.1" || host_str == "localhost") - ? bind_addr.SetToLocalhost(kDomain, port) - : bind_addr.SetToAnyAddress(kDomain, port); + const bool bind_addr_success = (host_str == "127.0.0.1" || host_str == "localhost") + ? bind_addr.SetToLocalhost(kDomain, port) + : bind_addr.SetToAnyAddress(kDomain, port); if (!bind_addr_success) { error.SetErrorString("Failed to get hostspec to bind for"); @@ -115,37 +125,13 @@ Error UDPSocket::Connect(llvm::StringRef name) { bind_addr.SetPort(0); // Let the source port # be determined dynamically - err = ::bind(m_socket, bind_addr, bind_addr.GetLength()); + err = ::bind(final_socket->GetNativeSocket(), bind_addr, bind_addr.GetLength()); - error.Clear(); - return error; -} + struct sockaddr_in source_info; + socklen_t address_len = sizeof (struct sockaddr_in); + err = ::getsockname(final_socket->GetNativeSocket(), (struct sockaddr *) &source_info, &address_len); -Error UDPSocket::Listen(llvm::StringRef name, int backlog) { - return Error("%s", g_not_supported_error); -} - -Error UDPSocket::Accept(Socket *&socket) { - return Error("%s", g_not_supported_error); -} - -Error UDPSocket::CreateSocket() { - Error error; - if (IsValid()) - error = Close(); - if (error.Fail()) - return error; - m_socket = - Socket::CreateSocket(kDomain, kType, 0, m_child_processes_inherit, error); - return error; -} - -Error UDPSocket::Connect(llvm::StringRef name, bool child_processes_inherit, - Socket *&socket) { - std::unique_ptr final_socket( - new UDPSocket(true, child_processes_inherit)); - Error error = final_socket->Connect(name); - if (!error.Fail()) - socket = final_socket.release(); + socket = final_socket.release(); + error.Clear(); return error; } diff --git a/source/Plugins/ABI/SysV-arm64/ABISysV_arm64.cpp b/source/Plugins/ABI/SysV-arm64/ABISysV_arm64.cpp index 04df0065d7bc..65cbd271e979 100644 --- a/source/Plugins/ABI/SysV-arm64/ABISysV_arm64.cpp +++ b/source/Plugins/ABI/SysV-arm64/ABISysV_arm64.cpp @@ -2362,32 +2362,30 @@ ValueObjectSP ABISysV_arm64::GetReturnValueObjectImpl( if (success) return_valobj_sp = ValueObjectConstResult::Create( thread.GetStackFrameAtIndex(0).get(), value, ConstString("")); - } else if (type_flags & eTypeIsVector) { + } else if (type_flags & eTypeIsVector && byte_size <= 16) { if (byte_size > 0) { const RegisterInfo *v0_info = reg_ctx->GetRegisterInfoByName("v0", 0); if (v0_info) { - if (byte_size <= v0_info->byte_size) { - std::unique_ptr heap_data_ap( - new DataBufferHeap(byte_size, 0)); - const ByteOrder byte_order = exe_ctx.GetProcessRef().GetByteOrder(); - RegisterValue reg_value; - if (reg_ctx->ReadRegister(v0_info, reg_value)) { - Error error; - if (reg_value.GetAsMemoryData(v0_info, heap_data_ap->GetBytes(), - heap_data_ap->GetByteSize(), - byte_order, error)) { - DataExtractor data(DataBufferSP(heap_data_ap.release()), - byte_order, - 
exe_ctx.GetProcessRef().GetAddressByteSize()); - return_valobj_sp = ValueObjectConstResult::Create( - &thread, return_compiler_type, ConstString(""), data); - } + std::unique_ptr heap_data_ap( + new DataBufferHeap(byte_size, 0)); + const ByteOrder byte_order = exe_ctx.GetProcessRef().GetByteOrder(); + RegisterValue reg_value; + if (reg_ctx->ReadRegister(v0_info, reg_value)) { + Error error; + if (reg_value.GetAsMemoryData(v0_info, heap_data_ap->GetBytes(), + heap_data_ap->GetByteSize(), byte_order, + error)) { + DataExtractor data(DataBufferSP(heap_data_ap.release()), byte_order, + exe_ctx.GetProcessRef().GetAddressByteSize()); + return_valobj_sp = ValueObjectConstResult::Create( + &thread, return_compiler_type, ConstString(""), data); } } } } - } else if (type_flags & eTypeIsStructUnion || type_flags & eTypeIsClass) { + } else if (type_flags & eTypeIsStructUnion || type_flags & eTypeIsClass || + (type_flags & eTypeIsVector && byte_size > 16)) { DataExtractor data; uint32_t NGRN = 0; // Search ABI docs for NGRN diff --git a/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp b/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp index 20260ee5b5c3..c824653b2e93 100644 --- a/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp +++ b/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp @@ -434,24 +434,25 @@ Error DynamicLoaderMacOS::CanLoadImage() { // Default assumption is that it is OK to load images. // Only say that we cannot load images if we find the symbol in libdyld and it - // indicates that - // we cannot. + // indicates that we cannot. if (symbol_address != LLDB_INVALID_ADDRESS) { { int lock_held = m_process->ReadUnsignedIntegerFromMemory(symbol_address, 4, 0, error); if (lock_held != 0) { - error.SetErrorToGenericError(); + error.SetErrorString("dyld lock held - unsafe to load images."); } } } else { // If we were unable to find _dyld_global_lock_held in any modules, or it is - // not loaded into - // memory yet, we may be at process startup (sitting at _dyld_start) - so we - // should not allow - // dlopen calls. - error.SetErrorToGenericError(); + // not loaded into memory yet, we may be at process startup (sitting + // at _dyld_start) - so we should not allow dlopen calls. + // But if we found more than one module then we are clearly past _dyld_start + // so in that case we'll default to "it's safe". + if (num_modules <= 1) + error.SetErrorString("could not find the dyld library or " + "the dyld lock symbol"); } return error; } diff --git a/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp b/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp index 8c2fc3d3aa42..ad6af8dfebd5 100644 --- a/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp +++ b/source/Plugins/SymbolFile/DWARF/SymbolFileDWARF.cpp @@ -1946,7 +1946,9 @@ void SymbolFileDWARF::Index() { std::vector type_index(num_compile_units); std::vector namespace_index(num_compile_units); - std::vector clear_cu_dies(num_compile_units, false); + // std::vector might be implemented using bit test-and-set, so use + // uint8_t instead. 
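+ // (std::vector<bool> packs its elements into single bits, so the parallel
+ // extract tasks below could otherwise race while writing neighboring
+ // elements that share a byte.)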
+ std::vector clear_cu_dies(num_compile_units, false); auto parser_fn = [debug_info, &function_basename_index, &function_fullname_index, &function_method_index, &function_selector_index, &objc_class_selectors_index, @@ -1963,22 +1965,18 @@ void SymbolFileDWARF::Index() { return cu_idx; }; - auto extract_fn = [debug_info](uint32_t cu_idx) { + auto extract_fn = [debug_info, &clear_cu_dies](uint32_t cu_idx) { DWARFCompileUnit *dwarf_cu = debug_info->GetCompileUnitAtIndex(cu_idx); if (dwarf_cu) { // dwarf_cu->ExtractDIEsIfNeeded(false) will return zero if the // DIEs for a compile unit have already been parsed. - return std::make_pair(cu_idx, dwarf_cu->ExtractDIEsIfNeeded(false) > 1); + if (dwarf_cu->ExtractDIEsIfNeeded(false) > 1) + clear_cu_dies[cu_idx] = true; } - return std::make_pair(cu_idx, false); }; // Create a task runner that extracts dies for each DWARF compile unit in a // separate thread - TaskRunner> task_runner_extract; - for (uint32_t cu_idx = 0; cu_idx < num_compile_units; ++cu_idx) - task_runner_extract.AddTask(extract_fn, cu_idx); - //---------------------------------------------------------------------- // First figure out which compile units didn't have their DIEs already // parsed and remember this. If no DIEs were parsed prior to this index @@ -1988,48 +1986,37 @@ void SymbolFileDWARF::Index() { // a DIE in one compile unit refers to another and the indexes accesses // those DIEs. //---------------------------------------------------------------------- - while (true) { - auto f = task_runner_extract.WaitForNextCompletedTask(); - if (!f.valid()) - break; - unsigned cu_idx; - bool clear; - std::tie(cu_idx, clear) = f.get(); - clear_cu_dies[cu_idx] = clear; - } + TaskMapOverInt(0, num_compile_units, extract_fn); // Now create a task runner that can index each DWARF compile unit in a // separate // thread so we can index quickly. 
- TaskRunner task_runner; - for (uint32_t cu_idx = 0; cu_idx < num_compile_units; ++cu_idx) - task_runner.AddTask(parser_fn, cu_idx); + TaskMapOverInt(0, num_compile_units, parser_fn); - while (true) { - std::future f = task_runner.WaitForNextCompletedTask(); - if (!f.valid()) - break; - uint32_t cu_idx = f.get(); - - m_function_basename_index.Append(function_basename_index[cu_idx]); - m_function_fullname_index.Append(function_fullname_index[cu_idx]); - m_function_method_index.Append(function_method_index[cu_idx]); - m_function_selector_index.Append(function_selector_index[cu_idx]); - m_objc_class_selectors_index.Append(objc_class_selectors_index[cu_idx]); - m_global_index.Append(global_index[cu_idx]); - m_type_index.Append(type_index[cu_idx]); - m_namespace_index.Append(namespace_index[cu_idx]); - } + auto finalize_fn = [](NameToDIE &index, std::vector &srcs) { + for (auto &src : srcs) + index.Append(src); + index.Finalize(); + }; - TaskPool::RunTasks([&]() { m_function_basename_index.Finalize(); }, - [&]() { m_function_fullname_index.Finalize(); }, - [&]() { m_function_method_index.Finalize(); }, - [&]() { m_function_selector_index.Finalize(); }, - [&]() { m_objc_class_selectors_index.Finalize(); }, - [&]() { m_global_index.Finalize(); }, - [&]() { m_type_index.Finalize(); }, - [&]() { m_namespace_index.Finalize(); }); + TaskPool::RunTasks( + [&]() { + finalize_fn(m_function_basename_index, function_basename_index); + }, + [&]() { + finalize_fn(m_function_fullname_index, function_fullname_index); + }, + [&]() { finalize_fn(m_function_method_index, function_method_index); }, + [&]() { + finalize_fn(m_function_selector_index, function_selector_index); + }, + [&]() { + finalize_fn(m_objc_class_selectors_index, objc_class_selectors_index); + }, + [&]() { finalize_fn(m_global_index, global_index); }, + [&]() { finalize_fn(m_type_index, type_index); }, + [&]() { finalize_fn(m_namespace_index, namespace_index); }); //---------------------------------------------------------------------- // Keep memory down by clearing DIEs for any compile units if indexing diff --git a/source/Target/ThreadPlanCallUserExpression.cpp b/source/Target/ThreadPlanCallUserExpression.cpp index 679040d09a02..15cbd0baa9a6 100644 --- a/source/Target/ThreadPlanCallUserExpression.cpp +++ b/source/Target/ThreadPlanCallUserExpression.cpp @@ -60,6 +60,12 @@ void ThreadPlanCallUserExpression::GetDescription( ThreadPlanCallFunction::GetDescription(s, level); } +void ThreadPlanCallUserExpression::DidPush() { + ThreadPlanCallFunction::DidPush(); + if (m_user_expression_sp) + m_user_expression_sp->WillStartExecuting(); +} + void ThreadPlanCallUserExpression::WillPop() { ThreadPlanCallFunction::WillPop(); if (m_user_expression_sp) @@ -113,3 +119,8 @@ StopInfoSP ThreadPlanCallUserExpression::GetRealStopInfo() { return stop_info_sp; } + +void ThreadPlanCallUserExpression::DoTakedown(bool success) { + ThreadPlanCallFunction::DoTakedown(success); + m_user_expression_sp->DidFinishExecuting(); +} diff --git a/source/Utility/TaskPool.cpp b/source/Utility/TaskPool.cpp index 244e64fdb5fb..d8306dc7dc8f 100644 --- a/source/Utility/TaskPool.cpp +++ b/source/Utility/TaskPool.cpp @@ -73,3 +73,26 @@ void TaskPoolImpl::Worker(TaskPoolImpl *pool) { f(); } } + +void TaskMapOverInt(size_t begin, size_t end, + std::function const &func) { + std::atomic idx{begin}; + size_t num_workers = + std::min(end, std::thread::hardware_concurrency()); + + auto wrapper = [&idx, end, &func]() { + while (true) { + size_t i = idx.fetch_add(1); + if (i >= end) + break; 
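+ // The atomic fetch_add above hands each index in [begin, end) to exactly
+ // one worker, so no two workers ever run func on the same value.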
+ func(i); + } + }; + + std::vector> futures; + futures.reserve(num_workers); + for (size_t i = 0; i < num_workers; i++) + futures.push_back(TaskPool::AddTask(wrapper)); + for (size_t i = 0; i < num_workers; i++) + futures[i].wait(); +} diff --git a/unittests/Host/CMakeLists.txt b/unittests/Host/CMakeLists.txt index 3396f45da4f3..7b2ce3bbfde5 100644 --- a/unittests/Host/CMakeLists.txt +++ b/unittests/Host/CMakeLists.txt @@ -1,6 +1,7 @@ set (FILES FileSpecTest.cpp FileSystemTest.cpp + MainLoopTest.cpp SocketAddressTest.cpp SocketTest.cpp SymbolsTest.cpp diff --git a/unittests/Host/MainLoopTest.cpp b/unittests/Host/MainLoopTest.cpp new file mode 100644 index 000000000000..a2a673d38ca5 --- /dev/null +++ b/unittests/Host/MainLoopTest.cpp @@ -0,0 +1,120 @@ +//===-- MainLoopTest.cpp ----------------------------------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "lldb/Host/MainLoop.h" +#include "lldb/Host/common/TCPSocket.h" +#include "gtest/gtest.h" +#include + +using namespace lldb_private; + +namespace { +class MainLoopTest : public testing::Test { +public: + static void SetUpTestCase() { +#ifdef _MSC_VER + WSADATA data; + ASSERT_EQ(0, WSAStartup(MAKEWORD(2, 2), &data)); +#endif + } + + static void TearDownTestCase() { +#ifdef _MSC_VER + ASSERT_EQ(0, WSACleanup()); +#endif + } + + void SetUp() override { + bool child_processes_inherit = false; + Error error; + std::unique_ptr listen_socket_up( + new TCPSocket(true, child_processes_inherit)); + ASSERT_TRUE(error.Success()); + error = listen_socket_up->Listen("localhost:0", 5); + ASSERT_TRUE(error.Success()); + + Socket *accept_socket; + std::future accept_error = std::async(std::launch::async, [&] { + return listen_socket_up->Accept(accept_socket); + }); + + std::unique_ptr connect_socket_up( + new TCPSocket(true, child_processes_inherit)); + error = connect_socket_up->Connect( + llvm::formatv("localhost:{0}", listen_socket_up->GetLocalPortNumber()) + .str()); + ASSERT_TRUE(error.Success()); + ASSERT_TRUE(accept_error.get().Success()); + + callback_count = 0; + socketpair[0] = std::move(connect_socket_up); + socketpair[1].reset(accept_socket); + } + + void TearDown() override { + socketpair[0].reset(); + socketpair[1].reset(); + } + +protected: + MainLoop::Callback make_callback() { + return [&](MainLoopBase &loop) { + ++callback_count; + loop.RequestTermination(); + }; + } + std::shared_ptr socketpair[2]; + unsigned callback_count; +}; +} // namespace + +TEST_F(MainLoopTest, ReadObject) { + char X = 'X'; + size_t len = sizeof(X); + ASSERT_TRUE(socketpair[0]->Write(&X, len).Success()); + + MainLoop loop; + + Error error; + auto handle = loop.RegisterReadObject(socketpair[1], make_callback(), error); + ASSERT_TRUE(error.Success()); + ASSERT_TRUE(handle); + ASSERT_TRUE(loop.Run().Success()); + ASSERT_EQ(1u, callback_count); +} + +TEST_F(MainLoopTest, TerminatesImmediately) { + char X = 'X'; + size_t len = sizeof(X); + ASSERT_TRUE(socketpair[0]->Write(&X, len).Success()); + ASSERT_TRUE(socketpair[1]->Write(&X, len).Success()); + + MainLoop loop; + Error error; + auto handle0 = loop.RegisterReadObject(socketpair[0], make_callback(), error); + ASSERT_TRUE(error.Success()); + auto handle1 = loop.RegisterReadObject(socketpair[1], make_callback(), error); + ASSERT_TRUE(error.Success()); + + 
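// Both sockets have data waiting, but the first callback requests
+ // termination, so the loop exits after servicing only one of them.
+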
ASSERT_TRUE(loop.Run().Success()); + ASSERT_EQ(1u, callback_count); +} + +#ifdef LLVM_ON_UNIX +TEST_F(MainLoopTest, Signal) { + MainLoop loop; + Error error; + + auto handle = loop.RegisterSignal(SIGUSR1, make_callback(), error); + ASSERT_TRUE(error.Success()); + kill(getpid(), SIGUSR1); + ASSERT_TRUE(loop.Run().Success()); + ASSERT_EQ(1u, callback_count); +} +#endif diff --git a/unittests/Utility/TaskPoolTest.cpp b/unittests/Utility/TaskPoolTest.cpp index 172e32a9c6c0..e340a81b27db 100644 --- a/unittests/Utility/TaskPoolTest.cpp +++ b/unittests/Utility/TaskPoolTest.cpp @@ -30,25 +30,14 @@ TEST(TaskPoolTest, RunTasks) { ASSERT_EQ(17, r[3]); } -TEST(TaskPoolTest, TaskRunner) { - auto fn = [](int x) { return std::make_pair(x, x * x); }; - - TaskRunner> tr; - tr.AddTask(fn, 1); - tr.AddTask(fn, 2); - tr.AddTask(fn, 3); - tr.AddTask(fn, 4); - - int count = 0; - while (true) { - auto f = tr.WaitForNextCompletedTask(); - if (!f.valid()) - break; - - ++count; - std::pair v = f.get(); - ASSERT_EQ(v.first * v.first, v.second); - } - - ASSERT_EQ(4, count); +TEST(TaskPoolTest, TaskMap) { + int data[4]; + auto fn = [&data](int x) { data[x] = x * x; }; + + TaskMapOverInt(0, 4, fn); + + ASSERT_EQ(data[0], 0); + ASSERT_EQ(data[1], 1); + ASSERT_EQ(data[2], 4); + ASSERT_EQ(data[3], 9); } diff --git a/www/lldb-gdb.html b/www/lldb-gdb.html index 60cd718d5255..69179bd8c07c 100755 --- a/www/lldb-gdb.html +++ b/www/lldb-gdb.html @@ -772,6 +772,27 @@ + List the threads in your program. + + + (gdb) info threads
+ (lldb) thread list
+
+ Select thread 1 as the default thread for subsequent commands.
+ (gdb) thread 1
+ (lldb) thread select 1
+ (lldb) t 1
+
+ Show the stack backtrace for the current thread.
@@ -1250,6 +1271,16 @@ LLDB
+ Search command help for a keyword.
+ (gdb) apropos keyword
+ (lldb) apropos keyword
+ + + Echo text to the screen. -- cgit v1.2.3 From 6b3f41ed88e8e440e11a4fbf20b6600529f80049 Mon Sep 17 00:00:00 2001 From: Dimitry Andric Date: Tue, 16 May 2017 19:46:52 +0000 Subject: Vendor import of llvm trunk r303197: https://llvm.org/svn/llvm-project/llvm/trunk@303197 --- CREDITS.TXT | 2 +- cmake/config-ix.cmake | 10 - cmake/modules/AddSphinxTarget.cmake | 13 + docs/CMakeLists.txt | 2 +- docs/GettingStarted.rst | 4 +- docs/LangRef.rst | 348 +- docs/Lexicon.rst | 8 + docs/LibFuzzer.rst | 19 +- docs/ReleaseNotes.rst | 4 + include/llvm/ADT/APInt.h | 49 +- include/llvm/ADT/BitVector.h | 2 +- include/llvm/ADT/STLExtras.h | 14 +- include/llvm/ADT/StringExtras.h | 7 + include/llvm/Analysis/CallGraph.h | 10 - include/llvm/Analysis/ProfileSummaryInfo.h | 4 +- include/llvm/Analysis/ScalarEvolution.h | 21 +- include/llvm/Analysis/TargetLibraryInfo.def | 127 +- include/llvm/Analysis/TargetTransformInfo.h | 33 + include/llvm/Analysis/TargetTransformInfoImpl.h | 12 + include/llvm/Analysis/ValueTracking.h | 13 +- include/llvm/Bitcode/BitcodeReader.h | 5 +- include/llvm/CodeGen/ExpandReductions.h | 24 + include/llvm/CodeGen/GlobalISel/LegalizerInfo.h | 17 +- include/llvm/CodeGen/GlobalISel/Utils.h | 3 + include/llvm/CodeGen/ISDOpcodes.h | 21 + include/llvm/CodeGen/MachineCombinerPattern.h | 2 + include/llvm/CodeGen/Passes.h | 12 + include/llvm/CodeGen/SelectionDAG.h | 14 +- include/llvm/DebugInfo/CodeView/CVTypeVisitor.h | 4 + .../DebugInfo/CodeView/RandomAccessTypeVisitor.h | 103 + include/llvm/DebugInfo/CodeView/TypeDatabase.h | 23 +- .../llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h | 13 +- include/llvm/DebugInfo/CodeView/TypeDeserializer.h | 4 + include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h | 1 + include/llvm/DebugInfo/CodeView/TypeIndex.h | 62 + .../CodeView/TypeVisitorCallbackPipeline.h | 8 + .../llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h | 9 +- include/llvm/DebugInfo/DWARF/DWARFContext.h | 7 - include/llvm/DebugInfo/DWARF/DWARFDebugLine.h | 2 +- include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h | 7 +- include/llvm/DebugInfo/DWARF/DWARFRelocMap.h | 12 +- include/llvm/DebugInfo/DWARF/DWARFVerifier.h | 6 +- include/llvm/DebugInfo/PDB/Native/RawTypes.h | 7 - include/llvm/DebugInfo/PDB/Native/TpiStream.h | 4 +- .../llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h | 2 +- .../ExecutionEngine/Orc/CompileOnDemandLayer.h | 13 + .../ExecutionEngine/Orc/OrcRemoteTargetClient.h | 29 +- .../ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h | 4 + include/llvm/ExecutionEngine/RTDyldMemoryManager.h | 16 +- include/llvm/ExecutionEngine/RuntimeDyld.h | 3 +- include/llvm/IR/Attributes.h | 41 + include/llvm/IR/CallingConv.h | 4 + include/llvm/IR/Constants.h | 10 +- include/llvm/IR/DebugInfoMetadata.h | 37 +- include/llvm/IR/DebugLoc.h | 16 + include/llvm/IR/DerivedTypes.h | 42 +- include/llvm/IR/DiagnosticInfo.h | 23 +- include/llvm/IR/Function.h | 38 +- include/llvm/IR/GetElementPtrTypeIterator.h | 8 +- include/llvm/IR/GlobalAlias.h | 8 +- include/llvm/IR/GlobalIFunc.h | 8 +- include/llvm/IR/GlobalObject.h | 4 +- include/llvm/IR/GlobalValue.h | 20 +- include/llvm/IR/GlobalVariable.h | 63 +- include/llvm/IR/IRBuilder.h | 39 + include/llvm/IR/InstrTypes.h | 18 +- include/llvm/IR/Instruction.h | 6 + include/llvm/IR/Instructions.h | 149 +- include/llvm/IR/Intrinsics.td | 44 + include/llvm/IR/LLVMContext.h | 12 +- include/llvm/IR/LegacyPassManager.h | 3 + include/llvm/IR/Module.h | 100 +- include/llvm/IR/ModuleSummaryIndex.h | 59 +- include/llvm/IR/PassManager.h | 122 +- include/llvm/IR/PassManagerInternal.h | 11 +- 
include/llvm/IR/PatternMatch.h | 39 +- include/llvm/IR/ProfileSummary.h | 20 +- include/llvm/IR/Statepoint.h | 16 +- include/llvm/IR/SymbolTableListTraits.h | 10 +- include/llvm/IR/TrackingMDRef.h | 16 +- include/llvm/IR/Type.h | 45 +- include/llvm/IR/TypeFinder.h | 4 +- include/llvm/IR/Use.h | 28 +- include/llvm/IR/UseListOrder.h | 2 +- include/llvm/IR/User.h | 19 +- include/llvm/IR/Value.h | 23 +- include/llvm/IR/ValueHandle.h | 48 +- include/llvm/IR/ValueMap.h | 42 +- include/llvm/IR/ValueSymbolTable.h | 6 +- include/llvm/IR/Verifier.h | 15 +- include/llvm/InitializePasses.h | 5 +- include/llvm/LibDriver/LibDriver.h | 24 - include/llvm/LinkAllPasses.h | 1 + include/llvm/Object/Wasm.h | 7 +- include/llvm/ObjectYAML/WasmYAML.h | 23 +- include/llvm/ProfileData/SampleProfWriter.h | 11 +- include/llvm/Support/BinaryStreamArray.h | 5 +- include/llvm/Support/Compiler.h | 8 +- include/llvm/Support/KnownBits.h | 60 + include/llvm/Support/Parallel.h | 249 + include/llvm/Support/Wasm.h | 23 +- include/llvm/Target/Target.td | 5 + include/llvm/Target/TargetInstrInfo.h | 13 +- include/llvm/Target/TargetLowering.h | 24 +- include/llvm/Target/TargetSchedule.td | 2 +- include/llvm/Target/TargetSelectionDAG.td | 2 +- include/llvm/ToolDrivers/llvm-lib/LibDriver.h | 24 + include/llvm/Transforms/Utils/Cloning.h | 7 +- include/llvm/Transforms/Utils/LoopUtils.h | 32 + include/llvm/Transforms/Vectorize/SLPVectorizer.h | 4 +- include/llvm/module.modulemap | 1 + lib/Analysis/BasicAliasAnalysis.cpp | 9 +- lib/Analysis/BranchProbabilityInfo.cpp | 11 +- lib/Analysis/CallGraph.cpp | 34 +- lib/Analysis/ConstantFolding.cpp | 80 +- lib/Analysis/DemandedBits.cpp | 10 +- lib/Analysis/InlineCost.cpp | 2 +- lib/Analysis/InstructionSimplify.cpp | 167 +- lib/Analysis/ModuleSummaryAnalysis.cpp | 3 +- lib/Analysis/OptimizationDiagnosticInfo.cpp | 2 +- lib/Analysis/ProfileSummaryInfo.cpp | 13 +- lib/Analysis/ScalarEvolution.cpp | 159 +- lib/Analysis/TargetLibraryInfo.cpp | 112 +- lib/Analysis/TargetTransformInfo.cpp | 13 + lib/Analysis/ValueTracking.cpp | 276 +- lib/Analysis/VectorUtils.cpp | 1 + lib/AsmParser/LLParser.cpp | 18 +- lib/Bitcode/Reader/BitcodeReader.cpp | 17 +- lib/Bitcode/Reader/MetadataLoader.cpp | 2 +- lib/Bitcode/Writer/BitcodeWriter.cpp | 8 +- lib/Bitcode/Writer/ValueEnumerator.cpp | 7 +- lib/CMakeLists.txt | 2 +- lib/CodeGen/AsmPrinter/CodeViewDebug.cpp | 14 +- lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp | 3 +- lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp | 42 +- lib/CodeGen/AsmPrinter/DwarfCompileUnit.h | 22 + lib/CodeGen/AsmPrinter/DwarfDebug.cpp | 89 +- lib/CodeGen/AsmPrinter/DwarfDebug.h | 20 +- lib/CodeGen/AsmPrinter/DwarfFile.h | 4 + lib/CodeGen/AsmPrinter/DwarfUnit.cpp | 6 +- lib/CodeGen/AsmPrinter/DwarfUnit.h | 5 +- lib/CodeGen/AsmPrinter/WinException.cpp | 12 +- lib/CodeGen/AtomicExpandPass.cpp | 31 +- lib/CodeGen/CMakeLists.txt | 3 + lib/CodeGen/CodeGen.cpp | 4 +- lib/CodeGen/CodeGenPrepare.cpp | 548 +- lib/CodeGen/ExpandPostRAPseudos.cpp | 5 +- lib/CodeGen/ExpandReductions.cpp | 167 + lib/CodeGen/GlobalISel/LegalizerInfo.cpp | 10 +- lib/CodeGen/GlobalISel/RegBankSelect.cpp | 9 +- lib/CodeGen/GlobalISel/Utils.cpp | 8 + lib/CodeGen/IfConversion.cpp | 30 +- lib/CodeGen/LiveRangeShrink.cpp | 211 + lib/CodeGen/LiveVariables.cpp | 2 +- lib/CodeGen/MachineBlockPlacement.cpp | 30 +- lib/CodeGen/MachineVerifier.cpp | 4 +- lib/CodeGen/PHIElimination.cpp | 2 +- lib/CodeGen/RegisterCoalescer.cpp | 2 +- lib/CodeGen/RegisterScavenging.cpp | 7 +- lib/CodeGen/SafeStack.cpp | 172 +- 
lib/CodeGen/ScalarizeMaskedMemIntrin.cpp | 660 ++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 176 +- lib/CodeGen/SelectionDAG/FastISel.cpp | 20 +- lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp | 33 +- lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 3 +- lib/CodeGen/SelectionDAG/LegalizeTypes.h | 1 + lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp | 58 + lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 162 +- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp | 138 +- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h | 6 +- lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp | 13 + lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp | 74 +- lib/CodeGen/SelectionDAG/TargetLowering.cpp | 11 +- lib/CodeGen/ShrinkWrap.cpp | 12 +- lib/CodeGen/SjLjEHPrepare.cpp | 4 +- lib/CodeGen/TargetLoweringObjectFileImpl.cpp | 6 +- lib/CodeGen/TargetPassConfig.cpp | 11 + lib/CodeGen/TwoAddressInstructionPass.cpp | 7 +- lib/CodeGen/UnreachableBlockElim.cpp | 7 +- lib/DebugInfo/CodeView/CMakeLists.txt | 2 +- lib/DebugInfo/CodeView/CVTypeVisitor.cpp | 41 +- .../CodeView/ModuleDebugUnknownFragment.cpp | 10 - lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp | 91 + lib/DebugInfo/CodeView/TypeDatabase.cpp | 73 +- lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp | 65 +- lib/DebugInfo/CodeView/TypeDumpVisitor.cpp | 9 +- lib/DebugInfo/DWARF/DWARFContext.cpp | 25 +- lib/DebugInfo/DWARF/DWARFDebugAranges.cpp | 5 +- lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp | 4 +- lib/DebugInfo/DWARF/DWARFDie.cpp | 12 +- lib/DebugInfo/DWARF/DWARFTypeUnit.cpp | 6 +- lib/DebugInfo/DWARF/DWARFUnit.cpp | 16 +- lib/DebugInfo/DWARF/DWARFVerifier.cpp | 8 +- lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp | 2 +- lib/ExecutionEngine/Orc/OrcMCJITReplacement.h | 5 +- .../RuntimeDyld/RTDyldMemoryManager.cpp | 12 + lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp | 4 +- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp | 33 +- lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h | 2 - lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h | 2 +- .../RuntimeDyld/Targets/RuntimeDyldCOFFI386.h | 1 - .../RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h | 1 - .../RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h | 3 - lib/Fuzzer/FuzzerDriver.cpp | 3 +- lib/Fuzzer/FuzzerFlags.def | 8 +- lib/Fuzzer/FuzzerInternal.h | 1 + lib/Fuzzer/FuzzerLoop.cpp | 20 + lib/Fuzzer/FuzzerMutate.cpp | 5 +- lib/Fuzzer/afl/afl_driver.cpp | 57 +- lib/Fuzzer/test/AFLDriverTest.cpp | 8 +- lib/Fuzzer/test/CMakeLists.txt | 1 + lib/Fuzzer/test/OverwriteInputTest.cpp | 13 + lib/Fuzzer/test/afl-driver.test | 26 + lib/Fuzzer/test/overwrite-input.test | 2 + lib/IR/AsmWriter.cpp | 7 + lib/IR/AttributeImpl.h | 18 +- lib/IR/Attributes.cpp | 72 +- lib/IR/ConstantFold.cpp | 10 +- lib/IR/ConstantRange.cpp | 37 +- lib/IR/Constants.cpp | 15 +- lib/IR/ConstantsContext.h | 49 +- lib/IR/DebugInfoMetadata.cpp | 18 + lib/IR/DebugLoc.cpp | 114 + lib/IR/DiagnosticInfo.cpp | 25 +- lib/IR/Function.cpp | 79 +- lib/IR/Globals.cpp | 43 +- lib/IR/IRBuilder.cpp | 88 + lib/IR/Instruction.cpp | 24 + lib/IR/Instructions.cpp | 119 +- lib/IR/LegacyPassManager.cpp | 13 + lib/IR/Module.cpp | 35 +- lib/IR/Type.cpp | 71 +- lib/IR/Verifier.cpp | 13 +- lib/LLVMBuild.txt | 2 +- lib/LTO/LTO.cpp | 4 +- lib/LTO/LTOCodeGenerator.cpp | 18 +- lib/LTO/ThinLTOCodeGenerator.cpp | 3 +- lib/LibDriver/CMakeLists.txt | 8 - lib/LibDriver/LLVMBuild.txt | 22 - lib/LibDriver/LibDriver.cpp | 171 - lib/LibDriver/Options.td | 25 - lib/Linker/IRMover.cpp | 18 +- lib/MC/MCObjectStreamer.cpp | 5 + lib/MC/MCParser/AsmParser.cpp | 21 + lib/Object/COFFObjectFile.cpp | 4 +- 
lib/Object/WasmObjectFile.cpp | 41 +- lib/ObjectYAML/WasmYAML.cpp | 8 +- lib/ProfileData/SampleProfWriter.cpp | 42 +- lib/Support/APInt.cpp | 312 +- lib/Support/CMakeLists.txt | 1 + lib/Support/Parallel.cpp | 138 + lib/Support/Unix/Path.inc | 30 +- lib/Support/Unix/Process.inc | 4 +- lib/Target/AArch64/AArch64.td | 1 + lib/Target/AArch64/AArch64CallLowering.cpp | 2 +- lib/Target/AArch64/AArch64FastISel.cpp | 2 +- lib/Target/AArch64/AArch64ISelLowering.cpp | 6 +- lib/Target/AArch64/AArch64InstrInfo.cpp | 30 +- lib/Target/AArch64/AArch64InstrInfo.td | 8 +- lib/Target/AArch64/AArch64RegisterBankInfo.cpp | 27 +- lib/Target/AArch64/AArch64SchedFalkorDetails.td | 78 +- lib/Target/AArch64/AArch64SchedFalkorWriteRes.td | 37 +- lib/Target/AArch64/AArch64Subtarget.cpp | 8 + lib/Target/AArch64/AArch64Subtarget.h | 10 + lib/Target/AArch64/AArch64TargetObjectFile.cpp | 8 + lib/Target/AArch64/AArch64TargetObjectFile.h | 3 + lib/Target/AArch64/AArch64TargetTransformInfo.cpp | 106 +- lib/Target/AArch64/AArch64TargetTransformInfo.h | 11 + lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 19 +- .../AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp | 10 +- lib/Target/AMDGPU/AMDGPU.h | 4 + lib/Target/AMDGPU/AMDGPU.td | 21 +- lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp | 8 +- lib/Target/AMDGPU/AMDGPUISelLowering.cpp | 10 +- lib/Target/AMDGPU/AMDGPUISelLowering.h | 2 + lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp | 3 +- lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp | 1 + lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp | 2881 +++++++++ lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 7 +- lib/Target/AMDGPU/AMDGPUSubtarget.cpp | 3 + lib/Target/AMDGPU/AMDGPUSubtarget.h | 15 + lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 18 +- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp | 34 +- lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h | 3 + lib/Target/AMDGPU/CMakeLists.txt | 1 + lib/Target/AMDGPU/FLATInstructions.td | 34 +- lib/Target/AMDGPU/GCNRegPressure.cpp | 153 +- lib/Target/AMDGPU/GCNRegPressure.h | 61 +- lib/Target/AMDGPU/GCNSchedStrategy.cpp | 282 +- lib/Target/AMDGPU/GCNSchedStrategy.h | 24 +- lib/Target/AMDGPU/SIAnnotateControlFlow.cpp | 2 +- lib/Target/AMDGPU/SIISelLowering.cpp | 22 + lib/Target/AMDGPU/SIISelLowering.h | 1 + lib/Target/AMDGPU/SIInstrInfo.cpp | 310 +- lib/Target/AMDGPU/SIInstrInfo.h | 33 +- lib/Target/AMDGPU/SIInstructions.td | 7 + lib/Target/AMDGPU/VOP2Instructions.td | 51 +- lib/Target/AMDGPU/VOP3Instructions.td | 3 - lib/Target/ARM/ARMBaseInstrInfo.h | 18 +- lib/Target/ARM/ARMCallLowering.cpp | 2 +- lib/Target/ARM/ARMFastISel.cpp | 2 +- lib/Target/ARM/ARMISelLowering.cpp | 27 +- lib/Target/ARM/ARMISelLowering.h | 8 +- lib/Target/ARM/ARMInstrInfo.td | 7 +- lib/Target/ARM/ARMInstrThumb.td | 4 +- lib/Target/ARM/ARMInstructionSelector.cpp | 39 +- lib/Target/ARM/ARMLegalizerInfo.cpp | 8 +- lib/Target/ARM/ARMOptimizeBarriersPass.cpp | 4 +- lib/Target/ARM/ARMRegisterBankInfo.cpp | 1 + lib/Target/ARM/ARMTargetMachine.cpp | 2 + lib/Target/AVR/AVRFrameLowering.cpp | 2 +- lib/Target/AVR/AVRISelLowering.cpp | 10 +- lib/Target/AVR/AVRInstrInfo.td | 6 +- lib/Target/AVR/AVRRegisterInfo.cpp | 1 - lib/Target/BPF/BPFISelLowering.cpp | 3 +- lib/Target/BPF/BPFInstrInfo.td | 9 +- lib/Target/Hexagon/HexagonISelLowering.cpp | 3 +- lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp | 2 +- lib/Target/Hexagon/HexagonPatterns.td | 7 +- lib/Target/Hexagon/HexagonPseudo.td | 2 +- lib/Target/Lanai/LanaiISelLowering.cpp | 31 +- lib/Target/Lanai/LanaiISelLowering.h | 5 + lib/Target/Lanai/LanaiInstrInfo.td | 12 +- 
lib/Target/MSP430/MSP430FrameLowering.cpp | 7 +- lib/Target/MSP430/MSP430ISelLowering.cpp | 271 +- lib/Target/MSP430/MSP430InstrInfo.h | 6 + lib/Target/MSP430/MSP430InstrInfo.td | 9 +- lib/Target/MSP430/MSP430RegisterInfo.cpp | 4 +- lib/Target/Mips/MipsFastISel.cpp | 2 +- lib/Target/Mips/MipsISelLowering.cpp | 2 +- lib/Target/Mips/MipsInstrInfo.td | 6 +- lib/Target/Mips/MipsOptimizePICCall.cpp | 2 +- lib/Target/NVPTX/NVPTXISelLowering.cpp | 18 +- lib/Target/NVPTX/NVPTXInstrInfo.td | 9 +- .../PowerPC/Disassembler/PPCDisassembler.cpp | 17 + lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp | 3 +- lib/Target/PowerPC/PPCFastISel.cpp | 3 +- lib/Target/PowerPC/PPCISelDAGToDAG.cpp | 255 + lib/Target/PowerPC/PPCISelLowering.cpp | 98 +- lib/Target/PowerPC/PPCISelLowering.h | 29 +- lib/Target/PowerPC/PPCInstr64Bit.td | 28 +- lib/Target/PowerPC/PPCInstrAltivec.td | 40 +- lib/Target/PowerPC/PPCInstrInfo.td | 13 +- lib/Target/PowerPC/PPCInstrVSX.td | 2 +- lib/Target/PowerPC/PPCTLSDynamicCall.cpp | 3 +- lib/Target/Sparc/SparcISelLowering.cpp | 31 +- lib/Target/Sparc/SparcInstrInfo.td | 9 +- lib/Target/Sparc/SparcRegisterInfo.td | 6 +- lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp | 4 + .../SystemZ/Disassembler/SystemZDisassembler.cpp | 19 + .../SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp | 14 + lib/Target/SystemZ/README.txt | 2 +- lib/Target/SystemZ/SystemZFeatures.td | 14 +- lib/Target/SystemZ/SystemZISelLowering.cpp | 13 +- lib/Target/SystemZ/SystemZISelLowering.h | 2 + lib/Target/SystemZ/SystemZInstrFP.td | 13 + lib/Target/SystemZ/SystemZInstrFormats.td | 301 +- lib/Target/SystemZ/SystemZInstrInfo.td | 201 +- lib/Target/SystemZ/SystemZOperands.td | 2 + lib/Target/SystemZ/SystemZOperators.td | 3 +- lib/Target/SystemZ/SystemZSchedule.td | 4 + lib/Target/SystemZ/SystemZScheduleZ13.td | 84 +- lib/Target/SystemZ/SystemZScheduleZ196.td | 92 +- lib/Target/SystemZ/SystemZScheduleZEC12.td | 92 +- lib/Target/SystemZ/SystemZSubtarget.cpp | 7 +- lib/Target/SystemZ/SystemZSubtarget.h | 10 + lib/Target/WebAssembly/WebAssemblyInstrCall.td | 4 +- lib/Target/WebAssembly/WebAssemblyInstrInfo.td | 3 +- lib/Target/X86/X86.td | 3 + lib/Target/X86/X86FastISel.cpp | 48 +- lib/Target/X86/X86FixupLEAs.cpp | 269 +- lib/Target/X86/X86ISelDAGToDAG.cpp | 41 +- lib/Target/X86/X86ISelLowering.cpp | 214 +- lib/Target/X86/X86InstrCompiler.td | 14 +- lib/Target/X86/X86InstrInfo.cpp | 52 +- lib/Target/X86/X86InstrInfo.h | 11 +- lib/Target/X86/X86InstrInfo.td | 35 +- lib/Target/X86/X86InstrSSE.td | 18 +- lib/Target/X86/X86InstructionSelector.cpp | 214 +- lib/Target/X86/X86IntrinsicsInfo.h | 2 +- lib/Target/X86/X86LegalizerInfo.cpp | 16 +- lib/Target/X86/X86RegisterInfo.cpp | 14 +- lib/Target/X86/X86Subtarget.h | 6 + lib/Target/X86/X86TargetMachine.cpp | 4 +- lib/Target/X86/X86TargetTransformInfo.cpp | 194 +- lib/Target/X86/X86WinEHState.cpp | 2 +- lib/Target/XCore/XCoreISelLowering.cpp | 5 +- lib/Target/XCore/XCoreInstrInfo.td | 11 +- lib/ToolDrivers/CMakeLists.txt | 1 + lib/ToolDrivers/LLVMBuild.txt | 24 + lib/ToolDrivers/llvm-lib/CMakeLists.txt | 8 + lib/ToolDrivers/llvm-lib/LLVMBuild.txt | 22 + lib/ToolDrivers/llvm-lib/LibDriver.cpp | 171 + lib/ToolDrivers/llvm-lib/Options.td | 25 + lib/Transforms/Coroutines/CoroFrame.cpp | 100 +- lib/Transforms/IPO/FunctionImport.cpp | 15 +- lib/Transforms/IPO/Inliner.cpp | 4 +- lib/Transforms/IPO/PartialInlining.cpp | 426 +- lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 6 +- lib/Transforms/InstCombine/InstCombineAddSub.cpp | 199 +- lib/Transforms/InstCombine/InstCombineAndOrXor.cpp | 109 +- 
lib/Transforms/InstCombine/InstCombineCalls.cpp | 8 +- lib/Transforms/InstCombine/InstCombineCasts.cpp | 25 +- lib/Transforms/InstCombine/InstCombineCompares.cpp | 39 +- lib/Transforms/InstCombine/InstCombineInternal.h | 30 +- .../InstCombine/InstCombineLoadStoreAlloca.cpp | 6 +- .../InstCombine/InstCombineMulDivRem.cpp | 20 +- .../InstCombine/InstCombineSimplifyDemanded.cpp | 2 +- .../InstCombine/InstructionCombining.cpp | 26 +- .../Instrumentation/AddressSanitizer.cpp | 34 +- .../Instrumentation/DataFlowSanitizer.cpp | 8 +- .../Instrumentation/EfficiencySanitizer.cpp | 49 +- lib/Transforms/Instrumentation/MemorySanitizer.cpp | 7 +- .../Scalar/CorrelatedValuePropagation.cpp | 10 +- lib/Transforms/Scalar/LoopIdiomRecognize.cpp | 295 +- lib/Transforms/Scalar/NewGVN.cpp | 210 +- lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 161 +- lib/Transforms/Scalar/SpeculativeExecution.cpp | 43 +- lib/Transforms/Utils/BypassSlowDivision.cpp | 4 +- lib/Transforms/Utils/CloneFunction.cpp | 32 +- lib/Transforms/Utils/CloneModule.cpp | 2 +- lib/Transforms/Utils/EscapeEnumerator.cpp | 3 +- lib/Transforms/Utils/InlineFunction.cpp | 61 +- lib/Transforms/Utils/InstructionNamer.cpp | 13 +- lib/Transforms/Utils/Local.cpp | 106 +- lib/Transforms/Utils/LoopUtils.cpp | 201 + lib/Transforms/Utils/ModuleUtils.cpp | 12 +- lib/Transforms/Utils/SimplifyLibCalls.cpp | 6 +- lib/Transforms/Utils/VNCoercion.cpp | 9 + lib/Transforms/Utils/ValueMapper.cpp | 9 +- lib/Transforms/Vectorize/LoadStoreVectorizer.cpp | 2 +- lib/Transforms/Vectorize/LoopVectorize.cpp | 241 +- lib/Transforms/Vectorize/SLPVectorizer.cpp | 112 +- lib/XRay/Trace.cpp | 34 +- projects/CMakeLists.txt | 4 +- test/Analysis/BasicAA/cs-cs-arm.ll | 34 + test/Analysis/BasicAA/cs-cs.ll | 37 +- test/Analysis/BasicAA/intrinsics-arm.ll | 31 + test/Analysis/BasicAA/intrinsics.ll | 34 +- test/Analysis/BranchProbabilityInfo/basic.ll | 6 +- .../CostModel/AArch64/free-widening-casts.ll | 622 ++ test/Analysis/CostModel/AMDGPU/extractelement.ll | 74 +- test/Analysis/CostModel/AMDGPU/insertelement.ll | 43 +- test/Analysis/CostModel/AMDGPU/shufflevector.ll | 43 + test/Analysis/CostModel/X86/div.ll | 32 +- test/Analysis/CostModel/X86/vshift-ashr-cost.ll | 138 +- test/Analysis/CostModel/X86/vshift-lshr-cost.ll | 128 +- test/Analysis/CostModel/X86/vshift-shl-cost.ll | 134 +- .../ScalarEvolution/different-loops-recs.ll | 454 ++ test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll | 18 +- test/Assembler/globalvariable-attributes.ll | 19 + test/Bitcode/globalvariable-attributes.ll | 19 + test/Bitcode/ptest-old.ll | 1 + ...o-function-summary-callgraph-profile-summary.ll | 2 +- ...ion-summary-callgraph-sample-profile-summary.ll | 121 + .../AArch64/GlobalISel/arm64-regbankselect.mir | 96 + test/CodeGen/AArch64/GlobalISel/call-translator.ll | 4 +- test/CodeGen/AArch64/arm64-ccmp.ll | 2 +- test/CodeGen/AArch64/arm64-fml-combines.ll | 24 +- test/CodeGen/AArch64/arm64-hello.ll | 4 +- test/CodeGen/AArch64/arm64-misched-multimmo.ll | 2 +- test/CodeGen/AArch64/macho-global-symbols.ll | 17 + test/CodeGen/AArch64/misched-fusion-aes.ll | 33 + test/CodeGen/AArch64/stackmap-frame-setup.ll | 4 +- .../AMDGPU/GlobalISel/inst-select-load-flat.mir | 2 +- .../AMDGPU/GlobalISel/inst-select-store-flat.mir | 2 +- .../AMDGPU/GlobalISel/legalize-constant.mir | 20 + test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg | 2 + test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir | 70 +- test/CodeGen/AMDGPU/constant-fold-mi-operands.ll | 12 +- test/CodeGen/AMDGPU/ctpop.ll | 80 +- test/CodeGen/AMDGPU/ctpop64.ll | 16 +- 
test/CodeGen/AMDGPU/fneg-combines.ll | 9 +- test/CodeGen/AMDGPU/fneg.f16.ll | 39 +- test/CodeGen/AMDGPU/inserted-wait-states.mir | 10 +- test/CodeGen/AMDGPU/limit-coalesce.mir | 6 +- test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll | 18 +- test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll | 4 +- test/CodeGen/AMDGPU/madak.ll | 6 +- test/CodeGen/AMDGPU/promote-alloca-volatile.ll | 12 +- test/CodeGen/AMDGPU/v_madak_f16.ll | 2 +- test/CodeGen/AMDGPU/waitcnt.mir | 22 +- .../ARM/GlobalISel/arm-instruction-select.mir | 200 +- test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll | 16 +- test/CodeGen/ARM/GlobalISel/arm-legalizer.mir | 30 +- test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir | 136 +- test/CodeGen/ARM/divmod-eabi.ll | 73 +- test/CodeGen/ARM/divmod.ll | 1 + test/CodeGen/AVR/select-mbb-placement-bug.ll | 35 + .../Generic/expand-experimental-reductions.ll | 210 + test/CodeGen/Hexagon/regalloc-bad-undef.mir | 8 +- test/CodeGen/Lanai/masking_setccs.ll | 48 + test/CodeGen/Lanai/peephole-compare.mir | 4 +- .../ARM/PR32721_ifcvt_triangle_unanalyzable.mir | 24 + test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir | 64 + .../MIR/X86/frame-info-save-restore-points.mir | 2 +- test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll | 22 - test/CodeGen/MSP430/hwmult16.ll | 43 + test/CodeGen/MSP430/hwmult32.ll | 43 + test/CodeGen/MSP430/hwmultf5.ll | 43 + test/CodeGen/MSP430/jumptable.ll | 2 +- test/CodeGen/MSP430/libcalls.ll | 595 ++ test/CodeGen/MSP430/promote-i8-mul.ll | 22 + test/CodeGen/NVPTX/bug17709.ll | 52 +- test/CodeGen/NVPTX/ctlz.ll | 2 +- test/CodeGen/NVPTX/ctpop.ll | 2 +- test/CodeGen/NVPTX/cttz.ll | 3 +- test/CodeGen/NVPTX/f16-instructions.ll | 2157 +++---- test/CodeGen/NVPTX/f16x2-instructions.ll | 2853 ++++----- test/CodeGen/NVPTX/fma.ll | 84 +- test/CodeGen/NVPTX/i8-param.ll | 46 +- test/CodeGen/NVPTX/param-load-store.ll | 1878 +++--- test/CodeGen/NVPTX/sched1.ll | 4 +- test/CodeGen/NVPTX/sched2.ll | 4 +- test/CodeGen/NVPTX/simple-call.ll | 52 +- test/CodeGen/NVPTX/vec8.ll | 2 +- test/CodeGen/NVPTX/vector-call.ll | 60 +- test/CodeGen/NVPTX/zeroext-32bit.ll | 52 +- test/CodeGen/PowerPC/mtvsrdd.ll | 22 + test/CodeGen/PowerPC/setcc-logic.ll | 12 +- test/CodeGen/PowerPC/stackmap-frame-setup.ll | 4 +- test/CodeGen/PowerPC/tail-dup-layout.ll | 97 +- test/CodeGen/PowerPC/testComparesieqsc.ll | 138 + test/CodeGen/PowerPC/testComparesieqsi.ll | 138 + test/CodeGen/PowerPC/testComparesieqss.ll | 138 + test/CodeGen/PowerPC/testComparesiequc.ll | 138 + test/CodeGen/PowerPC/testComparesiequi.ll | 138 + test/CodeGen/PowerPC/testComparesiequs.ll | 138 + test/CodeGen/PowerPC/testCompareslleqsc.ll | 138 + test/CodeGen/PowerPC/testCompareslleqsi.ll | 138 + test/CodeGen/PowerPC/testCompareslleqss.ll | 137 + test/CodeGen/PowerPC/testComparesllequc.ll | 137 + test/CodeGen/PowerPC/testComparesllequi.ll | 137 + test/CodeGen/PowerPC/testComparesllequs.ll | 137 + test/CodeGen/SPARC/LeonItinerariesUT.ll | 4 +- test/CodeGen/SPARC/inlineasm-v9.ll | 30 + test/CodeGen/SPARC/inlineasm.ll | 18 + test/CodeGen/SystemZ/list-ilp-crash.ll | 23 + test/CodeGen/SystemZ/lower-copy-undef-src.mir | 14 + test/CodeGen/Thumb2/v8_IT_5.ll | 2 +- test/CodeGen/X86/2007-01-08-InstrSched.ll | 4 +- test/CodeGen/X86/2010-01-18-DbgValue.ll | 13 +- test/CodeGen/X86/2012-11-30-handlemove-dbg.ll | 51 - test/CodeGen/X86/2012-11-30-misched-dbg.ll | 142 - test/CodeGen/X86/2012-11-30-regpres-dbg.ll | 47 - test/CodeGen/X86/GlobalISel/add-scalar.ll | 44 + test/CodeGen/X86/GlobalISel/binop.ll | 42 - test/CodeGen/X86/GlobalISel/br.ll | 19 + test/CodeGen/X86/GlobalISel/cmp.ll | 
159 + test/CodeGen/X86/GlobalISel/ext-x86-64.ll | 14 +- test/CodeGen/X86/GlobalISel/ext.ll | 18 + test/CodeGen/X86/GlobalISel/legalize-cmp.mir | 179 + .../CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir | 64 + test/CodeGen/X86/GlobalISel/legalize-ext.mir | 64 + test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll | 101 + test/CodeGen/X86/GlobalISel/memop-scalar.ll | 146 + test/CodeGen/X86/GlobalISel/memop-vec.ll | 39 + test/CodeGen/X86/GlobalISel/memop-x32.ll | 101 - test/CodeGen/X86/GlobalISel/memop.ll | 206 - .../X86/GlobalISel/regbankselect-X86_64.mir | 125 +- test/CodeGen/X86/GlobalISel/select-br.mir | 39 + test/CodeGen/X86/GlobalISel/select-cmp.mir | 563 ++ test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir | 38 + test/CodeGen/X86/GlobalISel/select-ext.mir | 33 + .../X86/GlobalISel/select-memop-scalar-x32.mir | 310 + .../CodeGen/X86/GlobalISel/select-memop-scalar.mir | 500 ++ test/CodeGen/X86/GlobalISel/select-memop-v128.mir | 143 + test/CodeGen/X86/GlobalISel/select-memop-x32.mir | 310 - test/CodeGen/X86/GlobalISel/select-memop.mir | 637 -- test/CodeGen/X86/O0-pipeline.ll | 67 + test/CodeGen/X86/all-ones-vector.ll | 112 +- test/CodeGen/X86/avg.ll | 833 ++- test/CodeGen/X86/avx-basic.ll | 8 +- test/CodeGen/X86/avx-cvt-3.ll | 22 +- test/CodeGen/X86/avx-intrinsics-fast-isel.ll | 60 +- test/CodeGen/X86/avx-schedule.ll | 50 + test/CodeGen/X86/avx.ll | 2 +- test/CodeGen/X86/avx512-cmp-kor-sequence.ll | 6 +- test/CodeGen/X86/avx512-gather-scatter-intrin.ll | 10 +- test/CodeGen/X86/avx512-intrinsics-upgrade.ll | 44 +- test/CodeGen/X86/avx512-intrinsics.ll | 215 +- test/CodeGen/X86/avx512-mask-spills.ll | 40 +- test/CodeGen/X86/avx512-scalar_mask.ll | 107 + test/CodeGen/X86/avx512-vselect.ll | 61 + test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll | 12 +- test/CodeGen/X86/avx512bw-intrinsics.ll | 16 +- test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll | 24 +- test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll | 2 +- test/CodeGen/X86/avx512cdvl-intrinsics.ll | 2 +- test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll | 2 +- test/CodeGen/X86/avx512dq-intrinsics.ll | 4 +- test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll | 10 +- test/CodeGen/X86/avx512dqvl-intrinsics.ll | 4 +- test/CodeGen/X86/avx512er-intrinsics.ll | 48 +- test/CodeGen/X86/avx512ifma-intrinsics.ll | 8 +- test/CodeGen/X86/avx512ifmavl-intrinsics.ll | 16 +- test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll | 64 +- test/CodeGen/X86/avx512vl-intrinsics.ll | 28 +- test/CodeGen/X86/bmi.ll | 76 + test/CodeGen/X86/bswap_tree2.ll | 35 +- test/CodeGen/X86/cast-vsel.ll | 37 +- test/CodeGen/X86/combine-abs.ll | 11 +- test/CodeGen/X86/combine-shl.ll | 3 +- test/CodeGen/X86/combine-srl.ll | 22 +- test/CodeGen/X86/constructor.ll | 5 + test/CodeGen/X86/dbg-baseptr.ll | 62 +- test/CodeGen/X86/elf-associated.ll | 5 + test/CodeGen/X86/fold-tied-op.ll | 7 +- test/CodeGen/X86/fp128-i128.ll | 2 +- test/CodeGen/X86/haddsub-2.ll | 12 +- test/CodeGen/X86/known-signbits-vector.ll | 61 + test/CodeGen/X86/leaFixup32.mir | 508 ++ test/CodeGen/X86/leaFixup64.mir | 1041 +++ test/CodeGen/X86/lrshrink.ll | 57 + test/CodeGen/X86/madd.ll | 34 +- test/CodeGen/X86/masked_gather_scatter.ll | 2 +- test/CodeGen/X86/merge-consecutive-loads-128.ll | 16 +- test/CodeGen/X86/misched-matrix.ll | 4 +- test/CodeGen/X86/not-and-simplify.ll | 28 +- test/CodeGen/X86/oddshuffles.ll | 34 +- test/CodeGen/X86/packss.ll | 11 +- test/CodeGen/X86/pmul.ll | 55 +- test/CodeGen/X86/pr28129.ll | 32 +- test/CodeGen/X86/pr29112.ll | 8 +- test/CodeGen/X86/pr30562.ll | 1 + test/CodeGen/X86/pr31088.ll | 2 +- 
test/CodeGen/X86/pr32284.ll | 71 +- test/CodeGen/X86/pr32907.ll | 53 +- .../X86/replace_unsupported_masked_mem_intrin.ll | 37 + test/CodeGen/X86/rotate.ll | 16 +- test/CodeGen/X86/sad.ll | 929 ++- test/CodeGen/X86/select.ll | 28 +- test/CodeGen/X86/setcc-wide-types.ll | 56 +- test/CodeGen/X86/shrink_vmul_sse.ll | 2 +- test/CodeGen/X86/shuffle-of-splat-multiuses.ll | 34 +- test/CodeGen/X86/sse-intrinsics-fast-isel.ll | 10 +- test/CodeGen/X86/sse1.ll | 20 +- test/CodeGen/X86/sse3-avx-addsub-2.ll | 14 +- test/CodeGen/X86/sse41.ll | 8 +- test/CodeGen/X86/stackmap-frame-setup.ll | 4 +- test/CodeGen/X86/vec_int_to_fp.ll | 84 +- test/CodeGen/X86/vec_set-2.ll | 31 +- test/CodeGen/X86/vec_set-3.ll | 45 +- test/CodeGen/X86/vec_set-4.ll | 38 +- test/CodeGen/X86/vec_set-6.ll | 23 +- test/CodeGen/X86/vec_set-7.ll | 18 +- test/CodeGen/X86/vec_set-8.ll | 16 +- test/CodeGen/X86/vec_set-A.ll | 19 +- test/CodeGen/X86/vec_set-B.ll | 40 +- test/CodeGen/X86/vec_set-C.ll | 10 +- test/CodeGen/X86/vec_set.ll | 63 +- test/CodeGen/X86/vector-bitreverse.ll | 6 +- test/CodeGen/X86/vector-blend.ll | 4 +- test/CodeGen/X86/vector-lzcnt-128.ll | 380 +- test/CodeGen/X86/vector-lzcnt-256.ll | 536 +- test/CodeGen/X86/vector-narrow-binop.ll | 111 + test/CodeGen/X86/vector-pcmp.ll | 27 +- test/CodeGen/X86/vector-shift-ashr-256.ll | 580 ++ test/CodeGen/X86/vector-shift-lshr-256.ll | 434 ++ test/CodeGen/X86/vector-shift-shl-256.ll | 377 ++ test/CodeGen/X86/vector-shuffle-512-v32.ll | 356 +- test/CodeGen/X86/vector-sqrt.ll | 8 +- test/CodeGen/X86/viabs.ll | 107 +- test/CodeGen/X86/vselect-pcmp.ll | 12 +- test/CodeGen/X86/x86-interleaved-access.ll | 14 +- .../X86/x86-no_caller_saved_registers-preserve.ll | 26 +- test/CodeGen/X86/x86-no_caller_saved_registers.ll | 62 +- test/CodeGen/X86/x86-shrink-wrapping.ll | 53 +- test/CodeGen/X86/xop-intrinsics-fast-isel.ll | 8 +- test/DebugInfo/COFF/local-variables.ll | 5 - test/DebugInfo/COFF/no-cus.ll | 25 + test/DebugInfo/Inputs/typeunit-header.elf-x86-64 | Bin 0 -> 840 bytes test/DebugInfo/Inputs/typeunit-header.s | 49 + test/DebugInfo/PDB/DIA/pdbdump-symbol-format.test | 4 +- test/DebugInfo/X86/dbg-declare-inalloca.ll | 199 + .../X86/split-dwarf-cross-unit-reference.ll | 198 +- test/DebugInfo/typeunit-header.test | 15 + test/Feature/intrinsic-noduplicate.ll | 1 + test/Instrumentation/MemorySanitizer/msan_basic.ll | 64 - .../MemorySanitizer/msan_x86intrinsics.ll | 68 + test/Instrumentation/MemorySanitizer/pr32842.ll | 20 + .../MemorySanitizer/vector_arith.ll | 1 + test/Instrumentation/MemorySanitizer/vector_cmp.ll | 1 + test/Instrumentation/MemorySanitizer/vector_cvt.ll | 1 + .../Instrumentation/MemorySanitizer/vector_pack.ll | 1 + .../MemorySanitizer/vector_shift.ll | 1 + test/LTO/Resolution/X86/ifunc.ll | 15 + test/MC/AArch64/directive-cpu-err.s | 9 + test/MC/AArch64/label-arithmetic-diags-elf.s | 9 + test/MC/AMDGPU/flat.s | 66 - test/MC/AMDGPU/literal16.s | 8 +- test/MC/AMDGPU/vop2.s | 38 +- test/MC/AMDGPU/vop3-convert.s | 14 +- test/MC/AsmParser/altmacro_string_escape.s | 29 + test/MC/Disassembler/AMDGPU/flat_vi.txt | 24 - test/MC/Disassembler/AMDGPU/literal16_vi.txt | 6 +- test/MC/Disassembler/AMDGPU/vop2_vi.txt | 30 +- test/MC/Disassembler/AMDGPU/vop3_vi.txt | 18 + .../PowerPC/ppc64-encoding-p9vector.txt | 4 + test/MC/Disassembler/SystemZ/insns-z13.txt | 4068 ++++++------ test/MC/Disassembler/SystemZ/insns.txt | 6717 +++++++++++++------- test/MC/SystemZ/insn-bad-z13.s | 792 ++- test/MC/SystemZ/insn-bad-z196.s | 53 +- test/MC/SystemZ/insn-bad-zEC12.s | 511 +- 
test/MC/SystemZ/insn-bad.s | 2284 ++++++- test/MC/SystemZ/insn-good-z13.s | 1736 ++--- test/MC/SystemZ/insn-good-z196.s | 158 +- test/MC/SystemZ/insn-good-zEC12.s | 16 +- test/MC/SystemZ/insn-good.s | 2131 ++++++- test/Object/Inputs/COFF/empty-drectve.yaml | 14 + test/Object/X86/archive-symbol-table.s | 19 + test/Object/X86/nm-ir.ll | 2 +- test/Object/coff-empty-drectve.test | 3 + test/Object/invalid.test | 4 +- test/Object/wasm-invalid-start.test | 10 + test/ObjectYAML/wasm/export_section.yaml | 28 +- test/ObjectYAML/wasm/function_section.yaml | 4 +- test/ObjectYAML/wasm/import_section.yaml | 45 +- test/ObjectYAML/wasm/start_section.yaml | 9 + test/TableGen/AsmVariant.td | 2 +- test/TableGen/RegisterEncoder.td | 35 + .../CodeExtractor/ExtractedFnEntryCount.ll | 2 +- .../CodeExtractor/MultipleExitBranchProb.ll | 2 +- test/Transforms/CodeExtractor/PartialInlineAnd.ll | 4 +- .../CodeExtractor/PartialInlineEntryUpdate.ll | 41 + .../CodeExtractor/PartialInlineHighCost.ll | 107 + test/Transforms/CodeExtractor/PartialInlineOr.ll | 4 +- .../Transforms/CodeExtractor/PartialInlineOrAnd.ll | 4 +- test/Transforms/CodeExtractor/SingleCondition.ll | 4 +- .../CodeExtractor/X86/InheritTargetAttributes.ll | 4 +- .../Transforms/CodeGenPrepare/section-samplepgo.ll | 57 + test/Transforms/CodeGenPrepare/section.ll | 20 +- test/Transforms/ConstProp/calls-math-finite.ll | 83 + test/Transforms/ConstProp/calls.ll | 206 - test/Transforms/ConstProp/sse.ll | 208 + .../Coroutines/coro-eh-aware-edge-split.ll | 218 + .../GVN/PRE/2011-06-01-NonLocalMemdepMiscompile.ll | 7 +- test/Transforms/GVN/PRE/nonintegral.ll | 39 + .../IndVarSimplify/2011-10-27-lftrnull.ll | 2 +- test/Transforms/InferFunctionAttrs/annotate.ll | 126 + test/Transforms/InferFunctionAttrs/no-proto.ll | 126 + test/Transforms/Inline/inline-cold.ll | 20 +- .../inline-constexpr-addrspacecast-argument.ll | 2 +- test/Transforms/Inline/partial-inline-act.ll | 2 +- test/Transforms/Inline/prof-update.ll | 35 +- .../InstCombine/2012-04-23-Neon-Intrinsics.ll | 135 - .../AArch64/2012-04-23-Neon-Intrinsics.ll | 71 + test/Transforms/InstCombine/AArch64/lit.local.cfg | 2 + .../InstCombine/AMDGPU/amdgcn-intrinsics.ll | 1540 +++++ test/Transforms/InstCombine/AMDGPU/lit.local.cfg | 2 + .../InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll | 65 + .../InstCombine/ARM/constant-fold-hang.ll | 14 + test/Transforms/InstCombine/ARM/lit.local.cfg | 2 + test/Transforms/InstCombine/ARM/neon-intrinsics.ll | 25 + .../InstCombine/PowerPC/aligned-altivec.ll | 131 + test/Transforms/InstCombine/PowerPC/aligned-qpx.ll | 165 + test/Transforms/InstCombine/PowerPC/lit.local.cfg | 3 + .../InstCombine/PowerPC/vsx-unaligned.ll | 44 + .../InstCombine/X86/X86FsubCmpCombine.ll | 181 + test/Transforms/InstCombine/X86/blend_x86.ll | 151 + test/Transforms/InstCombine/X86/lit.local.cfg | 2 + test/Transforms/InstCombine/X86/pr2645-1.ll | 39 + .../InstCombine/X86/shufflemask-undef.ll | 110 + test/Transforms/InstCombine/X86/x86-avx2.ll | 109 + test/Transforms/InstCombine/X86/x86-avx512.ll | 2793 ++++++++ .../InstCombine/X86/x86-crc32-demanded.ll | 17 + test/Transforms/InstCombine/X86/x86-f16c.ll | 68 + test/Transforms/InstCombine/X86/x86-fma.ll | 315 + test/Transforms/InstCombine/X86/x86-insertps.ll | 166 + .../InstCombine/X86/x86-masked-memops.ll | 328 + test/Transforms/InstCombine/X86/x86-movmsk.ll | 324 + test/Transforms/InstCombine/X86/x86-muldq.ll | 245 + test/Transforms/InstCombine/X86/x86-pack.ll | 366 ++ test/Transforms/InstCombine/X86/x86-pshufb.ll | 515 ++ 
test/Transforms/InstCombine/X86/x86-sse.ll | 613 ++ test/Transforms/InstCombine/X86/x86-sse2.ll | 460 ++ test/Transforms/InstCombine/X86/x86-sse41.ll | 98 + test/Transforms/InstCombine/X86/x86-sse4a.ll | 408 ++ .../InstCombine/X86/x86-vec_demanded_elts.ll | 110 + .../InstCombine/X86/x86-vector-shifts.ll | 3434 ++++++++++ test/Transforms/InstCombine/X86/x86-vperm2.ll | 313 + test/Transforms/InstCombine/X86/x86-vpermil.ll | 298 + test/Transforms/InstCombine/X86/x86-xop.ll | 305 + test/Transforms/InstCombine/X86FsubCmpCombine.ll | 181 - test/Transforms/InstCombine/add.ll | 26 + test/Transforms/InstCombine/aligned-altivec.ll | 131 - test/Transforms/InstCombine/aligned-qpx.ll | 165 - test/Transforms/InstCombine/amdgcn-intrinsics.ll | 1540 ----- test/Transforms/InstCombine/and.ll | 2 +- test/Transforms/InstCombine/bit-tracking.ll | 26 - test/Transforms/InstCombine/blend_x86.ll | 151 - test/Transforms/InstCombine/cast.ll | 38 + test/Transforms/InstCombine/constant-fold-hang.ll | 14 - .../InstCombine/constant-fold-iteration.ll | 10 + test/Transforms/InstCombine/demorgan.ll | 8 +- test/Transforms/InstCombine/icmp.ll | 15 + test/Transforms/InstCombine/intrinsics.ll | 29 +- test/Transforms/InstCombine/logical-select.ll | 75 + test/Transforms/InstCombine/neon-intrinsics.ll | 25 - test/Transforms/InstCombine/not.ll | 76 +- test/Transforms/InstCombine/or-xor.ll | 70 + test/Transforms/InstCombine/or.ll | 109 - test/Transforms/InstCombine/pr2645-1.ll | 39 - test/Transforms/InstCombine/sext.ll | 2 +- test/Transforms/InstCombine/shufflemask-undef.ll | 109 - test/Transforms/InstCombine/trunc.ll | 2 +- test/Transforms/InstCombine/vec_demanded_elts.ll | 108 - test/Transforms/InstCombine/vsx-unaligned.ll | 44 - test/Transforms/InstCombine/x86-avx2.ll | 109 - test/Transforms/InstCombine/x86-avx512.ll | 2793 -------- test/Transforms/InstCombine/x86-crc32-demanded.ll | 17 - test/Transforms/InstCombine/x86-f16c.ll | 68 - test/Transforms/InstCombine/x86-fma.ll | 315 - test/Transforms/InstCombine/x86-insertps.ll | 166 - test/Transforms/InstCombine/x86-masked-memops.ll | 328 - test/Transforms/InstCombine/x86-movmsk.ll | 324 - test/Transforms/InstCombine/x86-muldq.ll | 245 - test/Transforms/InstCombine/x86-pack.ll | 366 -- test/Transforms/InstCombine/x86-pshufb.ll | 515 -- test/Transforms/InstCombine/x86-sse.ll | 613 -- test/Transforms/InstCombine/x86-sse2.ll | 460 -- test/Transforms/InstCombine/x86-sse41.ll | 98 - test/Transforms/InstCombine/x86-sse4a.ll | 408 -- test/Transforms/InstCombine/x86-vector-shifts.ll | 3434 ---------- test/Transforms/InstCombine/x86-vperm2.ll | 313 - test/Transforms/InstCombine/x86-vpermil.ll | 298 - test/Transforms/InstCombine/x86-xop.ll | 305 - test/Transforms/InstCombine/xor2.ll | 11 - test/Transforms/InstNamer/basic.ll | 19 + test/Transforms/InstSimplify/AndOrXor.ll | 173 + test/Transforms/InstSimplify/apint-or.ll | 72 - test/Transforms/InstSimplify/compare.ll | 7 +- test/Transforms/InstSimplify/or.ll | 181 + test/Transforms/LoopIdiom/ARM/ctlz.ll | 185 + test/Transforms/LoopIdiom/X86/ctlz.ll | 185 + test/Transforms/LoopUnroll/not-rotated.ll | 2 +- .../LoopVectorize/X86/svml-calls-finite.ll | 187 + test/Transforms/LoopVectorize/induction.ll | 45 + test/Transforms/LoopVectorize/pr32859.ll | 30 + test/Transforms/NewGVN/pr32934.ll | 69 + test/Transforms/NewGVN/pr32952.ll | 42 + test/Transforms/NewGVN/verify-memoryphi.ll | 29 + .../SLPVectorizer/AArch64/64-bit-vector.ll | 22 + .../SLPVectorizer/AArch64/getelementptr.ll | 43 +- .../Transforms/SLPVectorizer/AArch64/horizontal.ll | 33 +- 
test/Transforms/SLPVectorizer/AArch64/remarks.ll | 32 + test/Transforms/SLPVectorizer/X86/arith-add.ll | 649 ++ test/Transforms/SLPVectorizer/X86/arith-mul.ll | 700 ++ test/Transforms/SLPVectorizer/X86/arith-sub.ll | 649 ++ test/Transforms/SLPVectorizer/X86/shift-ashr.ll | 913 +++ test/Transforms/SLPVectorizer/X86/shift-lshr.ll | 862 +++ test/Transforms/SLPVectorizer/X86/shift-shl.ll | 814 +++ .../SimpleLoopUnswitch/trivial-unswitch.ll | 199 + test/Transforms/SpeculativeExecution/spec-other.ll | 32 - .../Transforms/SpeculativeExecution/spec-vector.ll | 73 - test/Transforms/Util/split-bit-piece.ll | 110 +- test/Verifier/metadata-function-dbg.ll | 16 +- test/tools/llvm-pdbdump/Inputs/FilterTest.cpp | 18 + test/tools/llvm-pdbdump/Inputs/FilterTest.pdb | Bin 44032 -> 44032 bytes test/tools/llvm-pdbdump/regex-filter.test | 8 +- test/tools/llvm-pdbdump/symbol-filters.test | 74 + test/tools/llvm-profdata/sample-profile-basic.test | 7 +- test/tools/llvm-readobj/wasm-invalid.test | 7 + tools/bugpoint/ExtractFunction.cpp | 3 +- tools/llc/llc.cpp | 2 + tools/lli/RemoteJITUtils.h | 5 +- tools/llvm-ar/llvm-ar.cpp | 2 +- tools/llvm-pdbdump/LLVMOutputStyle.cpp | 2 +- tools/llvm-pdbdump/PrettyCompilandDumper.cpp | 12 + tools/llvm-pdbdump/PrettyFunctionDumper.cpp | 10 +- tools/llvm-pdbdump/llvm-pdbdump.cpp | 102 +- tools/llvm-pdbdump/llvm-pdbdump.h | 23 + tools/llvm-readobj/COFFDumper.cpp | 8 + tools/llvm-readobj/llvm-readobj.cpp | 21 +- tools/llvm-rtdyld/llvm-rtdyld.cpp | 3 +- tools/obj2yaml/wasm2yaml.cpp | 63 +- tools/opt/opt.cpp | 4 +- tools/yaml2obj/yaml2wasm.cpp | 11 +- unittests/Analysis/ProfileSummaryInfoTest.cpp | 6 + unittests/Analysis/TargetLibraryInfoTest.cpp | 46 + unittests/DebugInfo/CMakeLists.txt | 2 +- unittests/DebugInfo/CodeView/CMakeLists.txt | 11 + unittests/DebugInfo/CodeView/ErrorChecking.h | 61 + .../DebugInfo/CodeView/RandomAccessVisitorTest.cpp | 353 + .../Orc/ObjectTransformLayerTest.cpp | 2 +- unittests/ExecutionEngine/Orc/OrcTestCommon.h | 2 +- .../Orc/RTDyldObjectLinkingLayerTest.cpp | 29 +- unittests/IR/ConstantRangeTest.cpp | 9 +- unittests/IR/InstructionsTest.cpp | 7 + unittests/IR/TypeBuilderTest.cpp | 30 +- unittests/Support/CMakeLists.txt | 1 + .../Support/DynamicLibrary/DynamicLibraryTest.cpp | 4 +- unittests/Support/ParallelTest.cpp | 53 + unittests/Support/Path.cpp | 2 +- unittests/Transforms/Utils/Cloning.cpp | 65 +- utils/TableGen/CodeGenInstruction.cpp | 1 + utils/TableGen/SubtargetEmitter.cpp | 2 +- utils/TableGen/X86RecognizableInstr.cpp | 125 +- utils/TableGen/X86RecognizableInstr.h | 122 + utils/git-svn/git-llvm | 33 +- utils/release/build_llvm_package.bat | 8 +- utils/vscode/README | 18 + utils/vscode/tablegen/.vscode/launch.json | 13 + utils/vscode/tablegen/CHANGELOG.md | 4 + utils/vscode/tablegen/README.md | 13 + utils/vscode/tablegen/language-configuration.json | 30 + utils/vscode/tablegen/package.json | 26 + utils/vscode/tablegen/syntaxes/TableGen.tmLanguage | 132 + utils/vscode/tablegen/vsc-extension-quickstart.md | 27 + 909 files changed, 68912 insertions(+), 33784 deletions(-) create mode 100644 include/llvm/CodeGen/ExpandReductions.h create mode 100644 include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h delete mode 100644 include/llvm/LibDriver/LibDriver.h create mode 100644 include/llvm/Support/Parallel.h create mode 100644 include/llvm/ToolDrivers/llvm-lib/LibDriver.h create mode 100644 lib/CodeGen/ExpandReductions.cpp create mode 100644 lib/CodeGen/LiveRangeShrink.cpp create mode 100644 lib/CodeGen/ScalarizeMaskedMemIntrin.cpp delete mode 
100644 lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp create mode 100644 lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp create mode 100644 lib/Fuzzer/test/OverwriteInputTest.cpp create mode 100644 lib/Fuzzer/test/afl-driver.test create mode 100644 lib/Fuzzer/test/overwrite-input.test delete mode 100644 lib/LibDriver/CMakeLists.txt delete mode 100644 lib/LibDriver/LLVMBuild.txt delete mode 100644 lib/LibDriver/LibDriver.cpp delete mode 100644 lib/LibDriver/Options.td create mode 100644 lib/Support/Parallel.cpp create mode 100644 lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp create mode 100644 lib/ToolDrivers/CMakeLists.txt create mode 100644 lib/ToolDrivers/LLVMBuild.txt create mode 100644 lib/ToolDrivers/llvm-lib/CMakeLists.txt create mode 100644 lib/ToolDrivers/llvm-lib/LLVMBuild.txt create mode 100644 lib/ToolDrivers/llvm-lib/LibDriver.cpp create mode 100644 lib/ToolDrivers/llvm-lib/Options.td create mode 100644 test/Analysis/BasicAA/cs-cs-arm.ll create mode 100644 test/Analysis/BasicAA/intrinsics-arm.ll create mode 100644 test/Analysis/CostModel/AArch64/free-widening-casts.ll create mode 100644 test/Analysis/CostModel/AMDGPU/shufflevector.ll create mode 100644 test/Analysis/ScalarEvolution/different-loops-recs.ll create mode 100644 test/Assembler/globalvariable-attributes.ll create mode 100644 test/Bitcode/globalvariable-attributes.ll create mode 100644 test/Bitcode/thinlto-function-summary-callgraph-sample-profile-summary.ll create mode 100644 test/CodeGen/AArch64/macho-global-symbols.ll create mode 100644 test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir create mode 100644 test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg create mode 100644 test/CodeGen/AVR/select-mbb-placement-bug.ll create mode 100644 test/CodeGen/Generic/expand-experimental-reductions.ll create mode 100644 test/CodeGen/Lanai/masking_setccs.ll create mode 100644 test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir create mode 100644 test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir delete mode 100644 test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll create mode 100644 test/CodeGen/MSP430/hwmult16.ll create mode 100644 test/CodeGen/MSP430/hwmult32.ll create mode 100644 test/CodeGen/MSP430/hwmultf5.ll create mode 100644 test/CodeGen/MSP430/libcalls.ll create mode 100644 test/CodeGen/MSP430/promote-i8-mul.ll create mode 100644 test/CodeGen/PowerPC/mtvsrdd.ll create mode 100644 test/CodeGen/PowerPC/testComparesieqsc.ll create mode 100644 test/CodeGen/PowerPC/testComparesieqsi.ll create mode 100644 test/CodeGen/PowerPC/testComparesieqss.ll create mode 100644 test/CodeGen/PowerPC/testComparesiequc.ll create mode 100644 test/CodeGen/PowerPC/testComparesiequi.ll create mode 100644 test/CodeGen/PowerPC/testComparesiequs.ll create mode 100644 test/CodeGen/PowerPC/testCompareslleqsc.ll create mode 100644 test/CodeGen/PowerPC/testCompareslleqsi.ll create mode 100644 test/CodeGen/PowerPC/testCompareslleqss.ll create mode 100644 test/CodeGen/PowerPC/testComparesllequc.ll create mode 100644 test/CodeGen/PowerPC/testComparesllequi.ll create mode 100644 test/CodeGen/PowerPC/testComparesllequs.ll create mode 100644 test/CodeGen/SPARC/inlineasm-v9.ll create mode 100644 test/CodeGen/SystemZ/list-ilp-crash.ll create mode 100644 test/CodeGen/SystemZ/lower-copy-undef-src.mir delete mode 100644 test/CodeGen/X86/2012-11-30-handlemove-dbg.ll delete mode 100644 test/CodeGen/X86/2012-11-30-misched-dbg.ll delete mode 100644 test/CodeGen/X86/2012-11-30-regpres-dbg.ll create mode 100644 test/CodeGen/X86/GlobalISel/add-scalar.ll 
create mode 100644 test/CodeGen/X86/GlobalISel/br.ll create mode 100644 test/CodeGen/X86/GlobalISel/cmp.ll create mode 100644 test/CodeGen/X86/GlobalISel/legalize-cmp.mir create mode 100644 test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll create mode 100644 test/CodeGen/X86/GlobalISel/memop-scalar.ll create mode 100644 test/CodeGen/X86/GlobalISel/memop-vec.ll delete mode 100644 test/CodeGen/X86/GlobalISel/memop-x32.ll delete mode 100644 test/CodeGen/X86/GlobalISel/memop.ll create mode 100644 test/CodeGen/X86/GlobalISel/select-br.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-cmp.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-memop-scalar.mir create mode 100644 test/CodeGen/X86/GlobalISel/select-memop-v128.mir delete mode 100644 test/CodeGen/X86/GlobalISel/select-memop-x32.mir delete mode 100644 test/CodeGen/X86/GlobalISel/select-memop.mir create mode 100644 test/CodeGen/X86/O0-pipeline.ll create mode 100644 test/CodeGen/X86/avx512-scalar_mask.ll create mode 100644 test/CodeGen/X86/avx512-vselect.ll create mode 100644 test/CodeGen/X86/leaFixup32.mir create mode 100644 test/CodeGen/X86/leaFixup64.mir create mode 100644 test/CodeGen/X86/lrshrink.ll create mode 100644 test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll create mode 100644 test/CodeGen/X86/vector-narrow-binop.ll create mode 100644 test/DebugInfo/COFF/no-cus.ll create mode 100644 test/DebugInfo/Inputs/typeunit-header.elf-x86-64 create mode 100644 test/DebugInfo/Inputs/typeunit-header.s create mode 100644 test/DebugInfo/X86/dbg-declare-inalloca.ll create mode 100644 test/DebugInfo/typeunit-header.test create mode 100644 test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll create mode 100644 test/Instrumentation/MemorySanitizer/pr32842.ll create mode 100644 test/LTO/Resolution/X86/ifunc.ll create mode 100644 test/MC/AArch64/directive-cpu-err.s create mode 100644 test/MC/AsmParser/altmacro_string_escape.s create mode 100644 test/MC/Disassembler/PowerPC/ppc64-encoding-p9vector.txt create mode 100644 test/Object/Inputs/COFF/empty-drectve.yaml create mode 100644 test/Object/X86/archive-symbol-table.s create mode 100644 test/Object/coff-empty-drectve.test create mode 100644 test/Object/wasm-invalid-start.test create mode 100644 test/TableGen/RegisterEncoder.td create mode 100644 test/Transforms/CodeExtractor/PartialInlineEntryUpdate.ll create mode 100644 test/Transforms/CodeExtractor/PartialInlineHighCost.ll create mode 100644 test/Transforms/CodeGenPrepare/section-samplepgo.ll create mode 100644 test/Transforms/ConstProp/calls-math-finite.ll create mode 100644 test/Transforms/ConstProp/sse.ll create mode 100644 test/Transforms/Coroutines/coro-eh-aware-edge-split.ll create mode 100644 test/Transforms/GVN/PRE/nonintegral.ll delete mode 100644 test/Transforms/InstCombine/2012-04-23-Neon-Intrinsics.ll create mode 100644 test/Transforms/InstCombine/AArch64/2012-04-23-Neon-Intrinsics.ll create mode 100644 test/Transforms/InstCombine/AArch64/lit.local.cfg create mode 100644 test/Transforms/InstCombine/AMDGPU/amdgcn-intrinsics.ll create mode 100644 test/Transforms/InstCombine/AMDGPU/lit.local.cfg create mode 100644 test/Transforms/InstCombine/ARM/2012-04-23-Neon-Intrinsics.ll create mode 100644 test/Transforms/InstCombine/ARM/constant-fold-hang.ll create mode 100644 test/Transforms/InstCombine/ARM/lit.local.cfg create mode 100644 test/Transforms/InstCombine/ARM/neon-intrinsics.ll create mode 100644 
test/Transforms/InstCombine/PowerPC/aligned-altivec.ll create mode 100644 test/Transforms/InstCombine/PowerPC/aligned-qpx.ll create mode 100644 test/Transforms/InstCombine/PowerPC/lit.local.cfg create mode 100644 test/Transforms/InstCombine/PowerPC/vsx-unaligned.ll create mode 100644 test/Transforms/InstCombine/X86/X86FsubCmpCombine.ll create mode 100644 test/Transforms/InstCombine/X86/blend_x86.ll create mode 100644 test/Transforms/InstCombine/X86/lit.local.cfg create mode 100644 test/Transforms/InstCombine/X86/pr2645-1.ll create mode 100644 test/Transforms/InstCombine/X86/shufflemask-undef.ll create mode 100644 test/Transforms/InstCombine/X86/x86-avx2.ll create mode 100644 test/Transforms/InstCombine/X86/x86-avx512.ll create mode 100644 test/Transforms/InstCombine/X86/x86-crc32-demanded.ll create mode 100644 test/Transforms/InstCombine/X86/x86-f16c.ll create mode 100644 test/Transforms/InstCombine/X86/x86-fma.ll create mode 100644 test/Transforms/InstCombine/X86/x86-insertps.ll create mode 100644 test/Transforms/InstCombine/X86/x86-masked-memops.ll create mode 100644 test/Transforms/InstCombine/X86/x86-movmsk.ll create mode 100644 test/Transforms/InstCombine/X86/x86-muldq.ll create mode 100644 test/Transforms/InstCombine/X86/x86-pack.ll create mode 100644 test/Transforms/InstCombine/X86/x86-pshufb.ll create mode 100644 test/Transforms/InstCombine/X86/x86-sse.ll create mode 100644 test/Transforms/InstCombine/X86/x86-sse2.ll create mode 100644 test/Transforms/InstCombine/X86/x86-sse41.ll create mode 100644 test/Transforms/InstCombine/X86/x86-sse4a.ll create mode 100644 test/Transforms/InstCombine/X86/x86-vec_demanded_elts.ll create mode 100644 test/Transforms/InstCombine/X86/x86-vector-shifts.ll create mode 100644 test/Transforms/InstCombine/X86/x86-vperm2.ll create mode 100644 test/Transforms/InstCombine/X86/x86-vpermil.ll create mode 100644 test/Transforms/InstCombine/X86/x86-xop.ll delete mode 100644 test/Transforms/InstCombine/X86FsubCmpCombine.ll delete mode 100644 test/Transforms/InstCombine/aligned-altivec.ll delete mode 100644 test/Transforms/InstCombine/aligned-qpx.ll delete mode 100644 test/Transforms/InstCombine/amdgcn-intrinsics.ll delete mode 100644 test/Transforms/InstCombine/bit-tracking.ll delete mode 100644 test/Transforms/InstCombine/blend_x86.ll delete mode 100644 test/Transforms/InstCombine/constant-fold-hang.ll create mode 100644 test/Transforms/InstCombine/constant-fold-iteration.ll delete mode 100644 test/Transforms/InstCombine/neon-intrinsics.ll delete mode 100644 test/Transforms/InstCombine/pr2645-1.ll delete mode 100644 test/Transforms/InstCombine/shufflemask-undef.ll delete mode 100644 test/Transforms/InstCombine/vsx-unaligned.ll delete mode 100644 test/Transforms/InstCombine/x86-avx2.ll delete mode 100644 test/Transforms/InstCombine/x86-avx512.ll delete mode 100644 test/Transforms/InstCombine/x86-crc32-demanded.ll delete mode 100644 test/Transforms/InstCombine/x86-f16c.ll delete mode 100644 test/Transforms/InstCombine/x86-fma.ll delete mode 100644 test/Transforms/InstCombine/x86-insertps.ll delete mode 100644 test/Transforms/InstCombine/x86-masked-memops.ll delete mode 100644 test/Transforms/InstCombine/x86-movmsk.ll delete mode 100644 test/Transforms/InstCombine/x86-muldq.ll delete mode 100644 test/Transforms/InstCombine/x86-pack.ll delete mode 100644 test/Transforms/InstCombine/x86-pshufb.ll delete mode 100644 test/Transforms/InstCombine/x86-sse.ll delete mode 100644 test/Transforms/InstCombine/x86-sse2.ll delete mode 100644 
test/Transforms/InstCombine/x86-sse41.ll delete mode 100644 test/Transforms/InstCombine/x86-sse4a.ll delete mode 100644 test/Transforms/InstCombine/x86-vector-shifts.ll delete mode 100644 test/Transforms/InstCombine/x86-vperm2.ll delete mode 100644 test/Transforms/InstCombine/x86-vpermil.ll delete mode 100644 test/Transforms/InstCombine/x86-xop.ll create mode 100644 test/Transforms/InstNamer/basic.ll delete mode 100644 test/Transforms/InstSimplify/apint-or.ll create mode 100644 test/Transforms/InstSimplify/or.ll create mode 100644 test/Transforms/LoopIdiom/ARM/ctlz.ll create mode 100644 test/Transforms/LoopIdiom/X86/ctlz.ll create mode 100644 test/Transforms/LoopVectorize/X86/svml-calls-finite.ll create mode 100644 test/Transforms/LoopVectorize/pr32859.ll create mode 100644 test/Transforms/NewGVN/pr32934.ll create mode 100644 test/Transforms/NewGVN/pr32952.ll create mode 100644 test/Transforms/NewGVN/verify-memoryphi.ll create mode 100644 test/Transforms/SLPVectorizer/AArch64/64-bit-vector.ll create mode 100644 test/Transforms/SLPVectorizer/AArch64/remarks.ll create mode 100644 test/Transforms/SLPVectorizer/X86/arith-add.ll create mode 100644 test/Transforms/SLPVectorizer/X86/arith-mul.ll create mode 100644 test/Transforms/SLPVectorizer/X86/arith-sub.ll create mode 100644 test/Transforms/SLPVectorizer/X86/shift-ashr.ll create mode 100644 test/Transforms/SLPVectorizer/X86/shift-lshr.ll create mode 100644 test/Transforms/SLPVectorizer/X86/shift-shl.ll delete mode 100644 test/Transforms/SpeculativeExecution/spec-other.ll delete mode 100644 test/Transforms/SpeculativeExecution/spec-vector.ll create mode 100644 test/tools/llvm-pdbdump/symbol-filters.test create mode 100644 test/tools/llvm-readobj/wasm-invalid.test create mode 100644 unittests/DebugInfo/CodeView/CMakeLists.txt create mode 100644 unittests/DebugInfo/CodeView/ErrorChecking.h create mode 100644 unittests/DebugInfo/CodeView/RandomAccessVisitorTest.cpp create mode 100644 unittests/Support/ParallelTest.cpp create mode 100644 utils/vscode/README create mode 100644 utils/vscode/tablegen/.vscode/launch.json create mode 100644 utils/vscode/tablegen/CHANGELOG.md create mode 100644 utils/vscode/tablegen/README.md create mode 100644 utils/vscode/tablegen/language-configuration.json create mode 100644 utils/vscode/tablegen/package.json create mode 100644 utils/vscode/tablegen/syntaxes/TableGen.tmLanguage create mode 100644 utils/vscode/tablegen/vsc-extension-quickstart.md diff --git a/CREDITS.TXT b/CREDITS.TXT index 15d822a68091..20bd553ae2bc 100644 --- a/CREDITS.TXT +++ b/CREDITS.TXT @@ -265,7 +265,7 @@ D: Release manager (1.7+) N: Sylvestre Ledru E: sylvestre@debian.org W: http://sylvestre.ledru.info/ -W: http://llvm.org/apt/ +W: http://apt.llvm.org/ D: Debian and Ubuntu packaging D: Continuous integration with jenkins diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index 0331d0fa10ab..de8e9bf9a494 100755 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -530,16 +530,6 @@ else() message(STATUS "Doxygen disabled.") endif() -if (LLVM_ENABLE_SPHINX) - message(STATUS "Sphinx enabled.") - find_package(Sphinx REQUIRED) - if (LLVM_BUILD_DOCS) - add_custom_target(sphinx ALL) - endif() -else() - message(STATUS "Sphinx disabled.") -endif() - set(LLVM_BINDINGS "") if(WIN32) message(STATUS "Go bindings disabled.") diff --git a/cmake/modules/AddSphinxTarget.cmake b/cmake/modules/AddSphinxTarget.cmake index cfc7f38e9e77..4540c5c36c8e 100644 --- a/cmake/modules/AddSphinxTarget.cmake +++ b/cmake/modules/AddSphinxTarget.cmake @@ -1,3 
+1,16 @@
+
+# Create sphinx target
+if (LLVM_ENABLE_SPHINX)
+  message(STATUS "Sphinx enabled.")
+  find_package(Sphinx REQUIRED)
+  if (LLVM_BUILD_DOCS AND NOT TARGET sphinx)
+    add_custom_target(sphinx ALL)
+  endif()
+else()
+  message(STATUS "Sphinx disabled.")
+endif()
+
+
 # Handy function for creating the different Sphinx targets.
 #
 # ``builder`` should be one of the supported builders used by
diff --git a/docs/CMakeLists.txt b/docs/CMakeLists.txt
index 6dff219ae37f..4437610146c4 100644
--- a/docs/CMakeLists.txt
+++ b/docs/CMakeLists.txt
@@ -103,8 +103,8 @@ endif()
 endif()
 
 if (LLVM_ENABLE_SPHINX)
+  include(AddSphinxTarget)
   if (SPHINX_FOUND)
-    include(AddSphinxTarget)
     if (${SPHINX_OUTPUT_HTML})
       add_sphinx_target(html llvm)
     endif()
diff --git a/docs/GettingStarted.rst b/docs/GettingStarted.rst
index d5c8ba4b8214..0cb415ad764e 100644
--- a/docs/GettingStarted.rst
+++ b/docs/GettingStarted.rst
@@ -699,14 +699,14 @@ For developers to work with a git monorepo
 
 .. note::
 
-   This set-up is using unofficial mirror hosted on GitHub, use with caution.
+   This set-up is using an unofficial mirror hosted on GitHub, use with caution.
 
 To set up a clone of all the llvm projects using a unified repository:
 
 .. code-block:: console
 
   % export TOP_LEVEL_DIR=`pwd`
-  % git clone https://github.com/llvm-project/llvm-project/
+  % git clone https://github.com/llvm-project/llvm-project-20170507/ llvm-project
   % cd llvm-project
   % git config branch.master.rebase true
diff --git a/docs/LangRef.rst b/docs/LangRef.rst
index dad99e3352dd..9ff47e8366dc 100644
--- a/docs/LangRef.rst
+++ b/docs/LangRef.rst
@@ -641,8 +641,9 @@ assume that the globals are densely packed in their section and try to iterate
 over them as an array, alignment padding would break this iteration. The
 maximum alignment is ``1 << 29``.
 
-Globals can also have a :ref:`DLL storage class <dllstorageclass>` and
-an optional list of attached :ref:`metadata <metadata>`,
+Globals can also have a :ref:`DLL storage class <dllstorageclass>`,
+optional :ref:`global attributes <glattrs>` and
+an optional list of attached :ref:`metadata <metadata>`.
 Variables and aliases can have a
 :ref:`Thread Local Storage Model <tls_model>`.
 
@@ -1624,6 +1625,14 @@ example:
     the ELF x86-64 abi, but it can be disabled for some compilation
     units.
 
+.. _glattrs:
+
+Global Attributes
+-----------------
+
+Attributes may be set to communicate additional information about a global variable.
+Unlike :ref:`function attributes <fnattrs>`, attributes on a global variable
+are grouped into a single :ref:`attribute group <attrgrp>`.
 
 .. _opbundles:
 
@@ -3664,6 +3673,9 @@ Sparc:
 
 - ``I``: An immediate 13-bit signed integer.
 - ``r``: A 32-bit integer register.
+- ``f``: Any floating-point register on SparcV8, or a floating point
+  register in the "low" half of the registers on SparcV9.
+- ``e``: Any floating point register. (Same as ``f`` on SparcV8.)
 
 SystemZ:
 
@@ -11687,6 +11699,338 @@ Examples:
 
       %r2 = call float @llvm.fmuladd.f32(float %a, float %b, float %c) ; yields float:r2 = (a * b) + c
 
+
+Experimental Vector Reduction Intrinsics
+----------------------------------------
+
+Horizontal reductions of vectors can be expressed using the following
+intrinsics. Each one takes a vector operand as an input and applies its
+respective operation across all elements of the vector, returning a single
+scalar result of the same element type.
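As a rough sketch of the semantics (the function and value names below are illustrative, not taken from the patch), an integer ``ADD`` reduction of a ``<4 x i32>`` operand computes the same value as this hand-scalarized IR:

.. code-block:: llvm

      ; Illustrative scalarization of an i32 ADD reduction: extract each
      ; lane and accumulate. Targets may lower the intrinsic more directly.
      define i32 @reduce_add_by_hand(<4 x i32> %v) {
        %e0 = extractelement <4 x i32> %v, i32 0
        %e1 = extractelement <4 x i32> %v, i32 1
        %e2 = extractelement <4 x i32> %v, i32 2
        %e3 = extractelement <4 x i32> %v, i32 3
        %t0 = add i32 %e0, %e1
        %t1 = add i32 %t0, %e2
        %t2 = add i32 %t1, %e3
        ret i32 %t2
      }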
+
+
+'``llvm.experimental.vector.reduce.add.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> %a)
+      declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.add.*``' intrinsics do an integer ``ADD``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.fadd.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fadd.f64.v2f64(double %acc, <2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fadd.*``' intrinsics do a floating point
+``ADD`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+If the intrinsic call has fast-math flags, then the reduction will not preserve
+the associativity of an equivalent scalarized counterpart. If it does not have
+fast-math flags, then the reduction will be *ordered*, implying that the
+operation respects the associativity of a scalarized reduction.
+
+
+Arguments:
+""""""""""
+The first argument to this intrinsic is a scalar accumulator value, which is
+only used when there are no fast-math flags attached. This argument may be undef
+when fast-math flags are used.
+
+The second argument must be a vector of floating point values.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %fast = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
+      %ord = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+
+
+'``llvm.experimental.vector.reduce.mul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.mul.i32.v4i32(<4 x i32> %a)
+      declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.mul.*``' intrinsics do an integer ``MUL``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.fmul.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fmul.f64.v2f64(double %acc, <2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmul.*``' intrinsics do a floating point
+``MUL`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+If the intrinsic call has fast-math flags, then the reduction will not preserve
+the associativity of an equivalent scalarized counterpart. If it does not have
+fast-math flags, then the reduction will be *ordered*, implying that the
+operation respects the associativity of a scalarized reduction.
+
+
+Arguments:
+""""""""""
+The first argument to this intrinsic is a scalar accumulator value, which is
+only used when there are no fast-math flags attached. This argument may be undef
+when fast-math flags are used.
+
+The second argument must be a vector of floating point values.
+
+Examples:
+"""""""""
+
+.. code-block:: llvm
+
+      %fast = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %input) ; fast reduction
+      %ord = call float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float %acc, <4 x float> %input) ; ordered reduction
+
+'``llvm.experimental.vector.reduce.and.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.and.*``' intrinsics do a bitwise ``AND``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.or.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.or.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.or.*``' intrinsics do a bitwise ``OR`` reduction
+of a vector, returning the result as a scalar. The return type matches the
+element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.xor.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.xor.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.xor.*``' intrinsics do a bitwise ``XOR``
+reduction of a vector, returning the result as a scalar. The return type matches
+the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.smax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.smax.*``' intrinsics do a signed integer
+``MAX`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.smin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.smin.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.smin.*``' intrinsics do a signed integer
+``MIN`` reduction of a vector, returning the result as a scalar. The return type
+matches the element-type of the vector input.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
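For reference, the integer forms above are all called the same way; a small usage sketch, assuming ``%input`` is a ``<4 x i32>`` value already in scope:

.. code-block:: llvm

      ; Assumed: %input is a <4 x i32> defined earlier in the function.
      %conj = call i32 @llvm.experimental.vector.reduce.and.i32.v4i32(<4 x i32> %input)
      %max  = call i32 @llvm.experimental.vector.reduce.smax.i32.v4i32(<4 x i32> %input)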
+
+'``llvm.experimental.vector.reduce.umax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.umax.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.umax.*``' intrinsics perform an
+unsigned integer ``MAX`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.umin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 @llvm.experimental.vector.reduce.umin.i32.v4i32(<4 x i32> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.umin.*``' intrinsics perform an
+unsigned integer ``MIN`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of integer values.
+
+'``llvm.experimental.vector.reduce.fmax.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fmax.f32.v4f32(<4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmax.*``' intrinsics perform a
+floating-point ``MAX`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the input vector.
+
+If the intrinsic call has the ``nnan`` fast-math flag, the operation may
+assume that NaNs are not present in the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of floating-point values.
+
+'``llvm.experimental.vector.reduce.fmin.*``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare float @llvm.experimental.vector.reduce.fmin.f32.v4f32(<4 x float> %a)
+      declare double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double> %a)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.vector.reduce.fmin.*``' intrinsics perform a
+floating-point ``MIN`` reduction of a vector, returning the result as a
+scalar. The return type matches the element type of the input vector.
+
+If the intrinsic call has the ``nnan`` fast-math flag, the operation may
+assume that NaNs are not present in the input vector.
+
+Arguments:
+""""""""""
+The argument to this intrinsic must be a vector of floating-point values.
+
 Half Precision Floating Point Intrinsics
 ----------------------------------------
 
diff --git a/docs/Lexicon.rst b/docs/Lexicon.rst
index 35687e258182..ebc3fb772e81 100644
--- a/docs/Lexicon.rst
+++ b/docs/Lexicon.rst
@@ -249,6 +249,14 @@ S
   Superword-Level Parallelism, same as :ref:`Basic-Block Vectorization
   <bb-vectorization>`.
 
+**Splat**
+  Splat refers to a vector of identical scalar elements.
+
+  The term comes from the PowerPC Altivec instructions that provided this
+  functionality in hardware, for example "vsplth" and the corresponding
+  software intrinsic "vec_splat()". Examples of other hardware names for
+  this operation include "duplicate" (ARM) and "broadcast" (x86).
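+
+  In LLVM IR a splat is typically expressed as an ``insertelement`` into an
+  undef vector followed by a zero-mask ``shufflevector``; a minimal sketch
+  (the scalar ``%x`` is a placeholder):
+
+  .. code-block:: llvm
+
+     %ins   = insertelement <4 x i32> undef, i32 %x, i32 0
+     %splat = shufflevector <4 x i32> %ins, <4 x i32> undef, <4 x i32> zeroinitializer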
+ **SRoA** Scalar Replacement of Aggregates diff --git a/docs/LibFuzzer.rst b/docs/LibFuzzer.rst index a11baa720ec8..5acfa04ce1f4 100644 --- a/docs/LibFuzzer.rst +++ b/docs/LibFuzzer.rst @@ -305,6 +305,10 @@ The most important command line options are: - 1 : close ``stdout`` - 2 : close ``stderr`` - 3 : close both ``stdout`` and ``stderr``. +``-print_coverage`` + If 1, print coverage information as text at exit. +``-dump_coverage`` + If 1, dump coverage information as a .sancov file at exit. For the full list of flags run the fuzzer binary with ``-help=1``. @@ -543,12 +547,19 @@ You can get the coverage for your corpus like this: .. code-block:: console - ASAN_OPTIONS=coverage=1 ./fuzzer CORPUS_DIR -runs=0 + ./fuzzer CORPUS_DIR -runs=0 -print_coverage=1 This will run all tests in the CORPUS_DIR but will not perform any fuzzing. -At the end of the process it will dump a single ``.sancov`` file with coverage -information. See SanitizerCoverage_ for details on querying the file using the -``sancov`` tool. +At the end of the process it will print text describing what code has been covered and what hasn't. + +Alternatively, use + +.. code-block:: console + + ./fuzzer CORPUS_DIR -runs=0 -dump_coverage=1 + +which will dump a ``.sancov`` file with coverage information. +See SanitizerCoverage_ for details on querying the file using the ``sancov`` tool. You may also use other ways to visualize coverage, e.g. using `Clang coverage `_, diff --git a/docs/ReleaseNotes.rst b/docs/ReleaseNotes.rst index dbffb53d5a51..bc35e62189a2 100644 --- a/docs/ReleaseNotes.rst +++ b/docs/ReleaseNotes.rst @@ -40,6 +40,10 @@ Non-comprehensive list of changes in this release functionality, or simply have a lot to talk about), see the `NOTE` below for adding a new subsection. +* LLVM's ``WeakVH`` has been renamed to ``WeakTrackingVH`` and a new ``WeakVH`` + has been introduced. The new ``WeakVH`` nulls itself out on deletion, but + does not track values across RAUW. + * ... next change ... .. NOTE diff --git a/include/llvm/ADT/APInt.h b/include/llvm/ADT/APInt.h index c3822e35906a..94fbd1a29bf9 100644 --- a/include/llvm/ADT/APInt.h +++ b/include/llvm/ADT/APInt.h @@ -157,6 +157,11 @@ private: return isSingleWord() ? U.VAL : U.pVal[whichWord(bitPosition)]; } + /// Utility method to change the bit width of this APInt to new bit width, + /// allocating and/or deallocating as necessary. There is no guarantee on the + /// value of any bits upon return. Caller should populate the bits after. + void reallocate(unsigned NewBitWidth); + /// \brief Convert a char array into an APInt /// /// \param radix 2, 8, 10, 16, or 36 @@ -1437,6 +1442,12 @@ public: /// as "bitPosition". void flipBit(unsigned bitPosition); + /// Negate this APInt in place. + void negate() { + flipAllBits(); + ++(*this); + } + /// Insert the bits from a smaller APInt starting at bitPosition. void insertBits(const APInt &SubBits, unsigned bitPosition); @@ -1646,12 +1657,7 @@ public: /// re-interprets the bits as a double. Note that it is valid to do this on /// any bit width. Exactly 64 bits will be translated. double bitsToDouble() const { - union { - uint64_t I; - double D; - } T; - T.I = (isSingleWord() ? U.VAL : U.pVal[0]); - return T.D; + return BitsToDouble(getWord(0)); } /// \brief Converts APInt bits to a double @@ -1660,12 +1666,7 @@ public: /// re-interprets the bits as a float. Note that it is valid to do this on /// any bit width. Exactly 32 bits will be translated. 
float bitsToFloat() const { - union { - unsigned I; - float F; - } T; - T.I = unsigned((isSingleWord() ? U.VAL : U.pVal[0])); - return T.F; + return BitsToFloat(getWord(0)); } /// \brief Converts a double to APInt bits. @@ -1673,12 +1674,7 @@ public: /// The conversion does not do a translation from double to integer, it just /// re-interprets the bits of the double. static APInt doubleToBits(double V) { - union { - uint64_t I; - double D; - } T; - T.D = V; - return APInt(sizeof T * CHAR_BIT, T.I); + return APInt(sizeof(double) * CHAR_BIT, DoubleToBits(V)); } /// \brief Converts a float to APInt bits. @@ -1686,12 +1682,7 @@ public: /// The conversion does not do a translation from float to integer, it just /// re-interprets the bits of the float. static APInt floatToBits(float V) { - union { - unsigned I; - float F; - } T; - T.F = V; - return APInt(sizeof T * CHAR_BIT, T.I); + return APInt(sizeof(float) * CHAR_BIT, FloatToBits(V)); } /// @} @@ -1852,10 +1843,9 @@ public: unsigned); /// DST = LHS * RHS, where DST has width the sum of the widths of the - /// operands. No overflow occurs. DST must be disjoint from both - /// operands. Returns the number of parts required to hold the result. - static unsigned tcFullMultiply(WordType *, const WordType *, - const WordType *, unsigned, unsigned); + /// operands. No overflow occurs. DST must be disjoint from both operands. + static void tcFullMultiply(WordType *, const WordType *, + const WordType *, unsigned, unsigned); /// If RHS is zero LHS and REMAINDER are left unchanged, return one. /// Otherwise set LHS to LHS / RHS with the fractional part discarded, set @@ -1997,8 +1987,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const APInt &I) { } inline APInt operator-(APInt v) { - v.flipAllBits(); - ++v; + v.negate(); return v; } diff --git a/include/llvm/ADT/BitVector.h b/include/llvm/ADT/BitVector.h index e835f1516225..4a2af7cd68a6 100644 --- a/include/llvm/ADT/BitVector.h +++ b/include/llvm/ADT/BitVector.h @@ -255,7 +255,7 @@ public: /// find_prev - Returns the index of the first set bit that precedes the /// the bit at \p PriorTo. Returns -1 if all previous bits are unset. - int find_prev(unsigned PriorTo) { + int find_prev(unsigned PriorTo) const { if (PriorTo == 0) return -1; diff --git a/include/llvm/ADT/STLExtras.h b/include/llvm/ADT/STLExtras.h index 15945adbe589..8c28412bb607 100644 --- a/include/llvm/ADT/STLExtras.h +++ b/include/llvm/ADT/STLExtras.h @@ -706,6 +706,18 @@ struct is_one_of { std::is_same::value || is_one_of::value; }; +/// \brief traits class for checking whether type T is a base class for all +/// the given types in the variadic list. 
+template struct are_base_of { + static const bool value = true; +}; + +template +struct are_base_of { + static const bool value = + std::is_base_of::value && are_base_of::value; +}; + //===----------------------------------------------------------------------===// // Extra additions for arrays //===----------------------------------------------------------------------===// @@ -1079,7 +1091,7 @@ private: /// /// std::vector Items = {'A', 'B', 'C', 'D'}; /// for (auto X : enumerate(Items)) { -/// printf("Item %d - %c\n", X.Index, X.Value); +/// printf("Item %d - %c\n", X.index(), X.value()); /// } /// /// Output: diff --git a/include/llvm/ADT/StringExtras.h b/include/llvm/ADT/StringExtras.h index 26f11924b771..1c109be3bab3 100644 --- a/include/llvm/ADT/StringExtras.h +++ b/include/llvm/ADT/StringExtras.h @@ -106,6 +106,13 @@ static inline std::string fromHex(StringRef Input) { return Output; } +/// \brief Convert the string \p S to an integer of the specified type using +/// the radix \p Base. If \p Base is 0, auto-detects the radix. +/// Returns true if the number was successfully converted, false otherwise. +template bool to_integer(StringRef S, N &Num, unsigned Base = 0) { + return !S.getAsInteger(Base, Num); +} + static inline std::string utostr(uint64_t X, bool isNeg = false) { char Buffer[21]; char *BufPtr = std::end(Buffer); diff --git a/include/llvm/Analysis/CallGraph.h b/include/llvm/Analysis/CallGraph.h index cc4788d3edae..01469a25c96c 100644 --- a/include/llvm/Analysis/CallGraph.h +++ b/include/llvm/Analysis/CallGraph.h @@ -41,12 +41,6 @@ /// of all of the caller-callee relationships, which is useful for /// transformations. /// -/// The CallGraph class also attempts to figure out what the root of the -/// CallGraph is, which it currently does by looking for a function named -/// 'main'. If no function named 'main' is found, the external node is used as -/// the entry node, reflecting the fact that any function without internal -/// linkage could be called into (which is common for libraries). -/// //===----------------------------------------------------------------------===// #ifndef LLVM_ANALYSIS_CALLGRAPH_H @@ -82,10 +76,6 @@ class CallGraph { /// \brief A map from \c Function* to \c CallGraphNode*. FunctionMapTy FunctionMap; - /// \brief Root is root of the call graph, or the external node if a 'main' - /// function couldn't be found. - CallGraphNode *Root; - /// \brief This node has edges to all external functions and those internal /// functions that have their address taken. CallGraphNode *ExternalCallingNode; diff --git a/include/llvm/Analysis/ProfileSummaryInfo.h b/include/llvm/Analysis/ProfileSummaryInfo.h index 75c4cbd03706..c5f97083af4d 100644 --- a/include/llvm/Analysis/ProfileSummaryInfo.h +++ b/include/llvm/Analysis/ProfileSummaryInfo.h @@ -67,8 +67,8 @@ public: } /// Returns the profile count for \p CallInst. - static Optional getProfileCount(const Instruction *CallInst, - BlockFrequencyInfo *BFI); + Optional getProfileCount(const Instruction *CallInst, + BlockFrequencyInfo *BFI); /// \brief Returns true if \p F has hot function entry. bool isFunctionEntryHot(const Function *F); /// Returns true if \p F has hot function entry or hot call edge. 
diff --git a/include/llvm/Analysis/ScalarEvolution.h b/include/llvm/Analysis/ScalarEvolution.h index 85350fa159d6..ceca6cb389a1 100644 --- a/include/llvm/Analysis/ScalarEvolution.h +++ b/include/llvm/Analysis/ScalarEvolution.h @@ -568,27 +568,16 @@ private: Predicates.insert(P); } - /*implicit*/ ExitLimit(const SCEV *E) - : ExactNotTaken(E), MaxNotTaken(E), MaxOrZero(false) {} + /*implicit*/ ExitLimit(const SCEV *E); ExitLimit( const SCEV *E, const SCEV *M, bool MaxOrZero, - ArrayRef *> PredSetList) - : ExactNotTaken(E), MaxNotTaken(M), MaxOrZero(MaxOrZero) { - assert((isa(ExactNotTaken) || - !isa(MaxNotTaken)) && - "Exact is not allowed to be less precise than Max"); - for (auto *PredSet : PredSetList) - for (auto *P : *PredSet) - addPredicate(P); - } + ArrayRef *> PredSetList); ExitLimit(const SCEV *E, const SCEV *M, bool MaxOrZero, - const SmallPtrSetImpl &PredSet) - : ExitLimit(E, M, MaxOrZero, {&PredSet}) {} + const SmallPtrSetImpl &PredSet); - ExitLimit(const SCEV *E, const SCEV *M, bool MaxOrZero) - : ExitLimit(E, M, MaxOrZero, None) {} + ExitLimit(const SCEV *E, const SCEV *M, bool MaxOrZero); /// Test whether this ExitLimit contains any computed information, or /// whether it's all SCEVCouldNotCompute values. @@ -782,7 +771,7 @@ private: /// Set the memoized range for the given SCEV. const ConstantRange &setRange(const SCEV *S, RangeSignHint Hint, - ConstantRange &&CR) { + ConstantRange CR) { DenseMap &Cache = Hint == HINT_RANGE_UNSIGNED ? UnsignedRanges : SignedRanges; diff --git a/include/llvm/Analysis/TargetLibraryInfo.def b/include/llvm/Analysis/TargetLibraryInfo.def index 099a3c7cf2ac..9cbe917c146d 100644 --- a/include/llvm/Analysis/TargetLibraryInfo.def +++ b/include/llvm/Analysis/TargetLibraryInfo.def @@ -161,6 +161,60 @@ TLI_DEFINE_STRING_INTERNAL("_Znwm") /// void *new(unsigned long, nothrow); TLI_DEFINE_ENUM_INTERNAL(ZnwmRKSt9nothrow_t) TLI_DEFINE_STRING_INTERNAL("_ZnwmRKSt9nothrow_t") +/// double __acos_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(acos_finite) +TLI_DEFINE_STRING_INTERNAL("__acos_finite") +/// float __acosf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(acosf_finite) +TLI_DEFINE_STRING_INTERNAL("__acosf_finite") +/// double __acosh_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(acosh_finite) +TLI_DEFINE_STRING_INTERNAL("__acosh_finite") +/// float __acoshf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(acoshf_finite) +TLI_DEFINE_STRING_INTERNAL("__acoshf_finite") +/// long double __acoshl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(acoshl_finite) +TLI_DEFINE_STRING_INTERNAL("__acoshl_finite") +/// long double __acosl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(acosl_finite) +TLI_DEFINE_STRING_INTERNAL("__acosl_finite") +/// double __asin_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(asin_finite) +TLI_DEFINE_STRING_INTERNAL("__asin_finite") +/// float __asinf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(asinf_finite) +TLI_DEFINE_STRING_INTERNAL("__asinf_finite") +/// long double __asinl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(asinl_finite) +TLI_DEFINE_STRING_INTERNAL("__asinl_finite") +/// double atan2_finite(double y, double x); +TLI_DEFINE_ENUM_INTERNAL(atan2_finite) +TLI_DEFINE_STRING_INTERNAL("__atan2_finite") +/// float atan2f_finite(float y, float x); +TLI_DEFINE_ENUM_INTERNAL(atan2f_finite) +TLI_DEFINE_STRING_INTERNAL("__atan2f_finite") +/// long double atan2l_finite(long double y, long double x); +TLI_DEFINE_ENUM_INTERNAL(atan2l_finite) +TLI_DEFINE_STRING_INTERNAL("__atan2l_finite") +/// double __atanh_finite(double x); 
+TLI_DEFINE_ENUM_INTERNAL(atanh_finite) +TLI_DEFINE_STRING_INTERNAL("__atanh_finite") +/// float __atanhf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(atanhf_finite) +TLI_DEFINE_STRING_INTERNAL("__atanhf_finite") +/// long double __atanhl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(atanhl_finite) +TLI_DEFINE_STRING_INTERNAL("__atanhl_finite") +/// double __cosh_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(cosh_finite) +TLI_DEFINE_STRING_INTERNAL("__cosh_finite") +/// float __coshf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(coshf_finite) +TLI_DEFINE_STRING_INTERNAL("__coshf_finite") +/// long double __coshl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(coshl_finite) +TLI_DEFINE_STRING_INTERNAL("__coshl_finite") /// double __cospi(double x); TLI_DEFINE_ENUM_INTERNAL(cospi) TLI_DEFINE_STRING_INTERNAL("__cospi") @@ -180,12 +234,66 @@ TLI_DEFINE_STRING_INTERNAL("__cxa_guard_acquire") /// void __cxa_guard_release(guard_t *guard); TLI_DEFINE_ENUM_INTERNAL(cxa_guard_release) TLI_DEFINE_STRING_INTERNAL("__cxa_guard_release") +/// double __exp10_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(exp10_finite) +TLI_DEFINE_STRING_INTERNAL("__exp10_finite") +/// float __exp10f_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(exp10f_finite) +TLI_DEFINE_STRING_INTERNAL("__exp10f_finite") +/// long double __exp10l_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(exp10l_finite) +TLI_DEFINE_STRING_INTERNAL("__exp10l_finite") +/// double __exp2_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(exp2_finite) +TLI_DEFINE_STRING_INTERNAL("__exp2_finite") +/// float __exp2f_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(exp2f_finite) +TLI_DEFINE_STRING_INTERNAL("__exp2f_finite") +/// long double __exp2l_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(exp2l_finite) +TLI_DEFINE_STRING_INTERNAL("__exp2l_finite") +/// double __exp_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(exp_finite) +TLI_DEFINE_STRING_INTERNAL("__exp_finite") +/// float __expf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(expf_finite) +TLI_DEFINE_STRING_INTERNAL("__expf_finite") +/// long double __expl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(expl_finite) +TLI_DEFINE_STRING_INTERNAL("__expl_finite") /// int __isoc99_scanf (const char *format, ...) TLI_DEFINE_ENUM_INTERNAL(dunder_isoc99_scanf) TLI_DEFINE_STRING_INTERNAL("__isoc99_scanf") /// int __isoc99_sscanf(const char *s, const char *format, ...) 
TLI_DEFINE_ENUM_INTERNAL(dunder_isoc99_sscanf) TLI_DEFINE_STRING_INTERNAL("__isoc99_sscanf") +/// double __log10_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(log10_finite) +TLI_DEFINE_STRING_INTERNAL("__log10_finite") +/// float __log10f_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(log10f_finite) +TLI_DEFINE_STRING_INTERNAL("__log10f_finite") +/// long double __log10l_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(log10l_finite) +TLI_DEFINE_STRING_INTERNAL("__log10l_finite") +/// double __log2_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(log2_finite) +TLI_DEFINE_STRING_INTERNAL("__log2_finite") +/// float __log2f_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(log2f_finite) +TLI_DEFINE_STRING_INTERNAL("__log2f_finite") +/// long double __log2l_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(log2l_finite) +TLI_DEFINE_STRING_INTERNAL("__log2l_finite") +/// double __log_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(log_finite) +TLI_DEFINE_STRING_INTERNAL("__log_finite") +/// float __logf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(logf_finite) +TLI_DEFINE_STRING_INTERNAL("__logf_finite") +/// long double __logl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(logl_finite) +TLI_DEFINE_STRING_INTERNAL("__logl_finite") /// void *__memcpy_chk(void *s1, const void *s2, size_t n, size_t s1size); TLI_DEFINE_ENUM_INTERNAL(memcpy_chk) TLI_DEFINE_STRING_INTERNAL("__memcpy_chk") @@ -199,13 +307,30 @@ TLI_DEFINE_STRING_INTERNAL("__memset_chk") // int __nvvm_reflect(const char *) TLI_DEFINE_ENUM_INTERNAL(nvvm_reflect) TLI_DEFINE_STRING_INTERNAL("__nvvm_reflect") - +/// double __pow_finite(double x, double y); +TLI_DEFINE_ENUM_INTERNAL(pow_finite) +TLI_DEFINE_STRING_INTERNAL("__pow_finite") +/// float _powf_finite(float x, float y); +TLI_DEFINE_ENUM_INTERNAL(powf_finite) +TLI_DEFINE_STRING_INTERNAL("__powf_finite") +/// long double __powl_finite(long double x, long double y); +TLI_DEFINE_ENUM_INTERNAL(powl_finite) +TLI_DEFINE_STRING_INTERNAL("__powl_finite") /// double __sincospi_stret(double x); TLI_DEFINE_ENUM_INTERNAL(sincospi_stret) TLI_DEFINE_STRING_INTERNAL("__sincospi_stret") /// float __sincospif_stret(float x); TLI_DEFINE_ENUM_INTERNAL(sincospif_stret) TLI_DEFINE_STRING_INTERNAL("__sincospif_stret") +/// double __sinh_finite(double x); +TLI_DEFINE_ENUM_INTERNAL(sinh_finite) +TLI_DEFINE_STRING_INTERNAL("__sinh_finite") +/// float _sinhf_finite(float x); +TLI_DEFINE_ENUM_INTERNAL(sinhf_finite) +TLI_DEFINE_STRING_INTERNAL("__sinhf_finite") +/// long double __sinhl_finite(long double x); +TLI_DEFINE_ENUM_INTERNAL(sinhl_finite) +TLI_DEFINE_STRING_INTERNAL("__sinhl_finite") /// double __sinpi(double x); TLI_DEFINE_ENUM_INTERNAL(sinpi) TLI_DEFINE_STRING_INTERNAL("__sinpi") diff --git a/include/llvm/Analysis/TargetTransformInfo.h b/include/llvm/Analysis/TargetTransformInfo.h index b9639dba1881..0a0af384c3e6 100644 --- a/include/llvm/Analysis/TargetTransformInfo.h +++ b/include/llvm/Analysis/TargetTransformInfo.h @@ -537,6 +537,9 @@ public: /// \return The width of the largest scalar or vector register type. unsigned getRegisterBitWidth(bool Vector) const; + /// \return The width of the smallest vector register type. + unsigned getMinVectorRegisterBitWidth() const; + /// \return True if it should be considered for address type promotion. /// \p AllowPromotionWithoutCommonHeader Set true if promoting \p I is /// profitable without finding other extensions fed by the same input. @@ -740,6 +743,22 @@ public: unsigned ChainSizeInBytes, VectorType *VecTy) const; + /// Flags describing the kind of vector reduction. 
+ struct ReductionFlags { + ReductionFlags() : IsMaxOp(false), IsSigned(false), NoNaN(false) {} + bool IsMaxOp; ///< If the op a min/max kind, true if it's a max operation. + bool IsSigned; ///< Whether the operation is a signed int reduction. + bool NoNaN; ///< If op is an fp min/max, whether NaNs may be present. + }; + + /// \returns True if the target wants to handle the given reduction idiom in + /// the intrinsics form instead of the shuffle form. + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const; + + /// \returns True if the target wants to expand the given reduction intrinsic + /// into a shuffle sequence. + bool shouldExpandReduction(const IntrinsicInst *II) const; /// @} private: @@ -824,6 +843,7 @@ public: Type *Ty) = 0; virtual unsigned getNumberOfRegisters(bool Vector) = 0; virtual unsigned getRegisterBitWidth(bool Vector) = 0; + virtual unsigned getMinVectorRegisterBitWidth() = 0; virtual bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) = 0; virtual unsigned getCacheLineSize() = 0; @@ -895,6 +915,9 @@ public: virtual unsigned getStoreVectorFactor(unsigned VF, unsigned StoreSize, unsigned ChainSizeInBytes, VectorType *VecTy) const = 0; + virtual bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + ReductionFlags) const = 0; + virtual bool shouldExpandReduction(const IntrinsicInst *II) const = 0; }; template @@ -1057,6 +1080,9 @@ public: unsigned getRegisterBitWidth(bool Vector) override { return Impl.getRegisterBitWidth(Vector); } + unsigned getMinVectorRegisterBitWidth() override { + return Impl.getMinVectorRegisterBitWidth(); + } bool shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) override { return Impl.shouldConsiderAddressTypePromotion( @@ -1200,6 +1226,13 @@ public: VectorType *VecTy) const override { return Impl.getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + ReductionFlags Flags) const override { + return Impl.useReductionIntrinsic(Opcode, Ty, Flags); + } + bool shouldExpandReduction(const IntrinsicInst *II) const override { + return Impl.shouldExpandReduction(II); + } }; template diff --git a/include/llvm/Analysis/TargetTransformInfoImpl.h b/include/llvm/Analysis/TargetTransformInfoImpl.h index d7fda9e14b05..550e84ad90c4 100644 --- a/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -311,6 +311,8 @@ public: unsigned getRegisterBitWidth(bool Vector) { return 32; } + unsigned getMinVectorRegisterBitWidth() { return 128; } + bool shouldConsiderAddressTypePromotion(const Instruction &I, bool &AllowPromotionWithoutCommonHeader) { @@ -456,6 +458,16 @@ public: VectorType *VecTy) const { return VF; } + + bool useReductionIntrinsic(unsigned Opcode, Type *Ty, + TTI::ReductionFlags Flags) const { + return false; + } + + bool shouldExpandReduction(const IntrinsicInst *II) const { + return true; + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. 
diff --git a/include/llvm/Analysis/ValueTracking.h b/include/llvm/Analysis/ValueTracking.h index a54c39e3ea3a..f5f323c6b797 100644 --- a/include/llvm/Analysis/ValueTracking.h +++ b/include/llvm/Analysis/ValueTracking.h @@ -56,6 +56,11 @@ template class ArrayRef; const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr, OptimizationRemarkEmitter *ORE = nullptr); + /// Returns the known bits rather than passing by reference. + KnownBits computeKnownBits(const Value *V, const DataLayout &DL, + unsigned Depth = 0, AssumptionCache *AC = nullptr, + const Instruction *CxtI = nullptr, + const DominatorTree *DT = nullptr); /// Compute known bits from the range metadata. /// \p KnownZero the set of bits that are known to be zero /// \p KnownOne the set of bits that are known to be one @@ -68,14 +73,6 @@ template class ArrayRef; const Instruction *CxtI = nullptr, const DominatorTree *DT = nullptr); - /// Determine whether the sign bit is known to be zero or one. Convenience - /// wrapper around computeKnownBits. - void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne, - const DataLayout &DL, unsigned Depth = 0, - AssumptionCache *AC = nullptr, - const Instruction *CxtI = nullptr, - const DominatorTree *DT = nullptr); - /// Return true if the given value is known to have exactly one bit set when /// defined. For vectors return true if every element is known to be a power /// of two when defined. Supports values with integer or pointer type and diff --git a/include/llvm/Bitcode/BitcodeReader.h b/include/llvm/Bitcode/BitcodeReader.h index 54f990d00233..31ffb7645f3a 100644 --- a/include/llvm/Bitcode/BitcodeReader.h +++ b/include/llvm/Bitcode/BitcodeReader.h @@ -152,10 +152,11 @@ namespace llvm { /// Parse the module summary index out of an IR file and return the module /// summary index object if found, or an empty summary if not. If Path refers - /// to an empty file and the -ignore-empty-index-file cl::opt flag is passed + /// to an empty file and IgnoreEmptyThinLTOIndexFile is true, then /// this function will return nullptr. Expected> - getModuleSummaryIndexForFile(StringRef Path); + getModuleSummaryIndexForFile(StringRef Path, + bool IgnoreEmptyThinLTOIndexFile = false); /// isBitcodeWrapper - Return true if the given bytes are the magic bytes /// for an LLVM IR bitcode wrapper. diff --git a/include/llvm/CodeGen/ExpandReductions.h b/include/llvm/CodeGen/ExpandReductions.h new file mode 100644 index 000000000000..c6aaaad967b3 --- /dev/null +++ b/include/llvm/CodeGen/ExpandReductions.h @@ -0,0 +1,24 @@ +//===----- ExpandReductions.h - Expand experimental reduction intrinsics --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_CODEGEN_EXPANDREDUCTIONS_H +#define LLVM_CODEGEN_EXPANDREDUCTIONS_H + +#include "llvm/IR/PassManager.h" + +namespace llvm { + +class ExpandReductionsPass + : public PassInfoMixin { +public: + PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); +}; +} // end namespace llvm + +#endif // LLVM_CODEGEN_EXPANDREDUCTIONS_H diff --git a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 30d67eb49923..21354ae20ed1 100644 --- a/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -145,7 +145,7 @@ public: /// Iterate the given function (typically something like doubling the width) /// on Ty until we find a legal type for this operation. - LLT findLegalType(const InstrAspect &Aspect, + Optional findLegalType(const InstrAspect &Aspect, function_ref NextType) const { LegalizeAction Action; const TypeMap &Map = Actions[Aspect.Opcode - FirstOp][Aspect.Idx]; @@ -153,8 +153,12 @@ public: do { Ty = NextType(Ty); auto ActionIt = Map.find(Ty); - if (ActionIt == Map.end()) - Action = DefaultActions.find(Aspect.Opcode)->second; + if (ActionIt == Map.end()) { + auto DefaultIt = DefaultActions.find(Aspect.Opcode); + if (DefaultIt == DefaultActions.end()) + return None; + Action = DefaultIt->second; + } else Action = ActionIt->second; } while(Action != Legal); @@ -163,11 +167,14 @@ public: /// Find what type it's actually OK to perform the given operation on, given /// the general approach we've decided to take. - LLT findLegalType(const InstrAspect &Aspect, LegalizeAction Action) const; + Optional findLegalType(const InstrAspect &Aspect, LegalizeAction Action) const; std::pair findLegalAction(const InstrAspect &Aspect, LegalizeAction Action) const { - return std::make_pair(Action, findLegalType(Aspect, Action)); + auto LegalType = findLegalType(Aspect, Action); + if (!LegalType) + return std::make_pair(LegalizeAction::Unsupported, LLT()); + return std::make_pair(Action, *LegalType); } /// Find the specified \p Aspect in the primary (explicitly set) Actions diff --git a/include/llvm/CodeGen/GlobalISel/Utils.h b/include/llvm/CodeGen/GlobalISel/Utils.h index 92bc9736141a..69d507069808 100644 --- a/include/llvm/CodeGen/GlobalISel/Utils.h +++ b/include/llvm/CodeGen/GlobalISel/Utils.h @@ -30,6 +30,7 @@ class TargetInstrInfo; class TargetPassConfig; class TargetRegisterInfo; class Twine; +class ConstantFP; /// Try to constrain Reg so that it is usable by argument OpIdx of the /// provided MCInstrDesc \p II. If this fails, create a new virtual @@ -62,6 +63,8 @@ void reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC, Optional getConstantVRegVal(unsigned VReg, const MachineRegisterInfo &MRI); +const ConstantFP* getConstantFPVRegVal(unsigned VReg, + const MachineRegisterInfo &MRI); } // End namespace llvm. #endif diff --git a/include/llvm/CodeGen/ISDOpcodes.h b/include/llvm/CodeGen/ISDOpcodes.h index ca0f3fbad892..f2a9a9f73ca6 100644 --- a/include/llvm/CodeGen/ISDOpcodes.h +++ b/include/llvm/CodeGen/ISDOpcodes.h @@ -644,6 +644,13 @@ namespace ISD { /// of a call sequence, and carry arbitrary information that target might /// want to know. The first operand is a chain, the rest are specified by /// the target and not touched by the DAG optimizers. 
+ /// Targets that may use stack to pass call arguments define additional + /// operands: + /// - size of the call frame part that must be set up within the + /// CALLSEQ_START..CALLSEQ_END pair, + /// - part of the call frame prepared prior to CALLSEQ_START. + /// Both these parameters must be constants, their sum is the total call + /// frame size. /// CALLSEQ_START..CALLSEQ_END pairs may not be nested. CALLSEQ_START, // Beginning of a call sequence CALLSEQ_END, // End of a call sequence @@ -783,6 +790,20 @@ namespace ISD { /// known nonzero constant. The only operand here is the chain. GET_DYNAMIC_AREA_OFFSET, + /// Generic reduction nodes. These nodes represent horizontal vector + /// reduction operations, producing a scalar result. + /// The STRICT variants perform reductions in sequential order. The first + /// operand is an initial scalar accumulator value, and the second operand + /// is the vector to reduce. + VECREDUCE_STRICT_FADD, VECREDUCE_STRICT_FMUL, + /// These reductions are non-strict, and have a single vector operand. + VECREDUCE_FADD, VECREDUCE_FMUL, + VECREDUCE_ADD, VECREDUCE_MUL, + VECREDUCE_AND, VECREDUCE_OR, VECREDUCE_XOR, + VECREDUCE_SMAX, VECREDUCE_SMIN, VECREDUCE_UMAX, VECREDUCE_UMIN, + /// FMIN/FMAX nodes can have flags, for NaN/NoNaN variants. + VECREDUCE_FMAX, VECREDUCE_FMIN, + /// BUILTIN_OP_END - This must be the last enum value in this list. /// The target-specific pre-isel opcode values start here. BUILTIN_OP_END diff --git a/include/llvm/CodeGen/MachineCombinerPattern.h b/include/llvm/CodeGen/MachineCombinerPattern.h index 11238016d447..8c54ae925470 100644 --- a/include/llvm/CodeGen/MachineCombinerPattern.h +++ b/include/llvm/CodeGen/MachineCombinerPattern.h @@ -48,6 +48,8 @@ enum class MachineCombinerPattern { FMULADDD_OP2, FMULSUBD_OP1, FMULSUBD_OP2, + FNMULSUBS_OP1, + FNMULSUBD_OP1, FMLAv1i32_indexed_OP1, FMLAv1i32_indexed_OP2, FMLAv1i64_indexed_OP1, diff --git a/include/llvm/CodeGen/Passes.h b/include/llvm/CodeGen/Passes.h index 42299b529410..8a5a1997386f 100644 --- a/include/llvm/CodeGen/Passes.h +++ b/include/llvm/CodeGen/Passes.h @@ -68,6 +68,10 @@ namespace llvm { /// matching during instruction selection. FunctionPass *createCodeGenPreparePass(const TargetMachine *TM = nullptr); + /// createScalarizeMaskedMemIntrinPass - Replace masked load, store, gather + /// and scatter intrinsics with scalar code when target doesn't support them. + FunctionPass *createScalarizeMaskedMemIntrinPass(); + /// AtomicExpandID -- Lowers atomic operations in terms of either cmpxchg /// load-linked/store-conditional loops. extern char &AtomicExpandID; @@ -129,6 +133,10 @@ namespace llvm { // instruction and update the MachineFunctionInfo with that information. extern char &ShrinkWrapID; + /// LiveRangeShrink pass. Move instruction close to its definition to shrink + /// the definition's live range. + extern char &LiveRangeShrinkID; + /// Greedy register allocator. extern char &RAGreedyID; @@ -405,6 +413,10 @@ namespace llvm { /// printing assembly. ModulePass *createMachineOutlinerPass(); + /// This pass expands the experimental reduction intrinsics into sequences of + /// shuffles. + FunctionPass *createExpandReductionsPass(); + } // End llvm namespace /// Target machine pass initializer for passes with dependencies. 
Use with diff --git a/include/llvm/CodeGen/SelectionDAG.h b/include/llvm/CodeGen/SelectionDAG.h index 9e1d148c7ce5..d761661f763e 100644 --- a/include/llvm/CodeGen/SelectionDAG.h +++ b/include/llvm/CodeGen/SelectionDAG.h @@ -406,7 +406,7 @@ public: /// certain types of nodes together, or eliminating superfluous nodes. The /// Level argument controls whether Combine is allowed to produce nodes and /// types that are illegal on the target. - void Combine(CombineLevel Level, AliasAnalysis &AA, + void Combine(CombineLevel Level, AliasAnalysis *AA, CodeGenOpt::Level OptLevel); /// This transforms the SelectionDAG into a SelectionDAG that @@ -737,11 +737,15 @@ public: /// \brief Create a logical NOT operation as (XOR Val, BooleanOne). SDValue getLogicalNOT(const SDLoc &DL, SDValue Val, EVT VT); - /// Return a new CALLSEQ_START node, which always must have a glue result - /// (to ensure it's not CSE'd). CALLSEQ_START does not have a useful SDLoc. - SDValue getCALLSEQ_START(SDValue Chain, SDValue Op, const SDLoc &DL) { + /// Return a new CALLSEQ_START node, that starts new call frame, in which + /// InSize bytes are set up inside CALLSEQ_START..CALLSEQ_END sequence and + /// OutSize specifies part of the frame set up prior to the sequence. + SDValue getCALLSEQ_START(SDValue Chain, uint64_t InSize, uint64_t OutSize, + const SDLoc &DL) { SDVTList VTs = getVTList(MVT::Other, MVT::Glue); - SDValue Ops[] = { Chain, Op }; + SDValue Ops[] = { Chain, + getIntPtrConstant(InSize, DL, true), + getIntPtrConstant(OutSize, DL, true) }; return getNode(ISD::CALLSEQ_START, DL, VTs, Ops); } diff --git a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h index e9012db7602d..f3122f0bf7f0 100644 --- a/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h +++ b/include/llvm/DebugInfo/CodeView/CVTypeVisitor.h @@ -26,6 +26,7 @@ public: void addTypeServerHandler(TypeServerHandler &Handler); + Error visitTypeRecord(CVType &Record, TypeIndex Index); Error visitTypeRecord(CVType &Record); Error visitMemberRecord(CVMemberRecord &Record); @@ -37,6 +38,9 @@ public: Error visitFieldListMemberStream(BinaryStreamReader Reader); private: + Expected handleTypeServer(CVType &Record); + Error finishVisitation(CVType &Record); + /// The interface to the class that gets notified of each visitation. TypeVisitorCallbacks &Callbacks; diff --git a/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h b/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h new file mode 100644 index 000000000000..35a8010f1163 --- /dev/null +++ b/include/llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h @@ -0,0 +1,103 @@ +//===- RandomAccessTypeVisitor.h ------------------------------ *- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_DEBUGINFO_CODEVIEW_RANDOMACCESSTYPEVISITOR_H
+#define LLVM_DEBUGINFO_CODEVIEW_RANDOMACCESSTYPEVISITOR_H
+
+#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/DebugInfo/CodeView/CVTypeVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabase.h"
+#include "llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h"
+#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
+#include "llvm/DebugInfo/CodeView/TypeIndex.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h"
+#include "llvm/Support/Error.h"
+
+namespace llvm {
+namespace codeview {
+
+class TypeDatabase;
+class TypeServerHandler;
+class TypeVisitorCallbacks;
+
+/// \brief Provides amortized O(1) random access to a CodeView type stream.
+/// Normally, to access a type from a type stream you must know its byte
+/// offset into the type stream, because type records are variable-length.
+/// However, this is not the way we prefer to access them. For example, given
+/// a symbol record, one of the fields may be the TypeIndex of the symbol's
+/// type record. Or given a type record such as an array type, there might
+/// be a TypeIndex for the element type. Sequential access is perfect when
+/// we're just dumping every entry, but it's very poor for real-world usage.
+///
+/// Type streams in PDBs contain an additional field which is a list of pairs
+/// containing indices and their corresponding offsets, roughly every 8KB of
+/// record data. This general idea need not be confined to PDBs though. By
+/// supplying such an array, the producer of a type stream can give the
+/// consumer much better access time, because the consumer can find the
+/// nearest index in this array and do a linear scan forward only from there.
+///
+/// RandomAccessTypeVisitor implements this algorithm, and goes one step
+/// further by caching the offset of every record that has been visited at
+/// least once. This way, even repeated visits of the same record will never
+/// require more than one linear scan. For a type stream of N elements divided
+/// into M chunks of roughly equal size, this yields a worst case lookup time
+/// of O(N/M) and an amortized time of O(1).
+class RandomAccessTypeVisitor {
+  typedef FixedStreamArray<TypeIndexOffset> PartialOffsetArray;
+
+public:
+  RandomAccessTypeVisitor(const CVTypeArray &Types, uint32_t NumRecords,
+                          PartialOffsetArray PartialOffsets);
+
+  Error visitTypeIndex(TypeIndex Index, TypeVisitorCallbacks &Callbacks);
+
+  const TypeDatabase &database() const { return Database; }
+
+private:
+  Error visitRangeForType(TypeIndex TI);
+  Error visitRange(TypeIndex Begin, uint32_t BeginOffset, TypeIndex End);
+
+  /// Visited records get automatically added to the type database.
+  TypeDatabase Database;
+
+  /// The type array to allow random access visitation of.
+  const CVTypeArray &Types;
+
+  /// The database visitor which adds new records to the database.
+  TypeDatabaseVisitor DatabaseVisitor;
+
+  /// The deserializer which deserializes new records.
+  TypeDeserializer Deserializer;
+
+  /// The visitation callback pipeline to use. By default this contains a
+  /// deserializer and a type database visitor, but the callback specified
+  /// in the constructor is also added.
+  TypeVisitorCallbackPipeline Pipeline;
+
+  /// The visitor used to visit the internal pipeline for deserialization and
+  /// database maintenance.
+ CVTypeVisitor InternalVisitor; + + /// A vector mapping type indices to type offset. For every record that has + /// been visited, contains the absolute offset of that record in the record + /// array. + std::vector KnownOffsets; + + /// An array of index offsets for the given type stream, allowing log(N) + /// lookups of a type record by index. Similar to KnownOffsets but only + /// contains offsets for some type indices, some of which may not have + /// ever been visited. + PartialOffsetArray PartialOffsets; +}; + +} // end namespace codeview +} // end namespace llvm + +#endif // LLVM_DEBUGINFO_CODEVIEW_RANDOMACCESSTYPEVISITOR_H diff --git a/include/llvm/DebugInfo/CodeView/TypeDatabase.h b/include/llvm/DebugInfo/CodeView/TypeDatabase.h index be7b19e7df0c..92c15ebd8b2b 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDatabase.h +++ b/include/llvm/DebugInfo/CodeView/TypeDatabase.h @@ -10,6 +10,7 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASE_H #define LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASE_H +#include "llvm/ADT/BitVector.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" @@ -20,14 +21,16 @@ namespace llvm { namespace codeview { class TypeDatabase { + friend class RandomAccessTypeVisitor; + public: - explicit TypeDatabase(uint32_t ExpectedSize); + explicit TypeDatabase(uint32_t Capacity); - /// Gets the type index for the next type record. - TypeIndex getNextTypeIndex() const; + /// Records the name of a type, and reserves its type index. + TypeIndex appendType(StringRef Name, const CVType &Data); /// Records the name of a type, and reserves its type index. - void recordType(StringRef Name, const CVType &Data); + void recordType(StringRef Name, TypeIndex Index, const CVType &Data); /// Saves the name in a StringSet and creates a stable StringRef. StringRef saveTypeName(StringRef TypeName); @@ -37,13 +40,21 @@ public: const CVType &getTypeRecord(TypeIndex Index) const; CVType &getTypeRecord(TypeIndex Index); - bool containsTypeIndex(TypeIndex Index) const; + bool contains(TypeIndex Index) const; uint32_t size() const; + uint32_t capacity() const; + bool empty() const; + + TypeIndex getAppendIndex() const; private: + void grow(); + BumpPtrAllocator Allocator; + uint32_t Count = 0; + /// All user defined type records in .debug$T live in here. Type indices /// greater than 0x1000 are user defined. Subtract 0x1000 from the index to /// index into this vector. @@ -51,6 +62,8 @@ private: SmallVector TypeRecords; StringSaver TypeNameStorage; + + BitVector ValidRecords; }; } } diff --git a/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h index 39d234cf9814..c064e19a7e90 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h +++ b/include/llvm/DebugInfo/CodeView/TypeDatabaseVisitor.h @@ -10,6 +10,8 @@ #ifndef LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASEVISITOR_H #define LLVM_DEBUGINFO_CODEVIEW_TYPEDATABASEVISITOR_H +#include "llvm/ADT/PointerUnion.h" + #include "llvm/DebugInfo/CodeView/TypeDatabase.h" #include "llvm/DebugInfo/CodeView/TypeIndex.h" #include "llvm/DebugInfo/CodeView/TypeRecord.h" @@ -21,11 +23,12 @@ namespace codeview { /// Dumper for CodeView type streams found in COFF object files and PDB files. class TypeDatabaseVisitor : public TypeVisitorCallbacks { public: - explicit TypeDatabaseVisitor(TypeDatabase &TypeDB) : TypeDB(TypeDB) {} + explicit TypeDatabaseVisitor(TypeDatabase &TypeDB) : TypeDB(&TypeDB) {} /// Paired begin/end actions for all types. 
Receives all record data, /// including the fixed-length record prefix. Error visitTypeBegin(CVType &Record) override; + Error visitTypeBegin(CVType &Record, TypeIndex Index) override; Error visitTypeEnd(CVType &Record) override; Error visitMemberBegin(CVMemberRecord &Record) override; Error visitMemberEnd(CVMemberRecord &Record) override; @@ -39,12 +42,18 @@ public: #include "TypeRecords.def" private: + StringRef getTypeName(TypeIndex Index) const; + StringRef saveTypeName(StringRef Name); + bool IsInFieldList = false; /// Name of the current type. Only valid before visitTypeEnd. StringRef Name; + /// Current type index. Only valid before visitTypeEnd, and if we are + /// visiting a random access type database. + Optional CurrentTypeIndex; - TypeDatabase &TypeDB; + TypeDatabase *TypeDB; }; } // end namespace codeview diff --git a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h index 0e3443789170..2142d4a2dec7 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDeserializer.h +++ b/include/llvm/DebugInfo/CodeView/TypeDeserializer.h @@ -46,6 +46,10 @@ public: return Mapping->Mapping.visitTypeBegin(Record); } + Error visitTypeBegin(CVType &Record, TypeIndex Index) override { + return visitTypeBegin(Record); + } + Error visitTypeEnd(CVType &Record) override { assert(Mapping && "Not in a type mapping!"); auto EC = Mapping->Mapping.visitTypeEnd(Record); diff --git a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h index 00bb09137e48..6f10afb30d60 100644 --- a/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h +++ b/include/llvm/DebugInfo/CodeView/TypeDumpVisitor.h @@ -45,6 +45,7 @@ public: /// Paired begin/end actions for all types. Receives all record data, /// including the fixed-length record prefix. 
Error visitTypeBegin(CVType &Record) override; + Error visitTypeBegin(CVType &Record, TypeIndex Index) override; Error visitTypeEnd(CVType &Record) override; Error visitMemberBegin(CVMemberRecord &Record) override; Error visitMemberEnd(CVMemberRecord &Record) override; diff --git a/include/llvm/DebugInfo/CodeView/TypeIndex.h b/include/llvm/DebugInfo/CodeView/TypeIndex.h index 3c11d248fa72..b5d695fc49d5 100644 --- a/include/llvm/DebugInfo/CodeView/TypeIndex.h +++ b/include/llvm/DebugInfo/CodeView/TypeIndex.h @@ -106,6 +106,15 @@ public: bool isNoneType() const { return *this == None(); } + uint32_t toArrayIndex() const { + assert(!isSimple()); + return getIndex() - FirstNonSimpleIndex; + } + + static TypeIndex fromArrayIndex(uint32_t Index) { + return TypeIndex(Index + FirstNonSimpleIndex); + } + SimpleTypeKind getSimpleKind() const { assert(isSimple()); return static_cast(Index & SimpleKindMask); @@ -159,6 +168,39 @@ public: static TypeIndex Float32() { return TypeIndex(SimpleTypeKind::Float32); } static TypeIndex Float64() { return TypeIndex(SimpleTypeKind::Float64); } + TypeIndex &operator+=(unsigned N) { + Index += N; + return *this; + } + + TypeIndex &operator++() { + Index += 1; + return *this; + } + + TypeIndex operator++(int) { + TypeIndex Copy = *this; + operator++(); + return Copy; + } + + TypeIndex &operator-=(unsigned N) { + assert(Index >= N); + Index -= N; + return *this; + } + + TypeIndex &operator--() { + Index -= 1; + return *this; + } + + TypeIndex operator--(int) { + TypeIndex Copy = *this; + operator--(); + return Copy; + } + friend inline bool operator==(const TypeIndex &A, const TypeIndex &B) { return A.getIndex() == B.getIndex(); } @@ -183,10 +225,30 @@ public: return A.getIndex() >= B.getIndex(); } + friend inline TypeIndex operator+(const TypeIndex &A, uint32_t N) { + TypeIndex Result(A); + Result += N; + return Result; + } + + friend inline TypeIndex operator-(const TypeIndex &A, uint32_t N) { + assert(A.getIndex() >= N); + TypeIndex Result(A); + Result -= N; + return Result; + } + private: support::ulittle32_t Index; }; +// Used for pseudo-indexing an array of type records. An array of such records +// sorted by TypeIndex can allow log(N) lookups even though such a type record +// stream does not provide random access. +struct TypeIndexOffset { + TypeIndex Type; + support::ulittle32_t Offset; +}; } } diff --git a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h index f25129691041..ed48df33249f 100644 --- a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h +++ b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbackPipeline.h @@ -47,6 +47,14 @@ public: return Error::success(); } + Error visitTypeBegin(CVType &Record, TypeIndex Index) override { + for (auto Visitor : Pipeline) { + if (auto EC = Visitor->visitTypeBegin(Record, Index)) + return EC; + } + return Error::success(); + } + Error visitTypeEnd(CVType &Record) override { for (auto Visitor : Pipeline) { if (auto EC = Visitor->visitTypeEnd(Record)) diff --git a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h index 5e27df346b00..2950c7d27cb6 100644 --- a/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h +++ b/include/llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h @@ -26,8 +26,15 @@ public: virtual Error visitUnknownType(CVType &Record) { return Error::success(); } /// Paired begin/end actions for all types. 
Receives all record data,
  /// including the fixed-length record prefix. visitTypeBegin() should return
-  /// the type of the Record, or an error if it cannot be determined.
+  /// the type of the Record, or an error if it cannot be determined. Exactly
+  /// one of the two visitTypeBegin methods will be called, depending on
+  /// whether records are being visited sequentially or randomly. An
+  /// implementation should be prepared to handle both (or assert if it
+  /// cannot handle random-access visitation).
   virtual Error visitTypeBegin(CVType &Record) { return Error::success(); }
+  virtual Error visitTypeBegin(CVType &Record, TypeIndex Index) {
+    return Error::success();
+  }
   virtual Error visitTypeEnd(CVType &Record) { return Error::success(); }
 
   virtual Error visitUnknownMember(CVMemberRecord &Record) {
diff --git a/include/llvm/DebugInfo/DWARF/DWARFContext.h b/include/llvm/DebugInfo/DWARF/DWARFContext.h
index 3fae8b441439..ca82a68ead31 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFContext.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFContext.h
@@ -43,13 +43,6 @@ namespace llvm {
 class MemoryBuffer;
 class raw_ostream;
 
-// In place of applying the relocations to the data we've read from disk we use
-// a separate mapping table to the side and checking that at locations in the
-// dwarf where we expect relocated values. This adds a bit of complexity to the
-// dwarf parsing/extraction at the benefit of not allocating memory for the
-// entire size of the debug info sections.
-typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t>> RelocAddrMap;
-
 /// Reads a value from data extractor and applies a relocation to the result if
 /// one exists for the given offset.
 uint64_t getRelocatedValue(const DataExtractor &Data, uint32_t Size,
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
index e21245b97b73..39a7ef71de97 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugLine.h
@@ -30,7 +30,7 @@ public:
   struct FileNameEntry {
     FileNameEntry() = default;
 
-    StringRef Name = StringRef();
+    StringRef Name;
     uint64_t DirIdx = 0;
     uint64_t ModTime = 0;
     uint64_t Length = 0;
diff --git a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
index 9172df5bfac6..23a573b7a9fa 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFDebugRangeList.h
@@ -22,8 +22,13 @@ namespace llvm {
 
 class raw_ostream;
 
+struct DWARFAddressRange {
+  uint64_t LowPC;
+  uint64_t HighPC;
+};
+
 /// DWARFAddressRangesVector - represents a set of absolute address ranges.
-typedef std::vector<std::pair<uint64_t, uint64_t>> DWARFAddressRangesVector;
+typedef std::vector<DWARFAddressRange> DWARFAddressRangesVector;
 
 class DWARFDebugRangeList {
 public:
diff --git a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
index af01bddeed15..f1e03bb4c2e1 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFRelocMap.h
@@ -16,7 +16,17 @@
 
 namespace llvm {
 
-typedef DenseMap<uint64_t, std::pair<uint8_t, int64_t>> RelocAddrMap;
+struct RelocAddrEntry {
+  uint8_t Width;
+  int64_t Value;
+};
+
+// In place of applying the relocations to the data we've read from disk, we
+// keep a separate mapping table on the side and consult it at the locations
+// in the DWARF where we expect relocated values. This adds a bit of
+// complexity to the DWARF parsing/extraction, with the benefit of not
+// allocating memory for the entire size of the debug info sections.
+typedef DenseMap<uint64_t, RelocAddrEntry> RelocAddrMap;
 
 } // end namespace llvm
 
diff --git a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
index 8e12bcd2c8e2..b9f14be85926 100644
--- a/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
+++ b/include/llvm/DebugInfo/DWARF/DWARFVerifier.h
@@ -40,7 +40,7 @@ class DWARFVerifier {
   ///
   /// @param Die          The DWARF DIE that owns the attribute value
   /// @param AttrValue    The DWARF attribute value to check
-  void verifyDebugInfoAttribute(DWARFDie &Die, DWARFAttribute &AttrValue);
+  void verifyDebugInfoAttribute(const DWARFDie &Die, DWARFAttribute &AttrValue);
 
   /// Verifies the attribute's DWARF form.
   ///
@@ -51,7 +51,7 @@ class DWARFVerifier {
   ///
   /// @param Die          The DWARF DIE that owns the attribute value
   /// @param AttrValue    The DWARF attribute value to check
-  void verifyDebugInfoForm(DWARFDie &Die, DWARFAttribute &AttrValue);
+  void verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue);
 
   /// Verifies the all valid references that were found when iterating through
   /// all of the DIE attributes.
@@ -60,7 +60,7 @@ class DWARFVerifier {
   /// offset matches. This helps to ensure if a DWARF link phase moved things
   /// around, that it doesn't create invalid references by failing to relocate
   /// CU relative and absolute references.
-  void veifyDebugInfoReferences();
+  void verifyDebugInfoReferences();
 
   /// Verify the DW_AT_stmt_list encoding and value and ensure that no
   /// compile units have the same DW_AT_stmt_list value.
diff --git a/include/llvm/DebugInfo/PDB/Native/RawTypes.h b/include/llvm/DebugInfo/PDB/Native/RawTypes.h
index 979b8454dd5e..771272d6a47d 100644
--- a/include/llvm/DebugInfo/PDB/Native/RawTypes.h
+++ b/include/llvm/DebugInfo/PDB/Native/RawTypes.h
@@ -73,13 +73,6 @@ struct SecMapEntry {
   support::ulittle32_t SecByteLength; // Byte count of the segment or group.
 };
 
-// Used for serialized hash table in TPI stream.
-// In the reference, it is an array of TI and cbOff pair.
-struct TypeIndexOffset {
-  codeview::TypeIndex Type;
-  support::ulittle32_t Offset;
-};
-
 /// Some of the values are stored in bitfields.
Since this needs to be portable /// across compilers and architectures (big / little endian in particular) we /// can't use the actual structures below, but must instead do the shifting diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStream.h b/include/llvm/DebugInfo/PDB/Native/TpiStream.h index 9fef9bee5e1a..4579cbf4227b 100644 --- a/include/llvm/DebugInfo/PDB/Native/TpiStream.h +++ b/include/llvm/DebugInfo/PDB/Native/TpiStream.h @@ -47,7 +47,7 @@ public: uint32_t getHashKeySize() const; uint32_t getNumHashBuckets() const; FixedStreamArray getHashValues() const; - FixedStreamArray getTypeIndexOffsets() const; + FixedStreamArray getTypeIndexOffsets() const; HashTable &getHashAdjusters(); codeview::CVTypeRange types(bool *HadError) const; @@ -62,7 +62,7 @@ private: std::unique_ptr HashStream; FixedStreamArray HashValues; - FixedStreamArray TypeIndexOffsets; + FixedStreamArray TypeIndexOffsets; HashTable HashAdjusters; const TpiStreamHeader *Header; diff --git a/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h b/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h index a29ed0b610d3..6c609c34665c 100644 --- a/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h +++ b/include/llvm/DebugInfo/PDB/Native/TpiStreamBuilder.h @@ -75,7 +75,7 @@ private: Optional VerHeader; std::vector> TypeRecords; std::vector TypeHashes; - std::vector TypeIndexOffsets; + std::vector TypeIndexOffsets; uint32_t HashStreamIndex = kInvalidStreamIndex; std::unique_ptr HashValueStream; diff --git a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h index 7e7f7358938a..1bb911d09cfb 100644 --- a/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h +++ b/include/llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h @@ -172,6 +172,11 @@ private: return nullptr; } + void removeModulesFromBaseLayer(BaseLayerT &BaseLayer) { + for (auto &BLH : BaseLayerHandles) + BaseLayer.removeModuleSet(BLH); + } + std::unique_ptr ExternalSymbolResolver; std::unique_ptr> MemMgr; std::unique_ptr StubsMgr; @@ -204,6 +209,11 @@ public: CreateIndirectStubsManager(std::move(CreateIndirectStubsManager)), CloneStubsIntoPartitions(CloneStubsIntoPartitions) {} + ~CompileOnDemandLayer() { + while (!LogicalDylibs.empty()) + removeModuleSet(LogicalDylibs.begin()); + } + /// @brief Add a module to the compile-on-demand layer. template @@ -239,6 +249,7 @@ public: /// This will remove all modules in the layers below that were derived from /// the module represented by H. void removeModuleSet(ModuleSetHandleT H) { + H->removeModulesFromBaseLayer(BaseLayer); LogicalDylibs.erase(H); } @@ -478,6 +489,8 @@ private: return 0; } + LD.BaseLayerHandles.push_back(PartH); + return CalledAddr; } diff --git a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h index 02f59d6a831a..a19c30631c57 100644 --- a/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h +++ b/include/llvm/ExecutionEngine/Orc/OrcRemoteTargetClient.h @@ -144,16 +144,16 @@ public: void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override { - UnfinalizedEHFrames.push_back( - std::make_pair(LoadAddr, static_cast(Size))); + UnfinalizedEHFrames.push_back({LoadAddr, Size}); } - void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, - size_t Size) override { - auto Err = Client.deregisterEHFrames(LoadAddr, Size); - // FIXME: Add error poll. 
- assert(!Err && "Failed to register remote EH frames."); - (void)Err; + void deregisterEHFrames() override { + for (auto &Frame : RegisteredEHFrames) { + auto Err = Client.deregisterEHFrames(Frame.Addr, Frame.Size); + // FIXME: Add error poll. + assert(!Err && "Failed to register remote EH frames."); + (void)Err; + } } void notifyObjectLoaded(RuntimeDyld &Dyld, @@ -320,7 +320,7 @@ public: Unfinalized.clear(); for (auto &EHFrame : UnfinalizedEHFrames) { - if (auto Err = Client.registerEHFrames(EHFrame.first, EHFrame.second)) { + if (auto Err = Client.registerEHFrames(EHFrame.Addr, EHFrame.Size)) { // FIXME: Replace this once finalizeMemory can return an Error. handleAllErrors(std::move(Err), [&](ErrorInfoBase &EIB) { if (ErrMsg) { @@ -331,7 +331,8 @@ public: return false; } } - UnfinalizedEHFrames.clear(); + RegisteredEHFrames = std::move(UnfinalizedEHFrames); + UnfinalizedEHFrames = {}; return false; } @@ -387,7 +388,13 @@ public: ResourceIdMgr::ResourceId Id; std::vector Unmapped; std::vector Unfinalized; - std::vector> UnfinalizedEHFrames; + + struct EHFrame { + JITTargetAddress Addr; + uint64_t Size; + }; + std::vector UnfinalizedEHFrames; + std::vector RegisteredEHFrames; }; /// Remote indirect stubs manager. diff --git a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h index babcc7f26aab..5b3426afe584 100644 --- a/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h +++ b/include/llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h @@ -120,6 +120,10 @@ private: buildInitialSymbolTable(PFC->Objects); } + ~ConcreteLinkedObjectSet() override { + MemMgr->deregisterEHFrames(); + } + void setHandle(ObjSetHandleT H) { PFC->Handle = H; } diff --git a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h index 5638717790bb..74535fe948ff 100644 --- a/include/llvm/ExecutionEngine/RTDyldMemoryManager.h +++ b/include/llvm/ExecutionEngine/RTDyldMemoryManager.h @@ -69,13 +69,8 @@ public: /// Deregister EH frames in the current proces. static void deregisterEHFramesInProcess(uint8_t *Addr, size_t Size); - void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override { - registerEHFramesInProcess(Addr, Size); - } - - void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override { - deregisterEHFramesInProcess(Addr, Size); - } + void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) override; + void deregisterEHFrames() override; /// This method returns the address of the specified function or variable in /// the current process. @@ -139,6 +134,13 @@ public: /// MCJIT or RuntimeDyld. Use getSymbolAddress instead. virtual void *getPointerToNamedFunction(const std::string &Name, bool AbortOnFailure = true); + +private: + struct EHFrame { + uint8_t *Addr; + size_t Size; + }; + std::vector EHFrames; }; // Create wrappers for C Binding types (see CBindingWrapping.h). diff --git a/include/llvm/ExecutionEngine/RuntimeDyld.h b/include/llvm/ExecutionEngine/RuntimeDyld.h index 13a5f9922c51..9470866dc0d6 100644 --- a/include/llvm/ExecutionEngine/RuntimeDyld.h +++ b/include/llvm/ExecutionEngine/RuntimeDyld.h @@ -150,8 +150,7 @@ public: /// be the case for local execution) these two values will be the same. 
virtual void registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, size_t Size) = 0; - virtual void deregisterEHFrames(uint8_t *addr, uint64_t LoadAddr, - size_t Size) = 0; + virtual void deregisterEHFrames() = 0; /// This method is called when object loading is complete and section page /// permissions can be applied. It is up to the memory manager implementation diff --git a/include/llvm/IR/Attributes.h b/include/llvm/IR/Attributes.h index cbe681684a5c..d4a896c01867 100644 --- a/include/llvm/IR/Attributes.h +++ b/include/llvm/IR/Attributes.h @@ -35,6 +35,7 @@ namespace llvm { class AttrBuilder; class AttributeImpl; class AttributeListImpl; +class AttributeList; class AttributeSetNode; template struct DenseMapInfo; class Function; @@ -227,14 +228,51 @@ public: bool operator==(const AttributeSet &O) { return SetNode == O.SetNode; } bool operator!=(const AttributeSet &O) { return !(*this == O); } + /// Add an argument attribute. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet addAttribute(LLVMContext &C, + Attribute::AttrKind Kind) const; + + /// Add a target-dependent attribute. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet addAttribute(LLVMContext &C, StringRef Kind, + StringRef Value = StringRef()) const; + + /// Add attributes to the attribute set. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet addAttributes(LLVMContext &C, AttributeSet AS) const; + + /// Remove the specified attribute from this set. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet removeAttribute(LLVMContext &C, + Attribute::AttrKind Kind) const; + + /// Remove the specified attribute from this set. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet removeAttribute(LLVMContext &C, + StringRef Kind) const; + + /// Remove the specified attributes from this set. Because + /// attribute sets are immutable, this returns a new set. + AttributeSet removeAttributes(LLVMContext &C, + const AttrBuilder &AttrsToRemove) const; + + /// Return the number of attributes in this set. unsigned getNumAttributes() const; + /// Return true if attributes exist in this set. bool hasAttributes() const { return SetNode != nullptr; } + /// Return true if the attribute exists in this set. bool hasAttribute(Attribute::AttrKind Kind) const; + + /// Return true if the attribute exists in this set. bool hasAttribute(StringRef Kind) const; + /// Return the attribute object. Attribute getAttribute(Attribute::AttrKind Kind) const; + + /// Return the target-dependent attribute object. Attribute getAttribute(StringRef Kind) const; unsigned getAlignment() const; @@ -248,6 +286,9 @@ public: iterator begin() const; iterator end() const; +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + void dump() const; +#endif }; //===----------------------------------------------------------------------===// diff --git a/include/llvm/IR/CallingConv.h b/include/llvm/IR/CallingConv.h index 39fb3f1c791b..801e88aba4d1 100644 --- a/include/llvm/IR/CallingConv.h +++ b/include/llvm/IR/CallingConv.h @@ -201,6 +201,10 @@ namespace CallingConv { /// shaders) AMDGPU_HS = 93, + /// Calling convention used for special MSP430 rtlib functions + /// which have an "optimized" convention using additional registers. + MSP430_BUILTIN = 94, + /// The highest possible calling convention ID. Must be some 2^k - 1.
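
Illustrative only, not from the patch: because the new AttributeSet helpers are declared const and return a fresh set, callers must assign the result back. A sketch, assuming an existing LLVMContext; the string attribute is invented:

    #include "llvm/IR/Attributes.h"
    using namespace llvm;

    AttributeSet adjust(LLVMContext &Ctx, AttributeSet AS) {
      // Each call leaves the original set untouched and returns a new one.
      AS = AS.addAttribute(Ctx, Attribute::NoUnwind);
      AS = AS.addAttribute(Ctx, "example-attr", "on"); // hypothetical name
      return AS.removeAttribute(Ctx, Attribute::ReadNone);
    }
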
MaxID = 1023 }; diff --git a/include/llvm/IR/Constants.h b/include/llvm/IR/Constants.h index ad83b21c7bf3..5db9b3bb5048 100644 --- a/include/llvm/IR/Constants.h +++ b/include/llvm/IR/Constants.h @@ -26,6 +26,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DerivedTypes.h" @@ -452,7 +453,14 @@ class ConstantStruct final : public ConstantAggregate { public: // ConstantStruct accessors static Constant *get(StructType *T, ArrayRef V); - static Constant *get(StructType *T, ...) LLVM_END_WITH_NULL; + + template + static typename std::enable_if::value, + Constant *>::type + get(StructType *T, Csts *... Vs) { + SmallVector Values({Vs...}); + return get(T, Values); + } /// Return an anonymous struct that has the specified elements. /// If the struct is possibly empty, then you must specify a context. diff --git a/include/llvm/IR/DebugInfoMetadata.h b/include/llvm/IR/DebugInfoMetadata.h index 0331d5229e7f..358106aac43b 100644 --- a/include/llvm/IR/DebugInfoMetadata.h +++ b/include/llvm/IR/DebugInfoMetadata.h @@ -16,8 +16,11 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitmaskEnum.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/None.h" +#include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Metadata.h" #include "llvm/Support/Casting.h" @@ -56,10 +59,6 @@ namespace llvm { -class DIBuilder; - -template class Optional; - /// Holds a subclass of DINode. /// /// FIXME: This class doesn't currently make much sense. Previously it was a @@ -94,9 +93,9 @@ public: bool operator!=(const TypedDINodeRef &X) const { return MD != X.MD; } }; -typedef TypedDINodeRef DINodeRef; -typedef TypedDINodeRef DIScopeRef; -typedef TypedDINodeRef DITypeRef; +using DINodeRef = TypedDINodeRef; +using DIScopeRef = TypedDINodeRef; +using DITypeRef = TypedDINodeRef; class DITypeRefArray { const MDTuple *N = nullptr; @@ -240,7 +239,8 @@ public: }; template struct simplify_type> { - typedef Metadata *SimpleType; + using SimpleType = Metadata *; + static SimpleType getSimplifiedValue(const TypedDINodeRef &MD) { return MD; } @@ -799,15 +799,18 @@ public: assert(getTag() == dwarf::DW_TAG_ptr_to_member_type); return DITypeRef(getExtraData()); } + DIObjCProperty *getObjCProperty() const { return dyn_cast_or_null(getExtraData()); } + Constant *getStorageOffsetInBits() const { assert(getTag() == dwarf::DW_TAG_member && isBitField()); if (auto *C = cast_or_null(getExtraData())) return C->getValue(); return nullptr; } + Constant *getConstant() const { assert(getTag() == dwarf::DW_TAG_member && isStaticMember()); if (auto *C = cast_or_null(getExtraData())) @@ -970,9 +973,11 @@ public: #endif replaceOperandWith(4, Elements.get()); } + void replaceVTableHolder(DITypeRef VTableHolder) { replaceOperandWith(5, VTableHolder); } + void replaceTemplateParams(DITemplateParameterArray TemplateParams) { replaceOperandWith(6, TemplateParams.get()); } @@ -1031,6 +1036,7 @@ public: DITypeRefArray getTypeArray() const { return cast_or_null(getRawTypeArray()); } + Metadata *getRawTypeArray() const { return getOperand(3); } static bool classof(const Metadata *MD) { @@ -1319,6 +1325,7 @@ public: unsigned getLine() const { return SubclassData32; } unsigned getColumn() const { return SubclassData16; } DILocalScope *getScope() const { return cast(getRawScope()); } + DILocation *getInlinedAt() const 
{ return cast_or_null(getRawInlinedAt()); } @@ -1452,7 +1459,6 @@ public: static bool classof(const Metadata *MD) { return MD->getMetadataID() == DILocationKind; } - }; /// Subprogram description. @@ -2087,6 +2093,7 @@ public: return F->getFilename(); return ""; } + StringRef getDirectory() const { if (auto *F = getFile()) return F->getDirectory(); @@ -2143,6 +2150,7 @@ public: ArrayRef getElements() const { return Elements; } unsigned getNumElements() const { return Elements.size(); } + uint64_t getElement(unsigned I) const { assert(I < Elements.size() && "Index out of range"); return Elements[I]; @@ -2151,7 +2159,8 @@ public: /// Determine whether this represents a standalone constant value. bool isConstant() const; - typedef ArrayRef::iterator element_iterator; + using element_iterator = ArrayRef::iterator; + element_iterator elements_begin() const { return getElements().begin(); } element_iterator elements_end() const { return getElements().end(); } @@ -2276,6 +2285,10 @@ public: /// Append \p Ops with operations to apply the \p Offset. static void appendOffset(SmallVectorImpl &Ops, int64_t Offset); + /// If this is a constant offset, extract it. If there is no expression, + /// return true with an offset of zero. + bool extractIfOffset(int64_t &Offset) const; + /// Constants for DIExpression::prepend. enum { NoDeref = false, WithDeref = true, WithStackValue = true }; @@ -2509,6 +2522,7 @@ public: return F->getFilename(); return ""; } + StringRef getDirectory() const { if (auto *F = getFile()) return F->getDirectory(); @@ -2609,10 +2623,13 @@ public: TempDIGlobalVariableExpression clone() const { return cloneImpl(); } Metadata *getRawVariable() const { return getOperand(0); } + DIGlobalVariable *getVariable() const { return cast_or_null(getRawVariable()); } + Metadata *getRawExpression() const { return getOperand(1); } + DIExpression *getExpression() const { return cast_or_null(getRawExpression()); } diff --git a/include/llvm/IR/DebugLoc.h b/include/llvm/IR/DebugLoc.h index 202be3da14da..aa74f361cda2 100644 --- a/include/llvm/IR/DebugLoc.h +++ b/include/llvm/IR/DebugLoc.h @@ -80,6 +80,22 @@ namespace llvm { static DebugLoc get(unsigned Line, unsigned Col, const MDNode *Scope, const MDNode *InlinedAt = nullptr); + enum { ReplaceLastInlinedAt = true }; + /// Rebuild the entire inlined-at chain for this instruction so that the top of + /// the chain now is inlined-at the new call site. + /// \param InlinedAt The new outermost inlined-at in the chain. + /// \param ReplaceLast Replace the last location in the inlined-at chain. + static DebugLoc appendInlinedAt(DebugLoc DL, DILocation *InlinedAt, + LLVMContext &Ctx, + DenseMap &Cache, + bool ReplaceLast = false); + + /// Reparent all debug locations referenced by \c I that belong to \c OrigSP + /// to become (possibly indirect) children of \c NewSP. 
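
A sketch (not part of the patch) of the documented extractIfOffset contract: it reports success with an offset of zero when there is no expression, and extracts the constant when the expression is a plain offset. The wrapper name and the null-pointer handling are my assumptions:

    #include "llvm/IR/DebugInfoMetadata.h"
    #include <cstdint>
    using namespace llvm;

    // Returns true when Expr is absent or a pure constant offset.
    bool getConstantOffset(const DIExpression *Expr, int64_t &Offset) {
      if (!Expr) {
        Offset = 0; // "no expression" counts as offset zero
        return true;
      }
      return Expr->extractIfOffset(Offset);
    }
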
+ static void reparentDebugInfo(Instruction &I, DISubprogram *OrigSP, + DISubprogram *NewSP, + DenseMap &Cache); + unsigned getLine() const; unsigned getCol() const; MDNode *getScope() const; diff --git a/include/llvm/IR/DerivedTypes.h b/include/llvm/IR/DerivedTypes.h index 05e99157b8dc..a92321a44511 100644 --- a/include/llvm/IR/DerivedTypes.h +++ b/include/llvm/IR/DerivedTypes.h @@ -1,4 +1,4 @@ -//===-- llvm/DerivedTypes.h - Classes for handling data types ---*- C++ -*-===// +//===- llvm/DerivedTypes.h - Classes for handling data types ----*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -19,6 +19,7 @@ #define LLVM_IR_DERIVEDTYPES_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringRef.h" #include "llvm/IR/Type.h" #include "llvm/Support/Casting.h" @@ -122,7 +123,8 @@ public: bool isVarArg() const { return getSubclassData()!=0; } Type *getReturnType() const { return ContainedTys[0]; } - typedef Type::subtype_iterator param_iterator; + using param_iterator = Type::subtype_iterator; + param_iterator param_begin() const { return ContainedTys + 1; } param_iterator param_end() const { return &ContainedTys[NumContainedTys]; } ArrayRef params() const { @@ -197,8 +199,7 @@ public: /// generator for a target expects). /// class StructType : public CompositeType { - StructType(LLVMContext &C) - : CompositeType(C, StructTyID), SymbolTableEntry(nullptr) {} + StructType(LLVMContext &C) : CompositeType(C, StructTyID) {} enum { /// This is the contents of the SubClassData field. @@ -212,7 +213,7 @@ class StructType : public CompositeType { /// symbol table entry (maintained by LLVMContext) for the struct. /// This is null if the type is a literal struct or if it is an identified /// type that has an empty name. - void *SymbolTableEntry; + void *SymbolTableEntry = nullptr; public: StructType(const StructType &) = delete; @@ -228,7 +229,14 @@ public: static StructType *create(LLVMContext &Context, ArrayRef Elements, StringRef Name, bool isPacked = false); static StructType *create(LLVMContext &Context, ArrayRef Elements); - static StructType *create(StringRef Name, Type *elt1, ...) LLVM_END_WITH_NULL; + template + static typename std::enable_if::value, + StructType *>::type + create(StringRef Name, Type *elt1, Tys *... elts) { + assert(elt1 && "Cannot create a struct type with no elements with this"); + SmallVector StructFields({elt1, elts...}); + return create(StructFields, Name); + } /// This static method is the primary way to create a literal StructType. static StructType *get(LLVMContext &Context, ArrayRef Elements, @@ -240,7 +248,15 @@ public: /// This static method is a convenience method for creating structure types by /// specifying the elements as arguments. Note that this method always returns /// a non-packed struct, and requires at least one element type. - static StructType *get(Type *elt1, ...) LLVM_END_WITH_NULL; + template + static typename std::enable_if::value, + StructType *>::type + get(Type *elt1, Tys *... elts) { + assert(elt1 && "Cannot create a struct type with no elements with this"); + LLVMContext &Ctx = elt1->getContext(); + SmallVector StructFields({elt1, elts...}); + return llvm::StructType::get(Ctx, StructFields); + } bool isPacked() const { return (getSubclassData() & SCDB_Packed) != 0; } @@ -269,13 +285,21 @@ public: /// Specify a body for an opaque identified type. void setBody(ArrayRef Elements, bool isPacked = false); - void setBody(Type *elt1, ...)
LLVM_END_WITH_NULL; + + template + typename std::enable_if::value, void>::type + setBody(Type *elt1, Tys *... elts) { + assert(elt1 && "Cannot create a struct type with no elements with this"); + SmallVector StructFields({elt1, elts...}); + setBody(StructFields); + } /// Return true if the specified type is valid as an element type. static bool isValidElementType(Type *ElemTy); // Iterator access to the elements. - typedef Type::subtype_iterator element_iterator; + using element_iterator = Type::subtype_iterator; + element_iterator element_begin() const { return ContainedTys; } element_iterator element_end() const { return &ContainedTys[NumContainedTys];} ArrayRef const elements() const { diff --git a/include/llvm/IR/DiagnosticInfo.h b/include/llvm/IR/DiagnosticInfo.h index 458c3cf29b0d..5497652135bd 100644 --- a/include/llvm/IR/DiagnosticInfo.h +++ b/include/llvm/IR/DiagnosticInfo.h @@ -15,7 +15,6 @@ #ifndef LLVM_IR_DIAGNOSTICINFO_H #define LLVM_IR_DIAGNOSTICINFO_H -#include "llvm/ADT/None.h" #include "llvm/ADT/Optional.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" @@ -120,18 +119,18 @@ public: virtual void print(DiagnosticPrinter &DP) const = 0; }; -typedef std::function DiagnosticHandlerFunction; +using DiagnosticHandlerFunction = std::function; /// Diagnostic information for inline asm reporting. /// This is basically a message and an optional location. class DiagnosticInfoInlineAsm : public DiagnosticInfo { private: /// Optional line information. 0 if not set. - unsigned LocCookie; + unsigned LocCookie = 0; /// Message to be reported. const Twine &MsgStr; /// Optional origin of the problem. - const Instruction *Instr; + const Instruction *Instr = nullptr; public: /// \p MsgStr is the message to be reported to the frontend. /// for the whole lifetime of the Diagnostic. DiagnosticInfoInlineAsm(const Twine &MsgStr, DiagnosticSeverity Severity = DS_Error) - : DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(0), MsgStr(MsgStr), - Instr(nullptr) {} + : DiagnosticInfo(DK_InlineAsm, Severity), MsgStr(MsgStr) {} /// \p LocCookie if non-zero gives the line number for this report. /// \p MsgStr gives the message. /// DiagnosticInfoInlineAsm(unsigned LocCookie, const Twine &MsgStr, DiagnosticSeverity Severity = DS_Error) : DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(LocCookie), - MsgStr(MsgStr), Instr(nullptr) {} + MsgStr(MsgStr) {} /// \p Instr gives the original instruction that triggered the diagnostic. /// \p MsgStr gives the message. @@ -294,10 +292,10 @@ public: DiagnosticInfoSampleProfile(StringRef FileName, const Twine &Msg, DiagnosticSeverity Severity = DS_Error) : DiagnosticInfo(DK_SampleProfile, Severity), FileName(FileName), - LineNum(0), Msg(Msg) {} + Msg(Msg) {} DiagnosticInfoSampleProfile(const Twine &Msg, DiagnosticSeverity Severity = DS_Error) - : DiagnosticInfo(DK_SampleProfile, Severity), LineNum(0), Msg(Msg) {} + : DiagnosticInfo(DK_SampleProfile, Severity), Msg(Msg) {} /// \see DiagnosticInfo::print. void print(DiagnosticPrinter &DP) const override; @@ -316,7 +314,7 @@ private: /// Line number where the diagnostic occurred. If 0, no line number will /// be emitted in the message. - unsigned LineNum; + unsigned LineNum = 0; /// Message to report.
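
Not part of the patch: a minimal sketch of the variadic create/setBody replacements for the old LLVM_END_WITH_NULL varargs, shown on the usual self-referential struct; the type name is arbitrary:

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/LLVMContext.h"
    using namespace llvm;

    StructType *makeListNode(LLVMContext &Ctx) {
      // Create the opaque identified type first so it can refer to itself,
      StructType *Node = StructType::create(Ctx, "list_node");
      // then fill in the body; no trailing sentinel nullptr is needed now.
      Node->setBody(Type::getInt32Ty(Ctx), PointerType::getUnqual(Node));
      return Node;
    }
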
const Twine &Msg; @@ -351,8 +349,9 @@ class DiagnosticLocation { StringRef Filename; unsigned Line = 0; unsigned Column = 0; + public: - DiagnosticLocation() {} + DiagnosticLocation() = default; DiagnosticLocation(const DebugLoc &DL); DiagnosticLocation(const DISubprogram *SP); @@ -796,6 +795,7 @@ private: const Twine &Msg) : OptimizationRemarkAnalysis(DK_OptimizationRemarkAnalysisFPCommute, PassName, Fn, Loc, Msg) {} + friend void emitOptimizationRemarkAnalysisFPCommute( LLVMContext &Ctx, const char *PassName, const Function &Fn, const DiagnosticLocation &Loc, const Twine &Msg); @@ -1012,6 +1012,7 @@ public: void print(DiagnosticPrinter &DP) const override; }; + } // end namespace llvm #endif // LLVM_IR_DIAGNOSTICINFO_H diff --git a/include/llvm/IR/Function.h b/include/llvm/IR/Function.h index c12a125b6352..8a2a6ed87eb2 100644 --- a/include/llvm/IR/Function.h +++ b/include/llvm/IR/Function.h @@ -1,4 +1,4 @@ -//===-- llvm/Function.h - Class to represent a single function --*- C++ -*-===// +//===- llvm/Function.h - Class to represent a single function ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -22,15 +22,19 @@ #include "llvm/ADT/ilist_node.h" #include "llvm/ADT/iterator_range.h" #include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/IR/Argument.h" #include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallingConv.h" +#include "llvm/IR/DerivedTypes.h" #include "llvm/IR/GlobalObject.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/SymbolTableListTraits.h" #include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/Compiler.h" #include #include @@ -40,27 +44,31 @@ namespace llvm { -template class Optional; class AssemblyAnnotationWriter; -class FunctionType; -class LLVMContext; +class Constant; class DISubprogram; +class LLVMContext; +class Module; +template class Optional; +class raw_ostream; +class Type; +class User; class Function : public GlobalObject, public ilist_node { public: - typedef SymbolTableList BasicBlockListType; + using BasicBlockListType = SymbolTableList; // BasicBlock iterators... - typedef BasicBlockListType::iterator iterator; - typedef BasicBlockListType::const_iterator const_iterator; + using iterator = BasicBlockListType::iterator; + using const_iterator = BasicBlockListType::const_iterator; - typedef Argument *arg_iterator; - typedef const Argument *const_arg_iterator; + using arg_iterator = Argument *; + using const_arg_iterator = const Argument *; private: // Important things that make up a function! - BasicBlockListType BasicBlocks; ///< The basic blocks - mutable Argument *Arguments; ///< The formal arguments + BasicBlockListType BasicBlocks; ///< The basic blocks + mutable Argument *Arguments = nullptr; ///< The formal arguments size_t NumArgs; std::unique_ptr SymTab; ///< Symbol table of args/instructions @@ -124,10 +132,12 @@ public: // Provide fast operand accessors. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); + /// Returns the FunctionType for me. FunctionType *getFunctionType() const { return cast(getValueType()); } + /// Returns the type of the ret val. Type *getReturnType() const { return getFunctionType()->getReturnType(); } @@ -484,7 +494,7 @@ public: /// copyAttributesFrom - copy all additional attributes (those not needed to /// create a Function) from the Function Src to this one. 
- void copyAttributesFrom(const GlobalValue *Src) override; + void copyAttributesFrom(const Function *Src); /// deleteBody - This method deletes the body of the function, and converts /// the linkage to external. @@ -497,12 +507,12 @@ public: /// removeFromParent - This method unlinks 'this' from the containing module, /// but does not delete it. /// - void removeFromParent() override; + void removeFromParent(); /// eraseFromParent - This method unlinks 'this' from the containing module /// and deletes it. /// - void eraseFromParent() override; + void eraseFromParent(); /// Steal arguments from another function. /// diff --git a/include/llvm/IR/GetElementPtrTypeIterator.h b/include/llvm/IR/GetElementPtrTypeIterator.h index 490bff29cf38..f017a449d33f 100644 --- a/include/llvm/IR/GetElementPtrTypeIterator.h +++ b/include/llvm/IR/GetElementPtrTypeIterator.h @@ -21,7 +21,9 @@ #include "llvm/IR/Operator.h" #include "llvm/IR/User.h" #include "llvm/Support/Casting.h" +#include #include +#include #include namespace llvm { @@ -29,13 +31,13 @@ namespace llvm { template class generic_gep_type_iterator : public std::iterator { - typedef std::iterator super; + using super = std::iterator; ItTy OpIt; PointerUnion CurTy; enum : uint64_t { Unbounded = -1ull }; uint64_t NumElements = Unbounded; + generic_gep_type_iterator() = default; public: @@ -121,7 +123,7 @@ namespace llvm { } }; - typedef generic_gep_type_iterator<> gep_type_iterator; + using gep_type_iterator = generic_gep_type_iterator<>; inline gep_type_iterator gep_type_begin(const User *GEP) { auto *GEPOp = cast(GEP); diff --git a/include/llvm/IR/GlobalAlias.h b/include/llvm/IR/GlobalAlias.h index 37a291dfeb7a..d4bf0d7e1ed4 100644 --- a/include/llvm/IR/GlobalAlias.h +++ b/include/llvm/IR/GlobalAlias.h @@ -59,15 +59,19 @@ public: // Linkage, Type, Parent and AddressSpace taken from the Aliasee. static GlobalAlias *create(const Twine &Name, GlobalValue *Aliasee); + void copyAttributesFrom(const GlobalValue *Src) { + GlobalValue::copyAttributesFrom(Src); + } + /// removeFromParent - This method unlinks 'this' from the containing module, /// but does not delete it. /// - void removeFromParent() override; + void removeFromParent(); /// eraseFromParent - This method unlinks 'this' from the containing module /// and deletes it. /// - void eraseFromParent() override; + void eraseFromParent(); /// These methods retrieve and set alias target. void setAliasee(Constant *Aliasee); diff --git a/include/llvm/IR/GlobalIFunc.h b/include/llvm/IR/GlobalIFunc.h index bfaa9960cb13..d90c7c78ed26 100644 --- a/include/llvm/IR/GlobalIFunc.h +++ b/include/llvm/IR/GlobalIFunc.h @@ -47,12 +47,16 @@ public: LinkageTypes Linkage, const Twine &Name, Constant *Resolver, Module *Parent); + void copyAttributesFrom(const GlobalIFunc *Src) { + GlobalValue::copyAttributesFrom(Src); + } + /// This method unlinks 'this' from the containing module, but does not /// delete it. - void removeFromParent() final; + void removeFromParent(); /// This method unlinks 'this' from the containing module and deletes it. - void eraseFromParent() final; + void eraseFromParent(); /// These methods retrieve and set ifunc resolver function. 
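
For illustration (not from the patch): copyAttributesFrom is now dispatched statically on the concrete class, so typical cloning code passes the exact type. A hedged sketch; the helper is hypothetical:

    #include "llvm/IR/Function.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    // Create a declaration in M that mirrors F's signature and attributes.
    Function *cloneDeclaration(Function &F, Module &M, const Twine &Name) {
      Function *NewF =
          Function::Create(F.getFunctionType(), F.getLinkage(), Name, &M);
      NewF->copyAttributesFrom(&F); // Function overload, no virtual call
      return NewF;
    }
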
void setResolver(Constant *Resolver) { diff --git a/include/llvm/IR/GlobalObject.h b/include/llvm/IR/GlobalObject.h index f3789bafefe3..fc38f698027b 100644 --- a/include/llvm/IR/GlobalObject.h +++ b/include/llvm/IR/GlobalObject.h @@ -150,8 +150,10 @@ public: void addTypeMetadata(unsigned Offset, Metadata *TypeID); - void copyAttributesFrom(const GlobalValue *Src) override; +protected: + void copyAttributesFrom(const GlobalObject *Src); +public: // Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Value *V) { return V->getValueID() == Value::FunctionVal || diff --git a/include/llvm/IR/GlobalValue.h b/include/llvm/IR/GlobalValue.h index bb30fa8be867..0793a1c0ee2e 100644 --- a/include/llvm/IR/GlobalValue.h +++ b/include/llvm/IR/GlobalValue.h @@ -435,14 +435,20 @@ public: bool isWeakForLinker() const { return isWeakForLinker(getLinkage()); } +protected: /// Copy all additional attributes (those not needed to create a GlobalValue) /// from the GlobalValue Src to this one. - virtual void copyAttributesFrom(const GlobalValue *Src); + void copyAttributesFrom(const GlobalValue *Src); - /// If special LLVM prefix that is used to inform the asm printer to not emit - /// usual symbol prefix before the symbol name is used then return linkage - /// name after skipping this special LLVM prefix. - static StringRef getRealLinkageName(StringRef Name) { +public: + /// If the given string begins with the GlobalValue name mangling escape + /// character '\1', drop it. + /// + /// This function applies a specific mangling that is used in PGO profiles, + /// among other things. If you're trying to get a symbol name for an + /// arbitrary GlobalValue, this is not the function you're looking for; see + /// Mangler.h. + static StringRef dropLLVMManglingEscape(StringRef Name) { if (!Name.empty() && Name[0] == '\1') return Name.substr(1); return Name; @@ -530,10 +536,10 @@ public: /// This method unlinks 'this' from the containing module, but does not delete /// it. - virtual void removeFromParent() = 0; + void removeFromParent(); /// This method unlinks 'this' from the containing module and deletes it. - virtual void eraseFromParent() = 0; + void eraseFromParent(); /// Get the module that this global value is contained inside of... Module *getParent() { return Parent; } diff --git a/include/llvm/IR/GlobalVariable.h b/include/llvm/IR/GlobalVariable.h index 3b545d811d44..21d334c8f01d 100644 --- a/include/llvm/IR/GlobalVariable.h +++ b/include/llvm/IR/GlobalVariable.h @@ -24,6 +24,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/ADT/ilist_node.h" #include "llvm/IR/GlobalObject.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Value.h" #include @@ -41,6 +42,7 @@ class DIGlobalVariableExpression; class GlobalVariable : public GlobalObject, public ilist_node { friend class SymbolTableListTraits; + AttributeSet Attrs; bool isConstantGlobal : 1; // Is this a global constant? bool isExternallyInitializedConstant : 1; // Is this a global whose value // can change from its initial @@ -156,17 +158,17 @@ public: /// copyAttributesFrom - copy all additional attributes (those not needed to /// create a GlobalVariable) from the GlobalVariable Src to this one. - void copyAttributesFrom(const GlobalValue *Src) override; + void copyAttributesFrom(const GlobalVariable *Src); /// removeFromParent - This method unlinks 'this' from the containing module, /// but does not delete it. 
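
Illustrative only: dropLLVMManglingEscape strips just the leading '\1' escape byte and leaves every other name alone, e.g.:

    #include "llvm/IR/GlobalValue.h"
    #include <cassert>
    using namespace llvm;

    void demoManglingEscape() {
      // '\1' tells the asm printer not to apply the usual symbol prefix;
      // the helper only drops that escape character.
      assert(GlobalValue::dropLLVMManglingEscape("\1raw") == "raw");
      assert(GlobalValue::dropLLVMManglingEscape("plain") == "plain");
    }
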
/// - void removeFromParent() override; + void removeFromParent(); /// eraseFromParent - This method unlinks 'this' from the containing module /// and deletes it. /// - void eraseFromParent() override; + void eraseFromParent(); /// Drop all references in preparation to destroy the GlobalVariable. This /// drops not only the reference to the initializer but also to any metadata. @@ -178,6 +180,61 @@ public: /// Fill the vector with all debug info attachments. void getDebugInfo(SmallVectorImpl &GVs) const; + /// Add attribute to this global. + void addAttribute(Attribute::AttrKind Kind) { + Attrs = Attrs.addAttribute(getContext(), Kind); + } + + /// Add attribute to this global. + void addAttribute(StringRef Kind, StringRef Val = StringRef()) { + Attrs = Attrs.addAttribute(getContext(), Kind, Val); + } + + /// Return true if the attribute exists. + bool hasAttribute(Attribute::AttrKind Kind) const { + return Attrs.hasAttribute(Kind); + } + + /// Return true if the attribute exists. + bool hasAttribute(StringRef Kind) const { + return Attrs.hasAttribute(Kind); + } + + /// Return true if any attributes exist. + bool hasAttributes() const { + return Attrs.hasAttributes(); + } + + /// Return the attribute object. + Attribute getAttribute(Attribute::AttrKind Kind) const { + return Attrs.getAttribute(Kind); + } + + /// Return the attribute object. + Attribute getAttribute(StringRef Kind) const { + return Attrs.getAttribute(Kind); + } + + /// Return the attribute set for this global. + AttributeSet getAttributes() const { + return Attrs; + } + + /// Return the attribute set as an AttributeList with the given index. + /// FIXME: This may not be required once ValueEnumerators + /// in bitcode-writer can enumerate attribute-set. + AttributeList getAttributesAsList(unsigned index) const { + if (!hasAttributes()) + return AttributeList(); + std::pair<unsigned, AttributeSet> AS[1] = {{index, Attrs}}; + return AttributeList::get(getContext(), AS); + } + + /// Set attribute list for this global. + void setAttributes(AttributeSet A) { + Attrs = A; + } + // Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Value *V) { return V->getValueID() == Value::GlobalVariableVal; diff --git a/include/llvm/IR/IRBuilder.h b/include/llvm/IR/IRBuilder.h index bc689f3b01d7..9d4c13c29f68 100644 --- a/include/llvm/IR/IRBuilder.h +++ b/include/llvm/IR/IRBuilder.h @@ -454,6 +454,45 @@ public: MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); + /// \brief Create a vector fadd reduction intrinsic of the source vector. + /// The first parameter is a scalar accumulator value for ordered reductions. + CallInst *CreateFAddReduce(Value *Acc, Value *Src); + + /// \brief Create a vector fmul reduction intrinsic of the source vector. + /// The first parameter is a scalar accumulator value for ordered reductions. + CallInst *CreateFMulReduce(Value *Acc, Value *Src); + + /// \brief Create a vector int add reduction intrinsic of the source vector. + CallInst *CreateAddReduce(Value *Src); + + /// \brief Create a vector int mul reduction intrinsic of the source vector. + CallInst *CreateMulReduce(Value *Src); + + /// \brief Create a vector int AND reduction intrinsic of the source vector. + CallInst *CreateAndReduce(Value *Src); + + /// \brief Create a vector int OR reduction intrinsic of the source vector. + CallInst *CreateOrReduce(Value *Src); + + /// \brief Create a vector int XOR reduction intrinsic of the source vector.
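
A sketch, not part of the patch, of the new GlobalVariable attribute accessors; the attribute name here is invented for illustration:

    #include "llvm/IR/GlobalVariable.h"
    using namespace llvm;

    void tagGlobal(GlobalVariable &GV) {
      // Attributes accumulate on the immutable AttributeSet the global owns.
      GV.addAttribute("example-kind", "hot");
      if (GV.hasAttribute("example-kind"))
        (void)GV.getAttribute("example-kind").getValueAsString();
    }
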
+ CallInst *CreateXorReduce(Value *Src); + + /// \brief Create a vector integer max reduction intrinsic of the source + /// vector. + CallInst *CreateIntMaxReduce(Value *Src, bool IsSigned = false); + + /// \brief Create a vector integer min reduction intrinsic of the source + /// vector. + CallInst *CreateIntMinReduce(Value *Src, bool IsSigned = false); + + /// \brief Create a vector float max reduction intrinsic of the source + /// vector. + CallInst *CreateFPMaxReduce(Value *Src, bool NoNaN = false); + + /// \brief Create a vector float min reduction intrinsic of the source + /// vector. + CallInst *CreateFPMinReduce(Value *Src, bool NoNaN = false); + /// \brief Create a lifetime.start intrinsic. /// /// If the pointer isn't i8* it will be converted. diff --git a/include/llvm/IR/InstrTypes.h b/include/llvm/IR/InstrTypes.h index d16a5d318d78..61ca90de7393 100644 --- a/include/llvm/IR/InstrTypes.h +++ b/include/llvm/IR/InstrTypes.h @@ -65,27 +65,15 @@ protected: // Out of line virtual method, so the vtable, etc has a home. ~TerminatorInst() override; - /// Virtual methods - Terminators should overload these and provide inline - /// overrides of non-V methods. - virtual BasicBlock *getSuccessorV(unsigned idx) const = 0; - virtual unsigned getNumSuccessorsV() const = 0; - virtual void setSuccessorV(unsigned idx, BasicBlock *B) = 0; - public: /// Return the number of successors that this terminator has. - unsigned getNumSuccessors() const { - return getNumSuccessorsV(); - } + unsigned getNumSuccessors() const; /// Return the specified successor. - BasicBlock *getSuccessor(unsigned idx) const { - return getSuccessorV(idx); - } + BasicBlock *getSuccessor(unsigned idx) const; /// Update the specified successor to point at the provided block. - void setSuccessor(unsigned idx, BasicBlock *B) { - setSuccessorV(idx, B); - } + void setSuccessor(unsigned idx, BasicBlock *B); // Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Instruction *I) { diff --git a/include/llvm/IR/Instruction.h b/include/llvm/IR/Instruction.h index 90c3175122fd..fca29900f4c2 100644 --- a/include/llvm/IR/Instruction.h +++ b/include/llvm/IR/Instruction.h @@ -456,6 +456,12 @@ public: /// higher. bool isAtomic() const; + /// Return true if this atomic instruction loads from memory. + bool hasAtomicLoad() const; + + /// Return true if this atomic instruction stores to memory. + bool hasAtomicStore() const; + /// Return true if this instruction may throw an exception. 
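
Not in the patch: a minimal sketch of the new reduction helpers; Vec is assumed to be a value of some integer vector type:

    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    Value *sumPlusMax(IRBuilder<> &B, Value *Vec) {
      // Each helper emits a call to the matching
      // llvm.experimental.vector.reduce.* intrinsic.
      Value *Sum = B.CreateAddReduce(Vec);
      Value *Max = B.CreateIntMaxReduce(Vec, /*IsSigned=*/true);
      return B.CreateAdd(Sum, Max);
    }
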
bool mayThrow() const; diff --git a/include/llvm/IR/Instructions.h b/include/llvm/IR/Instructions.h index 844a7273eca9..c26701af27ce 100644 --- a/include/llvm/IR/Instructions.h +++ b/include/llvm/IR/Instructions.h @@ -1,4 +1,4 @@ -//===-- llvm/Instructions.h - Instruction subclass definitions --*- C++ -*-===// +//===- llvm/Instructions.h - Instruction subclass definitions ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -17,6 +17,7 @@ #define LLVM_IR_INSTRUCTIONS_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/iterator.h" #include "llvm/ADT/iterator_range.h" #include "llvm/ADT/None.h" #include "llvm/ADT/SmallVector.h" @@ -24,21 +25,25 @@ #include "llvm/ADT/StringRef.h" #include "llvm/ADT/Twine.h" #include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constant.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" #include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/OperandTraits.h" #include "llvm/IR/Type.h" #include "llvm/IR/Use.h" #include "llvm/IR/User.h" +#include "llvm/IR/Value.h" #include "llvm/Support/AtomicOrdering.h" #include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include #include #include +#include namespace llvm { @@ -264,6 +269,7 @@ public: } bool isSimple() const { return !isAtomic() && !isVolatile(); } + bool isUnordered() const { return (getOrdering() == AtomicOrdering::NotAtomic || getOrdering() == AtomicOrdering::Unordered) && @@ -386,6 +392,7 @@ public: } bool isSimple() const { return !isAtomic() && !isVolatile(); } + bool isUnordered() const { return (getOrdering() == AtomicOrdering::NotAtomic || getOrdering() == AtomicOrdering::Unordered) && @@ -836,10 +843,7 @@ class GetElementPtrInst : public Instruction { Type *SourceElementType; Type *ResultElementType; - void anchor() override; - GetElementPtrInst(const GetElementPtrInst &GEPI); - void init(Value *Ptr, ArrayRef IdxList, const Twine &NameStr); /// Constructors - Create a getelementptr instruction with a base pointer and /// a list of indices. The first ctor can optionally insert before an existing @@ -852,6 +856,9 @@ class GetElementPtrInst : public Instruction { ArrayRef IdxList, unsigned Values, const Twine &NameStr, BasicBlock *InsertAtEnd); + void anchor() override; + void init(Value *Ptr, ArrayRef IdxList, const Twine &NameStr); + protected: // Note: Instruction needs to be a friend here to call cloneImpl. friend class Instruction; @@ -2261,6 +2268,19 @@ public: return Mask; } + /// Change values in a shuffle permute mask assuming the two vector operands + /// of length InVecNumElts have swapped position. + static void commuteShuffleMask(MutableArrayRef Mask, + unsigned InVecNumElts) { + for (int &Idx : Mask) { + if (Idx == -1) + continue; + Idx = Idx < (int)InVecNumElts ? Idx + InVecNumElts : Idx - InVecNumElts; + assert(Idx >= 0 && Idx < (int)InVecNumElts * 2 && + "shufflevector mask index out of range"); + } + } + // Methods for support type inquiry through isa, cast, and dyn_cast: static inline bool classof(const Instruction *I) { return I->getOpcode() == Instruction::ShuffleVector; @@ -2288,6 +2308,7 @@ class ExtractValueInst : public UnaryInstruction { SmallVector Indices; ExtractValueInst(const ExtractValueInst &EVI); + /// Constructors - Create an extractvalue instruction with a base aggregate /// value and a list of indices.
The first ctor can optionally insert before /// an existing instruction, the second appends the new instruction to the @@ -2333,7 +2354,8 @@ public: /// Null is returned if the indices are invalid for the specified type. static Type *getIndexedType(Type *Agg, ArrayRef Idxs); - typedef const unsigned* idx_iterator; + using idx_iterator = const unsigned*; + inline idx_iterator idx_begin() const { return Indices.begin(); } inline idx_iterator idx_end() const { return Indices.end(); } inline iterator_range indices() const { @@ -2455,7 +2477,8 @@ public: /// Transparently provide more efficient getOperand methods. DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value); - typedef const unsigned* idx_iterator; + using idx_iterator = const unsigned*; + inline idx_iterator idx_begin() const { return Indices.begin(); } inline idx_iterator idx_end() const { return Indices.end(); } inline iterator_range indices() const { @@ -2606,8 +2629,8 @@ public: // Block iterator interface. This provides access to the list of incoming // basic blocks, which parallels the list of incoming values. - typedef BasicBlock **block_iterator; - typedef BasicBlock * const *const_block_iterator; + using block_iterator = BasicBlock **; + using const_block_iterator = BasicBlock * const *; block_iterator block_begin() { Use::UserRef *ref = @@ -2656,9 +2679,11 @@ public: "All operands to PHI node must be the same type as the PHI node!"); setOperand(i, V); } + static unsigned getOperandNumForIncomingValue(unsigned i) { return i; } + static unsigned getIncomingValueNumForOperand(unsigned i) { return i; } @@ -2937,9 +2962,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); }; template <> @@ -3047,9 +3074,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); }; template <> @@ -3123,7 +3152,7 @@ public: protected: // Expose the switch type we're parameterized with to the iterator. 
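
A worked example (not from the patch) of commuteShuffleMask: with two 4-element operands swapped, indices cross the 4-element boundary and undef (-1) entries stay put:

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    void demoCommute() {
      // Mask selecting {a[0], b[1], a[2], undef} from 4-wide vectors a, b.
      SmallVector<int, 4> Mask = {0, 5, 2, -1};
      ShuffleVectorInst::commuteShuffleMask(Mask, /*InVecNumElts=*/4);
      // With the operands swapped the mask is now {4, 1, 6, -1}.
    }
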
- typedef SwitchInstT SwitchInstType; + using SwitchInstType = SwitchInstT; SwitchInstT *SI; ptrdiff_t Index; @@ -3164,8 +3193,8 @@ public: } }; - typedef CaseHandleImpl - ConstCaseHandle; + using ConstCaseHandle = + CaseHandleImpl; class CaseHandle : public CaseHandleImpl { @@ -3192,7 +3221,7 @@ public: : public iterator_facade_base, std::random_access_iterator_tag, CaseHandleT> { - typedef typename CaseHandleT::SwitchInstType SwitchInstT; + using SwitchInstT = typename CaseHandleT::SwitchInstType; CaseHandleT Case; @@ -3254,8 +3283,8 @@ public: const CaseHandleT &operator*() const { return Case; } }; - typedef CaseIteratorImpl CaseIt; - typedef CaseIteratorImpl ConstCaseIt; + using CaseIt = CaseIteratorImpl; + using ConstCaseIt = CaseIteratorImpl; static SwitchInst *Create(Value *Value, BasicBlock *Default, unsigned NumCases, @@ -3411,9 +3440,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); }; template <> @@ -3516,9 +3547,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); }; template <> @@ -3639,6 +3672,7 @@ public: return new (Values) InvokeInst(Func, IfNormal, IfException, Args, None, Values, NameStr, InsertAtEnd); } + static InvokeInst *Create(Value *Func, BasicBlock *IfNormal, BasicBlock *IfException, ArrayRef Args, ArrayRef Bundles, @@ -3996,9 +4030,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); template bool hasFnAttrImpl(AttrKind Kind) const { if (Attrs.hasAttribute(AttributeList::FunctionIndex, Kind)) @@ -4095,9 +4131,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); }; template <> @@ -4202,13 +4240,14 @@ private: } public: - typedef std::pointer_to_unary_function DerefFnTy; - typedef mapped_iterator handler_iterator; - typedef iterator_range handler_range; - typedef std::pointer_to_unary_function - ConstDerefFnTy; - typedef mapped_iterator const_handler_iterator; - typedef iterator_range const_handler_range; + using DerefFnTy = std::pointer_to_unary_function; + using handler_iterator = mapped_iterator; + using handler_range = iterator_range; + using ConstDerefFnTy = + std::pointer_to_unary_function; + using const_handler_iterator = + mapped_iterator; + using const_handler_range = iterator_range; /// Returns an iterator that points to the first handler in CatchSwitchInst. 
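
For illustration only: the case-handle machinery above is what SwitchInst::cases() hands out, so client code typically looks like this hypothetical helper:

    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Count how many cases of SI branch to Target.
    unsigned countCasesTo(const SwitchInst &SI, const BasicBlock *Target) {
      unsigned N = 0;
      for (const auto &Case : SI.cases())
        if (Case.getCaseSuccessor() == Target)
          ++N;
      return N;
    }
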
handler_iterator handler_begin() { @@ -4278,9 +4317,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned Idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned Idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned Idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned Idx, BasicBlock *B); }; template <> @@ -4443,9 +4484,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned Idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned Idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned Idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned Idx, BasicBlock *B); }; template <> @@ -4531,9 +4574,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned Idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned Idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned Idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned Idx, BasicBlock *B); // Shadow Instruction::setInstructionSubclassData with a private forwarding // method so that subclasses cannot accidentally use it. @@ -4586,9 +4631,11 @@ public: } private: - BasicBlock *getSuccessorV(unsigned idx) const override; - unsigned getNumSuccessorsV() const override; - void setSuccessorV(unsigned idx, BasicBlock *B) override; + friend TerminatorInst; + + BasicBlock *getSuccessorV(unsigned idx) const; + unsigned getNumSuccessorsV() const; + void setSuccessorV(unsigned idx, BasicBlock *B); }; //===----------------------------------------------------------------------===// diff --git a/include/llvm/IR/Intrinsics.td b/include/llvm/IR/Intrinsics.td index 7b78d4d3d34a..19f6045568f4 100644 --- a/include/llvm/IR/Intrinsics.td +++ b/include/llvm/IR/Intrinsics.td @@ -812,6 +812,50 @@ def int_memcpy_element_atomic : Intrinsic<[], [IntrArgMemOnly, NoCapture<0>, NoCapture<1>, WriteOnly<0>, ReadOnly<1>]>; +//===------------------------ Reduction Intrinsics ------------------------===// +// +def int_experimental_vector_reduce_fadd : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyfloat_ty, + llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_fmul : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyfloat_ty, + llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_add : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_mul : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_and : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_or : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_xor : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_smax : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_smin : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_umax : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_umin : Intrinsic<[llvm_anyint_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def int_experimental_vector_reduce_fmax : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; +def 
int_experimental_vector_reduce_fmin : Intrinsic<[llvm_anyfloat_ty], + [llvm_anyvector_ty], + [IntrNoMem]>; + //===----- Intrinsics that are used to provide predicate information -----===// def int_ssa_copy : Intrinsic<[llvm_any_ty], [LLVMMatchType<0>], diff --git a/include/llvm/IR/LLVMContext.h b/include/llvm/IR/LLVMContext.h index d13d5ddaeb3c..ad011fb72e6a 100644 --- a/include/llvm/IR/LLVMContext.h +++ b/include/llvm/IR/LLVMContext.h @@ -1,4 +1,4 @@ -//===-- llvm/LLVMContext.h - Class for managing "global" state --*- C++ -*-===// +//===- llvm/LLVMContext.h - Class for managing "global" state ---*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -37,7 +37,9 @@ class StringRef; class Twine; namespace yaml { + class Output; + } // end namespace yaml /// This is an important class for using LLVM in a threaded context. It @@ -134,17 +136,17 @@ public: void enableDebugTypeODRUniquing(); void disableDebugTypeODRUniquing(); - typedef void (*InlineAsmDiagHandlerTy)(const SMDiagnostic&, void *Context, - unsigned LocCookie); + using InlineAsmDiagHandlerTy = void (*)(const SMDiagnostic&, void *Context, + unsigned LocCookie); /// Defines the type of a diagnostic handler. /// \see LLVMContext::setDiagnosticHandler. /// \see LLVMContext::diagnose. - typedef void (*DiagnosticHandlerTy)(const DiagnosticInfo &DI, void *Context); + using DiagnosticHandlerTy = void (*)(const DiagnosticInfo &DI, void *Context); /// Defines the type of a yield callback. /// \see LLVMContext::setYieldCallback. - typedef void (*YieldCallbackTy)(LLVMContext *Context, void *OpaqueHandle); + using YieldCallbackTy = void (*)(LLVMContext *Context, void *OpaqueHandle); /// setInlineAsmDiagnosticHandler - This method sets a handler that is invoked /// when problems with inline asm are detected by the backend. The first diff --git a/include/llvm/IR/LegacyPassManager.h b/include/llvm/IR/LegacyPassManager.h index 5257a0eed488..9a376a151505 100644 --- a/include/llvm/IR/LegacyPassManager.h +++ b/include/llvm/IR/LegacyPassManager.h @@ -98,6 +98,9 @@ private: // Create wrappers for C Binding types (see CBindingWrapping.h). DEFINE_STDCXX_CONVERSION_FUNCTIONS(legacy::PassManagerBase, LLVMPassManagerRef) +/// If -time-passes has been specified, report the timings immediately and then +/// reset the timers to zero. 
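
A sketch (not in the patch) of the intended use of reportAndResetTimings between pipeline runs; the pass chosen is arbitrary:

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/IR/Module.h"
    #include "llvm/Transforms/Scalar.h"
    using namespace llvm;

    void runOnce(Module &M) {
      legacy::PassManager PM;
      PM.add(createCFGSimplificationPass()); // any pass; illustrative only
      PM.run(M);
      // Under -time-passes, print the timings accumulated so far and zero
      // the timers so the next run reports only its own time.
      reportAndResetTimings();
    }
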
+void reportAndResetTimings(); } // End llvm namespace #endif diff --git a/include/llvm/IR/Module.h b/include/llvm/IR/Module.h index 67c35cd22b34..3024d9e27a2f 100644 --- a/include/llvm/IR/Module.h +++ b/include/llvm/IR/Module.h @@ -1,4 +1,4 @@ -//===-- llvm/Module.h - C++ class to represent a VM module ------*- C++ -*-===// +//===- llvm/Module.h - C++ class to represent a VM module -------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -16,6 +16,10 @@ #define LLVM_IR_MODULE_H #include "llvm/ADT/iterator_range.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/Comdat.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Function.h" @@ -23,20 +27,27 @@ #include "llvm/IR/GlobalIFunc.h" #include "llvm/IR/GlobalVariable.h" #include "llvm/IR/Metadata.h" +#include "llvm/IR/SymbolTableListTraits.h" #include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/CodeGen.h" -#include "llvm/Support/DataTypes.h" +#include "llvm-c/Types.h" +#include +#include +#include +#include +#include +#include namespace llvm { -template class Optional; + class Error; class FunctionType; class GVMaterializer; class LLVMContext; class MemoryBuffer; class RandomNumberGenerator; -class StructType; template class SmallPtrSetImpl; +class StructType; /// A Module instance is used to store all the information related to an /// LLVM module. Modules are the top level container of all other LLVM @@ -54,47 +65,47 @@ class Module { /// @{ public: /// The type for the list of global variables. - typedef SymbolTableList GlobalListType; + using GlobalListType = SymbolTableList; /// The type for the list of functions. - typedef SymbolTableList FunctionListType; + using FunctionListType = SymbolTableList; /// The type for the list of aliases. - typedef SymbolTableList AliasListType; + using AliasListType = SymbolTableList; /// The type for the list of ifuncs. - typedef SymbolTableList IFuncListType; + using IFuncListType = SymbolTableList; /// The type for the list of named metadata. - typedef ilist NamedMDListType; + using NamedMDListType = ilist; /// The type of the comdat "symbol" table. - typedef StringMap ComdatSymTabType; + using ComdatSymTabType = StringMap; /// The Global Variable iterator. - typedef GlobalListType::iterator global_iterator; + using global_iterator = GlobalListType::iterator; /// The Global Variable constant iterator. - typedef GlobalListType::const_iterator const_global_iterator; + using const_global_iterator = GlobalListType::const_iterator; /// The Function iterators. - typedef FunctionListType::iterator iterator; + using iterator = FunctionListType::iterator; /// The Function constant iterator - typedef FunctionListType::const_iterator const_iterator; + using const_iterator = FunctionListType::const_iterator; /// The Function reverse iterator. - typedef FunctionListType::reverse_iterator reverse_iterator; + using reverse_iterator = FunctionListType::reverse_iterator; /// The Function constant reverse iterator. - typedef FunctionListType::const_reverse_iterator const_reverse_iterator; + using const_reverse_iterator = FunctionListType::const_reverse_iterator; /// The Global Alias iterators. - typedef AliasListType::iterator alias_iterator; + using alias_iterator = AliasListType::iterator; /// The Global Alias constant iterator - typedef AliasListType::const_iterator const_alias_iterator; + using const_alias_iterator = AliasListType::const_iterator; /// The Global IFunc iterators. 
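
Illustrative only: the renamed aliases leave the usual range-based Module traversal unchanged, as in this sketch:

    #include "llvm/IR/Module.h"
    using namespace llvm;

    unsigned countDefinitions(Module &M) {
      unsigned N = 0;
      for (Function &F : M)                  // Module::iterator
        if (!F.isDeclaration())
          ++N;
      for (GlobalVariable &GV : M.globals()) // Module::global_iterator
        if (GV.hasInitializer())
          ++N;
      return N;
    }
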
- typedef IFuncListType::iterator ifunc_iterator; + using ifunc_iterator = IFuncListType::iterator; /// The Global IFunc constant iterator - typedef IFuncListType::const_iterator const_ifunc_iterator; + using const_ifunc_iterator = IFuncListType::const_iterator; /// The named metadata iterators. - typedef NamedMDListType::iterator named_metadata_iterator; + using named_metadata_iterator = NamedMDListType::iterator; /// The named metadata constant iterators. - typedef NamedMDListType::const_iterator const_named_metadata_iterator; + using const_named_metadata_iterator = NamedMDListType::const_iterator; /// This enumeration defines the supported behaviors of module flags. enum ModFlagBehavior { @@ -141,6 +152,7 @@ public: ModFlagBehavior Behavior; MDString *Key; Metadata *Val; + ModuleFlagEntry(ModFlagBehavior B, MDString *K, Metadata *V) : Behavior(B), Key(K), Val(V) {} }; @@ -483,9 +495,11 @@ public: const GlobalListType &getGlobalList() const { return GlobalList; } /// Get the Module's list of global variables. GlobalListType &getGlobalList() { return GlobalList; } + static GlobalListType Module::*getSublistAccess(GlobalVariable*) { return &Module::GlobalList; } + /// Get the Module's list of functions (constant). const FunctionListType &getFunctionList() const { return FunctionList; } /// Get the Module's list of functions. @@ -493,31 +507,39 @@ public: static FunctionListType Module::*getSublistAccess(Function*) { return &Module::FunctionList; } + /// Get the Module's list of aliases (constant). const AliasListType &getAliasList() const { return AliasList; } /// Get the Module's list of aliases. AliasListType &getAliasList() { return AliasList; } + static AliasListType Module::*getSublistAccess(GlobalAlias*) { return &Module::AliasList; } + /// Get the Module's list of ifuncs (constant). const IFuncListType &getIFuncList() const { return IFuncList; } /// Get the Module's list of ifuncs. IFuncListType &getIFuncList() { return IFuncList; } + static IFuncListType Module::*getSublistAccess(GlobalIFunc*) { return &Module::IFuncList; } + /// Get the Module's list of named metadata (constant). const NamedMDListType &getNamedMDList() const { return NamedMDList; } /// Get the Module's list of named metadata. NamedMDListType &getNamedMDList() { return NamedMDList; } + static NamedMDListType Module::*getSublistAccess(NamedMDNode*) { return &Module::NamedMDList; } + /// Get the symbol table of global variable and function identifiers const ValueSymbolTable &getValueSymbolTable() const { return *ValSymTab; } /// Get the Module's symbol table of global variable and function identifiers. ValueSymbolTable &getValueSymbolTable() { return *ValSymTab; } + /// Get the Module's symbol table for COMDATs (constant). const ComdatSymTabType &getComdatSymbolTable() const { return ComdatSymTab; } /// Get the Module's symbol table for COMDATs. 
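
Not from the patch: the named-metadata list is normally walked through Module::named_metadata(), e.g. this hypothetical dump helper:

    #include "llvm/IR/Module.h"
    #include "llvm/Support/raw_ostream.h"
    using namespace llvm;

    void listNamedMD(const Module &M) {
      for (const NamedMDNode &NMD : M.named_metadata())
        errs() << NMD.getName() << " has " << NMD.getNumOperands()
               << " operands\n";
    }
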
@@ -602,11 +624,11 @@ public: /// @name Convenience iterators /// @{ - typedef concat_iterator - global_object_iterator; - typedef concat_iterator - const_global_object_iterator; + using global_object_iterator = + concat_iterator; + using const_global_object_iterator = + concat_iterator; iterator_range global_objects() { return concat(functions(), globals()); @@ -627,13 +649,12 @@ public: return global_objects().end(); } - typedef concat_iterator - global_value_iterator; - typedef concat_iterator - const_global_value_iterator; + using global_value_iterator = + concat_iterator; + using const_global_value_iterator = + concat_iterator; iterator_range global_values() { return concat(functions(), globals(), aliases(), ifuncs()); @@ -682,28 +703,35 @@ public: : public std::iterator { NamedMDNode *CUs; unsigned Idx; + void SkipNoDebugCUs(); + public: explicit debug_compile_units_iterator(NamedMDNode *CUs, unsigned Idx) : CUs(CUs), Idx(Idx) { SkipNoDebugCUs(); } + debug_compile_units_iterator &operator++() { ++Idx; SkipNoDebugCUs(); return *this; } + debug_compile_units_iterator operator++(int) { debug_compile_units_iterator T(*this); ++Idx; return T; } + bool operator==(const debug_compile_units_iterator &I) const { return Idx == I.Idx; } + bool operator!=(const debug_compile_units_iterator &I) const { return Idx != I.Idx; } + DICompileUnit *operator*() const; DICompileUnit *operator->() const; }; @@ -833,6 +861,6 @@ inline Module *unwrap(LLVMModuleProviderRef MP) { return reinterpret_cast(MP); } -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_IR_MODULE_H diff --git a/include/llvm/IR/ModuleSummaryIndex.h b/include/llvm/IR/ModuleSummaryIndex.h index 53570bdf16f4..c46c609609e2 100644 --- a/include/llvm/IR/ModuleSummaryIndex.h +++ b/include/llvm/IR/ModuleSummaryIndex.h @@ -1,4 +1,4 @@ -//===-- llvm/ModuleSummaryIndex.h - Module Summary Index --------*- C++ -*-===// +//===- llvm/ModuleSummaryIndex.h - Module Summary Index ---------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -16,21 +16,33 @@ #ifndef LLVM_IR_MODULESUMMARYINDEX_H #define LLVM_IR_MODULESUMMARYINDEX_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/DenseSet.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include "llvm/IR/Module.h" - +#include "llvm/IR/GlobalValue.h" +#include #include +#include +#include +#include +#include +#include +#include +#include +#include namespace llvm { namespace yaml { + template struct MappingTraits; -} + +} // end namespace yaml /// \brief Class to accumulate and hold information about a callee. struct CalleeInfo { @@ -47,7 +59,7 @@ struct CalleeInfo { class GlobalValueSummary; -typedef std::vector> GlobalValueSummaryList; +using GlobalValueSummaryList = std::vector>; struct GlobalValueSummaryInfo { /// The GlobalValue corresponding to this summary. This is only used in @@ -66,19 +78,22 @@ struct GlobalValueSummaryInfo { /// likely incur less overhead, as the value type is not very small and the size /// of the map is unknown, resulting in inefficiencies due to repeated /// insertions and resizing. -typedef std::map - GlobalValueSummaryMapTy; +using GlobalValueSummaryMapTy = + std::map; /// Struct that holds a reference to a particular GUID in a global value /// summary. 
struct ValueInfo { const GlobalValueSummaryMapTy::value_type *Ref = nullptr; + ValueInfo() = default; ValueInfo(const GlobalValueSummaryMapTy::value_type *Ref) : Ref(Ref) {} + operator bool() const { return Ref; } GlobalValue::GUID getGUID() const { return Ref->first; } const GlobalValue *getValue() const { return Ref->second.GV; } + ArrayRef> getSummaryList() const { return Ref->second.SummaryList; } @@ -88,9 +103,11 @@ template <> struct DenseMapInfo { static inline ValueInfo getEmptyKey() { return ValueInfo((GlobalValueSummaryMapTy::value_type *)-1); } + static inline ValueInfo getTombstoneKey() { return ValueInfo((GlobalValueSummaryMapTy::value_type *)-2); } + static bool isEqual(ValueInfo L, ValueInfo R) { return L.Ref == R.Ref; } static unsigned getHashValue(ValueInfo I) { return (uintptr_t)I.Ref; } }; @@ -138,7 +155,7 @@ private: /// This is the hash of the name of the symbol in the original file. It is /// identical to the GUID for global symbols, but differs for local since the /// GUID includes the module level id in the hash. - GlobalValue::GUID OriginalName; + GlobalValue::GUID OriginalName = 0; /// \brief Path of module IR containing value's definition, used to locate /// module during importing. @@ -157,7 +174,7 @@ private: protected: GlobalValueSummary(SummaryKind K, GVFlags Flags, std::vector Refs) - : Kind(K), Flags(Flags), OriginalName(0), RefEdgeList(std::move(Refs)) {} + : Kind(K), Flags(Flags), RefEdgeList(std::move(Refs)) {} public: virtual ~GlobalValueSummary() = default; @@ -242,7 +259,7 @@ public: class FunctionSummary : public GlobalValueSummary { public: /// call edge pair. - typedef std::pair EdgeTy; + using EdgeTy = std::pair; /// An "identifier" for a virtual function. This contains the type identifier /// represented as a GUID and the offset from the address point to the virtual @@ -376,12 +393,15 @@ public: template <> struct DenseMapInfo { static FunctionSummary::VFuncId getEmptyKey() { return {0, uint64_t(-1)}; } + static FunctionSummary::VFuncId getTombstoneKey() { return {0, uint64_t(-2)}; } + static bool isEqual(FunctionSummary::VFuncId L, FunctionSummary::VFuncId R) { return L.GUID == R.GUID && L.Offset == R.Offset; } + static unsigned getHashValue(FunctionSummary::VFuncId I) { return I.GUID; } }; @@ -389,14 +409,17 @@ template <> struct DenseMapInfo { static FunctionSummary::ConstVCall getEmptyKey() { return {{0, uint64_t(-1)}, {}}; } + static FunctionSummary::ConstVCall getTombstoneKey() { return {{0, uint64_t(-2)}, {}}; } + static bool isEqual(FunctionSummary::ConstVCall L, FunctionSummary::ConstVCall R) { return DenseMapInfo::isEqual(L.VFunc, R.VFunc) && L.Args == R.Args; } + static unsigned getHashValue(FunctionSummary::ConstVCall I) { return I.VFunc.GUID; } @@ -477,20 +500,20 @@ struct TypeIdSummary { }; /// 160 bits SHA1 -typedef std::array ModuleHash; +using ModuleHash = std::array; /// Type used for iterating through the global value summary map. -typedef GlobalValueSummaryMapTy::const_iterator const_gvsummary_iterator; -typedef GlobalValueSummaryMapTy::iterator gvsummary_iterator; +using const_gvsummary_iterator = GlobalValueSummaryMapTy::const_iterator; +using gvsummary_iterator = GlobalValueSummaryMapTy::iterator; /// String table to hold/own module path strings, which additionally holds the /// module ID assigned to each module during the plugin step, as well as a hash /// of the module. The StringMap makes a copy of and owns inserted strings. 
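The DenseMapInfo specializations in the ModuleSummaryIndex.h hunk above (for ValueInfo, FunctionSummary::VFuncId, and FunctionSummary::ConstVCall) each implement the same four-function contract that DenseMap requires of a custom key type: two reserved sentinel keys, a hash, and an equality test. A minimal sketch of that pattern, assuming LLVM's ADT headers are available; the FileId key type is hypothetical:

#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseMapInfo.h"
#include <cstdint>

struct FileId {
  uint32_t Id;
  bool operator==(FileId RHS) const { return Id == RHS.Id; }
};

namespace llvm {
template <> struct DenseMapInfo<FileId> {
  // Reserved sentinel keys; real values must never collide with these.
  static FileId getEmptyKey() { return {UINT32_MAX}; }
  static FileId getTombstoneKey() { return {UINT32_MAX - 1}; }
  static unsigned getHashValue(FileId V) { return V.Id * 37u; }
  static bool isEqual(FileId L, FileId R) { return L == R; }
};
} // end namespace llvm

int main() {
  llvm::DenseMap<FileId, const char *> Names;
  Names[FileId{7}] = "a.cpp"; // usable as a key once the traits exist
  return Names.count(FileId{7}) == 1 ? 0 : 1;
}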
-typedef StringMap> ModulePathStringTableTy; +using ModulePathStringTableTy = StringMap>; /// Map of global value GUID to its summary, used to identify values defined in /// a particular module, and provide efficient access to their summary. -typedef std::map GVSummaryMapTy; +using GVSummaryMapTy = std::map; /// Class to hold module path string table and global value map, /// and encapsulate methods for operating on them. @@ -697,6 +720,6 @@ public: StringMap &ModuleToDefinedGVSummaries) const; }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_IR_MODULESUMMARYINDEX_H diff --git a/include/llvm/IR/PassManager.h b/include/llvm/IR/PassManager.h index c845112baa45..d03b7b65f81e 100644 --- a/include/llvm/IR/PassManager.h +++ b/include/llvm/IR/PassManager.h @@ -39,8 +39,8 @@ #define LLVM_IR_PASSMANAGER_H #include "llvm/ADT/DenseMap.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/StringRef.h" #include "llvm/ADT/TinyPtrVector.h" #include "llvm/IR/Function.h" #include "llvm/IR/Module.h" @@ -48,9 +48,15 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/TypeName.h" #include "llvm/Support/raw_ostream.h" -#include "llvm/Support/type_traits.h" +#include +#include +#include +#include #include #include +#include +#include +#include #include namespace llvm { @@ -469,15 +475,16 @@ public: } template void addPass(PassT Pass) { - typedef detail::PassModel - PassModelT; + using PassModelT = + detail::PassModel; + Passes.emplace_back(new PassModelT(std::move(Pass))); } private: - typedef detail::PassConcept - PassConceptT; + using PassConceptT = + detail::PassConcept; std::vector> Passes; @@ -486,12 +493,14 @@ private: }; extern template class PassManager; + /// \brief Convenience typedef for a pass manager over modules. -typedef PassManager ModulePassManager; +using ModulePassManager = PassManager; extern template class PassManager; + /// \brief Convenience typedef for a pass manager over functions. -typedef PassManager FunctionPassManager; +using FunctionPassManager = PassManager; /// \brief A container for analyses that lazily runs them and caches their /// results. @@ -504,11 +513,11 @@ public: private: // Now that we've defined our invalidator, we can define the concept types. - typedef detail::AnalysisResultConcept - ResultConceptT; - typedef detail::AnalysisPassConcept - PassConceptT; + using ResultConceptT = + detail::AnalysisResultConcept; + using PassConceptT = + detail::AnalysisPassConcept; /// \brief List of analysis pass IDs and associated concept pointers. /// @@ -516,18 +525,18 @@ private: /// erases. Provides the analysis ID to enable finding iterators to a given /// entry in maps below, and provides the storage for the actual result /// concept. - typedef std::list>> - AnalysisResultListT; + using AnalysisResultListT = + std::list>>; /// \brief Map type from IRUnitT pointer to our custom list type. - typedef DenseMap AnalysisResultListMapT; + using AnalysisResultListMapT = DenseMap; /// \brief Map type from a pair of analysis ID and IRUnitT pointer to an /// iterator into a particular result list (which is where the actual analysis /// result is stored). - typedef DenseMap, - typename AnalysisResultListT::iterator> - AnalysisResultMapT; + using AnalysisResultMapT = + DenseMap, + typename AnalysisResultListT::iterator>; public: /// API to communicate dependencies between analyses during invalidation. @@ -558,10 +567,10 @@ public: /// dependecies on it will become invalid as a result. 
template bool invalidate(IRUnitT &IR, const PreservedAnalyses &PA) { - typedef detail::AnalysisResultModel - ResultModelT; + using ResultModelT = + detail::AnalysisResultModel; + return invalidateImpl(PassT::ID(), IR, PA); } @@ -672,9 +681,11 @@ public: "This analysis pass was not registered prior to being queried"); ResultConceptT &ResultConcept = getResultImpl(PassT::ID(), IR, ExtraArgs...); - typedef detail::AnalysisResultModel - ResultModelT; + + using ResultModelT = + detail::AnalysisResultModel; + return static_cast(ResultConcept).Result; } @@ -692,9 +703,10 @@ public: if (!ResultConcept) return nullptr; - typedef detail::AnalysisResultModel - ResultModelT; + using ResultModelT = + detail::AnalysisResultModel; + return &static_cast(ResultConcept)->Result; } @@ -717,10 +729,10 @@ public: /// hashtable.) template bool registerPass(PassBuilderT &&PassBuilder) { - typedef decltype(PassBuilder()) PassT; - typedef detail::AnalysisPassModel - PassModelT; + using PassT = decltype(PassBuilder()); + using PassModelT = + detail::AnalysisPassModel; auto &PassPtr = AnalysisPasses[PassT::ID()]; if (PassPtr) @@ -876,7 +888,8 @@ private: } /// \brief Map type from module analysis pass ID to pass concept pointer. - typedef DenseMap> AnalysisPassMapT; + using AnalysisPassMapT = + DenseMap>; /// \brief Collection of module analysis passes, indexed by ID. AnalysisPassMapT AnalysisPasses; @@ -896,12 +909,14 @@ private: }; extern template class AnalysisManager; + /// \brief Convenience typedef for the Module analysis manager. -typedef AnalysisManager ModuleAnalysisManager; +using ModuleAnalysisManager = AnalysisManager; extern template class AnalysisManager; + /// \brief Convenience typedef for the Function analysis manager. -typedef AnalysisManager FunctionAnalysisManager; +using FunctionAnalysisManager = AnalysisManager; /// \brief An analysis over an "outer" IR unit that provides access to an /// analysis manager over an "inner" IR unit. The inner unit must be contained @@ -927,20 +942,14 @@ public: class Result { public: explicit Result(AnalysisManagerT &InnerAM) : InnerAM(&InnerAM) {} + Result(Result &&Arg) : InnerAM(std::move(Arg.InnerAM)) { // We have to null out the analysis manager in the moved-from state // because we are taking ownership of the responsibilty to clear the // analysis state. Arg.InnerAM = nullptr; } - Result &operator=(Result &&RHS) { - InnerAM = RHS.InnerAM; - // We have to null out the analysis manager in the moved-from state - // because we are taking ownership of the responsibilty to clear the - // analysis state. - RHS.InnerAM = nullptr; - return *this; - } + ~Result() { // InnerAM is cleared in a moved from state where there is nothing to do. if (!InnerAM) @@ -951,6 +960,15 @@ public: InnerAM->clear(); } + Result &operator=(Result &&RHS) { + InnerAM = RHS.InnerAM; + // We have to null out the analysis manager in the moved-from state + // because we are taking ownership of the responsibilty to clear the + // analysis state. + RHS.InnerAM = nullptr; + return *this; + } + /// \brief Accessor for the analysis manager. AnalysisManagerT &getManager() { return *InnerAM; } @@ -988,6 +1006,7 @@ public: private: friend AnalysisInfoMixin< InnerAnalysisManagerProxy>; + static AnalysisKey Key; AnalysisManagerT *InnerAM; @@ -998,8 +1017,8 @@ AnalysisKey InnerAnalysisManagerProxy::Key; /// Provide the \c FunctionAnalysisManager to \c Module proxy. 
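The PassManager.h churn above is type-alias modernization around an unchanged design: addPass wraps a concrete pass in a PassModel, and analyses are registered and queried by ID. A usage sketch under assumed LLVM headers; TrivialPass is a hypothetical pass, not one added by this patch:

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"

using namespace llvm;

// PassInfoMixin supplies the boilerplate the pass concept expects.
struct TrivialPass : PassInfoMixin<TrivialPass> {
  PreservedAnalyses run(Function &F, FunctionAnalysisManager &) {
    return PreservedAnalyses::all(); // touches nothing, keeps all analyses
  }
};

void buildPipeline() {
  FunctionPassManager FPM;
  FPM.addPass(TrivialPass()); // stored via the PassModel wrapper shown above
}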
-typedef InnerAnalysisManagerProxy - FunctionAnalysisManagerModuleProxy; +using FunctionAnalysisManagerModuleProxy = + InnerAnalysisManagerProxy; /// Specialization of the invalidate method for the \c /// FunctionAnalysisManagerModuleProxy's result. @@ -1097,6 +1116,7 @@ public: private: friend AnalysisInfoMixin< OuterAnalysisManagerProxy>; + static AnalysisKey Key; const AnalysisManagerT *AM; @@ -1109,8 +1129,8 @@ AnalysisKey extern template class OuterAnalysisManagerProxy; /// Provide the \c ModuleAnalysisManager to \c Function proxy. -typedef OuterAnalysisManagerProxy - ModuleAnalysisManagerFunctionProxy; +using ModuleAnalysisManagerFunctionProxy = + OuterAnalysisManagerProxy; /// \brief Trivial adaptor that maps from a module to its functions. /// @@ -1274,6 +1294,6 @@ RepeatedPass createRepeatedPass(int Count, PassT P) { return RepeatedPass(Count, std::move(P)); } -} +} // end namespace llvm -#endif +#endif // LLVM_IR_PASSMANAGER_H diff --git a/include/llvm/IR/PassManagerInternal.h b/include/llvm/IR/PassManagerInternal.h index 387dc4c65c43..9195d4dfa428 100644 --- a/include/llvm/IR/PassManagerInternal.h +++ b/include/llvm/IR/PassManagerInternal.h @@ -27,7 +27,6 @@ namespace llvm { template class AllAnalysesOn; template class AnalysisManager; -class Invalidator; class PreservedAnalyses; /// \brief Implementation details of the pass manager interfaces. @@ -116,7 +115,7 @@ struct AnalysisResultConcept { /// \brief SFINAE metafunction for computing whether \c ResultT provides an /// \c invalidate member function. template class ResultHasInvalidateMethod { - typedef char EnabledType; + using EnabledType = char; struct DisabledType { char a, b; }; @@ -124,7 +123,7 @@ template class ResultHasInvalidateMethod { // Purely to help out MSVC which fails to disable the below specialization, // explicitly enable using the result type's invalidate routine if we can // successfully call that routine. - template struct Nonce { typedef EnabledType Type; }; + template struct Nonce { using Type = EnabledType; }; template static typename Nonce().invalidate( std::declval(), std::declval()))>::Type @@ -280,9 +279,9 @@ struct AnalysisPassModel : AnalysisPassConcept - ResultModelT; + using ResultModelT = + AnalysisResultModel; /// \brief The model delegates to the \c PassT::run method. /// diff --git a/include/llvm/IR/PatternMatch.h b/include/llvm/IR/PatternMatch.h index 31a76b4ed6c3..6b2b22e82b95 100644 --- a/include/llvm/IR/PatternMatch.h +++ b/include/llvm/IR/PatternMatch.h @@ -29,11 +29,19 @@ #ifndef LLVM_IR_PATTERNMATCH_H #define LLVM_IR_PATTERNMATCH_H +#include "llvm/ADT/APFloat.h" +#include "llvm/ADT/APInt.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include namespace llvm { namespace PatternMatch { @@ -172,7 +180,9 @@ inline match_nan m_NaN() { return match_nan(); } struct apint_match { const APInt *&Res; + apint_match(const APInt *&R) : Res(R) {} + template bool match(ITy *V) { if (auto *CI = dyn_cast(V)) { Res = &CI->getValue(); @@ -230,7 +240,9 @@ template struct cst_pred_ty : public Predicate { /// satisfy a specified predicate, and bind them to an APInt. 
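The PatternMatch.h structs being touched here (apint_match, bind_ty, specificval_ty, and friends) are the building blocks behind the m_* combinators. A short sketch of how they compose, assuming LLVM headers; isAddOfConstant is an illustrative helper, not part of the patch:

#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"

// True iff V computes "X + C"; binds X via bind_ty and C via apint_match.
static bool isAddOfConstant(llvm::Value *V, llvm::Value *&X,
                            const llvm::APInt *&C) {
  using namespace llvm::PatternMatch;
  return match(V, m_Add(m_Value(X), m_APInt(C)));
}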
template struct api_pred_ty : public Predicate { const APInt *&Res; + api_pred_ty(const APInt *&R) : Res(R) {} + template bool match(ITy *V) { if (const auto *CI = dyn_cast(V)) if (this->isValue(CI->getValue())) { @@ -294,6 +306,7 @@ inline api_pred_ty m_MaxSignedValue(const APInt *&V) { return template struct bind_ty { Class *&VR; + bind_ty(Class *&V) : VR(V) {} template bool match(ITy *V) { @@ -326,6 +339,7 @@ inline bind_ty m_ConstantFP(ConstantFP *&C) { return C; } /// \brief Match a specified Value*. struct specificval_ty { const Value *Val; + specificval_ty(const Value *V) : Val(V) {} template bool match(ITy *V) { return V == Val; } @@ -338,6 +352,7 @@ inline specificval_ty m_Specific(const Value *V) { return V; } /// that value. struct specific_fpval { double Val; + specific_fpval(double V) : Val(V) {} template bool match(ITy *V) { @@ -360,6 +375,7 @@ inline specific_fpval m_FPOne() { return m_SpecificFP(1.0); } struct bind_const_intval_ty { uint64_t &VR; + bind_const_intval_ty(uint64_t &V) : VR(V) {} template bool match(ITy *V) { @@ -376,6 +392,7 @@ struct bind_const_intval_ty { // value. struct specific_intval { uint64_t Val; + specific_intval(uint64_t V) : Val(V) {} template bool match(ITy *V) { @@ -939,6 +956,7 @@ template inline fneg_match m_FNeg(const LHS &L) { struct br_match { BasicBlock *&Succ; + br_match(BasicBlock *&Succ) : Succ(Succ) {} template bool match(OpTy *V) { @@ -956,6 +974,7 @@ inline br_match m_UnconditionalBr(BasicBlock *&Succ) { return br_match(Succ); } template struct brc_match { Cond_t Cond; BasicBlock *&T, *&F; + brc_match(const Cond_t &C, BasicBlock *&t, BasicBlock *&f) : Cond(C), T(t), F(f) {} @@ -1202,6 +1221,7 @@ m_UnordFMin(const LHS &L, const RHS &R) { template struct Argument_match { unsigned OpI; Opnd_t Val; + Argument_match(unsigned OpIdx, const Opnd_t &V) : OpI(OpIdx), Val(V) {} template bool match(OpTy *V) { @@ -1219,6 +1239,7 @@ inline Argument_match m_Argument(const Opnd_t &Op) { /// \brief Intrinsic matchers. struct IntrinsicID_match { unsigned ID; + IntrinsicID_match(Intrinsic::ID IntrID) : ID(IntrID) {} template bool match(OpTy *V) { @@ -1239,21 +1260,23 @@ template struct m_Intrinsic_Ty; template struct m_Intrinsic_Ty { - typedef match_combine_and> Ty; + using Ty = match_combine_and>; }; template struct m_Intrinsic_Ty { - typedef match_combine_and::Ty, Argument_match> - Ty; + using Ty = + match_combine_and::Ty, Argument_match>; }; template struct m_Intrinsic_Ty { - typedef match_combine_and::Ty, - Argument_match> Ty; + using Ty = + match_combine_and::Ty, + Argument_match>; }; template struct m_Intrinsic_Ty { - typedef match_combine_and::Ty, - Argument_match> Ty; + using Ty = + match_combine_and::Ty, + Argument_match>; }; /// \brief Match intrinsic calls like this: @@ -1437,4 +1460,4 @@ m_c_UMax(const LHS &L, const RHS &R) { } // end namespace PatternMatch } // end namespace llvm -#endif +#endif // LLVM_IR_PATTERNMATCH_H diff --git a/include/llvm/IR/ProfileSummary.h b/include/llvm/IR/ProfileSummary.h index f4248014c6e1..d85ce8c443ec 100644 --- a/include/llvm/IR/ProfileSummary.h +++ b/include/llvm/IR/ProfileSummary.h @@ -1,4 +1,4 @@ -//===-- ProfileSummary.h - Profile summary data structure. ------*- C++ -*-===// +//===- ProfileSummary.h - Profile summary data structure. 
-------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -11,21 +11,17 @@ // //===----------------------------------------------------------------------===// -#ifndef LLVM_SUPPORT_PROFILE_SUMMARY_H -#define LLVM_SUPPORT_PROFILE_SUMMARY_H +#ifndef LLVM_IR_PROFILESUMMARY_H +#define LLVM_IR_PROFILESUMMARY_H +#include #include -#include #include -#include "llvm/Support/Casting.h" - namespace llvm { class LLVMContext; class Metadata; -class MDTuple; -class MDNode; // The profile summary is one or more (Cutoff, MinCount, NumCounts) triplets. // The semantics of counts depend on the type of profile. For instrumentation @@ -37,12 +33,13 @@ struct ProfileSummaryEntry { uint32_t Cutoff; ///< The required percentile of counts. uint64_t MinCount; ///< The minimum count for this percentile. uint64_t NumCounts; ///< Number of counts >= the minimum count. + ProfileSummaryEntry(uint32_t TheCutoff, uint64_t TheMinCount, uint64_t TheNumCounts) : Cutoff(TheCutoff), MinCount(TheMinCount), NumCounts(TheNumCounts) {} }; -typedef std::vector SummaryEntryVector; +using SummaryEntryVector = std::vector; class ProfileSummary { public: @@ -59,6 +56,7 @@ private: public: static const int Scale = 1000000; + ProfileSummary(Kind K, SummaryEntryVector DetailedSummary, uint64_t TotalCount, uint64_t MaxCount, uint64_t MaxInternalCount, uint64_t MaxFunctionCount, @@ -67,6 +65,7 @@ public: TotalCount(TotalCount), MaxCount(MaxCount), MaxInternalCount(MaxInternalCount), MaxFunctionCount(MaxFunctionCount), NumCounts(NumCounts), NumFunctions(NumFunctions) {} + Kind getKind() const { return PSK; } /// \brief Return summary information as metadata. Metadata *getMD(LLVMContext &Context); @@ -82,4 +81,5 @@ public: }; } // end namespace llvm -#endif + +#endif // LLVM_IR_PROFILESUMMARY_H diff --git a/include/llvm/IR/Statepoint.h b/include/llvm/IR/Statepoint.h index 03151cd7c8f7..f01607614a0c 100644 --- a/include/llvm/IR/Statepoint.h +++ b/include/llvm/IR/Statepoint.h @@ -1,4 +1,4 @@ -//===-- llvm/IR/Statepoint.h - gc.statepoint utilities ----------*- C++ -*-===// +//===- llvm/IR/Statepoint.h - gc.statepoint utilities -----------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -24,10 +24,12 @@ #include "llvm/IR/CallSite.h" #include "llvm/IR/Constants.h" #include "llvm/IR/Function.h" +#include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/Support/Casting.h" +#include "llvm/Support/MathExtras.h" #include #include #include @@ -87,7 +89,7 @@ protected: } public: - typedef typename CallSiteTy::arg_iterator arg_iterator; + using arg_iterator = typename CallSiteTy::arg_iterator; enum { IDPos = 0, @@ -300,8 +302,9 @@ public: class ImmutableStatepoint : public StatepointBase { - typedef StatepointBase Base; + using Base = + StatepointBase; public: explicit ImmutableStatepoint(const Instruction *I) : Base(I) {} @@ -312,7 +315,7 @@ public: /// to a gc.statepoint. 
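The classof overloads added in the surrounding Statepoint.h hunks are the hooks that LLVM's isa<>/dyn_cast<> machinery dispatches through. A self-contained sketch of the idiom, assuming llvm/Support/Casting.h is available; the Node/Leaf hierarchy is hypothetical:

#include "llvm/Support/Casting.h"
#include <cassert>

struct Node {
  enum Kind { K_Leaf, K_Other };
  Kind K;
  explicit Node(Kind K) : K(K) {}
};

struct Leaf : Node {
  Leaf() : Node(K_Leaf) {}
  // isa<Leaf>(N) and dyn_cast<Leaf>(N) bottom out in this predicate.
  static bool classof(const Node *N) { return N->K == K_Leaf; }
};

int main() {
  Leaf L;
  Node *N = &L;
  assert(llvm::isa<Leaf>(N));
  if (Leaf *LP = llvm::dyn_cast<Leaf>(N)) // non-null iff classof holds
    (void)LP;
  return 0;
}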
class Statepoint : public StatepointBase { - typedef StatepointBase Base; + using Base = StatepointBase; public: explicit Statepoint(Instruction *I) : Base(I) {} @@ -327,6 +330,7 @@ public: return I->getIntrinsicID() == Intrinsic::experimental_gc_relocate || I->getIntrinsicID() == Intrinsic::experimental_gc_result; } + static inline bool classof(const Value *V) { return isa(V) && classof(cast(V)); } @@ -369,6 +373,7 @@ public: static inline bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::experimental_gc_relocate; } + static inline bool classof(const Value *V) { return isa(V) && classof(cast(V)); } @@ -403,6 +408,7 @@ public: static inline bool classof(const IntrinsicInst *I) { return I->getIntrinsicID() == Intrinsic::experimental_gc_result; } + static inline bool classof(const Value *V) { return isa(V) && classof(cast(V)); } diff --git a/include/llvm/IR/SymbolTableListTraits.h b/include/llvm/IR/SymbolTableListTraits.h index 49a5fb21297d..87ce902c2811 100644 --- a/include/llvm/IR/SymbolTableListTraits.h +++ b/include/llvm/IR/SymbolTableListTraits.h @@ -48,7 +48,7 @@ class ValueSymbolTable; template struct SymbolTableListParentType {}; #define DEFINE_SYMBOL_TABLE_PARENT_TYPE(NODE, PARENT) \ - template <> struct SymbolTableListParentType { typedef PARENT type; }; + template <> struct SymbolTableListParentType { using type = PARENT; }; DEFINE_SYMBOL_TABLE_PARENT_TYPE(Instruction, BasicBlock) DEFINE_SYMBOL_TABLE_PARENT_TYPE(BasicBlock, Function) DEFINE_SYMBOL_TABLE_PARENT_TYPE(Argument, Function) @@ -65,10 +65,10 @@ template class SymbolTableList; // template class SymbolTableListTraits : public ilist_alloc_traits { - typedef SymbolTableList ListTy; - typedef typename simple_ilist::iterator iterator; - typedef - typename SymbolTableListParentType::type ItemParentClass; + using ListTy = SymbolTableList; + using iterator = typename simple_ilist::iterator; + using ItemParentClass = + typename SymbolTableListParentType::type; public: SymbolTableListTraits() = default; diff --git a/include/llvm/IR/TrackingMDRef.h b/include/llvm/IR/TrackingMDRef.h index 12b196432006..bdec904ad1e1 100644 --- a/include/llvm/IR/TrackingMDRef.h +++ b/include/llvm/IR/TrackingMDRef.h @@ -139,31 +139,35 @@ public: bool hasTrivialDestructor() const { return Ref.hasTrivialDestructor(); } }; -typedef TypedTrackingMDRef TrackingMDNodeRef; -typedef TypedTrackingMDRef TrackingValueAsMetadataRef; +using TrackingMDNodeRef = TypedTrackingMDRef; +using TrackingValueAsMetadataRef = TypedTrackingMDRef; // Expose the underlying metadata to casting. 
template <> struct simplify_type { - typedef Metadata *SimpleType; + using SimpleType = Metadata *; + static SimpleType getSimplifiedValue(TrackingMDRef &MD) { return MD.get(); } }; template <> struct simplify_type { - typedef Metadata *SimpleType; + using SimpleType = Metadata *; + static SimpleType getSimplifiedValue(const TrackingMDRef &MD) { return MD.get(); } }; template struct simplify_type> { - typedef T *SimpleType; + using SimpleType = T *; + static SimpleType getSimplifiedValue(TypedTrackingMDRef &MD) { return MD.get(); } }; template struct simplify_type> { - typedef T *SimpleType; + using SimpleType = T *; + static SimpleType getSimplifiedValue(const TypedTrackingMDRef &MD) { return MD.get(); } diff --git a/include/llvm/IR/Type.h b/include/llvm/IR/Type.h index e6a0df937e9b..82362107e41e 100644 --- a/include/llvm/IR/Type.h +++ b/include/llvm/IR/Type.h @@ -1,4 +1,4 @@ -//===-- llvm/Type.h - Classes for handling data types -----------*- C++ -*-===// +//===- llvm/Type.h - Classes for handling data types ------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -18,21 +18,22 @@ #include "llvm/ADT/APFloat.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/SmallPtrSet.h" -#include "llvm/Support/CBindingWrapping.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/DataTypes.h" +#include "llvm/Support/CBindingWrapping.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/ErrorHandling.h" +#include +#include +#include namespace llvm { -class PointerType; +template struct GraphTraits; class IntegerType; -class raw_ostream; -class Module; class LLVMContext; -class LLVMContextImpl; +class PointerType; +class raw_ostream; class StringRef; -template struct GraphTraits; /// The instances of the Type class are immutable: once they are created, /// they are never changed. Also note that only one instance of a particular @@ -86,9 +87,9 @@ private: protected: friend class LLVMContextImpl; + explicit Type(LLVMContext &C, TypeID tid) - : Context(C), ID(tid), SubclassData(0), - NumContainedTys(0), ContainedTys(nullptr) {} + : Context(C), ID(tid), SubclassData(0) {} ~Type() = default; unsigned getSubclassData() const { return SubclassData; } @@ -100,14 +101,14 @@ protected: } /// Keeps track of how many Type*'s there are in the ContainedTys list. - unsigned NumContainedTys; + unsigned NumContainedTys = 0; /// A pointer to the array of Types contained by this Type. For example, this /// includes the arguments of a function type, the elements of a structure, /// the pointee of a pointer, the element type of an array, etc. This pointer /// may be 0 for types that don't contain other types (Integer, Double, /// Float). - Type * const *ContainedTys; + Type * const *ContainedTys = nullptr; static bool isSequentialType(TypeID TyID) { return TyID == ArrayTyID || TyID == VectorTyID; @@ -122,6 +123,7 @@ public: /// inlined with the operands when printing an instruction. void print(raw_ostream &O, bool IsForDebug = false, bool NoDetails = false) const; + void dump() const; /// Return the LLVMContext in which this type was uniqued. @@ -299,14 +301,16 @@ public: //===--------------------------------------------------------------------===// // Type Iteration support. 
// - typedef Type * const *subtype_iterator; + using subtype_iterator = Type * const *; + subtype_iterator subtype_begin() const { return ContainedTys; } subtype_iterator subtype_end() const { return &ContainedTys[NumContainedTys];} ArrayRef subtypes() const { return makeArrayRef(subtype_begin(), subtype_end()); } - typedef std::reverse_iterator subtype_reverse_iterator; + using subtype_reverse_iterator = std::reverse_iterator; + subtype_reverse_iterator subtype_rbegin() const { return subtype_reverse_iterator(subtype_end()); } @@ -348,6 +352,7 @@ public: } inline uint64_t getArrayNumElements() const; + Type *getArrayElementType() const { assert(getTypeID() == ArrayTyID); return ContainedTys[0]; @@ -444,8 +449,8 @@ template <> struct isa_impl { // graph of sub types. template <> struct GraphTraits { - typedef Type *NodeRef; - typedef Type::subtype_iterator ChildIteratorType; + using NodeRef = Type *; + using ChildIteratorType = Type::subtype_iterator; static NodeRef getEntryNode(Type *T) { return T; } static ChildIteratorType child_begin(NodeRef N) { return N->subtype_begin(); } @@ -453,8 +458,8 @@ template <> struct GraphTraits { }; template <> struct GraphTraits { - typedef const Type *NodeRef; - typedef Type::subtype_iterator ChildIteratorType; + using NodeRef = const Type *; + using ChildIteratorType = Type::subtype_iterator; static NodeRef getEntryNode(NodeRef T) { return T; } static ChildIteratorType child_begin(NodeRef N) { return N->subtype_begin(); } @@ -474,6 +479,6 @@ inline LLVMTypeRef *wrap(Type **Tys) { return reinterpret_cast(const_cast(Tys)); } -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_IR_TYPE_H diff --git a/include/llvm/IR/TypeFinder.h b/include/llvm/IR/TypeFinder.h index 48c4f1161aa1..c050c388d398 100644 --- a/include/llvm/IR/TypeFinder.h +++ b/include/llvm/IR/TypeFinder.h @@ -44,8 +44,8 @@ public: void run(const Module &M, bool onlyNamed); void clear(); - typedef std::vector::iterator iterator; - typedef std::vector::const_iterator const_iterator; + using iterator = std::vector::iterator; + using const_iterator = std::vector::const_iterator; iterator begin() { return StructTypes.begin(); } iterator end() { return StructTypes.end(); } diff --git a/include/llvm/IR/Use.h b/include/llvm/IR/Use.h index 6b56546f4421..d3a59d8a060e 100644 --- a/include/llvm/IR/Use.h +++ b/include/llvm/IR/Use.h @@ -1,4 +1,4 @@ -//===-- llvm/Use.h - Definition of the Use class ----------------*- C++ -*-===// +//===- llvm/Use.h - Definition of the Use class -----------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -27,14 +27,14 @@ #include "llvm/ADT/PointerIntPair.h" #include "llvm/Support/CBindingWrapping.h" +#include "llvm/Support/Compiler.h" #include "llvm-c/Types.h" namespace llvm { -class Value; -class User; -class Use; template struct simplify_type; +class User; +class Value; /// \brief A Use represents the edge between a Value definition and its users. /// @@ -65,23 +65,27 @@ public: /// use the LSB regardless of pointer alignment on different targets. struct UserRefPointerTraits { static inline void *getAsVoidPointer(User *P) { return P; } + static inline User *getFromVoidPointer(void *P) { return (User *)P; } + enum { NumLowBitsAvailable = 1 }; }; // A type for the word following an array of hung-off Uses in memory, which is // a pointer back to their User with the bottom bit set. - typedef PointerIntPair UserRef; + using UserRef = PointerIntPair; /// Pointer traits for the Prev PointerIntPair. 
This ensures we always use /// the two LSBs regardless of pointer alignment on different targets. struct PrevPointerTraits { static inline void *getAsVoidPointer(Use **P) { return P; } + static inline Use **getFromVoidPointer(void *P) { return (Use **)P; } + enum { NumLowBitsAvailable = 2 }; }; @@ -95,9 +99,11 @@ private: enum PrevPtrTag { zeroDigitTag, oneDigitTag, stopTag, fullStopTag }; /// Constructor - Use(PrevPtrTag tag) : Val(nullptr) { Prev.setInt(tag); } + Use(PrevPtrTag tag) { Prev.setInt(tag); } public: + friend class Value; + operator Value *() const { return Val; } Value *get() const { return Val; } @@ -133,7 +139,7 @@ public: private: const Use *getImpliedUser() const LLVM_READONLY; - Value *Val; + Value *Val = nullptr; Use *Next; PointerIntPair Prev; @@ -153,18 +159,18 @@ private: if (Next) Next->setPrev(StrippedPrev); } - - friend class Value; }; /// \brief Allow clients to treat uses just like values when using /// casting operators. template <> struct simplify_type { - typedef Value *SimpleType; + using SimpleType = Value *; + static SimpleType getSimplifiedValue(Use &Val) { return Val.get(); } }; template <> struct simplify_type { - typedef /*const*/ Value *SimpleType; + using SimpleType = /*const*/ Value *; + static SimpleType getSimplifiedValue(const Use &Val) { return Val.get(); } }; diff --git a/include/llvm/IR/UseListOrder.h b/include/llvm/IR/UseListOrder.h index ebe99223facd..a8b394fc6302 100644 --- a/include/llvm/IR/UseListOrder.h +++ b/include/llvm/IR/UseListOrder.h @@ -37,7 +37,7 @@ struct UseListOrder { UseListOrder &operator=(UseListOrder &&) = default; }; -typedef std::vector UseListOrderStack; +using UseListOrderStack = std::vector; } // end namespace llvm diff --git a/include/llvm/IR/User.h b/include/llvm/IR/User.h index 54758a9b6d6a..7b9d451aaf53 100644 --- a/include/llvm/IR/User.h +++ b/include/llvm/IR/User.h @@ -1,4 +1,4 @@ -//===-- llvm/User.h - User class definition ---------------------*- C++ -*-===// +//===- llvm/User.h - User class definition ----------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -114,6 +114,7 @@ protected: ? OperandTraits::op_end(const_cast(that))[Idx] : OperandTraits::op_begin(const_cast(that))[Idx]; } + template Use &Op() { return OpFrom(this); } @@ -205,10 +206,10 @@ public: // --------------------------------------------------------------------------- // Operand Iterator interface... // - typedef Use* op_iterator; - typedef const Use* const_op_iterator; - typedef iterator_range op_range; - typedef iterator_range const_op_range; + using op_iterator = Use*; + using const_op_iterator = const Use*; + using op_range = iterator_range; + using const_op_range = iterator_range; op_iterator op_begin() { return getOperandList(); } const_op_iterator op_begin() const { return getOperandList(); } @@ -252,6 +253,7 @@ public: ptrdiff_t, const Value *, const Value *> { explicit const_value_op_iterator(const Use *U = nullptr) : iterator_adaptor_base(U) {} + const Value *operator*() const { return *I; } const Value *operator->() const { return operator*(); } }; @@ -290,6 +292,7 @@ public: return isa(V) || isa(V); } }; + // Either Use objects, or a Use pointer can be prepended to User. 
static_assert(alignof(Use) >= alignof(User), "Alignment is insufficient after objects prepended to User"); @@ -297,13 +300,15 @@ static_assert(alignof(Use *) >= alignof(User), "Alignment is insufficient after objects prepended to User"); template<> struct simplify_type { - typedef Value* SimpleType; + using SimpleType = Value*; + static SimpleType getSimplifiedValue(User::op_iterator &Val) { return Val->get(); } }; template<> struct simplify_type { - typedef /*const*/ Value* SimpleType; + using SimpleType = /*const*/ Value*; + static SimpleType getSimplifiedValue(User::const_op_iterator &Val) { return Val->get(); } diff --git a/include/llvm/IR/Value.h b/include/llvm/IR/Value.h index 00f821399257..96a370dcc35f 100644 --- a/include/llvm/IR/Value.h +++ b/include/llvm/IR/Value.h @@ -1,4 +1,4 @@ -//===-- llvm/Value.h - Definition of the Value class ------------*- C++ -*-===// +//===- llvm/Value.h - Definition of the Value class -------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -44,12 +44,12 @@ class LLVMContext; class Module; class ModuleSlotTracker; class raw_ostream; +template class StringMapEntry; class StringRef; class Twine; class Type; -template class StringMapEntry; -typedef StringMapEntry ValueName; +using ValueName = StringMapEntry; //===----------------------------------------------------------------------===// // Value Class @@ -120,9 +120,11 @@ private: template // UseT == 'Use' or 'const Use' class use_iterator_impl : public std::iterator { + friend class Value; + UseT *U; + explicit use_iterator_impl(UseT *u) : U(u) {} - friend class Value; public: use_iterator_impl() : U() {} @@ -309,8 +311,9 @@ public: return UseList == nullptr; } - typedef use_iterator_impl use_iterator; - typedef use_iterator_impl const_use_iterator; + using use_iterator = use_iterator_impl; + using const_use_iterator = use_iterator_impl; + use_iterator materialized_use_begin() { return use_iterator(UseList); } const_use_iterator materialized_use_begin() const { return const_use_iterator(UseList); @@ -345,8 +348,9 @@ public: return UseList == nullptr; } - typedef user_iterator_impl user_iterator; - typedef user_iterator_impl const_user_iterator; + using user_iterator = user_iterator_impl; + using const_user_iterator = user_iterator_impl; + user_iterator materialized_user_begin() { return user_iterator(UseList); } const_user_iterator materialized_user_begin() const { return const_user_iterator(UseList); @@ -560,7 +564,6 @@ public: /// block. const Value *DoPHITranslation(const BasicBlock *CurBB, const BasicBlock *PredBB) const; - Value *DoPHITranslation(const BasicBlock *CurBB, const BasicBlock *PredBB) { return const_cast( static_cast(this)->DoPHITranslation(CurBB, PredBB)); @@ -606,7 +609,7 @@ private: Use *Merged; Use **Next = &Merged; - for (;;) { + while (true) { if (!L) { *Next = R; break; diff --git a/include/llvm/IR/ValueHandle.h b/include/llvm/IR/ValueHandle.h index 393618d5511b..b45cc7b6dc02 100644 --- a/include/llvm/IR/ValueHandle.h +++ b/include/llvm/IR/ValueHandle.h @@ -17,10 +17,10 @@ #include "llvm/ADT/DenseMapInfo.h" #include "llvm/ADT/PointerIntPair.h" #include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include namespace llvm { -class ValueHandleBase; -template struct simplify_type; /// \brief This is the common base class of value handles. /// @@ -29,6 +29,7 @@ template struct simplify_type; /// below for details. class ValueHandleBase { friend class Value; + protected: /// \brief This indicates what sub class the handle actually is. 
/// @@ -40,24 +41,23 @@ protected: : ValueHandleBase(RHS.PrevPair.getInt(), RHS) {} ValueHandleBase(HandleBaseKind Kind, const ValueHandleBase &RHS) - : PrevPair(nullptr, Kind), Next(nullptr), Val(RHS.getValPtr()) { + : PrevPair(nullptr, Kind), Val(RHS.getValPtr()) { if (isValid(getValPtr())) AddToExistingUseList(RHS.getPrevPtr()); } private: PointerIntPair PrevPair; - ValueHandleBase *Next; - - Value *Val; + ValueHandleBase *Next = nullptr; + Value *Val = nullptr; void setValPtr(Value *V) { Val = V; } public: explicit ValueHandleBase(HandleBaseKind Kind) - : PrevPair(nullptr, Kind), Next(nullptr), Val(nullptr) {} + : PrevPair(nullptr, Kind) {} ValueHandleBase(HandleBaseKind Kind, Value *V) - : PrevPair(nullptr, Kind), Next(nullptr), Val(V) { + : PrevPair(nullptr, Kind), Val(V) { if (isValid(getValPtr())) AddToUseList(); } @@ -162,11 +162,13 @@ public: // Specialize simplify_type to allow WeakVH to participate in // dyn_cast, isa, etc. template <> struct simplify_type { - typedef Value *SimpleType; + using SimpleType = Value *; + static SimpleType getSimplifiedValue(WeakVH &WVH) { return WVH; } }; template <> struct simplify_type { - typedef Value *SimpleType; + using SimpleType = Value *; + static SimpleType getSimplifiedValue(const WeakVH &WVH) { return WVH; } }; @@ -205,11 +207,13 @@ public: // Specialize simplify_type to allow WeakTrackingVH to participate in // dyn_cast, isa, etc. template <> struct simplify_type { - typedef Value *SimpleType; + using SimpleType = Value *; + static SimpleType getSimplifiedValue(WeakTrackingVH &WVH) { return WVH; } }; template <> struct simplify_type { - typedef Value *SimpleType; + using SimpleType = Value *; + static SimpleType getSimplifiedValue(const WeakTrackingVH &WVH) { return WVH; } @@ -236,7 +240,7 @@ class AssertingVH : public ValueHandleBase #endif { - friend struct DenseMapInfo >; + friend struct DenseMapInfo>; #ifndef NDEBUG Value *getRawValPtr() const { return ValueHandleBase::getValPtr(); } @@ -282,20 +286,23 @@ public: // Specialize DenseMapInfo to allow AssertingVH to participate in DenseMap. 
template -struct DenseMapInfo > { +struct DenseMapInfo> { static inline AssertingVH getEmptyKey() { AssertingVH Res; Res.setRawValPtr(DenseMapInfo::getEmptyKey()); return Res; } + static inline AssertingVH getTombstoneKey() { AssertingVH Res; Res.setRawValPtr(DenseMapInfo::getTombstoneKey()); return Res; } + static unsigned getHashValue(const AssertingVH &Val) { return DenseMapInfo::getHashValue(Val.getRawValPtr()); } + static bool isEqual(const AssertingVH &LHS, const AssertingVH &RHS) { return DenseMapInfo::isEqual(LHS.getRawValPtr(), RHS.getRawValPtr()); @@ -303,7 +310,7 @@ struct DenseMapInfo > { }; template -struct isPodLike > { +struct isPodLike> { #ifdef NDEBUG static const bool value = true; #else @@ -356,7 +363,7 @@ public: static Value *GetAsValue(const Value *V) { return const_cast(V); } public: - TrackingVH() {} + TrackingVH() = default; TrackingVH(ValueTy *P) { setValPtr(P); } operator ValueTy*() const { @@ -495,10 +502,12 @@ public: PoisoningVH(ValueTy *P) : CallbackVH(GetAsValue(P)) {} PoisoningVH(const PoisoningVH &RHS) : CallbackVH(RHS), Poisoned(RHS.Poisoned) {} + ~PoisoningVH() { if (Poisoned) clearValPtr(); } + PoisoningVH &operator=(const PoisoningVH &RHS) { if (Poisoned) clearValPtr(); @@ -523,14 +532,17 @@ template struct DenseMapInfo> { Res.setRawValPtr(DenseMapInfo::getEmptyKey()); return Res; } + static inline PoisoningVH getTombstoneKey() { PoisoningVH Res; Res.setRawValPtr(DenseMapInfo::getTombstoneKey()); return Res; } + static unsigned getHashValue(const PoisoningVH &Val) { return DenseMapInfo::getHashValue(Val.getRawValPtr()); } + static bool isEqual(const PoisoningVH &LHS, const PoisoningVH &RHS) { return DenseMapInfo::isEqual(LHS.getRawValPtr(), RHS.getRawValPtr()); @@ -545,6 +557,6 @@ template struct isPodLike> { #endif }; -} // End llvm namespace +} // end namespace llvm -#endif +#endif // LLVM_IR_VALUEHANDLE_H diff --git a/include/llvm/IR/ValueMap.h b/include/llvm/IR/ValueMap.h index 9648e1989f94..11d5823ee479 100644 --- a/include/llvm/IR/ValueMap.h +++ b/include/llvm/IR/ValueMap.h @@ -46,7 +46,6 @@ namespace llvm { template class ValueMapCallbackVH; - template class ValueMapIterator; template @@ -57,7 +56,7 @@ class ValueMapConstIterator; /// as possible with future versions of ValueMap. template struct ValueMapConfig { - typedef MutexT mutex_type; + using mutex_type = MutexT; /// If FollowRAUW is true, the ValueMap will update mappings on RAUW. If it's /// false, the ValueMap will leave the original mapping in place. 
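ValueMapConfig's FollowRAUW flag, documented just above, controls whether a ValueMap entry tracks its key through replaceAllUsesWith. A sketch of the default behavior (FollowRAUW is true), assuming LLVM headers and libraries; the module and globals are illustrative:

#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/ValueMap.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("m", Ctx);
  auto *A = new GlobalVariable(M, Type::getInt32Ty(Ctx), false,
                               GlobalValue::ExternalLinkage, nullptr, "a");
  auto *B = new GlobalVariable(M, Type::getInt32Ty(Ctx), false,
                               GlobalValue::ExternalLinkage, nullptr, "b");

  ValueMap<Value *, int> VM;
  VM[A] = 1;
  A->replaceAllUsesWith(B); // the callback value handle re-keys the entry to B
  return VM.count(B) == 1 ? 0 : 1;
}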
@@ -87,21 +86,21 @@ template> class ValueMap { friend class ValueMapCallbackVH; - typedef ValueMapCallbackVH ValueMapCVH; - typedef DenseMap> MapT; - typedef DenseMap MDMapT; - typedef typename Config::ExtraData ExtraData; + using ValueMapCVH = ValueMapCallbackVH; + using MapT = DenseMap>; + using MDMapT = DenseMap; + using ExtraData = typename Config::ExtraData; + MapT Map; Optional MDMap; ExtraData Data; - bool MayMapMetadata = true; public: - typedef KeyT key_type; - typedef ValueT mapped_type; - typedef std::pair value_type; - typedef unsigned size_type; + using key_type = KeyT; + using mapped_type = ValueT; + using value_type = std::pair; + using size_type = unsigned; explicit ValueMap(unsigned NumInitBuckets = 64) : Map(NumInitBuckets), Data() {} @@ -132,8 +131,9 @@ public: return Where->second.get(); } - typedef ValueMapIterator iterator; - typedef ValueMapConstIterator const_iterator; + using iterator = ValueMapIterator; + using const_iterator = ValueMapConstIterator; + inline iterator begin() { return iterator(Map.begin()); } inline iterator end() { return iterator(Map.end()); } inline const_iterator begin() const { return const_iterator(Map.begin()); } @@ -244,8 +244,8 @@ class ValueMapCallbackVH final : public CallbackVH { friend class ValueMap; friend struct DenseMapInfo; - typedef ValueMap ValueMapT; - typedef typename std::remove_pointer::type KeySansPointerT; + using ValueMapT = ValueMap; + using KeySansPointerT = typename std::remove_pointer::type; ValueMapT *Map; @@ -298,7 +298,7 @@ public: template struct DenseMapInfo> { - typedef ValueMapCallbackVH VH; + using VH = ValueMapCallbackVH; static inline VH getEmptyKey() { return VH(DenseMapInfo::getEmptyKey()); @@ -330,8 +330,8 @@ class ValueMapIterator : public std::iterator, ptrdiff_t> { - typedef typename DenseMapT::iterator BaseT; - typedef typename DenseMapT::mapped_type ValueT; + using BaseT = typename DenseMapT::iterator; + using ValueT = typename DenseMapT::mapped_type; BaseT I; @@ -344,7 +344,9 @@ public: struct ValueTypeProxy { const KeyT first; ValueT& second; + ValueTypeProxy *operator->() { return this; } + operator std::pair() const { return std::make_pair(first, second); } @@ -380,8 +382,8 @@ class ValueMapConstIterator : public std::iterator, ptrdiff_t> { - typedef typename DenseMapT::const_iterator BaseT; - typedef typename DenseMapT::mapped_type ValueT; + using BaseT = typename DenseMapT::const_iterator; + using ValueT = typename DenseMapT::mapped_type; BaseT I; diff --git a/include/llvm/IR/ValueSymbolTable.h b/include/llvm/IR/ValueSymbolTable.h index 9e86751dae6f..26cbbfabfc0c 100644 --- a/include/llvm/IR/ValueSymbolTable.h +++ b/include/llvm/IR/ValueSymbolTable.h @@ -49,13 +49,13 @@ class ValueSymbolTable { /// @{ public: /// @brief A mapping of names to values. - typedef StringMap ValueMap; + using ValueMap = StringMap; /// @brief An iterator over a ValueMap. - typedef ValueMap::iterator iterator; + using iterator = ValueMap::iterator; /// @brief A const_iterator over a ValueMap. 
- typedef ValueMap::const_iterator const_iterator; + using const_iterator = ValueMap::const_iterator; /// @} /// @name Constructors diff --git a/include/llvm/IR/Verifier.h b/include/llvm/IR/Verifier.h index 71f727c3d4fc..15e52d9e0742 100644 --- a/include/llvm/IR/Verifier.h +++ b/include/llvm/IR/Verifier.h @@ -21,13 +21,17 @@ #ifndef LLVM_IR_VERIFIER_H #define LLVM_IR_VERIFIER_H +#include "llvm/ADT/DenseMap.h" #include "llvm/IR/PassManager.h" +#include namespace llvm { +class APInt; class Function; class FunctionPass; -class ModulePass; +class Instruction; +class MDNode; class Module; class raw_ostream; struct VerifierSupport; @@ -47,7 +51,7 @@ class TBAAVerifier { /// the offset of the access. If zero, only a zero offset is allowed. /// /// \c BitWidth has no meaning if \c IsInvalid is true. - typedef std::pair TBAABaseNodeSummary; + using TBAABaseNodeSummary = std::pair; DenseMap TBAABaseNodes; /// Maps an alleged scalar TBAA node to a boolean that is true if the said @@ -101,12 +105,14 @@ FunctionPass *createVerifierPass(bool FatalErrors = true); /// and debug info errors. class VerifierAnalysis : public AnalysisInfoMixin { friend AnalysisInfoMixin; + static AnalysisKey Key; public: struct Result { bool IRBroken, DebugInfoBroken; }; + Result run(Module &M, ModuleAnalysisManager &); Result run(Function &F, FunctionAnalysisManager &); }; @@ -136,7 +142,6 @@ public: PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM); }; +} // end namespace llvm -} // End llvm namespace - -#endif +#endif // LLVM_IR_VERIFIER_H diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index 44ff4c1a581b..cf314e19d1ca 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -130,6 +130,7 @@ void initializeEfficiencySanitizerPass(PassRegistry&); void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&); void initializeExpandISelPseudosPass(PassRegistry&); void initializeExpandPostRAPass(PassRegistry&); +void initializeExpandReductionsPass(PassRegistry&); void initializeExternalAAWrapperPassPass(PassRegistry&); void initializeFEntryInserterPass(PassRegistry&); void initializeFinalizeMachineBundlesPass(PassRegistry&); @@ -186,6 +187,7 @@ void initializeLintPass(PassRegistry&); void initializeLiveDebugValuesPass(PassRegistry&); void initializeLiveDebugVariablesPass(PassRegistry&); void initializeLiveIntervalsPass(PassRegistry&); +void initializeLiveRangeShrinkPass(PassRegistry&); void initializeLiveRegMatrixPass(PassRegistry&); void initializeLiveStacksPass(PassRegistry&); void initializeLiveVariablesPass(PassRegistry&); @@ -319,11 +321,12 @@ void initializeSCCPLegacyPassPass(PassRegistry&); void initializeSCEVAAWrapperPassPass(PassRegistry&); void initializeSLPVectorizerPass(PassRegistry&); void initializeSROALegacyPassPass(PassRegistry&); -void initializeSafeStackPass(PassRegistry&); +void initializeSafeStackLegacyPassPass(PassRegistry&); void initializeSampleProfileLoaderLegacyPassPass(PassRegistry&); void initializeSanitizerCoverageModulePass(PassRegistry&); void initializeScalarEvolutionWrapperPassPass(PassRegistry&); void initializeScalarizerPass(PassRegistry&); +void initializeScalarizeMaskedMemIntrinPass(PassRegistry&); void initializeScopedNoAliasAAWrapperPassPass(PassRegistry&); void initializeSeparateConstOffsetFromGEPPass(PassRegistry&); void initializeShadowStackGCLoweringPass(PassRegistry&); diff --git a/include/llvm/LibDriver/LibDriver.h b/include/llvm/LibDriver/LibDriver.h deleted file mode 100644 index 
95feb60be403..000000000000 --- a/include/llvm/LibDriver/LibDriver.h +++ /dev/null @@ -1,24 +0,0 @@ -//===- llvm/LibDriver/LibDriver.h - lib.exe-compatible driver ---*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Defines an interface to a lib.exe-compatible driver that also understands -// bitcode files. Used by llvm-lib and lld-link /lib. -// -//===----------------------------------------------------------------------===// - -#ifndef LLVM_LIBDRIVER_LIBDRIVER_H -#define LLVM_LIBDRIVER_LIBDRIVER_H - -namespace llvm { -template class ArrayRef; - -int libDriverMain(ArrayRef ARgs); -} - -#endif diff --git a/include/llvm/LinkAllPasses.h b/include/llvm/LinkAllPasses.h index 39a86e838bde..5c398b2ab567 100644 --- a/include/llvm/LinkAllPasses.h +++ b/include/llvm/LinkAllPasses.h @@ -206,6 +206,7 @@ namespace { (void) llvm::createMemDerefPrinter(); (void) llvm::createFloat2IntPass(); (void) llvm::createEliminateAvailableExternallyPass(); + (void) llvm::createScalarizeMaskedMemIntrinPass(); (void)new llvm::IntervalPartition(); (void)new llvm::ScalarEvolutionWrapperPass(); diff --git a/include/llvm/Object/Wasm.h b/include/llvm/Object/Wasm.h index 4bc39d98b7af..d200d4a148e3 100644 --- a/include/llvm/Object/Wasm.h +++ b/include/llvm/Object/Wasm.h @@ -67,7 +67,8 @@ public: WasmObjectFile(MemoryBufferRef Object, Error &Err); const wasm::WasmObjectHeader &getHeader() const; - const WasmSymbol &getWasmSymbol(DataRefImpl Symb) const; + const WasmSymbol &getWasmSymbol(const DataRefImpl &Symb) const; + const WasmSymbol &getWasmSymbol(const SymbolRef &Symbol) const; const WasmSection &getWasmSection(const SectionRef &Section) const; const wasm::WasmRelocation &getWasmRelocation(const RelocationRef& Ref) const; @@ -81,6 +82,10 @@ public: const std::vector& globals() const { return Globals; } const std::vector& exports() const { return Exports; } + uint32_t getNumberOfSymbols() const { + return Symbols.size(); + } + const std::vector& elements() const { return ElemSegments; } diff --git a/include/llvm/ObjectYAML/WasmYAML.h b/include/llvm/ObjectYAML/WasmYAML.h index bd7d72be4dbc..7b70c9537827 100644 --- a/include/llvm/ObjectYAML/WasmYAML.h +++ b/include/llvm/ObjectYAML/WasmYAML.h @@ -34,17 +34,6 @@ struct FileHeader { yaml::Hex32 Version; }; -struct Import { - StringRef Module; - StringRef Field; - ExportKind Kind; - union { - uint32_t SigIndex; - ValueType GlobalType; - }; - bool GlobalMutable; -}; - struct Limits { yaml::Hex32 Flags; yaml::Hex32 Initial; @@ -74,6 +63,18 @@ struct Global { wasm::WasmInitExpr InitExpr; }; +struct Import { + StringRef Module; + StringRef Field; + ExportKind Kind; + union { + uint32_t SigIndex; + Global GlobalImport; + Table TableImport; + Limits Memory; + }; +}; + struct LocalDecl { ValueType Type; uint32_t Count; diff --git a/include/llvm/ProfileData/SampleProfWriter.h b/include/llvm/ProfileData/SampleProfWriter.h index 9d69af32dd46..86af1038d74e 100644 --- a/include/llvm/ProfileData/SampleProfWriter.h +++ b/include/llvm/ProfileData/SampleProfWriter.h @@ -43,16 +43,7 @@ public: /// Write all the sample profiles in the given map of samples. /// /// \returns status code of the file update operation. 
- std::error_code write(const StringMap &ProfileMap) { - if (std::error_code EC = writeHeader(ProfileMap)) - return EC; - for (const auto &I : ProfileMap) { - const FunctionSamples &Profile = I.second; - if (std::error_code EC = write(Profile)) - return EC; - } - return sampleprof_error::success; - } + std::error_code write(const StringMap &ProfileMap); raw_ostream &getOutputStream() { return *OutputStream; } diff --git a/include/llvm/Support/BinaryStreamArray.h b/include/llvm/Support/BinaryStreamArray.h index bad31cd38d6a..77c99ffff919 100644 --- a/include/llvm/Support/BinaryStreamArray.h +++ b/include/llvm/Support/BinaryStreamArray.h @@ -139,6 +139,7 @@ public: } uint32_t offset() const { return AbsOffset; } + uint32_t getRecordLength() const { return ThisLen; } private: void moveToEnd() { @@ -294,6 +295,8 @@ template class FixedStreamArray { friend class FixedStreamArrayIterator; public: + typedef FixedStreamArrayIterator Iterator; + FixedStreamArray() = default; explicit FixedStreamArray(BinaryStreamRef Stream) : Stream(Stream) { assert(Stream.getLength() % sizeof(T) == 0); @@ -371,7 +374,7 @@ public: } FixedStreamArrayIterator &operator-=(std::ptrdiff_t N) { - assert(Index >= N); + assert(std::ptrdiff_t(Index) >= N); Index -= N; return *this; } diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h index a56bc93e111b..be9e46540016 100644 --- a/include/llvm/Support/Compiler.h +++ b/include/llvm/Support/Compiler.h @@ -111,12 +111,6 @@ #define LLVM_PREFETCH(addr, rw, locality) #endif -#if __has_attribute(sentinel) || LLVM_GNUC_PREREQ(3, 0, 0) -#define LLVM_END_WITH_NULL __attribute__((sentinel)) -#else -#define LLVM_END_WITH_NULL -#endif - #if __has_attribute(used) || LLVM_GNUC_PREREQ(3, 1, 0) #define LLVM_ATTRIBUTE_USED __attribute__((__used__)) #else @@ -233,6 +227,8 @@ /// LLVM_FALLTHROUGH - Mark fallthrough cases in switch statements. #if __cplusplus > 201402L && __has_cpp_attribute(fallthrough) #define LLVM_FALLTHROUGH [[fallthrough]] +#elif __has_cpp_attribute(gnu::fallthrough) +#define LLVM_FALLTHROUGH [[gnu::fallthrough]] #elif !__cplusplus // Workaround for llvm.org/PR23435, since clang 3.6 and below emit a spurious // error when __has_cpp_attribute is given a scoped attribute in C mode. diff --git a/include/llvm/Support/KnownBits.h b/include/llvm/Support/KnownBits.h index 3d38cf878538..2c77d40559b9 100644 --- a/include/llvm/Support/KnownBits.h +++ b/include/llvm/Support/KnownBits.h @@ -133,6 +133,66 @@ public: KnownBits zextOrTrunc(unsigned BitWidth) { return KnownBits(Zero.zextOrTrunc(BitWidth), One.zextOrTrunc(BitWidth)); } + + /// Returns the minimum number of trailing zero bits. + unsigned countMinTrailingZeros() const { + return Zero.countTrailingOnes(); + } + + /// Returns the minimum number of trailing one bits. + unsigned countMinTrailingOnes() const { + return One.countTrailingOnes(); + } + + /// Returns the minimum number of leading zero bits. + unsigned countMinLeadingZeros() const { + return Zero.countLeadingOnes(); + } + + /// Returns the minimum number of leading one bits. + unsigned countMinLeadingOnes() const { + return One.countLeadingOnes(); + } + + /// Returns the number of times the sign bit is replicated into the other + /// bits. + unsigned countMinSignBits() const { + if (isNonNegative()) + return countMinLeadingZeros(); + if (isNegative()) + return countMinLeadingOnes(); + return 0; + } + + /// Returns the maximum number of trailing zero bits possible. 
+ unsigned countMaxTrailingZeros() const { + return One.countTrailingZeros(); + } + + /// Returns the maximum number of trailing one bits possible. + unsigned countMaxTrailingOnes() const { + return Zero.countTrailingZeros(); + } + + /// Returns the maximum number of leading zero bits possible. + unsigned countMaxLeadingZeros() const { + return One.countLeadingZeros(); + } + + /// Returns the maximum number of leading one bits possible. + unsigned countMaxLeadingOnes() const { + return Zero.countLeadingZeros(); + } + + /// Returns the number of bits known to be one. + unsigned countMinPopulation() const { + return One.countPopulation(); + } + + /// Returns the maximum number of bits that could be one. + unsigned countMaxPopulation() const { + return getBitWidth() - Zero.countPopulation(); + } }; } // end namespace llvm diff --git a/include/llvm/Support/Parallel.h b/include/llvm/Support/Parallel.h new file mode 100644 index 000000000000..e36e0cc29e14 --- /dev/null +++ b/include/llvm/Support/Parallel.h @@ -0,0 +1,249 @@ +//===- llvm/Support/Parallel.h - Parallel algorithms ----------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_SUPPORT_PARALLEL_H +#define LLVM_SUPPORT_PARALLEL_H + +#include "llvm/ADT/STLExtras.h" +#include "llvm/Config/llvm-config.h" +#include "llvm/Support/MathExtras.h" + +#include +#include +#include +#include + +#if defined(_MSC_VER) && LLVM_ENABLE_THREADS +#pragma warning(push) +#pragma warning(disable : 4530) +#include +#include +#pragma warning(pop) +#endif + +namespace llvm { + +namespace parallel { +struct sequential_execution_policy {}; +struct parallel_execution_policy {}; + +template +struct is_execution_policy + : public std::integral_constant< + bool, llvm::is_one_of::value> {}; + +constexpr sequential_execution_policy seq{}; +constexpr parallel_execution_policy par{}; + +namespace detail { + +#if LLVM_ENABLE_THREADS + +class Latch { + uint32_t Count; + mutable std::mutex Mutex; + mutable std::condition_variable Cond; + +public: + explicit Latch(uint32_t Count = 0) : Count(Count) {} + ~Latch() { sync(); } + + void inc() { + std::unique_lock lock(Mutex); + ++Count; + } + + void dec() { + std::unique_lock lock(Mutex); + if (--Count == 0) + Cond.notify_all(); + } + + void sync() const { + std::unique_lock lock(Mutex); + Cond.wait(lock, [&] { return Count == 0; }); + } +}; + +class TaskGroup { + Latch L; + +public: + void spawn(std::function f); + + void sync() const { L.sync(); } +}; + +#if defined(_MSC_VER) +template +void parallel_sort(RandomAccessIterator Start, RandomAccessIterator End, + const Comparator &Comp) { + concurrency::parallel_sort(Start, End, Comp); +} +template +void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) { + concurrency::parallel_for_each(Begin, End, Fn); +} + +template +void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) { + concurrency::parallel_for(Begin, End, Fn); +} + +#else +const ptrdiff_t MinParallelSize = 1024; + +/// \brief Inclusive median. +template +RandomAccessIterator medianOf3(RandomAccessIterator Start, + RandomAccessIterator End, + const Comparator &Comp) { + RandomAccessIterator Mid = Start + (std::distance(Start, End) / 2); + return Comp(*Start, *(End - 1)) + ? (Comp(*Mid, *(End - 1)) ? (Comp(*Start, *Mid) ? Mid : Start) + : End - 1) + : (Comp(*Mid, *Start) ? 
(Comp(*(End - 1), *Mid) ? Mid : End - 1) + : Start); +} + +template +void parallel_quick_sort(RandomAccessIterator Start, RandomAccessIterator End, + const Comparator &Comp, TaskGroup &TG, size_t Depth) { + // Do a sequential sort for small inputs. + if (std::distance(Start, End) < detail::MinParallelSize || Depth == 0) { + std::sort(Start, End, Comp); + return; + } + + // Partition. + auto Pivot = medianOf3(Start, End, Comp); + // Move Pivot to End. + std::swap(*(End - 1), *Pivot); + Pivot = std::partition(Start, End - 1, [&Comp, End](decltype(*Start) V) { + return Comp(V, *(End - 1)); + }); + // Move Pivot to middle of partition. + std::swap(*Pivot, *(End - 1)); + + // Recurse. + TG.spawn([=, &Comp, &TG] { + parallel_quick_sort(Start, Pivot, Comp, TG, Depth - 1); + }); + parallel_quick_sort(Pivot + 1, End, Comp, TG, Depth - 1); +} + +template +void parallel_sort(RandomAccessIterator Start, RandomAccessIterator End, + const Comparator &Comp) { + TaskGroup TG; + parallel_quick_sort(Start, End, Comp, TG, + llvm::Log2_64(std::distance(Start, End)) + 1); +} + +template +void parallel_for_each(IterTy Begin, IterTy End, FuncTy Fn) { + // TaskGroup has a relatively high overhead, so we want to reduce + // the number of spawn() calls. We'll create up to 1024 tasks here. + // (Note that 1024 is an arbitrary number. This code probably needs + // improving to take the number of available cores into account.) + ptrdiff_t TaskSize = std::distance(Begin, End) / 1024; + if (TaskSize == 0) + TaskSize = 1; + + TaskGroup TG; + while (TaskSize <= std::distance(Begin, End)) { + TG.spawn([=, &Fn] { std::for_each(Begin, Begin + TaskSize, Fn); }); + Begin += TaskSize; + } + TG.spawn([=, &Fn] { std::for_each(Begin, End, Fn); }); +} + +template +void parallel_for_each_n(IndexTy Begin, IndexTy End, FuncTy Fn) { + ptrdiff_t TaskSize = (End - Begin) / 1024; + if (TaskSize == 0) + TaskSize = 1; + + TaskGroup TG; + IndexTy I = Begin; + for (; I + TaskSize < End; I += TaskSize) { + TG.spawn([=, &Fn] { + for (IndexTy J = I, E = I + TaskSize; J != E; ++J) + Fn(J); + }); + } + TG.spawn([=, &Fn] { + for (IndexTy J = I; J < End; ++J) + Fn(J); + }); +} + +#endif + +#endif + +template +using DefComparator = + std::less::value_type>; + +} // namespace detail + +// sequential algorithm implementations. +template > +void sort(Policy policy, RandomAccessIterator Start, RandomAccessIterator End, + const Comparator &Comp = Comparator()) { + static_assert(is_execution_policy::value, + "Invalid execution policy!"); + std::sort(Start, End, Comp); +} + +template +void for_each(Policy policy, IterTy Begin, IterTy End, FuncTy Fn) { + static_assert(is_execution_policy::value, + "Invalid execution policy!"); + std::for_each(Begin, End, Fn); +} + +template +void for_each_n(Policy policy, IndexTy Begin, IndexTy End, FuncTy Fn) { + static_assert(is_execution_policy::value, + "Invalid execution policy!"); + for (IndexTy I = Begin; I != End; ++I) + Fn(I); +} + +// Parallel algorithm implementations, only available when LLVM_ENABLE_THREADS +// is true. 
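For orientation, a minimal caller of the interface this header defines; seq and par are the policy tags declared above, and the par overloads below only run in parallel when LLVM_ENABLE_THREADS is set (hypothetical usage, not part of the patch):

#include "llvm/Support/Parallel.h"
#include <vector>

void sortAndVisit(std::vector<int> &V) {
  // Parallel quicksort via the policy-tag overload.
  llvm::parallel::sort(llvm::parallel::par, V.begin(), V.end());
  // Sequential traversal; swapping seq for par changes only the tag.
  llvm::parallel::for_each(llvm::parallel::seq, V.begin(), V.end(),
                           [](int X) { (void)X; });
}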
+#if LLVM_ENABLE_THREADS +template > +void sort(parallel_execution_policy policy, RandomAccessIterator Start, + RandomAccessIterator End, const Comparator &Comp = Comparator()) { + detail::parallel_sort(Start, End, Comp); +} + +template +void for_each(parallel_execution_policy policy, IterTy Begin, IterTy End, + FuncTy Fn) { + detail::parallel_for_each(Begin, End, Fn); +} + +template +void for_each_n(parallel_execution_policy policy, IndexTy Begin, IndexTy End, + FuncTy Fn) { + detail::parallel_for_each_n(Begin, End, Fn); +} +#endif + +} // namespace parallel +} // namespace llvm + +#endif // LLVM_SUPPORT_PARALLEL_H diff --git a/include/llvm/Support/Wasm.h b/include/llvm/Support/Wasm.h index a48dfe10b3bb..e3831827062c 100644 --- a/include/llvm/Support/Wasm.h +++ b/include/llvm/Support/Wasm.h @@ -37,17 +37,6 @@ struct WasmSignature { int32_t ReturnType; }; -struct WasmImport { - StringRef Module; - StringRef Field; - uint32_t Kind; - union { - uint32_t SigIndex; - int32_t GlobalType; - }; - bool GlobalMutable; -}; - struct WasmExport { StringRef Name; uint32_t Kind; @@ -82,6 +71,18 @@ struct WasmGlobal { WasmInitExpr InitExpr; }; +struct WasmImport { + StringRef Module; + StringRef Field; + uint32_t Kind; + union { + uint32_t SigIndex; + WasmGlobal Global; + WasmTable Table; + WasmLimits Memory; + }; +}; + struct WasmLocalDecl { int32_t Type; uint32_t Count; diff --git a/include/llvm/Target/Target.td b/include/llvm/Target/Target.td index fc35b4527bc3..6f44292c47ed 100644 --- a/include/llvm/Target/Target.td +++ b/include/llvm/Target/Target.td @@ -680,6 +680,11 @@ class RegisterOperand // this type. The method normally will just use an alt-name index to look // up the name to print. Default to the generic printOperand(). string PrintMethod = pm; + + // EncoderMethod - The target method name to call to encode this register + // operand. + string EncoderMethod = ""; + // ParserMatchClass - The "match class" that operands of this type fit // in. Match classes are used to define the order in which instructions are // match, to ensure that which instructions gets matched is deterministic. diff --git a/include/llvm/Target/TargetInstrInfo.h b/include/llvm/Target/TargetInstrInfo.h index 82a682cf1f7e..97a6f0c6e3ae 100644 --- a/include/llvm/Target/TargetInstrInfo.h +++ b/include/llvm/Target/TargetInstrInfo.h @@ -172,11 +172,22 @@ public: /// inalloca arguments. This function reports only the size of the frame part /// that is set up between the frame setup and destroy pseudo instructions. int64_t getFrameSize(const MachineInstr &I) const { - assert(isFrameInstr(I)); + assert(isFrameInstr(I) && "Not a frame instruction"); assert(I.getOperand(0).getImm() >= 0); return I.getOperand(0).getImm(); } + /// Returns the total frame size, which is made up of the space set up inside + /// the pair of frame start-stop instructions and the space that is set up + /// prior to the pair. 
+ int64_t getFrameTotalSize(const MachineInstr &I) const { + if (isFrameSetup(I)) { + assert(I.getOperand(1).getImm() >= 0 && "Frame size must not be negative"); + return getFrameSize(I) + I.getOperand(1).getImm(); + } + return getFrameSize(I); + } + unsigned getCatchReturnOpcode() const { return CatchRetOpcode; } unsigned getReturnOpcode() const { return ReturnOpcode; } diff --git a/include/llvm/Target/TargetLowering.h b/include/llvm/Target/TargetLowering.h index ced183852146..1ca32d4c3589 100644 --- a/include/llvm/Target/TargetLowering.h +++ b/include/llvm/Target/TargetLowering.h @@ -1396,7 +1396,10 @@ public: /// It is called by AtomicExpandPass before expanding an /// AtomicRMW/AtomicCmpXchg/AtomicStore/AtomicLoad /// if shouldInsertFencesForAtomic returns true. - /// RMW and CmpXchg set both IsStore and IsLoad to true. + /// + /// Inst is the original atomic instruction, prior to other expansions that + /// may be performed. + /// /// This function should either return a nullptr, or a pointer to an IR-level /// Instruction*. Even complex fence sequences can be represented by a /// single Instruction* through an intrinsic to be lowered later. @@ -1422,18 +1425,17 @@ public: /// seq_cst. But if they are lowered to monotonic accesses, no amount of /// IR-level fences can prevent it. /// @{ - virtual Instruction *emitLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { - if (isReleaseOrStronger(Ord) && IsStore) + virtual Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const { + if (isReleaseOrStronger(Ord) && Inst->hasAtomicStore()) return Builder.CreateFence(Ord); else return nullptr; } virtual Instruction *emitTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { + Instruction *Inst, + AtomicOrdering Ord) const { if (isAcquireOrStronger(Ord)) return Builder.CreateFence(Ord); else @@ -2061,14 +2063,6 @@ public: return false; } - // Return true if the instruction that performs a << b actually performs - // a << (b % (sizeof(a) * 8)). - virtual bool supportsModuloShift(ISD::NodeType Inst, EVT ReturnType) const { - assert((Inst == ISD::SHL || Inst == ISD::SRA || Inst == ISD::SRL) && - "Expect a shift instruction"); - return false; - } - //===--------------------------------------------------------------------===// // Runtime Library hooks // diff --git a/include/llvm/Target/TargetSchedule.td b/include/llvm/Target/TargetSchedule.td index d342e4fe2613..7b00c9420e35 100644 --- a/include/llvm/Target/TargetSchedule.td +++ b/include/llvm/Target/TargetSchedule.td @@ -334,7 +334,7 @@ class ReadAdvance writes = []> } // Directly associate a new SchedRead type with a delay and optional -// pipeline bypess. For use with InstRW or ItinRW. +// pipeline bypass. For use with InstRW or ItinRW. 
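Backing up to the TargetInstrInfo hunk above: a standalone restatement of the new getFrameTotalSize() logic, with the MachineInstr operands reduced to plain integers. The reading of operand 1 as space set up prior to the setup/destroy pair follows the doc comment; everything else here is illustrative.

#include <cassert>
#include <cstdint>

int64_t frameTotalSize(bool IsFrameSetup, int64_t Inside, int64_t Before) {
  // Inside models operand 0: the space between the setup/destroy pair.
  // Before models the new operand 1: space established prior to the pair.
  assert(Inside >= 0);
  if (IsFrameSetup) {
    assert(Before >= 0 && "Frame size must not be negative");
    return Inside + Before;
  }
  return Inside; // frame-destroy instructions report only the inner size
}
// frameTotalSize(true, 32, 16) == 48; frameTotalSize(false, 32, 0) == 32.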
class SchedReadAdvance<int cycles, list<SchedWrite> writes = []> : SchedRead, ProcReadAdvance<cycles, writes>; diff --git a/include/llvm/Target/TargetSelectionDAG.td b/include/llvm/Target/TargetSelectionDAG.td index 45a842f77a21..9ed614ccee17 100644 --- a/include/llvm/Target/TargetSelectionDAG.td +++ b/include/llvm/Target/TargetSelectionDAG.td @@ -281,7 +281,7 @@ def SDTConvertOp : SDTypeProfile<1, 5, [ //cvtss, su, us, uu, ff, fs, fu, sf, su ]>; class SDCallSeqStart<list<SDTypeConstraint> constraints> : - SDTypeProfile<0, 1, constraints>; + SDTypeProfile<0, 2, constraints>; class SDCallSeqEnd<list<SDTypeConstraint> constraints> : SDTypeProfile<0, 2, constraints>; diff --git a/include/llvm/ToolDrivers/llvm-lib/LibDriver.h b/include/llvm/ToolDrivers/llvm-lib/LibDriver.h new file mode 100644 index 000000000000..a4806ac4ad69 --- /dev/null +++ b/include/llvm/ToolDrivers/llvm-lib/LibDriver.h @@ -0,0 +1,24 @@ +//===- llvm-lib/LibDriver.h - lib.exe-compatible driver ---------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Defines an interface to a lib.exe-compatible driver that also understands +// bitcode files. Used by llvm-lib and lld-link /lib. +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_TOOLDRIVERS_LLVM_LIB_LIBDRIVER_H +#define LLVM_TOOLDRIVERS_LLVM_LIB_LIBDRIVER_H + +namespace llvm { +template <typename T> class ArrayRef; + +int libDriverMain(ArrayRef<const char *> ARgs); +} + +#endif diff --git a/include/llvm/Transforms/Utils/Cloning.h b/include/llvm/Transforms/Utils/Cloning.h index 0a8903a6ed7b..91c9d255302f 100644 --- a/include/llvm/Transforms/Utils/Cloning.h +++ b/include/llvm/Transforms/Utils/Cloning.h @@ -43,6 +43,7 @@ class InvokeInst; class Loop; class LoopInfo; class Module; +class ProfileSummaryInfo; class ReturnInst; /// Return an exact copy of the specified module @@ -175,15 +176,17 @@ public: explicit InlineFunctionInfo(CallGraph *cg = nullptr, std::function<AssumptionCache &(Function &)> *GetAssumptionCache = nullptr, + ProfileSummaryInfo *PSI = nullptr, BlockFrequencyInfo *CallerBFI = nullptr, BlockFrequencyInfo *CalleeBFI = nullptr) - : CG(cg), GetAssumptionCache(GetAssumptionCache), CallerBFI(CallerBFI), - CalleeBFI(CalleeBFI) {} + : CG(cg), GetAssumptionCache(GetAssumptionCache), PSI(PSI), + CallerBFI(CallerBFI), CalleeBFI(CalleeBFI) {} /// CG - If non-null, InlineFunction will update the callgraph to reflect the /// changes it makes. CallGraph *CG; std::function<AssumptionCache &(Function &)> *GetAssumptionCache; + ProfileSummaryInfo *PSI; BlockFrequencyInfo *CallerBFI, *CalleeBFI; /// StaticAllocas - InlineFunction fills this in with all static allocas that diff --git a/include/llvm/Transforms/Utils/LoopUtils.h b/include/llvm/Transforms/Utils/LoopUtils.h index a1cf41d6f931..561f94880624 100644 --- a/include/llvm/Transforms/Utils/LoopUtils.h +++ b/include/llvm/Transforms/Utils/LoopUtils.h @@ -21,6 +21,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/EHPersonalities.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" @@ -42,6 +43,7 @@ class PredIteratorCache; class ScalarEvolution; class SCEV; class TargetLibraryInfo; +class TargetTransformInfo; /// \brief Captures loop safety information. /// It keep information for loop & its header may throw exception.
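The next hunk declares getShuffleReduction(), which emits a log2 ladder of shufflevector-plus-operation steps. A scalar model of that ladder (illustration only, using integer addition for the reduction operation and assuming a non-empty, power-of-two lane count):

#include <cstddef>
#include <vector>

int shuffleReduceAdd(std::vector<int> Lanes) {
  // Each step folds the upper half of the lanes into the lower half,
  // halving the active width until lane 0 holds the full reduction.
  for (size_t Half = Lanes.size() / 2; Half >= 1; Half /= 2)
    for (size_t I = 0; I < Half; ++I)
      Lanes[I] += Lanes[I + Half];
  return Lanes[0];
}
// shuffleReduceAdd({1, 2, 3, 4, 5, 6, 7, 8}) == 36 in log2(8) == 3 steps.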
@@ -489,6 +491,36 @@ bool canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT, LoopSafetyInfo *SafetyInfo, OptimizationRemarkEmitter *ORE = nullptr); +/// Generates a vector reduction using shufflevectors to reduce the value. +Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, + RecurrenceDescriptor::MinMaxRecurrenceKind + MinMaxKind = RecurrenceDescriptor::MRK_Invalid, + ArrayRef RedOps = ArrayRef()); + +/// Create a target reduction of the given vector. The reduction operation +/// is described by the \p Opcode parameter. min/max reductions require +/// additional information supplied in \p Flags. +/// The target is queried to determine if intrinsics or shuffle sequences are +/// required to implement the reduction. +Value * +createSimpleTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI, + unsigned Opcode, Value *Src, + TargetTransformInfo::ReductionFlags Flags = + TargetTransformInfo::ReductionFlags(), + ArrayRef RedOps = ArrayRef()); + +/// Create a generic target reduction using a recurrence descriptor \p Desc +/// The target is queried to determine if intrinsics or shuffle sequences are +/// required to implement the reduction. +Value *createTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI, + RecurrenceDescriptor &Desc, Value *Src, + bool NoNaN = false); + +/// Get the intersection (logical and) of all of the potential IR flags +/// of each scalar operation (VL) that will be converted into a vector (I). +/// Flag set: NSW, NUW, exact, and all of fast-math. +void propagateIRFlags(Value *I, ArrayRef VL); + } // end namespace llvm #endif // LLVM_TRANSFORMS_UTILS_LOOPUTILS_H diff --git a/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/include/llvm/Transforms/Vectorize/SLPVectorizer.h index 10338f7937e8..c514db41623c 100644 --- a/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -24,6 +24,7 @@ #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/DemandedBits.h" #include "llvm/Analysis/LoopInfo.h" +#include "llvm/Analysis/OptimizationDiagnosticInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Function.h" @@ -59,7 +60,8 @@ public: // Glue for old PM. 
bool runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_, - DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_); + DominatorTree *DT_, AssumptionCache *AC_, DemandedBits *DB_, + OptimizationRemarkEmitter *ORE_); private: /// \brief Collect store and getelementptr instructions and organize them diff --git a/include/llvm/module.modulemap b/include/llvm/module.modulemap index 59b1f1621039..5e15e8d49802 100644 --- a/include/llvm/module.modulemap +++ b/include/llvm/module.modulemap @@ -148,6 +148,7 @@ module LLVM_intrinsic_gen { module IR_Attributes { header "IR/Attributes.h" export * } module IR_CallSite { header "IR/CallSite.h" export * } module IR_ConstantFolder { header "IR/ConstantFolder.h" export * } + module IR_GlobalVariable { header "IR/GlobalVariable.h" export * } module IR_NoFolder { header "IR/NoFolder.h" export * } module IR_Module { header "IR/Module.h" export * } module IR_ModuleSummaryIndex { header "IR/ModuleSummaryIndex.h" export * } diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index 537823020301..a33c01a0e461 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -17,13 +17,13 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" #include "llvm/Analysis/AliasAnalysis.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CFG.h" #include "llvm/Analysis/CaptureTracking.h" #include "llvm/Analysis/InstructionSimplify.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/MemoryBuiltins.h" #include "llvm/Analysis/ValueTracking.h" -#include "llvm/Analysis/AssumptionCache.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" @@ -36,6 +36,7 @@ #include "llvm/IR/Operator.h" #include "llvm/Pass.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include #define DEBUG_TYPE "basicaa" @@ -1283,9 +1284,9 @@ AliasResult BasicAAResult::aliasGEP(const GEPOperator *GEP1, uint64_t V1Size, // give up if we can't determine conditions that hold for every cycle: const Value *V = DecompGEP1.VarIndices[i].V; - bool SignKnownZero, SignKnownOne; - ComputeSignBit(const_cast(V), SignKnownZero, SignKnownOne, DL, - 0, &AC, nullptr, DT); + KnownBits Known = computeKnownBits(V, DL, 0, &AC, nullptr, DT); + bool SignKnownZero = Known.isNonNegative(); + bool SignKnownOne = Known.isNegative(); // Zero-extension widens the variable, and so forces the sign // bit to zero. diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp index 0dc4475ca0e2..db87b17c1567 100644 --- a/lib/Analysis/BranchProbabilityInfo.cpp +++ b/lib/Analysis/BranchProbabilityInfo.cpp @@ -301,6 +301,8 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) { WeightSum += Weights[i]; } } + assert(WeightSum <= UINT32_MAX && + "Expected weights to scale down to 32 bits"); if (WeightSum == 0 || ReachableIdxs.size() == 0) { for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) @@ -328,21 +330,14 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) { // the difference between reachable blocks. if (ToDistribute > BranchProbability::getZero()) { BranchProbability PerEdge = ToDistribute / ReachableIdxs.size(); - for (auto i : ReachableIdxs) { + for (auto i : ReachableIdxs) BP[i] += PerEdge; - ToDistribute -= PerEdge; - } - // Tail goes to the first reachable edge. 
- BP[ReachableIdxs[0]] += ToDistribute; } } for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) setEdgeProbability(BB, i, BP[i]); - assert(WeightSum <= UINT32_MAX && - "Expected weights to scale down to 32 bits"); - return true; } diff --git a/lib/Analysis/CallGraph.cpp b/lib/Analysis/CallGraph.cpp index 6942176ae6ae..ff5242f69a1b 100644 --- a/lib/Analysis/CallGraph.cpp +++ b/lib/Analysis/CallGraph.cpp @@ -21,23 +21,18 @@ using namespace llvm; // CallGraph::CallGraph(Module &M) - : M(M), Root(nullptr), ExternalCallingNode(getOrInsertFunction(nullptr)), + : M(M), ExternalCallingNode(getOrInsertFunction(nullptr)), CallsExternalNode(llvm::make_unique(nullptr)) { // Add every function to the call graph. for (Function &F : M) addToCallGraph(&F); - - // If we didn't find a main function, use the external call graph node - if (!Root) - Root = ExternalCallingNode; } CallGraph::CallGraph(CallGraph &&Arg) - : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)), Root(Arg.Root), + : M(Arg.M), FunctionMap(std::move(Arg.FunctionMap)), ExternalCallingNode(Arg.ExternalCallingNode), CallsExternalNode(std::move(Arg.CallsExternalNode)) { Arg.FunctionMap.clear(); - Arg.Root = nullptr; Arg.ExternalCallingNode = nullptr; } @@ -57,21 +52,9 @@ CallGraph::~CallGraph() { void CallGraph::addToCallGraph(Function *F) { CallGraphNode *Node = getOrInsertFunction(F); - // If this function has external linkage, anything could call it. - if (!F->hasLocalLinkage()) { - ExternalCallingNode->addCalledFunction(CallSite(), Node); - - // Found the entry point? - if (F->getName() == "main") { - if (Root) // Found multiple external mains? Don't pick one. - Root = ExternalCallingNode; - else - Root = Node; // Found a main, keep track of it! - } - } - - // If this function has its address taken, anything could call it. - if (F->hasAddressTaken()) + // If this function has external linkage or has its address taken, anything + // could call it. + if (!F->hasLocalLinkage() || F->hasAddressTaken()) ExternalCallingNode->addCalledFunction(CallSite(), Node); // If this function is not defined in this translation unit, it could call @@ -96,13 +79,6 @@ void CallGraph::addToCallGraph(Function *F) { } void CallGraph::print(raw_ostream &OS) const { - OS << "CallGraph Root is: "; - if (Function *F = Root->getFunction()) - OS << F->getName() << "\n"; - else { - OS << "<>\n"; - } - // Print in a deterministic order by sorting CallGraphNodes by name. We do // this here to avoid slowing down the non-printing fast path. diff --git a/lib/Analysis/ConstantFolding.cpp b/lib/Analysis/ConstantFolding.cpp index 130e917e49d7..0ca712bbfe70 100644 --- a/lib/Analysis/ConstantFolding.cpp +++ b/lib/Analysis/ConstantFolding.cpp @@ -1438,6 +1438,36 @@ bool llvm::canConstantFoldCallTo(const Function *F) { Name == "sinf" || Name == "sinhf" || Name == "sqrtf"; case 't': return Name == "tan" || Name == "tanh" || Name == "tanf" || Name == "tanhf"; + case '_': + + // Check for various function names that get used for the math functions + // when the header files are preprocessed with the macro + // __FINITE_MATH_ONLY__ enabled. + // The '12' here is the length of the shortest name that can match. + // We need to check the size before looking at Name[1] and Name[2] + // so we may as well check a limit that will eliminate mismatches. 
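A standalone restatement of the screening test that follows (hypothetical helper; only two of the prefix groups are spelled out here, where the real switch also covers acos/asin/atan2, cosh, pow and sinh):

#include <string>

static bool isFiniteMathLibCall(const std::string &Name) {
  // "__exp_finite" and friends are at least 12 characters and start with
  // "__", so this cheap test rejects non-candidates before the switch.
  if (Name.size() < 12 || Name[0] != '_' || Name[1] != '_')
    return false;
  switch (Name[2]) {
  case 'e':
    return Name == "__exp_finite" || Name == "__exp2_finite";
  case 'l':
    return Name == "__log_finite" || Name == "__log10_finite";
  default:
    return false;
  }
}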
+ if (Name.size() < 12 || Name[1] != '_') + return false; + switch (Name[2]) { + default: + return false; + case 'a': + return Name == "__acos_finite" || Name == "__acosf_finite" || + Name == "__asin_finite" || Name == "__asinf_finite" || + Name == "__atan2_finite" || Name == "__atan2f_finite"; + case 'c': + return Name == "__cosh_finite" || Name == "__coshf_finite"; + case 'e': + return Name == "__exp_finite" || Name == "__expf_finite" || + Name == "__exp2_finite" || Name == "__exp2f_finite"; + case 'l': + return Name == "__log_finite" || Name == "__logf_finite" || + Name == "__log10_finite" || Name == "__log10f_finite"; + case 'p': + return Name == "__pow_finite" || Name == "__powf_finite"; + case 's': + return Name == "__sinh_finite" || Name == "__sinhf_finite"; + } } } @@ -1637,13 +1667,21 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, if (!TLI) return nullptr; - switch (Name[0]) { + char NameKeyChar = Name[0]; + if (Name[0] == '_' && Name.size() > 2 && Name[1] == '_') + NameKeyChar = Name[2]; + + switch (NameKeyChar) { case 'a': if ((Name == "acos" && TLI->has(LibFunc_acos)) || - (Name == "acosf" && TLI->has(LibFunc_acosf))) + (Name == "acosf" && TLI->has(LibFunc_acosf)) || + (Name == "__acos_finite" && TLI->has(LibFunc_acos_finite)) || + (Name == "__acosf_finite" && TLI->has(LibFunc_acosf_finite))) return ConstantFoldFP(acos, V, Ty); else if ((Name == "asin" && TLI->has(LibFunc_asin)) || - (Name == "asinf" && TLI->has(LibFunc_asinf))) + (Name == "asinf" && TLI->has(LibFunc_asinf)) || + (Name == "__asin_finite" && TLI->has(LibFunc_asin_finite)) || + (Name == "__asinf_finite" && TLI->has(LibFunc_asinf_finite))) return ConstantFoldFP(asin, V, Ty); else if ((Name == "atan" && TLI->has(LibFunc_atan)) || (Name == "atanf" && TLI->has(LibFunc_atanf))) @@ -1657,15 +1695,21 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, (Name == "cosf" && TLI->has(LibFunc_cosf))) return ConstantFoldFP(cos, V, Ty); else if ((Name == "cosh" && TLI->has(LibFunc_cosh)) || - (Name == "coshf" && TLI->has(LibFunc_coshf))) + (Name == "coshf" && TLI->has(LibFunc_coshf)) || + (Name == "__cosh_finite" && TLI->has(LibFunc_cosh_finite)) || + (Name == "__coshf_finite" && TLI->has(LibFunc_coshf_finite))) return ConstantFoldFP(cosh, V, Ty); break; case 'e': if ((Name == "exp" && TLI->has(LibFunc_exp)) || - (Name == "expf" && TLI->has(LibFunc_expf))) + (Name == "expf" && TLI->has(LibFunc_expf)) || + (Name == "__exp_finite" && TLI->has(LibFunc_exp_finite)) || + (Name == "__expf_finite" && TLI->has(LibFunc_expf_finite))) return ConstantFoldFP(exp, V, Ty); if ((Name == "exp2" && TLI->has(LibFunc_exp2)) || - (Name == "exp2f" && TLI->has(LibFunc_exp2f))) + (Name == "exp2f" && TLI->has(LibFunc_exp2f)) || + (Name == "__exp2_finite" && TLI->has(LibFunc_exp2_finite)) || + (Name == "__exp2f_finite" && TLI->has(LibFunc_exp2f_finite))) // Constant fold exp2(x) as pow(2,x) in case the host doesn't have a // C99 library. 
return ConstantFoldBinaryFP(pow, 2.0, V, Ty); @@ -1680,10 +1724,18 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, break; case 'l': if ((Name == "log" && V > 0 && TLI->has(LibFunc_log)) || - (Name == "logf" && V > 0 && TLI->has(LibFunc_logf))) + (Name == "logf" && V > 0 && TLI->has(LibFunc_logf)) || + (Name == "__log_finite" && V > 0 && + TLI->has(LibFunc_log_finite)) || + (Name == "__logf_finite" && V > 0 && + TLI->has(LibFunc_logf_finite))) return ConstantFoldFP(log, V, Ty); else if ((Name == "log10" && V > 0 && TLI->has(LibFunc_log10)) || - (Name == "log10f" && V > 0 && TLI->has(LibFunc_log10f))) + (Name == "log10f" && V > 0 && TLI->has(LibFunc_log10f)) || + (Name == "__log10_finite" && V > 0 && + TLI->has(LibFunc_log10_finite)) || + (Name == "__log10f_finite" && V > 0 && + TLI->has(LibFunc_log10f_finite))) return ConstantFoldFP(log10, V, Ty); break; case 'r': @@ -1695,7 +1747,9 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, (Name == "sinf" && TLI->has(LibFunc_sinf))) return ConstantFoldFP(sin, V, Ty); else if ((Name == "sinh" && TLI->has(LibFunc_sinh)) || - (Name == "sinhf" && TLI->has(LibFunc_sinhf))) + (Name == "sinhf" && TLI->has(LibFunc_sinhf)) || + (Name == "__sinh_finite" && TLI->has(LibFunc_sinh_finite)) || + (Name == "__sinhf_finite" && TLI->has(LibFunc_sinhf_finite))) return ConstantFoldFP(sinh, V, Ty); else if ((Name == "sqrt" && V >= 0 && TLI->has(LibFunc_sqrt)) || (Name == "sqrtf" && V >= 0 && TLI->has(LibFunc_sqrtf))) @@ -1813,13 +1867,17 @@ Constant *ConstantFoldScalarCall(StringRef Name, unsigned IntrinsicID, Type *Ty, if (!TLI) return nullptr; if ((Name == "pow" && TLI->has(LibFunc_pow)) || - (Name == "powf" && TLI->has(LibFunc_powf))) + (Name == "powf" && TLI->has(LibFunc_powf)) || + (Name == "__pow_finite" && TLI->has(LibFunc_pow_finite)) || + (Name == "__powf_finite" && TLI->has(LibFunc_powf_finite))) return ConstantFoldBinaryFP(pow, Op1V, Op2V, Ty); if ((Name == "fmod" && TLI->has(LibFunc_fmod)) || (Name == "fmodf" && TLI->has(LibFunc_fmodf))) return ConstantFoldBinaryFP(fmod, Op1V, Op2V, Ty); if ((Name == "atan2" && TLI->has(LibFunc_atan2)) || - (Name == "atan2f" && TLI->has(LibFunc_atan2f))) + (Name == "atan2f" && TLI->has(LibFunc_atan2f)) || + (Name == "__atan2_finite" && TLI->has(LibFunc_atan2_finite)) || + (Name == "__atan2f_finite" && TLI->has(LibFunc_atan2f_finite))) return ConstantFoldBinaryFP(atan2, Op1V, Op2V, Ty); } else if (auto *Op2C = dyn_cast(Operands[1])) { if (IntrinsicID == Intrinsic::powi && Ty->isHalfTy()) diff --git a/lib/Analysis/DemandedBits.cpp b/lib/Analysis/DemandedBits.cpp index 9f5dc5318239..8f808f3e7871 100644 --- a/lib/Analysis/DemandedBits.cpp +++ b/lib/Analysis/DemandedBits.cpp @@ -86,13 +86,11 @@ void DemandedBits::determineLiveOperandBits( [&](unsigned BitWidth, const Value *V1, const Value *V2) { const DataLayout &DL = I->getModule()->getDataLayout(); Known = KnownBits(BitWidth); - computeKnownBits(const_cast(V1), Known, DL, 0, - &AC, UserI, &DT); + computeKnownBits(V1, Known, DL, 0, &AC, UserI, &DT); if (V2) { Known2 = KnownBits(BitWidth); - computeKnownBits(const_cast(V2), Known2, DL, - 0, &AC, UserI, &DT); + computeKnownBits(V2, Known2, DL, 0, &AC, UserI, &DT); } }; @@ -118,7 +116,7 @@ void DemandedBits::determineLiveOperandBits( // known to be one. 
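Before the DemandedBits hunk continues: the rewrites below replace Known.One.countLeadingZeros()+1 with the equivalent countMaxLeadingZeros()+1 from the new KnownBits helpers, and likewise for the trailing case. A worked example of those counting identities, with the APInt members reduced to plain 8-bit masks (not LLVM code):

#include <cassert>

int main() {
  // Zero/One mark bits known to be 0/1; all other bits are unknown.
  // Zero = 0xF0, One = 0x06: the value matches the pattern 0000?11?.
  unsigned Zero = 0xF0, One = 0x06;
  assert((Zero & One) == 0 && "known-0 and known-1 masks are disjoint");
  int MinPop = __builtin_popcount(One);      // countMinPopulation() == 2
  int MaxPop = 8 - __builtin_popcount(Zero); // countMaxPopulation() == 4
  int MaxTZ = __builtin_ctz(One);            // countMaxTrailingZeros() == 1
  // countMinLeadingZeros() == Zero.countLeadingOnes() == 4 here: the top
  // four bits are already known to be zero.
  assert(MinPop == 2 && MaxPop == 4 && MaxTZ == 1);
  return 0;
}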
ComputeKnownBits(BitWidth, I, nullptr); AB = APInt::getHighBitsSet(BitWidth, - std::min(BitWidth, Known.One.countLeadingZeros()+1)); + std::min(BitWidth, Known.countMaxLeadingZeros()+1)); } break; case Intrinsic::cttz: @@ -128,7 +126,7 @@ void DemandedBits::determineLiveOperandBits( // known to be one. ComputeKnownBits(BitWidth, I, nullptr); AB = APInt::getLowBitsSet(BitWidth, - std::min(BitWidth, Known.One.countTrailingZeros()+1)); + std::min(BitWidth, Known.countMaxTrailingZeros()+1)); } break; } diff --git a/lib/Analysis/InlineCost.cpp b/lib/Analysis/InlineCost.cpp index 100a591e452c..44c14cb17c22 100644 --- a/lib/Analysis/InlineCost.cpp +++ b/lib/Analysis/InlineCost.cpp @@ -63,7 +63,7 @@ static cl::opt // PGO before we actually hook up inliner with analysis passes such as BPI and // BFI. static cl::opt ColdThreshold( - "inlinecold-threshold", cl::Hidden, cl::init(225), + "inlinecold-threshold", cl::Hidden, cl::init(45), cl::desc("Threshold for inlining functions with cold attribute")); static cl::opt diff --git a/lib/Analysis/InstructionSimplify.cpp b/lib/Analysis/InstructionSimplify.cpp index 4a713f441ce8..5728887cc1e9 100644 --- a/lib/Analysis/InstructionSimplify.cpp +++ b/lib/Analysis/InstructionSimplify.cpp @@ -1317,7 +1317,7 @@ static Value *SimplifyShift(Instruction::BinaryOps Opcode, Value *Op0, // If all valid bits in the shift amount are known zero, the first operand is // unchanged. unsigned NumValidShiftBits = Log2_32_Ceil(BitWidth); - if (Known.Zero.countTrailingOnes() >= NumValidShiftBits) + if (Known.countMinTrailingZeros() >= NumValidShiftBits) return Op0; return nullptr; @@ -1536,7 +1536,7 @@ static Value *simplifyAndOrOfICmpsWithConstants(ICmpInst *Cmp0, ICmpInst *Cmp1, auto Range0 = ConstantRange::makeExactICmpRegion(Cmp0->getPredicate(), *C0); auto Range1 = ConstantRange::makeExactICmpRegion(Cmp1->getPredicate(), *C1); - // For and-of-comapares, check if the intersection is empty: + // For and-of-compares, check if the intersection is empty: // (icmp X, C0) && (icmp X, C1) --> empty set --> false if (IsAnd && Range0.intersectWith(Range1).isEmptySet()) return getFalse(Cmp0->getType()); @@ -1870,6 +1870,24 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q, match(Op1, m_c_And(m_Not(m_Specific(A)), m_Specific(B))))) return Op0; + // (A & B) | (~A ^ B) -> (~A ^ B) + // (B & A) | (~A ^ B) -> (~A ^ B) + // (A & B) | (B ^ ~A) -> (B ^ ~A) + // (B & A) | (B ^ ~A) -> (B ^ ~A) + if (match(Op0, m_And(m_Value(A), m_Value(B))) && + (match(Op1, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || + match(Op1, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) + return Op1; + + // (~A ^ B) | (A & B) -> (~A ^ B) + // (~A ^ B) | (B & A) -> (~A ^ B) + // (B ^ ~A) | (A & B) -> (B ^ ~A) + // (B ^ ~A) | (B & A) -> (B ^ ~A) + if (match(Op1, m_And(m_Value(A), m_Value(B))) && + (match(Op0, m_c_Xor(m_Specific(A), m_Not(m_Specific(B)))) || + match(Op0, m_c_Xor(m_Not(m_Specific(A)), m_Specific(B))))) + return Op0; + if (Value *V = simplifyAndOrOfICmps(Op0, Op1, false)) return V; @@ -2286,7 +2304,6 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, return nullptr; Type *ITy = GetCompareTy(LHS); // The return type. 
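The SimplifyOrInst addition above folds (A & B) | (~A ^ B) to (~A ^ B). The identity is bitwise, so an exhaustive i1 check proves it for every width; a throwaway verification sketch:

#include <cassert>

int main() {
  for (int A = 0; A <= 1; ++A)
    for (int B = 0; B <= 1; ++B) {
      int NotA = A ^ 1;
      // (A & B) | (~A ^ B) == (~A ^ B) on all four 1-bit inputs.
      assert(((A & B) | (NotA ^ B)) == (NotA ^ B));
    }
  return 0;
}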
- bool LHSKnownNonNegative, LHSKnownNegative; switch (Pred) { default: llvm_unreachable("Unknown ICmp predicate!"); @@ -2304,39 +2321,41 @@ static Value *simplifyICmpWithZero(CmpInst::Predicate Pred, Value *LHS, if (isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getTrue(ITy); break; - case ICmpInst::ICMP_SLT: - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (LHSKnownNegative) + case ICmpInst::ICMP_SLT: { + KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (LHSKnown.isNegative()) return getTrue(ITy); - if (LHSKnownNonNegative) + if (LHSKnown.isNonNegative()) return getFalse(ITy); break; - case ICmpInst::ICMP_SLE: - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (LHSKnownNegative) + } + case ICmpInst::ICMP_SLE: { + KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (LHSKnown.isNegative()) return getTrue(ITy); - if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) + if (LHSKnown.isNonNegative() && + isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getFalse(ITy); break; - case ICmpInst::ICMP_SGE: - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (LHSKnownNegative) + } + case ICmpInst::ICMP_SGE: { + KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (LHSKnown.isNegative()) return getFalse(ITy); - if (LHSKnownNonNegative) + if (LHSKnown.isNonNegative()) return getTrue(ITy); break; - case ICmpInst::ICMP_SGT: - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (LHSKnownNegative) + } + case ICmpInst::ICMP_SGT: { + KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (LHSKnown.isNegative()) return getFalse(ITy); - if (LHSKnownNonNegative && isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) + if (LHSKnown.isNonNegative() && + isKnownNonZero(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT)) return getTrue(ITy); break; } + } return nullptr; } @@ -2535,6 +2554,9 @@ static Value *simplifyICmpWithConstant(CmpInst::Predicate Pred, Value *LHS, return nullptr; } +/// TODO: A large part of this logic is duplicated in InstCombine's +/// foldICmpBinOp(). We should be able to share that and avoid the code +/// duplication. static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, Value *RHS, const SimplifyQuery &Q, unsigned MaxRecurse) { @@ -2616,15 +2638,11 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, return getTrue(ITy); if (Pred == ICmpInst::ICMP_SLT || Pred == ICmpInst::ICMP_SGE) { - bool RHSKnownNonNegative, RHSKnownNegative; - bool YKnownNonNegative, YKnownNegative; - ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, Q.DL, 0, - Q.AC, Q.CxtI, Q.DT); - ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (RHSKnownNonNegative && YKnownNegative) + KnownBits RHSKnown = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (RHSKnown.isNonNegative() && YKnown.isNegative()) return Pred == ICmpInst::ICMP_SLT ? getTrue(ITy) : getFalse(ITy); - if (RHSKnownNegative || YKnownNonNegative) + if (RHSKnown.isNegative() || YKnown.isNonNegative()) return Pred == ICmpInst::ICMP_SLT ? 
getFalse(ITy) : getTrue(ITy); } } @@ -2636,15 +2654,11 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, return getFalse(ITy); if (Pred == ICmpInst::ICMP_SGT || Pred == ICmpInst::ICMP_SLE) { - bool LHSKnownNonNegative, LHSKnownNegative; - bool YKnownNonNegative, YKnownNegative; - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, Q.DL, 0, - Q.AC, Q.CxtI, Q.DT); - ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (LHSKnownNonNegative && YKnownNegative) + KnownBits LHSKnown = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + KnownBits YKnown = computeKnownBits(Y, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (LHSKnown.isNonNegative() && YKnown.isNegative()) return Pred == ICmpInst::ICMP_SGT ? getTrue(ITy) : getFalse(ITy); - if (LHSKnownNegative || YKnownNonNegative) + if (LHSKnown.isNegative() || YKnown.isNonNegative()) return Pred == ICmpInst::ICMP_SGT ? getFalse(ITy) : getTrue(ITy); } } @@ -2691,28 +2705,27 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, // icmp pred (urem X, Y), Y if (LBO && match(LBO, m_URem(m_Value(), m_Specific(RHS)))) { - bool KnownNonNegative, KnownNegative; switch (Pred) { default: break; case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: - ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (!KnownNonNegative) + case ICmpInst::ICMP_SGE: { + KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; + } case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return getFalse(ITy); case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: - ComputeSignBit(RHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (!KnownNonNegative) + case ICmpInst::ICMP_SLE: { + KnownBits Known = computeKnownBits(RHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; + } case ICmpInst::ICMP_NE: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: @@ -2722,28 +2735,27 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, // icmp pred X, (urem Y, X) if (RBO && match(RBO, m_URem(m_Value(), m_Specific(LHS)))) { - bool KnownNonNegative, KnownNegative; switch (Pred) { default: break; case ICmpInst::ICMP_SGT: - case ICmpInst::ICMP_SGE: - ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (!KnownNonNegative) + case ICmpInst::ICMP_SGE: { + KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; + } case ICmpInst::ICMP_NE: case ICmpInst::ICMP_UGT: case ICmpInst::ICMP_UGE: return getTrue(ITy); case ICmpInst::ICMP_SLT: - case ICmpInst::ICMP_SLE: - ComputeSignBit(LHS, KnownNonNegative, KnownNegative, Q.DL, 0, Q.AC, - Q.CxtI, Q.DT); - if (!KnownNonNegative) + case ICmpInst::ICMP_SLE: { + KnownBits Known = computeKnownBits(LHS, Q.DL, 0, Q.AC, Q.CxtI, Q.DT); + if (!Known.isNonNegative()) break; LLVM_FALLTHROUGH; + } case ICmpInst::ICMP_EQ: case ICmpInst::ICMP_ULT: case ICmpInst::ICMP_ULE: @@ -2815,10 +2827,19 @@ static Value *simplifyICmpWithBinOp(CmpInst::Predicate Pred, Value *LHS, break; case Instruction::UDiv: case Instruction::LShr: - if (ICmpInst::isSigned(Pred)) + if (ICmpInst::isSigned(Pred) || !LBO->isExact() || !RBO->isExact()) break; - LLVM_FALLTHROUGH; + if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), + RBO->getOperand(0), Q, MaxRecurse - 1)) + return V; + break; case Instruction::SDiv: + if 
(!ICmpInst::isEquality(Pred) || !LBO->isExact() || !RBO->isExact()) + break; + if (Value *V = SimplifyICmpInst(Pred, LBO->getOperand(0), + RBO->getOperand(0), Q, MaxRecurse - 1)) + return V; + break; case Instruction::AShr: if (!LBO->isExact() || !RBO->isExact()) break; @@ -4034,24 +4055,21 @@ Value *llvm::SimplifyCastInst(unsigned CastOpc, Value *Op, Type *Ty, /// match a root vector source operand that contains that element in the same /// vector lane (ie, the same mask index), so we can eliminate the shuffle(s). static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1, - Constant *Mask, Value *RootVec, int RootElt, + int MaskVal, Value *RootVec, unsigned MaxRecurse) { if (!MaxRecurse--) return nullptr; // Bail out if any mask value is undefined. That kind of shuffle may be // simplified further based on demanded bits or other folds. - int MaskVal = ShuffleVectorInst::getMaskValue(Mask, RootElt); if (MaskVal == -1) return nullptr; // The mask value chooses which source operand we need to look at next. - Value *SourceOp; int InVecNumElts = Op0->getType()->getVectorNumElements(); - if (MaskVal < InVecNumElts) { - RootElt = MaskVal; - SourceOp = Op0; - } else { + int RootElt = MaskVal; + Value *SourceOp = Op0; + if (MaskVal >= InVecNumElts) { RootElt = MaskVal - InVecNumElts; SourceOp = Op1; } @@ -4061,7 +4079,7 @@ static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1, if (auto *SourceShuf = dyn_cast(SourceOp)) { return foldIdentityShuffles( DestElt, SourceShuf->getOperand(0), SourceShuf->getOperand(1), - SourceShuf->getMask(), RootVec, RootElt, MaxRecurse); + SourceShuf->getMaskValue(RootElt), RootVec, MaxRecurse); } // TODO: Look through bitcasts? What if the bitcast changes the vector element @@ -4126,17 +4144,7 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, // second one. if (Op0Const && !Op1Const) { std::swap(Op0, Op1); - for (int &Idx : Indices) { - if (Idx == -1) - continue; - Idx = Idx < (int)InVecNumElts ? Idx + InVecNumElts : Idx - InVecNumElts; - assert(Idx >= 0 && Idx < (int)InVecNumElts * 2 && - "shufflevector mask index out of range"); - } - Mask = ConstantDataVector::get( - Mask->getContext(), - makeArrayRef(reinterpret_cast(Indices.data()), - MaskNumElts)); + ShuffleVectorInst::commuteShuffleMask(Indices, InVecNumElts); } // A shuffle of a splat is always the splat itself. Legal if the shuffle's @@ -4160,7 +4168,8 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask, for (unsigned i = 0; i != MaskNumElts; ++i) { // Note that recursion is limited for each vector element, so if any element // exceeds the limit, this will fail to simplify. - RootVec = foldIdentityShuffles(i, Op0, Op1, Mask, RootVec, i, MaxRecurse); + RootVec = + foldIdentityShuffles(i, Op0, Op1, Indices[i], RootVec, MaxRecurse); // We can't replace a widening/narrowing shuffle with one of its operands. if (!RootVec || RootVec->getType() != RetTy) diff --git a/lib/Analysis/ModuleSummaryAnalysis.cpp b/lib/Analysis/ModuleSummaryAnalysis.cpp index 99f900ae3932..26706f5509ba 100644 --- a/lib/Analysis/ModuleSummaryAnalysis.cpp +++ b/lib/Analysis/ModuleSummaryAnalysis.cpp @@ -232,7 +232,7 @@ computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M, } // We should have named any anonymous globals assert(CalledFunction->hasName()); - auto ScaledCount = ProfileSummaryInfo::getProfileCount(&I, BFI); + auto ScaledCount = PSI->getProfileCount(&I, BFI); auto Hotness = ScaledCount ? 
getHotness(ScaledCount.getValue(), PSI) : CalleeInfo::HotnessType::Unknown; @@ -330,6 +330,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex( const Module &M, std::function GetBFICallback, ProfileSummaryInfo *PSI) { + assert(PSI); ModuleSummaryIndex Index; // Identify the local values in the llvm.used and llvm.compiler.used sets, diff --git a/lib/Analysis/OptimizationDiagnosticInfo.cpp b/lib/Analysis/OptimizationDiagnosticInfo.cpp index 73245981b022..e38e530c052d 100644 --- a/lib/Analysis/OptimizationDiagnosticInfo.cpp +++ b/lib/Analysis/OptimizationDiagnosticInfo.cpp @@ -101,7 +101,7 @@ void MappingTraits::mapping( // These are read-only for now. DiagnosticLocation DL = OptDiag->getLocation(); StringRef FN = - GlobalValue::getRealLinkageName(OptDiag->getFunction().getName()); + GlobalValue::dropLLVMManglingEscape(OptDiag->getFunction().getName()); StringRef PassName(OptDiag->PassName); io.mapRequired("Pass", PassName); diff --git a/lib/Analysis/ProfileSummaryInfo.cpp b/lib/Analysis/ProfileSummaryInfo.cpp index 1a53a8ed4283..502f4205b689 100644 --- a/lib/Analysis/ProfileSummaryInfo.cpp +++ b/lib/Analysis/ProfileSummaryInfo.cpp @@ -75,11 +75,14 @@ ProfileSummaryInfo::getProfileCount(const Instruction *Inst, return None; assert((isa(Inst) || isa(Inst)) && "We can only get profile count for call/invoke instruction."); - // Check if there is a profile metadata on the instruction. If it is present, - // determine hotness solely based on that. - uint64_t TotalCount; - if (Inst->extractProfTotalWeight(TotalCount)) - return TotalCount; + if (computeSummary() && Summary->getKind() == ProfileSummary::PSK_Sample) { + // In sample PGO mode, check if there is a profile metadata on the + // instruction. If it is present, determine hotness solely based on that, + // since the sampled entry count may not be accurate. + uint64_t TotalCount; + if (Inst->extractProfTotalWeight(TotalCount)) + return TotalCount; + } if (BFI) return BFI->getBlockProfileCount(Inst->getParent()); return None; diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index 01dca0793145..800354d2f5b4 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -584,7 +584,7 @@ CompareValueComplexity(SmallSet, 8> &EqCache, static int CompareSCEVComplexity( SmallSet, 8> &EqCacheSCEV, const LoopInfo *const LI, const SCEV *LHS, const SCEV *RHS, - unsigned Depth = 0) { + DominatorTree &DT, unsigned Depth = 0) { // Fast-path: SCEVs are uniqued so we can do a quick equality check. if (LHS == RHS) return 0; @@ -629,9 +629,16 @@ static int CompareSCEVComplexity( const SCEVAddRecExpr *LA = cast(LHS); const SCEVAddRecExpr *RA = cast(RHS); - // Compare addrec loop depths. + // If there is a dominance relationship between the loops, sort by the + // dominance. Otherwise, sort by depth. We require such order in getAddExpr. const Loop *LLoop = LA->getLoop(), *RLoop = RA->getLoop(); if (LLoop != RLoop) { + const BasicBlock *LHead = LLoop->getHeader(), *RHead = RLoop->getHeader(); + assert(LHead != RHead && "Two loops share the same header?"); + if (DT.dominates(LHead, RHead)) + return 1; + else if (DT.dominates(RHead, LHead)) + return -1; unsigned LDepth = LLoop->getLoopDepth(), RDepth = RLoop->getLoopDepth(); if (LDepth != RDepth) return (int)LDepth - (int)RDepth; @@ -645,7 +652,7 @@ static int CompareSCEVComplexity( // Lexicographically compare. 
for (unsigned i = 0; i != LNumOps; ++i) { int X = CompareSCEVComplexity(EqCacheSCEV, LI, LA->getOperand(i), - RA->getOperand(i), Depth + 1); + RA->getOperand(i), DT, Depth + 1); if (X != 0) return X; } @@ -669,7 +676,7 @@ static int CompareSCEVComplexity( if (i >= RNumOps) return 1; int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(i), - RC->getOperand(i), Depth + 1); + RC->getOperand(i), DT, Depth + 1); if (X != 0) return X; } @@ -683,10 +690,10 @@ static int CompareSCEVComplexity( // Lexicographically compare udiv expressions. int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getLHS(), RC->getLHS(), - Depth + 1); + DT, Depth + 1); if (X != 0) return X; - X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getRHS(), RC->getRHS(), + X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getRHS(), RC->getRHS(), DT, Depth + 1); if (X == 0) EqCacheSCEV.insert({LHS, RHS}); @@ -701,7 +708,7 @@ static int CompareSCEVComplexity( // Compare cast expressions by operand. int X = CompareSCEVComplexity(EqCacheSCEV, LI, LC->getOperand(), - RC->getOperand(), Depth + 1); + RC->getOperand(), DT, Depth + 1); if (X == 0) EqCacheSCEV.insert({LHS, RHS}); return X; @@ -724,7 +731,7 @@ static int CompareSCEVComplexity( /// land in memory. /// static void GroupByComplexity(SmallVectorImpl &Ops, - LoopInfo *LI) { + LoopInfo *LI, DominatorTree &DT) { if (Ops.size() < 2) return; // Noop SmallSet, 8> EqCache; @@ -732,15 +739,16 @@ static void GroupByComplexity(SmallVectorImpl &Ops, // This is the common case, which also happens to be trivially simple. // Special case it. const SCEV *&LHS = Ops[0], *&RHS = Ops[1]; - if (CompareSCEVComplexity(EqCache, LI, RHS, LHS) < 0) + if (CompareSCEVComplexity(EqCache, LI, RHS, LHS, DT) < 0) std::swap(LHS, RHS); return; } // Do the rough sort by complexity. std::stable_sort(Ops.begin(), Ops.end(), - [&EqCache, LI](const SCEV *LHS, const SCEV *RHS) { - return CompareSCEVComplexity(EqCache, LI, LHS, RHS) < 0; + [&EqCache, LI, &DT](const SCEV *LHS, const SCEV *RHS) { + return + CompareSCEVComplexity(EqCache, LI, LHS, RHS, DT) < 0; }); // Now that we are sorted by complexity, group elements of the same @@ -2186,7 +2194,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, #endif // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, &LI); + GroupByComplexity(Ops, &LI, DT); Flags = StrengthenNoWrapFlags(this, scAddExpr, Ops, Flags); @@ -2492,7 +2500,13 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, // added together. If so, we can fold them. for (unsigned OtherIdx = Idx+1; OtherIdx < Ops.size() && isa(Ops[OtherIdx]); - ++OtherIdx) + ++OtherIdx) { + // We expect the AddRecExpr's to be sorted in reverse dominance order, + // so that the 1st found AddRecExpr is dominated by all others. + assert(DT.dominates( + cast(Ops[OtherIdx])->getLoop()->getHeader(), + AddRec->getLoop()->getHeader()) && + "AddRecExprs are not sorted in reverse dominance order?"); if (AddRecLoop == cast(Ops[OtherIdx])->getLoop()) { // Other + {A,+,B} + {C,+,D} --> Other + {A+C,+,B+D} SmallVector AddRecOps(AddRec->op_begin(), @@ -2518,6 +2532,7 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl &Ops, Ops[Idx] = getAddRecExpr(AddRecOps, AddRecLoop, SCEV::FlagAnyWrap); return getAddExpr(Ops, SCEV::FlagAnyWrap, Depth + 1); } + } // Otherwise couldn't fold anything into this recurrence. Move onto the // next one. 
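The assert above depends on the ordering rule this patch adds to CompareSCEVComplexity. Isolated, the new addrec-loop comparison looks roughly like this (an illustrative extract, not the full comparator):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"

int compareAddRecLoops(const llvm::Loop *L, const llvm::Loop *R,
                       const llvm::DominatorTree &DT) {
  const llvm::BasicBlock *LH = L->getHeader(), *RH = R->getHeader();
  if (DT.dominates(LH, RH))
    return 1;  // L sorts after R, so dominated addrecs come first
  if (DT.dominates(RH, LH))
    return -1;
  // Loop depth is only the tie-breaker when neither header dominates.
  return (int)L->getLoopDepth() - (int)R->getLoopDepth();
}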
@@ -2614,7 +2629,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl &Ops, #endif // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, &LI); + GroupByComplexity(Ops, &LI, DT); Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags); @@ -3211,7 +3226,7 @@ ScalarEvolution::getSMaxExpr(SmallVectorImpl &Ops) { #endif // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, &LI); + GroupByComplexity(Ops, &LI, DT); // If there are any constants, fold them together. unsigned Idx = 0; @@ -3312,7 +3327,7 @@ ScalarEvolution::getUMaxExpr(SmallVectorImpl &Ops) { #endif // Sort by complexity, this groups all similar expression types together. - GroupByComplexity(Ops, &LI); + GroupByComplexity(Ops, &LI, DT); // If there are any constants, fold them together. unsigned Idx = 0; @@ -4636,7 +4651,7 @@ uint32_t ScalarEvolution::GetMinTrailingZerosImpl(const SCEV *S) { KnownBits Known(BitWidth); computeKnownBits(U->getValue(), Known, getDataLayout(), 0, &AC, nullptr, &DT); - return Known.Zero.countTrailingOnes(); + return Known.countMinTrailingZeros(); } // SCEVUDivExpr @@ -5955,6 +5970,30 @@ bool ScalarEvolution::BackedgeTakenInfo::hasOperand(const SCEV *S, return false; } +ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E) + : ExactNotTaken(E), MaxNotTaken(E), MaxOrZero(false) {} + +ScalarEvolution::ExitLimit::ExitLimit( + const SCEV *E, const SCEV *M, bool MaxOrZero, + ArrayRef *> PredSetList) + : ExactNotTaken(E), MaxNotTaken(M), MaxOrZero(MaxOrZero) { + assert((isa(ExactNotTaken) || + !isa(MaxNotTaken)) && + "Exact is not allowed to be less precise than Max"); + for (auto *PredSet : PredSetList) + for (auto *P : *PredSet) + addPredicate(P); +} + +ScalarEvolution::ExitLimit::ExitLimit( + const SCEV *E, const SCEV *M, bool MaxOrZero, + const SmallPtrSetImpl &PredSet) + : ExitLimit(E, M, MaxOrZero, {&PredSet}) {} + +ScalarEvolution::ExitLimit::ExitLimit(const SCEV *E, const SCEV *M, + bool MaxOrZero) + : ExitLimit(E, M, MaxOrZero, None) {} + /// Allocate memory for BackedgeTakenInfo and copy the not-taken count of each /// computable exit into a persistent ExitNotTakenInfo array. ScalarEvolution::BackedgeTakenInfo::BackedgeTakenInfo( @@ -6637,13 +6676,12 @@ ScalarEvolution::ExitLimit ScalarEvolution::computeShiftCompareExitLimit( // {K,ashr,} stabilizes to signum(K) in at most // bitwidth(K) iterations. Value *FirstValue = PN->getIncomingValueForBlock(Predecessor); - bool KnownZero, KnownOne; - ComputeSignBit(FirstValue, KnownZero, KnownOne, DL, 0, nullptr, - Predecessor->getTerminator(), &DT); + KnownBits Known = computeKnownBits(FirstValue, DL, 0, nullptr, + Predecessor->getTerminator(), &DT); auto *Ty = cast(RHS->getType()); - if (KnownZero) + if (Known.isNonNegative()) StableValue = ConstantInt::get(Ty, 0); - else if (KnownOne) + else if (Known.isNegative()) StableValue = ConstantInt::get(Ty, -1, true); else return getCouldNotCompute(); @@ -7377,48 +7415,49 @@ SolveQuadraticEquation(const SCEVAddRecExpr *AddRec, ScalarEvolution &SE) { const APInt &N = NC->getAPInt(); APInt Two(BitWidth, 2); - { - using namespace APIntOps; - const APInt& C = L; - // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C - // The B coefficient is M-N/2 - APInt B(M); - B -= N.sdiv(Two); + // Convert from chrec coefficients to polynomial coefficients AX^2+BX+C - // The A coefficient is N/2 - APInt A(N.sdiv(Two)); + // The A coefficient is N/2 + APInt A = N.sdiv(Two); - // Compute the B^2-4ac term. 
- APInt SqrtTerm(B); - SqrtTerm *= B; - SqrtTerm -= 4 * (A * C); + // The B coefficient is M-N/2 + APInt B = M; + B -= A; // A is the same as N/2. - if (SqrtTerm.isNegative()) { - // The loop is provably infinite. - return None; - } + // The C coefficient is L. + const APInt& C = L; - // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest - // integer value or else APInt::sqrt() will assert. - APInt SqrtVal(SqrtTerm.sqrt()); + // Compute the B^2-4ac term. + APInt SqrtTerm = B; + SqrtTerm *= B; + SqrtTerm -= 4 * (A * C); - // Compute the two solutions for the quadratic formula. - // The divisions must be performed as signed divisions. - APInt NegB(-B); - APInt TwoA(A << 1); - if (TwoA.isMinValue()) - return None; + if (SqrtTerm.isNegative()) { + // The loop is provably infinite. + return None; + } + + // Compute sqrt(B^2-4ac). This is guaranteed to be the nearest + // integer value or else APInt::sqrt() will assert. + APInt SqrtVal = SqrtTerm.sqrt(); + + // Compute the two solutions for the quadratic formula. + // The divisions must be performed as signed divisions. + APInt NegB = -std::move(B); + APInt TwoA = std::move(A); + TwoA <<= 1; + if (TwoA.isNullValue()) + return None; - LLVMContext &Context = SE.getContext(); + LLVMContext &Context = SE.getContext(); - ConstantInt *Solution1 = - ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA)); - ConstantInt *Solution2 = - ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA)); + ConstantInt *Solution1 = + ConstantInt::get(Context, (NegB + SqrtVal).sdiv(TwoA)); + ConstantInt *Solution2 = + ConstantInt::get(Context, (NegB - SqrtVal).sdiv(TwoA)); - return std::make_pair(cast(SE.getConstant(Solution1)), - cast(SE.getConstant(Solution2))); - } // end APIntOps namespace + return std::make_pair(cast(SE.getConstant(Solution1)), + cast(SE.getConstant(Solution2))); } ScalarEvolution::ExitLimit @@ -8976,7 +9015,7 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride, .getSignedMax(); // SMaxRHS + SMaxStrideMinusOne > SMaxValue => overflow! - return (std::move(MaxValue) - std::move(MaxStrideMinusOne)).slt(MaxRHS); + return (std::move(MaxValue) - MaxStrideMinusOne).slt(MaxRHS); } APInt MaxRHS = getUnsignedRange(RHS).getUnsignedMax(); @@ -8985,7 +9024,7 @@ bool ScalarEvolution::doesIVOverflowOnLT(const SCEV *RHS, const SCEV *Stride, .getUnsignedMax(); // UMaxRHS + UMaxStrideMinusOne > UMaxValue => overflow! - return (std::move(MaxValue) - std::move(MaxStrideMinusOne)).ult(MaxRHS); + return (std::move(MaxValue) - MaxStrideMinusOne).ult(MaxRHS); } bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, @@ -9002,7 +9041,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, .getSignedMax(); // SMinRHS - SMaxStrideMinusOne < SMinValue => overflow! - return (std::move(MinValue) + std::move(MaxStrideMinusOne)).sgt(MinRHS); + return (std::move(MinValue) + MaxStrideMinusOne).sgt(MinRHS); } APInt MinRHS = getUnsignedRange(RHS).getUnsignedMin(); @@ -9011,7 +9050,7 @@ bool ScalarEvolution::doesIVOverflowOnGT(const SCEV *RHS, const SCEV *Stride, .getUnsignedMax(); // UMinRHS - UMaxStrideMinusOne < UMinValue => overflow! 
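For reference, the coefficient mapping used by the SolveQuadraticEquation cleanup above comes from evaluating the chrec {L,+,M,+,N} at iteration x:

\[
f(x) = L + M\binom{x}{1} + N\binom{x}{2}
     = \frac{N}{2}x^2 + \Bigl(M - \frac{N}{2}\Bigr)x + L,
\]

so A = N/2, B = M - N/2 and C = L exactly as the code comments state, and the candidate trip counts are the integer roots x = (-B +/- sqrt(B^2 - 4AC)) / 2A, computed with signed APInt division.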
-  return (std::move(MinValue) + std::move(MaxStrideMinusOne)).ugt(MinRHS);
+  return (std::move(MinValue) + MaxStrideMinusOne).ugt(MinRHS);
 }
 
 const SCEV *ScalarEvolution::computeBECount(const SCEV *Delta, const SCEV *Step,
diff --git a/lib/Analysis/TargetLibraryInfo.cpp b/lib/Analysis/TargetLibraryInfo.cpp
index 848e1b4717b5..3cf1bbc5daa5 100644
--- a/lib/Analysis/TargetLibraryInfo.cpp
+++ b/lib/Analysis/TargetLibraryInfo.cpp
@@ -241,6 +241,50 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
     TLI.setUnavailable(LibFunc_tanhf);
   }
 
+  // These definitions are due to the math-finite.h header on Linux.
+  TLI.setUnavailable(LibFunc_acos_finite);
+  TLI.setUnavailable(LibFunc_acosf_finite);
+  TLI.setUnavailable(LibFunc_acosl_finite);
+  TLI.setUnavailable(LibFunc_acosh_finite);
+  TLI.setUnavailable(LibFunc_acoshf_finite);
+  TLI.setUnavailable(LibFunc_acoshl_finite);
+  TLI.setUnavailable(LibFunc_asin_finite);
+  TLI.setUnavailable(LibFunc_asinf_finite);
+  TLI.setUnavailable(LibFunc_asinl_finite);
+  TLI.setUnavailable(LibFunc_atan2_finite);
+  TLI.setUnavailable(LibFunc_atan2f_finite);
+  TLI.setUnavailable(LibFunc_atan2l_finite);
+  TLI.setUnavailable(LibFunc_atanh_finite);
+  TLI.setUnavailable(LibFunc_atanhf_finite);
+  TLI.setUnavailable(LibFunc_atanhl_finite);
+  TLI.setUnavailable(LibFunc_cosh_finite);
+  TLI.setUnavailable(LibFunc_coshf_finite);
+  TLI.setUnavailable(LibFunc_coshl_finite);
+  TLI.setUnavailable(LibFunc_exp10_finite);
+  TLI.setUnavailable(LibFunc_exp10f_finite);
+  TLI.setUnavailable(LibFunc_exp10l_finite);
+  TLI.setUnavailable(LibFunc_exp2_finite);
+  TLI.setUnavailable(LibFunc_exp2f_finite);
+  TLI.setUnavailable(LibFunc_exp2l_finite);
+  TLI.setUnavailable(LibFunc_exp_finite);
+  TLI.setUnavailable(LibFunc_expf_finite);
+  TLI.setUnavailable(LibFunc_expl_finite);
+  TLI.setUnavailable(LibFunc_log10_finite);
+  TLI.setUnavailable(LibFunc_log10f_finite);
+  TLI.setUnavailable(LibFunc_log10l_finite);
+  TLI.setUnavailable(LibFunc_log2_finite);
+  TLI.setUnavailable(LibFunc_log2f_finite);
+  TLI.setUnavailable(LibFunc_log2l_finite);
+  TLI.setUnavailable(LibFunc_log_finite);
+  TLI.setUnavailable(LibFunc_logf_finite);
+  TLI.setUnavailable(LibFunc_logl_finite);
+  TLI.setUnavailable(LibFunc_pow_finite);
+  TLI.setUnavailable(LibFunc_powf_finite);
+  TLI.setUnavailable(LibFunc_powl_finite);
+  TLI.setUnavailable(LibFunc_sinh_finite);
+  TLI.setUnavailable(LibFunc_sinhf_finite);
+  TLI.setUnavailable(LibFunc_sinhl_finite);
+
   // Win32 does *not* provide these functions, but they are
   // generally available on POSIX-compliant systems:
   TLI.setUnavailable(LibFunc_access);
@@ -496,7 +540,7 @@ static StringRef sanitizeFunctionName(StringRef funcName) {
   // Check for \01 prefix that is used to mangle __asm declarations and
   // strip it if present.
- return GlobalValue::getRealLinkageName(funcName); + return GlobalValue::dropLLVMManglingEscape(funcName); } bool TargetLibraryInfoImpl::getLibFunc(StringRef funcName, @@ -1004,22 +1048,34 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, return (NumParams == 1 && FTy.getParamType(0)->isFloatingPointTy()); case LibFunc_acos: + case LibFunc_acos_finite: case LibFunc_acosf: + case LibFunc_acosf_finite: case LibFunc_acosh: + case LibFunc_acosh_finite: case LibFunc_acoshf: + case LibFunc_acoshf_finite: case LibFunc_acoshl: + case LibFunc_acoshl_finite: case LibFunc_acosl: + case LibFunc_acosl_finite: case LibFunc_asin: + case LibFunc_asin_finite: case LibFunc_asinf: + case LibFunc_asinf_finite: case LibFunc_asinh: case LibFunc_asinhf: case LibFunc_asinhl: case LibFunc_asinl: + case LibFunc_asinl_finite: case LibFunc_atan: case LibFunc_atanf: case LibFunc_atanh: + case LibFunc_atanh_finite: case LibFunc_atanhf: + case LibFunc_atanhf_finite: case LibFunc_atanhl: + case LibFunc_atanhl_finite: case LibFunc_atanl: case LibFunc_cbrt: case LibFunc_cbrtf: @@ -1030,18 +1086,30 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, case LibFunc_cos: case LibFunc_cosf: case LibFunc_cosh: + case LibFunc_cosh_finite: case LibFunc_coshf: + case LibFunc_coshf_finite: case LibFunc_coshl: + case LibFunc_coshl_finite: case LibFunc_cosl: case LibFunc_exp10: + case LibFunc_exp10_finite: case LibFunc_exp10f: + case LibFunc_exp10f_finite: case LibFunc_exp10l: + case LibFunc_exp10l_finite: case LibFunc_exp2: + case LibFunc_exp2_finite: case LibFunc_exp2f: + case LibFunc_exp2f_finite: case LibFunc_exp2l: + case LibFunc_exp2l_finite: case LibFunc_exp: + case LibFunc_exp_finite: case LibFunc_expf: + case LibFunc_expf_finite: case LibFunc_expl: + case LibFunc_expl_finite: case LibFunc_expm1: case LibFunc_expm1f: case LibFunc_expm1l: @@ -1052,20 +1120,29 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, case LibFunc_floorf: case LibFunc_floorl: case LibFunc_log10: + case LibFunc_log10_finite: case LibFunc_log10f: + case LibFunc_log10f_finite: case LibFunc_log10l: + case LibFunc_log10l_finite: case LibFunc_log1p: case LibFunc_log1pf: case LibFunc_log1pl: case LibFunc_log2: + case LibFunc_log2_finite: case LibFunc_log2f: + case LibFunc_log2f_finite: case LibFunc_log2l: + case LibFunc_log2l_finite: case LibFunc_log: + case LibFunc_log_finite: case LibFunc_logb: case LibFunc_logbf: case LibFunc_logbl: case LibFunc_logf: + case LibFunc_logf_finite: case LibFunc_logl: + case LibFunc_logl_finite: case LibFunc_nearbyint: case LibFunc_nearbyintf: case LibFunc_nearbyintl: @@ -1078,8 +1155,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, case LibFunc_sin: case LibFunc_sinf: case LibFunc_sinh: + case LibFunc_sinh_finite: case LibFunc_sinhf: + case LibFunc_sinhf_finite: case LibFunc_sinhl: + case LibFunc_sinhl_finite: case LibFunc_sinl: case LibFunc_sqrt: case LibFunc_sqrt_finite: @@ -1100,8 +1180,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, FTy.getReturnType() == FTy.getParamType(0)); case LibFunc_atan2: + case LibFunc_atan2_finite: case LibFunc_atan2f: + case LibFunc_atan2f_finite: case LibFunc_atan2l: + case LibFunc_atan2l_finite: case LibFunc_fmin: case LibFunc_fminf: case LibFunc_fminl: @@ -1115,8 +1198,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy, case LibFunc_copysignf: case LibFunc_copysignl: case LibFunc_pow: + case LibFunc_pow_finite: 
case LibFunc_powf: + case LibFunc_powf_finite: case LibFunc_powl: + case LibFunc_powl_finite: return (NumParams == 2 && FTy.getReturnType()->isFloatingPointTy() && FTy.getReturnType() == FTy.getParamType(0) && FTy.getReturnType() == FTy.getParamType(1)); @@ -1294,6 +1380,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( {"powf", "__svml_powf8", 8}, {"powf", "__svml_powf16", 16}, + { "__pow_finite", "__svml_pow2", 2 }, + { "__pow_finite", "__svml_pow4", 4 }, + { "__pow_finite", "__svml_pow8", 8 }, + + { "__powf_finite", "__svml_powf4", 4 }, + { "__powf_finite", "__svml_powf8", 8 }, + { "__powf_finite", "__svml_powf16", 16 }, + {"llvm.pow.f64", "__svml_pow2", 2}, {"llvm.pow.f64", "__svml_pow4", 4}, {"llvm.pow.f64", "__svml_pow8", 8}, @@ -1310,6 +1404,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( {"expf", "__svml_expf8", 8}, {"expf", "__svml_expf16", 16}, + { "__exp_finite", "__svml_exp2", 2 }, + { "__exp_finite", "__svml_exp4", 4 }, + { "__exp_finite", "__svml_exp8", 8 }, + + { "__expf_finite", "__svml_expf4", 4 }, + { "__expf_finite", "__svml_expf8", 8 }, + { "__expf_finite", "__svml_expf16", 16 }, + {"llvm.exp.f64", "__svml_exp2", 2}, {"llvm.exp.f64", "__svml_exp4", 4}, {"llvm.exp.f64", "__svml_exp8", 8}, @@ -1326,6 +1428,14 @@ void TargetLibraryInfoImpl::addVectorizableFunctionsFromVecLib( {"logf", "__svml_logf8", 8}, {"logf", "__svml_logf16", 16}, + { "__log_finite", "__svml_log2", 2 }, + { "__log_finite", "__svml_log4", 4 }, + { "__log_finite", "__svml_log8", 8 }, + + { "__logf_finite", "__svml_logf4", 4 }, + { "__logf_finite", "__svml_logf8", 8 }, + { "__logf_finite", "__svml_logf16", 16 }, + {"llvm.log.f64", "__svml_log2", 2}, {"llvm.log.f64", "__svml_log4", 4}, {"llvm.log.f64", "__svml_log8", 8}, diff --git a/lib/Analysis/TargetTransformInfo.cpp b/lib/Analysis/TargetTransformInfo.cpp index 26d606cce9bb..8a5d10473662 100644 --- a/lib/Analysis/TargetTransformInfo.cpp +++ b/lib/Analysis/TargetTransformInfo.cpp @@ -279,6 +279,10 @@ unsigned TargetTransformInfo::getRegisterBitWidth(bool Vector) const { return TTIImpl->getRegisterBitWidth(Vector); } +unsigned TargetTransformInfo::getMinVectorRegisterBitWidth() const { + return TTIImpl->getMinVectorRegisterBitWidth(); +} + bool TargetTransformInfo::shouldConsiderAddressTypePromotion( const Instruction &I, bool &AllowPromotionWithoutCommonHeader) const { return TTIImpl->shouldConsiderAddressTypePromotion( @@ -500,6 +504,15 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF, return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy); } +bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, + Type *Ty, ReductionFlags Flags) const { + return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags); +} + +bool TargetTransformInfo::shouldExpandReduction(const IntrinsicInst *II) const { + return TTIImpl->shouldExpandReduction(II); +} + TargetTransformInfo::Concept::~Concept() {} TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index a7f3ff672aef..cba7363a0afa 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -88,9 +88,8 @@ struct Query { /// classic case of this is assume(x = y), which will attempt to determine /// bits in x from bits in y, which will attempt to determine bits in y from /// bits in x, etc. 
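// Aside: a hypothetical end-to-end illustration (not part of this patch) of
// why TargetLibraryInfo learns the __*_finite names and their SVML vector
// variants above. With glibc's math-finite.h in effect (e.g. under
// -ffast-math), a call to expf() is redirected to __expf_finite; the new
// vectorizable-function entries then let a loop like this be widened into
// calls such as __svml_expf8 when built with something like:
//   clang -O2 -ffast-math -fveclib=SVML example.cpp
#include <math.h>

void expAll(float *Out, const float *In, int N) {
  for (int I = 0; I < N; ++I)
    Out[I] = expf(In[I]); // may become __expf_finite, then __svml_expf8
}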
Regarding the mutual recursion, computeKnownBits can call - /// isKnownNonZero, which calls computeKnownBits and ComputeSignBit and - /// isKnownToBeAPowerOfTwo (all of which can call computeKnownBits), and so - /// on. + /// isKnownNonZero, which calls computeKnownBits and isKnownToBeAPowerOfTwo + /// (all of which can call computeKnownBits), and so on. std::array Excluded; unsigned NumExcluded; @@ -143,6 +142,16 @@ void llvm::computeKnownBits(const Value *V, KnownBits &Known, Query(DL, AC, safeCxtI(V, CxtI), DT, ORE)); } +static KnownBits computeKnownBits(const Value *V, unsigned Depth, + const Query &Q); + +KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL, + unsigned Depth, AssumptionCache *AC, + const Instruction *CxtI, + const DominatorTree *DT) { + return ::computeKnownBits(V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT)); +} + bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, const DataLayout &DL, AssumptionCache *AC, const Instruction *CxtI, @@ -159,16 +168,6 @@ bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS, return (LHSKnown.Zero | RHSKnown.Zero).isAllOnesValue(); } -static void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne, - unsigned Depth, const Query &Q); - -void llvm::ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne, - const DataLayout &DL, unsigned Depth, - AssumptionCache *AC, const Instruction *CxtI, - const DominatorTree *DT) { - ::ComputeSignBit(V, KnownZero, KnownOne, Depth, - Query(DL, AC, safeCxtI(V, CxtI), DT)); -} static bool isKnownToBeAPowerOfTwo(const Value *V, bool OrZero, unsigned Depth, const Query &Q); @@ -194,9 +193,8 @@ bool llvm::isKnownNonNegative(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { - bool NonNegative, Negative; - ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT); - return NonNegative; + KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT); + return Known.isNonNegative(); } bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth, @@ -214,9 +212,8 @@ bool llvm::isKnownPositive(const Value *V, const DataLayout &DL, unsigned Depth, bool llvm::isKnownNegative(const Value *V, const DataLayout &DL, unsigned Depth, AssumptionCache *AC, const Instruction *CxtI, const DominatorTree *DT) { - bool NonNegative, Negative; - ComputeSignBit(V, NonNegative, Negative, DL, Depth, AC, CxtI, DT); - return Negative; + KnownBits Known = computeKnownBits(V, DL, Depth, AC, CxtI, DT); + return Known.isNegative(); } static bool isKnownNonEqual(const Value *V1, const Value *V2, const Query &Q); @@ -342,10 +339,10 @@ static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW, // Also compute a conservative estimate for high known-0 bits. // More trickiness is possible, but this is sufficient for the // interesting case of alignment computation. 
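// Aside: the shape of the new value-returning computeKnownBits() overload
// introduced above, sketched with stand-in types (MiniKnownBits and
// computeBits are invented for illustration). Returning the struct lets
// callers such as isKnownNonNegative() query sign facts directly instead of
// threading a pair of bool out-parameters through every caller.
#include <cassert>
#include <cstdint>

struct MiniKnownBits {
  uint64_t Zero = 0, One = 0;          // known-zero / known-one masks
  bool isNonNegative() const { return (Zero >> 63) & 1; } // sign bit known 0
  bool isNegative() const { return (One >> 63) & 1; }     // sign bit known 1
};

static MiniKnownBits computeBits(int64_t V) {
  uint64_t U = static_cast<uint64_t>(V);
  return {~U, U}; // a constant: every bit is known exactly
}

static bool isKnownNonNegative(int64_t V) {
  return computeBits(V).isNonNegative(); // mirrors the rewritten wrapper
}

int main() {
  assert(isKnownNonNegative(42));
  assert(computeBits(-1).isNegative());
}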
- unsigned TrailZ = Known.Zero.countTrailingOnes() + - Known2.Zero.countTrailingOnes(); - unsigned LeadZ = std::max(Known.Zero.countLeadingOnes() + - Known2.Zero.countLeadingOnes(), + unsigned TrailZ = Known.countMinTrailingZeros() + + Known2.countMinTrailingZeros(); + unsigned LeadZ = std::max(Known.countMinLeadingZeros() + + Known2.countMinLeadingZeros(), BitWidth) - BitWidth; TrailZ = std::min(TrailZ, BitWidth); @@ -750,8 +747,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known, computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I)); // Whatever high bits in c are zero are known to be zero. - Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes()); - // assume(v <_u c) + Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros()); + // assume(v <_u c) } else if (match(Arg, m_ICmp(Pred, m_V, m_Value(A))) && Pred == ICmpInst::ICMP_ULT && isValidAssumeForContext(I, Q.CxtI, Q.DT)) { @@ -761,9 +758,9 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known, // Whatever high bits in c are zero are known to be zero (if c is a power // of 2, then one more). if (isKnownToBeAPowerOfTwo(A, false, Depth + 1, Query(Q, I))) - Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes()+1); + Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros() + 1); else - Known.Zero.setHighBits(RHSKnown.Zero.countLeadingOnes()); + Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros()); } } @@ -916,7 +913,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, m_Value(Y))))) { Known2.resetAll(); computeKnownBits(Y, Known2, Depth + 1, Q); - if (Known2.One.countTrailingOnes() > 0) + if (Known2.countMinTrailingOnes() > 0) Known.Zero.setBit(0); } break; @@ -953,14 +950,13 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); - unsigned LeadZ = Known2.Zero.countLeadingOnes(); + unsigned LeadZ = Known2.countMinLeadingZeros(); Known2.resetAll(); computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); - unsigned RHSUnknownLeadingOnes = Known2.One.countLeadingZeros(); - if (RHSUnknownLeadingOnes != BitWidth) - LeadZ = std::min(BitWidth, - LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); + unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros(); + if (RHSMaxLeadingZeros != BitWidth) + LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1); Known.Zero.setHighBits(LeadZ); break; @@ -983,8 +979,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, if (Known.isNegative() && Known2.isNegative()) // We can derive a lower bound on the result by taking the max of the // leading one bits. - MaxHighOnes = std::max(Known.One.countLeadingOnes(), - Known2.One.countLeadingOnes()); + MaxHighOnes = + std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes()); // If either side is non-negative, the result is non-negative. else if (Known.isNonNegative() || Known2.isNonNegative()) MaxHighZeros = 1; @@ -993,8 +989,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, if (Known.isNonNegative() && Known2.isNonNegative()) // We can derive an upper bound on the result by taking the max of the // leading zero bits. 
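// Aside: the multiply rule being migrated to countMinTrailingZeros() above,
// checked standalone with hypothetical operands: a product has at least as
// many trailing zero bits as its operands' guaranteed trailing zeros
// combined, because the factors of two multiply.
#include <cassert>

static unsigned tz(unsigned V) { // count trailing zeros, V != 0
  unsigned N = 0;
  while (!(V & 1)) { V >>= 1; ++N; }
  return N;
}

int main() {
  unsigned A = 12; // 0b1100   -> 2 trailing zeros
  unsigned B = 40; // 0b101000 -> 3 trailing zeros
  assert(tz(A * B) >= tz(A) + tz(B)); // 480 = 0b111100000 has 5
}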
- MaxHighZeros = std::max(Known.Zero.countLeadingOnes(), - Known2.Zero.countLeadingOnes()); + MaxHighZeros = std::max(Known.countMinLeadingZeros(), + Known2.countMinLeadingZeros()); // If either side is negative, the result is negative. else if (Known.isNegative() || Known2.isNegative()) MaxHighOnes = 1; @@ -1002,12 +998,12 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, // We can derive a lower bound on the result by taking the max of the // leading one bits. MaxHighOnes = - std::max(Known.One.countLeadingOnes(), Known2.One.countLeadingOnes()); + std::max(Known.countMinLeadingOnes(), Known2.countMinLeadingOnes()); } else if (SPF == SPF_UMIN) { // We can derive an upper bound on the result by taking the max of the // leading zero bits. MaxHighZeros = - std::max(Known.Zero.countLeadingOnes(), Known2.Zero.countLeadingOnes()); + std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); } // Only known if known in both the LHS and RHS. @@ -1185,8 +1181,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, computeKnownBits(I->getOperand(0), Known, Depth + 1, Q); computeKnownBits(I->getOperand(1), Known2, Depth + 1, Q); - unsigned Leaders = std::max(Known.Zero.countLeadingOnes(), - Known2.Zero.countLeadingOnes()); + unsigned Leaders = + std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); Known.resetAll(); Known.Zero.setHighBits(Leaders); break; @@ -1207,7 +1203,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, // to determine if we can prove known low zero bits. KnownBits LocalKnown(BitWidth); computeKnownBits(I->getOperand(0), LocalKnown, Depth + 1, Q); - unsigned TrailZ = LocalKnown.Zero.countTrailingOnes(); + unsigned TrailZ = LocalKnown.countMinTrailingZeros(); gep_type_iterator GTI = gep_type_begin(I); for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) { @@ -1241,7 +1237,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, computeKnownBits(Index, LocalKnown, Depth + 1, Q); TrailZ = std::min(TrailZ, unsigned(countTrailingZeros(TypeSize) + - LocalKnown.Zero.countTrailingOnes())); + LocalKnown.countMinTrailingZeros())); } } @@ -1286,8 +1282,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, KnownBits Known3(Known); computeKnownBits(L, Known3, Depth + 1, Q); - Known.Zero.setLowBits(std::min(Known2.Zero.countTrailingOnes(), - Known3.Zero.countTrailingOnes())); + Known.Zero.setLowBits(std::min(Known2.countMinTrailingZeros(), + Known3.countMinTrailingZeros())); if (DontImproveNonNegativePhiBits) break; @@ -1386,12 +1382,25 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, Known.Zero |= Known2.Zero.byteSwap(); Known.One |= Known2.One.byteSwap(); break; - case Intrinsic::ctlz: + case Intrinsic::ctlz: { + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + // If we have a known 1, its position is our upper bound. + unsigned PossibleLZ = Known2.One.countLeadingZeros(); + // If this call is undefined for 0, the result will be less than 2^n. + if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) + PossibleLZ = std::min(PossibleLZ, BitWidth - 1); + unsigned LowBits = Log2_32(PossibleLZ)+1; + Known.Zero.setBitsFrom(LowBits); + break; + } case Intrinsic::cttz: { - unsigned LowBits = Log2_32(BitWidth)+1; + computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); + // If we have a known 1, its position is our upper bound. 
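// Aside: the bound used by the rewritten ctlz/cttz cases in this hunk,
// verified exhaustively for small values (the bit position P is
// hypothetical). If bit P is known to be one, then cttz(x) <= P, so the
// result fits in Log2(P)+1 bits and every higher bit of the result is
// known zero.
#include <cassert>
#include <cstdint>

static unsigned cttz(uint32_t X) { // X != 0
  unsigned N = 0;
  while (!(X & 1)) { X >>= 1; ++N; }
  return N;
}

int main() {
  const unsigned P = 5;       // suppose bit 5 is known one
  const unsigned LowBits = 3; // Log2_32(5) + 1
  for (uint32_t X = 1; X < 1024; ++X)
    if (X & (1u << P))
      assert(cttz(X) < (1u << LowBits)); // result uses only 3 low bits
}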
+ unsigned PossibleTZ = Known2.One.countTrailingZeros(); // If this call is undefined for 0, the result will be less than 2^n. if (II->getArgOperand(1) == ConstantInt::getTrue(II->getContext())) - LowBits -= 1; + PossibleTZ = std::min(PossibleTZ, BitWidth - 1); + unsigned LowBits = Log2_32(PossibleTZ)+1; Known.Zero.setBitsFrom(LowBits); break; } @@ -1399,7 +1408,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q); // We can bound the space the count needs. Also, bits known to be zero // can't contribute to the population. - unsigned BitsPossiblySet = BitWidth - Known2.Zero.countPopulation(); + unsigned BitsPossiblySet = Known2.countMaxPopulation(); unsigned LowBits = Log2_32(BitsPossiblySet)+1; Known.Zero.setBitsFrom(LowBits); // TODO: we could bound KnownOne using the lower bound on the number @@ -1449,6 +1458,14 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known, } } +/// Determine which bits of V are known to be either zero or one and return +/// them. +KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) { + KnownBits Known(getBitWidth(V->getType(), Q.DL)); + computeKnownBits(V, Known, Depth, Q); + return Known; +} + /// Determine which bits of V are known to be either zero or one and return /// them in the Known bit set. /// @@ -1568,16 +1585,6 @@ void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth, assert((Known.Zero & Known.One) == 0 && "Bits known to be one AND zero?"); } -/// Determine whether the sign bit is known to be zero or one. -/// Convenience wrapper around computeKnownBits. -void ComputeSignBit(const Value *V, bool &KnownZero, bool &KnownOne, - unsigned Depth, const Query &Q) { - KnownBits Bits(getBitWidth(V->getType(), Q.DL)); - computeKnownBits(V, Bits, Depth, Q); - KnownOne = Bits.isNegative(); - KnownZero = Bits.isNonNegative(); -} - /// Return true if the given value is known to have exactly one /// bit set when defined. For vectors return true if every element is known to /// be a power of two when defined. Supports values with integer or pointer @@ -1842,24 +1849,20 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { if (BO->isExact()) return isKnownNonZero(X, Depth, Q); - bool XKnownNonNegative, XKnownNegative; - ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q); - if (XKnownNegative) + KnownBits Known = computeKnownBits(X, Depth, Q); + if (Known.isNegative()) return true; // If the shifter operand is a constant, and all of the bits shifted // out are known to be zero, and X is known non-zero then at least one // non-zero bit must remain. if (ConstantInt *Shift = dyn_cast(Y)) { - KnownBits Known(BitWidth); - computeKnownBits(X, Known, Depth, Q); - auto ShiftVal = Shift->getLimitedValue(BitWidth - 1); // Is there a known one in the portion not shifted out? - if (Known.One.countLeadingZeros() < BitWidth - ShiftVal) + if (Known.countMaxLeadingZeros() < BitWidth - ShiftVal) return true; // Are all the bits to be shifted out known zero? - if (Known.Zero.countTrailingOnes() >= ShiftVal) + if (Known.countMinTrailingZeros() >= ShiftVal) return isKnownNonZero(X, Depth, Q); } } @@ -1869,39 +1872,34 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) { } // X + Y. 
else if (match(V, m_Add(m_Value(X), m_Value(Y)))) { - bool XKnownNonNegative, XKnownNegative; - bool YKnownNonNegative, YKnownNegative; - ComputeSignBit(X, XKnownNonNegative, XKnownNegative, Depth, Q); - ComputeSignBit(Y, YKnownNonNegative, YKnownNegative, Depth, Q); + KnownBits XKnown = computeKnownBits(X, Depth, Q); + KnownBits YKnown = computeKnownBits(Y, Depth, Q); // If X and Y are both non-negative (as signed values) then their sum is not // zero unless both X and Y are zero. - if (XKnownNonNegative && YKnownNonNegative) + if (XKnown.isNonNegative() && YKnown.isNonNegative()) if (isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q)) return true; // If X and Y are both negative (as signed values) then their sum is not // zero unless both X and Y equal INT_MIN. - if (XKnownNegative && YKnownNegative) { - KnownBits Known(BitWidth); + if (XKnown.isNegative() && YKnown.isNegative()) { APInt Mask = APInt::getSignedMaxValue(BitWidth); // The sign bit of X is set. If some other bit is set then X is not equal // to INT_MIN. - computeKnownBits(X, Known, Depth, Q); - if (Known.One.intersects(Mask)) + if (XKnown.One.intersects(Mask)) return true; // The sign bit of Y is set. If some other bit is set then Y is not equal // to INT_MIN. - computeKnownBits(Y, Known, Depth, Q); - if (Known.One.intersects(Mask)) + if (YKnown.One.intersects(Mask)) return true; } // The sum of a non-negative number and a power of two is not zero. - if (XKnownNonNegative && + if (XKnown.isNonNegative() && isKnownToBeAPowerOfTwo(Y, /*OrZero*/ false, Depth, Q)) return true; - if (YKnownNonNegative && + if (YKnown.isNonNegative() && isKnownToBeAPowerOfTwo(X, /*OrZero*/ false, Depth, Q)) return true; } @@ -2276,14 +2274,7 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth, // If we know that the sign bit is either zero or one, determine the number of // identical bits in the top of the input value. - if (Known.isNonNegative()) - return std::max(FirstAnswer, Known.Zero.countLeadingOnes()); - - if (Known.isNegative()) - return std::max(FirstAnswer, Known.One.countLeadingOnes()); - - // computeKnownBits gave us no extra information about the top bits. - return FirstAnswer; + return std::max(FirstAnswer, Known.countMinSignBits()); } /// This function computes the integer multiple of Base that equals V. @@ -3441,8 +3432,8 @@ OverflowResult llvm::computeOverflowForUnsignedMul(const Value *LHS, computeKnownBits(RHS, RHSKnown, DL, /*Depth=*/0, AC, CxtI, DT); // Note that underestimating the number of zero bits gives a more // conservative answer. - unsigned ZeroBits = LHSKnown.Zero.countLeadingOnes() + - RHSKnown.Zero.countLeadingOnes(); + unsigned ZeroBits = LHSKnown.countMinLeadingZeros() + + RHSKnown.countMinLeadingZeros(); // First handle the easy case: if we have enough zero bits there's // definitely no overflow. 
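// Aside: the leading-zeros argument above in standalone form (operand values
// hypothetical): if the guaranteed leading-zero counts of the operands sum to
// at least the bit width, the full product fits in that width and the
// unsigned multiply cannot wrap.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t LHS = 0xFFF;   // at most 12 significant bits: 20 leading zeros
  uint32_t RHS = 0xFFFFF; // at most 20 significant bits: 12 leading zeros
  uint64_t Wide = uint64_t(LHS) * RHS;
  assert(Wide <= UINT32_MAX); // 20 + 12 == 32 => no overflow possible
}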
   if (ZeroBits >= BitWidth)
@@ -3475,21 +3466,17 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
                                                    AssumptionCache *AC,
                                                    const Instruction *CxtI,
                                                    const DominatorTree *DT) {
-  bool LHSKnownNonNegative, LHSKnownNegative;
-  ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0,
-                 AC, CxtI, DT);
-  if (LHSKnownNonNegative || LHSKnownNegative) {
-    bool RHSKnownNonNegative, RHSKnownNegative;
-    ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0,
-                   AC, CxtI, DT);
-
-    if (LHSKnownNegative && RHSKnownNegative) {
+  KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT);
+  if (LHSKnown.isNonNegative() || LHSKnown.isNegative()) {
+    KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT);
+
+    if (LHSKnown.isNegative() && RHSKnown.isNegative()) {
       // The sign bit is set in both cases: this MUST overflow.
       // Create a simple add instruction, and insert it into the struct.
       return OverflowResult::AlwaysOverflows;
     }
 
-    if (LHSKnownNonNegative && RHSKnownNonNegative) {
+    if (LHSKnown.isNonNegative() && RHSKnown.isNonNegative()) {
       // The sign bit is clear in both cases: this CANNOT overflow.
       // Create a simple add instruction, and insert it into the struct.
       return OverflowResult::NeverOverflows;
@@ -3499,6 +3486,51 @@ OverflowResult llvm::computeOverflowForUnsignedAdd(const Value *LHS,
   return OverflowResult::MayOverflow;
 }
 
+/// \brief Return true if we can prove that adding the two values of the
+/// KnownBits will not overflow.
+/// Otherwise return false.
+static bool checkRippleForSignedAdd(const KnownBits &LHSKnown,
+                                    const KnownBits &RHSKnown) {
+  // Addition of two 2's complement numbers having opposite signs will never
+  // overflow.
+  if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) ||
+      (LHSKnown.isNonNegative() && RHSKnown.isNegative()))
+    return true;
+
+  // If either of the values is known to be non-negative, adding them can only
+  // overflow if the second is also non-negative, so we can assume that.
+  // Two non-negative numbers will only overflow if there is a carry to the
+  // sign bit, so we can check if even when the values are as big as possible
+  // there is no overflow to the sign bit.
+  if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) {
+    APInt MaxLHS = ~LHSKnown.Zero;
+    MaxLHS.clearSignBit();
+    APInt MaxRHS = ~RHSKnown.Zero;
+    MaxRHS.clearSignBit();
+    APInt Result = std::move(MaxLHS) + std::move(MaxRHS);
+    return Result.isSignBitClear();
+  }
+
+  // If either of the values is known to be negative, adding them can only
+  // overflow if the second is also negative, so we can assume that.
+  // Two negative numbers will only overflow if there is no carry to the sign
+  // bit, so we can check if even when the values are as small as possible
+  // there is overflow to the sign bit.
+  if (LHSKnown.isNegative() || RHSKnown.isNegative()) {
+    APInt MinLHS = LHSKnown.One;
+    MinLHS.clearSignBit();
+    APInt MinRHS = RHSKnown.One;
+    MinRHS.clearSignBit();
+    APInt Result = std::move(MinLHS) + std::move(MinRHS);
+    return Result.isSignBitSet();
+  }
+
+  // If we reached here it means that we know nothing about the sign bits.
+  // In this case we can't know if there will be an overflow, since by
+  // changing the sign bits any two values can be made to overflow.
+ return false; +} + static OverflowResult computeOverflowForSignedAdd(const Value *LHS, const Value *RHS, const AddOperator *Add, @@ -3510,18 +3542,29 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS, return OverflowResult::NeverOverflows; } - bool LHSKnownNonNegative, LHSKnownNegative; - bool RHSKnownNonNegative, RHSKnownNegative; - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, DL, /*Depth=*/0, - AC, CxtI, DT); - ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, DL, /*Depth=*/0, - AC, CxtI, DT); + // If LHS and RHS each have at least two sign bits, the addition will look + // like + // + // XX..... + + // YY..... + // + // If the carry into the most significant position is 0, X and Y can't both + // be 1 and therefore the carry out of the addition is also 0. + // + // If the carry into the most significant position is 1, X and Y can't both + // be 0 and therefore the carry out of the addition is also 1. + // + // Since the carry into the most significant position is always equal to + // the carry out of the addition, there is no signed overflow. + if (ComputeNumSignBits(LHS, DL, 0, AC, CxtI, DT) > 1 && + ComputeNumSignBits(RHS, DL, 0, AC, CxtI, DT) > 1) + return OverflowResult::NeverOverflows; + + KnownBits LHSKnown = computeKnownBits(LHS, DL, /*Depth=*/0, AC, CxtI, DT); + KnownBits RHSKnown = computeKnownBits(RHS, DL, /*Depth=*/0, AC, CxtI, DT); - if ((LHSKnownNonNegative && RHSKnownNegative) || - (LHSKnownNegative && RHSKnownNonNegative)) { - // The sign bits are opposite: this CANNOT overflow. + if (checkRippleForSignedAdd(LHSKnown, RHSKnown)) return OverflowResult::NeverOverflows; - } // The remaining code needs Add to be available. Early returns if not so. if (!Add) @@ -3532,14 +3575,13 @@ static OverflowResult computeOverflowForSignedAdd(const Value *LHS, // @llvm.assume'ed non-negative rather than proved so from analyzing its // operands. 
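// Aside: the "at least two sign bits" argument above, checked exhaustively
// in miniature for 8-bit values. Two sign bits means the value lies in
// [-64, 63], so any sum lies in [-128, 126] and signed 8-bit addition cannot
// overflow; the carry into the sign position always equals the carry out.
#include <cassert>

int main() {
  for (int L = -64; L <= 63; ++L)
    for (int R = -64; R <= 63; ++R) {
      int S = L + R;
      assert(S >= -128 && S <= 127); // always representable in int8_t
    }
}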
bool LHSOrRHSKnownNonNegative = - (LHSKnownNonNegative || RHSKnownNonNegative); - bool LHSOrRHSKnownNegative = (LHSKnownNegative || RHSKnownNegative); + (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()); + bool LHSOrRHSKnownNegative = + (LHSKnown.isNegative() || RHSKnown.isNegative()); if (LHSOrRHSKnownNonNegative || LHSOrRHSKnownNegative) { - bool AddKnownNonNegative, AddKnownNegative; - ComputeSignBit(Add, AddKnownNonNegative, AddKnownNegative, DL, - /*Depth=*/0, AC, CxtI, DT); - if ((AddKnownNonNegative && LHSOrRHSKnownNonNegative) || - (AddKnownNegative && LHSOrRHSKnownNegative)) { + KnownBits AddKnown = computeKnownBits(Add, DL, /*Depth=*/0, AC, CxtI, DT); + if ((AddKnown.isNonNegative() && LHSOrRHSKnownNonNegative) || + (AddKnown.isNegative() && LHSOrRHSKnownNegative)) { return OverflowResult::NeverOverflows; } } diff --git a/lib/Analysis/VectorUtils.cpp b/lib/Analysis/VectorUtils.cpp index 722f17a8067e..2d2249da4e13 100644 --- a/lib/Analysis/VectorUtils.cpp +++ b/lib/Analysis/VectorUtils.cpp @@ -23,6 +23,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Value.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/IRBuilder.h" using namespace llvm; using namespace llvm::PatternMatch; diff --git a/lib/AsmParser/LLParser.cpp b/lib/AsmParser/LLParser.cpp index 97a567565b47..d7602c83435c 100644 --- a/lib/AsmParser/LLParser.cpp +++ b/lib/AsmParser/LLParser.cpp @@ -162,6 +162,10 @@ bool LLParser::ValidateEndOfModule() { AS = AS.addAttributes(Context, AttributeList::FunctionIndex, AttributeSet::get(Context, FnAttrs)); II->setAttributes(AS); + } else if (auto *GV = dyn_cast(V)) { + AttrBuilder Attrs(GV->getAttributes()); + Attrs.merge(B); + GV->setAttributes(AttributeSet::get(Context,Attrs)); } else { llvm_unreachable("invalid object with forward attribute group reference"); } @@ -832,10 +836,10 @@ bool LLParser::parseIndirectSymbol( /// ParseGlobal /// ::= GlobalVar '=' OptionalLinkage OptionalVisibility OptionalDLLStorageClass /// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace -/// OptionalExternallyInitialized GlobalType Type Const +/// OptionalExternallyInitialized GlobalType Type Const OptionalAttrs /// ::= OptionalLinkage OptionalVisibility OptionalDLLStorageClass /// OptionalThreadLocal OptionalUnnamedAddr OptionalAddrSpace -/// OptionalExternallyInitialized GlobalType Type Const +/// OptionalExternallyInitialized GlobalType Type Const OptionalAttrs /// /// Everything up to and including OptionalUnnamedAddr has been parsed /// already. @@ -950,6 +954,16 @@ bool LLParser::ParseGlobal(const std::string &Name, LocTy NameLoc, } } + AttrBuilder Attrs; + LocTy BuiltinLoc; + std::vector FwdRefAttrGrps; + if (ParseFnAttributeValuePairs(Attrs, FwdRefAttrGrps, false, BuiltinLoc)) + return true; + if (Attrs.hasAttributes() || !FwdRefAttrGrps.empty()) { + GV->setAttributes(AttributeSet::get(Context, Attrs)); + ForwardRefAttrGroups[GV] = FwdRefAttrGrps; + } + return false; } diff --git a/lib/Bitcode/Reader/BitcodeReader.cpp b/lib/Bitcode/Reader/BitcodeReader.cpp index 580261a3b5e0..76298121566a 100644 --- a/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/lib/Bitcode/Reader/BitcodeReader.cpp @@ -93,13 +93,6 @@ static cl::opt PrintSummaryGUIDs( cl::desc( "Print the global id for each value when reading the module summary")); -// FIXME: This flag should either be removed or moved to clang as a driver flag. 
-static llvm::cl::opt IgnoreEmptyThinLTOIndexFile( - "ignore-empty-index-file", llvm::cl::ZeroOrMore, - llvm::cl::desc( - "Ignore an empty index file and perform non-ThinLTO compilation"), - llvm::cl::init(false)); - namespace { enum { @@ -2750,7 +2743,7 @@ Error BitcodeReader::parseComdatRecord(ArrayRef Record) { Error BitcodeReader::parseGlobalVarRecord(ArrayRef Record) { // v1: [pointer type, isconst, initid, linkage, alignment, section, // visibility, threadlocal, unnamed_addr, externally_initialized, - // dllstorageclass, comdat] (name in VST) + // dllstorageclass, comdat, attributes] (name in VST) // v2: [strtab_offset, strtab_size, v1] StringRef Name; std::tie(Name, Record) = readNameFromStrtab(Record); @@ -2830,6 +2823,11 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef Record) { } else if (hasImplicitComdat(RawLinkage)) { NewGV->setComdat(reinterpret_cast(1)); } + + if (Record.size() > 12) { + auto AS = getAttributes(Record[12]).getFnAttributes(); + NewGV->setAttributes(AS); + } return Error::success(); } @@ -5658,7 +5656,8 @@ Expected llvm::hasGlobalValueSummary(MemoryBufferRef Buffer) { } Expected> -llvm::getModuleSummaryIndexForFile(StringRef Path) { +llvm::getModuleSummaryIndexForFile(StringRef Path, + bool IgnoreEmptyThinLTOIndexFile) { ErrorOr> FileOrErr = MemoryBuffer::getFileOrSTDIN(Path); if (!FileOrErr) diff --git a/lib/Bitcode/Reader/MetadataLoader.cpp b/lib/Bitcode/Reader/MetadataLoader.cpp index 42135e5949ce..d80e1da911ca 100644 --- a/lib/Bitcode/Reader/MetadataLoader.cpp +++ b/lib/Bitcode/Reader/MetadataLoader.cpp @@ -500,7 +500,7 @@ class MetadataLoader::MetadataLoaderImpl { // Upgrade variables attached to globals. for (auto &GV : TheModule.globals()) { - SmallVector MDs, NewMDs; + SmallVector MDs; GV.getMetadata(LLVMContext::MD_dbg, MDs); GV.eraseMetadata(LLVMContext::MD_dbg); for (auto *MD : MDs) diff --git a/lib/Bitcode/Writer/BitcodeWriter.cpp b/lib/Bitcode/Writer/BitcodeWriter.cpp index 1b8d81a60201..1f8b50342c2d 100644 --- a/lib/Bitcode/Writer/BitcodeWriter.cpp +++ b/lib/Bitcode/Writer/BitcodeWriter.cpp @@ -1109,7 +1109,7 @@ void ModuleBitcodeWriter::writeModuleInfo() { // GLOBALVAR: [strtab offset, strtab size, type, isconst, initid, // linkage, alignment, section, visibility, threadlocal, // unnamed_addr, externally_initialized, dllstorageclass, - // comdat] + // comdat, attributes] Vals.push_back(StrtabBuilder.add(GV.getName())); Vals.push_back(GV.getName().size()); Vals.push_back(VE.getTypeID(GV.getValueType())); @@ -1124,13 +1124,17 @@ void ModuleBitcodeWriter::writeModuleInfo() { GV.getUnnamedAddr() != GlobalValue::UnnamedAddr::None || GV.isExternallyInitialized() || GV.getDLLStorageClass() != GlobalValue::DefaultStorageClass || - GV.hasComdat()) { + GV.hasComdat() || + GV.hasAttributes()) { Vals.push_back(getEncodedVisibility(GV)); Vals.push_back(getEncodedThreadLocalMode(GV)); Vals.push_back(getEncodedUnnamedAddr(GV)); Vals.push_back(GV.isExternallyInitialized()); Vals.push_back(getEncodedDLLStorageClass(GV)); Vals.push_back(GV.hasComdat() ? 
VE.getComdatID(GV.getComdat()) : 0); + + auto AL = GV.getAttributesAsList(AttributeList::FunctionIndex); + Vals.push_back(VE.getAttributeListID(AL)); } else { AbbrevToUse = SimpleGVarAbbrev; } diff --git a/lib/Bitcode/Writer/ValueEnumerator.cpp b/lib/Bitcode/Writer/ValueEnumerator.cpp index 861150766986..fd76400331d9 100644 --- a/lib/Bitcode/Writer/ValueEnumerator.cpp +++ b/lib/Bitcode/Writer/ValueEnumerator.cpp @@ -314,10 +314,13 @@ ValueEnumerator::ValueEnumerator(const Module &M, // Remember what is the cutoff between globalvalue's and other constants. unsigned FirstConstant = Values.size(); - // Enumerate the global variable initializers. - for (const GlobalVariable &GV : M.globals()) + // Enumerate the global variable initializers and attributes. + for (const GlobalVariable &GV : M.globals()) { if (GV.hasInitializer()) EnumerateValue(GV.getInitializer()); + if (GV.hasAttributes()) + EnumerateAttributes(GV.getAttributesAsList(AttributeList::FunctionIndex)); + } // Enumerate the aliasees. for (const GlobalAlias &GA : M.aliases()) diff --git a/lib/CMakeLists.txt b/lib/CMakeLists.txt index 76549540ce0f..73fc2b35fe4e 100644 --- a/lib/CMakeLists.txt +++ b/lib/CMakeLists.txt @@ -21,5 +21,5 @@ add_subdirectory(LineEditor) add_subdirectory(ProfileData) add_subdirectory(Fuzzer) add_subdirectory(Passes) -add_subdirectory(LibDriver) +add_subdirectory(ToolDrivers) add_subdirectory(XRay) diff --git a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp index 87b45c001de4..98163bffb60b 100644 --- a/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp +++ b/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp @@ -767,7 +767,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV, // If our DISubprogram name is empty, use the mangled name. if (FuncName.empty()) - FuncName = GlobalValue::getRealLinkageName(GV->getName()); + FuncName = GlobalValue::dropLLVMManglingEscape(GV->getName()); // Emit a symbol subsection, required by VS2012+ to find function boundaries. OS.AddComment("Symbol subsection for " + Twine(FuncName)); @@ -888,13 +888,21 @@ void CodeViewDebug::collectVariableInfoFromMFTable( if (!Scope) continue; + // If the variable has an attached offset expression, extract it. + // FIXME: Try to handle DW_OP_deref as well. + int64_t ExprOffset = 0; + if (VI.Expr) + if (!VI.Expr->extractIfOffset(ExprOffset)) + continue; + // Get the frame register used and the offset. unsigned FrameReg = 0; int FrameOffset = TFI->getFrameIndexReference(*Asm->MF, VI.Slot, FrameReg); uint16_t CVReg = TRI->getCodeViewRegNum(FrameReg); // Calculate the label ranges. - LocalVarDefRange DefRange = createDefRangeMem(CVReg, FrameOffset); + LocalVarDefRange DefRange = + createDefRangeMem(CVReg, FrameOffset + ExprOffset); for (const InsnRange &Range : Scope->getRanges()) { const MCSymbol *Begin = getLabelBeforeInsn(Range.first); const MCSymbol *End = getLabelAfterInsn(Range.second); @@ -2194,7 +2202,7 @@ void CodeViewDebug::emitDebugInfoForGlobals() { if (GV->hasComdat()) { MCSymbol *GVSym = Asm->getSymbol(GV); OS.AddComment("Symbol subsection for " + - Twine(GlobalValue::getRealLinkageName(GV->getName()))); + Twine(GlobalValue::dropLLVMManglingEscape(GV->getName()))); switchToDebugSectionForSymbol(GVSym); EndLabel = beginCVSubsection(ModuleDebugFragmentKind::Symbols); // FIXME: emitDebugInfoForGlobal() doesn't handle DIExpressions. 
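// Aside: the backward-compatibility pattern used by the GLOBALVAR record
// changes above, sketched with stand-in types (FakeGlobal is invented).
// Older bitcode emits 12 fields; newer writers append an attribute-group ID
// as field 13, so the reader detects its presence purely by record length.
#include <cstdint>
#include <vector>

struct FakeGlobal {
  uint64_t AttrID = 0;
  bool HasAttrs = false;
};

void parseGlobalVarRecord(const std::vector<uint64_t> &Record, FakeGlobal &GV) {
  // ... fields 0..11 parsed as before ...
  if (Record.size() > 12) { // attributes only present in newer bitcode
    GV.AttrID = Record[12];
    GV.HasAttrs = true;
  }
}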
diff --git a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 1d63e33a4d33..826162ad47c4 100644
--- a/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -129,10 +129,9 @@ bool hasDebugInfo(const MachineModuleInfo *MMI, const MachineFunction *MF) {
 }
 
 void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
-  assert(Asm);
   PrevInstBB = nullptr;
 
-  if (!hasDebugInfo(MMI, MF)) {
+  if (!Asm || !hasDebugInfo(MMI, MF)) {
     skippedNonDebugFunction();
     return;
   }
diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 738e062cb93f..e172712cf889 100644
--- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -440,7 +440,7 @@ DIE *DwarfCompileUnit::constructInlinedScopeDIE(LexicalScope *Scope) {
   auto *InlinedSP = getDISubprogram(DS);
   // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram
   // was inlined from another compile unit.
-  DIE *OriginDIE = DU->getAbstractSPDies()[InlinedSP];
+  DIE *OriginDIE = getAbstractSPDies()[InlinedSP];
   assert(OriginDIE && "Unable to find original DIE for an inlined subprogram.");
 
   auto ScopeDIE = DIE::get(DIEValueAllocator, dwarf::DW_TAG_inlined_subroutine);
@@ -634,7 +634,7 @@ DIE *DwarfCompileUnit::createAndAddScopeChildren(LexicalScope *Scope,
 
 void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
     LexicalScope *Scope) {
-  DIE *&AbsDef = DU->getAbstractSPDies()[Scope->getScopeNode()];
+  DIE *&AbsDef = getAbstractSPDies()[Scope->getScopeNode()];
   if (AbsDef)
     return;
@@ -696,7 +696,7 @@ DIE *DwarfCompileUnit::constructImportedEntityDIE(
 
 void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
   DIE *D = getDIE(SP);
-  if (DIE *AbsSPDIE = DU->getAbstractSPDies().lookup(SP)) {
+  if (DIE *AbsSPDIE = getAbstractSPDies().lookup(SP)) {
     if (D)
       // If this subprogram has an abstract definition, reference that
       addDIEEntry(*D, dwarf::DW_AT_abstract_origin, *AbsSPDIE);
@@ -708,6 +708,42 @@ void DwarfCompileUnit::finishSubprogramDefinition(const DISubprogram *SP) {
   }
 }
 
+void DwarfCompileUnit::finishVariableDefinition(const DbgVariable &Var) {
+  DbgVariable *AbsVar = getExistingAbstractVariable(
+      InlinedVariable(Var.getVariable(), Var.getInlinedAt()));
+  auto *VariableDie = Var.getDIE();
+  if (AbsVar && AbsVar->getDIE()) {
+    addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin,
+                *AbsVar->getDIE());
+  } else
+    applyVariableAttributes(Var, *VariableDie);
+}
+
+DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(InlinedVariable IV) {
+  const DILocalVariable *Cleansed;
+  return getExistingAbstractVariable(IV, Cleansed);
+}
+
+// Find abstract variable, if any, associated with Var.
+DbgVariable *DwarfCompileUnit::getExistingAbstractVariable(
+    InlinedVariable IV, const DILocalVariable *&Cleansed) {
+  // More than one inlined variable corresponds to one abstract variable.
+ Cleansed = IV.first; + auto &AbstractVariables = getAbstractVariables(); + auto I = AbstractVariables.find(Cleansed); + if (I != AbstractVariables.end()) + return I->second.get(); + return nullptr; +} + +void DwarfCompileUnit::createAbstractVariable(const DILocalVariable *Var, + LexicalScope *Scope) { + assert(Scope && Scope->isAbstractScope()); + auto AbsDbgVariable = make_unique(Var, /* IA */ nullptr); + DU->addScopeVariable(Scope, AbsDbgVariable.get()); + getAbstractVariables()[Var] = std::move(AbsDbgVariable); +} + void DwarfCompileUnit::emitHeader(bool UseOffsets) { // Don't bother labeling the .dwo unit, as its offset isn't used. if (!Skeleton) { diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h index 20a415150b4d..77e9e671529f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h @@ -68,6 +68,9 @@ class DwarfCompileUnit final : public DwarfUnit { // ranges/locs. const MCSymbol *BaseAddress; + DenseMap AbstractSPDies; + DenseMap> AbstractVariables; + /// \brief Construct a DIE for the given DbgVariable without initializing the /// DbgVariable's DIE reference. DIE *constructVariableDIEImpl(const DbgVariable &DV, bool Abstract); @@ -76,6 +79,18 @@ class DwarfCompileUnit final : public DwarfUnit { bool includeMinimalInlineScopes() const; + DenseMap &getAbstractSPDies() { + if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return AbstractSPDies; + return DU->getAbstractSPDies(); + } + + DenseMap> &getAbstractVariables() { + if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return AbstractVariables; + return DU->getAbstractVariables(); + } + public: DwarfCompileUnit(unsigned UID, const DICompileUnit *Node, AsmPrinter *A, DwarfDebug *DW, DwarfFile *DWU); @@ -189,6 +204,13 @@ public: DIE *constructImportedEntityDIE(const DIImportedEntity *Module); void finishSubprogramDefinition(const DISubprogram *SP); + void finishVariableDefinition(const DbgVariable &Var); + /// Find abstract variable associated with Var. + typedef DbgValueHistoryMap::InlinedVariable InlinedVariable; + DbgVariable *getExistingAbstractVariable(InlinedVariable IV, + const DILocalVariable *&Cleansed); + DbgVariable *getExistingAbstractVariable(InlinedVariable IV); + void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope); /// Set the skeleton unit associated with this unit. 
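// Aside: the routing pattern added to DwarfCompileUnit above, sketched with
// stand-in types (Cache and Unit are invented). A .dwo unit keeps a private
// per-CU map unless cross-CU sharing is explicitly enabled; otherwise the
// maps shared through DwarfFile are used, so abstract DIEs never acquire
// accidental cross-DWO-CU references.
#include <map>
#include <string>

struct Cache { std::map<std::string, int> Entries; };

struct Unit {
  bool IsDwoUnit = true;
  bool ShareAcrossDWOCUs = false; // mirrors -split-dwarf-cross-cu-references
  Cache Local;                    // per-unit storage
  Cache *Shared = nullptr;        // storage shared via the DwarfFile

  Cache &abstractDies() {
    if (IsDwoUnit && !ShareAcrossDWOCUs)
      return Local; // sharing would create cross-CU references; keep it local
    return *Shared;
  }
};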
void setSkeleton(DwarfCompileUnit &Skel) { Skeleton = &Skel; } diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp index 6f442f5c3172..3410b98d7776 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.cpp @@ -71,6 +71,10 @@ static cl::opt GenerateARangeSection("generate-arange-section", cl::desc("Generate dwarf aranges"), cl::init(false)); +static cl::opt SplitDwarfCrossCuReferences( + "split-dwarf-cross-cu-references", cl::Hidden, + cl::desc("Enable cross-cu references in DWO files"), cl::init(false)); + namespace { enum DefaultOnOff { Default, Enable, Disable }; } @@ -362,21 +366,29 @@ template static void forBothCUs(DwarfCompileUnit &CU, Func F) { F(*SkelCU); } -void DwarfDebug::constructAbstractSubprogramScopeDIE(LexicalScope *Scope) { +bool DwarfDebug::shareAcrossDWOCUs() const { + return SplitDwarfCrossCuReferences; +} + +void DwarfDebug::constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, + LexicalScope *Scope) { assert(Scope && Scope->getScopeNode()); assert(Scope->isAbstractScope()); assert(!Scope->getInlinedAt()); auto *SP = cast(Scope->getScopeNode()); - ProcessedSPNodes.insert(SP); - // Find the subprogram's DwarfCompileUnit in the SPMap in case the subprogram // was inlined from another compile unit. auto &CU = *CUMap.lookup(SP->getUnit()); - forBothCUs(CU, [&](DwarfCompileUnit &CU) { + if (auto *SkelCU = CU.getSkeleton()) { + (shareAcrossDWOCUs() ? CU : SrcCU) + .constructAbstractSubprogramScopeDIE(Scope); + if (CU.getCUNode()->getSplitDebugInlining()) + SkelCU->constructAbstractSubprogramScopeDIE(Scope); + } else { CU.constructAbstractSubprogramScopeDIE(Scope); - }); + } } void DwarfDebug::addGnuPubAttributes(DwarfUnit &U, DIE &D) const { @@ -564,13 +576,7 @@ void DwarfDebug::finishVariableDefinitions() { // DIE::getUnit isn't simple - it walks parent pointers, etc. DwarfCompileUnit *Unit = CUDieMap.lookup(VariableDie->getUnitDie()); assert(Unit); - DbgVariable *AbsVar = getExistingAbstractVariable( - InlinedVariable(Var->getVariable(), Var->getInlinedAt())); - if (AbsVar && AbsVar->getDIE()) { - Unit->addDIEEntry(*VariableDie, dwarf::DW_AT_abstract_origin, - *AbsVar->getDIE()); - } else - Unit->applyVariableAttributes(*Var, *VariableDie); + Unit->finishVariableDefinition(*Var); } } @@ -718,58 +724,32 @@ void DwarfDebug::endModule() { } // clean up. - AbstractVariables.clear(); + // FIXME: AbstractVariables.clear(); } -// Find abstract variable, if any, associated with Var. -DbgVariable * -DwarfDebug::getExistingAbstractVariable(InlinedVariable IV, - const DILocalVariable *&Cleansed) { - // More then one inlined variable corresponds to one abstract variable. 
- Cleansed = IV.first; - auto I = AbstractVariables.find(Cleansed); - if (I != AbstractVariables.end()) - return I->second.get(); - return nullptr; -} - -DbgVariable *DwarfDebug::getExistingAbstractVariable(InlinedVariable IV) { - const DILocalVariable *Cleansed; - return getExistingAbstractVariable(IV, Cleansed); -} - -void DwarfDebug::createAbstractVariable(const DILocalVariable *Var, - LexicalScope *Scope) { - assert(Scope && Scope->isAbstractScope()); - auto AbsDbgVariable = make_unique(Var, /* IA */ nullptr); - InfoHolder.addScopeVariable(Scope, AbsDbgVariable.get()); - AbstractVariables[Var] = std::move(AbsDbgVariable); -} - -void DwarfDebug::ensureAbstractVariableIsCreated(InlinedVariable IV, +void DwarfDebug::ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable IV, const MDNode *ScopeNode) { const DILocalVariable *Cleansed = nullptr; - if (getExistingAbstractVariable(IV, Cleansed)) + if (CU.getExistingAbstractVariable(IV, Cleansed)) return; - createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope( + CU.createAbstractVariable(Cleansed, LScopes.getOrCreateAbstractScope( cast(ScopeNode))); } -void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped( +void DwarfDebug::ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable IV, const MDNode *ScopeNode) { const DILocalVariable *Cleansed = nullptr; - if (getExistingAbstractVariable(IV, Cleansed)) + if (CU.getExistingAbstractVariable(IV, Cleansed)) return; if (LexicalScope *Scope = LScopes.findAbstractScope(cast_or_null(ScopeNode))) - createAbstractVariable(Cleansed, Scope); + CU.createAbstractVariable(Cleansed, Scope); } - // Collect variable information from side table maintained by MF. void DwarfDebug::collectVariableInfoFromMFTable( - DenseSet &Processed) { + DwarfCompileUnit &TheCU, DenseSet &Processed) { for (const auto &VI : Asm->MF->getVariableDbgInfo()) { if (!VI.Var) continue; @@ -784,7 +764,7 @@ void DwarfDebug::collectVariableInfoFromMFTable( if (!Scope) continue; - ensureAbstractVariableIsCreatedIfScoped(Var, Scope->getScopeNode()); + ensureAbstractVariableIsCreatedIfScoped(TheCU, Var, Scope->getScopeNode()); auto RegVar = make_unique(Var.first, Var.second); RegVar->initializeMMI(VI.Expr, VI.Slot); if (InfoHolder.addScopeVariable(Scope, RegVar.get())) @@ -955,9 +935,10 @@ DwarfDebug::buildLocationList(SmallVectorImpl &DebugLoc, } } -DbgVariable *DwarfDebug::createConcreteVariable(LexicalScope &Scope, +DbgVariable *DwarfDebug::createConcreteVariable(DwarfCompileUnit &TheCU, + LexicalScope &Scope, InlinedVariable IV) { - ensureAbstractVariableIsCreatedIfScoped(IV, Scope.getScopeNode()); + ensureAbstractVariableIsCreatedIfScoped(TheCU, IV, Scope.getScopeNode()); ConcreteVariables.push_back(make_unique(IV.first, IV.second)); InfoHolder.addScopeVariable(&Scope, ConcreteVariables.back().get()); return ConcreteVariables.back().get(); @@ -980,7 +961,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, const DISubprogram *SP, DenseSet &Processed) { // Grab the variable info that was squirreled away in the MMI side-table. 
- collectVariableInfoFromMFTable(Processed); + collectVariableInfoFromMFTable(TheCU, Processed); for (const auto &I : DbgValues) { InlinedVariable IV = I.first; @@ -1002,7 +983,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, continue; Processed.insert(IV); - DbgVariable *RegVar = createConcreteVariable(*Scope, IV); + DbgVariable *RegVar = createConcreteVariable(TheCU, *Scope, IV); const MachineInstr *MInsn = Ranges.front().first; assert(MInsn->isDebugValue() && "History must begin with debug value"); @@ -1038,7 +1019,7 @@ void DwarfDebug::collectVariableInfo(DwarfCompileUnit &TheCU, for (const DILocalVariable *DV : SP->getVariables()) { if (Processed.insert(InlinedVariable(DV, nullptr)).second) if (LexicalScope *Scope = LScopes.findLexicalScope(DV->getScope())) - createConcreteVariable(*Scope, InlinedVariable(DV, nullptr)); + createConcreteVariable(TheCU, *Scope, InlinedVariable(DV, nullptr)); } } @@ -1229,12 +1210,12 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) { for (const DILocalVariable *DV : SP->getVariables()) { if (!ProcessedVars.insert(InlinedVariable(DV, nullptr)).second) continue; - ensureAbstractVariableIsCreated(InlinedVariable(DV, nullptr), + ensureAbstractVariableIsCreated(TheCU, InlinedVariable(DV, nullptr), DV->getScope()); assert(LScopes.getAbstractScopesList().size() == NumAbstractScopes && "ensureAbstractVariableIsCreated inserted abstract scopes"); } - constructAbstractSubprogramScopeDIE(AScope); + constructAbstractSubprogramScopeDIE(TheCU, AScope); } ProcessedSPNodes.insert(SP); diff --git a/lib/CodeGen/AsmPrinter/DwarfDebug.h b/lib/CodeGen/AsmPrinter/DwarfDebug.h index 8a96e7867b6e..b9c5aa9ffb23 100644 --- a/lib/CodeGen/AsmPrinter/DwarfDebug.h +++ b/lib/CodeGen/AsmPrinter/DwarfDebug.h @@ -210,7 +210,6 @@ class DwarfDebug : public DebugHandlerBase { DenseMap SymSize; /// Collection of abstract variables. - DenseMap> AbstractVariables; SmallVector, 64> ConcreteVariables; /// Collection of DebugLocEntry. Stored in a linked list so that DIELocLists @@ -313,20 +312,16 @@ class DwarfDebug : public DebugHandlerBase { typedef DbgValueHistoryMap::InlinedVariable InlinedVariable; - /// Find abstract variable associated with Var. - DbgVariable *getExistingAbstractVariable(InlinedVariable IV, - const DILocalVariable *&Cleansed); - DbgVariable *getExistingAbstractVariable(InlinedVariable IV); - void createAbstractVariable(const DILocalVariable *DV, LexicalScope *Scope); - void ensureAbstractVariableIsCreated(InlinedVariable Var, + void ensureAbstractVariableIsCreated(DwarfCompileUnit &CU, InlinedVariable Var, const MDNode *Scope); - void ensureAbstractVariableIsCreatedIfScoped(InlinedVariable Var, + void ensureAbstractVariableIsCreatedIfScoped(DwarfCompileUnit &CU, InlinedVariable Var, const MDNode *Scope); - DbgVariable *createConcreteVariable(LexicalScope &Scope, InlinedVariable IV); + DbgVariable *createConcreteVariable(DwarfCompileUnit &TheCU, + LexicalScope &Scope, InlinedVariable IV); /// Construct a DIE for this abstract scope. - void constructAbstractSubprogramScopeDIE(LexicalScope *Scope); + void constructAbstractSubprogramScopeDIE(DwarfCompileUnit &SrcCU, LexicalScope *Scope); void finishVariableDefinitions(); @@ -446,7 +441,8 @@ class DwarfDebug : public DebugHandlerBase { const DbgValueHistoryMap::InstrRanges &Ranges); /// Collect variable information from the side table maintained by MF. 
- void collectVariableInfoFromMFTable(DenseSet &P); + void collectVariableInfoFromMFTable(DwarfCompileUnit &TheCU, + DenseSet &P); protected: /// Gather pre-function debug information. @@ -518,6 +514,8 @@ public: /// split dwarf proposal support. bool useSplitDwarf() const { return HasSplitDwarf; } + bool shareAcrossDWOCUs() const; + /// Returns the Dwarf Version. uint16_t getDwarfVersion() const; diff --git a/lib/CodeGen/AsmPrinter/DwarfFile.h b/lib/CodeGen/AsmPrinter/DwarfFile.h index d4d2ed277274..54924e9806ed 100644 --- a/lib/CodeGen/AsmPrinter/DwarfFile.h +++ b/lib/CodeGen/AsmPrinter/DwarfFile.h @@ -53,6 +53,7 @@ class DwarfFile { // Collection of abstract subprogram DIEs. DenseMap AbstractSPDies; + DenseMap> AbstractVariables; /// Maps MDNodes for type system with the corresponding DIEs. These DIEs can /// be shared across CUs, that is why we keep the map here instead @@ -105,6 +106,9 @@ public: DenseMap &getAbstractSPDies() { return AbstractSPDies; } + DenseMap> &getAbstractVariables() { + return AbstractVariables; + } void insertDIE(const MDNode *TypeMD, DIE *Die) { DITypeNodeToDieMap.insert(std::make_pair(TypeMD, Die)); diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp index 8d25def7772c..667afbb450bd 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.cpp @@ -173,7 +173,7 @@ int64_t DwarfUnit::getDefaultLowerBound() const { } /// Check whether the DIE for this MDNode can be shared across CUs. -static bool isShareableAcrossCUs(const DINode *D) { +bool DwarfUnit::isShareableAcrossCUs(const DINode *D) const { // When the MDNode can be part of the type system, the DIE can be shared // across CUs. // Combining type units and cross-CU DIE sharing is lower value (since @@ -181,6 +181,8 @@ static bool isShareableAcrossCUs(const DINode *D) { // level already) but may be implementable for some value in projects // building multiple independent libraries with LTO and then linking those // together. + if (isDwoUnit() && !DD->shareAcrossDWOCUs()) + return false; return (isa(D) || (isa(D) && !cast(D)->isDefinition())) && !GenerateDwarfTypeUnits; @@ -645,7 +647,7 @@ void DwarfUnit::addLinkageName(DIE &Die, StringRef LinkageName) { addString(Die, DD->getDwarfVersion() >= 4 ? dwarf::DW_AT_linkage_name : dwarf::DW_AT_MIPS_linkage_name, - GlobalValue::getRealLinkageName(LinkageName)); + GlobalValue::dropLLVMManglingEscape(LinkageName)); } void DwarfUnit::addTemplateParams(DIE &Buffer, DINodeArray TParams) { diff --git a/lib/CodeGen/AsmPrinter/DwarfUnit.h b/lib/CodeGen/AsmPrinter/DwarfUnit.h index 8fc841703e23..7acad2cbd89f 100644 --- a/lib/CodeGen/AsmPrinter/DwarfUnit.h +++ b/lib/CodeGen/AsmPrinter/DwarfUnit.h @@ -65,7 +65,7 @@ public: //===----------------------------------------------------------------------===// /// This dwarf writer support class manages information associated with a /// source file. - class DwarfUnit : public DIEUnit { +class DwarfUnit : public DIEUnit { protected: /// MDNode for the compile unit. const DICompileUnit *CUNode; @@ -103,6 +103,9 @@ protected: bool applySubprogramDefinitionAttributes(const DISubprogram *SP, DIE &SPDie); + bool shareAcrossDWOCUs() const; + bool isShareableAcrossCUs(const DINode *D) const; + public: // Accessors. 
AsmPrinter* getAsmPrinter() const { return Asm; } diff --git a/lib/CodeGen/AsmPrinter/WinException.cpp b/lib/CodeGen/AsmPrinter/WinException.cpp index 704f0ac2f191..815658bfb637 100644 --- a/lib/CodeGen/AsmPrinter/WinException.cpp +++ b/lib/CodeGen/AsmPrinter/WinException.cpp @@ -101,7 +101,7 @@ void WinException::beginFunction(const MachineFunction *MF) { // functions may still refer to it. const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); StringRef FLinkageName = - GlobalValue::getRealLinkageName(MF->getFunction()->getName()); + GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName()); emitEHRegistrationOffsetLabel(FuncInfo, FLinkageName); } shouldEmitLSDA = hasEHFunclets; @@ -174,7 +174,7 @@ static MCSymbol *getMCSymbolForMBB(AsmPrinter *Asm, // their funclet entry block's number. const MachineFunction *MF = MBB->getParent(); const Function *F = MF->getFunction(); - StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); MCContext &Ctx = MF->getContext(); StringRef HandlerPrefix = MBB->isCleanupFuncletEntry() ? "dtor" : "catch"; return Ctx.getOrCreateSymbol("?" + HandlerPrefix + "$" + @@ -252,7 +252,7 @@ void WinException::endFunclet() { !CurrentFuncletEntry->isCleanupFuncletEntry()) { // If this is a C++ catch funclet (or the parent function), // emit a reference to the LSDA for the parent function. - StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); MCSymbol *FuncInfoXData = Asm->OutContext.getOrCreateSymbol( Twine("$cppxdata$", FuncLinkageName)); Asm->OutStreamer->EmitValue(create32bitRef(FuncInfoXData), 4); @@ -536,7 +536,7 @@ void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) { // Emit a label assignment with the SEH frame offset so we can use it for // llvm.x86.seh.recoverfp. 
StringRef FLinkageName = - GlobalValue::getRealLinkageName(MF->getFunction()->getName()); + GlobalValue::dropLLVMManglingEscape(MF->getFunction()->getName()); MCSymbol *ParentFrameOffset = Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName); const MCExpr *MCOffset = @@ -635,7 +635,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) { auto &OS = *Asm->OutStreamer; const WinEHFuncInfo &FuncInfo = *MF->getWinEHFuncInfo(); - StringRef FuncLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); SmallVector, 4> IPToStateTable; MCSymbol *FuncInfoXData = nullptr; @@ -942,7 +942,7 @@ void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo, void WinException::emitExceptHandlerTable(const MachineFunction *MF) { MCStreamer &OS = *Asm->OutStreamer; const Function *F = MF->getFunction(); - StringRef FLinkageName = GlobalValue::getRealLinkageName(F->getName()); + StringRef FLinkageName = GlobalValue::dropLLVMManglingEscape(F->getName()); bool VerboseAsm = OS.isVerboseAsm(); auto AddComment = [&](const Twine &Comment) { diff --git a/lib/CodeGen/AtomicExpandPass.cpp b/lib/CodeGen/AtomicExpandPass.cpp index 9c19a4fd3c3e..17e6be05eb42 100644 --- a/lib/CodeGen/AtomicExpandPass.cpp +++ b/lib/CodeGen/AtomicExpandPass.cpp @@ -47,8 +47,7 @@ namespace { bool runOnFunction(Function &F) override; private: - bool bracketInstWithFences(Instruction *I, AtomicOrdering Order, - bool IsStore, bool IsLoad); + bool bracketInstWithFences(Instruction *I, AtomicOrdering Order); IntegerType *getCorrespondingIntegerType(Type *T, const DataLayout &DL); LoadInst *convertAtomicLoadToIntegerType(LoadInst *LI); bool tryExpandAtomicLoad(LoadInst *LI); @@ -224,22 +223,16 @@ bool AtomicExpand::runOnFunction(Function &F) { if (TLI->shouldInsertFencesForAtomic(I)) { auto FenceOrdering = AtomicOrdering::Monotonic; - bool IsStore, IsLoad; if (LI && isAcquireOrStronger(LI->getOrdering())) { FenceOrdering = LI->getOrdering(); LI->setOrdering(AtomicOrdering::Monotonic); - IsStore = false; - IsLoad = true; } else if (SI && isReleaseOrStronger(SI->getOrdering())) { FenceOrdering = SI->getOrdering(); SI->setOrdering(AtomicOrdering::Monotonic); - IsStore = true; - IsLoad = false; } else if (RMWI && (isReleaseOrStronger(RMWI->getOrdering()) || isAcquireOrStronger(RMWI->getOrdering()))) { FenceOrdering = RMWI->getOrdering(); RMWI->setOrdering(AtomicOrdering::Monotonic); - IsStore = IsLoad = true; } else if (CASI && !TLI->shouldExpandAtomicCmpXchgInIR(CASI) && (isReleaseOrStronger(CASI->getSuccessOrdering()) || isAcquireOrStronger(CASI->getSuccessOrdering()))) { @@ -250,11 +243,10 @@ bool AtomicExpand::runOnFunction(Function &F) { FenceOrdering = CASI->getSuccessOrdering(); CASI->setSuccessOrdering(AtomicOrdering::Monotonic); CASI->setFailureOrdering(AtomicOrdering::Monotonic); - IsStore = IsLoad = true; } if (FenceOrdering != AtomicOrdering::Monotonic) { - MadeChange |= bracketInstWithFences(I, FenceOrdering, IsStore, IsLoad); + MadeChange |= bracketInstWithFences(I, FenceOrdering); } } @@ -320,13 +312,12 @@ bool AtomicExpand::runOnFunction(Function &F) { return MadeChange; } -bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order, - bool IsStore, bool IsLoad) { +bool AtomicExpand::bracketInstWithFences(Instruction *I, AtomicOrdering Order) { IRBuilder<> Builder(I); - auto LeadingFence = TLI->emitLeadingFence(Builder, Order, IsStore, IsLoad); + auto LeadingFence = TLI->emitLeadingFence(Builder, 
I, Order); - auto TrailingFence = TLI->emitTrailingFence(Builder, Order, IsStore, IsLoad); + auto TrailingFence = TLI->emitTrailingFence(Builder, I, Order); // The trailing fence is emitted before the instruction instead of after // because there is no easy way of setting Builder insertion point after // an instruction. So we must erase it from the BB, and insert it back @@ -1048,8 +1039,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { std::prev(BB->end())->eraseFromParent(); Builder.SetInsertPoint(BB); if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier) - TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitLeadingFence(Builder, CI, SuccessOrder); Builder.CreateBr(StartBB); // Start the main loop block now that we've taken care of the preliminaries. @@ -1064,8 +1054,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { Builder.SetInsertPoint(ReleasingStoreBB); if (ShouldInsertFencesForAtomic && !UseUnconditionalReleaseBarrier) - TLI->emitLeadingFence(Builder, SuccessOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitLeadingFence(Builder, CI, SuccessOrder); Builder.CreateBr(TryStoreBB); Builder.SetInsertPoint(TryStoreBB); @@ -1094,8 +1083,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { // necessary. Builder.SetInsertPoint(SuccessBB); if (ShouldInsertFencesForAtomic) - TLI->emitTrailingFence(Builder, SuccessOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitTrailingFence(Builder, CI, SuccessOrder); Builder.CreateBr(ExitBB); Builder.SetInsertPoint(NoStoreBB); @@ -1107,8 +1095,7 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) { Builder.SetInsertPoint(FailureBB); if (ShouldInsertFencesForAtomic) - TLI->emitTrailingFence(Builder, FailureOrder, /*IsStore=*/true, - /*IsLoad=*/true); + TLI->emitTrailingFence(Builder, CI, FailureOrder); Builder.CreateBr(ExitBB); // Finally, we have control-flow based knowledge of whether the cmpxchg diff --git a/lib/CodeGen/CMakeLists.txt b/lib/CodeGen/CMakeLists.txt index 26da748fa244..55a27e2fb79e 100644 --- a/lib/CodeGen/CMakeLists.txt +++ b/lib/CodeGen/CMakeLists.txt @@ -23,6 +23,7 @@ add_llvm_library(LLVMCodeGen ExecutionDepsFix.cpp ExpandISelPseudos.cpp ExpandPostRAPseudos.cpp + ExpandReductions.cpp FaultMaps.cpp FEntryInserter.cpp FuncletLayout.cpp @@ -48,6 +49,7 @@ add_llvm_library(LLVMCodeGen LivePhysRegs.cpp LiveRangeCalc.cpp LiveRangeEdit.cpp + LiveRangeShrink.cpp LiveRegMatrix.cpp LiveRegUnits.cpp LiveStackAnalysis.cpp @@ -118,6 +120,7 @@ add_llvm_library(LLVMCodeGen SafeStack.cpp SafeStackColoring.cpp SafeStackLayout.cpp + ScalarizeMaskedMemIntrin.cpp ScheduleDAG.cpp ScheduleDAGInstrs.cpp ScheduleDAGPrinter.cpp diff --git a/lib/CodeGen/CodeGen.cpp b/lib/CodeGen/CodeGen.cpp index 3fc12ccc3b60..4d30c6574b12 100644 --- a/lib/CodeGen/CodeGen.cpp +++ b/lib/CodeGen/CodeGen.cpp @@ -43,6 +43,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeLiveDebugValuesPass(Registry); initializeLiveDebugVariablesPass(Registry); initializeLiveIntervalsPass(Registry); + initializeLiveRangeShrinkPass(Registry); initializeLiveStacksPass(Registry); initializeLiveVariablesPass(Registry); initializeLocalStackSlotPassPass(Registry); @@ -79,7 +80,8 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeRAGreedyPass(Registry); initializeRegisterCoalescerPass(Registry); initializeRenameIndependentSubregsPass(Registry); - initializeSafeStackPass(Registry); + initializeSafeStackLegacyPassPass(Registry); + 
initializeScalarizeMaskedMemIntrinPass(Registry); initializeShrinkWrapPass(Registry); initializeSlotIndexesPass(Registry); initializeStackColoringPass(Registry); diff --git a/lib/CodeGen/CodeGenPrepare.cpp b/lib/CodeGen/CodeGenPrepare.cpp index c6c93811a0f9..f2e024c5e3bd 100644 --- a/lib/CodeGen/CodeGenPrepare.cpp +++ b/lib/CodeGen/CodeGenPrepare.cpp @@ -295,7 +295,7 @@ bool CodeGenPrepare::runOnFunction(Function &F) { if (PSI->isFunctionHotInCallGraph(&F)) F.setSectionPrefix(".hot"); else if (PSI->isFunctionColdInCallGraph(&F)) - F.setSectionPrefix(".cold"); + F.setSectionPrefix(".unlikely"); } /// This optimization identifies DIV instructions that can be @@ -1549,519 +1549,6 @@ static bool OptimizeExtractBits(BinaryOperator *ShiftI, ConstantInt *CI, return MadeChange; } -// Translate a masked load intrinsic like -// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align, -// <16 x i1> %mask, <16 x i32> %passthru) -// to a chain of basic blocks, with loading element one-by-one if -// the appropriate mask bit is set -// -// %1 = bitcast i8* %addr to i32* -// %2 = extractelement <16 x i1> %mask, i32 0 -// %3 = icmp eq i1 %2, true -// br i1 %3, label %cond.load, label %else -// -//cond.load: ; preds = %0 -// %4 = getelementptr i32* %1, i32 0 -// %5 = load i32* %4 -// %6 = insertelement <16 x i32> undef, i32 %5, i32 0 -// br label %else -// -//else: ; preds = %0, %cond.load -// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ] -// %7 = extractelement <16 x i1> %mask, i32 1 -// %8 = icmp eq i1 %7, true -// br i1 %8, label %cond.load1, label %else2 -// -//cond.load1: ; preds = %else -// %9 = getelementptr i32* %1, i32 1 -// %10 = load i32* %9 -// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1 -// br label %else2 -// -//else2: ; preds = %else, %cond.load1 -// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] -// %12 = extractelement <16 x i1> %mask, i32 2 -// %13 = icmp eq i1 %12, true -// br i1 %13, label %cond.load4, label %else5 -// -static void scalarizeMaskedLoad(CallInst *CI) { - Value *Ptr = CI->getArgOperand(0); - Value *Alignment = CI->getArgOperand(1); - Value *Mask = CI->getArgOperand(2); - Value *Src0 = CI->getArgOperand(3); - - unsigned AlignVal = cast(Alignment)->getZExtValue(); - VectorType *VecType = dyn_cast(CI->getType()); - assert(VecType && "Unexpected return type of masked load intrinsic"); - - Type *EltTy = CI->getType()->getVectorElementType(); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - BasicBlock *CondBlock = nullptr; - BasicBlock *PrevIfBlock = CI->getParent(); - - Builder.SetInsertPoint(InsertPt); - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - // Short-cut if the mask is all-true. - bool IsAllOnesMask = isa(Mask) && - cast(Mask)->isAllOnesValue(); - - if (IsAllOnesMask) { - Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); - return; - } - - // Adjust alignment for the scalar instruction. 
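One detail worth spelling out in the expansion being moved out of this file: a vector load's alignment only constrains the address of lane 0, so each scalarized element load may assume at most the element size. Illustrative arithmetic only, not LLVM code:

#include <algorithm>

// For a 32-byte-aligned <8 x i32> masked load, each scalarized i32 load may
// claim at most min(32, 32 / 8) = 4 bytes of alignment.
inline unsigned scalarizedAlign(unsigned VecAlign, unsigned ScalarSizeInBits) {
  return std::min(VecAlign, ScalarSizeInBits / 8);
}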
- AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits()/8); - // Bitcast %addr fron i8* to EltTy* - Type *NewPtrType = - EltTy->getPointerTo(cast(Ptr->getType())->getAddressSpace()); - Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); - unsigned VectorWidth = VecType->getNumElements(); - - Value *UndefVal = UndefValue::get(VecType); - - // The result vector - Value *VResult = UndefVal; - - if (isa(Mask)) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - LoadInst* Load = Builder.CreateAlignedLoad(Gep, AlignVal); - VResult = Builder.CreateInsertElement(VResult, Load, - Builder.getInt32(Idx)); - } - Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); - return; - } - - PHINode *Phi = nullptr; - Value *PrevPhi = UndefVal; - - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - - // Fill the "else" block, created in the previous iteration - // - // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ] - // %mask_1 = extractelement <16 x i1> %mask, i32 Idx - // %to_load = icmp eq i1 %mask_1, true - // br i1 %to_load, label %cond.load, label %else - // - if (Idx > 0) { - Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - PrevPhi = Phi; - VResult = Phi; - } - - Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1)); - - // Create "cond" block - // - // %EltAddr = getelementptr i32* %1, i32 0 - // %Elt = load i32* %EltAddr - // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx - // - CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load"); - Builder.SetInsertPoint(InsertPt); - - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal); - VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx)); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = - CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - PrevIfBlock = IfBlock; - IfBlock = NewIfBlock; - } - - Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); -} - -// Translate a masked store intrinsic, like -// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align, -// <16 x i1> %mask) -// to a chain of basic blocks, that stores element one-by-one if -// the appropriate mask bit is set -// -// %1 = bitcast i8* %addr to i32* -// %2 = extractelement <16 x i1> %mask, i32 0 -// %3 = icmp eq i1 %2, true -// br i1 %3, label %cond.store, label %else -// -// cond.store: ; preds = %0 -// %4 = extractelement <16 x i32> %val, i32 0 -// %5 = getelementptr i32* %1, i32 0 -// store i32 %4, i32* %5 -// br label %else -// -// else: ; preds = %0, %cond.store -// %6 = extractelement <16 x i1> %mask, i32 1 -// %7 = icmp eq i1 %6, true -// br i1 %7, 
label %cond.store1, label %else2 -// -// cond.store1: ; preds = %else -// %8 = extractelement <16 x i32> %val, i32 1 -// %9 = getelementptr i32* %1, i32 1 -// store i32 %8, i32* %9 -// br label %else2 -// . . . -static void scalarizeMaskedStore(CallInst *CI) { - Value *Src = CI->getArgOperand(0); - Value *Ptr = CI->getArgOperand(1); - Value *Alignment = CI->getArgOperand(2); - Value *Mask = CI->getArgOperand(3); - - unsigned AlignVal = cast(Alignment)->getZExtValue(); - VectorType *VecType = dyn_cast(Src->getType()); - assert(VecType && "Unexpected data type in masked store intrinsic"); - - Type *EltTy = VecType->getElementType(); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - // Short-cut if the mask is all-true. - bool IsAllOnesMask = isa(Mask) && - cast(Mask)->isAllOnesValue(); - - if (IsAllOnesMask) { - Builder.CreateAlignedStore(Src, Ptr, AlignVal); - CI->eraseFromParent(); - return; - } - - // Adjust alignment for the scalar instruction. - AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits()/8); - // Bitcast %addr fron i8* to EltTy* - Type *NewPtrType = - EltTy->getPointerTo(cast(Ptr->getType())->getAddressSpace()); - Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType); - unsigned VectorWidth = VecType->getNumElements(); - - if (isa(Mask)) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - Builder.CreateAlignedStore(OneElt, Gep, AlignVal); - } - CI->eraseFromParent(); - return; - } - - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - - // Fill the "else" block, created in the previous iteration - // - // %mask_1 = extractelement <16 x i1> %mask, i32 Idx - // %to_store = icmp eq i1 %mask_1, true - // br i1 %to_store, label %cond.store, label %else - // - Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx)); - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1)); - - // Create "cond" block - // - // %OneElt = extractelement <16 x i32> %Src, i32 Idx - // %EltAddr = getelementptr i32* %1, i32 0 - // %store i32 %OneElt, i32* %EltAddr - // - BasicBlock *CondBlock = - IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store"); - Builder.SetInsertPoint(InsertPt); - - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx)); - Value *Gep = - Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx)); - Builder.CreateAlignedStore(OneElt, Gep, AlignVal); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = - CondBlock->splitBasicBlock(InsertPt->getIterator(), "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - IfBlock = NewIfBlock; - } - CI->eraseFromParent(); -} - -// Translate a masked gather intrinsic like -// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4, -// <16 x i1> %Mask, <16 x i32> %Src) -// to a chain of basic blocks, with loading element one-by-one if -// the appropriate mask bit is set -// -// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind -// % Mask0 = extractelement <16 x i1> %Mask, 
i32 0 -// % ToLoad0 = icmp eq i1 % Mask0, true -// br i1 % ToLoad0, label %cond.load, label %else -// -// cond.load: -// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 -// % Load0 = load i32, i32* % Ptr0, align 4 -// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0 -// br label %else -// -// else: -// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0] -// % Mask1 = extractelement <16 x i1> %Mask, i32 1 -// % ToLoad1 = icmp eq i1 % Mask1, true -// br i1 % ToLoad1, label %cond.load1, label %else2 -// -// cond.load1: -// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 -// % Load1 = load i32, i32* % Ptr1, align 4 -// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1 -// br label %else2 -// . . . -// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src -// ret <16 x i32> %Result -static void scalarizeMaskedGather(CallInst *CI) { - Value *Ptrs = CI->getArgOperand(0); - Value *Alignment = CI->getArgOperand(1); - Value *Mask = CI->getArgOperand(2); - Value *Src0 = CI->getArgOperand(3); - - VectorType *VecType = dyn_cast(CI->getType()); - - assert(VecType && "Unexpected return type of masked load intrinsic"); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - BasicBlock *CondBlock = nullptr; - BasicBlock *PrevIfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); - unsigned AlignVal = cast(Alignment)->getZExtValue(); - - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - Value *UndefVal = UndefValue::get(VecType); - - // The result vector - Value *VResult = UndefVal; - unsigned VectorWidth = VecType->getNumElements(); - - // Shorten the way if the mask is a vector of constants. - bool IsConstMask = isa(Mask); - - if (IsConstMask) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, - "Load" + Twine(Idx)); - VResult = Builder.CreateInsertElement(VResult, Load, - Builder.getInt32(Idx), - "Res" + Twine(Idx)); - } - Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); - return; - } - - PHINode *Phi = nullptr; - Value *PrevPhi = UndefVal; - - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - - // Fill the "else" block, created in the previous iteration - // - // %Mask1 = extractelement <16 x i1> %Mask, i32 1 - // %ToLoad1 = icmp eq i1 %Mask1, true - // br i1 %ToLoad1, label %cond.load, label %else - // - if (Idx > 0) { - Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - PrevPhi = Phi; - VResult = Phi; - } - - Value *Predicate = Builder.CreateExtractElement(Mask, - Builder.getInt32(Idx), - "Mask" + Twine(Idx)); - Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1), - "ToLoad" + Twine(Idx)); - - // Create "cond" block - // - // %EltAddr = getelementptr i32* %1, i32 0 - // %Elt = load i32* %EltAddr - // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx - // - CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); - Builder.SetInsertPoint(InsertPt); - - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - LoadInst *Load = Builder.CreateAlignedLoad(Ptr, AlignVal, - "Load" + 
Twine(Idx)); - VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), - "Res" + Twine(Idx)); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - PrevIfBlock = IfBlock; - IfBlock = NewIfBlock; - } - - Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); - Phi->addIncoming(VResult, CondBlock); - Phi->addIncoming(PrevPhi, PrevIfBlock); - Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); - CI->replaceAllUsesWith(NewI); - CI->eraseFromParent(); -} - -// Translate a masked scatter intrinsic, like -// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4, -// <16 x i1> %Mask) -// to a chain of basic blocks, that stores element one-by-one if -// the appropriate mask bit is set. -// -// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind -// % Mask0 = extractelement <16 x i1> % Mask, i32 0 -// % ToStore0 = icmp eq i1 % Mask0, true -// br i1 %ToStore0, label %cond.store, label %else -// -// cond.store: -// % Elt0 = extractelement <16 x i32> %Src, i32 0 -// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 -// store i32 %Elt0, i32* % Ptr0, align 4 -// br label %else -// -// else: -// % Mask1 = extractelement <16 x i1> % Mask, i32 1 -// % ToStore1 = icmp eq i1 % Mask1, true -// br i1 % ToStore1, label %cond.store1, label %else2 -// -// cond.store1: -// % Elt1 = extractelement <16 x i32> %Src, i32 1 -// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 -// store i32 % Elt1, i32* % Ptr1, align 4 -// br label %else2 -// . . . -static void scalarizeMaskedScatter(CallInst *CI) { - Value *Src = CI->getArgOperand(0); - Value *Ptrs = CI->getArgOperand(1); - Value *Alignment = CI->getArgOperand(2); - Value *Mask = CI->getArgOperand(3); - - assert(isa(Src->getType()) && - "Unexpected data type in masked scatter intrinsic"); - assert(isa(Ptrs->getType()) && - isa(Ptrs->getType()->getVectorElementType()) && - "Vector of pointers is expected in masked scatter intrinsic"); - - IRBuilder<> Builder(CI->getContext()); - Instruction *InsertPt = CI; - BasicBlock *IfBlock = CI->getParent(); - Builder.SetInsertPoint(InsertPt); - Builder.SetCurrentDebugLocation(CI->getDebugLoc()); - - unsigned AlignVal = cast(Alignment)->getZExtValue(); - unsigned VectorWidth = Src->getType()->getVectorNumElements(); - - // Shorten the way if the mask is a vector of constants. 
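The constant-mask fast path that follows needs no control flow at all. A standalone analog of what it emits, assuming the mask is known at compile time (illustrative C++, not LLVM code):

#include <cstddef>

// One unconditional store per enabled lane; disabled lanes are skipped
// entirely rather than branched around.
template <std::size_t N>
void scatterConstMask(const int (&Src)[N], int *const (&Ptrs)[N],
                      const bool (&Mask)[N]) {
  for (std::size_t Idx = 0; Idx != N; ++Idx)
    if (Mask[Idx]) // resolved at compile time in the real expansion
      *Ptrs[Idx] = Src[Idx];
}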
- bool IsConstMask = isa(Mask); - - if (IsConstMask) { - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - if (cast(Mask)->getOperand(Idx)->isNullValue()) - continue; - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), - "Elt" + Twine(Idx)); - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); - } - CI->eraseFromParent(); - return; - } - for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { - // Fill the "else" block, created in the previous iteration - // - // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx - // % ToStore = icmp eq i1 % Mask1, true - // br i1 % ToStore, label %cond.store, label %else - // - Value *Predicate = Builder.CreateExtractElement(Mask, - Builder.getInt32(Idx), - "Mask" + Twine(Idx)); - Value *Cmp = - Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, - ConstantInt::get(Predicate->getType(), 1), - "ToStore" + Twine(Idx)); - - // Create "cond" block - // - // % Elt1 = extractelement <16 x i32> %Src, i32 1 - // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 - // %store i32 % Elt1, i32* % Ptr1 - // - BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); - Builder.SetInsertPoint(InsertPt); - - Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), - "Elt" + Twine(Idx)); - Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), - "Ptr" + Twine(Idx)); - Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); - - // Create "else" block, fill it in the next iteration - BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); - Builder.SetInsertPoint(InsertPt); - Instruction *OldBr = IfBlock->getTerminator(); - BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); - OldBr->eraseFromParent(); - IfBlock = NewIfBlock; - } - CI->eraseFromParent(); -} - /// If counting leading or trailing zeros is an expensive operation and a zero /// input is defined, add a check for zero to avoid calling the intrinsic. 
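The despeculation described here wraps the intrinsic in an explicit zero test so the expensive count only runs on non-zero inputs. A scalar sketch, using a compiler builtin purely for illustration; the pass itself rewrites IR:

#include <cstdint>

// Guarded count-trailing-zeros: when a zero input is defined, branch around
// the costly count and return the bit width instead.
inline unsigned cttzGuarded(uint32_t X) {
  return X == 0 ? 32u : static_cast<unsigned>(__builtin_ctz(X));
}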
/// @@ -2242,39 +1729,6 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool& ModifiedDT) { } return true; } - case Intrinsic::masked_load: { - // Scalarize unsupported vector masked load - if (!TTI->isLegalMaskedLoad(CI->getType())) { - scalarizeMaskedLoad(CI); - ModifiedDT = true; - return true; - } - return false; - } - case Intrinsic::masked_store: { - if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) { - scalarizeMaskedStore(CI); - ModifiedDT = true; - return true; - } - return false; - } - case Intrinsic::masked_gather: { - if (!TTI->isLegalMaskedGather(CI->getType())) { - scalarizeMaskedGather(CI); - ModifiedDT = true; - return true; - } - return false; - } - case Intrinsic::masked_scatter: { - if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) { - scalarizeMaskedScatter(CI); - ModifiedDT = true; - return true; - } - return false; - } case Intrinsic::aarch64_stlxr: case Intrinsic::aarch64_stxr: { ZExtInst *ExtVal = dyn_cast(CI->getArgOperand(0)); diff --git a/lib/CodeGen/ExpandPostRAPseudos.cpp b/lib/CodeGen/ExpandPostRAPseudos.cpp index ab2382e2db6d..e860906043dd 100644 --- a/lib/CodeGen/ExpandPostRAPseudos.cpp +++ b/lib/CodeGen/ExpandPostRAPseudos.cpp @@ -142,8 +142,9 @@ bool ExpandPostRA::LowerCopy(MachineInstr *MI) { MachineOperand &DstMO = MI->getOperand(0); MachineOperand &SrcMO = MI->getOperand(1); - if (SrcMO.getReg() == DstMO.getReg()) { - DEBUG(dbgs() << "identity copy: " << *MI); + bool IdentityCopy = (SrcMO.getReg() == DstMO.getReg()); + if (IdentityCopy || SrcMO.isUndef()) { + DEBUG(dbgs() << (IdentityCopy ? "identity copy: " : "undef copy: ") << *MI); // No need to insert an identity copy instruction, but replace with a KILL // if liveness is changed. if (SrcMO.isUndef() || MI->getNumOperands() > 2) { diff --git a/lib/CodeGen/ExpandReductions.cpp b/lib/CodeGen/ExpandReductions.cpp new file mode 100644 index 000000000000..a40ea28056dd --- /dev/null +++ b/lib/CodeGen/ExpandReductions.cpp @@ -0,0 +1,167 @@ +//===--- ExpandReductions.cpp - Expand experimental reduction intrinsics --===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This pass implements IR expansion for reduction intrinsics, allowing targets +// to enable the experimental intrinsics until just before codegen. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/Analysis/TargetTransformInfo.h" +#include "llvm/CodeGen/ExpandReductions.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/IRBuilder.h" +#include "llvm/IR/InstIterator.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Module.h" +#include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Pass.h" + +using namespace llvm; + +namespace { + +unsigned getOpcode(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::experimental_vector_reduce_fadd: + return Instruction::FAdd; + case Intrinsic::experimental_vector_reduce_fmul: + return Instruction::FMul; + case Intrinsic::experimental_vector_reduce_add: + return Instruction::Add; + case Intrinsic::experimental_vector_reduce_mul: + return Instruction::Mul; + case Intrinsic::experimental_vector_reduce_and: + return Instruction::And; + case Intrinsic::experimental_vector_reduce_or: + return Instruction::Or; + case Intrinsic::experimental_vector_reduce_xor: + return Instruction::Xor; + case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::experimental_vector_reduce_umin: + return Instruction::ICmp; + case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::experimental_vector_reduce_fmin: + return Instruction::FCmp; + default: + llvm_unreachable("Unexpected ID"); + } +} + +RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) { + switch (ID) { + case Intrinsic::experimental_vector_reduce_smax: + return RecurrenceDescriptor::MRK_SIntMax; + case Intrinsic::experimental_vector_reduce_smin: + return RecurrenceDescriptor::MRK_SIntMin; + case Intrinsic::experimental_vector_reduce_umax: + return RecurrenceDescriptor::MRK_UIntMax; + case Intrinsic::experimental_vector_reduce_umin: + return RecurrenceDescriptor::MRK_UIntMin; + case Intrinsic::experimental_vector_reduce_fmax: + return RecurrenceDescriptor::MRK_FloatMax; + case Intrinsic::experimental_vector_reduce_fmin: + return RecurrenceDescriptor::MRK_FloatMin; + default: + return RecurrenceDescriptor::MRK_Invalid; + } +} + +bool expandReductions(Function &F, const TargetTransformInfo *TTI) { + bool Changed = false; + SmallVector Worklist; + for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; ++I) + if (auto II = dyn_cast(&*I)) + Worklist.push_back(II); + + for (auto *II : Worklist) { + IRBuilder<> Builder(II); + Value *Vec = nullptr; + auto ID = II->getIntrinsicID(); + auto MRK = RecurrenceDescriptor::MRK_Invalid; + switch (ID) { + case Intrinsic::experimental_vector_reduce_fadd: + case Intrinsic::experimental_vector_reduce_fmul: + // FMFs must be attached to the call, otherwise it's an ordered reduction + // and it can't be handled by generating this shuffle sequence. + // TODO: Implement scalarization of ordered reductions here for targets + // without native support. 
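For the intrinsics that do qualify, getShuffleReduction lowers the reduction to roughly log2(N) rounds of shuffle-and-combine. A standalone sketch of the resulting dataflow (plain C++ over arrays, not LLVM code; N is assumed to be a power of two):

#include <array>
#include <cstddef>

template <std::size_t N>
int reduceAdd(std::array<int, N> V) {
  // Each round folds the upper half of the active lanes into the lower half,
  // mirroring the vector shuffles the pass emits; lane 0 ends up with the sum.
  for (std::size_t Width = N / 2; Width >= 1; Width /= 2)
    for (std::size_t I = 0; I < Width; ++I)
      V[I] += V[I + Width];
  return V[0];
}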
+      if (!II->getFastMathFlags().unsafeAlgebra())
+        continue;
+      Vec = II->getArgOperand(1);
+      break;
+    case Intrinsic::experimental_vector_reduce_add:
+    case Intrinsic::experimental_vector_reduce_mul:
+    case Intrinsic::experimental_vector_reduce_and:
+    case Intrinsic::experimental_vector_reduce_or:
+    case Intrinsic::experimental_vector_reduce_xor:
+    case Intrinsic::experimental_vector_reduce_smax:
+    case Intrinsic::experimental_vector_reduce_smin:
+    case Intrinsic::experimental_vector_reduce_umax:
+    case Intrinsic::experimental_vector_reduce_umin:
+    case Intrinsic::experimental_vector_reduce_fmax:
+    case Intrinsic::experimental_vector_reduce_fmin:
+      Vec = II->getArgOperand(0);
+      MRK = getMRK(ID);
+      break;
+    default:
+      continue;
+    }
+    if (!TTI->shouldExpandReduction(II))
+      continue;
+    auto Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+    II->replaceAllUsesWith(Rdx);
+    II->eraseFromParent();
+    Changed = true;
+  }
+  return Changed;
+}
+
+class ExpandReductions : public FunctionPass {
+public:
+  static char ID;
+  ExpandReductions() : FunctionPass(ID) {
+    initializeExpandReductionsPass(*PassRegistry::getPassRegistry());
+  }
+
+  bool runOnFunction(Function &F) override {
+    const auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+    return expandReductions(F, TTI);
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+    AU.setPreservesCFG();
+  }
+};
+}
+
+char ExpandReductions::ID;
+INITIALIZE_PASS_BEGIN(ExpandReductions, "expand-reductions",
+                      "Expand reduction intrinsics", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_END(ExpandReductions, "expand-reductions",
+                    "Expand reduction intrinsics", false, false)
+
+FunctionPass *llvm::createExpandReductionsPass() {
+  return new ExpandReductions();
+}
+
+PreservedAnalyses ExpandReductionsPass::run(Function &F,
+                                            FunctionAnalysisManager &AM) {
+  const auto &TTI = AM.getResult<TargetIRAnalysis>(F);
+  if (!expandReductions(F, &TTI))
+    return PreservedAnalyses::all();
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
diff --git a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index eaf4056e47ea..4d4591042296 100644
--- a/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -162,7 +162,7 @@ bool LegalizerInfo::isLegal(const MachineInstr &MI,
   return std::get<0>(getAction(MI, MRI)) == Legal;
 }
 
-LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
+Optional<LLT> LegalizerInfo::findLegalType(const InstrAspect &Aspect,
                                  LegalizeAction Action) const {
   switch(Action) {
   default:
@@ -174,20 +174,20 @@ LLT LegalizerInfo::findLegalType(const InstrAspect &Aspect,
     return Aspect.Type;
   case NarrowScalar: {
     return findLegalType(Aspect,
-                         [&](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
+                         [](LLT Ty) -> LLT { return Ty.halfScalarSize(); });
   }
   case WidenScalar: {
-    return findLegalType(Aspect, [&](LLT Ty) -> LLT {
+    return findLegalType(Aspect, [](LLT Ty) -> LLT {
      return Ty.getSizeInBits() < 8 ?
LLT::scalar(8) : Ty.doubleScalarSize(); }); } case FewerElements: { return findLegalType(Aspect, - [&](LLT Ty) -> LLT { return Ty.halfElements(); }); + [](LLT Ty) -> LLT { return Ty.halfElements(); }); } case MoreElements: { return findLegalType(Aspect, - [&](LLT Ty) -> LLT { return Ty.doubleElements(); }); + [](LLT Ty) -> LLT { return Ty.doubleElements(); }); } } } diff --git a/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/lib/CodeGen/GlobalISel/RegBankSelect.cpp index 7248f50945d0..2eb3cdee694d 100644 --- a/lib/CodeGen/GlobalISel/RegBankSelect.cpp +++ b/lib/CodeGen/GlobalISel/RegBankSelect.cpp @@ -204,12 +204,8 @@ uint64_t RegBankSelect::getRepairCost( // TODO: use a dedicated constant for ImpossibleCost. if (Cost != UINT_MAX) return Cost; - assert(!TPC->isGlobalISelAbortEnabled() && - "Legalization not available yet"); // Return the legalization cost of that repairing. } - assert(!TPC->isGlobalISelAbortEnabled() && - "Complex repairing not implemented yet"); return UINT_MAX; } @@ -452,6 +448,11 @@ RegBankSelect::MappingCost RegBankSelect::computeMapping( // Sums up the repairing cost of MO at each insertion point. uint64_t RepairCost = getRepairCost(MO, ValMapping); + + // This is an impossible to repair cost. + if (RepairCost == UINT_MAX) + continue; + // Bias used for splitting: 5%. const uint64_t PercentageForBias = 5; uint64_t Bias = (RepairCost * PercentageForBias + 99) / 100; diff --git a/lib/CodeGen/GlobalISel/Utils.cpp b/lib/CodeGen/GlobalISel/Utils.cpp index 3c93f8123b0d..254bdf10d804 100644 --- a/lib/CodeGen/GlobalISel/Utils.cpp +++ b/lib/CodeGen/GlobalISel/Utils.cpp @@ -110,3 +110,11 @@ Optional llvm::getConstantVRegVal(unsigned VReg, return None; } + +const llvm::ConstantFP* llvm::getConstantFPVRegVal(unsigned VReg, + const MachineRegisterInfo &MRI) { + MachineInstr *MI = MRI.getVRegDef(VReg); + if (TargetOpcode::G_FCONSTANT != MI->getOpcode()) + return nullptr; + return MI->getOperand(1).getFPImm(); +} diff --git a/lib/CodeGen/IfConversion.cpp b/lib/CodeGen/IfConversion.cpp index 37fe41582333..628d599a3cc7 100644 --- a/lib/CodeGen/IfConversion.cpp +++ b/lib/CodeGen/IfConversion.cpp @@ -1318,7 +1318,8 @@ static bool canFallThroughTo(MachineBasicBlock &MBB, MachineBasicBlock &ToMBB) { return false; PI = I++; } - return true; + // Finally see if the last I is indeed a successor to PI. + return PI->isSuccessor(&*I); } /// Invalidate predecessor BB info so it would be re-analyzed to determine if it @@ -1587,22 +1588,32 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) { BBCvt = MBPI->getEdgeProbability(BBI.BB, &CvtMBB); } + // To be able to insert code freely at the end of BBI we sometimes remove + // the branch from BBI to NextMBB temporarily. Remember if this happened. + bool RemovedBranchToNextMBB = false; if (CvtMBB.pred_size() > 1) { BBI.NonPredSize -= TII->removeBranch(*BBI.BB); // Copy instructions in the true block, predicate them, and add them to // the entry block. CopyAndPredicateBlock(BBI, *CvtBBI, Cond, true); - // RemoveExtraEdges won't work if the block has an unanalyzable branch, so - // explicitly remove CvtBBI as a successor. + // Keep the CFG updated. BBI.BB->removeSuccessor(&CvtMBB, true); } else { // Predicate the 'true' block after removing its branch. CvtBBI->NonPredSize -= TII->removeBranch(CvtMBB); PredicateBlock(*CvtBBI, CvtMBB.end(), Cond); - // Now merge the entry of the triangle with the true block. + // Remove the branch from the entry of the triangle to NextBB to be able to + // do the merge below. 
Keep the CFG updated, but remember we removed the
+    // branch since we do want to execute NextMBB, either by introducing a
+    // branch to it again, or merging it into the entry block.
+    // How it's handled is decided further down.
     BBI.NonPredSize -= TII->removeBranch(*BBI.BB);
+    BBI.BB->removeSuccessor(&NextMBB, true);
+    RemovedBranchToNextMBB = true;
+
+    // Now merge the entry of the triangle with the true block.
     MergeBlocks(BBI, *CvtBBI, false);
   }
@@ -1640,12 +1651,19 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
     // block. By not merging them, we make it possible to iteratively
     // ifcvt the blocks.
     if (!HasEarlyExit &&
-        NextMBB.pred_size() == 1 && !NextBBI->HasFallThrough &&
+        // We might have removed BBI from NextMBB's predecessor list above but
+        // we want it to be there, so consider that too.
+        (NextMBB.pred_size() == (RemovedBranchToNextMBB ? 0 : 1)) &&
+        !NextBBI->HasFallThrough &&
         !NextMBB.hasAddressTaken()) {
+      // We will merge NextBBI into BBI, and thus remove the current
+      // fallthrough from BBI into CvtBBI.
+      BBI.BB->removeSuccessor(&CvtMBB, true);
       MergeBlocks(BBI, *NextBBI);
       FalseBBDead = true;
     } else {
       InsertUncondBranch(*BBI.BB, NextMBB, TII);
+      BBI.BB->addSuccessor(&NextMBB);
       BBI.HasFallThrough = false;
     }
     // Mixed predicated and unpredicated code. This cannot be iteratively
@@ -1653,8 +1671,6 @@ bool IfConverter::IfConvertTriangle(BBInfo &BBI, IfcvtKind Kind) {
       IterIfcvt = false;
   }
 
-  RemoveExtraEdges(BBI);
-
   // Update block info. BB can be iteratively if-converted.
   if (!IterIfcvt)
     BBI.IsDone = true;
diff --git a/lib/CodeGen/LiveRangeShrink.cpp b/lib/CodeGen/LiveRangeShrink.cpp
new file mode 100644
index 000000000000..00182e2c779f
--- /dev/null
+++ b/lib/CodeGen/LiveRangeShrink.cpp
@@ -0,0 +1,211 @@
+//===-- LiveRangeShrink.cpp - Move instructions to shrink live range ------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+///===---------------------------------------------------------------------===//
+///
+/// \file
+/// This pass moves instructions close to the definitions of their operands to
+/// shrink the live ranges of those definitions. The code motion is limited to
+/// within the basic block. The moved instruction should have one def and more
+/// than one use operand, where each such use is the only use of its defining
+/// instruction.
+///
+///===---------------------------------------------------------------------===//
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "lrshrink"
+
+STATISTIC(NumInstrsHoistedToShrinkLiveRange,
+          "Number of instructions hoisted to shrink live range.");
+
+using namespace llvm;
+
+namespace {
+class LiveRangeShrink : public MachineFunctionPass {
+public:
+  static char ID;
+
+  LiveRangeShrink() : MachineFunctionPass(ID) {
+    initializeLiveRangeShrinkPass(*PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override { return "Live Range Shrink"; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
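Before the helper definitions, a hypothetical before/after may make the intended motion concrete (pseudo machine code; register names invented):

// Before:                      After hoisting:
//   %a = LOAD ...                %a = LOAD ...
//   %b = LOAD ...                %b = LOAD ...
//   <unrelated instructions>     %c = ADD %a, %b   ; moved next to %b's def
//   %c = ADD %a, %b              <unrelated instructions>
//
// The ADD is the only user of %a and %b, so moving it up ends both live
// ranges immediately and lowers register pressure across the gap.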
+
+char LiveRangeShrink::ID = 0;
+char &llvm::LiveRangeShrinkID = LiveRangeShrink::ID;
+
+INITIALIZE_PASS(LiveRangeShrink, "lrshrink", "Live Range Shrink Pass", false,
+                false)
+namespace {
+typedef DenseMap<MachineInstr *, unsigned> InstOrderMap;
+
+/// Returns \p New if it's dominated by \p Old, otherwise return \p Old.
+/// \p M maintains a map from instruction to its dominating order, such that
+/// M[A] > M[B] guarantees that A is dominated by B.
+/// If \p New is not in \p M, return \p Old. Otherwise if \p Old is null, return
+/// \p New.
+MachineInstr *FindDominatedInstruction(MachineInstr &New, MachineInstr *Old,
+                                       const InstOrderMap &M) {
+  auto NewIter = M.find(&New);
+  if (NewIter == M.end())
+    return Old;
+  if (Old == nullptr)
+    return &New;
+  unsigned OrderOld = M.find(Old)->second;
+  unsigned OrderNew = NewIter->second;
+  if (OrderOld != OrderNew)
+    return OrderOld < OrderNew ? &New : Old;
+  // OrderOld == OrderNew, so we need to iterate down from Old to see if it
+  // can reach New; if so, New is dominated by Old.
+  for (MachineInstr *I = Old->getNextNode(); M.find(I)->second == OrderNew;
+       I = I->getNextNode())
+    if (I == &New)
+      return &New;
+  return Old;
+}
+
+/// Builds the instruction-to-dominating-order-number map \p M by traversing
+/// the block from instruction \p Start.
+void BuildInstOrderMap(MachineBasicBlock::iterator Start, InstOrderMap &M) {
+  M.clear();
+  unsigned i = 0;
+  for (MachineInstr &I : make_range(Start, Start->getParent()->end()))
+    M[&I] = i++;
+}
+} // end anonymous namespace
+
+bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
+  if (skipFunction(*MF.getFunction()))
+    return false;
+
+  MachineRegisterInfo &MRI = MF.getRegInfo();
+
+  DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n');
+
+  InstOrderMap IOM;
+  // Map from register to the instruction order (value of IOM) at which the
+  // register is last used. When moving an instruction up, we need to make
+  // sure all its defs (including dead defs) do not cross their last uses.
+  DenseMap<unsigned, unsigned> UseMap;
+
+  for (MachineBasicBlock &MBB : MF) {
+    if (MBB.empty())
+      continue;
+    bool SawStore = false;
+    BuildInstOrderMap(MBB.begin(), IOM);
+    UseMap.clear();
+
+    for (MachineBasicBlock::iterator Next = MBB.begin(); Next != MBB.end();) {
+      MachineInstr &MI = *Next;
+      ++Next;
+      if (MI.isPHI() || MI.isDebugValue())
+        continue;
+      if (MI.mayStore())
+        SawStore = true;
+
+      unsigned CurrentOrder = IOM[&MI];
+      unsigned Barrier = 0;
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg() || MO.isDebug())
+          continue;
+        if (MO.isUse())
+          UseMap[MO.getReg()] = CurrentOrder;
+        else if (MO.isDead() && UseMap.count(MO.getReg()))
+          // Barrier is the last instruction where MO gets used. MI should not
+          // be moved above Barrier.
+          Barrier = std::max(Barrier, UseMap[MO.getReg()]);
+      }
+
+      if (!MI.isSafeToMove(nullptr, SawStore)) {
+        // If MI has side effects, it should become a barrier for code motion.
+        // IOM is rebuilt from the next instruction to prevent later
+        // instructions from being moved before this MI.
+        if (MI.hasUnmodeledSideEffects() && Next != MBB.end()) {
+          BuildInstOrderMap(Next, IOM);
+          SawStore = false;
+        }
+        continue;
+      }
+
+      const MachineOperand *DefMO = nullptr;
+      MachineInstr *Insert = nullptr;
+
+      // Number of live-ranges that will be shortened. We do not count
+      // live-ranges that are defined by a COPY, as those could be coalesced
+      // later.
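Concretely, under this counting rule (hypothetical registers):

// %c = ADD %a, %b, where %a and %b each have one def and this ADD as their
// only use: NumEligibleUse == 2, so hoisting %c is considered worthwhile.
// If %b came from a COPY it would not be counted, since register coalescing
// may remove the COPY and its live range anyway.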
+      unsigned NumEligibleUse = 0;
+
+      for (const MachineOperand &MO : MI.operands()) {
+        if (!MO.isReg() || MO.isDead() || MO.isDebug())
+          continue;
+        unsigned Reg = MO.getReg();
+        // Do not move the instruction if it def/uses a physical register,
+        // unless it is a constant physical register.
+        if (TargetRegisterInfo::isPhysicalRegister(Reg) &&
+            !MRI.isConstantPhysReg(Reg)) {
+          Insert = nullptr;
+          break;
+        }
+        if (MO.isDef()) {
+          // Do not move if there is more than one def.
+          if (DefMO) {
+            Insert = nullptr;
+            break;
+          }
+          DefMO = &MO;
+        } else if (MRI.hasOneNonDBGUse(Reg) && MRI.hasOneDef(Reg)) {
+          MachineInstr &DefInstr = *MRI.def_instr_begin(Reg);
+          if (!DefInstr.isCopy())
+            NumEligibleUse++;
+          Insert = FindDominatedInstruction(DefInstr, Insert, IOM);
+        } else {
+          Insert = nullptr;
+          break;
+        }
+      }
+      // Move the instruction when # of shrunk live ranges > 1.
+      if (DefMO && Insert && NumEligibleUse > 1 && Barrier <= IOM[Insert]) {
+        MachineBasicBlock::iterator I = std::next(Insert->getIterator());
+        // Skip all the PHI and debug instructions.
+        while (I != MBB.end() && (I->isPHI() || I->isDebugValue()))
+          I = std::next(I);
+        if (I == MI.getIterator())
+          continue;
+
+        // Update the dominator order to be the same as the insertion point.
+        // We do this to maintain a non-decreasing order without needing to
+        // update all instruction orders after the insertion point.
+        unsigned NewOrder = IOM[&*I];
+        IOM[&MI] = NewOrder;
+        NumInstrsHoistedToShrinkLiveRange++;
+
+        // Find MI's debug values following MI.
+        MachineBasicBlock::iterator EndIter = std::next(MI.getIterator());
+        if (MI.getOperand(0).isReg())
+          for (; EndIter != MBB.end() && EndIter->isDebugValue() &&
+                 EndIter->getOperand(0).isReg() &&
+                 EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg();
+               ++EndIter, ++Next)
+            IOM[&*EndIter] = NewOrder;
+        MBB.splice(I, &MBB, MI.getIterator(), EndIter);
+      }
+    }
+  }
+  return false;
+}
diff --git a/lib/CodeGen/LiveVariables.cpp b/lib/CodeGen/LiveVariables.cpp
index 3568b0294ad9..a9aec926115a 100644
--- a/lib/CodeGen/LiveVariables.cpp
+++ b/lib/CodeGen/LiveVariables.cpp
@@ -767,7 +767,7 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB,
                                 MachineBasicBlock *SuccBB) {
   const unsigned NumNew = BB->getNumber();
 
-  SmallSet<unsigned, 4> Defs, Kills;
+  DenseSet<unsigned> Defs, Kills;
 
   MachineBasicBlock::iterator BBI = SuccBB->begin(), BBE = SuccBB->end();
   for (; BBI != BBE && BBI->isPHI(); ++BBI) {
diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp
index 4cfc128a8c1d..5003115a770f 100644
--- a/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/lib/CodeGen/MachineBlockPlacement.cpp
@@ -133,6 +133,14 @@ static cl::opt<unsigned> TailDupPlacementThreshold(
                   "that won't conflict."),
     cl::init(2), cl::Hidden);
 
+// Heuristic for aggressive tail duplication.
+static cl::opt<unsigned> TailDupPlacementAggressiveThreshold(
+    "tail-dup-placement-aggressive-threshold",
+    cl::desc("Instruction cutoff for aggressive tail duplication during "
+             "layout. Used at -O3. Tail merging during layout is forced to "
+             "have a threshold that won't conflict."), cl::init(3),
+    cl::Hidden);
+
 // Heuristic for tail duplication.
 static cl::opt<unsigned> TailDupPlacementPenalty(
     "tail-dup-placement-penalty",
@@ -2646,9 +2654,26 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   assert(BlockToChain.empty());
   assert(ComputedEdges.empty());
 
+  unsigned TailDupSize = TailDupPlacementThreshold;
+  // If only the aggressive threshold is explicitly set, use it.
+  if (TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0 &&
+      TailDupPlacementThreshold.getNumOccurrences() == 0)
+    TailDupSize = TailDupPlacementAggressiveThreshold;
+
+  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
+  // For aggressive optimization, we can adjust some thresholds to be less
+  // conservative.
+  if (PassConfig->getOptLevel() >= CodeGenOpt::Aggressive) {
+    // At O3 we should be more willing to copy blocks for tail duplication.
+    // This increases size pressure, so we only do it at O3.
+    // Do this unless only the regular threshold is explicitly set.
+    if (TailDupPlacementThreshold.getNumOccurrences() == 0 ||
+        TailDupPlacementAggressiveThreshold.getNumOccurrences() != 0)
+      TailDupSize = TailDupPlacementAggressiveThreshold;
+  }
+
   if (TailDupPlacement) {
     MPDT = &getAnalysis<MachinePostDominatorTree>();
-    unsigned TailDupSize = TailDupPlacementThreshold;
     if (MF.getFunction()->optForSize())
       TailDupSize = 1;
     TailDup.initMF(MF, MBPI, /* LayoutMode */ true, TailDupSize);
@@ -2658,7 +2683,6 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
   buildCFGChains();
 
   // Changing the layout can create new tail merging opportunities.
-  TargetPassConfig *PassConfig = &getAnalysis<TargetPassConfig>();
   // TailMerge can create jump into if branches that make CFG irreducible for
   // HW that requires structured CFG.
   bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
@@ -2666,7 +2690,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
                          BranchFoldPlacement;
   // No tail merging opportunities if the block number is less than four.
   if (MF.size() > 3 && EnableTailMerge) {
-    unsigned TailMergeSize = TailDupPlacementThreshold + 1;
+    unsigned TailMergeSize = TailDupSize + 1;
     BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
                     *MBPI, TailMergeSize);
 
diff --git a/lib/CodeGen/MachineVerifier.cpp b/lib/CodeGen/MachineVerifier.cpp
index bfb2cde030dc..ab433273b189 100644
--- a/lib/CodeGen/MachineVerifier.cpp
+++ b/lib/CodeGen/MachineVerifier.cpp
@@ -2063,12 +2063,12 @@ void MachineVerifier::verifyStackFrame() {
     if (I.getOpcode() == FrameSetupOpcode) {
       if (BBState.ExitIsSetup)
         report("FrameSetup is after another FrameSetup", &I);
-      BBState.ExitValue -= TII->getFrameSize(I);
+      BBState.ExitValue -= TII->getFrameTotalSize(I);
       BBState.ExitIsSetup = true;
     }
     if (I.getOpcode() == FrameDestroyOpcode) {
-      int Size = TII->getFrameSize(I);
+      int Size = TII->getFrameTotalSize(I);
       if (!BBState.ExitIsSetup)
         report("FrameDestroy is not after a FrameSetup", &I);
      int AbsSPAdj = BBState.ExitValue < 0 ?
-BBState.ExitValue : diff --git a/lib/CodeGen/PHIElimination.cpp b/lib/CodeGen/PHIElimination.cpp index c67a25b888bf..db2264b2439d 100644 --- a/lib/CodeGen/PHIElimination.cpp +++ b/lib/CodeGen/PHIElimination.cpp @@ -34,7 +34,7 @@ #include using namespace llvm; -#define DEBUG_TYPE "phielim" +#define DEBUG_TYPE "phi-node-elimination" static cl::opt DisableEdgeSplitting("disable-phi-elim-edge-splitting", cl::init(false), diff --git a/lib/CodeGen/RegisterCoalescer.cpp b/lib/CodeGen/RegisterCoalescer.cpp index bf44ee8453b6..1803ea2b9249 100644 --- a/lib/CodeGen/RegisterCoalescer.cpp +++ b/lib/CodeGen/RegisterCoalescer.cpp @@ -3214,7 +3214,7 @@ RegisterCoalescer::copyCoalesceInMBB(MachineBasicBlock *MBB) { CurrList(WorkList.begin() + PrevSize, WorkList.end()); if (copyCoalesceWorkList(CurrList)) WorkList.erase(std::remove(WorkList.begin() + PrevSize, WorkList.end(), - (MachineInstr*)nullptr), WorkList.end()); + nullptr), WorkList.end()); } void RegisterCoalescer::coalesceLocals() { diff --git a/lib/CodeGen/RegisterScavenging.cpp b/lib/CodeGen/RegisterScavenging.cpp index 35db30f89976..0635e5c0a63c 100644 --- a/lib/CodeGen/RegisterScavenging.cpp +++ b/lib/CodeGen/RegisterScavenging.cpp @@ -62,10 +62,9 @@ void RegScavenger::init(MachineBasicBlock &MBB) { } this->MBB = &MBB; - for (SmallVectorImpl::iterator I = Scavenged.begin(), - IE = Scavenged.end(); I != IE; ++I) { - I->Reg = 0; - I->Restore = nullptr; + for (ScavengedInfo &SI : Scavenged) { + SI.Reg = 0; + SI.Restore = nullptr; } Tracking = false; diff --git a/lib/CodeGen/SafeStack.cpp b/lib/CodeGen/SafeStack.cpp index 7fa379d80c6c..08b3d345f689 100644 --- a/lib/CodeGen/SafeStack.cpp +++ b/lib/CodeGen/SafeStack.cpp @@ -19,6 +19,7 @@ #include "SafeStackLayout.h" #include "llvm/ADT/Statistic.h" #include "llvm/ADT/Triple.h" +#include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/BranchProbabilityInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" @@ -92,11 +93,11 @@ public: /// determined statically), and the unsafe stack, which contains all /// local variables that are accessed in ways that we can't prove to /// be safe. -class SafeStack : public FunctionPass { - const TargetMachine *TM; - const TargetLoweringBase *TL; - const DataLayout *DL; - ScalarEvolution *SE; +class SafeStack { + Function &F; + const TargetLoweringBase &TL; + const DataLayout &DL; + ScalarEvolution &SE; Type *StackPtrTy; Type *IntPtrTy; @@ -171,33 +172,21 @@ class SafeStack : public FunctionPass { uint64_t AllocaSize); public: - static char ID; // Pass identification, replacement for typeid. 
- SafeStack(const TargetMachine *TM) - : FunctionPass(ID), TM(TM), TL(nullptr), DL(nullptr) { - initializeSafeStackPass(*PassRegistry::getPassRegistry()); - } - SafeStack() : SafeStack(nullptr) {} - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - } - - bool doInitialization(Module &M) override { - DL = &M.getDataLayout(); - - StackPtrTy = Type::getInt8PtrTy(M.getContext()); - IntPtrTy = DL->getIntPtrType(M.getContext()); - Int32Ty = Type::getInt32Ty(M.getContext()); - Int8Ty = Type::getInt8Ty(M.getContext()); - - return false; - } - - bool runOnFunction(Function &F) override; -}; // class SafeStack + SafeStack(Function &F, const TargetLoweringBase &TL, const DataLayout &DL, + ScalarEvolution &SE) + : F(F), TL(TL), DL(DL), SE(SE), + StackPtrTy(Type::getInt8PtrTy(F.getContext())), + IntPtrTy(DL.getIntPtrType(F.getContext())), + Int32Ty(Type::getInt32Ty(F.getContext())), + Int8Ty(Type::getInt8Ty(F.getContext())) {} + + // Run the transformation on the associated function. + // Returns whether the function was changed. + bool run(); +}; uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { - uint64_t Size = DL->getTypeAllocSize(AI->getAllocatedType()); + uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType()); if (AI->isArrayAllocation()) { auto C = dyn_cast(AI->getArraySize()); if (!C) @@ -209,11 +198,11 @@ uint64_t SafeStack::getStaticAllocaAllocationSize(const AllocaInst* AI) { bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize, const Value *AllocaPtr, uint64_t AllocaSize) { - AllocaOffsetRewriter Rewriter(*SE, AllocaPtr); - const SCEV *Expr = Rewriter.visit(SE->getSCEV(Addr)); + AllocaOffsetRewriter Rewriter(SE, AllocaPtr); + const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr)); - uint64_t BitWidth = SE->getTypeSizeInBits(Expr->getType()); - ConstantRange AccessStartRange = SE->getUnsignedRange(Expr); + uint64_t BitWidth = SE.getTypeSizeInBits(Expr->getType()); + ConstantRange AccessStartRange = SE.getUnsignedRange(Expr); ConstantRange SizeRange = ConstantRange(APInt(BitWidth, 0), APInt(BitWidth, AccessSize)); ConstantRange AccessRange = AccessStartRange.add(SizeRange); @@ -226,8 +215,8 @@ bool SafeStack::IsAccessSafe(Value *Addr, uint64_t AccessSize, << *AllocaPtr << "\n" << " Access " << *Addr << "\n" << " SCEV " << *Expr - << " U: " << SE->getUnsignedRange(Expr) - << ", S: " << SE->getSignedRange(Expr) << "\n" + << " U: " << SE.getUnsignedRange(Expr) + << ", S: " << SE.getSignedRange(Expr) << "\n" << " Range " << AccessRange << "\n" << " AllocaRange " << AllocaRange << "\n" << " " << (Safe ? 
"safe" : "unsafe") << "\n"); @@ -266,7 +255,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { switch (I->getOpcode()) { case Instruction::Load: { - if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getType()), AllocaPtr, + if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getType()), AllocaPtr, AllocaSize)) return false; break; @@ -282,7 +271,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { return false; } - if (!IsAccessSafe(UI, DL->getTypeStoreSize(I->getOperand(0)->getType()), + if (!IsAccessSafe(UI, DL.getTypeStoreSize(I->getOperand(0)->getType()), AllocaPtr, AllocaSize)) return false; break; @@ -343,7 +332,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) { } Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) { - Value *StackGuardVar = TL->getIRStackGuard(IRB); + Value *StackGuardVar = TL.getIRStackGuard(IRB); if (!StackGuardVar) StackGuardVar = F.getParent()->getOrInsertGlobal("__stack_chk_guard", StackPtrTy); @@ -390,7 +379,7 @@ void SafeStack::findInsts(Function &F, if (!Arg.hasByValAttr()) continue; uint64_t Size = - DL->getTypeStoreSize(Arg.getType()->getPointerElementType()); + DL.getTypeStoreSize(Arg.getType()->getPointerElementType()); if (IsSafeStackAlloca(&Arg, Size)) continue; @@ -476,19 +465,19 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( if (StackGuardSlot) { Type *Ty = StackGuardSlot->getAllocatedType(); unsigned Align = - std::max(DL->getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment()); + std::max(DL.getPrefTypeAlignment(Ty), StackGuardSlot->getAlignment()); SSL.addObject(StackGuardSlot, getStaticAllocaAllocationSize(StackGuardSlot), Align, SSC.getFullLiveRange()); } for (Argument *Arg : ByValArguments) { Type *Ty = Arg->getType()->getPointerElementType(); - uint64_t Size = DL->getTypeStoreSize(Ty); + uint64_t Size = DL.getTypeStoreSize(Ty); if (Size == 0) Size = 1; // Don't create zero-sized stack objects. // Ensure the object is properly aligned. - unsigned Align = std::max((unsigned)DL->getPrefTypeAlignment(Ty), + unsigned Align = std::max((unsigned)DL.getPrefTypeAlignment(Ty), Arg->getParamAlignment()); SSL.addObject(Arg, Size, Align, SSC.getFullLiveRange()); } @@ -501,7 +490,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( // Ensure the object is properly aligned. unsigned Align = - std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()); + std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment()); SSL.addObject(AI, Size, Align, SSC.getLiveRange(AI)); } @@ -539,7 +528,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack( unsigned Offset = SSL.getObjectOffset(Arg); Type *Ty = Arg->getType()->getPointerElementType(); - uint64_t Size = DL->getTypeStoreSize(Ty); + uint64_t Size = DL.getTypeStoreSize(Ty); if (Size == 0) Size = 1; // Don't create zero-sized stack objects. @@ -630,7 +619,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( ArraySize = IRB.CreateIntCast(ArraySize, IntPtrTy, false); Type *Ty = AI->getAllocatedType(); - uint64_t TySize = DL->getTypeAllocSize(Ty); + uint64_t TySize = DL.getTypeAllocSize(Ty); Value *Size = IRB.CreateMul(ArraySize, ConstantInt::get(IntPtrTy, TySize)); Value *SP = IRB.CreatePtrToInt(IRB.CreateLoad(UnsafeStackPtr), IntPtrTy); @@ -638,7 +627,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack( // Align the SP value to satisfy the AllocaInst, type and stack alignments. 
   unsigned Align = std::max(
-      std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI->getAlignment()),
+      std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment()),
       (unsigned)StackAlignment);

   assert(isPowerOf2_32(Align));
@@ -685,25 +674,10 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
   }
 }

-bool SafeStack::runOnFunction(Function &F) {
-  DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
-
-  if (!F.hasFnAttribute(Attribute::SafeStack)) {
-    DEBUG(dbgs() << "[SafeStack]     safestack is not requested"
-                    " for this function\n");
-    return false;
-  }
-
-  if (F.isDeclaration()) {
-    DEBUG(dbgs() << "[SafeStack]     function definition"
-                    " is not available\n");
-    return false;
-  }
-
-  if (!TM)
-    report_fatal_error("Target machine is required");
-  TL = TM->getSubtargetImpl(F)->getTargetLowering();
-  SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+bool SafeStack::run() {
+  assert(F.hasFnAttribute(Attribute::SafeStack) &&
+         "Can't run SafeStack on a function without the attribute");
+  assert(!F.isDeclaration() && "Can't run SafeStack on a function declaration");

   ++NumFunctions;

@@ -736,7 +710,7 @@ bool SafeStack::runOnFunction(Function &F) {
     ++NumUnsafeStackRestorePointsFunctions;

   IRBuilder<> IRB(&F.front(), F.begin()->getFirstInsertionPt());
-  UnsafeStackPtr = TL->getSafeStackPointerLocation(IRB);
+  UnsafeStackPtr = TL.getSafeStackPointerLocation(IRB);

   // Load the current stack pointer (we'll also use it as a base pointer).
   // FIXME: use a dedicated register for it ?
@@ -788,14 +762,70 @@ bool SafeStack::runOnFunction(Function &F) {
   return true;
 }

+class SafeStackLegacyPass : public FunctionPass {
+  const TargetMachine *TM;
+
+public:
+  static char ID; // Pass identification, replacement for typeid.
+  SafeStackLegacyPass(const TargetMachine *TM) : FunctionPass(ID), TM(TM) {
+    initializeSafeStackLegacyPassPass(*PassRegistry::getPassRegistry());
+  }
+
+  SafeStackLegacyPass() : SafeStackLegacyPass(nullptr) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetLibraryInfoWrapperPass>();
+    AU.addRequired<AssumptionCacheTracker>();
+  }
+
+  bool runOnFunction(Function &F) override {
+    DEBUG(dbgs() << "[SafeStack] Function: " << F.getName() << "\n");
+
+    if (!F.hasFnAttribute(Attribute::SafeStack)) {
+      DEBUG(dbgs() << "[SafeStack]     safestack is not requested"
+                      " for this function\n");
+      return false;
+    }
+
+    if (F.isDeclaration()) {
+      DEBUG(dbgs() << "[SafeStack]     function definition"
+                      " is not available\n");
+      return false;
+    }
+
+    if (!TM)
+      report_fatal_error("Target machine is required");
+    auto *TL = TM->getSubtargetImpl(F)->getTargetLowering();
+    if (!TL)
+      report_fatal_error("TargetLowering instance is required");
+
+    auto *DL = &F.getParent()->getDataLayout();
+    auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI();
+    auto &ACT = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+
+    // Compute DT and LI only for functions that have the attribute.
+    // This is only useful because the legacy pass manager doesn't let us
+    // compute analyses lazily.
+    // In the backend pipeline, nothing preserves DT before SafeStack, so we
+    // would otherwise always compute it wastefully, even if there is no
+    // function with the safestack attribute.
+    DominatorTree DT(F);
+    LoopInfo LI(DT);
+
+    ScalarEvolution SE(F, TLI, ACT, DT, LI);
+
+    return SafeStack(F, *TL, *DL, SE).run();
+  }
+};
+
 } // anonymous namespace

-char SafeStack::ID = 0;
-INITIALIZE_TM_PASS_BEGIN(SafeStack, "safe-stack",
+char SafeStackLegacyPass::ID = 0;
+INITIALIZE_TM_PASS_BEGIN(SafeStackLegacyPass, "safe-stack",
                          "Safe Stack instrumentation pass", false, false)
-INITIALIZE_TM_PASS_END(SafeStack, "safe-stack",
+INITIALIZE_TM_PASS_END(SafeStackLegacyPass, "safe-stack",
                        "Safe Stack instrumentation pass", false, false)

 FunctionPass *llvm::createSafeStackPass(const llvm::TargetMachine *TM) {
-  return new SafeStack(TM);
+  return new SafeStackLegacyPass(TM);
 }
diff --git a/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
new file mode 100644
index 000000000000..dab5b91f50ad
--- /dev/null
+++ b/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -0,0 +1,660 @@
+//=== ScalarizeMaskedMemIntrin.cpp - Scalarize unsupported masked mem ===//
+//=== intrinsics ===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass replaces masked memory intrinsics - when unsupported by the target
+// - with a chain of basic blocks that deal with the elements one-by-one if the
+// appropriate mask bit is set.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "scalarize-masked-mem-intrin"
+
+namespace {
+
+class ScalarizeMaskedMemIntrin : public FunctionPass {
+  const TargetTransformInfo *TTI;
+
+public:
+  static char ID; // Pass identification, replacement for typeid
+  explicit ScalarizeMaskedMemIntrin() : FunctionPass(ID), TTI(nullptr) {
+    initializeScalarizeMaskedMemIntrinPass(*PassRegistry::getPassRegistry());
+  }
+  bool runOnFunction(Function &F) override;
+
+  StringRef getPassName() const override {
+    return "Scalarize Masked Memory Intrinsics";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetTransformInfoWrapperPass>();
+  }
+
+private:
+  bool optimizeBlock(BasicBlock &BB, bool &ModifiedDT);
+  bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
+};
+} // namespace
+
+char ScalarizeMaskedMemIntrin::ID = 0;
+INITIALIZE_PASS_BEGIN(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin",
+                      "Scalarize unsupported masked memory intrinsics", false,
+                      false)
+INITIALIZE_PASS_END(ScalarizeMaskedMemIntrin, "scalarize-masked-mem-intrin",
+                    "Scalarize unsupported masked memory intrinsics", false,
+                    false)
+
+FunctionPass *llvm::createScalarizeMaskedMemIntrinPass() {
+  return new ScalarizeMaskedMemIntrin();
+}
+
+// Translate a masked load intrinsic like
+// <16 x i32 > @llvm.masked.load( <16 x i32>* %addr, i32 align,
+//                                <16 x i1> %mask, <16 x i32> %passthru)
+// to a chain of basic blocks, with loading element one-by-one if
+// the appropriate mask bit is set
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.load, label %else
+//
+// cond.load: ; preds = %0
+// %4 = getelementptr i32* %1, i32 0
+// %5 = load i32* %4
+// %6 = insertelement <16 x i32> undef, i32 %5, i32 0
+// br label %else
+//
+// else: ; preds = %0, %cond.load
+// %res.phi.else = phi <16 x i32> [ %6, %cond.load ], [ undef, %0 ]
+// %7 = extractelement <16 x i1> %mask, i32 1
+// %8 = icmp eq i1 %7, true
+// br i1 %8, label %cond.load1, label %else2
+//
+// cond.load1: ; preds = %else
+// %9 = getelementptr i32* %1, i32 1
+// %10 = load i32* %9
+// %11 = insertelement <16 x i32> %res.phi.else, i32 %10, i32 1
+// br label %else2
+//
+// else2: ; preds = %else, %cond.load1
+// %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+// %12 = extractelement <16 x i1> %mask, i32 2
+// %13 = icmp eq i1 %12, true
+// br i1 %13, label %cond.load4, label %else5
+//
+static void scalarizeMaskedLoad(CallInst *CI) {
+  Value *Ptr = CI->getArgOperand(0);
+  Value *Alignment = CI->getArgOperand(1);
+  Value *Mask = CI->getArgOperand(2);
+  Value *Src0 = CI->getArgOperand(3);
+
+  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+  VectorType *VecType = dyn_cast<VectorType>(CI->getType());
+  assert(VecType && "Unexpected return type of masked load intrinsic");
+
+  Type *EltTy = CI->getType()->getVectorElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  BasicBlock *CondBlock = nullptr;
+  BasicBlock *PrevIfBlock = CI->getParent();
+
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Short-cut if the mask is all-true.
+  bool IsAllOnesMask =
+      isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+  if (IsAllOnesMask) {
+    Value *NewI = Builder.CreateAlignedLoad(Ptr, AlignVal);
+    CI->replaceAllUsesWith(NewI);
+    CI->eraseFromParent();
+    return;
+  }
+
+  // Adjust alignment for the scalar instruction.
+  AlignVal = std::min(AlignVal, VecType->getScalarSizeInBits() / 8);
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+      EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+  unsigned VectorWidth = VecType->getNumElements();
+
+  Value *UndefVal = UndefValue::get(VecType);
+
+  // The result vector
+  Value *VResult = UndefVal;
+
+  if (isa<ConstantVector>(Mask)) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+        continue;
+      Value *Gep =
+          Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+      LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+      VResult =
+          Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+    }
+    Value *NewI = Builder.CreateSelect(Mask, VResult, Src0);
+    CI->replaceAllUsesWith(NewI);
+    CI->eraseFromParent();
+    return;
+  }
+
+  PHINode *Phi = nullptr;
+  Value *PrevPhi = UndefVal;
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    // %res.phi.else3 = phi <16 x i32> [ %11, %cond.load1 ], [ %res.phi.else, %else ]
+    // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    // %to_load = icmp eq i1 %mask_1, true
+    // br i1 %to_load, label %cond.load, label %else
+    //
+    if (Idx > 0) {
+      Phi = Builder.CreatePHI(VecType, 2, "res.phi.else");
+      Phi->addIncoming(VResult, CondBlock);
+      Phi->addIncoming(PrevPhi, PrevIfBlock);
+      PrevPhi = Phi;
+      VResult = Phi;
+    }
+
+    Value *Predicate =
+        Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    // %EltAddr = getelementptr i32* %1, i32 0
+    // %Elt = load i32* %EltAddr
+    // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx
+    //
+    CondBlock = IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.load");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *Gep =
+        Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+    LoadInst *Load = Builder.CreateAlignedLoad(Gep, AlignVal);
+    VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx));
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock =
+        CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    PrevIfBlock = IfBlock;
+    IfBlock = NewIfBlock;
+  }
+
+  Phi = Builder.CreatePHI(VecType, 2, "res.phi.select");
+  Phi->addIncoming(VResult, CondBlock);
+  Phi->addIncoming(PrevPhi, PrevIfBlock);
+  Value *NewI = Builder.CreateSelect(Mask, Phi, Src0);
+  CI->replaceAllUsesWith(NewI);
+  CI->eraseFromParent();
+}
+
+// Translate a masked store intrinsic, like
+// void @llvm.masked.store(<16 x i32> %src, <16 x i32>* %addr, i32 align,
+//                         <16 x i1> %mask)
+// to a chain of basic blocks, that stores element one-by-one if
+// the appropriate mask bit is set
+//
+// %1 = bitcast i8* %addr to i32*
+// %2 = extractelement <16 x i1> %mask, i32 0
+// %3 = icmp eq i1 %2, true
+// br i1 %3, label %cond.store, label %else
+//
+// cond.store: ; preds = %0
+// %4 = extractelement <16 x i32> %val, i32 0
+// %5 = getelementptr i32* %1, i32 0
+// store i32 %4, i32* %5
+// br label %else
+//
+// else: ; preds = %0, %cond.store
+// %6 = extractelement <16 x i1> %mask, i32 1
+// %7 = icmp eq i1 %6, true
+// br i1 %7, label %cond.store1, label %else2
+//
+// cond.store1: ; preds = %else
+// %8 = extractelement <16 x i32> %val, i32 1
+// %9 = getelementptr i32* %1, i32 1
+// store i32 %8, i32* %9
+// br label %else2
+// . . .
+static void scalarizeMaskedStore(CallInst *CI) {
+  Value *Src = CI->getArgOperand(0);
+  Value *Ptr = CI->getArgOperand(1);
+  Value *Alignment = CI->getArgOperand(2);
+  Value *Mask = CI->getArgOperand(3);
+
+  unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+  VectorType *VecType = dyn_cast<VectorType>(Src->getType());
+  assert(VecType && "Unexpected data type in masked store intrinsic");
+
+  Type *EltTy = VecType->getElementType();
+
+  IRBuilder<> Builder(CI->getContext());
+  Instruction *InsertPt = CI;
+  BasicBlock *IfBlock = CI->getParent();
+  Builder.SetInsertPoint(InsertPt);
+  Builder.SetCurrentDebugLocation(CI->getDebugLoc());
+
+  // Short-cut if the mask is all-true.
+  bool IsAllOnesMask =
+      isa<Constant>(Mask) && cast<Constant>(Mask)->isAllOnesValue();
+
+  if (IsAllOnesMask) {
+    Builder.CreateAlignedStore(Src, Ptr, AlignVal);
+    CI->eraseFromParent();
+    return;
+  }
+
+  // Adjust alignment for the scalar instruction.
+  AlignVal = std::max(AlignVal, VecType->getScalarSizeInBits() / 8);
+  // Bitcast %addr from i8* to EltTy*
+  Type *NewPtrType =
+      EltTy->getPointerTo(cast<PointerType>(Ptr->getType())->getAddressSpace());
+  Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
+  unsigned VectorWidth = VecType->getNumElements();
+
+  if (isa<ConstantVector>(Mask)) {
+    for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+      if (cast<ConstantVector>(Mask)->getOperand(Idx)->isNullValue())
+        continue;
+      Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+      Value *Gep =
+          Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+      Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+    }
+    CI->eraseFromParent();
+    return;
+  }
+
+  for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
+
+    // Fill the "else" block, created in the previous iteration
+    //
+    // %mask_1 = extractelement <16 x i1> %mask, i32 Idx
+    // %to_store = icmp eq i1 %mask_1, true
+    // br i1 %to_store, label %cond.store, label %else
+    //
+    Value *Predicate =
+        Builder.CreateExtractElement(Mask, Builder.getInt32(Idx));
+    Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate,
+                                    ConstantInt::get(Predicate->getType(), 1));
+
+    // Create "cond" block
+    //
+    // %OneElt = extractelement <16 x i32> %Src, i32 Idx
+    // %EltAddr = getelementptr i32* %1, i32 0
+    // %store i32 %OneElt, i32* %EltAddr
+    //
+    BasicBlock *CondBlock =
+        IfBlock->splitBasicBlock(InsertPt->getIterator(), "cond.store");
+    Builder.SetInsertPoint(InsertPt);
+
+    Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx));
+    Value *Gep =
+        Builder.CreateInBoundsGEP(EltTy, FirstEltPtr, Builder.getInt32(Idx));
+    Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+
+    // Create "else" block, fill it in the next iteration
+    BasicBlock *NewIfBlock =
+        CondBlock->splitBasicBlock(InsertPt->getIterator(), "else");
+    Builder.SetInsertPoint(InsertPt);
+    Instruction *OldBr = IfBlock->getTerminator();
+    BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr);
+    OldBr->eraseFromParent();
+    IfBlock = NewIfBlock;
+  }
+  CI->eraseFromParent();
+}
+
+// Translate a masked gather intrinsic like
+// <16 x i32 > @llvm.masked.gather.v16i32( <16 x i32*> %Ptrs, i32 4,
+//                                         <16 x i1> %Mask, <16 x i32> %Src)
+// to a chain of basic blocks, with loading element one-by-one if
+// the appropriate mask bit is set
+//
+// % Ptrs = getelementptr i32, i32* %base, <16 x i64> %ind
+// % Mask0 = extractelement <16 x i1> %Mask, i32 0
+// % ToLoad0 = icmp eq i1 % Mask0, true
+// br i1 % ToLoad0, label %cond.load, label %else
+//
+// cond.load:
+// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0
+// % Load0 = load i32, i32* % Ptr0, align 4
+// % Res0 = insertelement <16 x i32> undef, i32 % Load0, i32 0
+// br label %else
+//
+// else:
+// %res.phi.else = phi <16 x i32>[% Res0, %cond.load], [undef, % 0]
+// % Mask1 = extractelement <16 x i1> %Mask, i32 1
+// % ToLoad1 = icmp eq i1 % Mask1, true
+// br i1 % ToLoad1, label %cond.load1, label %else2
+//
+// cond.load1:
+// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1
+// % Load1 = load i32, i32* % Ptr1, align 4
+// % Res1 = insertelement <16 x i32> %res.phi.else, i32 % Load1, i32 1
+// br label %else2
+// . . .
+// % Result = select <16 x i1> %Mask, <16 x i32> %res.phi.select, <16 x i32> %Src +// ret <16 x i32> %Result +static void scalarizeMaskedGather(CallInst *CI) { + Value *Ptrs = CI->getArgOperand(0); + Value *Alignment = CI->getArgOperand(1); + Value *Mask = CI->getArgOperand(2); + Value *Src0 = CI->getArgOperand(3); + + VectorType *VecType = dyn_cast(CI->getType()); + + assert(VecType && "Unexpected return type of masked load intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + BasicBlock *CondBlock = nullptr; + BasicBlock *PrevIfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + unsigned AlignVal = cast(Alignment)->getZExtValue(); + + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + Value *UndefVal = UndefValue::get(VecType); + + // The result vector + Value *VResult = UndefVal; + unsigned VectorWidth = VecType->getNumElements(); + + // Shorten the way if the mask is a vector of constants. + bool IsConstMask = isa(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load = + Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx)); + VResult = Builder.CreateInsertElement( + VResult, Load, Builder.getInt32(Idx), "Res" + Twine(Idx)); + } + Value *NewI = Builder.CreateSelect(Mask, VResult, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); + return; + } + + PHINode *Phi = nullptr; + Value *PrevPhi = UndefVal; + + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + + // Fill the "else" block, created in the previous iteration + // + // %Mask1 = extractelement <16 x i1> %Mask, i32 1 + // %ToLoad1 = icmp eq i1 %Mask1, true + // br i1 %ToLoad1, label %cond.load, label %else + // + if (Idx > 0) { + Phi = Builder.CreatePHI(VecType, 2, "res.phi.else"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + PrevPhi = Phi; + VResult = Phi; + } + + Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToLoad" + Twine(Idx)); + + // Create "cond" block + // + // %EltAddr = getelementptr i32* %1, i32 0 + // %Elt = load i32* %EltAddr + // VResult = insertelement <16 x i32> VResult, i32 %Elt, i32 Idx + // + CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.load"); + Builder.SetInsertPoint(InsertPt); + + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + LoadInst *Load = + Builder.CreateAlignedLoad(Ptr, AlignVal, "Load" + Twine(Idx)); + VResult = Builder.CreateInsertElement(VResult, Load, Builder.getInt32(Idx), + "Res" + Twine(Idx)); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + PrevIfBlock = IfBlock; + IfBlock = NewIfBlock; + } + + Phi = Builder.CreatePHI(VecType, 2, "res.phi.select"); + Phi->addIncoming(VResult, CondBlock); + Phi->addIncoming(PrevPhi, PrevIfBlock); + Value *NewI = Builder.CreateSelect(Mask, Phi, Src0); + CI->replaceAllUsesWith(NewI); + CI->eraseFromParent(); +} + +// Translate a masked scatter 
intrinsic, like +// void @llvm.masked.scatter.v16i32(<16 x i32> %Src, <16 x i32*>* %Ptrs, i32 4, +// <16 x i1> %Mask) +// to a chain of basic blocks, that stores element one-by-one if +// the appropriate mask bit is set. +// +// % Ptrs = getelementptr i32, i32* %ptr, <16 x i64> %ind +// % Mask0 = extractelement <16 x i1> % Mask, i32 0 +// % ToStore0 = icmp eq i1 % Mask0, true +// br i1 %ToStore0, label %cond.store, label %else +// +// cond.store: +// % Elt0 = extractelement <16 x i32> %Src, i32 0 +// % Ptr0 = extractelement <16 x i32*> %Ptrs, i32 0 +// store i32 %Elt0, i32* % Ptr0, align 4 +// br label %else +// +// else: +// % Mask1 = extractelement <16 x i1> % Mask, i32 1 +// % ToStore1 = icmp eq i1 % Mask1, true +// br i1 % ToStore1, label %cond.store1, label %else2 +// +// cond.store1: +// % Elt1 = extractelement <16 x i32> %Src, i32 1 +// % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 +// store i32 % Elt1, i32* % Ptr1, align 4 +// br label %else2 +// . . . +static void scalarizeMaskedScatter(CallInst *CI) { + Value *Src = CI->getArgOperand(0); + Value *Ptrs = CI->getArgOperand(1); + Value *Alignment = CI->getArgOperand(2); + Value *Mask = CI->getArgOperand(3); + + assert(isa(Src->getType()) && + "Unexpected data type in masked scatter intrinsic"); + assert(isa(Ptrs->getType()) && + isa(Ptrs->getType()->getVectorElementType()) && + "Vector of pointers is expected in masked scatter intrinsic"); + + IRBuilder<> Builder(CI->getContext()); + Instruction *InsertPt = CI; + BasicBlock *IfBlock = CI->getParent(); + Builder.SetInsertPoint(InsertPt); + Builder.SetCurrentDebugLocation(CI->getDebugLoc()); + + unsigned AlignVal = cast(Alignment)->getZExtValue(); + unsigned VectorWidth = Src->getType()->getVectorNumElements(); + + // Shorten the way if the mask is a vector of constants. 
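For readers tracing these four expansions, the lane-by-lane semantics being materialized can be stated compactly in scalar form. The following is an illustrative plain-C++ model (the names and fixed-array signatures are invented for the sketch), not code from the pass:

#include <cstddef>

// Scalar model of the masked-intrinsic semantics: a lane participates only
// if its mask bit is set. Masked-off lanes of a load keep the pass-through
// value; masked-off lanes of a scatter touch no memory at all.
template <typename T, std::size_t N>
void maskedLoadModel(T (&Dst)[N], const T *Ptr, const bool (&Mask)[N],
                     const T (&PassThru)[N]) {
  for (std::size_t I = 0; I != N; ++I)
    Dst[I] = Mask[I] ? Ptr[I]       // the "cond.load" block for lane I
                     : PassThru[I]; // lane keeps the pass-through value
}

template <typename T, std::size_t N>
void maskedScatterModel(const T (&Src)[N], T *const (&Ptrs)[N],
                        const bool (&Mask)[N]) {
  for (std::size_t I = 0; I != N; ++I)
    if (Mask[I])
      *Ptrs[I] = Src[I];            // the "cond.store" block for lane I
}

The constant-mask fast paths below correspond to fully unrolling these loops and dropping every iteration whose mask bit is a known zero.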
+ bool IsConstMask = isa(Mask); + + if (IsConstMask) { + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + if (cast(Mask)->getOperand(Idx)->isNullValue()) + continue; + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); + } + CI->eraseFromParent(); + return; + } + for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) { + // Fill the "else" block, created in the previous iteration + // + // % Mask1 = extractelement <16 x i1> % Mask, i32 Idx + // % ToStore = icmp eq i1 % Mask1, true + // br i1 % ToStore, label %cond.store, label %else + // + Value *Predicate = Builder.CreateExtractElement(Mask, Builder.getInt32(Idx), + "Mask" + Twine(Idx)); + Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Predicate, + ConstantInt::get(Predicate->getType(), 1), + "ToStore" + Twine(Idx)); + + // Create "cond" block + // + // % Elt1 = extractelement <16 x i32> %Src, i32 1 + // % Ptr1 = extractelement <16 x i32*> %Ptrs, i32 1 + // %store i32 % Elt1, i32* % Ptr1 + // + BasicBlock *CondBlock = IfBlock->splitBasicBlock(InsertPt, "cond.store"); + Builder.SetInsertPoint(InsertPt); + + Value *OneElt = Builder.CreateExtractElement(Src, Builder.getInt32(Idx), + "Elt" + Twine(Idx)); + Value *Ptr = Builder.CreateExtractElement(Ptrs, Builder.getInt32(Idx), + "Ptr" + Twine(Idx)); + Builder.CreateAlignedStore(OneElt, Ptr, AlignVal); + + // Create "else" block, fill it in the next iteration + BasicBlock *NewIfBlock = CondBlock->splitBasicBlock(InsertPt, "else"); + Builder.SetInsertPoint(InsertPt); + Instruction *OldBr = IfBlock->getTerminator(); + BranchInst::Create(CondBlock, NewIfBlock, Cmp, OldBr); + OldBr->eraseFromParent(); + IfBlock = NewIfBlock; + } + CI->eraseFromParent(); +} + +bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) { + if (skipFunction(F)) + return false; + + bool EverMadeChange = false; + + TTI = &getAnalysis().getTTI(F); + + bool MadeChange = true; + while (MadeChange) { + MadeChange = false; + for (Function::iterator I = F.begin(); I != F.end();) { + BasicBlock *BB = &*I++; + bool ModifiedDTOnIteration = false; + MadeChange |= optimizeBlock(*BB, ModifiedDTOnIteration); + + // Restart BB iteration if the dominator tree of the Function was changed + if (ModifiedDTOnIteration) + break; + } + + EverMadeChange |= MadeChange; + } + + return EverMadeChange; +} + +bool ScalarizeMaskedMemIntrin::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) { + bool MadeChange = false; + + BasicBlock::iterator CurInstIterator = BB.begin(); + while (CurInstIterator != BB.end()) { + if (CallInst *CI = dyn_cast(&*CurInstIterator++)) + MadeChange |= optimizeCallInst(CI, ModifiedDT); + if (ModifiedDT) + return true; + } + + return MadeChange; +} + +bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI, + bool &ModifiedDT) { + + IntrinsicInst *II = dyn_cast(CI); + if (II) { + switch (II->getIntrinsicID()) { + default: + break; + case Intrinsic::masked_load: { + // Scalarize unsupported vector masked load + if (!TTI->isLegalMaskedLoad(CI->getType())) { + scalarizeMaskedLoad(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_store: { + if (!TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType())) { + scalarizeMaskedStore(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_gather: { + if (!TTI->isLegalMaskedGather(CI->getType())) { + 
scalarizeMaskedGather(CI); + ModifiedDT = true; + return true; + } + return false; + } + case Intrinsic::masked_scatter: { + if (!TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType())) { + scalarizeMaskedScatter(CI); + ModifiedDT = true; + return true; + } + return false; + } + } + } + + return false; +} diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index c77046fdfaf5..caf5cb497a71 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -114,7 +114,7 @@ namespace { SmallPtrSet CombinedNodes; // AA - Used for DAG load/store alias analysis. - AliasAnalysis &AA; + AliasAnalysis *AA; /// When an instruction is simplified, add all users of the instruction to /// the work lists because they might get more simplified now. @@ -496,9 +496,9 @@ namespace { SDValue distributeTruncateThroughAnd(SDNode *N); public: - DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL) + DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL) : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes), - OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(A) { + OptLevel(OL), LegalOperations(false), LegalTypes(false), AA(AA) { ForCodeSize = DAG.getMachineFunction().getFunction()->optForSize(); MaximumLegalStoreInBits = 0; @@ -1729,10 +1729,9 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { NumLeftToConsider--; } - SDValue Result; - // If we've changed things around then replace token factor. if (Changed) { + SDValue Result; if (Ops.empty()) { // The entry token is the only possible outcome. Result = DAG.getEntryNode(); @@ -1749,13 +1748,9 @@ SDValue DAGCombiner::visitTokenFactor(SDNode *N) { Result = DAG.getNode(ISD::TokenFactor, SDLoc(N), MVT::Other, Ops); } } - - // Add users to worklist, since we may introduce a lot of new - // chained token factors while removing memory deps. - return CombineTo(N, Result, true /*add to worklist*/); + return Result; } - - return Result; + return SDValue(); } /// MERGE_VALUES can always be eliminated. 
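The ADDCARRY hunk that follows hoists the SDLoc and applies two rewrites: constants are canonicalized to the right-hand side, and a carry-in of false turns ADDCARRY into UADDO. A small scalar model of why the second rewrite is sound (an illustrative sketch at an arbitrary 32-bit width, not DAG code):

#include <cstdint>
#include <utility>

// Returns {sum, carry-out}. For unsigned wrap-around addition, carry-out
// is exactly the overflow condition Sum < X.
static std::pair<uint32_t, bool> uaddo(uint32_t X, uint32_t Y) {
  uint32_t Sum = X + Y; // wraps modulo 2^32
  return {Sum, Sum < X};
}

// Add-with-carry chains two overflowing adds; at most one of the two can
// actually carry, so OR-ing the flags is exact.
static std::pair<uint32_t, bool> addcarry(uint32_t X, uint32_t Y, bool CIn) {
  auto [S1, C1] = uaddo(X, Y);
  auto [S2, C2] = uaddo(S1, CIn ? 1u : 0u);
  return {S2, C1 || C2};
}
// With CIn == false the second add is the identity, so
// addcarry(X, Y, false) == uaddo(X, Y) for all X and Y.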
@@ -2131,17 +2126,17 @@ SDValue DAGCombiner::visitADDCARRY(SDNode *N) { SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); SDValue CarryIn = N->getOperand(2); + SDLoc DL(N); // canonicalize constant to RHS ConstantSDNode *N0C = dyn_cast(N0); ConstantSDNode *N1C = dyn_cast(N1); if (N0C && !N1C) - return DAG.getNode(ISD::ADDCARRY, SDLoc(N), N->getVTList(), - N1, N0, CarryIn); + return DAG.getNode(ISD::ADDCARRY, DL, N->getVTList(), N1, N0, CarryIn); // fold (addcarry x, y, false) -> (uaddo x, y) if (isNullConstant(CarryIn)) - return DAG.getNode(ISD::UADDO, SDLoc(N), N->getVTList(), N0, N1); + return DAG.getNode(ISD::UADDO, DL, N->getVTList(), N0, N1); if (SDValue Combined = visitADDCARRYLike(N0, N1, CarryIn, N)) return Combined; @@ -5313,17 +5308,6 @@ SDValue DAGCombiner::visitSHL(SDNode *N) { } } - // If the target supports masking y in (shl, y), - // fold (shl x, (and y, ((1 << numbits(x)) - 1))) -> (shl x, y) - if (TLI.isOperationLegal(ISD::SHL, VT) && - TLI.supportsModuloShift(ISD::SHL, VT) && N1->getOpcode() == ISD::AND) { - if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) { - if (Mask->getZExtValue() == OpSizeInBits - 1) { - return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, N1->getOperand(0)); - } - } - } - ConstantSDNode *N1C = isConstOrConstSplat(N1); // fold (shl c1, c2) -> c1<isOpaque()) return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C); // fold (shl 0, x) -> 0 - if (isNullConstant(N0)) + if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (shl x, c >= size(x)) -> undef if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) @@ -5522,18 +5506,9 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); - // If the target supports masking y in (sra, y), - // fold (sra x, (and y, ((1 << numbits(x)) - 1))) -> (sra x, y) - if (TLI.isOperationLegal(ISD::SRA, VT) && - TLI.supportsModuloShift(ISD::SRA, VT) && N1->getOpcode() == ISD::AND) { - if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) { - if (Mask->getZExtValue() == OpSizeInBits - 1) { - return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0, N1->getOperand(0)); - } - } - } - // Arithmetic shifting an all-sign-bit value is a no-op. 
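The comment kept above ("arithmetic shifting an all-sign-bit value is a no-op") is what the ComputeNumSignBits(N0) == OpSizeInBits early-out implements, and after this change it also subsumes the explicit sra 0 and sra -1 folds moved up beside it. A tiny self-check of the identity (illustrative sketch; it assumes the usual arithmetic right shift of signed integers that mainstream compilers provide):

#include <cassert>
#include <cstdint>

// A value whose bits all equal its sign bit is 0 or -1, and an arithmetic
// right shift by any in-range amount reproduces it unchanged.
static void checkSraIsNoOp(int64_t X) {
  assert((X == 0 || X == -1) && "expected an all-sign-bits value");
  for (unsigned C = 0; C < 64; ++C)
    assert((X >> C) == X && "sra must act as the identity here");
}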
+ // fold (sra 0, x) -> 0 + // fold (sra -1, x) -> -1 if (DAG.ComputeNumSignBits(N0) == OpSizeInBits) return N0; @@ -5548,12 +5523,6 @@ SDValue DAGCombiner::visitSRA(SDNode *N) { ConstantSDNode *N0C = getAsNonOpaqueConstant(N0); if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C); - // fold (sra 0, x) -> 0 - if (isNullConstant(N0)) - return N0; - // fold (sra -1, x) -> -1 - if (isAllOnesConstant(N0)) - return N0; // fold (sra x, c >= size(x)) -> undef if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) return DAG.getUNDEF(VT); @@ -5691,17 +5660,6 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { EVT VT = N0.getValueType(); unsigned OpSizeInBits = VT.getScalarSizeInBits(); - // If the target supports masking y in (srl, y), - // fold (srl x, (and y, ((1 << numbits(x)) - 1))) -> (srl x, y) - if (TLI.isOperationLegal(ISD::SRL, VT) && - TLI.supportsModuloShift(ISD::SRL, VT) && N1->getOpcode() == ISD::AND) { - if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) { - if (Mask->getZExtValue() == OpSizeInBits - 1) { - return DAG.getNode(ISD::SRL, SDLoc(N), VT, N0, N1->getOperand(0)); - } - } - } - // fold vector ops if (VT.isVector()) if (SDValue FoldedVOp = SimplifyVBinOp(N)) @@ -5714,7 +5672,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) { if (N0C && N1C && !N1C->isOpaque()) return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C); // fold (srl 0, x) -> 0 - if (isNullConstant(N0)) + if (isNullConstantOrNullSplatConstant(N0)) return N0; // fold (srl x, c >= size(x)) -> undef if (N1C && N1C->getAPIntValue().uge(OpSizeInBits)) @@ -7365,14 +7323,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { N0.getValueSizeInBits(), std::min(Op.getValueSizeInBits(), VT.getSizeInBits())); - if (TruncatedBits.isSubsetOf(Known.Zero)) { - if (VT.bitsGT(Op.getValueType())) - return DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), VT, Op); - if (VT.bitsLT(Op.getValueType())) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); - - return Op; - } + if (TruncatedBits.isSubsetOf(Known.Zero)) + return DAG.getZExtOrTrunc(Op, SDLoc(N), VT); } // fold (zext (truncate (load x))) -> (zext (smaller load x)) @@ -7419,14 +7371,8 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { } if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) { - SDValue Op = N0.getOperand(0); - if (SrcVT.bitsLT(VT)) { - Op = DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, Op); - AddToWorklist(Op.getNode()); - } else if (SrcVT.bitsGT(VT)) { - Op = DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Op); - AddToWorklist(Op.getNode()); - } + SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); + AddToWorklist(Op.getNode()); return DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType()); } } @@ -7440,11 +7386,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) { N0.getValueType()) || !TLI.isZExtFree(N0.getValueType(), VT))) { SDValue X = N0.getOperand(0).getOperand(0); - if (X.getValueType().bitsLT(VT)) { - X = DAG.getNode(ISD::ANY_EXTEND, SDLoc(X), VT, X); - } else if (X.getValueType().bitsGT(VT)) { - X = DAG.getNode(ISD::TRUNCATE, SDLoc(X), VT, X); - } + X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); SDLoc DL(N); @@ -7669,14 +7611,8 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { } // fold (aext (truncate x)) - if (N0.getOpcode() == ISD::TRUNCATE) { - SDValue TruncOp = N0.getOperand(0); - if (TruncOp.getValueType() == VT) - return TruncOp; // x iff x size == zext size. 
- if (TruncOp.getValueType().bitsGT(VT)) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, TruncOp); - return DAG.getNode(ISD::ANY_EXTEND, SDLoc(N), VT, TruncOp); - } + if (N0.getOpcode() == ISD::TRUNCATE) + return DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT); // Fold (aext (and (trunc x), cst)) -> (and x, cst) // if the trunc is not free. @@ -7687,11 +7623,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) { N0.getValueType())) { SDLoc DL(N); SDValue X = N0.getOperand(0).getOperand(0); - if (X.getValueType().bitsLT(VT)) { - X = DAG.getNode(ISD::ANY_EXTEND, DL, VT, X); - } else if (X.getValueType().bitsGT(VT)) { - X = DAG.getNode(ISD::TRUNCATE, DL, VT, X); - } + X = DAG.getAnyExtOrTrunc(X, DL, VT); APInt Mask = cast(N0.getOperand(1))->getAPIntValue(); Mask = Mask.zext(VT.getSizeInBits()); return DAG.getNode(ISD::AND, DL, VT, @@ -14868,6 +14800,55 @@ SDValue combineTruncationShuffle(ShuffleVectorSDNode *SVN, SelectionDAG &DAG) { return SDValue(); } +// Combine shuffles of splat-shuffles of the form: +// shuffle (shuffle V, undef, splat-mask), undef, M +// If splat-mask contains undef elements, we need to be careful about +// introducing undef's in the folded mask which are not the result of composing +// the masks of the shuffles. +static SDValue combineShuffleOfSplat(ArrayRef UserMask, + ShuffleVectorSDNode *Splat, + SelectionDAG &DAG) { + ArrayRef SplatMask = Splat->getMask(); + assert(UserMask.size() == SplatMask.size() && "Mask length mismatch"); + + // Prefer simplifying to the splat-shuffle, if possible. This is legal if + // every undef mask element in the splat-shuffle has a corresponding undef + // element in the user-shuffle's mask or if the composition of mask elements + // would result in undef. + // Examples for (shuffle (shuffle v, undef, SplatMask), undef, UserMask): + // * UserMask=[0,2,u,u], SplatMask=[2,u,2,u] -> [2,2,u,u] + // In this case it is not legal to simplify to the splat-shuffle because we + // may be exposing the users of the shuffle an undef element at index 1 + // which was not there before the combine. + // * UserMask=[0,u,2,u], SplatMask=[2,u,2,u] -> [2,u,2,u] + // In this case the composition of masks yields SplatMask, so it's ok to + // simplify to the splat-shuffle. + // * UserMask=[3,u,2,u], SplatMask=[2,u,2,u] -> [u,u,2,u] + // In this case the composed mask includes all undef elements of SplatMask + // and in addition sets element zero to undef. It is safe to simplify to + // the splat-shuffle. + auto CanSimplifyToExistingSplat = [](ArrayRef UserMask, + ArrayRef SplatMask) { + for (unsigned i = 0, e = UserMask.size(); i != e; ++i) + if (UserMask[i] != -1 && SplatMask[i] == -1 && + SplatMask[UserMask[i]] != -1) + return false; + return true; + }; + if (CanSimplifyToExistingSplat(UserMask, SplatMask)) + return SDValue(Splat, 0); + + // Create a new shuffle with a mask that is composed of the two shuffles' + // masks. + SmallVector NewMask; + for (int Idx : UserMask) + NewMask.push_back(Idx == -1 ? -1 : SplatMask[Idx]); + + return DAG.getVectorShuffle(Splat->getValueType(0), SDLoc(Splat), + Splat->getOperand(0), Splat->getOperand(1), + NewMask); +} + SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { EVT VT = N->getValueType(0); unsigned NumElts = VT.getVectorNumElements(); @@ -14914,6 +14895,11 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) { return DAG.getVectorShuffle(VT, SDLoc(N), N0, N1, NewMask); } + // A shuffle of a single vector that is a splat can always be folded. 
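The heart of combineShuffleOfSplat above is mask composition, which the fold below wires into visitVECTOR_SHUFFLE. The step is easy to see in isolation; here is a standalone sketch (illustrative only; -1 stands for an undef mask element, as in ShuffleVectorSDNode):

#include <vector>

// Element I of (shuffle (shuffle V, undef, SplatMask), undef, UserMask)
// reads lane SplatMask[UserMask[I]] of V; an undef index anywhere in the
// chain makes the result lane undef.
static std::vector<int> composeShuffleMasks(const std::vector<int> &UserMask,
                                            const std::vector<int> &SplatMask) {
  std::vector<int> NewMask;
  NewMask.reserve(UserMask.size());
  for (int Idx : UserMask)
    NewMask.push_back(Idx < 0 ? -1 : SplatMask[Idx]);
  return NewMask;
}
// E.g. UserMask = {0, 2, -1, -1} over SplatMask = {2, -1, 2, -1} composes
// to {2, 2, -1, -1}, matching the first example in the comment above.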
+ if (auto *N0Shuf = dyn_cast(N0)) + if (N1->isUndef() && N0Shuf->isSplat()) + return combineShuffleOfSplat(SVN->getMask(), N0Shuf, DAG); + // If it is a splat, check if the argument vector is another splat or a // build_vector. if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) { @@ -16381,17 +16367,17 @@ bool DAGCombiner::isAlias(LSBaseSDNode *Op0, LSBaseSDNode *Op1) const { UseAA = false; #endif - if (UseAA && + if (UseAA && AA && Op0->getMemOperand()->getValue() && Op1->getMemOperand()->getValue()) { // Use alias analysis information. int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1); int64_t Overlap0 = NumBytes0 + SrcValOffset0 - MinOffset; int64_t Overlap1 = NumBytes1 + SrcValOffset1 - MinOffset; AliasResult AAResult = - AA.alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0, - UseTBAA ? Op0->getAAInfo() : AAMDNodes()), - MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1, - UseTBAA ? Op1->getAAInfo() : AAMDNodes())); + AA->alias(MemoryLocation(Op0->getMemOperand()->getValue(), Overlap0, + UseTBAA ? Op0->getAAInfo() : AAMDNodes()), + MemoryLocation(Op1->getMemOperand()->getValue(), Overlap1, + UseTBAA ? Op1->getAAInfo() : AAMDNodes()) ); if (AAResult == NoAlias) return false; } @@ -16605,7 +16591,7 @@ bool DAGCombiner::findBetterNeighborChains(StoreSDNode *St) { } /// This is the entry point for the file. -void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis &AA, +void SelectionDAG::Combine(CombineLevel Level, AliasAnalysis *AA, CodeGenOpt::Level OptLevel) { /// This is the main entry point to this class. DAGCombiner(*this, AA, OptLevel).Run(Level); diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index 8c98e3740f6d..5003b79974eb 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -622,7 +622,7 @@ bool FastISel::selectStackmap(const CallInst *I) { // have to worry about calling conventions and target-specific lowering code. // Instead we perform the call lowering right here. // - // CALLSEQ_START(0...) + // CALLSEQ_START(0, 0...) // STACKMAP(id, nbytes, ...) // CALLSEQ_END(0, 0) // @@ -1150,16 +1150,16 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) { return true; } - unsigned Offset = 0; + // Byval arguments with frame indices were already handled after argument + // lowering and before isel. + const auto *Arg = + dyn_cast(Address->stripInBoundsConstantOffsets()); + if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX) + return true; + Optional Op; - if (const auto *Arg = dyn_cast(Address)) - // Some arguments' frame index is recorded during argument lowering. - Offset = FuncInfo.getArgumentFrameIndex(Arg); - if (Offset) - Op = MachineOperand::CreateFI(Offset); - if (!Op) - if (unsigned Reg = lookUpRegForValue(Address)) - Op = MachineOperand::CreateReg(Reg, false); + if (unsigned Reg = lookUpRegForValue(Address)) + Op = MachineOperand::CreateReg(Reg, false); // If we have a VLA that has a "use" in a metadata node that's then used // here but it has no other uses, then we have a problem. 
E.g., diff --git a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp index cdf4d3a8b4e5..606b8952f3c1 100644 --- a/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp +++ b/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp @@ -85,7 +85,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, MF = &mf; TLI = MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); - MachineModuleInfo &MMI = MF->getMMI(); const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering(); unsigned StackAlign = TFI->getStackAlignment(); @@ -214,33 +213,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf, if (!isa(I) || !StaticAllocaMap.count(cast(&I))) InitializeRegForValue(&I); - // Collect llvm.dbg.declare information. This is done now instead of - // during the initial isel pass through the IR so that it is done - // in a predictable order. - if (const DbgDeclareInst *DI = dyn_cast(&I)) { - assert(DI->getVariable() && "Missing variable"); - assert(DI->getDebugLoc() && "Missing location"); - if (MMI.hasDebugInfo()) { - // Don't handle byval struct arguments or VLAs, for example. - // Non-byval arguments are handled here (they refer to the stack - // temporary alloca at this point). - const Value *Address = DI->getAddress(); - if (Address) { - if (const BitCastInst *BCI = dyn_cast(Address)) - Address = BCI->getOperand(0); - if (const AllocaInst *AI = dyn_cast(Address)) { - DenseMap::iterator SI = - StaticAllocaMap.find(AI); - if (SI != StaticAllocaMap.end()) { // Check for VLAs. - int FI = SI->second; - MF->setVariableDbgInfo(DI->getVariable(), DI->getExpression(), - FI, DI->getDebugLoc()); - } - } - } - } - } - // Decide the preferred extend type for a value. PreferredExtendType[&I] = getPreferredExtendForValue(&I); } @@ -510,12 +482,11 @@ void FunctionLoweringInfo::setArgumentFrameIndex(const Argument *A, /// If the argument does not have any assigned frame index then 0 is /// returned. int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) { - DenseMap::iterator I = - ByValArgFrameIndexMap.find(A); + auto I = ByValArgFrameIndexMap.find(A); if (I != ByValArgFrameIndexMap.end()) return I->second; DEBUG(dbgs() << "Argument does not have assigned frame index!\n"); - return 0; + return INT_MAX; } unsigned FunctionLoweringInfo::getCatchPadExceptionPointerVReg( diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 2654b3ad7a62..9a47a914df91 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -1493,7 +1493,7 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node, // Chain the dynamic stack allocation so that it doesn't modify the stack // pointer when other instructions are using the stack. 
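A note on the FunctionLoweringInfo change above: frame indices are signed, and fixed stack objects legitimately get zero or negative indices, so 0 cannot double as a "not found" marker; the code now returns INT_MAX for a miss, and the FastISel caller compares against it. A minimal sketch of the lookup contract (hypothetical map type and names, for illustration only):

#include <climits>
#include <map>

// Returns the recorded frame index for Key, or INT_MAX when none was
// recorded; 0 and negative values stay available as real frame indices.
static int lookupFrameIndex(const std::map<const void *, int> &FrameIndexMap,
                            const void *Key) {
  auto It = FrameIndexMap.find(Key);
  return It != FrameIndexMap.end() ? It->second : INT_MAX;
}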
- Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl); SDValue Size = Tmp2.getOperand(1); SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT); @@ -4187,6 +4187,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { ReplacedNode(Node); break; } + case ISD::MUL: case ISD::SDIV: case ISD::SREM: case ISD::UDIV: diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/lib/CodeGen/SelectionDAG/LegalizeTypes.h index cde4331cc42d..4c3b514856b7 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -675,6 +675,7 @@ private: // Vector Operand Splitting: <128 x ty> -> 2 x <64 x ty>. bool SplitVectorOperand(SDNode *N, unsigned OpNo); SDValue SplitVecOp_VSELECT(SDNode *N, unsigned OpNo); + SDValue SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo); SDValue SplitVecOp_UnaryOp(SDNode *N); SDValue SplitVecOp_TruncateHelper(SDNode *N); diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index 97a7fab6efd0..ff0e609803d8 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -1513,6 +1513,22 @@ bool DAGTypeLegalizer::SplitVectorOperand(SDNode *N, unsigned OpNo) { case ISD::ZERO_EXTEND_VECTOR_INREG: Res = SplitVecOp_ExtVecInRegOp(N); break; + + case ISD::VECREDUCE_FADD: + case ISD::VECREDUCE_FMUL: + case ISD::VECREDUCE_ADD: + case ISD::VECREDUCE_MUL: + case ISD::VECREDUCE_AND: + case ISD::VECREDUCE_OR: + case ISD::VECREDUCE_XOR: + case ISD::VECREDUCE_SMAX: + case ISD::VECREDUCE_SMIN: + case ISD::VECREDUCE_UMAX: + case ISD::VECREDUCE_UMIN: + case ISD::VECREDUCE_FMAX: + case ISD::VECREDUCE_FMIN: + Res = SplitVecOp_VECREDUCE(N, OpNo); + break; } } @@ -1565,6 +1581,48 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSELECT(SDNode *N, unsigned OpNo) { return DAG.getNode(ISD::CONCAT_VECTORS, DL, Src0VT, LoSelect, HiSelect); } +SDValue DAGTypeLegalizer::SplitVecOp_VECREDUCE(SDNode *N, unsigned OpNo) { + EVT ResVT = N->getValueType(0); + SDValue Lo, Hi; + SDLoc dl(N); + + SDValue VecOp = N->getOperand(OpNo); + EVT VecVT = VecOp.getValueType(); + assert(VecVT.isVector() && "Can only split reduce vector operand"); + GetSplitVector(VecOp, Lo, Hi); + EVT LoOpVT, HiOpVT; + std::tie(LoOpVT, HiOpVT) = DAG.GetSplitDestVTs(VecVT); + + bool NoNaN = N->getFlags().hasNoNaNs(); + unsigned CombineOpc = 0; + switch (N->getOpcode()) { + case ISD::VECREDUCE_FADD: CombineOpc = ISD::FADD; break; + case ISD::VECREDUCE_FMUL: CombineOpc = ISD::FMUL; break; + case ISD::VECREDUCE_ADD: CombineOpc = ISD::ADD; break; + case ISD::VECREDUCE_MUL: CombineOpc = ISD::MUL; break; + case ISD::VECREDUCE_AND: CombineOpc = ISD::AND; break; + case ISD::VECREDUCE_OR: CombineOpc = ISD::OR; break; + case ISD::VECREDUCE_XOR: CombineOpc = ISD::XOR; break; + case ISD::VECREDUCE_SMAX: CombineOpc = ISD::SMAX; break; + case ISD::VECREDUCE_SMIN: CombineOpc = ISD::SMIN; break; + case ISD::VECREDUCE_UMAX: CombineOpc = ISD::UMAX; break; + case ISD::VECREDUCE_UMIN: CombineOpc = ISD::UMIN; break; + case ISD::VECREDUCE_FMAX: + CombineOpc = NoNaN ? ISD::FMAXNUM : ISD::FMAXNAN; + break; + case ISD::VECREDUCE_FMIN: + CombineOpc = NoNaN ? ISD::FMINNUM : ISD::FMINNAN; + break; + default: + llvm_unreachable("Unexpected reduce ISD node"); + } + + // Use the appropriate scalar instruction on the split subvectors before + // reducing the now partially reduced smaller vector. 
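Concretely, splitting VECREDUCE_ADD over an illegal <8 x i32> becomes an element-wise ADD of the two <4 x i32> halves followed by a reduction of the partial vector, which is what the code below emits. A scalar model of that shape (illustrative sketch; it relies on the combine operation being associative and commutative):

#include <array>
#include <numeric>

// Model of SplitVecOp_VECREDUCE for VECREDUCE_ADD on a split <8 x i32>.
static int reduceAddAfterSplit(const std::array<int, 8> &V) {
  std::array<int, 4> Partial;
  for (unsigned I = 0; I != 4; ++I)
    Partial[I] = V[I] + V[I + 4]; // CombineOpc applied to the Lo/Hi halves
  // The remaining VECREDUCE on the now half-width vector.
  return std::accumulate(Partial.begin(), Partial.end(), 0);
}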
+ SDValue Partial = DAG.getNode(CombineOpc, dl, LoOpVT, Lo, Hi); + return DAG.getNode(N->getOpcode(), dl, ResVT, Partial); +} + SDValue DAGTypeLegalizer::SplitVecOp_UnaryOp(SDNode *N) { // The result has a legal vector type, but the input needs splitting. EVT ResVT = N->getValueType(0); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index d605a1dc1c20..057badcd6b74 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -2217,10 +2217,10 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, // Also compute a conservative estimate for high known-0 bits. // More trickiness is possible, but this is sufficient for the // interesting case of alignment computation. - unsigned TrailZ = Known.Zero.countTrailingOnes() + - Known2.Zero.countTrailingOnes(); - unsigned LeadZ = std::max(Known.Zero.countLeadingOnes() + - Known2.Zero.countLeadingOnes(), + unsigned TrailZ = Known.countMinTrailingZeros() + + Known2.countMinTrailingZeros(); + unsigned LeadZ = std::max(Known.countMinLeadingZeros() + + Known2.countMinLeadingZeros(), BitWidth) - BitWidth; Known.resetAll(); @@ -2233,13 +2233,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, // treat a udiv as a logical right shift by the power of 2 known to // be less than the denominator. computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); - unsigned LeadZ = Known2.Zero.countLeadingOnes(); + unsigned LeadZ = Known2.countMinLeadingZeros(); computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); - unsigned RHSUnknownLeadingOnes = Known2.One.countLeadingZeros(); - if (RHSUnknownLeadingOnes != BitWidth) - LeadZ = std::min(BitWidth, - LeadZ + BitWidth - RHSUnknownLeadingOnes - 1); + unsigned RHSMaxLeadingZeros = Known2.countMaxLeadingZeros(); + if (RHSMaxLeadingZeros != BitWidth) + LeadZ = std::min(BitWidth, LeadZ + BitWidth - RHSMaxLeadingZeros - 1); Known.Zero.setHighBits(LeadZ); break; @@ -2359,7 +2358,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, case ISD::CTTZ_ZERO_UNDEF: { computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // If we have a known 1, its position is our upper bound. - unsigned PossibleTZ = Known2.One.countTrailingZeros(); + unsigned PossibleTZ = Known2.countMaxTrailingZeros(); unsigned LowBits = Log2_32(PossibleTZ) + 1; Known.Zero.setBitsFrom(LowBits); break; @@ -2368,7 +2367,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, case ISD::CTLZ_ZERO_UNDEF: { computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // If we have a known 1, its position is our upper bound. - unsigned PossibleLZ = Known2.One.countLeadingZeros(); + unsigned PossibleLZ = Known2.countMaxLeadingZeros(); unsigned LowBits = Log2_32(PossibleLZ) + 1; Known.Zero.setBitsFrom(LowBits); break; @@ -2376,7 +2375,7 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, case ISD::CTPOP: { computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); // If we know some of the bits are zero, they can't be one. - unsigned PossibleOnes = BitWidth - Known2.Zero.countPopulation(); + unsigned PossibleOnes = Known2.countMaxPopulation(); Known.Zero.setBitsFrom(Log2_32(PossibleOnes) + 1); break; } @@ -2493,13 +2492,12 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, // going to be 0 in the result. Both addition and complement operations // preserve the low zero bits. 
computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); - unsigned KnownZeroLow = Known2.Zero.countTrailingOnes(); + unsigned KnownZeroLow = Known2.countMinTrailingZeros(); if (KnownZeroLow == 0) break; computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); - KnownZeroLow = std::min(KnownZeroLow, - Known2.Zero.countTrailingOnes()); + KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros()); Known.Zero.setLowBits(KnownZeroLow); break; } @@ -2526,15 +2524,13 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, // and the other has the top 8 bits clear, we know the top 7 bits of the // output must be clear. computeKnownBits(Op.getOperand(0), Known2, DemandedElts, Depth + 1); - unsigned KnownZeroHigh = Known2.Zero.countLeadingOnes(); - unsigned KnownZeroLow = Known2.Zero.countTrailingOnes(); + unsigned KnownZeroHigh = Known2.countMinLeadingZeros(); + unsigned KnownZeroLow = Known2.countMinTrailingZeros(); computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); - KnownZeroHigh = std::min(KnownZeroHigh, - Known2.Zero.countLeadingOnes()); - KnownZeroLow = std::min(KnownZeroLow, - Known2.Zero.countTrailingOnes()); + KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros()); + KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros()); if (Opcode == ISD::ADDE || Opcode == ISD::ADDCARRY) { // With ADDE and ADDCARRY, a carry bit may be added in, so we can only @@ -2594,8 +2590,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, computeKnownBits(Op.getOperand(0), Known, DemandedElts, Depth + 1); computeKnownBits(Op.getOperand(1), Known2, DemandedElts, Depth + 1); - uint32_t Leaders = std::max(Known.Zero.countLeadingOnes(), - Known2.Zero.countLeadingOnes()); + uint32_t Leaders = + std::max(Known.countMinLeadingZeros(), Known2.countMinLeadingZeros()); Known.resetAll(); Known.Zero.setHighBits(Leaders); break; @@ -2711,8 +2707,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, // UMIN - we know that the result will have the maximum of the // known zero leading bits of the inputs. - unsigned LeadZero = Known.Zero.countLeadingOnes(); - LeadZero = std::max(LeadZero, Known2.Zero.countLeadingOnes()); + unsigned LeadZero = Known.countMinLeadingZeros(); + LeadZero = std::max(LeadZero, Known2.countMinLeadingZeros()); Known.Zero &= Known2.Zero; Known.One &= Known2.One; @@ -2726,8 +2722,8 @@ void SelectionDAG::computeKnownBits(SDValue Op, KnownBits &Known, // UMAX - we know that the result will have the maximum of the // known one leading bits of the inputs. - unsigned LeadOne = Known.One.countLeadingOnes(); - LeadOne = std::max(LeadOne, Known2.One.countLeadingOnes()); + unsigned LeadOne = Known.countMinLeadingOnes(); + LeadOne = std::max(LeadOne, Known2.countMinLeadingOnes()); Known.Zero &= Known2.Zero; Known.One &= Known2.One; @@ -2843,8 +2839,7 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const { // Fall back to computeKnownBits to catch other known cases. 
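These hunks swap hand-rolled countLeadingOnes/countTrailingOnes arithmetic on Known.Zero and Known.One for the newer KnownBits counting helpers, and the power-of-two test just below reads naturally once the helpers are spelled out. A simplified 64-bit model (sketch; __builtin_popcountll assumes GCC or Clang):

#include <cstdint>

// Zero and One are the proven-zero and proven-one masks of a value:
// popcount(One) bits are forced to 1, and 64 - popcount(Zero) is the most
// that could possibly be 1. min == max == 1 pins down exactly one set bit,
// i.e. a power of two.
struct KnownBits64 {
  uint64_t Zero = 0, One = 0;
  unsigned countMinPopulation() const { return __builtin_popcountll(One); }
  unsigned countMaxPopulation() const {
    return 64 - __builtin_popcountll(Zero);
  }
  bool provablyPowerOfTwo() const {
    return countMaxPopulation() == 1 && countMinPopulation() == 1;
  }
};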
KnownBits Known; computeKnownBits(Val, Known); - return (Known.Zero.countPopulation() == BitWidth - 1) && - (Known.One.countPopulation() == 1); + return (Known.countMaxPopulation() == 1) && (Known.countMinPopulation() == 1); } unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const { @@ -2860,6 +2855,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, EVT VT = Op.getValueType(); assert(VT.isInteger() && "Invalid VT!"); unsigned VTBits = VT.getScalarSizeInBits(); + unsigned NumElts = DemandedElts.getBitWidth(); unsigned Tmp, Tmp2; unsigned FirstAnswer = 1; @@ -2903,6 +2899,39 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, } return Tmp; + case ISD::VECTOR_SHUFFLE: { + // Collect the minimum number of sign bits that are shared by every vector + // element referenced by the shuffle. + APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0); + const ShuffleVectorSDNode *SVN = cast(Op); + assert(NumElts == SVN->getMask().size() && "Unexpected vector size"); + for (unsigned i = 0; i != NumElts; ++i) { + int M = SVN->getMaskElt(i); + if (!DemandedElts[i]) + continue; + // For UNDEF elements, we don't know anything about the common state of + // the shuffle result. + if (M < 0) + return 1; + if ((unsigned)M < NumElts) + DemandedLHS.setBit((unsigned)M % NumElts); + else + DemandedRHS.setBit((unsigned)M % NumElts); + } + Tmp = UINT_MAX; + if (!!DemandedLHS) + Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedLHS, Depth + 1); + if (!!DemandedRHS) { + Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedRHS, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + // If we don't know anything, early out and try computeKnownBits fall-back. + if (Tmp == 1) + break; + assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); + return Tmp; + } + case ISD::SIGN_EXTEND: case ISD::SIGN_EXTEND_VECTOR_INREG: Tmp = VTBits - Op.getOperand(0).getScalarValueSizeInBits(); @@ -3142,14 +3171,36 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts, return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1); } - case ISD::EXTRACT_SUBVECTOR: - return ComputeNumSignBits(Op.getOperand(0), Depth + 1); + case ISD::EXTRACT_SUBVECTOR: { + // If we know the element index, just demand that subvector elements, + // otherwise demand them all. + SDValue Src = Op.getOperand(0); + ConstantSDNode *SubIdx = dyn_cast(Op.getOperand(1)); + unsigned NumSrcElts = Src.getValueType().getVectorNumElements(); + if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) { + // Offset the demanded elts by the subvector index. + uint64_t Idx = SubIdx->getZExtValue(); + APInt DemandedSrc = DemandedElts.zext(NumSrcElts).shl(Idx); + return ComputeNumSignBits(Src, DemandedSrc, Depth + 1); + } + return ComputeNumSignBits(Src, Depth + 1); + } case ISD::CONCAT_VECTORS: - // Determine the minimum number of sign bits across all input vectors. - // Early out if the result is already 1. - Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1); - for (unsigned i = 1, e = Op.getNumOperands(); (i < e) && (Tmp > 1); ++i) - Tmp = std::min(Tmp, ComputeNumSignBits(Op.getOperand(i), Depth + 1)); + // Determine the minimum number of sign bits across all demanded + // elts of the input vectors. Early out if the result is already 1. 
+ Tmp = UINT_MAX; + EVT SubVectorVT = Op.getOperand(0).getValueType(); + unsigned NumSubVectorElts = SubVectorVT.getVectorNumElements(); + unsigned NumSubVectors = Op.getNumOperands(); + for (unsigned i = 0; (i < NumSubVectors) && (Tmp > 1); ++i) { + APInt DemandedSub = DemandedElts.lshr(i * NumSubVectorElts); + DemandedSub = DemandedSub.trunc(NumSubVectorElts); + if (!DemandedSub) + continue; + Tmp2 = ComputeNumSignBits(Op.getOperand(i), DemandedSub, Depth + 1); + Tmp = std::min(Tmp, Tmp2); + } + assert(Tmp <= VTBits && "Failed to determine minimum sign bits"); return Tmp; } @@ -3543,7 +3594,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Operand.getValueType().bitsLT(VT) && "Invalid sext node, dst < src!"); if (OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ZERO_EXTEND) - return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) // sext(undef) = 0, because the top bits will all be the same. return getConstant(0, DL, VT); @@ -3559,8 +3610,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Operand.getValueType().bitsLT(VT) && "Invalid zext node, dst < src!"); if (OpOpcode == ISD::ZERO_EXTEND) // (zext (zext x)) -> (zext x) - return getNode(ISD::ZERO_EXTEND, DL, VT, - Operand.getNode()->getOperand(0)); + return getNode(ISD::ZERO_EXTEND, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) // zext(undef) = 0, because the top bits will be zero. return getConstant(0, DL, VT); @@ -3579,13 +3629,13 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) // (ext (zext x)) -> (zext x) and (ext (sext x)) -> (sext x) - return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); else if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); // (ext (trunx x)) -> x if (OpOpcode == ISD::TRUNCATE) { - SDValue OpOp = Operand.getNode()->getOperand(0); + SDValue OpOp = Operand.getOperand(0); if (OpOp.getValueType() == VT) return OpOp; } @@ -3601,16 +3651,16 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, assert(Operand.getValueType().bitsGT(VT) && "Invalid truncate node, src < dst!"); if (OpOpcode == ISD::TRUNCATE) - return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0)); if (OpOpcode == ISD::ZERO_EXTEND || OpOpcode == ISD::SIGN_EXTEND || OpOpcode == ISD::ANY_EXTEND) { // If the source is smaller than the dest, we still need an extend. 
- if (Operand.getNode()->getOperand(0).getValueType().getScalarType() + if (Operand.getOperand(0).getValueType().getScalarType() .bitsLT(VT.getScalarType())) - return getNode(OpOpcode, DL, VT, Operand.getNode()->getOperand(0)); - if (Operand.getNode()->getOperand(0).getValueType().bitsGT(VT)) - return getNode(ISD::TRUNCATE, DL, VT, Operand.getNode()->getOperand(0)); - return Operand.getNode()->getOperand(0); + return getNode(OpOpcode, DL, VT, Operand.getOperand(0)); + if (Operand.getOperand(0).getValueType().bitsGT(VT)) + return getNode(ISD::TRUNCATE, DL, VT, Operand.getOperand(0)); + return Operand.getOperand(0); } if (OpOpcode == ISD::UNDEF) return getUNDEF(VT); @@ -3665,15 +3715,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 if (getTarget().Options.UnsafeFPMath && OpOpcode == ISD::FSUB) // FIXME: FNEG has no fast-math-flags to propagate; use the FSUB's flags? - return getNode(ISD::FSUB, DL, VT, Operand.getNode()->getOperand(1), - Operand.getNode()->getOperand(0), - Operand.getNode()->getFlags()); + return getNode(ISD::FSUB, DL, VT, Operand.getOperand(1), + Operand.getOperand(0), Operand.getNode()->getFlags()); if (OpOpcode == ISD::FNEG) // --X -> X - return Operand.getNode()->getOperand(0); + return Operand.getOperand(0); break; case ISD::FABS: if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X) - return getNode(ISD::FABS, DL, VT, Operand.getNode()->getOperand(0)); + return getNode(ISD::FABS, DL, VT, Operand.getOperand(0)); break; } @@ -5970,7 +6019,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT, unsigned NumOps = Ops.size(); switch (NumOps) { case 0: return getNode(Opcode, DL, VT); - case 1: return getNode(Opcode, DL, VT, Ops[0]); + case 1: return getNode(Opcode, DL, VT, Ops[0], Flags); case 2: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Flags); case 3: return getNode(Opcode, DL, VT, Ops[0], Ops[1], Ops[2]); default: break; @@ -7520,9 +7569,8 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const { if (TLI->isGAPlusOffset(Ptr.getNode(), GV, GVOffset)) { unsigned PtrWidth = getDataLayout().getPointerTypeSizeInBits(GV->getType()); KnownBits Known(PtrWidth); - llvm::computeKnownBits(const_cast(GV), Known, - getDataLayout()); - unsigned AlignBits = Known.Zero.countTrailingOnes(); + llvm::computeKnownBits(GV, Known, getDataLayout()); + unsigned AlignBits = Known.countMinTrailingZeros(); unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0; if (Align) return MinAlign(Align, GVOffset); @@ -7621,7 +7669,7 @@ bool BuildVectorSDNode::isConstantSplat(APInt &SplatValue, APInt &SplatUndef, return false; // FIXME: The widths are based on this node's type, but build vectors can - // truncate their operands. + // truncate their operands. SplatValue = APInt(VecWidth, 0); SplatUndef = APInt(VecWidth, 0); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 50313e2da884..57d340c41c39 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -661,7 +661,7 @@ SDValue RegsForValue::getCopyFromRegs(SelectionDAG &DAG, unsigned RegSize = RegisterVT.getSizeInBits(); unsigned NumSignBits = LOI->NumSignBits; - unsigned NumZeroBits = LOI->Known.Zero.countLeadingOnes(); + unsigned NumZeroBits = LOI->Known.countMinLeadingZeros(); if (NumZeroBits == RegSize) { // The current value is a zero. 
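The InferPtrAlignment hunk above is a direct consequence of the same machinery: a pointer whose low AlignBits bits are known zero is aligned to at least 1 << AlignBits. A minimal sketch of that conversion (alignFromKnownTrailingZeros is a hypothetical name; the clamp keeps the shift defined and matches the 31-bit cap in the diff):

  #include <algorithm>
  #include <cstdint>

  uint64_t alignFromKnownTrailingZeros(unsigned AlignBits) {
    // Zero known-zero bits means no alignment could be proven.
    return AlignBits ? uint64_t(1) << std::min(31u, AlignBits) : 0;
  }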
@@ -811,9 +811,9 @@ void RegsForValue::AddInlineAsmOperands(unsigned Code, bool HasMatching, } } -void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis &aa, +void SelectionDAGBuilder::init(GCFunctionInfo *gfi, AliasAnalysis *aa, const TargetLibraryInfo *li) { - AA = &aa; + AA = aa; GFI = gfi; LibInfo = li; DL = &DAG.getDataLayout(); @@ -3423,7 +3423,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) { if (isVolatile || NumValues > MaxParallelChains) // Serialize volatile loads with other side effects. Root = getRoot(); - else if (AA->pointsToConstantMemory(MemoryLocation( + else if (AA && AA->pointsToConstantMemory(MemoryLocation( SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. Root = DAG.getEntryNode(); @@ -3535,8 +3535,8 @@ void SelectionDAGBuilder::visitLoadFromSwiftError(const LoadInst &I) { Type *Ty = I.getType(); AAMDNodes AAInfo; I.getAAMetadata(AAInfo); - assert(!AA->pointsToConstantMemory(MemoryLocation( - SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo)) && + assert((!AA || !AA->pointsToConstantMemory(MemoryLocation( + SV, DAG.getDataLayout().getTypeStoreSize(Ty), AAInfo))) && "load_from_swift_error should not be constant memory"); SmallVector ValueVTs; @@ -3817,7 +3817,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) { const MDNode *Ranges = I.getMetadata(LLVMContext::MD_range); // Do not serialize masked loads of constant memory with anything. - bool AddToChain = !AA->pointsToConstantMemory(MemoryLocation( + bool AddToChain = !AA || !AA->pointsToConstantMemory(MemoryLocation( PtrOperand, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo)); SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode(); @@ -3861,7 +3861,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) { bool UniformBase = getUniformBase(BasePtr, Base, Index, this); bool ConstantMemory = false; if (UniformBase && - AA->pointsToConstantMemory(MemoryLocation( + AA && AA->pointsToConstantMemory(MemoryLocation( BasePtr, DAG.getDataLayout().getTypeStoreSize(I.getType()), AAInfo))) { // Do not serialize (non-volatile) loads of constant memory with anything. @@ -4676,7 +4676,8 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue( bool IsIndirect = false; Optional Op; // Some arguments' frame index is recorded during argument lowering. - if (int FI = FuncInfo.getArgumentFrameIndex(Arg)) + int FI = FuncInfo.getArgumentFrameIndex(Arg); + if (FI != INT_MAX) Op = MachineOperand::CreateFI(FI); if (!Op && N.getNode()) { @@ -4927,6 +4928,13 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { return nullptr; } + // Byval arguments with frame indices were already handled after argument + // lowering and before isel. + const auto *Arg = + dyn_cast(Address->stripInBoundsConstantOffsets()); + if (Arg && FuncInfo.getArgumentFrameIndex(Arg) != INT_MAX) + return nullptr; + SDValue &N = NodeMap[Address]; if (!N.getNode() && isa(Address)) // Check unused arguments map. @@ -4957,20 +4965,6 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { // virtual register info from the FuncInfo.ValueMap. if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, 0, true, N)) { - // If variable is pinned by a alloca in dominating bb then - // use StaticAllocaMap. 
- if (const AllocaInst *AI = dyn_cast(Address)) { - if (AI->getParent() != DI.getParent()) { - DenseMap::iterator SI = - FuncInfo.StaticAllocaMap.find(AI); - if (SI != FuncInfo.StaticAllocaMap.end()) { - SDV = DAG.getFrameIndexDbgValue(Variable, Expression, SI->second, - 0, dl, SDNodeOrder); - DAG.AddDbgValue(SDV, nullptr, false); - return nullptr; - } - } - } DEBUG(dbgs() << "Dropping debug info for " << DI << "\n"); } } @@ -5651,7 +5645,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { int FI = FuncInfo.StaticAllocaMap[Slot]; MCSymbol *FrameAllocSym = MF.getMMI().getContext().getOrCreateFrameAllocSymbol( - GlobalValue::getRealLinkageName(MF.getName()), Idx); + GlobalValue::dropLLVMManglingEscape(MF.getName()), Idx); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, dl, TII->get(TargetOpcode::LOCAL_ESCAPE)) .addSym(FrameAllocSym) @@ -5672,7 +5666,7 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { unsigned IdxVal = unsigned(Idx->getLimitedValue(INT_MAX)); MCSymbol *FrameAllocSym = MF.getMMI().getContext().getOrCreateFrameAllocSymbol( - GlobalValue::getRealLinkageName(Fn->getName()), IdxVal); + GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal); // Create a MCSymbol for the label to avoid any target lowering // that would make this PC relative. @@ -5737,6 +5731,24 @@ SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, unsigned Intrinsic) { case Intrinsic::experimental_deoptimize: LowerDeoptimizeCall(&I); return nullptr; + + case Intrinsic::experimental_vector_reduce_fadd: + case Intrinsic::experimental_vector_reduce_fmul: + case Intrinsic::experimental_vector_reduce_add: + case Intrinsic::experimental_vector_reduce_mul: + case Intrinsic::experimental_vector_reduce_and: + case Intrinsic::experimental_vector_reduce_or: + case Intrinsic::experimental_vector_reduce_xor: + case Intrinsic::experimental_vector_reduce_smax: + case Intrinsic::experimental_vector_reduce_smin: + case Intrinsic::experimental_vector_reduce_umax: + case Intrinsic::experimental_vector_reduce_umin: + case Intrinsic::experimental_vector_reduce_fmax: + case Intrinsic::experimental_vector_reduce_fmin: { + visitVectorReduce(I, Intrinsic); + return nullptr; + } + } } @@ -5982,7 +5994,7 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT, bool ConstantMemory = false; // Do not serialize (non-volatile) loads of constant memory with anything. - if (Builder.AA->pointsToConstantMemory(PtrVal)) { + if (Builder.AA && Builder.AA->pointsToConstantMemory(PtrVal)) { Root = Builder.DAG.getEntryNode(); ConstantMemory = true; } else { @@ -7422,11 +7434,11 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) { // have to worry about calling conventions and target specific lowering code. // Instead we perform the call lowering right here. // - // chain, flag = CALLSEQ_START(chain, 0) + // chain, flag = CALLSEQ_START(chain, 0, 0) // chain, flag = STACKMAP(id, nbytes, ..., chain, flag) // chain, flag = CALLSEQ_END(chain, 0, 0, flag) // - Chain = DAG.getCALLSEQ_START(getRoot(), NullPtr, DL); + Chain = DAG.getCALLSEQ_START(getRoot(), 0, 0, DL); InFlag = Chain.getValue(1); // Add the and constants. 
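The intrinsics dispatched above are expanded by visitVectorReduce in the next hunk. For the integer forms the semantics are a plain fold over the lanes; a scalar sketch of what vecreduce_add computes (illustrative only; vecreduceAdd is a hypothetical name):

  #include <cstddef>
  #include <cstdint>

  int32_t vecreduceAdd(const int32_t *Lanes, size_t N) {
    int32_t Acc = 0;
    for (size_t I = 0; I != N; ++I)
      Acc += Lanes[I]; // integer add is associative, so lane order is free
    return Acc;
  }

The fadd/fmul forms differ: they carry an explicit start operand, and as the hunk shows they only use the reassociating VECREDUCE_FADD/FMUL nodes under unsafe-algebra fast math; otherwise the strict, in-order variants are emitted.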
@@ -7616,6 +7628,76 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS, FuncInfo.MF->getFrameInfo().setHasPatchPoint(); } +void SelectionDAGBuilder::visitVectorReduce(const CallInst &I, + unsigned Intrinsic) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + SDValue Op1 = getValue(I.getArgOperand(0)); + SDValue Op2; + if (I.getNumArgOperands() > 1) + Op2 = getValue(I.getArgOperand(1)); + SDLoc dl = getCurSDLoc(); + EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + SDValue Res; + FastMathFlags FMF; + if (isa(I)) + FMF = I.getFastMathFlags(); + SDNodeFlags SDFlags; + SDFlags.setNoNaNs(FMF.noNaNs()); + + switch (Intrinsic) { + case Intrinsic::experimental_vector_reduce_fadd: + if (FMF.unsafeAlgebra()) + Res = DAG.getNode(ISD::VECREDUCE_FADD, dl, VT, Op2); + else + Res = DAG.getNode(ISD::VECREDUCE_STRICT_FADD, dl, VT, Op1, Op2); + break; + case Intrinsic::experimental_vector_reduce_fmul: + if (FMF.unsafeAlgebra()) + Res = DAG.getNode(ISD::VECREDUCE_FMUL, dl, VT, Op2); + else + Res = DAG.getNode(ISD::VECREDUCE_STRICT_FMUL, dl, VT, Op1, Op2); + break; + case Intrinsic::experimental_vector_reduce_add: + Res = DAG.getNode(ISD::VECREDUCE_ADD, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_mul: + Res = DAG.getNode(ISD::VECREDUCE_MUL, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_and: + Res = DAG.getNode(ISD::VECREDUCE_AND, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_or: + Res = DAG.getNode(ISD::VECREDUCE_OR, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_xor: + Res = DAG.getNode(ISD::VECREDUCE_XOR, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_smax: + Res = DAG.getNode(ISD::VECREDUCE_SMAX, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_smin: + Res = DAG.getNode(ISD::VECREDUCE_SMIN, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_umax: + Res = DAG.getNode(ISD::VECREDUCE_UMAX, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_umin: + Res = DAG.getNode(ISD::VECREDUCE_UMIN, dl, VT, Op1); + break; + case Intrinsic::experimental_vector_reduce_fmax: { + Res = DAG.getNode(ISD::VECREDUCE_FMAX, dl, VT, Op1, SDFlags); + break; + } + case Intrinsic::experimental_vector_reduce_fmin: { + Res = DAG.getNode(ISD::VECREDUCE_FMIN, dl, VT, Op1, SDFlags); + break; + } + default: + llvm_unreachable("Unhandled vector reduce intrinsic"); + } + setValue(&I, Res); +} + /// Returns an AttributeList representing the attributes applied to the return /// value of the given call. 
static AttributeList getReturnAttrs(TargetLowering::CallLoweringInfo &CLI) { diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h index 9e9989058ae5..bdaee858da61 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -604,11 +604,11 @@ public: SelectionDAGBuilder(SelectionDAG &dag, FunctionLoweringInfo &funcinfo, CodeGenOpt::Level ol) : CurInst(nullptr), SDNodeOrder(LowestSDNodeOrder), TM(dag.getTarget()), - DAG(dag), FuncInfo(funcinfo), + DAG(dag), DL(nullptr), AA(nullptr), FuncInfo(funcinfo), HasTailCall(false) { } - void init(GCFunctionInfo *gfi, AliasAnalysis &aa, + void init(GCFunctionInfo *gfi, AliasAnalysis *AA, const TargetLibraryInfo *li); /// Clear out the current SelectionDAG and the associated state and prepare @@ -909,6 +909,8 @@ private: void visitGCRelocate(const GCRelocateInst &I); void visitGCResult(const GCResultInst &I); + void visitVectorReduce(const CallInst &I, unsigned Intrinsic); + void visitUserOp1(const Instruction &I) { llvm_unreachable("UserOp1 should not exist at instruction selection time!"); } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 26dd45ef933f..c37d7080f2c5 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -346,6 +346,19 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::SETFALSE: return "setfalse"; case ISD::SETFALSE2: return "setfalse2"; } + case ISD::VECREDUCE_FADD: return "vecreduce_fadd"; + case ISD::VECREDUCE_FMUL: return "vecreduce_fmul"; + case ISD::VECREDUCE_ADD: return "vecreduce_add"; + case ISD::VECREDUCE_MUL: return "vecreduce_mul"; + case ISD::VECREDUCE_AND: return "vecreduce_and"; + case ISD::VECREDUCE_OR: return "vecreduce_or"; + case ISD::VECREDUCE_XOR: return "vecreduce_xor"; + case ISD::VECREDUCE_SMAX: return "vecreduce_smax"; + case ISD::VECREDUCE_SMIN: return "vecreduce_smin"; + case ISD::VECREDUCE_UMAX: return "vecreduce_umax"; + case ISD::VECREDUCE_UMIN: return "vecreduce_umin"; + case ISD::VECREDUCE_FMAX: return "vecreduce_fmax"; + case ISD::VECREDUCE_FMIN: return "vecreduce_fmin"; } } diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 3aabdaeaa094..5e0feccb6b4c 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -38,6 +38,7 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" +#include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachinePassRegistry.h" #include "llvm/CodeGen/MachineRegisterInfo.h" @@ -299,7 +300,7 @@ SelectionDAGISel::SelectionDAGISel(TargetMachine &tm, FuncInfo(new FunctionLoweringInfo()), CurDAG(new SelectionDAG(tm, OL)), SDB(new SelectionDAGBuilder(*CurDAG, *FuncInfo, OL)), - GFI(), + AA(), GFI(), OptLevel(OL), DAGSize(0) { initializeGCModuleInfoPass(*PassRegistry::getPassRegistry()); @@ -317,7 +318,8 @@ SelectionDAGISel::~SelectionDAGISel() { } void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const { - AU.addRequired(); + if (OptLevel != CodeGenOpt::None) + AU.addRequired(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -394,7 +396,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { TII = MF->getSubtarget().getInstrInfo(); TLI = 
MF->getSubtarget().getTargetLowering(); RegInfo = &MF->getRegInfo(); - AA = &getAnalysis().getAAResults(); LibInfo = &getAnalysis().getTLI(); GFI = Fn.hasGC() ? &getAnalysis().getFunctionInfo(Fn) : nullptr; ORE = make_unique(&Fn); @@ -406,12 +407,22 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) { CurDAG->init(*MF, *ORE); FuncInfo->set(Fn, *MF, CurDAG); + // Now get the optional analyzes if we want to. + // This is based on the possibly changed OptLevel (after optnone is taken + // into account). That's unfortunate but OK because it just means we won't + // ask for passes that have been required anyway. + if (UseMBPI && OptLevel != CodeGenOpt::None) FuncInfo->BPI = &getAnalysis().getBPI(); else FuncInfo->BPI = nullptr; - SDB->init(GFI, *AA, LibInfo); + if (OptLevel != CodeGenOpt::None) + AA = &getAnalysis().getAAResults(); + else + AA = nullptr; + + SDB->init(GFI, AA, LibInfo); MF->setHasInlineAsm(false); @@ -715,7 +726,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine1", "DAG Combining 1", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(BeforeLegalizeTypes, *AA, OptLevel); + CurDAG->Combine(BeforeLegalizeTypes, AA, OptLevel); } DEBUG(dbgs() << "Optimized lowered selection DAG: BB#" << BlockNumber @@ -747,7 +758,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine_lt", "DAG Combining after legalize types", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(AfterLegalizeTypes, *AA, OptLevel); + CurDAG->Combine(AfterLegalizeTypes, AA, OptLevel); } DEBUG(dbgs() << "Optimized type-legalized selection DAG: BB#" << BlockNumber @@ -781,7 +792,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine_lv", "DAG Combining after legalize vectors", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(AfterLegalizeVectorOps, *AA, OptLevel); + CurDAG->Combine(AfterLegalizeVectorOps, AA, OptLevel); } DEBUG(dbgs() << "Optimized vector-legalized selection DAG: BB#" @@ -807,7 +818,7 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { { NamedRegionTimer T("combine2", "DAG Combining 2", GroupName, GroupDescription, TimePassesIsEnabled); - CurDAG->Combine(AfterLegalizeDAG, *AA, OptLevel); + CurDAG->Combine(AfterLegalizeDAG, AA, OptLevel); } DEBUG(dbgs() << "Optimized legalized selection DAG: BB#" << BlockNumber @@ -1145,6 +1156,51 @@ static void createSwiftErrorEntriesInEntryBlock(FunctionLoweringInfo *FuncInfo, } } +/// Collect llvm.dbg.declare information. This is done after argument lowering +/// in case the declarations refer to arguments. +static void processDbgDeclares(FunctionLoweringInfo *FuncInfo) { + MachineFunction *MF = FuncInfo->MF; + const DataLayout &DL = MF->getDataLayout(); + for (const BasicBlock &BB : *FuncInfo->Fn) { + for (const Instruction &I : BB) { + const DbgDeclareInst *DI = dyn_cast(&I); + if (!DI) + continue; + + assert(DI->getVariable() && "Missing variable"); + assert(DI->getDebugLoc() && "Missing location"); + const Value *Address = DI->getAddress(); + if (!Address) + continue; + + // Look through casts and constant offset GEPs. These mostly come from + // inalloca. + APInt Offset(DL.getPointerSizeInBits(0), 0); + Address = Address->stripAndAccumulateInBoundsConstantOffsets(DL, Offset); + + // Check if the variable is a static alloca or a byval or inalloca + // argument passed in memory. If it is not, then we will ignore this + // intrinsic and handle this during isel like dbg.value. 
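The INT_MAX test that follows mirrors the EmitFuncArgumentDbgValue fix above: getArgumentFrameIndex now signals "no index" with INT_MAX because 0 is a legitimate frame index (and fixed objects use negative ones), so the old truth-test style `if (int FI = ...)` silently dropped valid slots. A sketch of the sentinel convention (kNoFrameIndex and hasFrameIndex are hypothetical names):

  #include <climits>

  constexpr int kNoFrameIndex = INT_MAX; // sentinel, never a real index

  bool hasFrameIndex(int FI) {
    // 0 and negative values are real frame indices; only the sentinel
    // means "nothing recorded for this argument".
    return FI != kNoFrameIndex;
  }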
+ int FI = INT_MAX; + if (const auto *AI = dyn_cast(Address)) { + auto SI = FuncInfo->StaticAllocaMap.find(AI); + if (SI != FuncInfo->StaticAllocaMap.end()) + FI = SI->second; + } else if (const auto *Arg = dyn_cast(Address)) + FI = FuncInfo->getArgumentFrameIndex(Arg); + + if (FI == INT_MAX) + continue; + + DIExpression *Expr = DI->getExpression(); + if (Offset.getBoolValue()) + Expr = DIExpression::prepend(Expr, DIExpression::NoDeref, + Offset.getZExtValue()); + MF->setVariableDbgInfo(DI->getVariable(), Expr, FI, DI->getDebugLoc()); + } + } +} + /// Propagate swifterror values through the machine function CFG. static void propagateSwiftErrorVRegs(FunctionLoweringInfo *FuncInfo) { auto *TLI = FuncInfo->TLI; @@ -1317,6 +1373,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { } createSwiftErrorEntriesInEntryBlock(FuncInfo, FastIS, TLI, TII, SDB); + processDbgDeclares(FuncInfo); + // Iterate over all basic blocks in the function. for (const BasicBlock *LLVMBB : RPOT) { if (OptLevel != CodeGenOpt::None) { diff --git a/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 23f597db140c..befbd80d7965 100644 --- a/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -417,11 +417,10 @@ bool TargetLowering::ShrinkDemandedOp(SDValue Op, unsigned BitWidth, if (TLI.isTruncateFree(Op.getValueType(), SmallVT) && TLI.isZExtFree(SmallVT, Op.getValueType())) { // We found a type with free casts. - SDValue X = DAG.getNode(Op.getOpcode(), dl, SmallVT, - DAG.getNode(ISD::TRUNCATE, dl, SmallVT, - Op.getNode()->getOperand(0)), - DAG.getNode(ISD::TRUNCATE, dl, SmallVT, - Op.getNode()->getOperand(1))); + SDValue X = DAG.getNode( + Op.getOpcode(), dl, SmallVT, + DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(0)), + DAG.getNode(ISD::TRUNCATE, dl, SmallVT, Op.getOperand(1))); bool NeedZext = DemandedSize > SmallVTBits; SDValue Z = DAG.getNode(NeedZext ? ISD::ZERO_EXTEND : ISD::ANY_EXTEND, dl, Op.getValueType(), X); @@ -817,7 +816,7 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, // Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits // are not demanded. This will likely allow the anyext to be folded away. if (InOp.getNode()->getOpcode() == ISD::ANY_EXTEND) { - SDValue InnerOp = InOp.getNode()->getOperand(0); + SDValue InnerOp = InOp.getOperand(0); EVT InnerVT = InnerOp.getValueType(); unsigned InnerBits = InnerVT.getSizeInBits(); if (ShAmt < InnerBits && NewMask.getActiveBits() <= InnerBits && diff --git a/lib/CodeGen/ShrinkWrap.cpp b/lib/CodeGen/ShrinkWrap.cpp index 4837495777da..2638702da152 100644 --- a/lib/CodeGen/ShrinkWrap.cpp +++ b/lib/CodeGen/ShrinkWrap.cpp @@ -282,8 +282,14 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB, if (!Restore) Restore = &MBB; - else + else if (MPDT->getNode(&MBB)) // If the block is not in the post dom tree, it + // means the block never returns. If that's the + // case, we don't want to call + // `findNearestCommonDominator`, which will + // return `Restore`. Restore = MPDT->findNearestCommonDominator(Restore, &MBB); + else + Restore = nullptr; // Abort, we can't find a restore point in this case. // Make sure we would be able to insert the restore code before the // terminator. @@ -293,7 +299,7 @@ void ShrinkWrap::updateSaveRestorePoints(MachineBasicBlock &MBB, continue; // One of the terminator needs to happen before the restore point. 
if (MBB.succ_empty()) { - Restore = nullptr; + Restore = nullptr; // Abort, we can't find a restore point in this case. break; } // Look for a restore point that post-dominates all the successors. @@ -419,7 +425,7 @@ static bool isIrreducibleCFG(const MachineFunction &MF, } bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) { - if (MF.empty() || !isShrinkWrapEnabled(MF)) + if (skipFunction(*MF.getFunction()) || MF.empty() || !isShrinkWrapEnabled(MF)) return false; DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index ab578df4069d..e9eff4d0acb2 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -93,8 +93,8 @@ bool SjLjEHPrepare::doInitialization(Module &M) { doubleUnderDataTy, // __data VoidPtrTy, // __personality VoidPtrTy, // __lsda - doubleUnderJBufTy, // __jbuf - nullptr); + doubleUnderJBufTy // __jbuf + ); return true; } diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 34892680aceb..1d232c71d824 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -232,7 +232,11 @@ static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO, if (!MD) return nullptr; - auto *VM = dyn_cast(MD->getOperand(0)); + const MDOperand &Op = MD->getOperand(0); + if (!Op.get()) + return nullptr; + + auto *VM = dyn_cast(Op); if (!VM) report_fatal_error("MD_associated operand is not ValueAsMetadata"); diff --git a/lib/CodeGen/TargetPassConfig.cpp b/lib/CodeGen/TargetPassConfig.cpp index 150195f5f85b..e6c5d8753b83 100644 --- a/lib/CodeGen/TargetPassConfig.cpp +++ b/lib/CodeGen/TargetPassConfig.cpp @@ -487,6 +487,14 @@ void TargetPassConfig::addIRPasses() { // Insert calls to mcount-like functions. addPass(createCountingFunctionInserterPass()); + + // Add scalarization of target's unsupported masked memory intrinsics pass. + // the unsupported intrinsic will be replaced with a chain of basic blocks, + // that stores/loads element one-by-one if the appropriate mask bit is set. + addPass(createScalarizeMaskedMemIntrinPass()); + + // Expand reduction intrinsics into shuffle sequences if the target wants to. + addPass(createExpandReductionsPass()); } /// Turn exception handling constructs into something the code generators can @@ -607,6 +615,9 @@ void TargetPassConfig::addMachinePasses() { addPass(&LocalStackSlotAllocationID, false); } + if (getOptLevel() != CodeGenOpt::None) + addPass(&LiveRangeShrinkID); + // Run pre-ra passes. 
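Back in addIRPasses above, the new scalarization pass rewrites masked loads/stores the target cannot lower into per-element, mask-guarded accesses. Conceptually the emitted control flow behaves like this loop (a sketch only; the real pass builds branch-per-lane IR and also handles the passthru operand of llvm.masked.load):

  // Touch memory only where the mask bit is set, so a disabled lane
  // can never fault.
  void maskedLoadScalarized(const int *Src, const bool *Mask, int *Dst,
                            int NumElts) {
    for (int I = 0; I != NumElts; ++I)
      if (Mask[I])
        Dst[I] = Src[I];
  }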
addPreRegAlloc(); diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 75359fe3c0ea..7392c8327148 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -155,7 +155,7 @@ public: void getAnalysisUsage(AnalysisUsage &AU) const override { AU.setPreservesCFG(); - AU.addRequired(); + AU.addUsedIfAvailable(); AU.addUsedIfAvailable(); AU.addPreserved(); AU.addPreserved(); @@ -1627,7 +1627,10 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) { InstrItins = MF->getSubtarget().getInstrItineraryData(); LV = getAnalysisIfAvailable(); LIS = getAnalysisIfAvailable(); - AA = &getAnalysis().getAAResults(); + if (auto *AAPass = getAnalysisIfAvailable()) + AA = &AAPass->getAAResults(); + else + AA = nullptr; OptLevel = TM.getOptLevel(); bool MadeChange = false; diff --git a/lib/CodeGen/UnreachableBlockElim.cpp b/lib/CodeGen/UnreachableBlockElim.cpp index f085132b6a94..407fd9b162e9 100644 --- a/lib/CodeGen/UnreachableBlockElim.cpp +++ b/lib/CodeGen/UnreachableBlockElim.cpp @@ -206,11 +206,12 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) { if (InputReg != OutputReg) { MachineRegisterInfo &MRI = F.getRegInfo(); unsigned InputSub = Input.getSubReg(); - if (InputSub == 0) { - MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg)); + if (InputSub == 0 && + MRI.constrainRegClass(InputReg, MRI.getRegClass(OutputReg))) { MRI.replaceRegWith(OutputReg, InputReg); } else { - // The input register to the PHI has a subregister: + // The input register to the PHI has a subregister or it can't be + // constrained to the proper register class: // insert a COPY instead of simply replacing the output // with the input. const TargetInstrInfo *TII = F.getSubtarget().getInstrInfo(); diff --git a/lib/DebugInfo/CodeView/CMakeLists.txt b/lib/DebugInfo/CodeView/CMakeLists.txt index 410d5a3777d4..8d9353ae5f5e 100644 --- a/lib/DebugInfo/CodeView/CMakeLists.txt +++ b/lib/DebugInfo/CodeView/CMakeLists.txt @@ -13,7 +13,7 @@ add_llvm_library(LLVMDebugInfoCodeView ModuleDebugFragmentVisitor.cpp ModuleDebugInlineeLinesFragment.cpp ModuleDebugLineFragment.cpp - ModuleDebugUnknownFragment.cpp + RandomAccessTypeVisitor.cpp RecordSerialization.cpp StringTable.cpp SymbolRecordMapping.cpp diff --git a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp index 0069ee3cc904..b6ed0453d9c4 100644 --- a/lib/DebugInfo/CodeView/CVTypeVisitor.cpp +++ b/lib/DebugInfo/CodeView/CVTypeVisitor.cpp @@ -26,8 +26,7 @@ CVTypeVisitor::CVTypeVisitor(TypeVisitorCallbacks &Callbacks) : Callbacks(Callbacks) {} template -static Error visitKnownRecord(CVTypeVisitor &Visitor, CVType &Record, - TypeVisitorCallbacks &Callbacks) { +static Error visitKnownRecord(CVType &Record, TypeVisitorCallbacks &Callbacks) { TypeRecordKind RK = static_cast(Record.Type); T KnownRecord(RK); if (auto EC = Callbacks.visitKnownRecord(Record, KnownRecord)) @@ -76,7 +75,7 @@ void CVTypeVisitor::addTypeServerHandler(TypeServerHandler &Handler) { Handlers.push_back(&Handler); } -Error CVTypeVisitor::visitTypeRecord(CVType &Record) { +Expected CVTypeVisitor::handleTypeServer(CVType &Record) { if (Record.Type == TypeLeafKind::LF_TYPESERVER2 && !Handlers.empty()) { auto TS = deserializeTypeServerRecord(Record); if (!TS) @@ -90,16 +89,16 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) { // If the handler processed the record, return success. 
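The CVTypeVisitor refactor in the next hunks funnels the probe into handleTypeServer(), whose Expected<bool> packs three outcomes into one return: an Error (propagate), true (a type-server handler consumed the record), or false (fall through to ordinary visitation). A compilable sketch of that calling convention (probeTypeServers and visitOneRecord are hypothetical stand-ins; the Expected/Error API is LLVM's real one):

  #include "llvm/Support/Error.h"

  // Stand-in for handleTypeServer(); declared only, to keep the sketch short.
  llvm::Expected<bool> probeTypeServers();

  llvm::Error visitOneRecord() {
    llvm::Expected<bool> Handled = probeTypeServers();
    if (!Handled)
      return Handled.takeError();    // error: propagate to the caller
    if (*Handled)
      return llvm::Error::success(); // true: a handler consumed the record
    // false: fall through to visitTypeBegin + finishVisitation (elided)
    return llvm::Error::success();
  }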
if (*ExpectedResult) - return Error::success(); + return true; // Otherwise keep searching for a handler, eventually falling out and // using the default record handler. } } + return false; +} - if (auto EC = Callbacks.visitTypeBegin(Record)) - return EC; - +Error CVTypeVisitor::finishVisitation(CVType &Record) { switch (Record.Type) { default: if (auto EC = Callbacks.visitUnknownType(Record)) @@ -107,7 +106,7 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) { break; #define TYPE_RECORD(EnumName, EnumVal, Name) \ case EnumName: { \ - if (auto EC = visitKnownRecord(*this, Record, Callbacks)) \ + if (auto EC = visitKnownRecord(Record, Callbacks)) \ return EC; \ break; \ } @@ -124,6 +123,32 @@ Error CVTypeVisitor::visitTypeRecord(CVType &Record) { return Error::success(); } +Error CVTypeVisitor::visitTypeRecord(CVType &Record, TypeIndex Index) { + auto ExpectedResult = handleTypeServer(Record); + if (!ExpectedResult) + return ExpectedResult.takeError(); + if (*ExpectedResult) + return Error::success(); + + if (auto EC = Callbacks.visitTypeBegin(Record, Index)) + return EC; + + return finishVisitation(Record); +} + +Error CVTypeVisitor::visitTypeRecord(CVType &Record) { + auto ExpectedResult = handleTypeServer(Record); + if (!ExpectedResult) + return ExpectedResult.takeError(); + if (*ExpectedResult) + return Error::success(); + + if (auto EC = Callbacks.visitTypeBegin(Record)) + return EC; + + return finishVisitation(Record); +} + static Error visitMemberRecord(CVMemberRecord &Record, TypeVisitorCallbacks &Callbacks) { if (auto EC = Callbacks.visitMemberBegin(Record)) diff --git a/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp b/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp deleted file mode 100644 index 9fd2cb8ed3e8..000000000000 --- a/lib/DebugInfo/CodeView/ModuleDebugUnknownFragment.cpp +++ /dev/null @@ -1,10 +0,0 @@ -//===- ModuleDebugUnknownFragment.cpp ---------------------------*- C++ -*-===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#include "llvm/DebugInfo/CodeView/ModuleDebugUnknownFragment.h" \ No newline at end of file diff --git a/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp new file mode 100644 index 000000000000..4cb9acbe07d9 --- /dev/null +++ b/lib/DebugInfo/CodeView/RandomAccessTypeVisitor.cpp @@ -0,0 +1,91 @@ +//===- RandomAccessTypeVisitor.cpp ---------------------------- *- C++ --*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/DebugInfo/CodeView/RandomAccessTypeVisitor.h" + +#include "llvm/DebugInfo/CodeView/TypeDatabase.h" +#include "llvm/DebugInfo/CodeView/TypeServerHandler.h" +#include "llvm/DebugInfo/CodeView/TypeVisitorCallbacks.h" + +using namespace llvm; +using namespace llvm::codeview; + +RandomAccessTypeVisitor::RandomAccessTypeVisitor( + const CVTypeArray &Types, uint32_t NumRecords, + PartialOffsetArray PartialOffsets) + : Database(NumRecords), Types(Types), DatabaseVisitor(Database), + InternalVisitor(Pipeline), PartialOffsets(PartialOffsets) { + Pipeline.addCallbackToPipeline(Deserializer); + Pipeline.addCallbackToPipeline(DatabaseVisitor); + + KnownOffsets.resize(Database.capacity()); +} + +Error RandomAccessTypeVisitor::visitTypeIndex(TypeIndex TI, + TypeVisitorCallbacks &Callbacks) { + assert(TI.toArrayIndex() < Database.capacity()); + + if (!Database.contains(TI)) { + if (auto EC = visitRangeForType(TI)) + return EC; + } + + assert(Database.contains(TI)); + auto &Record = Database.getTypeRecord(TI); + CVTypeVisitor V(Callbacks); + return V.visitTypeRecord(Record, TI); +} + +Error RandomAccessTypeVisitor::visitRangeForType(TypeIndex TI) { + if (PartialOffsets.empty()) { + TypeIndex TIB(TypeIndex::FirstNonSimpleIndex); + TypeIndex TIE = TIB + Database.capacity(); + return visitRange(TIB, 0, TIE); + } + + auto Next = std::upper_bound(PartialOffsets.begin(), PartialOffsets.end(), TI, + [](TypeIndex Value, const TypeIndexOffset &IO) { + return Value < IO.Type; + }); + + assert(Next != PartialOffsets.begin()); + auto Prev = std::prev(Next); + + TypeIndex TIB = Prev->Type; + TypeIndex TIE; + if (Next == PartialOffsets.end()) { + TIE = TypeIndex::fromArrayIndex(Database.capacity()); + } else { + TIE = Next->Type; + } + + if (auto EC = visitRange(TIB, Prev->Offset, TIE)) + return EC; + return Error::success(); +} + +Error RandomAccessTypeVisitor::visitRange(TypeIndex Begin, uint32_t BeginOffset, + TypeIndex End) { + + auto RI = Types.at(BeginOffset); + assert(RI != Types.end()); + + while (Begin != End) { + assert(!Database.contains(Begin)); + if (auto EC = InternalVisitor.visitTypeRecord(*RI, Begin)) + return EC; + KnownOffsets[Begin.toArrayIndex()] = BeginOffset; + + BeginOffset += RI.getRecordLength(); + ++Begin; + ++RI; + } + + return Error::success(); +} diff --git a/lib/DebugInfo/CodeView/TypeDatabase.cpp b/lib/DebugInfo/CodeView/TypeDatabase.cpp index 5b8841041f88..7924440e5e29 100644 --- a/lib/DebugInfo/CodeView/TypeDatabase.cpp +++ b/lib/DebugInfo/CodeView/TypeDatabase.cpp @@ -65,20 +65,32 @@ static const SimpleTypeEntry SimpleTypeNames[] = { {"__bool64*", SimpleTypeKind::Boolean64}, }; -TypeDatabase::TypeDatabase(uint32_t ExpectedSize) : TypeNameStorage(Allocator) { - CVUDTNames.reserve(ExpectedSize); - TypeRecords.reserve(ExpectedSize); +TypeDatabase::TypeDatabase(uint32_t Capacity) : TypeNameStorage(Allocator) { + CVUDTNames.resize(Capacity); + TypeRecords.resize(Capacity); + ValidRecords.resize(Capacity); } -/// Gets the type index for the next type record. -TypeIndex TypeDatabase::getNextTypeIndex() const { - return TypeIndex(TypeIndex::FirstNonSimpleIndex + CVUDTNames.size()); +TypeIndex TypeDatabase::appendType(StringRef Name, const CVType &Data) { + TypeIndex TI; + TI = getAppendIndex(); + if (TI.toArrayIndex() >= capacity()) + grow(); + recordType(Name, TI, Data); + return TI; } -/// Records the name of a type, and reserves its type index. 
-void TypeDatabase::recordType(StringRef Name, const CVType &Data) { - CVUDTNames.push_back(Name); - TypeRecords.push_back(Data); +void TypeDatabase::recordType(StringRef Name, TypeIndex Index, + const CVType &Data) { + uint32_t AI = Index.toArrayIndex(); + + assert(!contains(Index)); + assert(AI < capacity()); + + CVUDTNames[AI] = Name; + TypeRecords[AI] = Data; + ValidRecords.set(AI); + ++Count; } /// Saves the name in a StringSet and creates a stable StringRef. @@ -104,24 +116,47 @@ StringRef TypeDatabase::getTypeName(TypeIndex Index) const { return ""; } - uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex; - if (I < CVUDTNames.size()) - return CVUDTNames[I]; + if (contains(Index)) + return CVUDTNames[Index.toArrayIndex()]; return ""; } const CVType &TypeDatabase::getTypeRecord(TypeIndex Index) const { - return TypeRecords[Index.getIndex() - TypeIndex::FirstNonSimpleIndex]; + assert(contains(Index)); + return TypeRecords[Index.toArrayIndex()]; } CVType &TypeDatabase::getTypeRecord(TypeIndex Index) { - return TypeRecords[Index.getIndex() - TypeIndex::FirstNonSimpleIndex]; + assert(contains(Index)); + return TypeRecords[Index.toArrayIndex()]; +} + +bool TypeDatabase::contains(TypeIndex Index) const { + uint32_t AI = Index.toArrayIndex(); + if (AI >= capacity()) + return false; + + return ValidRecords.test(AI); } -bool TypeDatabase::containsTypeIndex(TypeIndex Index) const { - uint32_t I = Index.getIndex() - TypeIndex::FirstNonSimpleIndex; - return I < CVUDTNames.size(); +uint32_t TypeDatabase::size() const { return Count; } + +uint32_t TypeDatabase::capacity() const { return TypeRecords.size(); } + +void TypeDatabase::grow() { + TypeRecords.emplace_back(); + CVUDTNames.emplace_back(); + ValidRecords.resize(ValidRecords.size() + 1); } -uint32_t TypeDatabase::size() const { return CVUDTNames.size(); } +bool TypeDatabase::empty() const { return size() == 0; } + +TypeIndex TypeDatabase::getAppendIndex() const { + if (empty()) + return TypeIndex::fromArrayIndex(0); + + int Index = ValidRecords.find_last(); + assert(Index != -1); + return TypeIndex::fromArrayIndex(Index) + 1; +} diff --git a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp index c234afd2288b..8d97f8b1cb40 100644 --- a/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp +++ b/lib/DebugInfo/CodeView/TypeDatabaseVisitor.cpp @@ -15,7 +15,7 @@ using namespace llvm; using namespace llvm::codeview; -Error TypeDatabaseVisitor::visitTypeBegin(CVRecord &Record) { +Error TypeDatabaseVisitor::visitTypeBegin(CVType &Record) { assert(!IsInFieldList); // Reset Name to the empty string. If the visitor sets it, we know it. Name = ""; @@ -28,6 +28,22 @@ Error TypeDatabaseVisitor::visitTypeBegin(CVRecord &Record) { return Error::success(); } +Error TypeDatabaseVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) { + if (auto EC = visitTypeBegin(Record)) + return EC; + + CurrentTypeIndex = Index; + return Error::success(); +} + +StringRef TypeDatabaseVisitor::getTypeName(TypeIndex Index) const { + return TypeDB->getTypeName(Index); +} + +StringRef TypeDatabaseVisitor::saveTypeName(StringRef Name) { + return TypeDB->saveTypeName(Name); +} + Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) { if (CVR.Type == LF_FIELDLIST) { assert(IsInFieldList); @@ -39,7 +55,12 @@ Error TypeDatabaseVisitor::visitTypeEnd(CVType &CVR) { // CVUDTNames is indexed by type index, and must have one entry for every // type. 
Field list members are not recorded, and are only referenced by // their containing field list record. - TypeDB.recordType(Name, CVR); + if (CurrentTypeIndex) + TypeDB->recordType(Name, *CurrentTypeIndex, CVR); + else + TypeDB->appendType(Name, CVR); + + CurrentTypeIndex.reset(); return Error::success(); } @@ -73,13 +94,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ArgListRecord &Args) { uint32_t Size = Indices.size(); SmallString<256> TypeName("("); for (uint32_t I = 0; I < Size; ++I) { - StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]); + StringRef ArgTypeName = getTypeName(Indices[I]); TypeName.append(ArgTypeName); if (I + 1 != Size) TypeName.append(", "); } TypeName.push_back(')'); - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); return Error::success(); } @@ -89,13 +110,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, uint32_t Size = Indices.size(); SmallString<256> TypeName("\""); for (uint32_t I = 0; I < Size; ++I) { - StringRef ArgTypeName = TypeDB.getTypeName(Indices[I]); + StringRef ArgTypeName = getTypeName(Indices[I]); TypeName.append(ArgTypeName); if (I + 1 != Size) TypeName.append("\" \""); } TypeName.push_back('\"'); - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); return Error::success(); } @@ -132,26 +153,26 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ProcedureRecord &Proc) { - StringRef ReturnTypeName = TypeDB.getTypeName(Proc.getReturnType()); - StringRef ArgListTypeName = TypeDB.getTypeName(Proc.getArgumentList()); + StringRef ReturnTypeName = getTypeName(Proc.getReturnType()); + StringRef ArgListTypeName = getTypeName(Proc.getArgumentList()); SmallString<256> TypeName(ReturnTypeName); TypeName.push_back(' '); TypeName.append(ArgListTypeName); - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); return Error::success(); } Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, MemberFunctionRecord &MF) { - StringRef ReturnTypeName = TypeDB.getTypeName(MF.getReturnType()); - StringRef ClassTypeName = TypeDB.getTypeName(MF.getClassType()); - StringRef ArgListTypeName = TypeDB.getTypeName(MF.getArgumentList()); + StringRef ReturnTypeName = getTypeName(MF.getReturnType()); + StringRef ClassTypeName = getTypeName(MF.getClassType()); + StringRef ArgListTypeName = getTypeName(MF.getArgumentList()); SmallString<256> TypeName(ReturnTypeName); TypeName.push_back(' '); TypeName.append(ClassTypeName); TypeName.append("::"); TypeName.append(ArgListTypeName); - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); return Error::success(); } @@ -171,13 +192,13 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { if (Ptr.isPointerToMember()) { const MemberPointerInfo &MI = Ptr.getMemberInfo(); - StringRef PointeeName = TypeDB.getTypeName(Ptr.getReferentType()); - StringRef ClassName = TypeDB.getTypeName(MI.getContainingType()); + StringRef PointeeName = getTypeName(Ptr.getReferentType()); + StringRef ClassName = getTypeName(MI.getContainingType()); SmallString<256> TypeName(PointeeName); TypeName.push_back(' '); TypeName.append(ClassName); TypeName.append("::*"); - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); } else { SmallString<256> TypeName; if (Ptr.isConst()) @@ -187,7 +208,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { if (Ptr.isUnaligned()) TypeName.append("__unaligned "); - 
TypeName.append(TypeDB.getTypeName(Ptr.getReferentType())); + TypeName.append(getTypeName(Ptr.getReferentType())); if (Ptr.getMode() == PointerMode::LValueReference) TypeName.append("&"); @@ -197,7 +218,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { TypeName.append("*"); if (!TypeName.empty()) - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); } return Error::success(); } @@ -205,7 +226,7 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, PointerRecord &Ptr) { Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) { uint16_t Mods = static_cast(Mod.getModifiers()); - StringRef ModifiedName = TypeDB.getTypeName(Mod.getModifiedType()); + StringRef ModifiedName = getTypeName(Mod.getModifiedType()); SmallString<256> TypeName; if (Mods & uint16_t(ModifierOptions::Const)) TypeName.append("const "); @@ -214,14 +235,14 @@ Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, ModifierRecord &Mod) { if (Mods & uint16_t(ModifierOptions::Unaligned)) TypeName.append("__unaligned "); TypeName.append(ModifiedName); - Name = TypeDB.saveTypeName(TypeName); + Name = saveTypeName(TypeName); return Error::success(); } Error TypeDatabaseVisitor::visitKnownRecord(CVType &CVR, VFTableShapeRecord &Shape) { - Name = TypeDB.saveTypeName(""); + Name = + saveTypeName(""); return Error::success(); } diff --git a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp index 870d95221e7d..27a6e0987886 100644 --- a/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp +++ b/lib/DebugInfo/CodeView/TypeDumpVisitor.cpp @@ -173,10 +173,13 @@ void TypeDumpVisitor::printItemIndex(StringRef FieldName, TypeIndex TI) const { } Error TypeDumpVisitor::visitTypeBegin(CVType &Record) { + TypeIndex TI = getSourceDB().getAppendIndex(); + return visitTypeBegin(Record, TI); +} + +Error TypeDumpVisitor::visitTypeBegin(CVType &Record, TypeIndex Index) { W->startLine() << getLeafTypeName(Record.Type); - W->getOStream() << " (" - << HexNumber(getSourceDB().getNextTypeIndex().getIndex()) - << ")"; + W->getOStream() << " (" << HexNumber(Index.getIndex()) << ")"; W->getOStream() << " {\n"; W->indent(); W->printEnum("TypeLeafKind", unsigned(Record.Type), diff --git a/lib/DebugInfo/DWARF/DWARFContext.cpp b/lib/DebugInfo/DWARF/DWARFContext.cpp index 246899ac12b9..59a060d143ff 100644 --- a/lib/DebugInfo/DWARF/DWARFContext.cpp +++ b/lib/DebugInfo/DWARF/DWARFContext.cpp @@ -66,7 +66,7 @@ uint64_t llvm::getRelocatedValue(const DataExtractor &Data, uint32_t Size, RelocAddrMap::const_iterator AI = Relocs->find(*Off); if (AI == Relocs->end()) return Data.getUnsigned(Off, Size); - return Data.getUnsigned(Off, Size) + AI->second.second; + return Data.getUnsigned(Off, Size) + AI->second.Value; } static void dumpAccelSection(raw_ostream &OS, StringRef Name, @@ -905,16 +905,23 @@ static Error createError(const Twine &Reason, llvm::Error E) { /// Returns the address of symbol relocation used against. Used for futher /// relocations computation. Symbol's section load address is taken in account if /// LoadedObjectInfo interface is provided. 
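The getSymbolAddress rework below memoizes per-symbol results across a section's relocations; the trick is that std::map::insert reports whether the key was new, so a single lookup both probes the cache and reserves the slot to fill. A generic sketch of the idiom (computeOnce and the int key are placeholders for the symbol-keyed map in the patch):

  #include <cstdint>
  #include <map>

  uint64_t computeOnce(int Key, std::map<int, uint64_t> &Cache) {
    auto [It, Inserted] = Cache.insert({Key, 0});
    if (!Inserted)
      return It->second;     // hit: reuse the stored value
    uint64_t Value = 42;     // stands in for the expensive computation
    It->second = Value;      // fill the slot insert() reserved
    return Value;
  }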
-static Expected getSymbolAddress(const object::ObjectFile &Obj, - const RelocationRef &Reloc, - const LoadedObjectInfo *L) { +static Expected +getSymbolAddress(const object::ObjectFile &Obj, const RelocationRef &Reloc, + const LoadedObjectInfo *L, + std::map &Cache) { uint64_t Ret = 0; object::section_iterator RSec = Obj.section_end(); object::symbol_iterator Sym = Reloc.getSymbol(); + std::map::iterator CacheIt = Cache.end(); // First calculate the address of the symbol or section as it appears // in the object file if (Sym != Obj.symbol_end()) { + bool New; + std::tie(CacheIt, New) = Cache.insert({*Sym, 0}); + if (!New) + return CacheIt->second; + Expected SymAddrOrErr = Sym->getAddress(); if (!SymAddrOrErr) return createError("error: failed to compute symbol address: ", @@ -943,6 +950,10 @@ static Expected getSymbolAddress(const object::ObjectFile &Obj, if (L && RSec != Obj.section_end()) if (uint64_t SectionLoadAddress = L->getSectionLoadAddress(*RSec)) Ret += SectionLoadAddress - RSec->getAddress(); + + if (CacheIt != Cache.end()) + CacheIt->second = Ret; + return Ret; } @@ -1075,6 +1086,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, continue; } + std::map AddrCache; if (Section.relocation_begin() != Section.relocation_end()) { uint64_t SectionSize = RelocatedSection->getSize(); for (const RelocationRef &Reloc : Section.relocations()) { @@ -1083,7 +1095,8 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, if (isRelocScattered(Obj, Reloc)) continue; - Expected SymAddrOrErr = getSymbolAddress(Obj, Reloc, L); + Expected SymAddrOrErr = + getSymbolAddress(Obj, Reloc, L, AddrCache); if (!SymAddrOrErr) { errs() << toString(SymAddrOrErr.takeError()) << '\n'; continue; @@ -1114,7 +1127,7 @@ DWARFContextInMemory::DWARFContextInMemory(const object::ObjectFile &Obj, << " at " << format("%p", Address) << " with width " << format("%d", R.Width) << "\n"); - Map->insert(std::make_pair(Address, std::make_pair(R.Width, R.Value))); + Map->insert({Address, {(uint8_t)R.Width, R.Value}}); } } } diff --git a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp index 0cf71f530446..6601393d7459 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp @@ -54,9 +54,8 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) { if (ParsedCUOffsets.insert(CUOffset).second) { DWARFAddressRangesVector CURanges; CU->collectAddressRanges(CURanges); - for (const auto &R : CURanges) { - appendRange(CUOffset, R.first, R.second); - } + for (const auto &R : CURanges) + appendRange(CUOffset, R.LowPC, R.HighPC); } } diff --git a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp index 9380fe8fe85d..8da797750abd 100644 --- a/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp +++ b/lib/DebugInfo/DWARF/DWARFDebugRangeList.cpp @@ -69,8 +69,8 @@ DWARFDebugRangeList::getAbsoluteRanges(uint64_t BaseAddress) const { if (RLE.isBaseAddressSelectionEntry(AddressSize)) { BaseAddress = RLE.EndAddress; } else { - Res.push_back(std::make_pair(BaseAddress + RLE.StartAddress, - BaseAddress + RLE.EndAddress)); + Res.push_back( + {BaseAddress + RLE.StartAddress, BaseAddress + RLE.EndAddress}); } } return Res; diff --git a/lib/DebugInfo/DWARF/DWARFDie.cpp b/lib/DebugInfo/DWARF/DWARFDie.cpp index 24039eb35209..e3bd759ba94b 100644 --- a/lib/DebugInfo/DWARF/DWARFDie.cpp +++ b/lib/DebugInfo/DWARF/DWARFDie.cpp @@ -60,8 +60,8 @@ static void dumpRanges(raw_ostream &OS, const 
DWARFAddressRangesVector& Ranges, OS << '\n'; OS.indent(Indent); OS << format("[0x%0*" PRIx64 " - 0x%0*" PRIx64 ")", - AddressSize*2, Range.first, - AddressSize*2, Range.second); + AddressSize*2, Range.LowPC, + AddressSize*2, Range.HighPC); } } @@ -229,9 +229,9 @@ DWARFDie::getAddressRanges() const { return DWARFAddressRangesVector(); // Single range specified by low/high PC. uint64_t LowPC, HighPC; - if (getLowAndHighPC(LowPC, HighPC)) { - return DWARFAddressRangesVector(1, std::make_pair(LowPC, HighPC)); - } + if (getLowAndHighPC(LowPC, HighPC)) + return {{LowPC, HighPC}}; + // Multiple ranges from .debug_ranges section. auto RangesOffset = toSectionOffset(find(DW_AT_ranges)); if (RangesOffset) { @@ -257,7 +257,7 @@ DWARFDie::collectChildrenAddressRanges(DWARFAddressRangesVector& Ranges) const { bool DWARFDie::addressRangeContainsAddress(const uint64_t Address) const { for (const auto& R : getAddressRanges()) { - if (R.first <= Address && Address < R.second) + if (R.LowPC <= Address && Address < R.HighPC) return true; } return false; diff --git a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp index e0f819383289..25824f6eb83b 100644 --- a/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp @@ -24,7 +24,11 @@ bool DWARFTypeUnit::extractImpl(DataExtractor debug_info, return false; TypeHash = debug_info.getU64(offset_ptr); TypeOffset = debug_info.getU32(offset_ptr); - return TypeOffset < getLength(); + // TypeOffset is relative to the beginning of the header, + // so we have to account for the leading length field. + // FIXME: The size of the length field is 12 in DWARF64. + unsigned SizeOfLength = 4; + return TypeOffset < getLength() + SizeOfLength; } void DWARFTypeUnit::dump(raw_ostream &OS, bool SummarizeTypes) { diff --git a/lib/DebugInfo/DWARF/DWARFUnit.cpp b/lib/DebugInfo/DWARF/DWARFUnit.cpp index f50487fc3ba3..3835d4da9ae9 100644 --- a/lib/DebugInfo/DWARF/DWARFUnit.cpp +++ b/lib/DebugInfo/DWARF/DWARFUnit.cpp @@ -349,18 +349,18 @@ void DWARFUnit::updateAddressDieMap(DWARFDie Die) { if (Die.isSubroutineDIE()) { for (const auto &R : Die.getAddressRanges()) { // Ignore 0-sized ranges. - if (R.first == R.second) + if (R.LowPC == R.HighPC) continue; - auto B = AddrDieMap.upper_bound(R.first); - if (B != AddrDieMap.begin() && R.first < (--B)->second.first) { + auto B = AddrDieMap.upper_bound(R.LowPC); + if (B != AddrDieMap.begin() && R.LowPC < (--B)->second.first) { // The range is a sub-range of existing ranges, we need to split the // existing range. 
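These DWARF hunks swap the anonymous std::pair for a struct with named LowPC/HighPC members; the ranges remain half-open, which both the containment test above and the zero-size skip below depend on. A minimal sketch of the convention (AddressRange is a stand-in type here; the member names match the diff):

  #include <cstdint>

  struct AddressRange {
    uint64_t LowPC;  // first address inside the range
    uint64_t HighPC; // first address past the range (half-open)
  };

  bool contains(const AddressRange &R, uint64_t Addr) {
    return R.LowPC <= Addr && Addr < R.HighPC;
  }
  // LowPC == HighPC is an empty range: it contains no address at all.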
- if (R.second < B->second.first) - AddrDieMap[R.second] = B->second; - if (R.first > B->first) - AddrDieMap[B->first].first = R.first; + if (R.HighPC < B->second.first) + AddrDieMap[R.HighPC] = B->second; + if (R.LowPC > B->first) + AddrDieMap[B->first].first = R.LowPC; } - AddrDieMap[R.first] = std::make_pair(R.second, Die); + AddrDieMap[R.LowPC] = std::make_pair(R.HighPC, Die); } } // Parent DIEs are added to the AddrDieMap prior to the Children DIEs to diff --git a/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/lib/DebugInfo/DWARF/DWARFVerifier.cpp index 9494e876da15..8a544296f65c 100644 --- a/lib/DebugInfo/DWARF/DWARFVerifier.cpp +++ b/lib/DebugInfo/DWARF/DWARFVerifier.cpp @@ -23,7 +23,7 @@ using namespace llvm; using namespace dwarf; using namespace object; -void DWARFVerifier::verifyDebugInfoAttribute(DWARFDie &Die, +void DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die, DWARFAttribute &AttrValue) { const auto Attr = AttrValue.Attr; switch (Attr) { @@ -68,7 +68,7 @@ void DWARFVerifier::verifyDebugInfoAttribute(DWARFDie &Die, } } -void DWARFVerifier::verifyDebugInfoForm(DWARFDie &Die, +void DWARFVerifier::verifyDebugInfoForm(const DWARFDie &Die, DWARFAttribute &AttrValue) { const auto Form = AttrValue.Value.getForm(); switch (Form) { @@ -136,7 +136,7 @@ void DWARFVerifier::verifyDebugInfoForm(DWARFDie &Die, } } -void DWARFVerifier::veifyDebugInfoReferences() { +void DWARFVerifier::verifyDebugInfoReferences() { // Take all references and make sure they point to an actual DIE by // getting the DIE by offset and emitting an error OS << "Verifying .debug_info references...\n"; @@ -172,7 +172,7 @@ bool DWARFVerifier::handleDebugInfo() { } } } - veifyDebugInfoReferences(); + verifyDebugInfoReferences(); return NumDebugInfoErrors == 0; } diff --git a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp index 375c35b11145..701a318511b8 100644 --- a/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp +++ b/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp @@ -109,7 +109,7 @@ uint32_t TpiStreamBuilder::calculateHashBufferSize() const { } uint32_t TpiStreamBuilder::calculateIndexOffsetSize() const { - return TypeIndexOffsets.size() * sizeof(TypeIndexOffset); + return TypeIndexOffsets.size() * sizeof(codeview::TypeIndexOffset); } Error TpiStreamBuilder::finalizeMsfLayout() { diff --git a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h index a5100a56bcf1..a27573f93b97 100644 --- a/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h +++ b/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h @@ -94,9 +94,8 @@ class OrcMCJITReplacement : public ExecutionEngine { return ClientMM->registerEHFrames(Addr, LoadAddr, Size); } - void deregisterEHFrames(uint8_t *Addr, uint64_t LoadAddr, - size_t Size) override { - return ClientMM->deregisterEHFrames(Addr, LoadAddr, Size); + void deregisterEHFrames() override { + return ClientMM->deregisterEHFrames(); } void notifyObjectLoaded(RuntimeDyld &RTDyld, diff --git a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp index de73fbde8eb7..99e84b7496d4 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RTDyldMemoryManager.cpp @@ -134,6 +134,18 @@ void RTDyldMemoryManager::deregisterEHFramesInProcess(uint8_t *Addr, #endif +void RTDyldMemoryManager::registerEHFrames(uint8_t *Addr, uint64_t LoadAddr, + size_t Size) { + registerEHFramesInProcess(Addr, Size); + EHFrames.push_back({Addr, 
Size}); +} + +void RTDyldMemoryManager::deregisterEHFrames() { + for (auto &Frame : EHFrames) + deregisterEHFramesInProcess(Frame.Addr, Frame.Size); + EHFrames.clear(); +} + static int jit_noop() { return 0; } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp index df9d2ceba329..e9a4b71c903d 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp @@ -73,7 +73,9 @@ namespace llvm { void RuntimeDyldImpl::registerEHFrames() {} -void RuntimeDyldImpl::deregisterEHFrames() {} +void RuntimeDyldImpl::deregisterEHFrames() { + MemMgr.deregisterEHFrames(); +} #ifndef NDEBUG static void dumpSectionMemory(const SectionEntry &S, StringRef State) { diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp index 50f63fb8dd39..660843765b3f 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp @@ -221,22 +221,10 @@ void RuntimeDyldELF::registerEHFrames() { uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress(); size_t EHFrameSize = Sections[EHFrameSID].getSize(); MemMgr.registerEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize); - RegisteredEHFrameSections.push_back(EHFrameSID); } UnregisteredEHFrameSections.clear(); } -void RuntimeDyldELF::deregisterEHFrames() { - for (int i = 0, e = RegisteredEHFrameSections.size(); i != e; ++i) { - SID EHFrameSID = RegisteredEHFrameSections[i]; - uint8_t *EHFrameAddr = Sections[EHFrameSID].getAddress(); - uint64_t EHFrameLoadAddr = Sections[EHFrameSID].getLoadAddress(); - size_t EHFrameSize = Sections[EHFrameSID].getSize(); - MemMgr.deregisterEHFrames(EHFrameAddr, EHFrameLoadAddr, EHFrameSize); - } - RegisteredEHFrameSections.clear(); -} - std::unique_ptr llvm::RuntimeDyldELF::create(Triple::ArchType Arch, RuntimeDyld::MemoryManager &MemMgr, @@ -802,20 +790,35 @@ void RuntimeDyldELF::resolveSystemZRelocation(const SectionEntry &Section, writeInt32BE(LocalAddress, Delta / 2); break; } + case ELF::R_390_PC16: { + int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset); + assert(int16_t(Delta) == Delta && "R_390_PC16 overflow"); + writeInt16BE(LocalAddress, Delta); + break; + } case ELF::R_390_PC32: { int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset); assert(int32_t(Delta) == Delta && "R_390_PC32 overflow"); writeInt32BE(LocalAddress, Delta); break; } - case ELF::R_390_64: - writeInt64BE(LocalAddress, Value + Addend); - break; case ELF::R_390_PC64: { int64_t Delta = (Value + Addend) - Section.getLoadAddressWithOffset(Offset); writeInt64BE(LocalAddress, Delta); break; } + case ELF::R_390_8: + *LocalAddress = (uint8_t)(Value + Addend); + break; + case ELF::R_390_16: + writeInt16BE(LocalAddress, Value + Addend); + break; + case ELF::R_390_32: + writeInt32BE(LocalAddress, Value + Addend); + break; + case ELF::R_390_64: + writeInt64BE(LocalAddress, Value + Addend); + break; } } diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h index 84dd810101f3..fb5da6dd8bbb 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h @@ -152,7 +152,6 @@ private: // in a table until we receive a request to register all unregistered // EH frame sections with the memory manager. 
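// (Illustrative sketch, not from the patch: the bookkeeping that used to live
// in each RuntimeDyld target now sits in the base RTDyldMemoryManager, which
// is assumed to record one {Addr, Size} entry per registered frame, roughly
//
//   struct EHFrame { uint8_t *Addr; size_t Size; };
//   std::vector<EHFrame> EHFrames;   // appended to by registerEHFrames()
//
// and to drain that list in deregisterEHFrames() as shown above, which is why
// the per-target RegisteredEHFrameSections list below can be deleted.)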
SmallVector UnregisteredEHFrameSections; - SmallVector RegisteredEHFrameSections; // Map between GOT relocation value and corresponding GOT offset std::map GOTOffsetMap; @@ -180,7 +179,6 @@ public: StubMap &Stubs) override; bool isCompatibleFile(const object::ObjectFile &Obj) const override; void registerEHFrames() override; - void deregisterEHFrames() override; Error finalizeLoad(const ObjectFile &Obj, ObjSectionToIDMap &SectionMap) override; }; diff --git a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h index f5cc883d98fd..18c23c5a2a5d 100644 --- a/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h +++ b/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h @@ -515,7 +515,7 @@ public: virtual void registerEHFrames(); - virtual void deregisterEHFrames(); + void deregisterEHFrames(); virtual Error finalizeLoad(const ObjectFile &ObjImg, ObjSectionToIDMap &SectionMap) { diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h index 0398413e1532..6aa1a2bdb926 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h @@ -217,7 +217,6 @@ public: } void registerEHFrames() override {} - void deregisterEHFrames() override {} }; } diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h index 8c6af0bd9c6d..318afa21a88b 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h @@ -316,7 +316,6 @@ public: } void registerEHFrames() override {} - void deregisterEHFrames() override {} }; } diff --git a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h index 109beb36f1ee..26e73989d7ed 100644 --- a/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h +++ b/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h @@ -194,9 +194,6 @@ public: } UnregisteredEHFrameSections.clear(); } - void deregisterEHFrames() override { - // Stub - } Error finalizeLoad(const ObjectFile &Obj, ObjSectionToIDMap &SectionMap) override { // Look for and record the EH frame section IDs. diff --git a/lib/Fuzzer/FuzzerDriver.cpp b/lib/Fuzzer/FuzzerDriver.cpp index b85ba210afb3..e93c79cfcec6 100644 --- a/lib/Fuzzer/FuzzerDriver.cpp +++ b/lib/Fuzzer/FuzzerDriver.cpp @@ -656,7 +656,8 @@ int FuzzerDriver(int *argc, char ***argv, UserCallback Callback) { SMR.WaitClient(); size_t Size = SMR.ReadByteArraySize(); SMR.WriteByteArray(nullptr, 0); - F->RunOne(SMR.GetByteArray(), Size); + const Unit tmp(SMR.GetByteArray(), SMR.GetByteArray() + Size); + F->RunOne(tmp.data(), tmp.size()); SMR.PostServer(); } return 0; diff --git a/lib/Fuzzer/FuzzerFlags.def b/lib/Fuzzer/FuzzerFlags.def index 0a1ff1b1df6a..7ff196c8fa96 100644 --- a/lib/Fuzzer/FuzzerFlags.def +++ b/lib/Fuzzer/FuzzerFlags.def @@ -92,10 +92,10 @@ FUZZER_FLAG_INT(print_pcs, 0, "If 1, print out newly covered PCs.") FUZZER_FLAG_INT(print_final_stats, 0, "If 1, print statistics at exit.") FUZZER_FLAG_INT(print_corpus_stats, 0, "If 1, print statistics on corpus elements at exit.") -FUZZER_FLAG_INT(print_coverage, 0, "If 1, print coverage information at exit." - " Experimental, only with trace-pc-guard") -FUZZER_FLAG_INT(dump_coverage, 0, "If 1, dump coverage information at exit." 
- " Experimental, only with trace-pc-guard") +FUZZER_FLAG_INT(print_coverage, 0, "If 1, print coverage information as text" + " at exit.") +FUZZER_FLAG_INT(dump_coverage, 0, "If 1, dump coverage information as a" + " .sancov file at exit.") FUZZER_FLAG_INT(handle_segv, 1, "If 1, try to intercept SIGSEGV.") FUZZER_FLAG_INT(handle_bus, 1, "If 1, try to intercept SIGBUS.") FUZZER_FLAG_INT(handle_abrt, 1, "If 1, try to intercept SIGABRT.") diff --git a/lib/Fuzzer/FuzzerInternal.h b/lib/Fuzzer/FuzzerInternal.h index ad067ee2c0d9..5f184c2316e2 100644 --- a/lib/Fuzzer/FuzzerInternal.h +++ b/lib/Fuzzer/FuzzerInternal.h @@ -91,6 +91,7 @@ public: private: void AlarmCallback(); void CrashCallback(); + void CrashOnOverwrittenData(); void InterruptCallback(); void MutateAndTestOne(); void ReportNewCoverage(InputInfo *II, const Unit &U); diff --git a/lib/Fuzzer/FuzzerLoop.cpp b/lib/Fuzzer/FuzzerLoop.cpp index d84c3dbdaf77..14caa203c5ef 100644 --- a/lib/Fuzzer/FuzzerLoop.cpp +++ b/lib/Fuzzer/FuzzerLoop.cpp @@ -422,6 +422,24 @@ size_t Fuzzer::GetCurrentUnitInFuzzingThead(const uint8_t **Data) const { return CurrentUnitSize; } +void Fuzzer::CrashOnOverwrittenData() { + Printf("==%d== ERROR: libFuzzer: fuzz target overwrites it's const input\n", + GetPid()); + DumpCurrentUnit("crash-"); + Printf("SUMMARY: libFuzzer: out-of-memory\n"); + _Exit(Options.ErrorExitCode); // Stop right now. +} + +// Compare two arrays, but not all bytes if the arrays are large. +static bool LooseMemeq(const uint8_t *A, const uint8_t *B, size_t Size) { + const size_t Limit = 64; + if (Size <= 64) + return !memcmp(A, B, Size); + // Compare first and last Limit/2 bytes. + return !memcmp(A, B, Limit / 2) && + !memcmp(A + Size - Limit / 2, B + Size - Limit / 2, Limit / 2); +} + void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) { assert(InFuzzingThread()); if (SMR.IsClient()) @@ -443,6 +461,8 @@ void Fuzzer::ExecuteCallback(const uint8_t *Data, size_t Size) { (void)Res; assert(Res == 0); HasMoreMallocsThanFrees = AllocTracer.Stop(); + if (!LooseMemeq(DataCopy, Data, Size)) + CrashOnOverwrittenData(); CurrentUnitSize = 0; delete[] DataCopy; } diff --git a/lib/Fuzzer/FuzzerMutate.cpp b/lib/Fuzzer/FuzzerMutate.cpp index cd846c7deec5..e60d4130de10 100644 --- a/lib/Fuzzer/FuzzerMutate.cpp +++ b/lib/Fuzzer/FuzzerMutate.cpp @@ -217,11 +217,12 @@ DictionaryEntry MutationDispatcher::MakeDictionaryEntryFromCMP( size_t NumPositions = 0; for (const uint8_t *Cur = Data; Cur < End && NumPositions < kMaxNumPositions; Cur++) { - Cur = (uint8_t *)SearchMemory(Cur, End - Cur, ExistingBytes, ArgSize); + Cur = + (const uint8_t *)SearchMemory(Cur, End - Cur, ExistingBytes, ArgSize); if (!Cur) break; Positions[NumPositions++] = Cur - Data; } - if (!NumPositions) break; + if (!NumPositions) continue; return DictionaryEntry(W, Positions[Rand(NumPositions)]); } DictionaryEntry DE(W); diff --git a/lib/Fuzzer/afl/afl_driver.cpp b/lib/Fuzzer/afl/afl_driver.cpp index b3a54e57fceb..3815ed11cf60 100644 --- a/lib/Fuzzer/afl/afl_driver.cpp +++ b/lib/Fuzzer/afl/afl_driver.cpp @@ -59,6 +59,11 @@ statistics from the file. If that fails then the process will quit. #include #include #include + +#include +#include +#include + // Platform detection. Copied from FuzzerInternal.h #ifdef __linux__ #define LIBFUZZER_LINUX 1 @@ -245,17 +250,39 @@ extern "C" size_t LLVMFuzzerMutate(uint8_t *Data, size_t Size, size_t MaxSize) { return 0; } +// Execute any files provided as parameters. 
+int ExecuteFilesOnyByOne(int argc, char **argv) { + for (int i = 1; i < argc; i++) { + std::ifstream in(argv[i]); + in.seekg(0, in.end); + size_t length = in.tellg(); + in.seekg(0, in.beg); + std::cout << "Reading " << length << " bytes from " << argv[i] << std::endl; + // Allocate exactly length bytes so that we reliably catch buffer overflows. + std::vector<char> bytes(length); + in.read(bytes.data(), bytes.size()); + assert(in); + LLVMFuzzerTestOneInput(reinterpret_cast<const uint8_t *>(bytes.data()), + bytes.size()); + std::cout << "Execution successful" << std::endl; + } + return 0; +} + int main(int argc, char **argv) { - fprintf(stderr, "======================= INFO =========================\n" - "This binary is built for AFL-fuzz.\n" - "To run the target function on a single input execute this:\n" - " %s < INPUT_FILE\n" - "To run the fuzzing execute this:\n" - " afl-fuzz [afl-flags] %s [N] " - "-- run N fuzzing iterations before " - "re-spawning the process (default: 1000)\n" - "======================================================\n", - argv[0], argv[0]); + fprintf(stderr, + "======================= INFO =========================\n" + "This binary is built for AFL-fuzz.\n" + "To run the target function on individual input(s) execute this:\n" + " %s < INPUT_FILE\n" + "or\n" + " %s INPUT_FILE1 [INPUT_FILE2 ... ]\n" + "To fuzz with afl-fuzz execute this:\n" + " afl-fuzz [afl-flags] %s [-N]\n" + "afl-fuzz will run N iterations before " + "re-spawning the process (default: 1000)\n" + "======================================================\n", + argv[0], argv[0], argv[0]); if (LLVMFuzzerInitialize) LLVMFuzzerInitialize(&argc, &argv); // Do any other expensive one-time initialization here. @@ -266,8 +293,14 @@ int main(int argc, char **argv) { __afl_manual_init(); int N = 1000; - if (argc >= 2) - N = atoi(argv[1]); + if (argc == 2 && argv[1][0] == '-') + N = atoi(argv[1] + 1); + else if (argc == 2 && (N = atoi(argv[1])) > 0) + fprintf(stderr, "WARNING: using the deprecated call style `%s %d`\n", + argv[0], N); + else if (argc > 1) + return ExecuteFilesOnyByOne(argc, argv); + assert(N > 0); time_t unit_time_secs; int num_runs = 0; diff --git a/lib/Fuzzer/test/AFLDriverTest.cpp b/lib/Fuzzer/test/AFLDriverTest.cpp index 3dd0b6117305..e3f5f7100883 100644 --- a/lib/Fuzzer/test/AFLDriverTest.cpp +++ b/lib/Fuzzer/test/AFLDriverTest.cpp @@ -4,19 +4,25 @@ // Contains dummy functions used to avoid dependency on AFL. #include #include +#include <stdio.h> extern "C" void __afl_manual_init() {} -extern "C" int __afl_persistent_loop(unsigned int) { +extern "C" int __afl_persistent_loop(unsigned int N) { + static int Count = N; + fprintf(stderr, "__afl_persistent_loop called, Count = %d\n", Count); + if (Count--) return 1; return 0; } // This declaration exists to prevent the Darwin linker // from complaining about this being a missing weak symbol.
extern "C" int LLVMFuzzerInitialize(int *argc, char ***argv) { + fprintf(stderr, "LLVMFuzzerInitialize called\n"); return 0; } extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { + fprintf(stderr, "LLVMFuzzerTestOneInput called; Size = %zd\n", Size); return 0; } diff --git a/lib/Fuzzer/test/CMakeLists.txt b/lib/Fuzzer/test/CMakeLists.txt index cd049d3f03d8..b39938a705f6 100644 --- a/lib/Fuzzer/test/CMakeLists.txt +++ b/lib/Fuzzer/test/CMakeLists.txt @@ -104,6 +104,7 @@ set(Tests OneHugeAllocTest OutOfMemoryTest OutOfMemorySingleLargeMallocTest + OverwriteInputTest RepeatedMemcmp RepeatedBytesTest SimpleCmpTest diff --git a/lib/Fuzzer/test/OverwriteInputTest.cpp b/lib/Fuzzer/test/OverwriteInputTest.cpp new file mode 100644 index 000000000000..e688682346a6 --- /dev/null +++ b/lib/Fuzzer/test/OverwriteInputTest.cpp @@ -0,0 +1,13 @@ +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. + +// Simple test for a fuzzer. Make sure we abort if Data is overwritten. +#include +#include + +extern "C" int LLVMFuzzerTestOneInput(const uint8_t *Data, size_t Size) { + if (Size) + *const_cast(Data) = 1; + return 0; +} + diff --git a/lib/Fuzzer/test/afl-driver.test b/lib/Fuzzer/test/afl-driver.test new file mode 100644 index 000000000000..6eab23cc3636 --- /dev/null +++ b/lib/Fuzzer/test/afl-driver.test @@ -0,0 +1,26 @@ +REQUIRES: linux +RUN: echo -n "abc" > %t.file3 +RUN: echo -n "abcd" > %t.file4 + +RUN: AFLDriverTest < %t.file3 2>&1 | FileCheck %s --check-prefix=CHECK1 +CHECK1: __afl_persistent_loop calle, Count = 1000 +CHECK1: LLVMFuzzerTestOneInput called; Size = 3 + + +RUN: AFLDriverTest < %t.file3 -42 2>&1 | FileCheck %s --check-prefix=CHECK2 +CHECK2: __afl_persistent_loop calle, Count = 42 +CHECK2: LLVMFuzzerTestOneInput called; Size = 3 + + +RUN: AFLDriverTest < %t.file3 666 2>&1 | FileCheck %s --check-prefix=CHECK3 +CHECK3: WARNING: using the deprecated call style +CHECK3: __afl_persistent_loop calle, Count = 666 +CHECK3: LLVMFuzzerTestOneInput called; Size = 3 + + +RUN: AFLDriverTest %t.file3 2>&1 | FileCheck %s --check-prefix=CHECK4 +CHECK4: LLVMFuzzerTestOneInput called; Size = 3 + +RUN: AFLDriverTest %t.file3 %t.file4 2>&1 | FileCheck %s --check-prefix=CHECK5 +CHECK5: LLVMFuzzerTestOneInput called; Size = 3 +CHECK5: LLVMFuzzerTestOneInput called; Size = 4 diff --git a/lib/Fuzzer/test/overwrite-input.test b/lib/Fuzzer/test/overwrite-input.test new file mode 100644 index 000000000000..81c27909e8df --- /dev/null +++ b/lib/Fuzzer/test/overwrite-input.test @@ -0,0 +1,2 @@ +RUN: not LLVMFuzzer-OverwriteInputTest 2>&1 | FileCheck %s +CHECK: ERROR: libFuzzer: fuzz target overwrites it's const input diff --git a/lib/IR/AsmWriter.cpp b/lib/IR/AsmWriter.cpp index 4c6e3e3788bd..ec4663018bd4 100644 --- a/lib/IR/AsmWriter.cpp +++ b/lib/IR/AsmWriter.cpp @@ -805,6 +805,9 @@ void SlotTracker::processModule() { if (!Var.hasName()) CreateModuleSlot(&Var); processGlobalObjectMetadata(Var); + auto Attrs = Var.getAttributes(); + if (Attrs.hasAttributes()) + CreateAttributeSetSlot(Attrs); } for (const GlobalAlias &A : TheModule->aliases()) { @@ -2502,6 +2505,10 @@ void AssemblyWriter::printGlobal(const GlobalVariable *GV) { GV->getAllMetadata(MDs); printMetadataAttachments(MDs, ", "); + auto Attrs = GV->getAttributes(); + if (Attrs.hasAttributes()) + Out << " #" << Machine.getAttributeGroupSlot(Attrs); + printInfoComment(*GV); } diff --git a/lib/IR/AttributeImpl.h b/lib/IR/AttributeImpl.h index cf2925254695..acfac316e91e 100644 --- 
a/lib/IR/AttributeImpl.h +++ b/lib/IR/AttributeImpl.h @@ -1,4 +1,4 @@ -//===-- AttributeImpl.h - Attribute Internals -------------------*- C++ -*-===// +//===- AttributeImpl.h - Attribute Internals --------------------*- C++ -*-===// // // The LLVM Compiler Infrastructure // @@ -21,9 +21,7 @@ #include "llvm/ADT/StringRef.h" #include "llvm/IR/Attributes.h" #include "llvm/Support/TrailingObjects.h" -#include #include -#include #include #include #include @@ -80,11 +78,13 @@ public: else Profile(ID, getKindAsString(), getValueAsString()); } + static void Profile(FoldingSetNodeID &ID, Attribute::AttrKind Kind, uint64_t Val) { ID.AddInteger(Kind); if (Val) ID.AddInteger(Val); } + static void Profile(FoldingSetNodeID &ID, StringRef Kind, StringRef Values) { ID.AddString(Kind); if (!Values.empty()) ID.AddString(Values); @@ -114,9 +114,10 @@ public: }; class IntAttributeImpl : public EnumAttributeImpl { - void anchor() override; uint64_t Val; + void anchor() override; + public: IntAttributeImpl(Attribute::AttrKind Kind, uint64_t Val) : EnumAttributeImpl(IntAttrEntry, Kind), Val(Val) { @@ -188,20 +189,22 @@ public: std::pair> getAllocSizeArgs() const; std::string getAsString(bool InAttrGrp) const; - typedef const Attribute *iterator; + using iterator = const Attribute *; + iterator begin() const { return getTrailingObjects(); } iterator end() const { return begin() + NumAttrs; } void Profile(FoldingSetNodeID &ID) const { Profile(ID, makeArrayRef(begin(), end())); } + static void Profile(FoldingSetNodeID &ID, ArrayRef AttrList) { for (const auto &Attr : AttrList) Attr.Profile(ID); } }; -typedef std::pair IndexAttrPair; +using IndexAttrPair = std::pair; //===----------------------------------------------------------------------===// /// \class @@ -265,7 +268,8 @@ public: return AvailableFunctionAttrs & ((uint64_t)1) << Kind; } - typedef AttributeSet::iterator iterator; + using iterator = AttributeSet::iterator; + iterator begin(unsigned Slot) const { return getSlotAttributes(Slot).begin(); } diff --git a/lib/IR/Attributes.cpp b/lib/IR/Attributes.cpp index 3b1140ab542c..ce60367a6c8b 100644 --- a/lib/IR/Attributes.cpp +++ b/lib/IR/Attributes.cpp @@ -34,6 +34,8 @@ #include "llvm/Support/raw_ostream.h" #include #include +#include +#include #include #include #include @@ -504,16 +506,74 @@ AttributeSet AttributeSet::get(LLVMContext &C, ArrayRef Attrs) { return AttributeSet(AttributeSetNode::get(C, Attrs)); } +AttributeSet AttributeSet::addAttribute(LLVMContext &C, + Attribute::AttrKind Kind) const { + if (hasAttribute(Kind)) return *this; + AttrBuilder B; + B.addAttribute(Kind); + return addAttributes(C, AttributeSet::get(C, B)); +} + +AttributeSet AttributeSet::addAttribute(LLVMContext &C, StringRef Kind, + StringRef Value) const { + AttrBuilder B; + B.addAttribute(Kind, Value); + return addAttributes(C, AttributeSet::get(C, B)); +} + +AttributeSet AttributeSet::addAttributes(LLVMContext &C, + const AttributeSet AS) const { + if (!hasAttributes()) + return AS; + + if (!AS.hasAttributes()) + return *this; + + AttrBuilder B(AS); + for (Attribute I : *this) + B.addAttribute(I); + + return get(C, B); +} + +AttributeSet AttributeSet::removeAttribute(LLVMContext &C, + Attribute::AttrKind Kind) const { + if (!hasAttribute(Kind)) return *this; + AttrBuilder B; + B.addAttribute(Kind); + return removeAttributes(C, B); +} + +AttributeSet AttributeSet::removeAttribute(LLVMContext &C, + StringRef Kind) const { + if (!hasAttribute(Kind)) return *this; + AttrBuilder B; + B.addAttribute(Kind); + return removeAttributes(C, 
B); +} + +AttributeSet AttributeSet::removeAttributes(LLVMContext &C, + const AttrBuilder &Attrs) const { + + // FIXME it is not obvious how this should work for alignment. + // For now, say we can't pass in alignment, which no current use does. + assert(!Attrs.hasAlignmentAttr() && "Attempt to change alignment!"); + + AttrBuilder B(*this); + B.remove(Attrs); + return get(C, B); +} + unsigned AttributeSet::getNumAttributes() const { return SetNode ? SetNode->getNumAttributes() : 0; } bool AttributeSet::hasAttribute(Attribute::AttrKind Kind) const { - return SetNode ? SetNode->hasAttribute(Kind) : 0; + return SetNode ? SetNode->hasAttribute(Kind) : false; } bool AttributeSet::hasAttribute(StringRef Kind) const { - return SetNode ? SetNode->hasAttribute(Kind) : 0; + return SetNode ? SetNode->hasAttribute(Kind) : false; } Attribute AttributeSet::getAttribute(Attribute::AttrKind Kind) const { @@ -557,6 +617,14 @@ AttributeSet::iterator AttributeSet::end() const { return SetNode ? SetNode->end() : nullptr; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +LLVM_DUMP_METHOD void AttributeSet::dump() const { + dbgs() << "AS =\n"; + dbgs() << " { "; + dbgs() << getAsString(true) << " }\n"; +} +#endif + //===----------------------------------------------------------------------===// // AttributeSetNode Definition //===----------------------------------------------------------------------===// diff --git a/lib/IR/ConstantFold.cpp b/lib/IR/ConstantFold.cpp index 80b117015ede..a20f3f811c8d 100644 --- a/lib/IR/ConstantFold.cpp +++ b/lib/IR/ConstantFold.cpp @@ -2041,9 +2041,6 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C, Optional InRangeIndex, ArrayRef Idxs) { if (Idxs.empty()) return C; - Constant *Idx0 = cast(Idxs[0]); - if ((Idxs.size() == 1 && Idx0->isNullValue())) - return C; if (isa(C)) { Type *GEPTy = GetElementPtrInst::getGEPReturnType( @@ -2051,10 +2048,15 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C, return UndefValue::get(GEPTy); } + Constant *Idx0 = cast(Idxs[0]); + if (Idxs.size() == 1 && (Idx0->isNullValue() || isa(Idx0))) + return C; + if (C->isNullValue()) { bool isNull = true; for (unsigned i = 0, e = Idxs.size(); i != e; ++i) - if (!cast(Idxs[i])->isNullValue()) { + if (!isa(Idxs[i]) && + !cast(Idxs[i])->isNullValue()) { isNull = false; break; } diff --git a/lib/IR/ConstantRange.cpp b/lib/IR/ConstantRange.cpp index aeb1257754f3..509caba3acd4 100644 --- a/lib/IR/ConstantRange.cpp +++ b/lib/IR/ConstantRange.cpp @@ -278,7 +278,7 @@ APInt ConstantRange::getUnsignedMax() const { } APInt ConstantRange::getUnsignedMin() const { - if (isFullSet() || (isWrappedSet() && getUpper() != 0)) + if (isFullSet() || (isWrappedSet() && !getUpper().isNullValue())) return APInt::getMinValue(getBitWidth()); return getLower(); } @@ -442,7 +442,7 @@ ConstantRange ConstantRange::unionWith(const ConstantRange &CR) const { APInt L = CR.Lower.ult(Lower) ? CR.Lower : Lower; APInt U = (CR.Upper - 1).ugt(Upper - 1) ? CR.Upper : Upper; - if (L == 0 && U == 0) + if (L.isNullValue() && U.isNullValue()) return ConstantRange(getBitWidth()); return ConstantRange(std::move(L), std::move(U)); @@ -757,7 +757,8 @@ ConstantRange::multiply(const ConstantRange &Other) const { // from one positive number to another which is as good as we can generate. // In this case, skip the extra work of generating signed ranges which aren't // going to be better than this range. 
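// (Worked example, illustration only: for i8 ranges X = [2, 5) and
// Y = [3, 4), the unsigned computation gives [2*3, 4*3 + 1) = [6, 13), which
// already covers every concrete product {6, 9, 12}; that range does not wrap
// and its upper end is still non-negative as a signed value, so the signed
// computation below could not produce anything tighter and is skipped.)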
- if (!UR.isWrappedSet() && UR.getLower().isNonNegative()) + if (!UR.isWrappedSet() && + (UR.getUpper().isNonNegative() || UR.getUpper().isMinSignedValue())) return UR; // Now the signed range. Because we could be dealing with negative numbers @@ -834,7 +835,7 @@ ConstantRange::umin(const ConstantRange &Other) const { ConstantRange ConstantRange::udiv(const ConstantRange &RHS) const { - if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax() == 0) + if (isEmptySet() || RHS.isEmptySet() || RHS.getUnsignedMax().isNullValue()) return ConstantRange(getBitWidth(), /*isFullSet=*/false); if (RHS.isFullSet()) return ConstantRange(getBitWidth(), /*isFullSet=*/true); @@ -842,7 +843,7 @@ ConstantRange::udiv(const ConstantRange &RHS) const { APInt Lower = getUnsignedMin().udiv(RHS.getUnsignedMax()); APInt RHS_umin = RHS.getUnsignedMin(); - if (RHS_umin == 0) { + if (RHS_umin.isNullValue()) { // We want the lowest value in RHS excluding zero. Usually that would be 1 // except for a range in the form of [X, 1) in which case it would be X. if (RHS.getUpper() == 1) @@ -892,29 +893,33 @@ ConstantRange::shl(const ConstantRange &Other) const { if (isEmptySet() || Other.isEmptySet()) return ConstantRange(getBitWidth(), /*isFullSet=*/false); - APInt min = getUnsignedMin().shl(Other.getUnsignedMin()); - APInt max = getUnsignedMax().shl(Other.getUnsignedMax()); + APInt max = getUnsignedMax(); + APInt Other_umax = Other.getUnsignedMax(); - // there's no overflow! - APInt Zeros(getBitWidth(), getUnsignedMax().countLeadingZeros()); - if (Zeros.ugt(Other.getUnsignedMax())) - return ConstantRange(std::move(min), std::move(max) + 1); + // there's overflow! + if (Other_umax.uge(max.countLeadingZeros())) + return ConstantRange(getBitWidth(), /*isFullSet=*/true); // FIXME: implement the other tricky cases - return ConstantRange(getBitWidth(), /*isFullSet=*/true); + + APInt min = getUnsignedMin(); + min <<= Other.getUnsignedMin(); + max <<= Other_umax; + + return ConstantRange(std::move(min), std::move(max) + 1); } ConstantRange ConstantRange::lshr(const ConstantRange &Other) const { if (isEmptySet() || Other.isEmptySet()) return ConstantRange(getBitWidth(), /*isFullSet=*/false); - - APInt max = getUnsignedMax().lshr(Other.getUnsignedMin()); + + APInt max = getUnsignedMax().lshr(Other.getUnsignedMin()) + 1; APInt min = getUnsignedMin().lshr(Other.getUnsignedMax()); - if (min == max + 1) + if (min == max) return ConstantRange(getBitWidth(), /*isFullSet=*/true); - return ConstantRange(std::move(min), std::move(max) + 1); + return ConstantRange(std::move(min), std::move(max)); } ConstantRange ConstantRange::inverse() const { diff --git a/lib/IR/Constants.cpp b/lib/IR/Constants.cpp index ffc8f2e4303b..4b9d89cda539 100644 --- a/lib/IR/Constants.cpp +++ b/lib/IR/Constants.cpp @@ -30,7 +30,7 @@ #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include -#include + using namespace llvm; //===----------------------------------------------------------------------===// @@ -966,16 +966,6 @@ Constant *ConstantStruct::get(StructType *ST, ArrayRef V) { return ST->getContext().pImpl->StructConstants.getOrCreate(ST, V); } -Constant *ConstantStruct::get(StructType *T, ...) 
{ - va_list ap; - SmallVector Values; - va_start(ap, T); - while (Constant *Val = va_arg(ap, llvm::Constant*)) - Values.push_back(Val); - va_end(ap); - return get(T, Values); -} - ConstantVector::ConstantVector(VectorType *T, ArrayRef V) : ConstantAggregate(T, ConstantVectorVal, V) { assert(V.size() == T->getNumElements() && @@ -1810,8 +1800,7 @@ Constant *ConstantExpr::getSizeOf(Type* Ty) { Constant *ConstantExpr::getAlignOf(Type* Ty) { // alignof is implemented as: (i64) gep ({i1,Ty}*)null, 0, 1 // Note that a non-inbounds gep is used, as null isn't within any object. - Type *AligningTy = - StructType::get(Type::getInt1Ty(Ty->getContext()), Ty, nullptr); + Type *AligningTy = StructType::get(Type::getInt1Ty(Ty->getContext()), Ty); Constant *NullPtr = Constant::getNullValue(AligningTy->getPointerTo(0)); Constant *Zero = ConstantInt::get(Type::getInt64Ty(Ty->getContext()), 0); Constant *One = ConstantInt::get(Type::getInt32Ty(Ty->getContext()), 1); diff --git a/lib/IR/ConstantsContext.h b/lib/IR/ConstantsContext.h index eda751d8af4a..25eb9452d9d0 100644 --- a/lib/IR/ConstantsContext.h +++ b/lib/IR/ConstantsContext.h @@ -22,6 +22,7 @@ #include "llvm/ADT/None.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringRef.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/InlineAsm.h" @@ -387,31 +388,34 @@ struct ConstantExprKeyType; template struct ConstantInfo; template <> struct ConstantInfo { - typedef ConstantExprKeyType ValType; - typedef Type TypeClass; + using ValType = ConstantExprKeyType; + using TypeClass = Type; }; template <> struct ConstantInfo { - typedef InlineAsmKeyType ValType; - typedef PointerType TypeClass; + using ValType = InlineAsmKeyType; + using TypeClass = PointerType; }; template <> struct ConstantInfo { - typedef ConstantAggrKeyType ValType; - typedef ArrayType TypeClass; + using ValType = ConstantAggrKeyType; + using TypeClass = ArrayType; }; template <> struct ConstantInfo { - typedef ConstantAggrKeyType ValType; - typedef StructType TypeClass; + using ValType = ConstantAggrKeyType; + using TypeClass = StructType; }; template <> struct ConstantInfo { - typedef ConstantAggrKeyType ValType; - typedef VectorType TypeClass; + using ValType = ConstantAggrKeyType; + using TypeClass = VectorType; }; template struct ConstantAggrKeyType { ArrayRef Operands; + ConstantAggrKeyType(ArrayRef Operands) : Operands(Operands) {} + ConstantAggrKeyType(ArrayRef Operands, const ConstantClass *) : Operands(Operands) {} + ConstantAggrKeyType(const ConstantClass *C, SmallVectorImpl &Storage) { assert(Storage.empty() && "Expected empty storage"); @@ -437,7 +441,8 @@ template struct ConstantAggrKeyType { return hash_combine_range(Operands.begin(), Operands.end()); } - typedef typename ConstantInfo::TypeClass TypeClass; + using TypeClass = typename ConstantInfo::TypeClass; + ConstantClass *create(TypeClass *Ty) const { return new (Operands.size()) ConstantClass(Ty, Operands); } @@ -457,6 +462,7 @@ struct InlineAsmKeyType { : AsmString(AsmString), Constraints(Constraints), FTy(FTy), HasSideEffects(HasSideEffects), IsAlignStack(IsAlignStack), AsmDialect(AsmDialect) {} + InlineAsmKeyType(const InlineAsm *Asm, SmallVectorImpl &) : AsmString(Asm->getAsmString()), Constraints(Asm->getConstraintString()), FTy(Asm->getFunctionType()), HasSideEffects(Asm->hasSideEffects()), @@ -483,7 +489,8 @@ struct InlineAsmKeyType { AsmDialect, FTy); } - typedef ConstantInfo::TypeClass TypeClass; + using TypeClass = ConstantInfo::TypeClass; + 
InlineAsm *create(TypeClass *Ty) const { assert(PointerType::getUnqual(FTy) == Ty); return new InlineAsm(FTy, AsmString, Constraints, HasSideEffects, @@ -507,11 +514,13 @@ struct ConstantExprKeyType { : Opcode(Opcode), SubclassOptionalData(SubclassOptionalData), SubclassData(SubclassData), Ops(Ops), Indexes(Indexes), ExplicitTy(ExplicitTy) {} + ConstantExprKeyType(ArrayRef Operands, const ConstantExpr *CE) : Opcode(CE->getOpcode()), SubclassOptionalData(CE->getRawSubclassOptionalData()), SubclassData(CE->isCompare() ? CE->getPredicate() : 0), Ops(Operands), Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef()) {} + ConstantExprKeyType(const ConstantExpr *CE, SmallVectorImpl &Storage) : Opcode(CE->getOpcode()), @@ -553,7 +562,8 @@ struct ConstantExprKeyType { hash_combine_range(Indexes.begin(), Indexes.end())); } - typedef ConstantInfo::TypeClass TypeClass; + using TypeClass = ConstantInfo::TypeClass; + ConstantExpr *create(TypeClass *Ty) const { switch (Opcode) { default: @@ -594,16 +604,17 @@ struct ConstantExprKeyType { template class ConstantUniqueMap { public: - typedef typename ConstantInfo::ValType ValType; - typedef typename ConstantInfo::TypeClass TypeClass; - typedef std::pair LookupKey; + using ValType = typename ConstantInfo::ValType; + using TypeClass = typename ConstantInfo::TypeClass; + using LookupKey = std::pair; /// Key and hash together, so that we compute the hash only once and reuse it. - typedef std::pair LookupKeyHashed; + using LookupKeyHashed = std::pair; private: struct MapInfo { - typedef DenseMapInfo ConstantClassInfo; + using ConstantClassInfo = DenseMapInfo; + static inline ConstantClass *getEmptyKey() { return ConstantClassInfo::getEmptyKey(); } @@ -643,7 +654,7 @@ private: }; public: - typedef DenseSet MapTy; + using MapTy = DenseSet; private: MapTy Map; diff --git a/lib/IR/DebugInfoMetadata.cpp b/lib/IR/DebugInfoMetadata.cpp index cdbe237766a3..e6c49cad0722 100644 --- a/lib/IR/DebugInfoMetadata.cpp +++ b/lib/IR/DebugInfoMetadata.cpp @@ -672,6 +672,24 @@ void DIExpression::appendOffset(SmallVectorImpl &Ops, } } +bool DIExpression::extractIfOffset(int64_t &Offset) const { + if (getNumElements() == 0) { + Offset = 0; + return true; + } + if (getNumElements() != 2) + return false; + if (Elements[0] == dwarf::DW_OP_plus) { + Offset = Elements[1]; + return true; + } + if (Elements[0] == dwarf::DW_OP_minus) { + Offset = -Elements[1]; + return true; + } + return false; +} + DIExpression *DIExpression::prepend(const DIExpression *Expr, bool Deref, int64_t Offset, bool StackValue) { SmallVector Ops; diff --git a/lib/IR/DebugLoc.cpp b/lib/IR/DebugLoc.cpp index f31074a7ad44..3168ec6944a3 100644 --- a/lib/IR/DebugLoc.cpp +++ b/lib/IR/DebugLoc.cpp @@ -8,6 +8,7 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/DebugLoc.h" +#include "llvm/IR/IntrinsicInst.h" #include "LLVMContextImpl.h" #include "llvm/IR/DebugInfo.h" using namespace llvm; @@ -66,6 +67,119 @@ DebugLoc DebugLoc::get(unsigned Line, unsigned Col, const MDNode *Scope, const_cast(InlinedAt)); } +DebugLoc DebugLoc::appendInlinedAt(DebugLoc DL, DILocation *InlinedAt, + LLVMContext &Ctx, + DenseMap &Cache, + bool ReplaceLast) { + SmallVector InlinedAtLocations; + DILocation *Last = InlinedAt; + DILocation *CurInlinedAt = DL; + + // Gather all the inlined-at nodes. + while (DILocation *IA = CurInlinedAt->getInlinedAt()) { + // Skip any we've already built nodes for. 
+ if (auto *Found = Cache[IA]) { + Last = cast(Found); + break; + } + + if (ReplaceLast && !IA->getInlinedAt()) + break; + InlinedAtLocations.push_back(IA); + CurInlinedAt = IA; + } + + // Starting from the top, rebuild the nodes to point to the new inlined-at + // location (then rebuilding the rest of the chain behind it) and update the + // map of already-constructed inlined-at nodes. + for (const DILocation *MD : reverse(InlinedAtLocations)) + Cache[MD] = Last = DILocation::getDistinct( + Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last); + + return Last; +} + +/// Reparent \c Scope from \c OrigSP to \c NewSP. +static DIScope *reparentScope(LLVMContext &Ctx, DIScope *Scope, + DISubprogram *OrigSP, DISubprogram *NewSP, + DenseMap &Cache) { + SmallVector ScopeChain; + DIScope *Last = NewSP; + DIScope *CurScope = Scope; + do { + if (auto *SP = dyn_cast(CurScope)) { + // Don't rewrite this scope chain if it doesn't lead to the replaced SP. + if (SP != OrigSP) + return Scope; + Cache.insert({OrigSP, NewSP}); + break; + } + if (auto *Found = Cache[CurScope]) { + Last = cast(Found); + break; + } + ScopeChain.push_back(CurScope); + } while ((CurScope = CurScope->getScope().resolve())); + + // Starting from the top, rebuild the nodes to point to the new inlined-at + // location (then rebuilding the rest of the chain behind it) and update the + // map of already-constructed inlined-at nodes. + for (const DIScope *MD : reverse(ScopeChain)) { + if (auto *LB = dyn_cast(MD)) + Cache[MD] = Last = DILexicalBlock::getDistinct( + Ctx, Last, LB->getFile(), LB->getLine(), LB->getColumn()); + else if (auto *LB = dyn_cast(MD)) + Cache[MD] = Last = DILexicalBlockFile::getDistinct( + Ctx, Last, LB->getFile(), LB->getDiscriminator()); + else + llvm_unreachable("illegal parent scope"); + } + return Last; +} + +void DebugLoc::reparentDebugInfo(Instruction &I, DISubprogram *OrigSP, + DISubprogram *NewSP, + DenseMap &Cache) { + auto DL = I.getDebugLoc(); + if (!OrigSP || !NewSP || OrigSP == NewSP || !DL) + return; + + // Reparent the debug location. + auto &Ctx = I.getContext(); + DILocation *InlinedAt = DL->getInlinedAt(); + if (InlinedAt) { + while (auto *IA = InlinedAt->getInlinedAt()) + InlinedAt = IA; + auto NewScope = + reparentScope(Ctx, InlinedAt->getScope(), OrigSP, NewSP, Cache); + InlinedAt = + DebugLoc::get(InlinedAt->getLine(), InlinedAt->getColumn(), NewScope); + } + I.setDebugLoc( + DebugLoc::get(DL.getLine(), DL.getCol(), + reparentScope(Ctx, DL->getScope(), OrigSP, NewSP, Cache), + DebugLoc::appendInlinedAt(DL, InlinedAt, Ctx, Cache, + ReplaceLastInlinedAt))); + + // Fix up debug variables to point to NewSP. 
+ auto reparentVar = [&](DILocalVariable *Var) { + return DILocalVariable::getDistinct( + Ctx, + cast( + reparentScope(Ctx, Var->getScope(), OrigSP, NewSP, Cache)), + Var->getName(), Var->getFile(), Var->getLine(), Var->getType(), + Var->getArg(), Var->getFlags(), Var->getAlignInBits()); + }; + if (auto *DbgValue = dyn_cast(&I)) { + auto *Var = DbgValue->getVariable(); + I.setOperand(2, MetadataAsValue::get(Ctx, reparentVar(Var))); + } else if (auto *DbgDeclare = dyn_cast(&I)) { + auto *Var = DbgDeclare->getVariable(); + I.setOperand(1, MetadataAsValue::get(Ctx, reparentVar(Var))); + } +} + + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void DebugLoc::dump() const { if (!Loc) diff --git a/lib/IR/DiagnosticInfo.cpp b/lib/IR/DiagnosticInfo.cpp index 395b6158e0c8..e73f53f3202d 100644 --- a/lib/IR/DiagnosticInfo.cpp +++ b/lib/IR/DiagnosticInfo.cpp @@ -12,20 +12,31 @@ // Diagnostics reporting is still done as part of the LLVMContext. //===----------------------------------------------------------------------===// -#include "llvm/IR/DiagnosticInfo.h" -#include "LLVMContextImpl.h" +#include "llvm/ADT/iterator_range.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/Twine.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constants.h" -#include "llvm/IR/DebugInfo.h" +#include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/DiagnosticInfo.h" #include "llvm/IR/DiagnosticPrinter.h" #include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/Instruction.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Support/Regex.h" #include +#include +#include #include using namespace llvm; @@ -53,6 +64,8 @@ struct PassRemarksOpt { } }; +} // end anonymous namespace + static PassRemarksOpt PassRemarksOptLoc; static PassRemarksOpt PassRemarksMissedOptLoc; static PassRemarksOpt PassRemarksAnalysisOptLoc; @@ -85,7 +98,6 @@ PassRemarksAnalysis( "the given regular expression"), cl::Hidden, cl::location(PassRemarksAnalysisOptLoc), cl::ValueRequired, cl::ZeroOrMore); -} int llvm::getNextAvailablePluginDiagnosticKind() { static std::atomic PluginKindID(DK_FirstPluginKind); @@ -97,8 +109,7 @@ const char *OptimizationRemarkAnalysis::AlwaysPrint = ""; DiagnosticInfoInlineAsm::DiagnosticInfoInlineAsm(const Instruction &I, const Twine &MsgStr, DiagnosticSeverity Severity) - : DiagnosticInfo(DK_InlineAsm, Severity), LocCookie(0), MsgStr(MsgStr), - Instr(&I) { + : DiagnosticInfo(DK_InlineAsm, Severity), MsgStr(MsgStr), Instr(&I) { if (const MDNode *SrcLoc = I.getMetadata("srcloc")) { if (SrcLoc->getNumOperands() != 0) if (const auto *CI = @@ -193,7 +204,7 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Value *V // Only include names that correspond to user variables. FIXME: we should use // debug info if available to get the name of the user variable. 
if (isa(V) || isa(V)) - Val = GlobalValue::getRealLinkageName(V->getName()); + Val = GlobalValue::dropLLVMManglingEscape(V->getName()); else if (isa(V)) { raw_string_ostream OS(Val); V->printAsOperand(OS, /*PrintType=*/false); diff --git a/lib/IR/Function.cpp b/lib/IR/Function.cpp index 58c060550322..16a9e51b8306 100644 --- a/lib/IR/Function.cpp +++ b/lib/IR/Function.cpp @@ -1,4 +1,4 @@ -//===-- Function.cpp - Implement the Global object classes ----------------===// +//===- Function.cpp - Implement the Global object classes -----------------===// // // The LLVM Compiler Infrastructure // @@ -11,21 +11,51 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Function.h" #include "LLVMContextImpl.h" #include "SymbolTableListTraitsImpl.h" +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" #include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/CodeGen/ValueTypes.h" +#include "llvm/IR/Argument.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalValue.h" #include "llvm/IR/InstIterator.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/Intrinsics.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/SymbolTableListTraits.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Use.h" +#include "llvm/IR/User.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Compiler.h" +#include "llvm/Support/ErrorHandling.h" +#include +#include +#include +#include +#include +#include + using namespace llvm; // Explicit instantiations of SymbolTableListTraits since some of the methods @@ -36,7 +66,7 @@ template class llvm::SymbolTableListTraits; // Argument Implementation //===----------------------------------------------------------------------===// -void Argument::anchor() { } +void Argument::anchor() {} Argument::Argument(Type *Ty, const Twine &Name, Function *Par, unsigned ArgNo) : Value(Ty, Value::ArgumentVal), Parent(Par), ArgNo(ArgNo) { @@ -186,7 +216,7 @@ Function::Function(FunctionType *Ty, LinkageTypes Linkage, const Twine &name, Module *ParentModule) : GlobalObject(Ty, Value::FunctionVal, OperandTraits::op_begin(this), 0, Linkage, name), - Arguments(nullptr), NumArgs(Ty->getNumParams()) { + NumArgs(Ty->getNumParams()) { assert(FunctionType::isValidReturnType(getReturnType()) && "invalid return type"); setGlobalObjectSubClassData(0); @@ -386,24 +416,20 @@ void Function::clearGC() { /// Copy all additional attributes (those not needed to create a Function) from /// the Function Src to this one. 
-void Function::copyAttributesFrom(const GlobalValue *Src) { +void Function::copyAttributesFrom(const Function *Src) { GlobalObject::copyAttributesFrom(Src); - const Function *SrcF = dyn_cast(Src); - if (!SrcF) - return; - - setCallingConv(SrcF->getCallingConv()); - setAttributes(SrcF->getAttributes()); - if (SrcF->hasGC()) - setGC(SrcF->getGC()); + setCallingConv(Src->getCallingConv()); + setAttributes(Src->getAttributes()); + if (Src->hasGC()) + setGC(Src->getGC()); else clearGC(); - if (SrcF->hasPersonalityFn()) - setPersonalityFn(SrcF->getPersonalityFn()); - if (SrcF->hasPrefixData()) - setPrefixData(SrcF->getPrefixData()); - if (SrcF->hasPrologueData()) - setPrologueData(SrcF->getPrologueData()); + if (Src->hasPersonalityFn()) + setPersonalityFn(Src->getPersonalityFn()); + if (Src->hasPrefixData()) + setPrefixData(Src->getPrefixData()); + if (Src->hasPrologueData()) + setPrologueData(Src->getPrologueData()); } /// Table of string intrinsic names indexed by enum value. @@ -486,10 +512,10 @@ void Function::recalculateIntrinsicID() { static std::string getMangledTypeStr(Type* Ty) { std::string Result; if (PointerType* PTyp = dyn_cast(Ty)) { - Result += "p" + llvm::utostr(PTyp->getAddressSpace()) + + Result += "p" + utostr(PTyp->getAddressSpace()) + getMangledTypeStr(PTyp->getElementType()); } else if (ArrayType* ATyp = dyn_cast(Ty)) { - Result += "a" + llvm::utostr(ATyp->getNumElements()) + + Result += "a" + utostr(ATyp->getNumElements()) + getMangledTypeStr(ATyp->getElementType()); } else if (StructType *STyp = dyn_cast(Ty)) { if (!STyp->isLiteral()) { @@ -534,7 +560,6 @@ std::string Intrinsic::getName(ID id, ArrayRef Tys) { return Result; } - /// IIT_Info - These are enumerators that describe the entries returned by the /// getIntrinsicInfoTableEntries function. 
/// @@ -585,9 +610,10 @@ enum IIT_Info { static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, SmallVectorImpl &OutputTable) { + using namespace Intrinsic; + IIT_Info Info = IIT_Info(Infos[NextElt++]); unsigned StructElts = 2; - using namespace Intrinsic; switch (Info) { case IIT_Done: @@ -742,7 +768,6 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef Infos, llvm_unreachable("unhandled"); } - #define GET_INTRINSIC_GENERATOR_GLOBAL #include "llvm/IR/Intrinsics.gen" #undef GET_INTRINSIC_GENERATOR_GLOBAL @@ -780,10 +805,10 @@ void Intrinsic::getIntrinsicInfoTableEntries(ID id, DecodeIITType(NextElt, IITEntries, T); } - static Type *DecodeFixedType(ArrayRef &Infos, ArrayRef Tys, LLVMContext &Context) { using namespace Intrinsic; + IITDescriptor D = Infos.front(); Infos = Infos.slice(1); @@ -855,12 +880,10 @@ static Type *DecodeFixedType(ArrayRef &Infos, case IITDescriptor::VecOfAnyPtrsToElt: // Return the overloaded type (which determines the pointers address space) return Tys[D.getOverloadArgNumber()]; - } + } llvm_unreachable("unhandled"); } - - FunctionType *Intrinsic::getType(LLVMContext &Context, ID id, ArrayRef Tys) { SmallVector Table; diff --git a/lib/IR/Globals.cpp b/lib/IR/Globals.cpp index 5f338f58d940..17d27b016cf2 100644 --- a/lib/IR/Globals.cpp +++ b/lib/IR/Globals.cpp @@ -69,6 +69,30 @@ void GlobalValue::copyAttributesFrom(const GlobalValue *Src) { setDLLStorageClass(Src->getDLLStorageClass()); } +void GlobalValue::removeFromParent() { + switch (getValueID()) { +#define HANDLE_GLOBAL_VALUE(NAME) \ + case Value::NAME##Val: \ + return static_cast(this)->removeFromParent(); +#include "llvm/IR/Value.def" + default: + break; + } + llvm_unreachable("not a global"); +} + +void GlobalValue::eraseFromParent() { + switch (getValueID()) { +#define HANDLE_GLOBAL_VALUE(NAME) \ + case Value::NAME##Val: \ + return static_cast(this)->eraseFromParent(); +#include "llvm/IR/Value.def" + default: + break; + } + llvm_unreachable("not a global"); +} + unsigned GlobalValue::getAlignment() const { if (auto *GA = dyn_cast(this)) { // In general we cannot compute this at the IR level, but we try. @@ -93,12 +117,10 @@ void GlobalObject::setAlignment(unsigned Align) { assert(getAlignment() == Align && "Alignment representation error!"); } -void GlobalObject::copyAttributesFrom(const GlobalValue *Src) { +void GlobalObject::copyAttributesFrom(const GlobalObject *Src) { GlobalValue::copyAttributesFrom(Src); - if (const auto *GV = dyn_cast(Src)) { - setAlignment(GV->getAlignment()); - setSection(GV->getSection()); - } + setAlignment(Src->getAlignment()); + setSection(Src->getSection()); } std::string GlobalValue::getGlobalIdentifier(StringRef Name, @@ -233,7 +255,7 @@ bool GlobalValue::canIncreaseAlignment() const { const GlobalObject *GlobalValue::getBaseObject() const { if (auto *GO = dyn_cast(this)) return GO; - if (auto *GA = dyn_cast(this)) + if (auto *GA = dyn_cast(this)) return GA->getBaseObject(); return nullptr; } @@ -333,12 +355,11 @@ void GlobalVariable::setInitializer(Constant *InitVal) { /// Copy all additional attributes (those not needed to create a GlobalVariable) /// from the GlobalVariable Src to this one. 
-void GlobalVariable::copyAttributesFrom(const GlobalValue *Src) { +void GlobalVariable::copyAttributesFrom(const GlobalVariable *Src) { GlobalObject::copyAttributesFrom(Src); - if (const GlobalVariable *SrcVar = dyn_cast(Src)) { - setThreadLocalMode(SrcVar->getThreadLocalMode()); - setExternallyInitialized(SrcVar->isExternallyInitialized()); - } + setThreadLocalMode(Src->getThreadLocalMode()); + setExternallyInitialized(Src->isExternallyInitialized()); + setAttributes(Src->getAttributes()); } void GlobalVariable::dropAllReferences() { diff --git a/lib/IR/IRBuilder.cpp b/lib/IR/IRBuilder.cpp index e265a823687f..3477c087967f 100644 --- a/lib/IR/IRBuilder.cpp +++ b/lib/IR/IRBuilder.cpp @@ -161,6 +161,94 @@ CreateMemMove(Value *Dst, Value *Src, Value *Size, unsigned Align, return CI; } +static CallInst *getReductionIntrinsic(IRBuilderBase *Builder, Intrinsic::ID ID, + Value *Src) { + Module *M = Builder->GetInsertBlock()->getParent()->getParent(); + Value *Ops[] = {Src}; + Type *Tys[] = { Src->getType()->getVectorElementType(), Src->getType() }; + auto Decl = Intrinsic::getDeclaration(M, ID, Tys); + return createCallHelper(Decl, Ops, Builder); +} + +CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) { + Module *M = GetInsertBlock()->getParent()->getParent(); + Value *Ops[] = {Acc, Src}; + Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(), + Src->getType()}; + auto Decl = Intrinsic::getDeclaration( + M, Intrinsic::experimental_vector_reduce_fadd, Tys); + return createCallHelper(Decl, Ops, this); +} + +CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) { + Module *M = GetInsertBlock()->getParent()->getParent(); + Value *Ops[] = {Acc, Src}; + Type *Tys[] = {Src->getType()->getVectorElementType(), Acc->getType(), + Src->getType()}; + auto Decl = Intrinsic::getDeclaration( + M, Intrinsic::experimental_vector_reduce_fmul, Tys); + return createCallHelper(Decl, Ops, this); +} + +CallInst *IRBuilderBase::CreateAddReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_add, + Src); +} + +CallInst *IRBuilderBase::CreateMulReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_mul, + Src); +} + +CallInst *IRBuilderBase::CreateAndReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_and, + Src); +} + +CallInst *IRBuilderBase::CreateOrReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_or, + Src); +} + +CallInst *IRBuilderBase::CreateXorReduce(Value *Src) { + return getReductionIntrinsic(this, Intrinsic::experimental_vector_reduce_xor, + Src); +} + +CallInst *IRBuilderBase::CreateIntMaxReduce(Value *Src, bool IsSigned) { + auto ID = IsSigned ? Intrinsic::experimental_vector_reduce_smax + : Intrinsic::experimental_vector_reduce_umax; + return getReductionIntrinsic(this, ID, Src); +} + +CallInst *IRBuilderBase::CreateIntMinReduce(Value *Src, bool IsSigned) { + auto ID = IsSigned ? 
Intrinsic::experimental_vector_reduce_smin + : Intrinsic::experimental_vector_reduce_umin; + return getReductionIntrinsic(this, ID, Src); +} + +CallInst *IRBuilderBase::CreateFPMaxReduce(Value *Src, bool NoNaN) { + auto Rdx = getReductionIntrinsic( + this, Intrinsic::experimental_vector_reduce_fmax, Src); + if (NoNaN) { + FastMathFlags FMF; + FMF.setNoNaNs(); + Rdx->setFastMathFlags(FMF); + } + return Rdx; +} + +CallInst *IRBuilderBase::CreateFPMinReduce(Value *Src, bool NoNaN) { + auto Rdx = getReductionIntrinsic( + this, Intrinsic::experimental_vector_reduce_fmin, Src); + if (NoNaN) { + FastMathFlags FMF; + FMF.setNoNaNs(); + Rdx->setFastMathFlags(FMF); + } + return Rdx; +} + CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) { assert(isa(Ptr->getType()) && "lifetime.start only applies to pointers."); diff --git a/lib/IR/Instruction.cpp b/lib/IR/Instruction.cpp index 906a28a5c887..91b9d9232b54 100644 --- a/lib/IR/Instruction.cpp +++ b/lib/IR/Instruction.cpp @@ -534,6 +534,30 @@ bool Instruction::isAtomic() const { } } +bool Instruction::hasAtomicLoad() const { + assert(isAtomic()); + switch (getOpcode()) { + default: + return false; + case Instruction::AtomicCmpXchg: + case Instruction::AtomicRMW: + case Instruction::Load: + return true; + } +} + +bool Instruction::hasAtomicStore() const { + assert(isAtomic()); + switch (getOpcode()) { + default: + return false; + case Instruction::AtomicCmpXchg: + case Instruction::AtomicRMW: + case Instruction::Store: + return true; + } +} + bool Instruction::mayThrow() const { if (const CallInst *CI = dyn_cast(this)) return !CI->doesNotThrow(); diff --git a/lib/IR/Instructions.cpp b/lib/IR/Instructions.cpp index a60cc375d568..5a5b9c0d06bb 100644 --- a/lib/IR/Instructions.cpp +++ b/lib/IR/Instructions.cpp @@ -1,4 +1,4 @@ -//===-- Instructions.cpp - Implement the LLVM instructions ----------------===// +//===- Instructions.cpp - Implement the LLVM instructions -----------------===// // // The LLVM Compiler Infrastructure // @@ -12,18 +12,36 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Instructions.h" #include "LLVMContextImpl.h" +#include "llvm/ADT/None.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/BasicBlock.h" #include "llvm/IR/CallSite.h" -#include "llvm/IR/ConstantRange.h" +#include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" +#include "llvm/IR/InstrTypes.h" +#include "llvm/IR/Instruction.h" +#include "llvm/IR/Instructions.h" +#include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" #include "llvm/IR/Operator.h" +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/AtomicOrdering.h" +#include "llvm/Support/Casting.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/MathExtras.h" +#include +#include +#include +#include + using namespace llvm; //===----------------------------------------------------------------------===// @@ -42,7 +60,42 @@ User::op_iterator CallSite::getCallee() const { //===----------------------------------------------------------------------===// // Out of line virtual method, so the vtable, etc has a home. 
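A short sketch of the idiom this recurring comment names (illustration, not from the patch): compilers typically emit a class's vtable in the translation unit that defines its first out-of-line virtual member, so keeping one such definition in the .cpp file, even a defaulted destructor, gives the vtable a single home instead of a weak copy in every file that includes the header:

struct Anchor {
  virtual ~Anchor();    // declared in the header only
  virtual void run() {} // other virtuals may be defined inline
};

// Defined once, out of line: the vtable (and RTTI) for Anchor is now
// emitted only in this translation unit.
Anchor::~Anchor() = default;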
-TerminatorInst::~TerminatorInst() { +TerminatorInst::~TerminatorInst() = default; + +unsigned TerminatorInst::getNumSuccessors() const { + switch (getOpcode()) { +#define HANDLE_TERM_INST(N, OPC, CLASS) \ + case Instruction::OPC: \ + return static_cast(this)->getNumSuccessorsV(); +#include "llvm/IR/Instruction.def" + default: + break; + } + llvm_unreachable("not a terminator"); +} + +BasicBlock *TerminatorInst::getSuccessor(unsigned idx) const { + switch (getOpcode()) { +#define HANDLE_TERM_INST(N, OPC, CLASS) \ + case Instruction::OPC: \ + return static_cast(this)->getSuccessorV(idx); +#include "llvm/IR/Instruction.def" + default: + break; + } + llvm_unreachable("not a terminator"); +} + +void TerminatorInst::setSuccessor(unsigned idx, BasicBlock *B) { + switch (getOpcode()) { +#define HANDLE_TERM_INST(N, OPC, CLASS) \ + case Instruction::OPC: \ + return static_cast(this)->setSuccessorV(idx, B); +#include "llvm/IR/Instruction.def" + default: + break; + } + llvm_unreachable("not a terminator"); } //===----------------------------------------------------------------------===// @@ -50,8 +103,7 @@ TerminatorInst::~TerminatorInst() { //===----------------------------------------------------------------------===// // Out of line virtual method, so the vtable, etc has a home. -UnaryInstruction::~UnaryInstruction() { -} +UnaryInstruction::~UnaryInstruction() = default; //===----------------------------------------------------------------------===// // SelectInst Class @@ -82,7 +134,6 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) { return nullptr; } - //===----------------------------------------------------------------------===// // PHINode Class //===----------------------------------------------------------------------===// @@ -242,8 +293,7 @@ void LandingPadInst::addClause(Constant *Val) { // CallInst Implementation //===----------------------------------------------------------------------===// -CallInst::~CallInst() { -} +CallInst::~CallInst() = default; void CallInst::init(FunctionType *FTy, Value *Func, ArrayRef Args, ArrayRef Bundles, const Twine &NameStr) { @@ -541,7 +591,6 @@ Instruction *CallInst::CreateMalloc(Instruction *InsertBefore, ArraySize, OpB, MallocF, Name); } - /// CreateMalloc - Generate the IR for a call to malloc: /// 1. 
Compute the malloc call's argument as the specified type's size, /// possibly multiplied by the array size if the array size is not @@ -692,9 +741,11 @@ InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef OpB, BasicBlock *InvokeInst::getSuccessorV(unsigned idx) const { return getSuccessor(idx); } + unsigned InvokeInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void InvokeInst::setSuccessorV(unsigned idx, BasicBlock *B) { return setSuccessor(idx, B); } @@ -821,6 +872,7 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, Instruction *InsertBefore) if (retVal) Op<0>() = retVal; } + ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd) : TerminatorInst(Type::getVoidTy(C), Instruction::Ret, OperandTraits::op_end(this) - !!retVal, !!retVal, @@ -828,6 +880,7 @@ ReturnInst::ReturnInst(LLVMContext &C, Value *retVal, BasicBlock *InsertAtEnd) if (retVal) Op<0>() = retVal; } + ReturnInst::ReturnInst(LLVMContext &Context, BasicBlock *InsertAtEnd) : TerminatorInst(Type::getVoidTy(Context), Instruction::Ret, OperandTraits::op_end(this), 0, InsertAtEnd) { @@ -847,8 +900,7 @@ BasicBlock *ReturnInst::getSuccessorV(unsigned idx) const { llvm_unreachable("ReturnInst has no successors!"); } -ReturnInst::~ReturnInst() { -} +ReturnInst::~ReturnInst() = default; //===----------------------------------------------------------------------===// // ResumeInst Implementation @@ -930,9 +982,11 @@ BasicBlock *CleanupReturnInst::getSuccessorV(unsigned Idx) const { assert(Idx == 0); return getUnwindDest(); } + unsigned CleanupReturnInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void CleanupReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) { assert(Idx == 0); setUnwindDest(B); @@ -973,9 +1027,11 @@ BasicBlock *CatchReturnInst::getSuccessorV(unsigned Idx) const { assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!"); return getSuccessor(); } + unsigned CatchReturnInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void CatchReturnInst::setSuccessorV(unsigned Idx, BasicBlock *B) { assert(Idx < getNumSuccessors() && "Successor # out of range for catchret!"); setSuccessor(B); @@ -1067,9 +1123,11 @@ void CatchSwitchInst::removeHandler(handler_iterator HI) { BasicBlock *CatchSwitchInst::getSuccessorV(unsigned idx) const { return getSuccessor(idx); } + unsigned CatchSwitchInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void CatchSwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) { setSuccessor(idx, B); } @@ -1155,6 +1213,7 @@ BranchInst::BranchInst(BasicBlock *IfTrue, Instruction *InsertBefore) assert(IfTrue && "Branch destination may not be null!"); Op<-1>() = IfTrue; } + BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, Instruction *InsertBefore) : TerminatorInst(Type::getVoidTy(IfTrue->getContext()), Instruction::Br, @@ -1189,7 +1248,6 @@ BranchInst::BranchInst(BasicBlock *IfTrue, BasicBlock *IfFalse, Value *Cond, #endif } - BranchInst::BranchInst(const BranchInst &BI) : TerminatorInst(Type::getVoidTy(BI.getContext()), Instruction::Br, OperandTraits::op_end(this) - BI.getNumOperands(), @@ -1216,14 +1274,15 @@ void BranchInst::swapSuccessors() { BasicBlock *BranchInst::getSuccessorV(unsigned idx) const { return getSuccessor(idx); } + unsigned BranchInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void BranchInst::setSuccessorV(unsigned idx, BasicBlock *B) { setSuccessor(idx, B); } - 
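The successor accessors above pair with the new TerminatorInst methods earlier in this file: instead of a virtual call per query, the opcode is switched on and the cases are stamped out by an X-macro include of Instruction.def. A reduced, self-contained sketch of that technique, with types invented for illustration:

#include <cstdio>

// One entry per concrete terminator kind; LLVM generates the real list from
// Instruction.def rather than writing it out by hand.
#define FOR_EACH_TERM(X) X(Br, BrInst) X(Ret, RetInst)

enum Opcode { Br, Ret };

struct Inst { Opcode Opc; };
struct BrInst : Inst { unsigned numSuccessorsImpl() const { return 2; } };
struct RetInst : Inst { unsigned numSuccessorsImpl() const { return 0; } };

// Non-virtual dispatch: the switch cases below mirror the HANDLE_TERM_INST
// pattern used by TerminatorInst::getNumSuccessors().
unsigned getNumSuccessors(const Inst &I) {
  switch (I.Opc) {
#define CASE(OPC, CLASS) \
  case OPC: return static_cast<const CLASS &>(I).numSuccessorsImpl();
  FOR_EACH_TERM(CASE)
#undef CASE
  }
  return 0;
}

int main() {
  BrInst B;
  B.Opc = Br;
  std::printf("%u\n", getNumSuccessors(B)); // prints 2
  return 0;
}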
//===----------------------------------------------------------------------===// // AllocaInst Implementation //===----------------------------------------------------------------------===// @@ -1279,8 +1338,7 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize, } // Out of line virtual method, so the vtable, etc has a home. -AllocaInst::~AllocaInst() { -} +AllocaInst::~AllocaInst() = default; void AllocaInst::setAlignment(unsigned Align) { assert((Align & (Align-1)) == 0 && "Alignment is not a power of 2!"); @@ -1543,8 +1601,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal, SynchronizationScope SynchScope, Instruction *InsertBefore) : Instruction( - StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()), - nullptr), + StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())), AtomicCmpXchg, OperandTraits::op_begin(this), OperandTraits::operands(this), InsertBefore) { Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope); @@ -1556,8 +1613,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal, SynchronizationScope SynchScope, BasicBlock *InsertAtEnd) : Instruction( - StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext()), - nullptr), + StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())), AtomicCmpXchg, OperandTraits::op_begin(this), OperandTraits::operands(this), InsertAtEnd) { Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SynchScope); @@ -1771,14 +1827,12 @@ ExtractElementInst::ExtractElementInst(Value *Val, Value *Index, setName(Name); } - bool ExtractElementInst::isValidOperands(const Value *Val, const Value *Index) { if (!Val->getType()->isVectorTy() || !Index->getType()->isIntegerTy()) return false; return true; } - //===----------------------------------------------------------------------===// // InsertElementInst Implementation //===----------------------------------------------------------------------===// @@ -1825,7 +1879,6 @@ bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt, return true; } - //===----------------------------------------------------------------------===// // ShuffleVectorInst Implementation //===----------------------------------------------------------------------===// @@ -1938,7 +1991,6 @@ void ShuffleVectorInst::getShuffleMask(Constant *Mask, } } - //===----------------------------------------------------------------------===// // InsertValueInst Class //===----------------------------------------------------------------------===// @@ -1951,7 +2003,7 @@ void InsertValueInst::init(Value *Agg, Value *Val, ArrayRef Idxs, // (other than weirdness with &*IdxBegin being invalid; see // getelementptr's init routine for example). But there's no // present need to support it. - assert(Idxs.size() > 0 && "InsertValueInst must have at least one index"); + assert(!Idxs.empty() && "InsertValueInst must have at least one index"); assert(ExtractValueInst::getIndexedType(Agg->getType(), Idxs) == Val->getType() && "Inserted value must match indexed type!"); @@ -1980,7 +2032,7 @@ void ExtractValueInst::init(ArrayRef Idxs, const Twine &Name) { // There's no fundamental reason why we require at least one index. // But there's no present need to support it. 
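The AtomicCmpXchg constructors above drop the trailing nullptr from StructType::get because the sentinel-terminated varargs overloads of StructType::get/create/setBody are deleted later in this patch (see the Type.cpp hunks below); callers now pass the element list directly. A self-contained sketch of why the sentinel style was removed, with a stand-in Type and plain std::vector instead of the LLVM API:

#include <cstdarg>
#include <cstdio>
#include <vector>

struct Type { const char *Name; };

// Old style: null-terminated varargs. Forgetting the trailing nullptr walks
// off the end of the argument list, which is undefined behavior.
static std::vector<Type *> getVarArgs(Type *First, ...) {
  std::vector<Type *> Fields;
  va_list Ap;
  va_start(Ap, First);
  for (Type *T = First; T; T = va_arg(Ap, Type *))
    Fields.push_back(T);
  va_end(Ap);
  return Fields;
}

// New style: an explicit list. The element count is always exactly right and
// no sentinel can be forgotten.
static std::vector<Type *> getList(std::vector<Type *> Fields) {
  return Fields;
}

int main() {
  Type I1{"i1"}, I32{"i32"};
  auto A = getVarArgs(&I32, &I1, (Type *)nullptr); // sentinel required
  auto B = getList({&I32, &I1});                   // no sentinel
  std::printf("%zu %zu\n", A.size(), B.size());    // prints: 2 2
  return 0;
}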
- assert(Idxs.size() > 0 && "ExtractValueInst must have at least one index"); + assert(!Idxs.empty() && "ExtractValueInst must have at least one index"); Indices.append(Idxs.begin(), Idxs.end()); setName(Name); @@ -2053,7 +2105,6 @@ BinaryOperator::BinaryOperator(BinaryOps iType, Value *S1, Value *S2, setName(Name); } - void BinaryOperator::init(BinaryOps iType) { Value *LHS = getOperand(0), *RHS = getOperand(1); (void)LHS; (void)RHS; // Silence warnings. @@ -2213,7 +2264,6 @@ BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name, Op->getType(), Name, InsertAtEnd); } - // isConstantAllOnes - Helper function for several functions below static inline bool isConstantAllOnes(const Value *V) { if (const Constant *C = dyn_cast(V)) @@ -2279,7 +2329,6 @@ const Value *BinaryOperator::getNotArgument(const Value *BinOp) { return getNotArgument(const_cast(BinOp)); } - // Exchange the two operands to this instruction. This instruction is safe to // use on any binary instruction and does not modify the semantics of the // instruction. If the instruction is order-dependent (SetLT f.e.), the opcode @@ -2291,7 +2340,6 @@ bool BinaryOperator::swapOperands() { return false; } - //===----------------------------------------------------------------------===// // FPMathOperator Class //===----------------------------------------------------------------------===// @@ -2305,7 +2353,6 @@ float FPMathOperator::getFPAccuracy() const { return Accuracy->getValueAPF().convertToFloat(); } - //===----------------------------------------------------------------------===// // CastInst Class //===----------------------------------------------------------------------===// @@ -2567,13 +2614,12 @@ unsigned CastInst::isEliminableCastPair( return Instruction::BitCast; return 0; } - case 12: { + case 12: // addrspacecast, addrspacecast -> bitcast, if SrcAS == DstAS // addrspacecast, addrspacecast -> addrspacecast, if SrcAS != DstAS if (SrcTy->getPointerAddressSpace() != DstTy->getPointerAddressSpace()) return Instruction::AddrSpaceCast; return Instruction::BitCast; - } case 13: // FIXME: this state can be merged with (1), but the following assert // is useful to check the correcteness of the sequence due to semantic @@ -2594,7 +2640,6 @@ unsigned CastInst::isEliminableCastPair( DstTy->getScalarType()->getPointerElementType()) return Instruction::AddrSpaceCast; return 0; - case 15: // FIXME: this state can be merged with (1), but the following assert // is useful to check the correcteness of the sequence due to semantic @@ -3070,7 +3115,6 @@ CastInst::getCastOpcode( /// of the types involved. 
bool CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) { - // Check for type sanity on the arguments Type *SrcTy = S->getType(); @@ -3419,7 +3463,6 @@ bool CmpInst::isEquality() const { return cast(this)->isEquality(); } - CmpInst::Predicate CmpInst::getInversePredicate(Predicate pred) { switch (pred) { default: llvm_unreachable("Unknown cmp predicate!"); @@ -3743,9 +3786,11 @@ void SwitchInst::growOperands() { BasicBlock *SwitchInst::getSuccessorV(unsigned idx) const { return getSuccessor(idx); } + unsigned SwitchInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void SwitchInst::setSuccessorV(unsigned idx, BasicBlock *B) { setSuccessor(idx, B); } @@ -3832,9 +3877,11 @@ void IndirectBrInst::removeDestination(unsigned idx) { BasicBlock *IndirectBrInst::getSuccessorV(unsigned idx) const { return getSuccessor(idx); } + unsigned IndirectBrInst::getNumSuccessorsV() const { return getNumSuccessors(); } + void IndirectBrInst::setSuccessorV(unsigned idx, BasicBlock *B) { setSuccessor(idx, B); } diff --git a/lib/IR/LegacyPassManager.cpp b/lib/IR/LegacyPassManager.cpp index 628a67bd639c..b2b12289f871 100644 --- a/lib/IR/LegacyPassManager.cpp +++ b/lib/IR/LegacyPassManager.cpp @@ -12,6 +12,7 @@ //===----------------------------------------------------------------------===// #include "llvm/IR/LegacyPassManager.h" +#include "llvm/ADT/Statistic.h" #include "llvm/IR/IRPrintingPasses.h" #include "llvm/IR/LLVMContext.h" #include "llvm/IR/LegacyPassManagers.h" @@ -465,6 +466,11 @@ public: // null. It may be called multiple times. static void createTheTimeInfo(); + // print - Prints out timing information and then resets the timers. + void print() { + TG.print(*CreateInfoOutputFile()); + } + /// getPassTimer - Return the timer for the specified pass if it exists. Timer *getPassTimer(Pass *P) { if (P->getAsPMDataManager()) @@ -1752,6 +1758,13 @@ Timer *llvm::getPassTimer(Pass *P) { return nullptr; } +/// If timing is enabled, report the times collected up to now and then reset +/// them. 
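The helper declared just below, reportAndResetTimings(), prints the accumulated pass timers and then clears them, so that back-to-back codegen runs (see the LTOCodeGenerator and ThinLTOCodeGenerator hunks later in this patch) each report their own numbers instead of a running total. A toy print-and-reset timer group in the same spirit, using std::chrono stand-ins rather than llvm::Timer/TimerGroup:

#include <chrono>
#include <cstdio>
#include <map>
#include <string>

// Stand-in for a timer group: accumulate per-pass wall time, print, reset.
struct TimerGroup {
  std::map<std::string, std::chrono::duration<double>> Times;
  void print() {
    for (const auto &P : Times)
      std::printf("%-20s %.6fs\n", P.first.c_str(), P.second.count());
    Times.clear(); // the reset is what keeps successive reports independent
  }
};

int main() {
  TimerGroup TG;
  auto T0 = std::chrono::steady_clock::now();
  volatile unsigned long X = 0;
  for (unsigned long I = 0; I < 1000000; ++I)
    X += I;
  TG.Times["DummyPass"] += std::chrono::steady_clock::now() - T0;
  TG.print(); // first run's numbers
  TG.print(); // prints nothing: the timers were reset
  return 0;
}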
+void llvm::reportAndResetTimings() { + if (TheTimeInfo) + TheTimeInfo->print(); +} + //===----------------------------------------------------------------------===// // PMStack implementation // diff --git a/lib/IR/Module.cpp b/lib/IR/Module.cpp index fec9df193685..12c258d95f52 100644 --- a/lib/IR/Module.cpp +++ b/lib/IR/Module.cpp @@ -1,4 +1,4 @@ -//===-- Module.cpp - Implement the Module class ---------------------------===// +//===- Module.cpp - Implement the Module class ----------------------------===// // // The LLVM Compiler Infrastructure // @@ -11,27 +11,46 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Module.h" #include "SymbolTableListTraitsImpl.h" -#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" -#include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" +#include "llvm/IR/Attributes.h" +#include "llvm/IR/Comdat.h" #include "llvm/IR/Constants.h" +#include "llvm/IR/DataLayout.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/DebugInfoMetadata.h" +#include "llvm/IR/Function.h" +#include "llvm/IR/GlobalAlias.h" +#include "llvm/IR/GlobalIFunc.h" +#include "llvm/IR/GlobalValue.h" +#include "llvm/IR/GlobalVariable.h" #include "llvm/IR/GVMaterializer.h" -#include "llvm/IR/InstrTypes.h" #include "llvm/IR/LLVMContext.h" +#include "llvm/IR/Metadata.h" +#include "llvm/IR/Module.h" +#include "llvm/IR/SymbolTableListTraits.h" +#include "llvm/IR/Type.h" #include "llvm/IR/TypeFinder.h" -#include "llvm/Support/Dwarf.h" +#include "llvm/IR/Value.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/Pass.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/CodeGen.h" #include "llvm/Support/Error.h" #include "llvm/Support/MemoryBuffer.h" #include "llvm/Support/Path.h" #include "llvm/Support/RandomNumberGenerator.h" #include -#include -#include +#include +#include +#include +#include +#include using namespace llvm; diff --git a/lib/IR/Type.cpp b/lib/IR/Type.cpp index b67b0a307861..c9f957c244f8 100644 --- a/lib/IR/Type.cpp +++ b/lib/IR/Type.cpp @@ -1,4 +1,4 @@ -//===-- Type.cpp - Implement the Type class -------------------------------===// +//===- Type.cpp - Implement the Type class --------------------------------===// // // The LLVM Compiler Infrastructure // @@ -11,12 +11,25 @@ // //===----------------------------------------------------------------------===// -#include "llvm/IR/Type.h" #include "LLVMContextImpl.h" +#include "llvm/ADT/APInt.h" +#include "llvm/ADT/None.h" #include "llvm/ADT/SmallString.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" +#include "llvm/IR/DerivedTypes.h" +#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Module.h" -#include -#include +#include "llvm/IR/Type.h" +#include "llvm/IR/Value.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/MathExtras.h" +#include "llvm/Support/raw_ostream.h" +#include +#include + using namespace llvm; //===----------------------------------------------------------------------===// @@ -220,7 +233,6 @@ PointerType *Type::getInt64PtrTy(LLVMContext &C, unsigned AS) { return getInt64Ty(C)->getPointerTo(AS); } - //===----------------------------------------------------------------------===// // IntegerType Implementation //===----------------------------------------------------------------------===// @@ -362,7 +374,8 
@@ void StructType::setName(StringRef Name) { if (Name == getName()) return; StringMap &SymbolTable = getContext().pImpl->NamedStructTypes; - typedef StringMap::MapEntryTy EntryTy; + + using EntryTy = StringMap::MapEntryTy; // If this struct already had a name, remove its symbol table entry. Don't // delete the data yet because it may be part of the new name. @@ -419,21 +432,6 @@ StructType *StructType::get(LLVMContext &Context, bool isPacked) { return get(Context, None, isPacked); } -StructType *StructType::get(Type *type, ...) { - assert(type && "Cannot create a struct type with no elements with this"); - LLVMContext &Ctx = type->getContext(); - va_list ap; - SmallVector StructFields; - va_start(ap, type); - while (type) { - StructFields.push_back(type); - type = va_arg(ap, llvm::Type*); - } - auto *Ret = llvm::StructType::get(Ctx, StructFields); - va_end(ap); - return Ret; -} - StructType *StructType::create(LLVMContext &Context, ArrayRef Elements, StringRef Name, bool isPacked) { StructType *ST = create(Context, Name); @@ -462,21 +460,6 @@ StructType *StructType::create(ArrayRef Elements) { return create(Elements[0]->getContext(), Elements, StringRef()); } -StructType *StructType::create(StringRef Name, Type *type, ...) { - assert(type && "Cannot create a struct type with no elements with this"); - LLVMContext &Ctx = type->getContext(); - va_list ap; - SmallVector StructFields; - va_start(ap, type); - while (type) { - StructFields.push_back(type); - type = va_arg(ap, llvm::Type*); - } - auto *Ret = llvm::StructType::create(Ctx, StructFields, Name); - va_end(ap); - return Ret; -} - bool StructType::isSized(SmallPtrSetImpl *Visited) const { if ((getSubclassData() & SCDB_IsSized) != 0) return true; @@ -508,19 +491,6 @@ StringRef StructType::getName() const { return ((StringMapEntry *)SymbolTableEntry)->getKey(); } -void StructType::setBody(Type *type, ...) { - assert(type && "Cannot create a struct type with no elements with this"); - va_list ap; - SmallVector StructFields; - va_start(ap, type); - while (type) { - StructFields.push_back(type); - type = va_arg(ap, llvm::Type*); - } - setBody(StructFields); - va_end(ap); -} - bool StructType::isValidElementType(Type *ElemTy) { return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() && !ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() && @@ -540,7 +510,6 @@ StructType *Module::getTypeByName(StringRef Name) const { return getContext().pImpl->NamedStructTypes.lookup(Name); } - //===----------------------------------------------------------------------===// // CompositeType Implementation //===----------------------------------------------------------------------===// @@ -589,7 +558,6 @@ bool CompositeType::indexValid(unsigned Idx) const { return true; } - //===----------------------------------------------------------------------===// // ArrayType Implementation //===----------------------------------------------------------------------===// @@ -661,7 +629,6 @@ PointerType *PointerType::get(Type *EltTy, unsigned AddressSpace) { return Entry; } - PointerType::PointerType(Type *E, unsigned AddrSpace) : Type(E->getContext(), PointerTyID), PointeeTy(E) { ContainedTys = &PointeeTy; diff --git a/lib/IR/Verifier.cpp b/lib/IR/Verifier.cpp index 65e124562493..3b68d6365872 100644 --- a/lib/IR/Verifier.cpp +++ b/lib/IR/Verifier.cpp @@ -267,6 +267,9 @@ class Verifier : public InstVisitor, VerifierSupport { /// \brief Keep track of the metadata nodes that have been checked already. 
  SmallPtrSet<const Metadata *, 32> MDNodes;
 
+  /// Keep track which DISubprogram is attached to which function.
+  DenseMap<const DISubprogram *, const Function *> DISubprogramAttachments;
+
   /// Track all DICompileUnits visited.
   SmallPtrSet<const Metadata *, 2> CUVisited;
 
@@ -386,7 +389,7 @@ public:
     verifyCompileUnits();
 
     verifyDeoptimizeCallingConvs();
-
+    DISubprogramAttachments.clear();
     return !Broken;
   }
 
@@ -2085,13 +2088,19 @@ void Verifier::visitFunction(const Function &F) {
     switch (I.first) {
     default:
       break;
-    case LLVMContext::MD_dbg:
+    case LLVMContext::MD_dbg: {
       ++NumDebugAttachments;
       AssertDI(NumDebugAttachments == 1,
                "function must have a single !dbg attachment", &F, I.second);
       AssertDI(isa<DISubprogram>(I.second),
                "function !dbg attachment must be a subprogram", &F, I.second);
+      auto *SP = cast<DISubprogram>(I.second);
+      const Function *&AttachedTo = DISubprogramAttachments[SP];
+      AssertDI(!AttachedTo || AttachedTo == &F,
+               "DISubprogram attached to more than one function", SP, &F);
+      AttachedTo = &F;
       break;
+    }
     case LLVMContext::MD_prof:
       ++NumProfAttachments;
       Assert(NumProfAttachments == 1,
diff --git a/lib/LLVMBuild.txt b/lib/LLVMBuild.txt
index 684b378c93e5..89ddd0fc1af3 100644
--- a/lib/LLVMBuild.txt
+++ b/lib/LLVMBuild.txt
@@ -24,7 +24,6 @@ subdirectories =
   DebugInfo
   Demangle
   ExecutionEngine
-  LibDriver
   LineEditor
   Linker
   IR
@@ -39,6 +38,7 @@ subdirectories =
   Support
   TableGen
   Target
+  ToolDrivers
   Transforms
 
 [component_0]
diff --git a/lib/LTO/LTO.cpp b/lib/LTO/LTO.cpp
index 2d2dcdec05fb..c73b6b6b15c1 100644
--- a/lib/LTO/LTO.cpp
+++ b/lib/LTO/LTO.cpp
@@ -973,7 +973,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
         // this value. If not, no need to preserve any ThinLTO copies.
         !Res.second.IRName.empty())
       GUIDPreservedSymbols.insert(GlobalValue::getGUID(
-          GlobalValue::getRealLinkageName(Res.second.IRName)));
+          GlobalValue::dropLLVMManglingEscape(Res.second.IRName)));
   }
 
   auto DeadSymbols =
@@ -993,7 +993,7 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
     if (Res.second.IRName.empty())
       continue;
     auto GUID = GlobalValue::getGUID(
-        GlobalValue::getRealLinkageName(Res.second.IRName));
+        GlobalValue::dropLLVMManglingEscape(Res.second.IRName));
     // Mark exported unless index-based analysis determined it to be dead.
     if (!DeadSymbols.count(GUID))
       ExportedGUIDs.insert(GUID);
diff --git a/lib/LTO/LTOCodeGenerator.cpp b/lib/LTO/LTOCodeGenerator.cpp
index 86fba843e980..6a275560dc92 100644
--- a/lib/LTO/LTOCodeGenerator.cpp
+++ b/lib/LTO/LTOCodeGenerator.cpp
@@ -495,17 +495,14 @@ void LTOCodeGenerator::verifyMergedModuleOnce() {
     return;
   HasVerifiedInput = true;
 
-  if (LTOStripInvalidDebugInfo) {
-    bool BrokenDebugInfo = false;
-    if (verifyModule(*MergedModule, &dbgs(), &BrokenDebugInfo))
-      report_fatal_error("Broken module found, compilation aborted!");
-    if (BrokenDebugInfo) {
-      emitWarning("Invalid debug info found, debug info will be stripped");
-      StripDebugInfo(*MergedModule);
-    }
-  }
-  if (verifyModule(*MergedModule, &dbgs()))
+  bool BrokenDebugInfo = false;
+  if (verifyModule(*MergedModule, &dbgs(),
+                   LTOStripInvalidDebugInfo ? &BrokenDebugInfo : nullptr))
     report_fatal_error("Broken module found, compilation aborted!");
+  if (BrokenDebugInfo) {
+    emitWarning("Invalid debug info found, debug info will be stripped");
+    StripDebugInfo(*MergedModule);
+  }
 }
 
 void LTOCodeGenerator::finishOptimizationRemarks() {
@@ -600,6 +597,7 @@ bool LTOCodeGenerator::compileOptimized(ArrayRef<raw_pwrite_stream *> Out) {
 
   // If statistics were requested, print them out after codegen.
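The Verifier hunks above add an attach-once check: the first function seen for a DISubprogram claims the map slot, and any later function that reaches the same subprogram trips the assertion. A self-contained sketch of that idiom using std::unordered_map and stand-in Subprogram/Function types, not the LLVM ones:

#include <cassert>
#include <cstdio>
#include <unordered_map>

struct Subprogram {};
struct Function {};

// Returns false if SP was already recorded for a different function.
static bool recordAttachment(
    std::unordered_map<const Subprogram *, const Function *> &Seen,
    const Subprogram *SP, const Function *F) {
  const Function *&Slot = Seen[SP]; // default-inserts nullptr on first use
  if (Slot && Slot != F)
    return false; // same subprogram reached from a second function
  Slot = F;
  return true;
}

int main() {
  std::unordered_map<const Subprogram *, const Function *> Seen;
  Subprogram SP;
  Function F1, F2;
  assert(recordAttachment(Seen, &SP, &F1));
  assert(recordAttachment(Seen, &SP, &F1));                   // same pair: fine
  std::printf("duplicate ok: %d\n", recordAttachment(Seen, &SP, &F2)); // 0
  return 0;
}

Taking a reference into the map before testing it both performs the lookup and reserves the slot in one step, which is the same trick the DenseMap code above uses.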
if (llvm::AreStatisticsEnabled()) llvm::PrintStatistics(); + reportAndResetTimings(); finishOptimizationRemarks(); diff --git a/lib/LTO/ThinLTOCodeGenerator.cpp b/lib/LTO/ThinLTOCodeGenerator.cpp index b4ee7c2b2fbc..65a7994325bc 100644 --- a/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/lib/LTO/ThinLTOCodeGenerator.cpp @@ -446,7 +446,7 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index, { raw_svector_ostream OS(OutputBuffer); ProfileSummaryInfo PSI(TheModule); - auto Index = buildModuleSummaryIndex(TheModule, nullptr, nullptr); + auto Index = buildModuleSummaryIndex(TheModule, nullptr, &PSI); WriteBitcodeToFile(&TheModule, OS, true, &Index); } return make_unique(std::move(OutputBuffer)); @@ -1024,4 +1024,5 @@ void ThinLTOCodeGenerator::run() { // If statistics were requested, print them out now. if (llvm::AreStatisticsEnabled()) llvm::PrintStatistics(); + reportAndResetTimings(); } diff --git a/lib/LibDriver/CMakeLists.txt b/lib/LibDriver/CMakeLists.txt deleted file mode 100644 index ab53a6843446..000000000000 --- a/lib/LibDriver/CMakeLists.txt +++ /dev/null @@ -1,8 +0,0 @@ -set(LLVM_TARGET_DEFINITIONS Options.td) -tablegen(LLVM Options.inc -gen-opt-parser-defs) -add_public_tablegen_target(LibOptionsTableGen) - -add_llvm_library(LLVMLibDriver - LibDriver.cpp - ) -add_dependencies(LLVMLibDriver LibOptionsTableGen) diff --git a/lib/LibDriver/LLVMBuild.txt b/lib/LibDriver/LLVMBuild.txt deleted file mode 100644 index 799dc997c0bb..000000000000 --- a/lib/LibDriver/LLVMBuild.txt +++ /dev/null @@ -1,22 +0,0 @@ -;===- ./lib/LibDriver/LLVMBuild.txt ----------------------------*- Conf -*--===; -; -; The LLVM Compiler Infrastructure -; -; This file is distributed under the University of Illinois Open Source -; License. See LICENSE.TXT for details. -; -;===------------------------------------------------------------------------===; -; -; This is an LLVMBuild description file for the components in this subdirectory. -; -; For more information on the LLVMBuild system, please see: -; -; http://llvm.org/docs/LLVMBuild.html -; -;===------------------------------------------------------------------------===; - -[component_0] -type = Library -name = LibDriver -parent = Libraries -required_libraries = Object Option Support diff --git a/lib/LibDriver/LibDriver.cpp b/lib/LibDriver/LibDriver.cpp deleted file mode 100644 index c50629d71501..000000000000 --- a/lib/LibDriver/LibDriver.cpp +++ /dev/null @@ -1,171 +0,0 @@ -//===- LibDriver.cpp - lib.exe-compatible driver --------------------------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// -// -// Defines an interface to a lib.exe-compatible driver that also understands -// bitcode files. Used by llvm-lib and lld-link /lib. 
-// -//===----------------------------------------------------------------------===// - -#include "llvm/LibDriver/LibDriver.h" -#include "llvm/ADT/STLExtras.h" -#include "llvm/Object/ArchiveWriter.h" -#include "llvm/Option/Arg.h" -#include "llvm/Option/ArgList.h" -#include "llvm/Option/Option.h" -#include "llvm/Support/CommandLine.h" -#include "llvm/Support/StringSaver.h" -#include "llvm/Support/Path.h" -#include "llvm/Support/Process.h" -#include "llvm/Support/raw_ostream.h" - -using namespace llvm; - -namespace { - -enum { - OPT_INVALID = 0, -#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11) OPT_##ID, -#include "Options.inc" -#undef OPTION -}; - -#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE; -#include "Options.inc" -#undef PREFIX - -static const llvm::opt::OptTable::Info infoTable[] = { -#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X6, X7, X8, X9, X10) \ - { \ - X1, X2, X9, X10, OPT_##ID, llvm::opt::Option::KIND##Class, X8, X7, \ - OPT_##GROUP, OPT_##ALIAS, X6 \ - }, -#include "Options.inc" -#undef OPTION -}; - -class LibOptTable : public llvm::opt::OptTable { -public: - LibOptTable() : OptTable(infoTable, true) {} -}; - -} - -static std::string getOutputPath(llvm::opt::InputArgList *Args, - const llvm::NewArchiveMember &FirstMember) { - if (auto *Arg = Args->getLastArg(OPT_out)) - return Arg->getValue(); - SmallString<128> Val = StringRef(FirstMember.Buf->getBufferIdentifier()); - llvm::sys::path::replace_extension(Val, ".lib"); - return Val.str(); -} - -static std::vector getSearchPaths(llvm::opt::InputArgList *Args, - StringSaver &Saver) { - std::vector Ret; - // Add current directory as first item of the search path. - Ret.push_back(""); - - // Add /libpath flags. - for (auto *Arg : Args->filtered(OPT_libpath)) - Ret.push_back(Arg->getValue()); - - // Add $LIB. - Optional EnvOpt = sys::Process::GetEnv("LIB"); - if (!EnvOpt.hasValue()) - return Ret; - StringRef Env = Saver.save(*EnvOpt); - while (!Env.empty()) { - StringRef Path; - std::tie(Path, Env) = Env.split(';'); - Ret.push_back(Path); - } - return Ret; -} - -static Optional findInputFile(StringRef File, - ArrayRef Paths) { - for (auto Dir : Paths) { - SmallString<128> Path = Dir; - sys::path::append(Path, File); - if (sys::fs::exists(Path)) - return Path.str().str(); - } - return Optional(); -} - -int llvm::libDriverMain(llvm::ArrayRef ArgsArr) { - SmallVector NewArgs(ArgsArr.begin(), ArgsArr.end()); - BumpPtrAllocator Alloc; - StringSaver Saver(Alloc); - cl::ExpandResponseFiles(Saver, cl::TokenizeWindowsCommandLine, NewArgs); - ArgsArr = NewArgs; - - LibOptTable Table; - unsigned MissingIndex; - unsigned MissingCount; - llvm::opt::InputArgList Args = - Table.ParseArgs(ArgsArr.slice(1), MissingIndex, MissingCount); - if (MissingCount) { - llvm::errs() << "missing arg value for \"" - << Args.getArgString(MissingIndex) << "\", expected " - << MissingCount - << (MissingCount == 1 ? " argument.\n" : " arguments.\n"); - return 1; - } - for (auto *Arg : Args.filtered(OPT_UNKNOWN)) - llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n"; - - if (!Args.hasArgNoClaim(OPT_INPUT)) { - // No input files. To match lib.exe, silently do nothing. 
- return 0; - } - - std::vector SearchPaths = getSearchPaths(&Args, Saver); - - std::vector Members; - for (auto *Arg : Args.filtered(OPT_INPUT)) { - Optional Path = findInputFile(Arg->getValue(), SearchPaths); - if (!Path.hasValue()) { - llvm::errs() << Arg->getValue() << ": no such file or directory\n"; - return 1; - } - Expected MOrErr = - NewArchiveMember::getFile(Saver.save(*Path), /*Deterministic=*/true); - if (!MOrErr) { - handleAllErrors(MOrErr.takeError(), [&](const llvm::ErrorInfoBase &EIB) { - llvm::errs() << Arg->getValue() << ": " << EIB.message() << "\n"; - }); - return 1; - } - sys::fs::file_magic Magic = - sys::fs::identify_magic(MOrErr->Buf->getBuffer()); - if (Magic != sys::fs::file_magic::coff_object && - Magic != sys::fs::file_magic::bitcode && - Magic != sys::fs::file_magic::windows_resource) { - llvm::errs() << Arg->getValue() - << ": not a COFF object, bitcode or resource file\n"; - return 1; - } - Members.emplace_back(std::move(*MOrErr)); - } - - std::pair Result = - llvm::writeArchive(getOutputPath(&Args, Members[0]), Members, - /*WriteSymtab=*/true, object::Archive::K_GNU, - /*Deterministic*/ true, Args.hasArg(OPT_llvmlibthin)); - - if (Result.second) { - if (Result.first.empty()) - Result.first = ArgsArr[0]; - llvm::errs() << Result.first << ": " << Result.second.message() << "\n"; - return 1; - } - - return 0; -} diff --git a/lib/LibDriver/Options.td b/lib/LibDriver/Options.td deleted file mode 100644 index 5a56ef7468d4..000000000000 --- a/lib/LibDriver/Options.td +++ /dev/null @@ -1,25 +0,0 @@ -include "llvm/Option/OptParser.td" - -// lib.exe accepts options starting with either a dash or a slash. - -// Flag that takes no arguments. -class F : Flag<["/", "-", "-?"], name>; - -// Flag that takes one argument after ":". -class P : - Joined<["/", "-", "-?"], name#":">, HelpText; - -def libpath: P<"libpath", "Object file search path">; -def out : P<"out", "Path to file to write output">; - -def llvmlibthin : F<"llvmlibthin">; - -//============================================================================== -// The flags below do nothing. They are defined only for lib.exe compatibility. -//============================================================================== - -class QF : Joined<["/", "-", "-?"], name#":">; - -def ignore : QF<"ignore">; -def machine: QF<"machine">; -def nologo : F<"nologo">; diff --git a/lib/Linker/IRMover.cpp b/lib/Linker/IRMover.cpp index 15a46a2d0420..ecef1efda1a2 100644 --- a/lib/Linker/IRMover.cpp +++ b/lib/Linker/IRMover.cpp @@ -602,6 +602,7 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) { /*insertbefore*/ nullptr, SGVar->getThreadLocalMode(), SGVar->getType()->getAddressSpace()); NewDGV->setAlignment(SGVar->getAlignment()); + NewDGV->copyAttributesFrom(SGVar); return NewDGV; } @@ -610,8 +611,11 @@ GlobalVariable *IRLinker::copyGlobalVariableProto(const GlobalVariable *SGVar) { Function *IRLinker::copyFunctionProto(const Function *SF) { // If there is no linkage to be performed or we are linking from the source, // bring SF over. - return Function::Create(TypeMap.get(SF->getFunctionType()), - GlobalValue::ExternalLinkage, SF->getName(), &DstM); + auto *F = + Function::Create(TypeMap.get(SF->getFunctionType()), + GlobalValue::ExternalLinkage, SF->getName(), &DstM); + F->copyAttributesFrom(SF); + return F; } /// Set up prototypes for any aliases that come over from the source module. 
@@ -619,9 +623,11 @@ GlobalValue *IRLinker::copyGlobalAliasProto(const GlobalAlias *SGA) { // If there is no linkage to be performed or we're linking from the source, // bring over SGA. auto *Ty = TypeMap.get(SGA->getValueType()); - return GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(), - GlobalValue::ExternalLinkage, SGA->getName(), - &DstM); + auto *GA = + GlobalAlias::create(Ty, SGA->getType()->getPointerAddressSpace(), + GlobalValue::ExternalLinkage, SGA->getName(), &DstM); + GA->copyAttributesFrom(SGA); + return GA; } GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV, @@ -648,8 +654,6 @@ GlobalValue *IRLinker::copyGlobalValueProto(const GlobalValue *SGV, else if (SGV->hasExternalWeakLinkage()) NewGV->setLinkage(GlobalValue::ExternalWeakLinkage); - NewGV->copyAttributesFrom(SGV); - if (auto *NewGO = dyn_cast(NewGV)) { // Metadata for global variables and function declarations is copied eagerly. if (isa(SGV) || SGV->isDeclaration()) diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index f7f2253256eb..174397e27396 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -133,6 +133,11 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, // Avoid fixups when possible. int64_t AbsValue; if (Value->evaluateAsAbsolute(AbsValue, getAssembler())) { + if (!isUIntN(8 * Size, AbsValue) && !isIntN(8 * Size, AbsValue)) { + getContext().reportError( + Loc, "value evaluated as " + Twine(AbsValue) + " is out of range."); + return; + } EmitIntValue(AbsValue, Size); return; } diff --git a/lib/MC/MCParser/AsmParser.cpp b/lib/MC/MCParser/AsmParser.cpp index 66ba853da2fe..3b213ef4ce09 100644 --- a/lib/MC/MCParser/AsmParser.cpp +++ b/lib/MC/MCParser/AsmParser.cpp @@ -288,6 +288,7 @@ public: private: bool isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc); + void altMacroString(StringRef AltMacroStr, std::string &Res); bool parseStatement(ParseStatementInfo &Info, MCAsmParserSemaCallback *SI); bool parseCurlyBlockScope(SmallVectorImpl& AsmStrRewrites); @@ -1209,6 +1210,8 @@ bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) { const char *CharPtr = StrLoc.getPointer(); while ((*CharPtr != '>') && (*CharPtr != '\n') && (*CharPtr != '\r') && (*CharPtr != '\0')){ + if(*CharPtr == '!') + CharPtr++; CharPtr++; } if (*CharPtr == '>') { @@ -1218,6 +1221,15 @@ bool AsmParser::isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) { return false; } +/// \brief creating a string without the escape characters '!'. +void AsmParser::altMacroString(StringRef AltMacroStr,std::string &Res) { + for (size_t Pos = 0; Pos < AltMacroStr.size(); Pos++) { + if (AltMacroStr[Pos] == '!') + Pos++; + Res += AltMacroStr[Pos]; + } +} + /// \brief Parse an expression and return it. /// /// expr ::= expr &&,|| expr -> lowest. @@ -2309,6 +2321,15 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body, (*(Token.getString().begin()) == '%') && Token.is(AsmToken::Integer)) // Emit an integer value to the buffer. OS << Token.getIntVal(); + // Only Token that was validated as a string and begins with '<' + // is considered altMacroString!!! + else if ((Lexer.IsaAltMacroMode()) && + (*(Token.getString().begin()) == '<') && + Token.is(AsmToken::String)) { + std::string Res; + altMacroString(Token.getStringContents(), Res); + OS << Res; + } // We expect no quotes around the string's contents when // parsing for varargs. 
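The AsmParser changes above teach altmacro string handling that '!' quotes the next character, both when validating a <...> string and when expanding it (altMacroString strips the escapes). A small standalone helper mirroring that unescaping loop; the function name and the extra bounds guard for a trailing lone '!' are mine, not part of the patch:

#include <cstdio>
#include <string>

// '!' quotes the next character inside an altmacro <...> string, so
// "a!>b" denotes the three characters a, >, b.
static std::string unescapeAltmacro(const std::string &S) {
  std::string Res;
  for (size_t Pos = 0; Pos < S.size(); ++Pos) {
    if (S[Pos] == '!')
      ++Pos;              // skip the escape, keep the escaped character
    if (Pos < S.size())   // guard against a trailing lone '!'
      Res += S[Pos];
  }
  return Res;
}

int main() {
  std::printf("%s\n", unescapeAltmacro("a!>b").c_str()); // prints: a>b
  return 0;
}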
else if (Token.isNot(AsmToken::String) || VarargParameter) diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index b1223e81be43..28531feccfe1 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -1062,7 +1062,7 @@ COFFObjectFile::getSectionContents(const coff_section *Sec, // In COFF, a virtual section won't have any in-file // content, so the file pointer to the content will be zero. if (Sec->PointerToRawData == 0) - return object_error::parse_failed; + return std::error_code(); // The only thing that we need to verify is that the contents is contained // within the file bounds. We don't need to make sure it doesn't cover other // data, as there's nothing that says that is not allowed. @@ -1602,8 +1602,6 @@ ErrorOr> ResourceSectionRef::getDirStringAtOffset(uint32_t Offse uint16_t Length; RETURN_IF_ERROR(Reader.readInteger(Length)); ArrayRef RawDirString; - // Strings are stored as 2-byte aligned unicode characters but readFixedString - // assumes byte string, so we double length. RETURN_IF_ERROR(Reader.readArray(RawDirString, Length)); return RawDirString; } diff --git a/lib/Object/WasmObjectFile.cpp b/lib/Object/WasmObjectFile.cpp index 39f8704aacf2..058686e4db9e 100644 --- a/lib/Object/WasmObjectFile.cpp +++ b/lib/Object/WasmObjectFile.cpp @@ -168,6 +168,13 @@ static wasm::WasmLimits readLimits(const uint8_t *&Ptr) { return Result; } +static wasm::WasmTable readTable(const uint8_t *&Ptr) { + wasm::WasmTable Table; + Table.ElemType = readVarint7(Ptr); + Table.Limits = readLimits(Ptr); + return Table; +} + static Error readSection(WasmSection &Section, const uint8_t *&Ptr, const uint8_t *Start) { // TODO(sbc): Avoid reading past EOF in the case of malformed files. @@ -397,13 +404,22 @@ Error WasmObjectFile::parseImportSection(const uint8_t *Ptr, const uint8_t *End) Sections.size(), i); break; case wasm::WASM_EXTERNAL_GLOBAL: - Im.GlobalType = readVarint7(Ptr); - Im.GlobalMutable = readVaruint1(Ptr); + Im.Global.Type = readVarint7(Ptr); + Im.Global.Mutable = readVaruint1(Ptr); Symbols.emplace_back(Im.Field, WasmSymbol::SymbolType::GLOBAL_IMPORT, Sections.size(), i); break; + case wasm::WASM_EXTERNAL_MEMORY: + Im.Memory = readLimits(Ptr); + break; + case wasm::WASM_EXTERNAL_TABLE: + Im.Table = readTable(Ptr); + if (Im.Table.ElemType != wasm::WASM_TYPE_ANYFUNC) { + return make_error("Invalid table element type", + object_error::parse_failed); + } + break; default: - // TODO(sbc): Handle other kinds of imports return make_error( "Unexpected import kind", object_error::parse_failed); } @@ -431,14 +447,11 @@ Error WasmObjectFile::parseTableSection(const uint8_t *Ptr, const uint8_t *End) uint32_t Count = readVaruint32(Ptr); Tables.reserve(Count); while (Count--) { - wasm::WasmTable Table; - Table.ElemType = readVarint7(Ptr); - if (Table.ElemType != wasm::WASM_TYPE_ANYFUNC) { + Tables.push_back(readTable(Ptr)); + if (Tables.back().ElemType != wasm::WASM_TYPE_ANYFUNC) { return make_error("Invalid table element type", object_error::parse_failed); } - Table.Limits = readLimits(Ptr); - Tables.push_back(Table); } if (Ptr != End) return make_error("Table section ended prematurely", @@ -493,8 +506,10 @@ Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End) Symbols.emplace_back(Ex.Name, WasmSymbol::SymbolType::GLOBAL_EXPORT, Sections.size(), i); break; + case wasm::WASM_EXTERNAL_MEMORY: + case wasm::WASM_EXTERNAL_TABLE: + break; default: - // TODO(sbc): Handle other kinds of exports return make_error( "Unexpected export 
kind", object_error::parse_failed); } @@ -507,7 +522,7 @@ Error WasmObjectFile::parseExportSection(const uint8_t *Ptr, const uint8_t *End) Error WasmObjectFile::parseStartSection(const uint8_t *Ptr, const uint8_t *End) { StartFunction = readVaruint32(Ptr); - if (StartFunction < FunctionTypes.size()) + if (StartFunction >= FunctionTypes.size()) return make_error("Invalid start function", object_error::parse_failed); return Error::success(); @@ -638,10 +653,14 @@ basic_symbol_iterator WasmObjectFile::symbol_end() const { return BasicSymbolRef(Ref, this); } -const WasmSymbol &WasmObjectFile::getWasmSymbol(DataRefImpl Symb) const { +const WasmSymbol &WasmObjectFile::getWasmSymbol(const DataRefImpl &Symb) const { return Symbols[Symb.d.a]; } +const WasmSymbol &WasmObjectFile::getWasmSymbol(const SymbolRef &Symb) const { + return getWasmSymbol(Symb.getRawDataRefImpl()); +} + Expected WasmObjectFile::getSymbolName(DataRefImpl Symb) const { const WasmSymbol &Sym = getWasmSymbol(Symb); return Sym.Name; diff --git a/lib/ObjectYAML/WasmYAML.cpp b/lib/ObjectYAML/WasmYAML.cpp index c5d1b438ee2a..910d32f16af9 100644 --- a/lib/ObjectYAML/WasmYAML.cpp +++ b/lib/ObjectYAML/WasmYAML.cpp @@ -265,8 +265,12 @@ void MappingTraits::mapping(IO &IO, if (Import.Kind == wasm::WASM_EXTERNAL_FUNCTION) { IO.mapRequired("SigIndex", Import.SigIndex); } else if (Import.Kind == wasm::WASM_EXTERNAL_GLOBAL) { - IO.mapRequired("GlobalType", Import.GlobalType); - IO.mapRequired("GlobalMutable", Import.GlobalMutable); + IO.mapRequired("GlobalType", Import.GlobalImport.Type); + IO.mapRequired("GlobalMutable", Import.GlobalImport.Mutable); + } else if (Import.Kind == wasm::WASM_EXTERNAL_TABLE) { + IO.mapRequired("Table", Import.TableImport); + } else if (Import.Kind == wasm::WASM_EXTERNAL_MEMORY ) { + IO.mapRequired("Memory", Import.Memory); } else { llvm_unreachable("unhandled import type"); } diff --git a/lib/ProfileData/SampleProfWriter.cpp b/lib/ProfileData/SampleProfWriter.cpp index b91b6fb7c7ad..b05efa7417b9 100644 --- a/lib/ProfileData/SampleProfWriter.cpp +++ b/lib/ProfileData/SampleProfWriter.cpp @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,32 @@ using namespace llvm; using namespace sampleprof; +std::error_code +SampleProfileWriter::write(const StringMap &ProfileMap) { + if (std::error_code EC = writeHeader(ProfileMap)) + return EC; + + // Sort the ProfileMap by total samples. + typedef std::pair NameFunctionSamples; + std::vector V; + for (const auto &I : ProfileMap) + V.push_back(std::make_pair(I.getKey(), &I.second)); + + std::stable_sort( + V.begin(), V.end(), + [](const NameFunctionSamples &A, const NameFunctionSamples &B) { + if (A.second->getTotalSamples() == B.second->getTotalSamples()) + return A.first > B.first; + return A.second->getTotalSamples() > B.second->getTotalSamples(); + }); + + for (const auto &I : V) { + if (std::error_code EC = write(*I.second)) + return EC; + } + return sampleprof_error::success; +} + /// \brief Write samples to a text file. 
/// /// Note: it may be tempting to implement this in terms of @@ -97,8 +124,7 @@ std::error_code SampleProfileWriterBinary::writeNameIdx(StringRef FName) { } void SampleProfileWriterBinary::addName(StringRef FName) { - auto NextIdx = NameTable.size(); - NameTable.insert(std::make_pair(FName, NextIdx)); + NameTable.insert(std::make_pair(FName, 0)); } void SampleProfileWriterBinary::addNames(const FunctionSamples &S) { @@ -136,10 +162,18 @@ std::error_code SampleProfileWriterBinary::writeHeader( addNames(I.second); } + // Sort the names to make NameTable is deterministic. + std::set V; + for (const auto &I : NameTable) + V.insert(I.first); + int i = 0; + for (const StringRef &N : V) + NameTable[N] = i++; + // Write out the name table. encodeULEB128(NameTable.size(), OS); - for (auto N : NameTable) { - OS << N.first; + for (auto N : V) { + OS << N; encodeULEB128(0, OS); } return sampleprof_error::success; diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index caa0691f9205..17144522db82 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -122,35 +122,38 @@ APInt::APInt(unsigned numbits, StringRef Str, uint8_t radix) fromString(numbits, Str, radix); } +void APInt::reallocate(unsigned NewBitWidth) { + // If the number of words is the same we can just change the width and stop. + if (getNumWords() == getNumWords(NewBitWidth)) { + BitWidth = NewBitWidth; + return; + } + + // If we have an allocation, delete it. + if (!isSingleWord()) + delete [] U.pVal; + + // Update BitWidth. + BitWidth = NewBitWidth; + + // If we are supposed to have an allocation, create it. + if (!isSingleWord()) + U.pVal = getMemory(getNumWords()); +} + void APInt::AssignSlowCase(const APInt& RHS) { // Don't do anything for X = X if (this == &RHS) return; - if (BitWidth == RHS.getBitWidth()) { - // assume same bit-width single-word case is already handled - assert(!isSingleWord()); - memcpy(U.pVal, RHS.U.pVal, getNumWords() * APINT_WORD_SIZE); - return; - } + // Adjust the bit width and handle allocations as necessary. + reallocate(RHS.getBitWidth()); - if (isSingleWord()) { - // assume case where both are single words is already handled - assert(!RHS.isSingleWord()); - U.pVal = getMemory(RHS.getNumWords()); - memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE); - } else if (getNumWords() == RHS.getNumWords()) - memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE); - else if (RHS.isSingleWord()) { - delete [] U.pVal; + // Copy the data. + if (isSingleWord()) U.VAL = RHS.U.VAL; - } else { - delete [] U.pVal; - U.pVal = getMemory(RHS.getNumWords()); - memcpy(U.pVal, RHS.U.pVal, RHS.getNumWords() * APINT_WORD_SIZE); - } - BitWidth = RHS.BitWidth; - clearUnusedBits(); + else + memcpy(U.pVal, RHS.U.pVal, getNumWords() * APINT_WORD_SIZE); } /// This method 'profiles' an APInt for use with FoldingSet. @@ -1138,10 +1141,13 @@ APInt APInt::multiplicativeInverse(const APInt& modulo) const { return APInt(BitWidth, 0); // The next-to-last t is the multiplicative inverse. However, we are - // interested in a positive inverse. Calcuate a positive one from a negative + // interested in a positive inverse. Calculate a positive one from a negative // one if necessary. A simple addition of the modulo suffices because // abs(t[i]) is known to be less than *this/2 (see the link above). - return t[i].isNegative() ? 
t[i] + modulo : t[i]; + if (t[i].isNegative()) + t[i] += modulo; + + return std::move(t[i]); } /// Calculate the magic numbers required to implement a signed integer division @@ -1240,7 +1246,7 @@ APInt::mu APInt::magicu(unsigned LeadingZeros) const { /// from "Art of Computer Programming, Volume 2", section 4.3.1, p. 272. The /// variables here have the same names as in the algorithm. Comments explain /// the algorithm and any deviation from it. -static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, +static void KnuthDiv(uint32_t *u, uint32_t *v, uint32_t *q, uint32_t* r, unsigned m, unsigned n) { assert(u && "Must provide dividend"); assert(v && "Must provide divisor"); @@ -1266,16 +1272,16 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, // overflow. Note that this can require an extra word in u so that u must // be of length m+n+1. unsigned shift = countLeadingZeros(v[n-1]); - unsigned v_carry = 0; - unsigned u_carry = 0; + uint32_t v_carry = 0; + uint32_t u_carry = 0; if (shift) { for (unsigned i = 0; i < m+n; ++i) { - unsigned u_tmp = u[i] >> (32 - shift); + uint32_t u_tmp = u[i] >> (32 - shift); u[i] = (u[i] << shift) | u_carry; u_carry = u_tmp; } for (unsigned i = 0; i < n; ++i) { - unsigned v_tmp = v[i] >> (32 - shift); + uint32_t v_tmp = v[i] >> (32 - shift); v[i] = (v[i] << shift) | v_carry; v_carry = v_tmp; } @@ -1296,11 +1302,11 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, // Set qp = (u[j+n]*b + u[j+n-1]) / v[n-1]. (qp=qprime=q') // Set rp = (u[j+n]*b + u[j+n-1]) % v[n-1]. (rp=rprime=r') // Now test if qp == b or qp*v[n-2] > b*rp + u[j+n-2]; if so, decrease - // qp by 1, inrease rp by v[n-1], and repeat this test if rp < b. The test + // qp by 1, increase rp by v[n-1], and repeat this test if rp < b. The test // on v[n-2] determines at high speed most of the cases in which the trial // value qp is one too large, and it eliminates all cases where qp is two // too large. - uint64_t dividend = ((uint64_t(u[j+n]) << 32) + u[j+n-1]); + uint64_t dividend = Make_64(u[j+n], u[j+n-1]); DEBUG(dbgs() << "KnuthDiv: dividend == " << dividend << '\n'); uint64_t qp = dividend / v[n-1]; uint64_t rp = dividend % v[n-1]; @@ -1323,14 +1329,14 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, int64_t borrow = 0; for (unsigned i = 0; i < n; ++i) { uint64_t p = uint64_t(qp) * uint64_t(v[i]); - int64_t subres = int64_t(u[j+i]) - borrow - (unsigned)p; - u[j+i] = (unsigned)subres; - borrow = (p >> 32) - (subres >> 32); + int64_t subres = int64_t(u[j+i]) - borrow - Lo_32(p); + u[j+i] = Lo_32(subres); + borrow = Hi_32(p) - Hi_32(subres); DEBUG(dbgs() << "KnuthDiv: u[j+i] = " << u[j+i] << ", borrow = " << borrow << '\n'); } bool isNeg = u[j+n] < borrow; - u[j+n] -= (unsigned)borrow; + u[j+n] -= Lo_32(borrow); DEBUG(dbgs() << "KnuthDiv: after subtraction:"); DEBUG(for (int i = m+n; i >=0; i--) dbgs() << " " << u[i]); @@ -1338,7 +1344,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, // D5. [Test remainder.] Set q[j] = qp. If the result of step D4 was // negative, go to step D6; otherwise go on to step D7. - q[j] = (unsigned)qp; + q[j] = Lo_32(qp); if (isNeg) { // D6. [Add back]. The probability that this step is necessary is very // small, on the order of only 2/b. Make sure that test data accounts for @@ -1349,7 +1355,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, // since it cancels with the borrow that occurred in D4. 
bool carry = false; for (unsigned i = 0; i < n; i++) { - unsigned limit = std::min(u[j+i],v[i]); + uint32_t limit = std::min(u[j+i],v[i]); u[j+i] += v[i] + carry; carry = u[j+i] < limit || (carry && u[j+i] == limit); } @@ -1374,7 +1380,7 @@ static void KnuthDiv(unsigned *u, unsigned *v, unsigned *q, unsigned* r, // multiplication by d by using a shift left. So, all we have to do is // shift right here. if (shift) { - unsigned carry = 0; + uint32_t carry = 0; DEBUG(dbgs() << "KnuthDiv: remainder:"); for (int i = n-1; i >= 0; i--) { r[i] = (u[i] >> shift) | carry; @@ -1403,17 +1409,16 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS, // can't use 64-bit operands here because we don't have native results of // 128-bits. Furthermore, casting the 64-bit values to 32-bit values won't // work on large-endian machines. - uint64_t mask = ~0ull >> (sizeof(unsigned)*CHAR_BIT); unsigned n = rhsWords * 2; unsigned m = (lhsWords * 2) - n; // Allocate space for the temporary values we need either on the stack, if // it will fit, or on the heap if it won't. - unsigned SPACE[128]; - unsigned *U = nullptr; - unsigned *V = nullptr; - unsigned *Q = nullptr; - unsigned *R = nullptr; + uint32_t SPACE[128]; + uint32_t *U = nullptr; + uint32_t *V = nullptr; + uint32_t *Q = nullptr; + uint32_t *R = nullptr; if ((Remainder?4:3)*n+2*m+1 <= 128) { U = &SPACE[0]; V = &SPACE[m+n+1]; @@ -1421,34 +1426,34 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS, if (Remainder) R = &SPACE[(m+n+1) + n + (m+n)]; } else { - U = new unsigned[m + n + 1]; - V = new unsigned[n]; - Q = new unsigned[m+n]; + U = new uint32_t[m + n + 1]; + V = new uint32_t[n]; + Q = new uint32_t[m+n]; if (Remainder) - R = new unsigned[n]; + R = new uint32_t[n]; } // Initialize the dividend - memset(U, 0, (m+n+1)*sizeof(unsigned)); + memset(U, 0, (m+n+1)*sizeof(uint32_t)); for (unsigned i = 0; i < lhsWords; ++i) { - uint64_t tmp = (LHS.getNumWords() == 1 ? LHS.U.VAL : LHS.U.pVal[i]); - U[i * 2] = (unsigned)(tmp & mask); - U[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT)); + uint64_t tmp = LHS.getRawData()[i]; + U[i * 2] = Lo_32(tmp); + U[i * 2 + 1] = Hi_32(tmp); } U[m+n] = 0; // this extra word is for "spill" in the Knuth algorithm. // Initialize the divisor - memset(V, 0, (n)*sizeof(unsigned)); + memset(V, 0, (n)*sizeof(uint32_t)); for (unsigned i = 0; i < rhsWords; ++i) { - uint64_t tmp = (RHS.getNumWords() == 1 ? RHS.U.VAL : RHS.U.pVal[i]); - V[i * 2] = (unsigned)(tmp & mask); - V[i * 2 + 1] = (unsigned)(tmp >> (sizeof(unsigned)*CHAR_BIT)); + uint64_t tmp = RHS.getRawData()[i]; + V[i * 2] = Lo_32(tmp); + V[i * 2 + 1] = Hi_32(tmp); } // initialize the quotient and remainder - memset(Q, 0, (m+n) * sizeof(unsigned)); + memset(Q, 0, (m+n) * sizeof(uint32_t)); if (Remainder) - memset(R, 0, n * sizeof(unsigned)); + memset(R, 0, n * sizeof(uint32_t)); // Now, adjust m and n for the Knuth division. n is the number of words in // the divisor. m is the number of words by which the dividend exceeds the @@ -1469,22 +1474,22 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS, // are using base 2^32 instead of base 10. 
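The KnuthDiv and divide() hunks in this region switch the working arrays from unsigned to explicit uint32_t and replace shift/mask idioms with Lo_32/Hi_32/Make_64, so the code no longer silently assumes unsigned is 32 bits wide. Self-contained equivalents of those helpers (the real ones live in llvm/Support/MathExtras.h) and a quick round-trip check:

#include <cassert>
#include <cstdint>

static inline uint32_t Lo_32(uint64_t V) { return static_cast<uint32_t>(V); }
static inline uint32_t Hi_32(uint64_t V) {
  return static_cast<uint32_t>(V >> 32);
}
static inline uint64_t Make_64(uint32_t Hi, uint32_t Lo) {
  return (static_cast<uint64_t>(Hi) << 32) | Lo;
}

int main() {
  uint64_t V = 0x0123456789abcdefULL;
  assert(Lo_32(V) == 0x89abcdefu);
  assert(Hi_32(V) == 0x01234567u);
  // Splitting and rejoining is lossless, which is all the divider needs.
  assert(Make_64(Hi_32(V), Lo_32(V)) == V);
  return 0;
}

Named halves also make the borrow arithmetic in step D4 above much harder to get wrong than the old (unsigned)p and (p >> 32) casts.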
assert(n != 0 && "Divide by zero?"); if (n == 1) { - unsigned divisor = V[0]; - unsigned remainder = 0; - for (int i = m+n-1; i >= 0; i--) { - uint64_t partial_dividend = uint64_t(remainder) << 32 | U[i]; + uint32_t divisor = V[0]; + uint32_t remainder = 0; + for (int i = m; i >= 0; i--) { + uint64_t partial_dividend = Make_64(remainder, U[i]); if (partial_dividend == 0) { Q[i] = 0; remainder = 0; } else if (partial_dividend < divisor) { Q[i] = 0; - remainder = (unsigned)partial_dividend; + remainder = Lo_32(partial_dividend); } else if (partial_dividend == divisor) { Q[i] = 1; remainder = 0; } else { - Q[i] = (unsigned)(partial_dividend / divisor); - remainder = (unsigned)(partial_dividend - (Q[i] * divisor)); + Q[i] = Lo_32(partial_dividend / divisor); + remainder = Lo_32(partial_dividend - (Q[i] * divisor)); } } if (R) @@ -1498,24 +1503,16 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS, // If the caller wants the quotient if (Quotient) { // Set up the Quotient value's memory. - if (Quotient->BitWidth != LHS.BitWidth) { - if (Quotient->isSingleWord()) - Quotient->U.VAL = 0; - else - delete [] Quotient->U.pVal; - Quotient->BitWidth = LHS.BitWidth; - if (!Quotient->isSingleWord()) - Quotient->U.pVal = getClearedMemory(Quotient->getNumWords()); - } else - Quotient->clearAllBits(); + Quotient->reallocate(LHS.BitWidth); + // Clear out any previous bits. + Quotient->clearAllBits(); // The quotient is in Q. Reconstitute the quotient into Quotient's low // order words. // This case is currently dead as all users of divide() handle trivial cases // earlier. if (lhsWords == 1) { - uint64_t tmp = - uint64_t(Q[0]) | (uint64_t(Q[1]) << (APINT_BITS_PER_WORD / 2)); + uint64_t tmp = Make_64(Q[1], Q[0]); if (Quotient->isSingleWord()) Quotient->U.VAL = tmp; else @@ -1523,30 +1520,21 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS, } else { assert(!Quotient->isSingleWord() && "Quotient APInt not large enough"); for (unsigned i = 0; i < lhsWords; ++i) - Quotient->U.pVal[i] = - uint64_t(Q[i*2]) | (uint64_t(Q[i*2+1]) << (APINT_BITS_PER_WORD / 2)); + Quotient->U.pVal[i] = Make_64(Q[i*2+1], Q[i*2]); } } // If the caller wants the remainder if (Remainder) { // Set up the Remainder value's memory. - if (Remainder->BitWidth != RHS.BitWidth) { - if (Remainder->isSingleWord()) - Remainder->U.VAL = 0; - else - delete [] Remainder->U.pVal; - Remainder->BitWidth = RHS.BitWidth; - if (!Remainder->isSingleWord()) - Remainder->U.pVal = getClearedMemory(Remainder->getNumWords()); - } else - Remainder->clearAllBits(); + Remainder->reallocate(RHS.BitWidth); + // Clear out any previous bits. + Remainder->clearAllBits(); // The remainder is in R. Reconstitute the remainder into Remainder's low // order words. 
if (rhsWords == 1) { - uint64_t tmp = - uint64_t(R[0]) | (uint64_t(R[1]) << (APINT_BITS_PER_WORD / 2)); + uint64_t tmp = Make_64(R[1], R[0]); if (Remainder->isSingleWord()) Remainder->U.VAL = tmp; else @@ -1554,8 +1542,7 @@ void APInt::divide(const APInt &LHS, unsigned lhsWords, const APInt &RHS, } else { assert(!Remainder->isSingleWord() && "Remainder APInt not large enough"); for (unsigned i = 0; i < rhsWords; ++i) - Remainder->U.pVal[i] = - uint64_t(R[i*2]) | (uint64_t(R[i*2+1]) << (APINT_BITS_PER_WORD / 2)); + Remainder->U.pVal[i] = Make_64(R[i*2+1], R[i*2]); } } @@ -1578,29 +1565,30 @@ APInt APInt::udiv(const APInt& RHS) const { } // Get some facts about the LHS and RHS number of bits and words - unsigned rhsBits = RHS.getActiveBits(); - unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1); + unsigned lhsWords = getNumWords(getActiveBits()); + unsigned rhsBits = RHS.getActiveBits(); + unsigned rhsWords = getNumWords(rhsBits); assert(rhsWords && "Divided by zero???"); - unsigned lhsBits = this->getActiveBits(); - unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1); // Deal with some degenerate cases if (!lhsWords) // 0 / X ===> 0 return APInt(BitWidth, 0); - else if (lhsWords < rhsWords || this->ult(RHS)) { + if (rhsBits == 1) + // X / 1 ===> X + return *this; + if (lhsWords < rhsWords || this->ult(RHS)) // X / Y ===> 0, iff X < Y return APInt(BitWidth, 0); - } else if (*this == RHS) { + if (*this == RHS) // X / X ===> 1 return APInt(BitWidth, 1); - } else if (lhsWords == 1 && rhsWords == 1) { + if (lhsWords == 1) // rhsWords is 1 if lhsWords is 1. // All high words are zero, just use native divide return APInt(BitWidth, this->U.pVal[0] / RHS.U.pVal[0]); - } // We have to compute it the hard way. Invoke the Knuth divide algorithm. - APInt Quotient(1,0); // to hold result. + APInt Quotient; // to hold result. divide(*this, lhsWords, RHS, rhsWords, &Quotient, nullptr); return Quotient; } @@ -1624,31 +1612,32 @@ APInt APInt::urem(const APInt& RHS) const { } // Get some facts about the LHS - unsigned lhsBits = getActiveBits(); - unsigned lhsWords = !lhsBits ? 0 : (whichWord(lhsBits - 1) + 1); + unsigned lhsWords = getNumWords(getActiveBits()); // Get some facts about the RHS unsigned rhsBits = RHS.getActiveBits(); - unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1); + unsigned rhsWords = getNumWords(rhsBits); assert(rhsWords && "Performing remainder operation by zero ???"); // Check the degenerate cases - if (lhsWords == 0) { + if (lhsWords == 0) // 0 % Y ===> 0 return APInt(BitWidth, 0); - } else if (lhsWords < rhsWords || this->ult(RHS)) { + if (rhsBits == 1) + // X % 1 ===> 0 + return APInt(BitWidth, 0); + if (lhsWords < rhsWords || this->ult(RHS)) // X % Y ===> X, iff X < Y return *this; - } else if (*this == RHS) { + if (*this == RHS) // X % X == 0; return APInt(BitWidth, 0); - } else if (lhsWords == 1) { + if (lhsWords == 1) // All high words are zero, just use native remainder return APInt(BitWidth, U.pVal[0] % RHS.U.pVal[0]); - } // We have to compute it the hard way. Invoke the Knuth divide algorithm. 
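The hunks that follow replace width-1 placeholder results such as APInt Remainder(1,0) with default-constructed APInts that divide() resizes itself through the new reallocate(), so callers no longer guess an output width. A toy version of that resize-in-the-callee out-parameter idiom, with a vector-backed stand-in rather than APInt:

#include <cassert>
#include <vector>

struct BigInt {
  std::vector<unsigned> Words; // little-endian 32-bit words; empty == unsized
  void reallocate(size_t NumWords) { Words.assign(NumWords, 0); }
};

static void divide(const BigInt &Lhs, unsigned Rhs, BigInt *Quotient) {
  if (!Quotient)
    return;
  Quotient->reallocate(Lhs.Words.size()); // the callee picks the right size
  unsigned Carry = 0;
  for (size_t I = Lhs.Words.size(); I-- > 0;) { // most-significant word first
    unsigned long long Cur =
        (static_cast<unsigned long long>(Carry) << 32) | Lhs.Words[I];
    Quotient->Words[I] = static_cast<unsigned>(Cur / Rhs);
    Carry = static_cast<unsigned>(Cur % Rhs); // Carry ends as the remainder
  }
}

int main() {
  BigInt A;
  A.Words = {7, 0, 1}; // represents 2^64 + 7
  BigInt Q;            // default-constructed, like 'APInt Quotient;'
  divide(A, 2, &Q);
  assert(Q.Words.size() == 3); // sized by the callee, not the caller
  return 0;
}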
-  APInt Remainder(1,0);
+  APInt Remainder;
   divide(*this, lhsWords, RHS, rhsWords, nullptr, &Remainder);
   return Remainder;
 }
@@ -1667,22 +1656,23 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
 void APInt::udivrem(const APInt &LHS, const APInt &RHS,
                     APInt &Quotient, APInt &Remainder) {
   assert(LHS.BitWidth == RHS.BitWidth && "Bit widths must be the same");
+  unsigned BitWidth = LHS.BitWidth;
 
   // First, deal with the easy case
   if (LHS.isSingleWord()) {
     assert(RHS.U.VAL != 0 && "Divide by zero?");
     uint64_t QuotVal = LHS.U.VAL / RHS.U.VAL;
     uint64_t RemVal = LHS.U.VAL % RHS.U.VAL;
-    Quotient = APInt(LHS.BitWidth, QuotVal);
-    Remainder = APInt(LHS.BitWidth, RemVal);
+    Quotient = APInt(BitWidth, QuotVal);
+    Remainder = APInt(BitWidth, RemVal);
     return;
   }
 
   // Get some size facts about the dividend and divisor
-  unsigned lhsBits = LHS.getActiveBits();
-  unsigned lhsWords = !lhsBits ? 0 : (APInt::whichWord(lhsBits - 1) + 1);
+  unsigned lhsWords = getNumWords(LHS.getActiveBits());
   unsigned rhsBits = RHS.getActiveBits();
-  unsigned rhsWords = !rhsBits ? 0 : (APInt::whichWord(rhsBits - 1) + 1);
+  unsigned rhsWords = getNumWords(rhsBits);
+  assert(rhsWords && "Performing divrem operation by zero ???");
 
   // Check the degenerate cases
   if (lhsWords == 0) {
@@ -1691,6 +1681,12 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
     return;
   }
 
+  if (rhsBits == 1) {
+    Quotient = LHS;                   // X / 1 ===> X
+    Remainder = 0;                    // X % 1 ===> 0
+    return;
+  }
+
   if (lhsWords < rhsWords || LHS.ult(RHS)) {
     Remainder = LHS;            // X % Y ===> X, iff X < Y
     Quotient = 0;               // X / Y ===> 0, iff X < Y
@@ -1703,12 +1698,15 @@ void APInt::udivrem(const APInt &LHS, const APInt &RHS,
     return;
   }
 
-  if (lhsWords == 1 && rhsWords == 1) {
+  if (lhsWords == 1) { // rhsWords is 1 if lhsWords is 1.
     // There is only one word to consider so use the native versions.
-    uint64_t lhsValue = LHS.isSingleWord() ? LHS.U.VAL : LHS.U.pVal[0];
-    uint64_t rhsValue = RHS.isSingleWord() ? RHS.U.VAL : RHS.U.pVal[0];
-    Quotient = APInt(LHS.getBitWidth(), lhsValue / rhsValue);
-    Remainder = APInt(LHS.getBitWidth(), lhsValue % rhsValue);
+    uint64_t lhsValue = LHS.U.pVal[0];
+    uint64_t rhsValue = RHS.U.pVal[0];
+    // Make sure there is enough space to hold the results.
+    Quotient.reallocate(BitWidth);
+    Remainder.reallocate(BitWidth);
+    Quotient = lhsValue / rhsValue;
+    Remainder = lhsValue % rhsValue;
     return;
   }
@@ -1723,12 +1721,12 @@ void APInt::sdivrem(const APInt &LHS, const APInt &RHS,
       APInt::udivrem(-LHS, -RHS, Quotient, Remainder);
     else {
       APInt::udivrem(-LHS, RHS, Quotient, Remainder);
-      Quotient = -Quotient;
+      Quotient.negate();
     }
-    Remainder = -Remainder;
+    Remainder.negate();
   } else if (RHS.isNegative()) {
     APInt::udivrem(LHS, -RHS, Quotient, Remainder);
-    Quotient = -Quotient;
+    Quotient.negate();
   } else {
     APInt::udivrem(LHS, RHS, Quotient, Remainder);
   }
@@ -1859,10 +1857,8 @@ void APInt::fromString(unsigned numbits, StringRef str, uint8_t radix) {
     *this += digit;
   }
   // If its negative, put it in two's complement form
-  if (isNeg) {
-    --(*this);
-    this->flipAllBits();
-  }
+  if (isNeg)
+    this->negate();
 }
 
 void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
@@ -1940,8 +1936,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
       // They want to print the signed version and it is a negative value
      // Flip the bits and add one to turn it into the equivalent positive
       // value and put a '-' in the result.
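The conversions in the surrounding hunks fold two hand-written two's-complement idioms, flip-then-increment and decrement-then-flip, into a single negate() call. A quick standalone check that all three forms agree on 32-bit values:

#include <cassert>
#include <cstdint>

int main() {
  // negate() is two's-complement negation: ~V + 1, equivalently ~(V - 1).
  for (uint32_t V : {0u, 1u, 42u, 0x80000000u, 0xffffffffu}) {
    (void)V; // keeps -Wunused quiet if asserts are compiled out
    assert(~V + 1u == 0u - V);  // flip all bits, then add one
    assert(~(V - 1u) == 0u - V); // decrement, then flip all bits
  }
  return 0;
}

Besides reading better, the single call lets the in-place negation skip one full carry pass over the word array.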
- Tmp.flipAllBits(); - ++Tmp; + Tmp.negate(); Str.push_back('-'); } @@ -1961,22 +1956,19 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix, unsigned ShiftAmt = (Radix == 16 ? 4 : (Radix == 8 ? 3 : 1)); unsigned MaskAmt = Radix - 1; - while (Tmp != 0) { + while (Tmp.getBoolValue()) { unsigned Digit = unsigned(Tmp.getRawData()[0]) & MaskAmt; Str.push_back(Digits[Digit]); Tmp.lshrInPlace(ShiftAmt); } } else { - APInt divisor(Radix == 10? 4 : 8, Radix); - while (Tmp != 0) { - APInt APdigit(1, 0); - APInt tmp2(Tmp.getBitWidth(), 0); - divide(Tmp, Tmp.getNumWords(), divisor, divisor.getNumWords(), &tmp2, - &APdigit); + APInt divisor(Tmp.getBitWidth(), Radix); + APInt APdigit; + while (Tmp.getBoolValue()) { + udivrem(Tmp, divisor, Tmp, APdigit); unsigned Digit = (unsigned)APdigit.getZExtValue(); assert(Digit < Radix && "divide failed"); Str.push_back(Digits[Digit]); - Tmp = tmp2; } } @@ -2346,13 +2338,11 @@ int APInt::tcMultiply(WordType *dst, const WordType *lhs, return overflow; } -/* DST = LHS * RHS, where DST has width the sum of the widths of the operands. No overflow occurs. DST must be disjoint from both operands. Returns the number of parts required to hold the result. */ -unsigned APInt::tcFullMultiply(WordType *dst, const WordType *lhs, - const WordType *rhs, unsigned lhsParts, - unsigned rhsParts) { +/// DST = LHS * RHS, where DST has width the sum of the widths of the +/// operands. No overflow occurs. DST must be disjoint from both operands. +void APInt::tcFullMultiply(WordType *dst, const WordType *lhs, + const WordType *rhs, unsigned lhsParts, + unsigned rhsParts) { /* Put the narrower number on the LHS for less loops below. */ if (lhsParts > rhsParts) return tcFullMultiply (dst, rhs, lhs, rhsParts, lhsParts); @@ -2363,10 +2353,6 @@ unsigned APInt::tcFullMultiply(WordType *dst, const WordType *lhs, for (unsigned i = 0; i < lhsParts; i++) tcMultiplyPart(&dst[i], rhs, lhs[i], 0, rhsParts, rhsParts + 1, true); - - unsigned n = lhsParts + rhsParts; - - return n - (dst[n - 1] == 0); } /* If RHS is zero LHS and REMAINDER are left unchanged, return one. @@ -2400,22 +2386,20 @@ int APInt::tcDivide(WordType *lhs, const WordType *rhs, /* Loop, subtracting SRHS if REMAINDER is greater and adding that to the total.
*/ for (;;) { - int compare; - - compare = tcCompare(remainder, srhs, parts); - if (compare >= 0) { - tcSubtract(remainder, srhs, 0, parts); - lhs[n] |= mask; - } + int compare = tcCompare(remainder, srhs, parts); + if (compare >= 0) { + tcSubtract(remainder, srhs, 0, parts); + lhs[n] |= mask; + } - if (shiftCount == 0) - break; - shiftCount--; - tcShiftRight(srhs, parts, 1); - if ((mask >>= 1) == 0) { - mask = (WordType) 1 << (APINT_BITS_PER_WORD - 1); - n--; - } + if (shiftCount == 0) + break; + shiftCount--; + tcShiftRight(srhs, parts, 1); + if ((mask >>= 1) == 0) { + mask = (WordType) 1 << (APINT_BITS_PER_WORD - 1); + n--; + } } return false; diff --git a/lib/Support/CMakeLists.txt b/lib/Support/CMakeLists.txt index 63c440037c22..83376284548f 100644 --- a/lib/Support/CMakeLists.txt +++ b/lib/Support/CMakeLists.txt @@ -81,6 +81,7 @@ add_llvm_library(LLVMSupport MD5.cpp NativeFormatting.cpp Options.cpp + Parallel.cpp PluginLoader.cpp PrettyStackTrace.cpp RandomNumberGenerator.cpp diff --git a/lib/Support/Parallel.cpp b/lib/Support/Parallel.cpp new file mode 100644 index 000000000000..ab2cfdebf07d --- /dev/null +++ b/lib/Support/Parallel.cpp @@ -0,0 +1,138 @@ +//===- llvm/Support/Parallel.cpp - Parallel algorithms --------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/Parallel.h" +#include "llvm/Config/llvm-config.h" + +#include <atomic> +#include <stack> +#include <thread> + +using namespace llvm; + +namespace { + +/// \brief An abstract class that takes closures and runs them asynchronously. +class Executor { +public: + virtual ~Executor() = default; + virtual void add(std::function<void()> func) = 0; + + static Executor *getDefaultExecutor(); +}; + +#if !LLVM_ENABLE_THREADS +class SyncExecutor : public Executor { +public: + virtual void add(std::function<void()> F) { F(); } +}; + +Executor *Executor::getDefaultExecutor() { + static SyncExecutor Exec; + return &Exec; +} + +#elif defined(_MSC_VER) +/// \brief An Executor that runs tasks via ConcRT. +class ConcRTExecutor : public Executor { + struct Taskish { + Taskish(std::function<void()> Task) : Task(Task) {} + + std::function<void()> Task; + + static void run(void *P) { + Taskish *Self = static_cast<Taskish *>(P); + Self->Task(); + concurrency::Free(Self); + } + }; + +public: + virtual void add(std::function<void()> F) { + Concurrency::CurrentScheduler::ScheduleTask( + Taskish::run, new (concurrency::Alloc(sizeof(Taskish))) Taskish(F)); + } +}; + +Executor *Executor::getDefaultExecutor() { + static ConcRTExecutor exec; + return &exec; +} + +#else +/// \brief An implementation of an Executor that runs closures on a thread pool +/// in filo order. +class ThreadPoolExecutor : public Executor { +public: + explicit ThreadPoolExecutor( + unsigned ThreadCount = std::thread::hardware_concurrency()) + : Done(ThreadCount) { + // Spawn all but one of the threads in another thread as spawning threads + // can take a while. + std::thread([&, ThreadCount] { + for (size_t i = 1; i < ThreadCount; ++i) { + std::thread([=] { work(); }).detach(); + } + work(); + }).detach(); + } + + ~ThreadPoolExecutor() override { + std::unique_lock<std::mutex> Lock(Mutex); + Stop = true; + Lock.unlock(); + Cond.notify_all(); + // Wait for ~Latch.
+ } + + void add(std::function<void()> F) override { + std::unique_lock<std::mutex> Lock(Mutex); + WorkStack.push(F); + Lock.unlock(); + Cond.notify_one(); + } + +private: + void work() { + while (true) { + std::unique_lock<std::mutex> Lock(Mutex); + Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); }); + if (Stop) + break; + auto Task = WorkStack.top(); + WorkStack.pop(); + Lock.unlock(); + Task(); + } + Done.dec(); + } + + std::atomic<bool> Stop{false}; + std::stack<std::function<void()>> WorkStack; + std::mutex Mutex; + std::condition_variable Cond; + parallel::detail::Latch Done; +}; + +Executor *Executor::getDefaultExecutor() { + static ThreadPoolExecutor exec; + return &exec; +} +#endif +} + +#if LLVM_ENABLE_THREADS +void parallel::detail::TaskGroup::spawn(std::function<void()> F) { + L.inc(); + Executor::getDefaultExecutor()->add([&, F] { + F(); + L.dec(); + }); +} +#endif diff --git a/lib/Support/Unix/Path.inc b/lib/Support/Unix/Path.inc index fa28ba1b6ab6..cdea09be41e0 100644 --- a/lib/Support/Unix/Path.inc +++ b/lib/Support/Unix/Path.inc @@ -103,13 +103,16 @@ #define STATVFS_F_FLAG(vfs) (vfs).f_flags #endif +#if defined(__FreeBSD__) || defined(__NetBSD__) +#include <sys/sysctl.h> +#endif + using namespace llvm; namespace llvm { namespace sys { namespace fs { -#if defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \ - defined(__OpenBSD__) || defined(__minix) || defined(__FreeBSD_kernel__) || \ +#if defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \ defined(__linux__) || defined(__CYGWIN__) || defined(__DragonFly__) || \ defined(_AIX) static int @@ -164,7 +167,7 @@ getprogpath(char ret[PATH_MAX], const char *bin) free(pv); return nullptr; } -#endif // __FreeBSD__ || __NetBSD__ || __FreeBSD_kernel__ +#endif // Bitrig || OpenBSD || minix || linux || CYGWIN || DragonFly || AIX /// GetMainExecutable - Return the path to the main executable, given the /// value of argv[0] from program startup. @@ -180,9 +183,24 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) { if (realpath(exe_path, link_path)) return link_path; } -#elif defined(__FreeBSD__) || defined (__NetBSD__) || defined(__Bitrig__) || \ - defined(__OpenBSD__) || defined(__minix) || defined(__DragonFly__) || \ - defined(__FreeBSD_kernel__) || defined(_AIX) +#elif defined(__FreeBSD__) || defined(__FreeBSD_kernel__) || defined(__NetBSD__) + int mib[4]; + mib[0] = CTL_KERN; +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) + mib[1] = KERN_PROC; + mib[2] = KERN_PROC_PATHNAME; + mib[3] = -1; +#else + mib[1] = KERN_PROC_ARGS; + mib[2] = -1; + mib[3] = KERN_PROC_PATHNAME; +#endif + char exe_path[PATH_MAX]; + size_t cb = sizeof(exe_path); + if (sysctl(mib, 4, exe_path, &cb, NULL, 0) == 0) + return exe_path; +#elif defined(__Bitrig__) || defined(__OpenBSD__) || defined(__minix) || \ + defined(__DragonFly__) || defined(_AIX) char exe_path[PATH_MAX]; if (getprogpath(exe_path, argv0) != NULL) diff --git a/lib/Support/Unix/Process.inc b/lib/Support/Unix/Process.inc index 16f8f5a98e52..1d0143c6716e 100644 --- a/lib/Support/Unix/Process.inc +++ b/lib/Support/Unix/Process.inc @@ -347,7 +347,7 @@ static bool terminalHasColors(int fd) { MutexGuard G(*TermColorMutex); int errret = 0; - if (setupterm((char *)nullptr, fd, &errret) != 0) + if (setupterm(nullptr, fd, &errret) != 0) // Regardless of why, if we can't get terminfo, we shouldn't try to print // colors. return false; @@ -369,7 +369,7 @@ static bool terminalHasColors(int fd) { // Now extract the structure allocated by setupterm and free its memory // through a really silly dance.
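Callers never touch these executors directly: work enters through parallel::detail::TaskGroup::spawn above, which ties every task to the group's Latch so the group can be awaited. A minimal usage sketch, assuming the TaskGroup and Latch interfaces declared in llvm/Support/Parallel.h, which this file implements:

#include "llvm/Support/Parallel.h"
#include <atomic>

int main() {
  std::atomic<int> Sum{0};
  {
    llvm::parallel::detail::TaskGroup TG;
    for (int I = 1; I <= 100; ++I)
      TG.spawn([&Sum, I] { Sum += I; }); // queued onto the shared pool
  } // ~TaskGroup waits on the Latch: all spawned tasks are done here
  return Sum == 5050 ? 0 : 1;
}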
- struct term *termp = set_curterm((struct term *)nullptr); + struct term *termp = set_curterm(nullptr); (void)del_curterm(termp); // Drop any errors here. // Return true if we found a color capabilities for the current terminal. diff --git a/lib/Target/AArch64/AArch64.td b/lib/Target/AArch64/AArch64.td index 73f2b6a25f66..4af5fef4287c 100644 --- a/lib/Target/AArch64/AArch64.td +++ b/lib/Target/AArch64/AArch64.td @@ -216,6 +216,7 @@ def ProcA72 : SubtargetFeature<"a72", "ARMProcFamily", "CortexA72", FeatureCRC, FeatureCrypto, FeatureFPARMv8, + FeatureFuseAES, FeatureNEON, FeaturePerfMon ]>; diff --git a/lib/Target/AArch64/AArch64CallLowering.cpp b/lib/Target/AArch64/AArch64CallLowering.cpp index ff3e4c40e2c2..29f6d571d6bd 100644 --- a/lib/Target/AArch64/AArch64CallLowering.cpp +++ b/lib/Target/AArch64/AArch64CallLowering.cpp @@ -380,7 +380,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, MIRBuilder.buildSequence(OrigRet.Reg, SplitRegs, RegOffsets); } - CallSeqStart.addImm(Handler.StackSize); + CallSeqStart.addImm(Handler.StackSize).addImm(0); MIRBuilder.buildInstr(AArch64::ADJCALLSTACKUP) .addImm(Handler.StackSize) .addImm(0); diff --git a/lib/Target/AArch64/AArch64FastISel.cpp b/lib/Target/AArch64/AArch64FastISel.cpp index 083708001757..9ac7ecb9cdb4 100644 --- a/lib/Target/AArch64/AArch64FastISel.cpp +++ b/lib/Target/AArch64/AArch64FastISel.cpp @@ -3014,7 +3014,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI, // Issue CALLSEQ_START unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes); + .addImm(NumBytes).addImm(0); // Process the args. for (CCValAssign &VA : ArgLocs) { diff --git a/lib/Target/AArch64/AArch64ISelLowering.cpp b/lib/Target/AArch64/AArch64ISelLowering.cpp index 4b1bb27dce73..4f7c2e122390 100644 --- a/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2265,7 +2265,7 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op, SDValue Callee = DAG.getExternalSymbol(LibcallName, getPointerTy(DAG.getDataLayout())); - StructType *RetTy = StructType::get(ArgTy, ArgTy, nullptr); + StructType *RetTy = StructType::get(ArgTy, ArgTy); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) .setChain(DAG.getEntryNode()) @@ -3249,9 +3249,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, - true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, AArch64::SP, getPointerTy(DAG.getDataLayout())); diff --git a/lib/Target/AArch64/AArch64InstrInfo.cpp b/lib/Target/AArch64/AArch64InstrInfo.cpp index cb268828455e..c42738da7ab0 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -3427,6 +3427,10 @@ static bool getFMAPatterns(MachineInstr &Root, Patterns.push_back(MachineCombinerPattern::FMLSv1i32_indexed_OP2); Found = true; } + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULSrr)) { + Patterns.push_back(MachineCombinerPattern::FNMULSUBS_OP1); + Found = true; + } break; case AArch64::FSUBDrr: if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FMULDrr)) { @@ -3441,6 +3445,10 @@ static bool getFMAPatterns(MachineInstr &Root, Patterns.push_back(MachineCombinerPattern::FMLSv1i64_indexed_OP2); Found = true; } + if (canCombineWithFMUL(MBB, Root.getOperand(1), AArch64::FNMULDrr)) { + Patterns.push_back(MachineCombinerPattern::FNMULSUBD_OP1); + Found = true; + } break; case AArch64::FSUBv2f32: if (canCombineWithFMUL(MBB, Root.getOperand(2), @@ -3495,6 +3503,8 @@ AArch64InstrInfo::isThroughputPattern(MachineCombinerPattern Pattern) const { case MachineCombinerPattern::FMULADDD_OP2: case MachineCombinerPattern::FMULSUBD_OP1: case MachineCombinerPattern::FMULSUBD_OP2: + case MachineCombinerPattern::FNMULSUBS_OP1: + case MachineCombinerPattern::FNMULSUBD_OP1: case MachineCombinerPattern::FMLAv1i32_indexed_OP1: case MachineCombinerPattern::FMLAv1i32_indexed_OP2: case MachineCombinerPattern::FMLAv1i64_indexed_OP1: @@ -3996,6 +4006,24 @@ void AArch64InstrInfo::genAlternativeCodeSequence( MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); break; } + + case MachineCombinerPattern::FNMULSUBS_OP1: + case MachineCombinerPattern::FNMULSUBD_OP1: { + // FNMUL I=A,B,0 + // FSUB R,I,C + // ==> FNMADD R,A,B,C // = -A*B - C + // --- Create(FNMADD); + if (Pattern == MachineCombinerPattern::FNMULSUBS_OP1) { + Opc = AArch64::FNMADDSrrr; + RC = &AArch64::FPR32RegClass; + } else { + Opc = AArch64::FNMADDDrrr; + RC = &AArch64::FPR64RegClass; + } + MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 1, Opc, RC); + break; + } + case MachineCombinerPattern::FMULSUBS_OP2: case MachineCombinerPattern::FMULSUBD_OP2: { // FMUL I=A,B,0 @@ -4011,6 +4039,7 @@ void AArch64InstrInfo::genAlternativeCodeSequence( } MUL = genFusedMultiply(MF, MRI, TII, Root, InsInstrs, 2, Opc, RC); break; + } case MachineCombinerPattern::FMLSv1i32_indexed_OP2: Opc = AArch64::FMLSv1i32_indexed; @@ -4067,7 +4096,6 @@ void AArch64InstrInfo::genAlternativeCodeSequence( FMAInstKind::Accumulator); } break; - } } // end switch (Pattern) // Record MUL and ADD/SUB for deletion DelInstrs.push_back(MUL); diff --git a/lib/Target/AArch64/AArch64InstrInfo.td b/lib/Target/AArch64/AArch64InstrInfo.td index 902b08844216..5ddf66654a67 100644 --- a/lib/Target/AArch64/AArch64InstrInfo.td +++ b/lib/Target/AArch64/AArch64InstrInfo.td @@ -156,7 +156,8 @@ def AArch64adrp : SDNode<"AArch64ISD::ADRP", SDTIntUnaryOp, []>; def AArch64addlow : SDNode<"AArch64ISD::ADDlow", SDTIntBinOp, []>; def AArch64LOADgot : SDNode<"AArch64ISD::LOADgot", SDTIntUnaryOp>; def AArch64callseq_start : SDNode<"ISD::CALLSEQ_START", - SDCallSeqStart<[ SDTCisVT<0, i32> ]>, + 
SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>, [SDNPHasChain, SDNPOutGlue]>; def AArch64callseq_end : SDNode<"ISD::CALLSEQ_END", SDCallSeqEnd<[ SDTCisVT<0, i32>, @@ -328,8 +329,9 @@ include "AArch64InstrFormats.td" let Defs = [SP], Uses = [SP], hasSideEffects = 1, isCodeGenOnly = 1 in { // We set Sched to empty list because we expect these instructions to simply get // removed in most cases. -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - [(AArch64callseq_start timm:$amt)]>, Sched<[]>; +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(AArch64callseq_start timm:$amt1, timm:$amt2)]>, + Sched<[]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(AArch64callseq_end timm:$amt1, timm:$amt2)]>, Sched<[]>; diff --git a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp index 5f895903da6f..789270c2a34b 100644 --- a/lib/Target/AArch64/AArch64RegisterBankInfo.cpp +++ b/lib/Target/AArch64/AArch64RegisterBankInfo.cpp @@ -529,9 +529,34 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { // for the greedy mode the cost of the cross bank copy will // offset this number. // FIXME: Should be derived from the scheduling model. - if (OpRegBankIdx[0] >= PMI_FirstFPR) + if (OpRegBankIdx[0] != PMI_FirstGPR) Cost = 2; + else + // Check if that load feeds fp instructions. + // In that case, we want the default mapping to be on FPR + // instead of blind map every scalar to GPR. + for (const MachineInstr &UseMI : + MRI.use_instructions(MI.getOperand(0).getReg())) + // If we have at least one direct use in a FP instruction, + // assume this was a floating point load in the IR. + // If it was not, we would have had a bitcast before + // reaching that instruction. + if (isPreISelGenericFloatingPointOpcode(UseMI.getOpcode())) { + OpRegBankIdx[0] = PMI_FirstFPR; + break; + } break; + case TargetOpcode::G_STORE: + // Check if that store is fed by fp instructions. + if (OpRegBankIdx[0] == PMI_FirstGPR) { + unsigned VReg = MI.getOperand(0).getReg(); + if (!VReg) + break; + MachineInstr *DefMI = MRI.getVRegDef(VReg); + if (isPreISelGenericFloatingPointOpcode(DefMI->getOpcode())) + OpRegBankIdx[0] = PMI_FirstFPR; + break; + } } // Finally construct the computed mapping. 
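The G_LOAD and G_STORE mapping changes above compensate for generic MIR carrying no FP/integer distinction on memory operations: a load whose direct users are floating-point instructions (or a store fed by one) is assigned to FPR so the value is not first materialized in a GPR and copied across banks. In source terms this is the shape being targeted (illustrative):

// Both loads feed only an fadd, so mapping them straight to FPR avoids
// two GPR-to-FPR copies; without the use scan they would default to GPR.
float sumPair(const float *P) { return P[0] + P[1]; }

// Store side: the fmul result is produced in FPR and can be stored directly.
void scaleInPlace(float *P, float F) { *P *= F; }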
diff --git a/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/lib/Target/AArch64/AArch64SchedFalkorDetails.td index 8f8eeef8a6cf..a9b4d44a523e 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorDetails.td +++ b/lib/Target/AArch64/AArch64SchedFalkorDetails.td @@ -42,11 +42,11 @@ def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v1i32|v1 def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTXNv1i64)>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i32|v4i16)(_shift)?$")>; -def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; -def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FMULX16, FMULX32)>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FMULX16, FMULX32)>; -def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>; -def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FMULX64)>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^(FMUL|FMULX)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FMULX64)>; def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(FABS|FNEG)(v2f64|v4f32|v8f16)$")>; @@ -62,9 +62,9 @@ def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVT(N|M|P|Z|A)(S|U)(v2f64|v4 def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(FCVTL|FCVTL2)(v2i32|v4i16|v4i32|v8i16)$")>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^FCVTZ(S|U)(v2i64|v4i32|v8i16)(_shift)?$")>; -def : InstRW<[FalkorWr_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instregex "^(FMUL|FMULX)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; -def : InstRW<[FalkorWr_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instregex "^(FMUL|FMULX)v2i64_indexed$")>; def : InstRW<[FalkorWr_3VXVY_4cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v1i32|v1i64|v1f16|v2f32|v4f16)$")>; @@ -72,13 +72,14 @@ def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^(FCVTX?N|FCVTX?N2)(v2i32|v4i1 def : InstRW<[FalkorWr_2VX_2VY_2cyc], (instregex "^(FDIV|FSQRT)(v2f64|v4f32|v8f16)$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v8i8|v4i16|v2i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^ML(A|S)(v16i8|v8i16|v4i32|v2i64)(_indexed)?$")>; + +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)v1i64_indexed$")>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc, FalkorReadFMA32], (instregex "^FML(A|S)(v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc, FalkorReadFMA64], (instregex "^FML(A|S)(v2f64|v2i64_indexed)$")>; -def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^FML(A|S)(v2f32|v4f16|(v1i16_indexed|v4i16_indexed|v1i32_indexed|v2i32_indexed))$")>; -def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v1i64_indexed$")>; -def : InstRW<[FalkorWr_2VXVY_5cyc, FalkorReadFMA],(instregex 
"^FML(A|S)(v2f64|v4f32|v8f16|v8i16_indexed|v4i32_indexed)$")>; -def : InstRW<[FalkorWr_2VXVY_6cyc, FalkorReadFMA],(instregex "^FML(A|S)v2i64_indexed$")>; // SIMD Integer Instructions // ----------------------------------------------------------------------------- def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^ADD(v1i64|v2i32|v4i16|v8i8)$")>; @@ -119,10 +120,10 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instregex "^SQNEG(v1i8|v1i16|v1i32|v1i64) def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)ADDLVv8i8v$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)?(MAX|MIN)V(v8i8v|v8i16v)$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs ADDVv8i8v)>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^SQRDML(A|S)?H(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^MUL(v2i32|v4i16|v8i8)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQR?DMULH(v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc], (instregex "^SQDMULL(i16|i32)$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(i16|i32|v8i8|v4i16|v1i32|v2i32|v1i16)(_indexed)?$")>; def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^(S|U)?(MAX|MIN)Vv16i8v$")>; @@ -169,9 +170,9 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^PMULL2?(v1i64|v2i64)$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^S(L|R)I(v16i8|v8i16|v4i32|v2i64)_shift$")>; def : InstRW<[FalkorWr_2VXVY_3cyc], (instregex "^SQ(ABS|NEG)(v16i8|v8i16|v4i32|v2i64)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^SQDMULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQRDML(A|S)H(v16i8|v8i16|v4i32)(_indexed)?$")>; def : InstRW<[FalkorWr_3VXVY_3cyc], (instregex "^(S|U)ADDLVv4i32v$")>; @@ -185,8 +186,9 @@ def : InstRW<[FalkorWr_4VXVY_3cyc], (instregex "^(S|U)ABALv.*$")>; def : InstRW<[FalkorWr_4VXVY_4cyc], (instregex "^(S|U)ABA(v16i8|v8i16|v4i32)$")>; -def : InstRW<[FalkorWr_1VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)(i16|i32)$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc, FalkorReadVMA],(instregex "^SQD(MLAL|MLSL)v.*$")>; +def : InstRW<[FalkorWr_VMUL32_1VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)(i16|i32|v1i32_indexed|v1i64_indexed)$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^SQD(MLAL|MLSL)v[248].*$")>; + // SIMD Load Instructions // ----------------------------------------------------------------------------- def : InstRW<[WriteVLD], (instregex "^LD1(i64|Onev(8b|4h|2s|1d|16b|8h|4s|2d))$")>; @@ -294,9 +296,9 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPEv1i32, FRECPEv1i64, FRSQRTEv def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FRECPXv1i32, FRECPXv1i64)>; def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs URECPEv2i32, URSQRTEv2i32)>; -def : InstRW<[FalkorWr_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, 
FRSQRTSv2f32)>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instrs FRECPS32, FRSQRTS32, FRECPSv2f32, FRSQRTSv2f32)>; -def : InstRW<[FalkorWr_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instrs FRECPS64, FRSQRTS64)>; def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],(instregex "^INSv(i32|i64)(gpr|lane)$")>; def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>; @@ -311,9 +313,9 @@ def : InstRW<[FalkorWr_2VXVY_3cyc], (instrs URECPEv4i32, URSQRTEv4i32)>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instrs TBLv8i8Two)>; def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^TBX(v8|v16)i8One$")>; -def : InstRW<[FalkorWr_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>; +def : InstRW<[FalkorWr_FMUL32_2VXVY_5cyc], (instrs FRECPSv4f32, FRSQRTSv4f32)>; -def : InstRW<[FalkorWr_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>; +def : InstRW<[FalkorWr_FMUL64_2VXVY_6cyc], (instrs FRECPSv2f64, FRSQRTSv2f64)>; def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBL(v8i8Three|v16i8Two)$")>; def : InstRW<[FalkorWr_3VXVY_5cyc], (instregex "^TBX(v8i8Two|v16i8Two)$")>; @@ -416,22 +418,25 @@ def : InstRW<[FalkorWr_1VXVY_3cyc], (instrs FCVTSHr, FCVTDHr)>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instrs FCVTSDr, FCVTDSr)>; -def : InstRW<[FalkorWr_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc], (instregex "^F(N)?MUL(H|S)rr$")>; -def : InstRW<[FalkorWr_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc], (instregex "^F(N)?MULDrr$")>; def : InstRW<[FalkorWr_1VX_1VY_10cyc],(instregex "^FDIV(H|S|D)rr$")>; def : InstRW<[FalkorWr_1VX_1VY_2cyc], (instregex "^FSQRT(H|S|D)r$")>; -def : InstRW<[FalkorWr_1VXVY_5cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>; -def : InstRW<[FalkorWr_1VXVY_6cyc, FalkorReadFMA],(instregex "^F(N)?M(ADD|SUB)Drrr$")>; +def : InstRW<[FalkorWr_FMUL32_1VXVY_5cyc, ReadDefault, ReadDefault, FalkorReadFMA32], (instregex "^F(N)?M(ADD|SUB)(H|S)rrr$")>; +def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc, ReadDefault, ReadDefault, FalkorReadFMA64], (instregex "^F(N)?M(ADD|SUB)Drrr$")>; // FP Miscellaneous Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>; +def : InstRW<[FalkorWr_FMOV], (instregex "^FMOV(H|S|D)i$")>; +def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^FMOV(HW|HX|SW|DX|DXHigh)r$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FCVTZ(S|U)(S|U)(W|X)(D|S)ri?$")>; def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^FMOV(WH|WS|XH|XD|XDHigh)r$")>; -def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hi|Hr|S0|Si|Sr|D0|Di|Dr|v.*_ns)$")>; +def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^FMOV(Hr|Sr|Dr|v.*_ns)$")>; +// FIXME: We are currently generating movi v0.2d, #0 for these, which is worse than fmov 0.0 +def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs FMOVD0, FMOVS0)>; def : InstRW<[FalkorWr_1GTOV_4cyc], (instregex "^(S|U)CVTF(S|U)(W|X)(D|S)ri$")>; def : InstRW<[FalkorWr_1VXVY_4cyc], (instregex "^(S|U)CVTF(v1i16|v1i32|v2i32|v1i64|v4i16|v2f32|v4f16|d|s)(_shift)?")>; @@ -475,16 +480,17 @@ def : InstRW<[FalkorWr_2XYZ_2cyc], (instregex "^EXTR(W|X)rri$")>; // Divide and Multiply Instructions // ----------------------------------------------------------------------------- -def : InstRW<[FalkorWr_1X_4cyc], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; -def : InstRW<[FalkorWr_1X_4cyc], (instregex "^M(ADD|SUB)Wrrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_4cyc, ReadDefault, 
ReadDefault, FalkorReadIMA64], (instregex "^(S|U)M(ADD|SUB)Lrrr$")>; +def : InstRW<[FalkorWr_IMUL32_1X_2cyc, ReadDefault, ReadDefault, FalkorReadIMA32], (instregex "^M(ADD|SUB)Wrrr$")>; -def : InstRW<[FalkorWr_1X_5cyc], (instregex "^(S|U)MULHrr$")>; -def : InstRW<[FalkorWr_1X_5cyc], (instregex "^M(ADD|SUB)Xrrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_5cyc], (instregex "^(S|U)MULHrr$")>; +def : InstRW<[FalkorWr_IMUL64_1X_5cyc, ReadDefault, ReadDefault, FalkorReadIMA64], (instregex "^M(ADD|SUB)Xrrr$")>; def : InstRW<[FalkorWr_1X_1Z_8cyc], (instregex "^(S|U)DIVWr$")>; def : InstRW<[FalkorWr_1X_1Z_16cyc], (instregex "^(S|U)DIVXr$")>; -def : InstRW<[FalkorWr_2VXVY_4cyc], (instregex "^(S|U)(MLAL|MLSL|MULL)v.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc], (instregex "^(S|U)MULLv.*$")>; +def : InstRW<[FalkorWr_VMUL32_2VXVY_4cyc, FalkorReadVMA], (instregex "^(S|U)(MLAL|MLSL)v.*$")>; // Move and Shift Instructions // ----------------------------------------------------------------------------- diff --git a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td index e64b2c441a19..6526cc28e806 100644 --- a/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td +++ b/lib/Target/AArch64/AArch64SchedFalkorWriteRes.td @@ -29,8 +29,9 @@ // Define 1 micro-op types def FalkorWr_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 2; } -def FalkorWr_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } -def FalkorWr_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } +def FalkorWr_IMUL32_1X_2cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_4cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 4; } +def FalkorWr_IMUL64_1X_5cyc : SchedWriteRes<[FalkorUnitX]> { let Latency = 5; } def FalkorWr_1Z_0cyc : SchedWriteRes<[FalkorUnitZ]> { let Latency = 0; } def FalkorWr_1ZB_0cyc : SchedWriteRes<[FalkorUnitZB]> { let Latency = 0; } def FalkorWr_1LD_3cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 3; } @@ -45,8 +46,10 @@ def FalkorWr_1VXVY_1cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 1; } def FalkorWr_1VXVY_2cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 2; } def FalkorWr_1VXVY_3cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 3; } def FalkorWr_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } +def FalkorWr_VMUL32_1VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 4; } def FalkorWr_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } -def FalkorWr_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } +def FalkorWr_FMUL32_1VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 5; } +def FalkorWr_FMUL64_1VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY]>{ let Latency = 6; } def FalkorWr_1LD_0cyc : SchedWriteRes<[FalkorUnitLD]> { let Latency = 0; } def FalkorWr_1ST_0cyc : SchedWriteRes<[FalkorUnitST]> { let Latency = 0; } @@ -75,14 +78,26 @@ def FalkorWr_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 4; let NumMicroOps = 2; } +def FalkorWr_VMUL32_2VXVY_4cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 4; + let NumMicroOps = 2; +} def FalkorWr_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 5; let NumMicroOps = 2; } +def FalkorWr_FMUL32_2VXVY_5cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 5; + let NumMicroOps = 2; +} def FalkorWr_2VXVY_6cyc : SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { let Latency = 6; let NumMicroOps = 2; } +def FalkorWr_FMUL64_2VXVY_6cyc : 
SchedWriteRes<[FalkorUnitVXVY, FalkorUnitVXVY]> { + let Latency = 6; + let NumMicroOps = 2; +} def FalkorWr_1LD_1VXVY_4cyc : SchedWriteRes<[FalkorUnitLD, FalkorUnitVXVY]> { let Latency = 4; @@ -350,18 +365,17 @@ def FalkorWr_2LD_2VXVY_1XYZ_2LD_2VXVY_4cyc:SchedWriteRes<[FalkorUnitLD, let NumMicroOps = 9; } -// Forwarding logic is modeled for vector multiply and accumulate +// Forwarding logic is modeled for multiply add/accumulate. // ----------------------------------------------------------------------------- -def FalkorReadVMA : SchedReadAdvance<2, [FalkorWr_1VXVY_4cyc, - FalkorWr_2VXVY_4cyc]>; -def FalkorReadFMA : SchedReadAdvance<3, [FalkorWr_1VXVY_5cyc, - FalkorWr_1VXVY_6cyc, - FalkorWr_2VXVY_5cyc, - FalkorWr_2VXVY_6cyc]>; +def FalkorReadIMA32 : SchedReadAdvance<3, [FalkorWr_IMUL32_1X_2cyc]>; +def FalkorReadIMA64 : SchedReadAdvance<4, [FalkorWr_IMUL64_1X_4cyc, FalkorWr_IMUL64_1X_5cyc]>; +def FalkorReadVMA : SchedReadAdvance<3, [FalkorWr_VMUL32_1VXVY_4cyc, FalkorWr_VMUL32_2VXVY_4cyc]>; +def FalkorReadFMA32 : SchedReadAdvance<1, [FalkorWr_FMUL32_1VXVY_5cyc, FalkorWr_FMUL32_2VXVY_5cyc]>; +def FalkorReadFMA64 : SchedReadAdvance<2, [FalkorWr_FMUL64_1VXVY_6cyc, FalkorWr_FMUL64_2VXVY_6cyc]>; // SchedPredicates and WriteVariants for Immediate Zero and LSLFast // ----------------------------------------------------------------------------- -def FalkorImmZPred : SchedPredicate<[{TII->isGPRZero(*MI)}]>; +def FalkorImmZPred : SchedPredicate<[{MI->getOperand(1).getImm() == 0}]>; def FalkorLSLFastPred : SchedPredicate<[{TII->isFalkorLSLFast(*MI)}]>; def FalkorWr_FMOV : SchedWriteVariant<[ @@ -378,7 +392,6 @@ def FalkorWr_LDR : SchedWriteVariant<[ def FalkorWr_ADD : SchedWriteVariant<[ SchedVar, - SchedVar, SchedVar]>; def FalkorWr_PRFM : SchedWriteVariant<[ diff --git a/lib/Target/AArch64/AArch64Subtarget.cpp b/lib/Target/AArch64/AArch64Subtarget.cpp index abdeac019a18..1c81d34014fd 100644 --- a/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/lib/Target/AArch64/AArch64Subtarget.cpp @@ -91,6 +91,8 @@ void AArch64Subtarget::initializeProperties() { case Falkor: MaxInterleaveFactor = 4; VectorInsertExtractBaseCost = 2; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case Kryo: MaxInterleaveFactor = 4; @@ -99,6 +101,8 @@ void AArch64Subtarget::initializeProperties() { PrefetchDistance = 740; MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 11; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case ThunderX2T99: CacheLineSize = 64; @@ -108,6 +112,8 @@ void AArch64Subtarget::initializeProperties() { PrefetchDistance = 128; MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 4; + // FIXME: remove this to enable 64-bit SLP if performance looks good. + MinVectorRegisterBitWidth = 128; break; case ThunderX: case ThunderXT88: @@ -116,6 +122,8 @@ void AArch64Subtarget::initializeProperties() { CacheLineSize = 128; PrefFunctionAlignment = 3; PrefLoopAlignment = 2; + // FIXME: remove this to enable 64-bit SLP if performance looks good. 
+ MinVectorRegisterBitWidth = 128; break; case CortexA35: break; case CortexA53: break; diff --git a/lib/Target/AArch64/AArch64Subtarget.h b/lib/Target/AArch64/AArch64Subtarget.h index 5b9bee6e41b8..df54bf3f48e1 100644 --- a/lib/Target/AArch64/AArch64Subtarget.h +++ b/lib/Target/AArch64/AArch64Subtarget.h @@ -83,6 +83,9 @@ protected: // NegativeImmediates - transform instructions with negative immediates bool NegativeImmediates = true; + // Enable 64-bit vectorization in SLP. + unsigned MinVectorRegisterBitWidth = 64; + bool UseAA = false; bool PredictableSelectIsExpensive = false; bool BalanceFPOps = false; @@ -106,6 +109,7 @@ protected: unsigned PrefFunctionAlignment = 0; unsigned PrefLoopAlignment = 0; unsigned MaxJumpTableSize = 0; + unsigned WideningBaseCost = 0; // ReserveX18 - X18 is not available as a general purpose register. bool ReserveX18; @@ -190,6 +194,10 @@ public: bool isXRaySupported() const override { return true; } + unsigned getMinVectorRegisterBitWidth() const { + return MinVectorRegisterBitWidth; + } + bool isX18Reserved() const { return ReserveX18; } bool hasFPARMv8() const { return HasFPARMv8; } bool hasNEON() const { return HasNEON; } @@ -228,6 +236,8 @@ public: unsigned getMaximumJumpTableSize() const { return MaxJumpTableSize; } + unsigned getWideningBaseCost() const { return WideningBaseCost; } + /// CPU has TBI (top byte of addresses is ignored during HW address /// translation) and OS enables it. bool supportsAddressTopByteIgnored() const; diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/lib/Target/AArch64/AArch64TargetObjectFile.cpp index 8875f9b72647..12a2e9a867f0 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.cpp +++ b/lib/Target/AArch64/AArch64TargetObjectFile.cpp @@ -70,3 +70,11 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel( const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext()); return MCBinaryExpr::createSub(Res, PC, getContext()); } + +void AArch64_MachoTargetObjectFile::getNameWithPrefix( + SmallVectorImpl<char> &OutName, const GlobalValue *GV, + const TargetMachine &TM) const { + // AArch64 does not use section-relative relocations so any global symbol must + // be accessed via at least a linker-private symbol. + getMangler().getNameWithPrefix(OutName, GV, /* CannotUsePrivateLabel */ true); +} diff --git a/lib/Target/AArch64/AArch64TargetObjectFile.h b/lib/Target/AArch64/AArch64TargetObjectFile.h index 05e1dfa9e6c9..47e3bce43f6e 100644 --- a/lib/Target/AArch64/AArch64TargetObjectFile.h +++ b/lib/Target/AArch64/AArch64TargetObjectFile.h @@ -40,6 +40,9 @@ public: const MCValue &MV, int64_t Offset, MachineModuleInfo *MMI, MCStreamer &Streamer) const override; + + void getNameWithPrefix(SmallVectorImpl<char> &OutName, const GlobalValue *GV, + const TargetMachine &TM) const override; }; } // end namespace llvm diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 4d59da0c646d..7c6f55c06bce 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -176,11 +176,95 @@ AArch64TTIImpl::getPopcntSupport(unsigned TyWidth) { return TTI::PSK_Software; } +bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode, + ArrayRef<const Value *> Args) { + + // A helper that returns a vector type from the given type. The number of + // elements in type Ty determine the vector width.
+ auto toVectorTy = [&](Type *ArgTy) { + return VectorType::get(ArgTy->getScalarType(), + DstTy->getVectorNumElements()); + }; + + // Exit early if DstTy is not a vector type whose elements are at least + // 16-bits wide. + if (!DstTy->isVectorTy() || DstTy->getScalarSizeInBits() < 16) + return false; + + // Determine if the operation has a widening variant. We consider both the + // "long" (e.g., usubl) and "wide" (e.g., usubw) versions of the + // instructions. + // + // TODO: Add additional widening operations (e.g., mul, shl, etc.) once we + // verify that their extending operands are eliminated during code + // generation. + switch (Opcode) { + case Instruction::Add: // UADDL(2), SADDL(2), UADDW(2), SADDW(2). + case Instruction::Sub: // USUBL(2), SSUBL(2), USUBW(2), SSUBW(2). + break; + default: + return false; + } + + // To be a widening instruction (either the "wide" or "long" versions), the + // second operand must be a sign- or zero extend having a single user. We + // only consider extends having a single user because they may otherwise not + // be eliminated. + if (Args.size() != 2 || + (!isa<SExtInst>(Args[1]) && !isa<ZExtInst>(Args[1])) || + !Args[1]->hasOneUse()) + return false; + auto *Extend = cast<CastInst>(Args[1]); + + // Legalize the destination type and ensure it can be used in a widening + // operation. + auto DstTyL = TLI->getTypeLegalizationCost(DL, DstTy); + unsigned DstElTySize = DstTyL.second.getScalarSizeInBits(); + if (!DstTyL.second.isVector() || DstElTySize != DstTy->getScalarSizeInBits()) + return false; + + // Legalize the source type and ensure it can be used in a widening + // operation. + Type *SrcTy = toVectorTy(Extend->getSrcTy()); + auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy); + unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits(); + if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits()) + return false; + + // Get the total number of vector elements in the legalized types. + unsigned NumDstEls = DstTyL.first * DstTyL.second.getVectorNumElements(); + unsigned NumSrcEls = SrcTyL.first * SrcTyL.second.getVectorNumElements(); + + // Return true if the legalized types have the same number of vector elements + // and the destination element type size is twice that of the source type. + return NumDstEls == NumSrcEls && 2 * SrcElTySize == DstElTySize; +} + int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, const Instruction *I) { int ISD = TLI->InstructionOpcodeToISD(Opcode); assert(ISD && "Invalid opcode"); + // If the cast is observable, and it is used by a widening instruction (e.g., + // uaddl, saddw, etc.), it may be free. + if (I && I->hasOneUse()) { + auto *SingleUser = cast<Instruction>(*I->user_begin()); + SmallVector<const Value *, 4> Operands(SingleUser->operand_values()); + if (isWideningInstruction(Dst, SingleUser->getOpcode(), Operands)) { + // If the cast is the second operand, it is free. We will generate either + // a "wide" or "long" version of the widening instruction. + if (I == SingleUser->getOperand(1)) + return 0; + // If the cast is not the second operand, it will be free if it looks the + // same as the second operand. In this case, we will generate a "long" + // version of the widening instruction. + if (auto *Cast = dyn_cast<CastInst>(SingleUser->getOperand(1))) + if (I->getOpcode() == Cast->getOpcode() && + cast<CastInst>(I)->getSrcTy() == Cast->getSrcTy()) + return 0; + } + } + EVT SrcTy = TLI->getValueType(DL, Src); EVT DstTy = TLI->getValueType(DL, Dst); @@ -379,6 +463,16 @@ int AArch64TTIImpl::getArithmeticInstrCost( // Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty); + // If the instruction is a widening instruction (e.g., uaddl, saddw, etc.), + // add in the widening overhead specified by the sub-target. Since the + // extends feeding widening instructions are performed automatically, they + // aren't present in the generated code and have a zero cost. By adding a + // widening overhead here, we attach the total cost of the combined operation + // to the widening instruction. + int Cost = 0; + if (isWideningInstruction(Ty, Opcode, Args)) + Cost += ST->getWideningBaseCost(); + int ISD = TLI->InstructionOpcodeToISD(Opcode); if (ISD == ISD::SDIV && @@ -388,9 +482,9 @@ int AArch64TTIImpl::getArithmeticInstrCost( // normally expanded to the sequence ADD + CMP + SELECT + SRA. // The OperandValue properties many not be same as that of previous // operation; conservatively assume OP_None. - int Cost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, - TargetTransformInfo::OP_None); + Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info, - TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None, + TargetTransformInfo::OP_None); Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info, TargetTransformInfo::OP_None, TargetTransformInfo::OP_None); @@ -405,8 +499,8 @@ int AArch64TTIImpl::getArithmeticInstrCost( switch (ISD) { default: - return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, - Opd1PropInfo, Opd2PropInfo); + return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info, + Opd1PropInfo, Opd2PropInfo); case ISD::ADD: case ISD::MUL: case ISD::XOR: case ISD::OR: case ISD::AND: // These nodes are marked as 'custom' for combining purposes only. // We know that they are legal. See LowerAdd in ISelLowering.
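The widening bookkeeping above rewards IR in which the extend feeding the second operand folds into the arithmetic instruction itself. Roughly the source shape being modeled (illustrative): each 8-bit element of B is zero-extended and added to a 16-bit lane of A, which AArch64 emits as a single uaddw per vector, so the explicit extend never appears and getCastInstrCost() can price it at zero.

#include <cstdint>

// After vectorization this is add <8 x i16> %a, (zext <8 x i8> %b), i.e.
// uaddw v0.8h, v0.8h, v1.8b; the extend is folded into the instruction.
void addWiden(uint16_t *A, const uint8_t *B, int N) {
  for (int I = 0; I < N; ++I)
    A[I] = uint16_t(A[I] + B[I]);
}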
- return 1 * LT.first; + return (Cost + 1) * LT.first; } } diff --git a/lib/Target/AArch64/AArch64TargetTransformInfo.h b/lib/Target/AArch64/AArch64TargetTransformInfo.h index e37c003e064c..280d97f3c502 100644 --- a/lib/Target/AArch64/AArch64TargetTransformInfo.h +++ b/lib/Target/AArch64/AArch64TargetTransformInfo.h @@ -43,6 +43,9 @@ class AArch64TTIImpl : public BasicTTIImplBase { VECTOR_LDST_FOUR_ELEMENTS }; + bool isWideningInstruction(Type *Ty, unsigned Opcode, + ArrayRef Args); + public: explicit AArch64TTIImpl(const AArch64TargetMachine *TM, const Function &F) : BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)), @@ -84,6 +87,10 @@ public: return 64; } + unsigned getMinVectorRegisterBitWidth() { + return ST->getMinVectorRegisterBitWidth(); + } + unsigned getMaxInterleaveFactor(unsigned VF); int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src, @@ -134,6 +141,10 @@ public: unsigned getMinPrefetchStride(); unsigned getMaxPrefetchIterationsAhead(); + + bool shouldExpandReduction(const IntrinsicInst *II) const { + return false; + } /// @} }; diff --git a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index 4dbcc9581a84..449d732a8d44 100644 --- a/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -3904,10 +3904,14 @@ bool AArch64AsmParser::parseDirectiveArch(SMLoc L) { return false; } +static SMLoc incrementLoc(SMLoc L, int Offset) { + return SMLoc::getFromPointer(L.getPointer() + Offset); +} + /// parseDirectiveCPU /// ::= .cpu id bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { - SMLoc CPULoc = getLoc(); + SMLoc CurLoc = getLoc(); StringRef CPU, ExtensionString; std::tie(CPU, ExtensionString) = @@ -3923,15 +3927,19 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { // FIXME This is using tablegen data, but should be moved to ARMTargetParser // once that is tablegen'ed if (!getSTI().isCPUStringValid(CPU)) { - Error(CPULoc, "unknown CPU name"); + Error(CurLoc, "unknown CPU name"); return false; } MCSubtargetInfo &STI = copySTI(); STI.setDefaultFeatures(CPU, ""); + CurLoc = incrementLoc(CurLoc, CPU.size()); FeatureBitset Features = STI.getFeatureBits(); for (auto Name : RequestedExtensions) { + // Advance source location past '+'. + CurLoc = incrementLoc(CurLoc, 1); + bool EnableFeature = true; if (Name.startswith_lower("no")) { @@ -3939,6 +3947,7 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { Name = Name.substr(2); } + bool FoundExtension = false; for (const auto &Extension : ExtensionMap) { if (Extension.Name != Name) continue; @@ -3952,9 +3961,15 @@ bool AArch64AsmParser::parseDirectiveCPU(SMLoc L) { uint64_t Features = ComputeAvailableFeatures(STI.ToggleFeature(ToggleFeatures)); setAvailableFeatures(Features); + FoundExtension = true; break; } + + if (!FoundExtension) + Error(CurLoc, "unsupported architectural extension"); + + CurLoc = incrementLoc(CurLoc, Name.size()); } return false; } diff --git a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp index 94112849f84e..1b28df963b40 100644 --- a/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp +++ b/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp @@ -32,8 +32,9 @@ static cl::opt AsmWriterVariant( clEnumValN(Apple, "apple", "Emit Apple-style NEON assembly"))); AArch64MCAsmInfoDarwin::AArch64MCAsmInfoDarwin() { - // We prefer NEON instructions to be printed in the short form. 
- AssemblerDialect = AsmWriterVariant == Default ? 1 : AsmWriterVariant; + // We prefer NEON instructions to be printed in the short, Apple-specific + // form when targeting Darwin. + AssemblerDialect = AsmWriterVariant == Default ? Apple : AsmWriterVariant; PrivateGlobalPrefix = "L"; PrivateLabelPrefix = "L"; @@ -68,8 +69,9 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) { if (T.getArch() == Triple::aarch64_be) IsLittleEndian = false; - // We prefer NEON instructions to be printed in the short form. - AssemblerDialect = AsmWriterVariant == Default ? 0 : AsmWriterVariant; + // We prefer NEON instructions to be printed in the generic form when + // targeting ELF. + AssemblerDialect = AsmWriterVariant == Default ? Generic : AsmWriterVariant; CodePointerSize = 8; diff --git a/lib/Target/AMDGPU/AMDGPU.h b/lib/Target/AMDGPU/AMDGPU.h index 8f6e1e7d8846..3f89702bed50 100644 --- a/lib/Target/AMDGPU/AMDGPU.h +++ b/lib/Target/AMDGPU/AMDGPU.h @@ -50,6 +50,10 @@ FunctionPass *createSIDebuggerInsertNopsPass(); FunctionPass *createSIInsertWaitsPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createAMDGPUCodeGenPreparePass(const GCNTargetMachine *TM = nullptr); +FunctionPass *createAMDGPUMachineCFGStructurizerPass(); + +void initializeAMDGPUMachineCFGStructurizerPass(PassRegistry&); +extern char &AMDGPUMachineCFGStructurizerID; ModulePass *createAMDGPUAnnotateKernelFeaturesPass(const TargetMachine *TM = nullptr); void initializeAMDGPUAnnotateKernelFeaturesPass(PassRegistry &); diff --git a/lib/Target/AMDGPU/AMDGPU.td b/lib/Target/AMDGPU/AMDGPU.td index 2e5b78bbf7ef..b279bd61e180 100644 --- a/lib/Target/AMDGPU/AMDGPU.td +++ b/lib/Target/AMDGPU/AMDGPU.td @@ -61,6 +61,24 @@ def FeatureFlatAddressSpace : SubtargetFeature<"flat-address-space", "Support flat address space" >; +def FeatureFlatInstOffsets : SubtargetFeature<"flat-inst-offsets", + "FlatInstOffsets", + "true", + "Flat instructions have immediate offset addressing mode" +>; + +def FeatureFlatGlobalInsts : SubtargetFeature<"flat-global-insts", + "FlatGlobalInsts", + "true", + "Have global_* flat memory instructions" +>; + +def FeatureFlatScratchInsts : SubtargetFeature<"flat-scratch-insts", + "FlatScratchInsts", + "true", + "Have scratch_* flat memory instructions" +>; + def FeatureUnalignedBufferAccess : SubtargetFeature<"unaligned-buffer-access", "UnalignedBufferAccess", "true", @@ -407,7 +425,8 @@ def FeatureGFX9 : SubtargetFeatureGeneration<"GFX9", FeatureGCN3Encoding, FeatureCIInsts, Feature16BitInsts, FeatureSMemRealTime, FeatureScalarStores, FeatureInv2PiInlineImm, FeatureApertureRegs, FeatureGFX9Insts, FeatureVOP3P, FeatureVGPRIndexMode, - FeatureFastFMAF32, FeatureDPP + FeatureFastFMAF32, FeatureDPP, + FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts ] >; diff --git a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp index ccae36ced1f8..7c99752b881f 100644 --- a/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp @@ -136,8 +136,7 @@ private: bool SelectMUBUFIntrinsicVOffset(SDValue Offset, SDValue &SOffset, SDValue &ImmOffset, SDValue &VOffset) const; - bool SelectFlat(SDValue Addr, SDValue &VAddr, - SDValue &SLC, SDValue &TFE) const; + bool SelectFlat(SDValue Addr, SDValue &VAddr, SDValue &SLC) const; bool SelectSMRDOffset(SDValue ByteOffsetNode, SDValue &Offset, bool &Imm) const; @@ -1278,10 +1277,9 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFIntrinsicVOffset(SDValue Offset, bool AMDGPUDAGToDAGISel::SelectFlat(SDValue 
Addr, SDValue &VAddr, - SDValue &SLC, - SDValue &TFE) const { + SDValue &SLC) const { VAddr = Addr; - TFE = SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); + SLC = CurDAG->getTargetConstant(0, SDLoc(), MVT::i1); return true; } diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp index 915d1d9e0e68..f80652b87373 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.cpp +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.cpp @@ -567,13 +567,19 @@ static bool hasSourceMods(const SDNode *N) { case AMDGPUISD::INTERP_P1: case AMDGPUISD::INTERP_P2: case AMDGPUISD::DIV_SCALE: + + // TODO: Should really be looking at the users of the bitcast. These are + // problematic because bitcasts are used to legalize all stores to integer + // types. + case ISD::BITCAST: return false; default: return true; } } -static bool allUsesHaveSourceMods(const SDNode *N, unsigned CostThreshold = 4) { +bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N, + unsigned CostThreshold) { // Some users (such as 3-operand FMA/MAD) must use a VOP3 encoding, and thus // it is truly free to use a source modifier in all cases. If there are // multiple users but for each one will necessitate using VOP3, there will be @@ -2299,7 +2305,7 @@ static bool isU24(SDValue Op, SelectionDAG &DAG) { EVT VT = Op.getValueType(); DAG.computeKnownBits(Op, Known); - return (VT.getSizeInBits() - Known.Zero.countLeadingOnes()) <= 24; + return (VT.getSizeInBits() - Known.countMinLeadingZeros()) <= 24; } static bool isI24(SDValue Op, SelectionDAG &DAG) { diff --git a/lib/Target/AMDGPU/AMDGPUISelLowering.h b/lib/Target/AMDGPU/AMDGPUISelLowering.h index e1a5a2072418..4c588a7bafd0 100644 --- a/lib/Target/AMDGPU/AMDGPUISelLowering.h +++ b/lib/Target/AMDGPU/AMDGPUISelLowering.h @@ -132,6 +132,8 @@ public: return false; } + static bool allUsesHaveSourceMods(const SDNode *N, + unsigned CostThreshold = 4); bool isFAbsFree(EVT VT) const override; bool isFNegFree(EVT VT) const override; bool isTruncateFree(EVT Src, EVT Dest) const override; diff --git a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp index 8867ed689a31..a7eac080f885 100644 --- a/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp +++ b/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp @@ -127,9 +127,9 @@ bool AMDGPUInstructionSelector::selectG_STORE(MachineInstr &I) const { .add(I.getOperand(1)) .add(I.getOperand(0)) .addImm(0) - .addImm(0) .addImm(0); + // Now that we selected an opcode, we need to constrain the register // operands to use appropriate classes. 
bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); @@ -393,7 +393,6 @@ bool AMDGPUInstructionSelector::selectG_LOAD(MachineInstr &I) const { .add(I.getOperand(0)) .addReg(PtrReg) .addImm(0) - .addImm(0) .addImm(0); bool Ret = constrainSelectedInstRegOperands(*Flat, TII, TRI, RBI); diff --git a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp index a2567a549028..9de302994e68 100644 --- a/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp @@ -33,6 +33,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo() { const LLT P1 = LLT::pointer(1, 64); const LLT P2 = LLT::pointer(2, 64); + setAction({G_CONSTANT, S32}, Legal); setAction({G_CONSTANT, S64}, Legal); setAction({G_GEP, P1}, Legal); diff --git a/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp new file mode 100644 index 000000000000..6d2785ba1c60 --- /dev/null +++ b/lib/Target/AMDGPU/AMDGPUMachineCFGStructurizer.cpp @@ -0,0 +1,2881 @@ +//===- AMDGPUMachineCFGStructurizer.cpp - Machine code if conversion pass. ===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file implements the machine instruction level CFG structurizer pass. +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "SIInstrInfo.h" +#include "AMDGPUSubtarget.h" +#include "llvm/ADT/DenseSet.h" +#include "llvm/ADT/PostOrderIterator.h" +#include "llvm/ADT/SetVector.h" +#include "llvm/ADT/SmallPtrSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/Analysis/CFG.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegionInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/Passes.h" +#include "llvm/IR/DebugLoc.h" +#include "llvm/Support/Debug.h" +#include "llvm/Target/TargetInstrInfo.h" +#include "llvm/Target/TargetLowering.h" +#include "llvm/Target/TargetSubtargetInfo.h" +#include <tuple> +using namespace llvm; + +#define DEBUG_TYPE "amdgpucfgstructurizer" + +namespace { +class PHILinearizeDestIterator; + +class PHILinearize { + friend class PHILinearizeDestIterator; + +public: + typedef std::pair<unsigned, MachineBasicBlock *> PHISourceT; + +private: + typedef DenseSet<PHISourceT> PHISourcesT; + typedef struct { + unsigned DestReg; + DebugLoc DL; + PHISourcesT Sources; + } PHIInfoElementT; + typedef SmallPtrSet<PHIInfoElementT *, 2> PHIInfoT; + PHIInfoT PHIInfo; + + static unsigned phiInfoElementGetDest(PHIInfoElementT *Info); + static void phiInfoElementSetDef(PHIInfoElementT *Info, unsigned NewDef); + static PHISourcesT &phiInfoElementGetSources(PHIInfoElementT *Info); + static void phiInfoElementAddSource(PHIInfoElementT *Info, unsigned SourceReg, + MachineBasicBlock *SourceMBB); + static void phiInfoElementRemoveSource(PHIInfoElementT *Info, + unsigned SourceReg, + MachineBasicBlock *SourceMBB); + PHIInfoElementT *findPHIInfoElement(unsigned DestReg); + PHIInfoElementT *findPHIInfoElementFromSource(unsigned SourceReg, + MachineBasicBlock *SourceMBB); + +public: + bool findSourcesFromMBB(MachineBasicBlock *SourceMBB, + SmallVector<unsigned, 4> &Sources); + void addDest(unsigned DestReg, const DebugLoc &DL); + void replaceDef(unsigned OldDestReg, unsigned
NewDestReg); + void deleteDef(unsigned DestReg); + void addSource(unsigned DestReg, unsigned SourceReg, + MachineBasicBlock *SourceMBB); + void removeSource(unsigned DestReg, unsigned SourceReg, + MachineBasicBlock *SourceMBB = nullptr); + bool findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB, + unsigned &DestReg); + bool isSource(unsigned Reg, MachineBasicBlock *SourceMBB = nullptr); + unsigned getNumSources(unsigned DestReg); + void dump(MachineRegisterInfo *MRI); + void clear(); + + typedef PHISourcesT::iterator source_iterator; + typedef PHILinearizeDestIterator dest_iterator; + + dest_iterator dests_begin(); + dest_iterator dests_end(); + + source_iterator sources_begin(unsigned Reg); + source_iterator sources_end(unsigned Reg); +}; + +class PHILinearizeDestIterator { +private: + PHILinearize::PHIInfoT::iterator Iter; + +public: + unsigned operator*() { return PHILinearize::phiInfoElementGetDest(*Iter); } + PHILinearizeDestIterator &operator++() { + ++Iter; + return *this; + } + bool operator==(const PHILinearizeDestIterator &I) const { + return I.Iter == Iter; + } + bool operator!=(const PHILinearizeDestIterator &I) const { + return I.Iter != Iter; + } + + PHILinearizeDestIterator(PHILinearize::PHIInfoT::iterator I) : Iter(I) {} +}; + +unsigned PHILinearize::phiInfoElementGetDest(PHIInfoElementT *Info) { + return Info->DestReg; +} + +void PHILinearize::phiInfoElementSetDef(PHIInfoElementT *Info, + unsigned NewDef) { + Info->DestReg = NewDef; +} + +PHILinearize::PHISourcesT & +PHILinearize::phiInfoElementGetSources(PHIInfoElementT *Info) { + return Info->Sources; +} + +void PHILinearize::phiInfoElementAddSource(PHIInfoElementT *Info, + unsigned SourceReg, + MachineBasicBlock *SourceMBB) { + // Assertion ensures we don't use the same SourceMBB for the + // sources, because we cannot have different registers with + // identical predecessors, but we can have the same register for + // multiple predecessors. 
+#if !defined(NDEBUG)
+  for (auto SI : phiInfoElementGetSources(Info)) {
+    assert((SI.second != SourceMBB || SourceReg == SI.first));
+  }
+#endif
+
+  phiInfoElementGetSources(Info).insert(PHISourceT(SourceReg, SourceMBB));
+}
+
+void PHILinearize::phiInfoElementRemoveSource(PHIInfoElementT *Info,
+                                              unsigned SourceReg,
+                                              MachineBasicBlock *SourceMBB) {
+  auto &Sources = phiInfoElementGetSources(Info);
+  SmallVector<PHISourceT, 4> EliminatedSources;
+  for (auto SI : Sources) {
+    if (SI.first == SourceReg &&
+        (SI.second == nullptr || SI.second == SourceMBB)) {
+      EliminatedSources.push_back(PHISourceT(SI.first, SI.second));
+    }
+  }
+
+  for (auto &Source : EliminatedSources) {
+    Sources.erase(Source);
+  }
+}
+
+PHILinearize::PHIInfoElementT *
+PHILinearize::findPHIInfoElement(unsigned DestReg) {
+  for (auto I : PHIInfo) {
+    if (phiInfoElementGetDest(I) == DestReg) {
+      return I;
+    }
+  }
+  return nullptr;
+}
+
+PHILinearize::PHIInfoElementT *
+PHILinearize::findPHIInfoElementFromSource(unsigned SourceReg,
+                                           MachineBasicBlock *SourceMBB) {
+  for (auto I : PHIInfo) {
+    for (auto SI : phiInfoElementGetSources(I)) {
+      if (SI.first == SourceReg &&
+          (SI.second == nullptr || SI.second == SourceMBB)) {
+        return I;
+      }
+    }
+  }
+  return nullptr;
+}
+
+bool PHILinearize::findSourcesFromMBB(MachineBasicBlock *SourceMBB,
+                                      SmallVector<unsigned, 4> &Sources) {
+  bool FoundSource = false;
+  for (auto I : PHIInfo) {
+    for (auto SI : phiInfoElementGetSources(I)) {
+      if (SI.second == SourceMBB) {
+        FoundSource = true;
+        Sources.push_back(SI.first);
+      }
+    }
+  }
+  return FoundSource;
+}
+
+void PHILinearize::addDest(unsigned DestReg, const DebugLoc &DL) {
+  assert(findPHIInfoElement(DestReg) == nullptr && "Dest already exists");
+  PHISourcesT EmptySet;
+  PHIInfoElementT *NewElement = new PHIInfoElementT();
+  NewElement->DestReg = DestReg;
+  NewElement->DL = DL;
+  NewElement->Sources = EmptySet;
+  PHIInfo.insert(NewElement);
+}
+
+void PHILinearize::replaceDef(unsigned OldDestReg, unsigned NewDestReg) {
+  phiInfoElementSetDef(findPHIInfoElement(OldDestReg), NewDestReg);
+}
+
+void PHILinearize::deleteDef(unsigned DestReg) {
+  PHIInfoElementT *InfoElement = findPHIInfoElement(DestReg);
+  PHIInfo.erase(InfoElement);
+  delete InfoElement;
+}
+
+void PHILinearize::addSource(unsigned DestReg, unsigned SourceReg,
+                             MachineBasicBlock *SourceMBB) {
+  phiInfoElementAddSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB);
+}
+
+void PHILinearize::removeSource(unsigned DestReg, unsigned SourceReg,
+                                MachineBasicBlock *SourceMBB) {
+  phiInfoElementRemoveSource(findPHIInfoElement(DestReg), SourceReg, SourceMBB);
+}
+
+bool PHILinearize::findDest(unsigned SourceReg, MachineBasicBlock *SourceMBB,
+                            unsigned &DestReg) {
+  PHIInfoElementT *InfoElement =
+      findPHIInfoElementFromSource(SourceReg, SourceMBB);
+  if (InfoElement != nullptr) {
+    DestReg = phiInfoElementGetDest(InfoElement);
+    return true;
+  }
+  return false;
+}
+
+bool PHILinearize::isSource(unsigned Reg, MachineBasicBlock *SourceMBB) {
+  unsigned DestReg;
+  return findDest(Reg, SourceMBB, DestReg);
+}
+
+unsigned PHILinearize::getNumSources(unsigned DestReg) {
+  return phiInfoElementGetSources(findPHIInfoElement(DestReg)).size();
+}
+
+void PHILinearize::dump(MachineRegisterInfo *MRI) {
+  const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo();
+  dbgs() << "=PHIInfo Start=\n";
+  for (auto PII : this->PHIInfo) {
+    PHIInfoElementT &Element = *PII;
+    dbgs() << "Dest: " << PrintReg(Element.DestReg, TRI)
+           << " Sources: {";
+    for (auto &SI : Element.Sources) {
+      dbgs() << PrintReg(SI.first,
TRI) << "(BB#" + << SI.second->getNumber() << "),"; + } + dbgs() << "}\n"; + } + dbgs() << "=PHIInfo End=\n"; +} + +void PHILinearize::clear() { PHIInfo = PHIInfoT(); } + +PHILinearize::dest_iterator PHILinearize::dests_begin() { + return PHILinearizeDestIterator(PHIInfo.begin()); +} + +PHILinearize::dest_iterator PHILinearize::dests_end() { + return PHILinearizeDestIterator(PHIInfo.end()); +} + +PHILinearize::source_iterator PHILinearize::sources_begin(unsigned Reg) { + auto InfoElement = findPHIInfoElement(Reg); + return phiInfoElementGetSources(InfoElement).begin(); +} +PHILinearize::source_iterator PHILinearize::sources_end(unsigned Reg) { + auto InfoElement = findPHIInfoElement(Reg); + return phiInfoElementGetSources(InfoElement).end(); +} + +class RegionMRT; +class MBBMRT; + +static unsigned getPHINumInputs(MachineInstr &PHI) { + assert(PHI.isPHI()); + return (PHI.getNumOperands() - 1) / 2; +} + +static MachineBasicBlock *getPHIPred(MachineInstr &PHI, unsigned Index) { + assert(PHI.isPHI()); + return PHI.getOperand(Index * 2 + 2).getMBB(); +} + +static void setPhiPred(MachineInstr &PHI, unsigned Index, + MachineBasicBlock *NewPred) { + PHI.getOperand(Index * 2 + 2).setMBB(NewPred); +} + +static unsigned getPHISourceReg(MachineInstr &PHI, unsigned Index) { + assert(PHI.isPHI()); + return PHI.getOperand(Index * 2 + 1).getReg(); +} + +static unsigned getPHIDestReg(MachineInstr &PHI) { + assert(PHI.isPHI()); + return PHI.getOperand(0).getReg(); +} + +class LinearizedRegion { +protected: + MachineBasicBlock *Entry; + // The exit block is part of the region, and is the last + // merge block before exiting the region. + MachineBasicBlock *Exit; + DenseSet LiveOuts; + SmallPtrSet MBBs; + bool HasLoop; + LinearizedRegion *Parent; + RegionMRT *RMRT; + + void storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, + MachineInstr *DefInstr, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + void storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg, + MachineInstr *DefInstr, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo); + + void storeMBBLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo, + RegionMRT *TopRegion); + + void storeLiveOuts(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + void storeLiveOuts(RegionMRT *Region, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo, + RegionMRT *TopRegion = nullptr); + +public: + void setRegionMRT(RegionMRT *Region) { RMRT = Region; } + + RegionMRT *getRegionMRT() { return RMRT; } + + void setParent(LinearizedRegion *P) { Parent = P; } + + LinearizedRegion *getParent() { return Parent; } + + void print(raw_ostream &OS, const TargetRegisterInfo *TRI = nullptr); + + void setBBSelectRegIn(unsigned Reg); + + unsigned getBBSelectRegIn(); + + void setBBSelectRegOut(unsigned Reg, bool IsLiveOut); + + unsigned getBBSelectRegOut(); + + void setHasLoop(bool Value); + + bool getHasLoop(); + + void addLiveOut(unsigned VReg); + + void removeLiveOut(unsigned Reg); + + void replaceLiveOut(unsigned OldReg, unsigned NewReg); + + void replaceRegister(unsigned Register, unsigned NewRegister, + MachineRegisterInfo *MRI, bool ReplaceInside, + bool ReplaceOutside, bool IncludeLoopPHIs); + + void replaceRegisterInsideRegion(unsigned Register, unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI); + + void 
replaceRegisterOutsideRegion(unsigned Register, unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI); + + DenseSet *getLiveOuts(); + + void setEntry(MachineBasicBlock *NewEntry); + + MachineBasicBlock *getEntry(); + + void setExit(MachineBasicBlock *NewExit); + + MachineBasicBlock *getExit(); + + void addMBB(MachineBasicBlock *MBB); + + void addMBBs(LinearizedRegion *InnerRegion); + + bool contains(MachineBasicBlock *MBB); + + bool isLiveOut(unsigned Reg); + + bool hasNoDef(unsigned Reg, MachineRegisterInfo *MRI); + + void removeFalseRegisterKills(MachineRegisterInfo *MRI); + + void initLiveOut(RegionMRT *Region, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + LinearizedRegion(MachineBasicBlock *MBB, const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, PHILinearize &PHIInfo); + + LinearizedRegion(); + + ~LinearizedRegion(); +}; + +class MRT { +protected: + RegionMRT *Parent; + unsigned BBSelectRegIn; + unsigned BBSelectRegOut; + +public: + unsigned getBBSelectRegIn() { return BBSelectRegIn; } + + unsigned getBBSelectRegOut() { return BBSelectRegOut; } + + void setBBSelectRegIn(unsigned Reg) { BBSelectRegIn = Reg; } + + void setBBSelectRegOut(unsigned Reg) { BBSelectRegOut = Reg; } + + virtual RegionMRT *getRegionMRT() { return nullptr; } + + virtual MBBMRT *getMBBMRT() { return nullptr; } + + bool isRegion() { return getRegionMRT() != nullptr; } + + bool isMBB() { return getMBBMRT() != nullptr; } + + bool isRoot() { return Parent == nullptr; } + + void setParent(RegionMRT *Region) { Parent = Region; } + + RegionMRT *getParent() { return Parent; } + + static MachineBasicBlock * + initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo, + DenseMap &RegionMap); + + static RegionMRT *buildMRT(MachineFunction &MF, + const MachineRegionInfo *RegionInfo, + const SIInstrInfo *TII, + MachineRegisterInfo *MRI); + + virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) = 0; + + void dumpDepth(int depth) { + for (int i = depth; i > 0; --i) { + dbgs() << " "; + } + } + + virtual ~MRT() {} +}; + +class MBBMRT : public MRT { + MachineBasicBlock *MBB; + +public: + virtual MBBMRT *getMBBMRT() { return this; } + + MachineBasicBlock *getMBB() { return MBB; } + + virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) { + dumpDepth(depth); + dbgs() << "MBB: " << getMBB()->getNumber(); + dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI); + dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), TRI) << "\n"; + } + + MBBMRT(MachineBasicBlock *BB) : MBB(BB) { + setParent(nullptr); + setBBSelectRegOut(0); + setBBSelectRegIn(0); + } +}; + +class RegionMRT : public MRT { +protected: + MachineRegion *Region; + LinearizedRegion *LRegion; + MachineBasicBlock *Succ; + + SetVector Children; + +public: + virtual RegionMRT *getRegionMRT() { return this; } + + void setLinearizedRegion(LinearizedRegion *LinearizeRegion) { + LRegion = LinearizeRegion; + } + + LinearizedRegion *getLinearizedRegion() { return LRegion; } + + MachineRegion *getMachineRegion() { return Region; } + + unsigned getInnerOutputRegister() { + return (*(Children.begin()))->getBBSelectRegOut(); + } + + void addChild(MRT *Tree) { Children.insert(Tree); } + + SetVector *getChildren() { return &Children; } + + virtual void dump(const TargetRegisterInfo *TRI, int depth = 0) { + dumpDepth(depth); + dbgs() << "Region: " << (void *)Region; + dbgs() << " In: " << PrintReg(getBBSelectRegIn(), TRI); + dbgs() << ", Out: " << PrintReg(getBBSelectRegOut(), 
TRI) << "\n"; + + dumpDepth(depth); + if (getSucc()) + dbgs() << "Succ: " << getSucc()->getNumber() << "\n"; + else + dbgs() << "Succ: none \n"; + for (auto MRTI : Children) { + MRTI->dump(TRI, depth + 1); + } + } + + MRT *getEntryTree() { return Children.back(); } + + MRT *getExitTree() { return Children.front(); } + + MachineBasicBlock *getEntry() { + MRT *Tree = Children.back(); + return (Tree->isRegion()) ? Tree->getRegionMRT()->getEntry() + : Tree->getMBBMRT()->getMBB(); + } + + MachineBasicBlock *getExit() { + MRT *Tree = Children.front(); + return (Tree->isRegion()) ? Tree->getRegionMRT()->getExit() + : Tree->getMBBMRT()->getMBB(); + } + + void setSucc(MachineBasicBlock *MBB) { Succ = MBB; } + + MachineBasicBlock *getSucc() { return Succ; } + + bool contains(MachineBasicBlock *MBB) { + for (auto CI : Children) { + if (CI->isMBB()) { + if (MBB == CI->getMBBMRT()->getMBB()) { + return true; + } + } else { + if (CI->getRegionMRT()->contains(MBB)) { + return true; + } else if (CI->getRegionMRT()->getLinearizedRegion() != nullptr && + CI->getRegionMRT()->getLinearizedRegion()->contains(MBB)) { + return true; + } + } + } + return false; + } + + void replaceLiveOutReg(unsigned Register, unsigned NewRegister) { + LinearizedRegion *LRegion = getLinearizedRegion(); + LRegion->replaceLiveOut(Register, NewRegister); + for (auto &CI : Children) { + if (CI->isRegion()) { + CI->getRegionMRT()->replaceLiveOutReg(Register, NewRegister); + } + } + } + + RegionMRT(MachineRegion *MachineRegion) + : Region(MachineRegion), LRegion(nullptr), Succ(nullptr) { + setParent(nullptr); + setBBSelectRegOut(0); + setBBSelectRegIn(0); + } + + virtual ~RegionMRT() { + if (LRegion) { + delete LRegion; + } + + for (auto CI : Children) { + delete &(*CI); + } + } +}; + +static unsigned createBBSelectReg(const SIInstrInfo *TII, + MachineRegisterInfo *MRI) { + return MRI->createVirtualRegister(TII->getPreferredSelectRegClass(32)); +} + +MachineBasicBlock * +MRT::initializeMRT(MachineFunction &MF, const MachineRegionInfo *RegionInfo, + DenseMap &RegionMap) { + for (auto &MFI : MF) { + MachineBasicBlock *ExitMBB = &MFI; + if (ExitMBB->succ_size() == 0) { + return ExitMBB; + } + } + llvm_unreachable("CFG has no exit block"); + return nullptr; +} + +RegionMRT *MRT::buildMRT(MachineFunction &MF, + const MachineRegionInfo *RegionInfo, + const SIInstrInfo *TII, MachineRegisterInfo *MRI) { + SmallPtrSet PlacedRegions; + DenseMap RegionMap; + MachineRegion *TopLevelRegion = RegionInfo->getTopLevelRegion(); + RegionMRT *Result = new RegionMRT(TopLevelRegion); + RegionMap[TopLevelRegion] = Result; + + // Insert the exit block first, we need it to be the merge node + // for the top level region. 
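+  // getEntryTree()/getExitTree() rely on this insertion order: the SetVector
+  // keeps children in the order they are added, so Children.front() becomes
+  // the exit subtree and Children.back() (the post-order-last entry block)
+  // becomes the entry subtree.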
+ MachineBasicBlock *Exit = initializeMRT(MF, RegionInfo, RegionMap); + + unsigned BBSelectRegIn = createBBSelectReg(TII, MRI); + MBBMRT *ExitMRT = new MBBMRT(Exit); + RegionMap[RegionInfo->getRegionFor(Exit)]->addChild(ExitMRT); + ExitMRT->setBBSelectRegIn(BBSelectRegIn); + + for (auto MBBI : post_order(&(MF.front()))) { + MachineBasicBlock *MBB = &(*MBBI); + + // Skip Exit since we already added it + if (MBB == Exit) { + continue; + } + + DEBUG(dbgs() << "Visiting BB#" << MBB->getNumber() << "\n"); + MBBMRT *NewMBB = new MBBMRT(MBB); + MachineRegion *Region = RegionInfo->getRegionFor(MBB); + + // Ensure we have the MRT region + if (RegionMap.count(Region) == 0) { + RegionMRT *NewMRTRegion = new RegionMRT(Region); + RegionMap[Region] = NewMRTRegion; + + // Ensure all parents are in the RegionMap + MachineRegion *Parent = Region->getParent(); + while (RegionMap.count(Parent) == 0) { + RegionMRT *NewMRTParent = new RegionMRT(Parent); + NewMRTParent->addChild(NewMRTRegion); + NewMRTRegion->setParent(NewMRTParent); + RegionMap[Parent] = NewMRTParent; + NewMRTRegion = NewMRTParent; + Parent = Parent->getParent(); + } + RegionMap[Parent]->addChild(NewMRTRegion); + NewMRTRegion->setParent(RegionMap[Parent]); + } + + // Add MBB to Region MRT + RegionMap[Region]->addChild(NewMBB); + NewMBB->setParent(RegionMap[Region]); + RegionMap[Region]->setSucc(Region->getExit()); + } + return Result; +} + +void LinearizedRegion::storeLiveOutReg(MachineBasicBlock *MBB, unsigned Reg, + MachineInstr *DefInstr, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + if (TRI->isVirtualRegister(Reg)) { + DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n"); + // If this is a source register to a PHI we are chaining, it + // must be live out. 
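+    // For example, with %dst = PHI(%a, BB#1, %b, BB#2) being linearized,
+    // both %a and %b are PHIInfo sources and have to survive until the
+    // chained PHIs are rebuilt, even without other uses outside their block.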
+    if (PHIInfo.isSource(Reg)) {
+      DEBUG(dbgs() << "Add LiveOut (PHI): " << PrintReg(Reg, TRI) << "\n");
+      addLiveOut(Reg);
+    } else {
+      // If this is live out of the MBB
+      for (auto &UI : MRI->use_operands(Reg)) {
+        if (UI.getParent()->getParent() != MBB) {
+          DEBUG(dbgs() << "Add LiveOut (MBB BB#" << MBB->getNumber()
+                       << "): " << PrintReg(Reg, TRI) << "\n");
+          addLiveOut(Reg);
+        } else {
+          // If the use is in the same MBB we have to make sure
+          // it is after the def, otherwise it is live out in a loop
+          MachineInstr *UseInstr = UI.getParent();
+          for (MachineBasicBlock::instr_iterator
+                   MII = UseInstr->getIterator(),
+                   MIE = UseInstr->getParent()->instr_end();
+               MII != MIE; ++MII) {
+            if ((&(*MII)) == DefInstr) {
+              DEBUG(dbgs() << "Add LiveOut (Loop): " << PrintReg(Reg, TRI)
+                           << "\n");
+              addLiveOut(Reg);
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+void LinearizedRegion::storeLiveOutRegRegion(RegionMRT *Region, unsigned Reg,
+                                             MachineInstr *DefInstr,
+                                             const MachineRegisterInfo *MRI,
+                                             const TargetRegisterInfo *TRI,
+                                             PHILinearize &PHIInfo) {
+  if (TRI->isVirtualRegister(Reg)) {
+    DEBUG(dbgs() << "Considering Register: " << PrintReg(Reg, TRI) << "\n");
+    for (auto &UI : MRI->use_operands(Reg)) {
+      if (!Region->contains(UI.getParent()->getParent())) {
+        DEBUG(dbgs() << "Add LiveOut (Region " << (void *)Region
+                     << "): " << PrintReg(Reg, TRI) << "\n");
+        addLiveOut(Reg);
+      }
+    }
+  }
+}
+
+void LinearizedRegion::storeLiveOuts(MachineBasicBlock *MBB,
+                                     const MachineRegisterInfo *MRI,
+                                     const TargetRegisterInfo *TRI,
+                                     PHILinearize &PHIInfo) {
+  DEBUG(dbgs() << "-Store Live Outs Begin (BB#" << MBB->getNumber() << ")-\n");
+  for (auto &II : *MBB) {
+    for (auto &RI : II.defs()) {
+      storeLiveOutReg(MBB, RI.getReg(), RI.getParent(), MRI, TRI, PHIInfo);
+    }
+    for (auto &IRI : II.implicit_operands()) {
+      if (IRI.isDef()) {
+        storeLiveOutReg(MBB, IRI.getReg(), IRI.getParent(), MRI, TRI, PHIInfo);
+      }
+    }
+  }
+
+  // If we have a successor with a PHI source coming from this MBB, we have
+  // to add the register as live out
+  for (MachineBasicBlock::succ_iterator SI = MBB->succ_begin(),
+                                        E = MBB->succ_end();
+       SI != E; ++SI) {
+    for (auto &II : *(*SI)) {
+      if (II.isPHI()) {
+        MachineInstr &PHI = II;
+        int numPreds = getPHINumInputs(PHI);
+        for (int i = 0; i < numPreds; ++i) {
+          if (getPHIPred(PHI, i) == MBB) {
+            unsigned PHIReg = getPHISourceReg(PHI, i);
+            DEBUG(dbgs() << "Add LiveOut (PhiSource BB#" << MBB->getNumber()
+                         << " -> BB#" << (*SI)->getNumber()
+                         << "): " << PrintReg(PHIReg, TRI) << "\n");
+            addLiveOut(PHIReg);
+          }
+        }
+      }
+    }
+  }
+
+  DEBUG(dbgs() << "-Store Live Outs End-\n");
+}
+
+void LinearizedRegion::storeMBBLiveOuts(MachineBasicBlock *MBB,
+                                        const MachineRegisterInfo *MRI,
+                                        const TargetRegisterInfo *TRI,
+                                        PHILinearize &PHIInfo,
+                                        RegionMRT *TopRegion) {
+  for (auto &II : *MBB) {
+    for (auto &RI : II.defs()) {
+      storeLiveOutRegRegion(TopRegion, RI.getReg(), RI.getParent(), MRI, TRI,
+                            PHIInfo);
+    }
+    for (auto &IRI : II.implicit_operands()) {
+      if (IRI.isDef()) {
+        storeLiveOutRegRegion(TopRegion, IRI.getReg(), IRI.getParent(), MRI,
+                              TRI, PHIInfo);
+      }
+    }
+  }
+}
+
+void LinearizedRegion::storeLiveOuts(RegionMRT *Region,
+                                     const MachineRegisterInfo *MRI,
+                                     const TargetRegisterInfo *TRI,
+                                     PHILinearize &PHIInfo,
+                                     RegionMRT *CurrentTopRegion) {
+  MachineBasicBlock *Exit = Region->getSucc();
+
+  RegionMRT *TopRegion =
+      CurrentTopRegion == nullptr ? Region : CurrentTopRegion;
+
+  // Check if exit is end of function, if so, no live outs.
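+  // (A region with a null successor ends at a function exit; nothing can be
+  // live past a return.)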
+  if (Exit == nullptr)
+    return;
+
+  auto Children = Region->getChildren();
+  for (auto CI : *Children) {
+    if (CI->isMBB()) {
+      auto MBB = CI->getMBBMRT()->getMBB();
+      storeMBBLiveOuts(MBB, MRI, TRI, PHIInfo, TopRegion);
+    } else {
+      LinearizedRegion *SubRegion = CI->getRegionMRT()->getLinearizedRegion();
+      // We should only store registers that are live out of the
+      // linearized region
+      for (auto MBBI : SubRegion->MBBs) {
+        storeMBBLiveOuts(MBBI, MRI, TRI, PHIInfo, TopRegion);
+      }
+    }
+  }
+
+  if (CurrentTopRegion == nullptr) {
+    auto Succ = Region->getSucc();
+    for (auto &II : *Succ) {
+      if (II.isPHI()) {
+        MachineInstr &PHI = II;
+        int numPreds = getPHINumInputs(PHI);
+        for (int i = 0; i < numPreds; ++i) {
+          if (Region->contains(getPHIPred(PHI, i))) {
+            unsigned PHIReg = getPHISourceReg(PHI, i);
+            DEBUG(dbgs() << "Add Region LiveOut (" << (void *)Region
+                         << "): " << PrintReg(PHIReg, TRI) << "\n");
+            addLiveOut(PHIReg);
+          }
+        }
+      }
+    }
+  }
+}
+
+void LinearizedRegion::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
+  OS << "Linearized Region {";
+  bool IsFirst = true;
+  for (const auto &MBB : MBBs) {
+    if (IsFirst) {
+      IsFirst = false;
+    } else {
+      OS << " ,";
+    }
+    OS << MBB->getNumber();
+  }
+  OS << "} (" << Entry->getNumber() << ", "
+     << (Exit == nullptr ? -1 : Exit->getNumber())
+     << "): In:" << PrintReg(getBBSelectRegIn(), TRI)
+     << " Out:" << PrintReg(getBBSelectRegOut(), TRI) << " {";
+  for (auto &LI : LiveOuts) {
+    OS << PrintReg(LI, TRI) << " ";
+  }
+  OS << "} \n";
+}
+
+unsigned LinearizedRegion::getBBSelectRegIn() {
+  return getRegionMRT()->getBBSelectRegIn();
+}
+
+unsigned LinearizedRegion::getBBSelectRegOut() {
+  return getRegionMRT()->getBBSelectRegOut();
+}
+
+void LinearizedRegion::setHasLoop(bool Value) { HasLoop = Value; }
+
+bool LinearizedRegion::getHasLoop() { return HasLoop; }
+
+void LinearizedRegion::addLiveOut(unsigned VReg) { LiveOuts.insert(VReg); }
+
+void LinearizedRegion::removeLiveOut(unsigned Reg) {
+  if (isLiveOut(Reg))
+    LiveOuts.erase(Reg);
+}
+
+void LinearizedRegion::replaceLiveOut(unsigned OldReg, unsigned NewReg) {
+  if (isLiveOut(OldReg)) {
+    removeLiveOut(OldReg);
+    addLiveOut(NewReg);
+  }
+}
+
+void LinearizedRegion::replaceRegister(unsigned Register, unsigned NewRegister,
+                                       MachineRegisterInfo *MRI,
+                                       bool ReplaceInside, bool ReplaceOutside,
+                                       bool IncludeLoopPHI) {
+  assert(Register != NewRegister && "Cannot replace a reg with itself");
+
+  DEBUG(dbgs() << "Preparing to replace register (region): "
+               << PrintReg(Register, MRI->getTargetRegisterInfo()) << " with "
+               << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) << "\n");
+
+  // If we are replacing outside, we also need to update the LiveOuts
+  if (ReplaceOutside &&
+      (isLiveOut(Register) || this->getParent()->isLiveOut(Register))) {
+    LinearizedRegion *Current = this;
+    while (Current != nullptr && Current->getEntry() != nullptr) {
+      DEBUG(dbgs() << "Region before register replace\n");
+      DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+      Current->replaceLiveOut(Register, NewRegister);
+      DEBUG(dbgs() << "Region after register replace\n");
+      DEBUG(Current->print(dbgs(), MRI->getTargetRegisterInfo()));
+      Current = Current->getParent();
+    }
+  }
+
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
+                                         E = MRI->reg_end();
+       I != E;) {
+    MachineOperand &O = *I;
+    ++I;
+
+    // We don't rewrite defs.
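+    // Only uses are rewritten; the old def keeps defining Register, and
+    // NewRegister is expected to get its def elsewhere (e.g. from a merge
+    // PHI). Note that the iterator was advanced above before any setReg()
+    // call, since rewriting an operand moves it to another use list and
+    // would otherwise invalidate the iteration.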
+ if (O.isDef()) + continue; + + bool IsInside = contains(O.getParent()->getParent()); + bool IsLoopPHI = IsInside && (O.getParent()->isPHI() && + O.getParent()->getParent() == getEntry()); + bool ShouldReplace = (IsInside && ReplaceInside) || + (!IsInside && ReplaceOutside) || + (IncludeLoopPHI && IsLoopPHI); + if (ShouldReplace) { + + if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) { + DEBUG(dbgs() << "Trying to substitute physical register: " + << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); + llvm_unreachable("Cannot substitute physical registers"); + } else { + DEBUG(dbgs() << "Replacing register (region): " + << PrintReg(Register, MRI->getTargetRegisterInfo()) + << " with " + << PrintReg(NewRegister, MRI->getTargetRegisterInfo()) + << "\n"); + O.setReg(NewRegister); + } + } + } +} + +void LinearizedRegion::replaceRegisterInsideRegion(unsigned Register, + unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI) { + replaceRegister(Register, NewRegister, MRI, true, false, IncludeLoopPHIs); +} + +void LinearizedRegion::replaceRegisterOutsideRegion(unsigned Register, + unsigned NewRegister, + bool IncludeLoopPHIs, + MachineRegisterInfo *MRI) { + replaceRegister(Register, NewRegister, MRI, false, true, IncludeLoopPHIs); +} + +DenseSet *LinearizedRegion::getLiveOuts() { return &LiveOuts; } + +void LinearizedRegion::setEntry(MachineBasicBlock *NewEntry) { + Entry = NewEntry; +} + +MachineBasicBlock *LinearizedRegion::getEntry() { return Entry; } + +void LinearizedRegion::setExit(MachineBasicBlock *NewExit) { Exit = NewExit; } + +MachineBasicBlock *LinearizedRegion::getExit() { return Exit; } + +void LinearizedRegion::addMBB(MachineBasicBlock *MBB) { MBBs.insert(MBB); } + +void LinearizedRegion::addMBBs(LinearizedRegion *InnerRegion) { + for (const auto &MBB : InnerRegion->MBBs) { + addMBB(MBB); + } +} + +bool LinearizedRegion::contains(MachineBasicBlock *MBB) { + return MBBs.count(MBB) == 1; +} + +bool LinearizedRegion::isLiveOut(unsigned Reg) { + return LiveOuts.count(Reg) == 1; +} + +bool LinearizedRegion::hasNoDef(unsigned Reg, MachineRegisterInfo *MRI) { + return MRI->def_begin(Reg) == MRI->def_end(); +} + +// After the code has been structurized, what was flagged as kills +// before are no longer register kills. 
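+// For example, a use that killed %a in its block may, after linearization,
+// be followed by another dynamic execution of a block that still reads %a,
+// so kill flags on uses outside the defining block are cleared conservatively.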
+void LinearizedRegion::removeFalseRegisterKills(MachineRegisterInfo *MRI) { + const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + for (auto MBBI : MBBs) { + MachineBasicBlock *MBB = MBBI; + for (auto &II : *MBB) { + for (auto &RI : II.uses()) { + if (RI.isReg()) { + unsigned Reg = RI.getReg(); + if (TRI->isVirtualRegister(Reg)) { + if (hasNoDef(Reg, MRI)) + continue; + if (!MRI->hasOneDef(Reg)) { + DEBUG(this->getEntry()->getParent()->dump()); + DEBUG(dbgs() << PrintReg(Reg, TRI) << "\n"); + } + + if (MRI->def_begin(Reg) == MRI->def_end()) { + DEBUG(dbgs() << "Register " + << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has NO defs\n"); + } else if (!MRI->hasOneDef(Reg)) { + DEBUG(dbgs() << "Register " + << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has multiple defs\n"); + } + + assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); + MachineOperand *Def = &(*(MRI->def_begin(Reg))); + MachineOperand *UseOperand = &(RI); + bool UseIsOutsideDefMBB = Def->getParent()->getParent() != MBB; + if (UseIsOutsideDefMBB && UseOperand->isKill()) { + DEBUG(dbgs() << "Removing kill flag on register: " + << PrintReg(Reg, TRI) << "\n"); + UseOperand->setIsKill(false); + } + } + } + } + } + } +} + +void LinearizedRegion::initLiveOut(RegionMRT *Region, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + storeLiveOuts(Region, MRI, TRI, PHIInfo); +} + +LinearizedRegion::LinearizedRegion(MachineBasicBlock *MBB, + const MachineRegisterInfo *MRI, + const TargetRegisterInfo *TRI, + PHILinearize &PHIInfo) { + setEntry(MBB); + setExit(MBB); + storeLiveOuts(MBB, MRI, TRI, PHIInfo); + MBBs.insert(MBB); + Parent = nullptr; +} + +LinearizedRegion::LinearizedRegion() { + setEntry(nullptr); + setExit(nullptr); + Parent = nullptr; +} + +LinearizedRegion::~LinearizedRegion() {} + +class AMDGPUMachineCFGStructurizer : public MachineFunctionPass { +private: + const MachineRegionInfo *Regions; + const SIInstrInfo *TII; + const TargetRegisterInfo *TRI; + MachineRegisterInfo *MRI; + unsigned BBSelectRegister; + PHILinearize PHIInfo; + DenseMap FallthroughMap; + + void getPHIRegionIndices(RegionMRT *Region, MachineInstr &PHI, + SmallVector &RegionIndices); + void getPHIRegionIndices(LinearizedRegion *Region, MachineInstr &PHI, + SmallVector &RegionIndices); + void getPHINonRegionIndices(LinearizedRegion *Region, MachineInstr &PHI, + SmallVector &PHINonRegionIndices); + + void storePHILinearizationInfoDest( + unsigned LDestReg, MachineInstr &PHI, + SmallVector *RegionIndices = nullptr); + + unsigned storePHILinearizationInfo(MachineInstr &PHI, + SmallVector *RegionIndices); + + void extractKilledPHIs(MachineBasicBlock *MBB); + + bool shrinkPHI(MachineInstr &PHI, SmallVector &PHIIndices, + unsigned *ReplaceReg); + + bool shrinkPHI(MachineInstr &PHI, unsigned CombinedSourceReg, + MachineBasicBlock *SourceMBB, + SmallVector &PHIIndices, unsigned *ReplaceReg); + + void replacePHI(MachineInstr &PHI, unsigned CombinedSourceReg, + MachineBasicBlock *LastMerge, + SmallVector &PHIRegionIndices); + void replaceEntryPHI(MachineInstr &PHI, unsigned CombinedSourceReg, + MachineBasicBlock *IfMBB, + SmallVector &PHIRegionIndices); + void replaceLiveOutRegs(MachineInstr &PHI, + SmallVector &PHIRegionIndices, + unsigned CombinedSourceReg, + LinearizedRegion *LRegion); + void rewriteRegionExitPHI(RegionMRT *Region, MachineBasicBlock *LastMerge, + MachineInstr &PHI, LinearizedRegion *LRegion); + + void rewriteRegionExitPHIs(RegionMRT *Region, MachineBasicBlock 
*LastMerge, + LinearizedRegion *LRegion); + void rewriteRegionEntryPHI(LinearizedRegion *Region, MachineBasicBlock *IfMBB, + MachineInstr &PHI); + void rewriteRegionEntryPHIs(LinearizedRegion *Region, + MachineBasicBlock *IfMBB); + + bool regionIsSimpleIf(RegionMRT *Region); + + void transformSimpleIfRegion(RegionMRT *Region); + + void eliminateDeadBranchOperands(MachineBasicBlock::instr_iterator &II); + + void insertUnconditionalBranch(MachineBasicBlock *MBB, + MachineBasicBlock *Dest, + const DebugLoc &DL = DebugLoc()); + + MachineBasicBlock *createLinearizedExitBlock(RegionMRT *Region); + + void insertMergePHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, unsigned DestRegister, + unsigned IfSourceRegister, unsigned CodeSourceRegister, + bool IsUndefIfSource = false); + + MachineBasicBlock *createIfBlock(MachineBasicBlock *MergeBB, + MachineBasicBlock *CodeBBStart, + MachineBasicBlock *CodeBBEnd, + MachineBasicBlock *SelectBB, unsigned IfReg, + bool InheritPreds); + + void prunePHIInfo(MachineBasicBlock *MBB); + void createEntryPHI(LinearizedRegion *CurrentRegion, unsigned DestReg); + + void createEntryPHIs(LinearizedRegion *CurrentRegion); + void resolvePHIInfos(MachineBasicBlock *FunctionEntry); + + void replaceRegisterWith(unsigned Register, unsigned NewRegister); + + MachineBasicBlock *createIfRegion(MachineBasicBlock *MergeBB, + MachineBasicBlock *CodeBB, + LinearizedRegion *LRegion, + unsigned BBSelectRegIn, + unsigned BBSelectRegOut); + + MachineBasicBlock * + createIfRegion(MachineBasicBlock *MergeMBB, LinearizedRegion *InnerRegion, + LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB, + unsigned BBSelectRegIn, unsigned BBSelectRegOut); + void ensureCondIsNotKilled(SmallVector Cond); + + void rewriteCodeBBTerminator(MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + unsigned BBSelectReg); + + MachineInstr *getDefInstr(unsigned Reg); + void insertChainedPHI(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + LinearizedRegion *InnerRegion, unsigned DestReg, + unsigned SourceReg); + bool containsDef(MachineBasicBlock *MBB, LinearizedRegion *InnerRegion, + unsigned Register); + void rewriteLiveOutRegs(MachineBasicBlock *IfBB, MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + LinearizedRegion *InnerRegion, + LinearizedRegion *LRegion); + + void splitLoopPHI(MachineInstr &PHI, MachineBasicBlock *Entry, + MachineBasicBlock *EntrySucc, LinearizedRegion *LRegion); + void splitLoopPHIs(MachineBasicBlock *Entry, MachineBasicBlock *EntrySucc, + LinearizedRegion *LRegion); + + MachineBasicBlock *splitExit(LinearizedRegion *LRegion); + + MachineBasicBlock *splitEntry(LinearizedRegion *LRegion); + + LinearizedRegion *initLinearizedRegion(RegionMRT *Region); + + bool structurizeComplexRegion(RegionMRT *Region); + + bool structurizeRegion(RegionMRT *Region); + + bool structurizeRegions(RegionMRT *Region, bool isTopRegion); + +public: + static char ID; + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + AMDGPUMachineCFGStructurizer() : MachineFunctionPass(ID) { + initializeAMDGPUMachineCFGStructurizerPass(*PassRegistry::getPassRegistry()); + } + + void initFallthroughMap(MachineFunction &MF); + + void createLinearizedRegion(RegionMRT *Region, unsigned SelectOut); + + unsigned initializeSelectRegisters(MRT *MRT, unsigned ExistingExitReg, + MachineRegisterInfo *MRI, + const SIInstrInfo *TII); + + RegionMRT *RMRT; + void 
setRegionMRT(RegionMRT *RegionTree) { RMRT = RegionTree; }
+
+  RegionMRT *getRegionMRT() { return RMRT; }
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+};
+}
+
+char AMDGPUMachineCFGStructurizer::ID = 0;
+
+bool AMDGPUMachineCFGStructurizer::regionIsSimpleIf(RegionMRT *Region) {
+  MachineBasicBlock *Entry = Region->getEntry();
+  MachineBasicBlock *Succ = Region->getSucc();
+  bool FoundBypass = false;
+  bool FoundIf = false;
+
+  if (Entry->succ_size() != 2) {
+    return false;
+  }
+
+  for (MachineBasicBlock::const_succ_iterator SI = Entry->succ_begin(),
+                                              E = Entry->succ_end();
+       SI != E; ++SI) {
+    MachineBasicBlock *Current = *SI;
+
+    if (Current == Succ) {
+      FoundBypass = true;
+    } else if ((Current->succ_size() == 1) &&
+               *(Current->succ_begin()) == Succ) {
+      FoundIf = true;
+    }
+  }
+
+  return FoundIf && FoundBypass;
+}
+
+void AMDGPUMachineCFGStructurizer::transformSimpleIfRegion(RegionMRT *Region) {
+  MachineBasicBlock *Entry = Region->getEntry();
+  MachineBasicBlock *Exit = Region->getExit();
+  TII->convertNonUniformIfRegion(Entry, Exit);
+}
+
+static void fixMBBTerminator(MachineBasicBlock *MBB) {
+
+  if (MBB->succ_size() == 1) {
+    auto *Succ = *(MBB->succ_begin());
+    for (auto &TI : MBB->terminators()) {
+      for (auto &UI : TI.uses()) {
+        if (UI.isMBB() && UI.getMBB() != Succ) {
+          UI.setMBB(Succ);
+        }
+      }
+    }
+  }
+}
+
+static void fixRegionTerminator(RegionMRT *Region) {
+  MachineBasicBlock *InternalSucc = nullptr;
+  MachineBasicBlock *ExternalSucc = nullptr;
+  LinearizedRegion *LRegion = Region->getLinearizedRegion();
+  auto Exit = LRegion->getExit();
+
+  SmallPtrSet<MachineBasicBlock *, 2> Successors;
+  for (MachineBasicBlock::const_succ_iterator SI = Exit->succ_begin(),
+                                              SE = Exit->succ_end();
+       SI != SE; ++SI) {
+    MachineBasicBlock *Succ = *SI;
+    if (LRegion->contains(Succ)) {
+      // Do not allow re-assign
+      assert(InternalSucc == nullptr);
+      InternalSucc = Succ;
+    } else {
+      // Do not allow re-assign
+      assert(ExternalSucc == nullptr);
+      ExternalSucc = Succ;
+    }
+  }
+
+  for (auto &TI : Exit->terminators()) {
+    for (auto &UI : TI.uses()) {
+      if (UI.isMBB()) {
+        auto Target = UI.getMBB();
+        if (Target != InternalSucc && Target != ExternalSucc) {
+          UI.setMBB(ExternalSucc);
+        }
+      }
+    }
+  }
+}
+
+// If a region is just a sequence of regions (and the exit
+// block in the case of the top level region), we can simply skip
+// linearizing it, because it is already linear
+bool regionIsSequence(RegionMRT *Region) {
+  auto Children = Region->getChildren();
+  for (auto CI : *Children) {
+    if (!CI->isRegion()) {
+      if (CI->getMBBMRT()->getMBB()->succ_size() > 1) {
+        return false;
+      }
+    }
+  }
+  return true;
+}
+
+void fixupRegionExits(RegionMRT *Region) {
+  auto Children = Region->getChildren();
+  for (auto CI : *Children) {
+    if (!CI->isRegion()) {
+      fixMBBTerminator(CI->getMBBMRT()->getMBB());
+    } else {
+      fixRegionTerminator(CI->getRegionMRT());
+    }
+  }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHIRegionIndices(
+    RegionMRT *Region, MachineInstr &PHI,
+    SmallVector<unsigned, 2> &PHIRegionIndices) {
+  unsigned NumInputs = getPHINumInputs(PHI);
+  for (unsigned i = 0; i < NumInputs; ++i) {
+    MachineBasicBlock *Pred = getPHIPred(PHI, i);
+    if (Region->contains(Pred)) {
+      PHIRegionIndices.push_back(i);
+    }
+  }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHIRegionIndices(
+    LinearizedRegion *Region, MachineInstr &PHI,
+    SmallVector<unsigned, 2> &PHIRegionIndices) {
+  unsigned NumInputs = getPHINumInputs(PHI);
+  for (unsigned i = 0; i < NumInputs; ++i) {
+    MachineBasicBlock *Pred = getPHIPred(PHI, i);
+    if
(Region->contains(Pred)) {
+      PHIRegionIndices.push_back(i);
+    }
+  }
+}
+
+void AMDGPUMachineCFGStructurizer::getPHINonRegionIndices(
+    LinearizedRegion *Region, MachineInstr &PHI,
+    SmallVector<unsigned, 2> &PHINonRegionIndices) {
+  unsigned NumInputs = getPHINumInputs(PHI);
+  for (unsigned i = 0; i < NumInputs; ++i) {
+    MachineBasicBlock *Pred = getPHIPred(PHI, i);
+    if (!Region->contains(Pred)) {
+      PHINonRegionIndices.push_back(i);
+    }
+  }
+}
+
+void AMDGPUMachineCFGStructurizer::storePHILinearizationInfoDest(
+    unsigned LDestReg, MachineInstr &PHI,
+    SmallVector<unsigned, 2> *RegionIndices) {
+  if (RegionIndices) {
+    for (auto i : *RegionIndices) {
+      PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i));
+    }
+  } else {
+    unsigned NumInputs = getPHINumInputs(PHI);
+    for (unsigned i = 0; i < NumInputs; ++i) {
+      PHIInfo.addSource(LDestReg, getPHISourceReg(PHI, i), getPHIPred(PHI, i));
+    }
+  }
+}
+
+unsigned AMDGPUMachineCFGStructurizer::storePHILinearizationInfo(
+    MachineInstr &PHI, SmallVector<unsigned, 2> *RegionIndices) {
+  unsigned DestReg = getPHIDestReg(PHI);
+  unsigned LinearizeDestReg =
+      MRI->createVirtualRegister(MRI->getRegClass(DestReg));
+  PHIInfo.addDest(LinearizeDestReg, PHI.getDebugLoc());
+  storePHILinearizationInfoDest(LinearizeDestReg, PHI, RegionIndices);
+  return LinearizeDestReg;
+}
+
+void AMDGPUMachineCFGStructurizer::extractKilledPHIs(MachineBasicBlock *MBB) {
+  // We need to create a new chain for the killed phi, but there is no
+  // need to do the renaming outside or inside the block.
+  SmallPtrSet<MachineInstr *, 2> PHIs;
+  for (MachineBasicBlock::instr_iterator I = MBB->instr_begin(),
+                                         E = MBB->instr_end();
+       I != E; ++I) {
+    MachineInstr &Instr = *I;
+    if (Instr.isPHI()) {
+      unsigned PHIDestReg = getPHIDestReg(Instr);
+      DEBUG(dbgs() << "Extracting killed phi:\n");
+      DEBUG(Instr.dump());
+      PHIs.insert(&Instr);
+      PHIInfo.addDest(PHIDestReg, Instr.getDebugLoc());
+      storePHILinearizationInfoDest(PHIDestReg, Instr);
+    }
+  }
+
+  for (auto PI : PHIs) {
+    PI->eraseFromParent();
+  }
+}
+
+static bool isPHIRegionIndex(SmallVector<unsigned, 2> PHIRegionIndices,
+                             unsigned Index) {
+  for (auto i : PHIRegionIndices) {
+    if (i == Index)
+      return true;
+  }
+  return false;
+}
+
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+                                       SmallVector<unsigned, 2> &PHIIndices,
+                                       unsigned *ReplaceReg) {
+  return shrinkPHI(PHI, 0, nullptr, PHIIndices, ReplaceReg);
+}
+
+bool AMDGPUMachineCFGStructurizer::shrinkPHI(MachineInstr &PHI,
+                                       unsigned CombinedSourceReg,
+                                       MachineBasicBlock *SourceMBB,
+                                       SmallVector<unsigned, 2> &PHIIndices,
+                                       unsigned *ReplaceReg) {
+  DEBUG(dbgs() << "Shrink PHI: ");
+  DEBUG(PHI.dump());
+  DEBUG(dbgs() << " to " << PrintReg(getPHIDestReg(PHI), TRI)
+               << " = PHI(");
+
+  bool Replaced = false;
+  unsigned NumInputs = getPHINumInputs(PHI);
+  int SingleExternalEntryIndex = -1;
+  for (unsigned i = 0; i < NumInputs; ++i) {
+    if (!isPHIRegionIndex(PHIIndices, i)) {
+      if (SingleExternalEntryIndex == -1) {
+        // Single entry
+        SingleExternalEntryIndex = i;
+      } else {
+        // Multiple entries
+        SingleExternalEntryIndex = -2;
+      }
+    }
+  }
+
+  if (SingleExternalEntryIndex > -1) {
+    *ReplaceReg = getPHISourceReg(PHI, SingleExternalEntryIndex);
+    // We should not rewrite the code, we should only pick up the single value
+    // that represents the shrunk PHI.
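+    // For example, shrinking %dst = PHI(%x, BB#0, %a, BB#2, %b, BB#3) with
+    // region indices {1, 2} leaves only %x, so the caller can substitute %x
+    // for %dst instead of keeping a one-input PHI.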
+ Replaced = true; + } else { + MachineBasicBlock *MBB = PHI.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), + getPHIDestReg(PHI)); + if (SourceMBB) { + MIB.addReg(CombinedSourceReg); + MIB.addMBB(SourceMBB); + DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" + << SourceMBB->getNumber()); + } + + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIIndices, i)) { + continue; + } + unsigned SourceReg = getPHISourceReg(PHI, i); + MachineBasicBlock *SourcePred = getPHIPred(PHI, i); + MIB.addReg(SourceReg); + MIB.addMBB(SourcePred); + DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" + << SourcePred->getNumber()); + } + DEBUG(dbgs() << ")\n"); + } + PHI.eraseFromParent(); + return Replaced; +} + +void AMDGPUMachineCFGStructurizer::replacePHI( + MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *LastMerge, + SmallVector &PHIRegionIndices) { + DEBUG(dbgs() << "Replace PHI: "); + DEBUG(PHI.dump()); + DEBUG(dbgs() << " with " << PrintReg(getPHIDestReg(PHI), TRI) + << " = PHI("); + + bool HasExternalEdge = false; + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + if (!isPHIRegionIndex(PHIRegionIndices, i)) { + HasExternalEdge = true; + } + } + + if (HasExternalEdge) { + MachineBasicBlock *MBB = PHI.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), + getPHIDestReg(PHI)); + MIB.addReg(CombinedSourceReg); + MIB.addMBB(LastMerge); + DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" + << LastMerge->getNumber()); + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIRegionIndices, i)) { + continue; + } + unsigned SourceReg = getPHISourceReg(PHI, i); + MachineBasicBlock *SourcePred = getPHIPred(PHI, i); + MIB.addReg(SourceReg); + MIB.addMBB(SourcePred); + DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#" + << SourcePred->getNumber()); + } + DEBUG(dbgs() << ")\n"); + } else { + replaceRegisterWith(getPHIDestReg(PHI), CombinedSourceReg); + } + PHI.eraseFromParent(); +} + +void AMDGPUMachineCFGStructurizer::replaceEntryPHI( + MachineInstr &PHI, unsigned CombinedSourceReg, MachineBasicBlock *IfMBB, + SmallVector &PHIRegionIndices) { + + DEBUG(dbgs() << "Replace entry PHI: "); + DEBUG(PHI.dump()); + DEBUG(dbgs() << " with "); + + unsigned NumInputs = getPHINumInputs(PHI); + unsigned NumNonRegionInputs = NumInputs; + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIRegionIndices, i)) { + NumNonRegionInputs--; + } + } + + if (NumNonRegionInputs == 0) { + auto DestReg = getPHIDestReg(PHI); + replaceRegisterWith(DestReg, CombinedSourceReg); + DEBUG(dbgs() << " register " << PrintReg(CombinedSourceReg, TRI) << "\n"); + PHI.eraseFromParent(); + } else { + DEBUG(dbgs() << PrintReg(getPHIDestReg(PHI), TRI) << " = PHI("); + MachineBasicBlock *MBB = PHI.getParent(); + MachineInstrBuilder MIB = + BuildMI(*MBB, PHI, PHI.getDebugLoc(), TII->get(TargetOpcode::PHI), + getPHIDestReg(PHI)); + MIB.addReg(CombinedSourceReg); + MIB.addMBB(IfMBB); + DEBUG(dbgs() << PrintReg(CombinedSourceReg, TRI) << ", BB#" + << IfMBB->getNumber()); + unsigned NumInputs = getPHINumInputs(PHI); + for (unsigned i = 0; i < NumInputs; ++i) { + if (isPHIRegionIndex(PHIRegionIndices, i)) { + continue; + } + unsigned SourceReg = getPHISourceReg(PHI, i); + MachineBasicBlock *SourcePred = getPHIPred(PHI, i); + MIB.addReg(SourceReg); + MIB.addMBB(SourcePred); + DEBUG(dbgs() << PrintReg(SourceReg, TRI) << 
", BB#" + << SourcePred->getNumber()); + } + DEBUG(dbgs() << ")\n"); + PHI.eraseFromParent(); + } +} + +void AMDGPUMachineCFGStructurizer::replaceLiveOutRegs( + MachineInstr &PHI, SmallVector &PHIRegionIndices, + unsigned CombinedSourceReg, LinearizedRegion *LRegion) { + bool WasLiveOut = false; + for (auto PII : PHIRegionIndices) { + unsigned Reg = getPHISourceReg(PHI, PII); + if (LRegion->isLiveOut(Reg)) { + bool IsDead = true; + + // Check if register is live out of the basic block + MachineBasicBlock *DefMBB = getDefInstr(Reg)->getParent(); + for (auto UI = MRI->use_begin(Reg), E = MRI->use_end(); UI != E; ++UI) { + if ((*UI).getParent()->getParent() != DefMBB) { + IsDead = false; + } + } + + DEBUG(dbgs() << "Register " << PrintReg(Reg, TRI) << " is " + << (IsDead ? "dead" : "alive") << " after PHI replace\n"); + if (IsDead) { + LRegion->removeLiveOut(Reg); + } + WasLiveOut = true; + } + } + + if (WasLiveOut) + LRegion->addLiveOut(CombinedSourceReg); +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHI(RegionMRT *Region, + MachineBasicBlock *LastMerge, + MachineInstr &PHI, + LinearizedRegion *LRegion) { + SmallVector PHIRegionIndices; + getPHIRegionIndices(Region, PHI, PHIRegionIndices); + unsigned LinearizedSourceReg = + storePHILinearizationInfo(PHI, &PHIRegionIndices); + + replacePHI(PHI, LinearizedSourceReg, LastMerge, PHIRegionIndices); + replaceLiveOutRegs(PHI, PHIRegionIndices, LinearizedSourceReg, LRegion); +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHI(LinearizedRegion *Region, + MachineBasicBlock *IfMBB, + MachineInstr &PHI) { + SmallVector PHINonRegionIndices; + getPHINonRegionIndices(Region, PHI, PHINonRegionIndices); + unsigned LinearizedSourceReg = + storePHILinearizationInfo(PHI, &PHINonRegionIndices); + replaceEntryPHI(PHI, LinearizedSourceReg, IfMBB, PHINonRegionIndices); +} + +static void collectPHIs(MachineBasicBlock *MBB, + SmallVector &PHIs) { + for (auto &BBI : *MBB) { + if (BBI.isPHI()) { + PHIs.push_back(&BBI); + } + } +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionExitPHIs(RegionMRT *Region, + MachineBasicBlock *LastMerge, + LinearizedRegion *LRegion) { + SmallVector PHIs; + auto Exit = Region->getSucc(); + if (Exit == nullptr) + return; + + collectPHIs(Exit, PHIs); + + for (auto PHII : PHIs) { + rewriteRegionExitPHI(Region, LastMerge, *PHII, LRegion); + } +} + +void AMDGPUMachineCFGStructurizer::rewriteRegionEntryPHIs(LinearizedRegion *Region, + MachineBasicBlock *IfMBB) { + SmallVector PHIs; + auto Entry = Region->getEntry(); + + collectPHIs(Entry, PHIs); + + for (auto PHII : PHIs) { + rewriteRegionEntryPHI(Region, IfMBB, *PHII); + } +} + +void AMDGPUMachineCFGStructurizer::insertUnconditionalBranch(MachineBasicBlock *MBB, + MachineBasicBlock *Dest, + const DebugLoc &DL) { + DEBUG(dbgs() << "Inserting unconditional branch: " << MBB->getNumber() + << " -> " << Dest->getNumber() << "\n"); + MachineBasicBlock::instr_iterator Terminator = MBB->getFirstInstrTerminator(); + bool HasTerminator = Terminator != MBB->instr_end(); + if (HasTerminator) { + TII->ReplaceTailWithBranchTo(Terminator, Dest); + } + if (++MachineFunction::iterator(MBB) != MachineFunction::iterator(Dest)) { + TII->insertUnconditionalBranch(*MBB, Dest, DL); + } +} + +static MachineBasicBlock *getSingleExitNode(MachineFunction &MF) { + MachineBasicBlock *result = nullptr; + for (auto &MFI : MF) { + if (MFI.succ_size() == 0) { + if (result == nullptr) { + result = &MFI; + } else { + return nullptr; + } + } + } + + return result; +} + +static bool 
hasOneExitNode(MachineFunction &MF) {
+  return getSingleExitNode(MF) != nullptr;
+}
+
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::createLinearizedExitBlock(RegionMRT *Region) {
+  auto Exit = Region->getSucc();
+
+  // If the exit is the end of the function, we just use the existing
+  // exit block.
+  MachineFunction *MF = Region->getEntry()->getParent();
+  if (Exit == nullptr && hasOneExitNode(*MF)) {
+    return &(*(--(Region->getEntry()->getParent()->end())));
+  }
+
+  MachineBasicBlock *LastMerge = MF->CreateMachineBasicBlock();
+  if (Exit == nullptr) {
+    MachineFunction::iterator ExitIter = MF->end();
+    MF->insert(ExitIter, LastMerge);
+  } else {
+    MachineFunction::iterator ExitIter = Exit->getIterator();
+    MF->insert(ExitIter, LastMerge);
+    LastMerge->addSuccessor(Exit);
+    insertUnconditionalBranch(LastMerge, Exit);
+    DEBUG(dbgs() << "Created exit block: " << LastMerge->getNumber() << "\n");
+  }
+  return LastMerge;
+}
+
+void AMDGPUMachineCFGStructurizer::insertMergePHI(MachineBasicBlock *IfBB,
+                                            MachineBasicBlock *CodeBB,
+                                            MachineBasicBlock *MergeBB,
+                                            unsigned DestRegister,
+                                            unsigned IfSourceRegister,
+                                            unsigned CodeSourceRegister,
+                                            bool IsUndefIfSource) {
+  // If this is the function exit block, we don't need a phi.
+  if (MergeBB->succ_begin() == MergeBB->succ_end()) {
+    return;
+  }
+  DEBUG(dbgs() << "Merge PHI (BB#" << MergeBB->getNumber()
+               << "): " << PrintReg(DestRegister, TRI) << " = PHI("
+               << PrintReg(IfSourceRegister, TRI) << ", BB#"
+               << IfBB->getNumber() << ", "
+               << PrintReg(CodeSourceRegister, TRI)
+               << ", BB#" << CodeBB->getNumber() << ")\n");
+  const DebugLoc &DL = MergeBB->findDebugLoc(MergeBB->begin());
+  MachineInstrBuilder MIB = BuildMI(*MergeBB, MergeBB->instr_begin(), DL,
+                                    TII->get(TargetOpcode::PHI), DestRegister);
+  if (IsUndefIfSource && false) {
+    MIB.addReg(IfSourceRegister, RegState::Undef);
+  } else {
+    MIB.addReg(IfSourceRegister);
+  }
+  MIB.addMBB(IfBB);
+  MIB.addReg(CodeSourceRegister);
+  MIB.addMBB(CodeBB);
+}
+
+static void removeExternalCFGSuccessors(MachineBasicBlock *MBB) {
+  for (MachineBasicBlock::succ_iterator PI = MBB->succ_begin(),
+                                        E = MBB->succ_end();
+       PI != E; ++PI) {
+    if ((*PI) != MBB) {
+      (MBB)->removeSuccessor(*PI);
+    }
+  }
+}
+
+static void removeExternalCFGEdges(MachineBasicBlock *StartMBB,
+                                   MachineBasicBlock *EndMBB) {
+
+  // We have to check against the StartMBB successor because a
+  // structurized region with a loop will have the entry block split,
+  // and the backedge will go to the entry successor.
+  DenseSet<std::pair<MachineBasicBlock *, MachineBasicBlock *>> Succs;
+  unsigned SuccSize = StartMBB->succ_size();
+  if (SuccSize > 0) {
+    MachineBasicBlock *StartMBBSucc = *(StartMBB->succ_begin());
+    for (MachineBasicBlock::succ_iterator PI = EndMBB->succ_begin(),
+                                          E = EndMBB->succ_end();
+         PI != E; ++PI) {
+      // Either we have a back-edge to the entry block, or a back-edge to the
+      // successor of the entry block since the block may be split.
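+      // That is, when the entry has been split, the backedge targets the
+      // entry's single successor instead of the entry itself; both edge
+      // shapes are internal to the region and are kept (skipped below).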
+      if ((*PI) != StartMBB &&
+          !((*PI) == StartMBBSucc && StartMBB != EndMBB && SuccSize == 1)) {
+        Succs.insert(
+            std::pair<MachineBasicBlock *, MachineBasicBlock *>(EndMBB, *PI));
+      }
+    }
+  }
+
+  for (MachineBasicBlock::pred_iterator PI = StartMBB->pred_begin(),
+                                        E = StartMBB->pred_end();
+       PI != E; ++PI) {
+    if ((*PI) != EndMBB) {
+      Succs.insert(
+          std::pair<MachineBasicBlock *, MachineBasicBlock *>(*PI, StartMBB));
+    }
+  }
+
+  for (auto SI : Succs) {
+    std::pair<MachineBasicBlock *, MachineBasicBlock *> Edge = SI;
+    DEBUG(dbgs() << "Removing edge: BB#" << Edge.first->getNumber() << " -> BB#"
+                 << Edge.second->getNumber() << "\n");
+    Edge.first->removeSuccessor(Edge.second);
+  }
+}
+
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfBlock(
+    MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBBStart,
+    MachineBasicBlock *CodeBBEnd, MachineBasicBlock *SelectBB, unsigned IfReg,
+    bool InheritPreds) {
+  MachineFunction *MF = MergeBB->getParent();
+  MachineBasicBlock *IfBB = MF->CreateMachineBasicBlock();
+
+  if (InheritPreds) {
+    for (MachineBasicBlock::pred_iterator PI = CodeBBStart->pred_begin(),
+                                          E = CodeBBStart->pred_end();
+         PI != E; ++PI) {
+      if ((*PI) != CodeBBEnd) {
+        MachineBasicBlock *Pred = (*PI);
+        Pred->addSuccessor(IfBB);
+      }
+    }
+  }
+
+  removeExternalCFGEdges(CodeBBStart, CodeBBEnd);
+
+  auto CodeBBStartI = CodeBBStart->getIterator();
+  auto CodeBBEndI = CodeBBEnd->getIterator();
+  auto MergeIter = MergeBB->getIterator();
+  MF->insert(MergeIter, IfBB);
+  MF->splice(MergeIter, CodeBBStartI, ++CodeBBEndI);
+  IfBB->addSuccessor(MergeBB);
+  IfBB->addSuccessor(CodeBBStart);
+
+  DEBUG(dbgs() << "Created If block: " << IfBB->getNumber() << "\n");
+  // Ensure that the MergeBB is a successor of the CodeEndBB.
+  if (!CodeBBEnd->isSuccessor(MergeBB))
+    CodeBBEnd->addSuccessor(MergeBB);
+
+  DEBUG(dbgs() << "Moved MBB#" << CodeBBStart->getNumber() << " through MBB#"
+               << CodeBBEnd->getNumber() << "\n");
+
+  // If we have a single predecessor we can find a reasonable debug location
+  MachineBasicBlock *SinglePred =
+      CodeBBStart->pred_size() == 1 ? *(CodeBBStart->pred_begin()) : nullptr;
+  const DebugLoc &DL =
+      SinglePred ? SinglePred->findDebugLoc(SinglePred->getFirstTerminator())
+                 : DebugLoc();
+
+  unsigned Reg =
+      TII->insertEQ(IfBB, IfBB->begin(), DL, IfReg,
+                    SelectBB->getNumber() /* CodeBBStart->getNumber() */);
+  if (&(*(IfBB->getParent()->begin())) == IfBB) {
+    TII->materializeImmediate(*IfBB, IfBB->begin(), DL, IfReg,
+                              CodeBBStart->getNumber());
+  }
+  MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
+  ArrayRef<MachineOperand> Cond(RegOp);
+  TII->insertBranch(*IfBB, MergeBB, CodeBBStart, Cond, DL);
+
+  return IfBB;
+}
+
+void AMDGPUMachineCFGStructurizer::ensureCondIsNotKilled(
+    SmallVector<MachineOperand, 1> Cond) {
+  if (Cond.size() != 1)
+    return;
+  if (!Cond[0].isReg())
+    return;
+
+  unsigned CondReg = Cond[0].getReg();
+  for (auto UI = MRI->use_begin(CondReg), E = MRI->use_end(); UI != E; ++UI) {
+    (*UI).setIsKill(false);
+  }
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteCodeBBTerminator(MachineBasicBlock *CodeBB,
+                                                     MachineBasicBlock *MergeBB,
+                                                     unsigned BBSelectReg) {
+  MachineBasicBlock *TrueBB = nullptr;
+  MachineBasicBlock *FalseBB = nullptr;
+  SmallVector<MachineOperand, 1> Cond;
+  MachineBasicBlock *FallthroughBB = FallthroughMap[CodeBB];
+  TII->analyzeBranch(*CodeBB, TrueBB, FalseBB, Cond);
+
+  const DebugLoc &DL = CodeBB->findDebugLoc(CodeBB->getFirstTerminator());
+
+  if (FalseBB == nullptr && TrueBB == nullptr && FallthroughBB == nullptr) {
+    // This is an exit block, hence no successors. We will assign the
+    // bb select register to the entry block.
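+    // The value only has to be a valid block number, since no branch is
+    // taken on it after an exit block; the function entry's number is used
+    // as an always-available placeholder.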
+ TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + BBSelectReg, + CodeBB->getParent()->begin()->getNumber()); + insertUnconditionalBranch(CodeBB, MergeBB, DL); + return; + } + + if (FalseBB == nullptr && TrueBB == nullptr) { + TrueBB = FallthroughBB; + } else if (TrueBB != nullptr) { + FalseBB = + (FallthroughBB && (FallthroughBB != TrueBB)) ? FallthroughBB : FalseBB; + } + + if ((TrueBB != nullptr && FalseBB == nullptr) || (TrueBB == FalseBB)) { + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + BBSelectReg, TrueBB->getNumber()); + } else { + const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectReg); + unsigned TrueBBReg = MRI->createVirtualRegister(RegClass); + unsigned FalseBBReg = MRI->createVirtualRegister(RegClass); + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + TrueBBReg, TrueBB->getNumber()); + TII->materializeImmediate(*CodeBB, CodeBB->getFirstTerminator(), DL, + FalseBBReg, FalseBB->getNumber()); + ensureCondIsNotKilled(Cond); + TII->insertVectorSelect(*CodeBB, CodeBB->getFirstTerminator(), DL, + BBSelectReg, Cond, TrueBBReg, FalseBBReg); + } + + insertUnconditionalBranch(CodeBB, MergeBB, DL); +} + +MachineInstr *AMDGPUMachineCFGStructurizer::getDefInstr(unsigned Reg) { + if (MRI->def_begin(Reg) == MRI->def_end()) { + DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has NO defs\n"); + } else if (!MRI->hasOneDef(Reg)) { + DEBUG(dbgs() << "Register " << PrintReg(Reg, MRI->getTargetRegisterInfo()) + << " has multiple defs\n"); + DEBUG(dbgs() << "DEFS BEGIN:\n"); + for (auto DI = MRI->def_begin(Reg), DE = MRI->def_end(); DI != DE; ++DI) { + DEBUG(DI->getParent()->dump()); + } + DEBUG(dbgs() << "DEFS END\n"); + } + + assert(MRI->hasOneDef(Reg) && "Register has multiple definitions"); + return (*(MRI->def_begin(Reg))).getParent(); +} + +void AMDGPUMachineCFGStructurizer::insertChainedPHI(MachineBasicBlock *IfBB, + MachineBasicBlock *CodeBB, + MachineBasicBlock *MergeBB, + LinearizedRegion *InnerRegion, + unsigned DestReg, + unsigned SourceReg) { + // In this function we know we are part of a chain already, so we need + // to add the registers to the existing chain, and rename the register + // inside the region. + bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit(); + MachineInstr *DefInstr = getDefInstr(SourceReg); + if (DefInstr->isPHI() && DefInstr->getParent() == CodeBB && IsSingleBB) { + // Handle the case where the def is a PHI-def inside a basic + // block, then we only need to do renaming. Special care needs to + // be taken if the PHI-def is part of an existing chain, or if a + // new one needs to be created. + InnerRegion->replaceRegisterInsideRegion(SourceReg, DestReg, true, MRI); + + // We collect all PHI Information, and if we are at the region entry, + // all PHIs will be removed, and then re-introduced if needed. + storePHILinearizationInfoDest(DestReg, *DefInstr); + // We have picked up all the information we need now and can remove + // the PHI + PHIInfo.removeSource(DestReg, SourceReg, CodeBB); + DefInstr->eraseFromParent(); + } else { + // If this is not a phi-def, or it is a phi-def but from a linearized region + if (IsSingleBB && DefInstr->getParent() == InnerRegion->getEntry()) { + // If this is a single BB and the definition is in this block we + // need to replace any uses outside the region. 
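+      // Uses inside the single-block region keep SourceReg; only external
+      // uses are switched to DestReg, which the merge PHI inserted below
+      // will define.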
+      InnerRegion->replaceRegisterOutsideRegion(SourceReg, DestReg, false, MRI);
+    }
+    const TargetRegisterClass *RegClass = MRI->getRegClass(DestReg);
+    unsigned NextDestReg = MRI->createVirtualRegister(RegClass);
+    bool IsLastDef = PHIInfo.getNumSources(DestReg) == 1;
+    DEBUG(dbgs() << "Insert Chained PHI\n");
+    insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, DestReg, NextDestReg,
+                   SourceReg, IsLastDef);
+
+    PHIInfo.removeSource(DestReg, SourceReg, CodeBB);
+    if (IsLastDef) {
+      const DebugLoc &DL = IfBB->findDebugLoc(IfBB->getFirstTerminator());
+      TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DL,
+                                NextDestReg, 0);
+      PHIInfo.deleteDef(DestReg);
+    } else {
+      PHIInfo.replaceDef(DestReg, NextDestReg);
+    }
+  }
+}
+
+bool AMDGPUMachineCFGStructurizer::containsDef(MachineBasicBlock *MBB,
+                                         LinearizedRegion *InnerRegion,
+                                         unsigned Register) {
+  return getDefInstr(Register)->getParent() == MBB ||
+         InnerRegion->contains(getDefInstr(Register)->getParent());
+}
+
+void AMDGPUMachineCFGStructurizer::rewriteLiveOutRegs(MachineBasicBlock *IfBB,
+                                                MachineBasicBlock *CodeBB,
+                                                MachineBasicBlock *MergeBB,
+                                                LinearizedRegion *InnerRegion,
+                                                LinearizedRegion *LRegion) {
+  DenseSet<unsigned> *LiveOuts = InnerRegion->getLiveOuts();
+  SmallVector<unsigned, 4> OldLiveOuts;
+  bool IsSingleBB = InnerRegion->getEntry() == InnerRegion->getExit();
+  for (auto OLI : *LiveOuts) {
+    OldLiveOuts.push_back(OLI);
+  }
+
+  for (auto LI : OldLiveOuts) {
+    DEBUG(dbgs() << "LiveOut: " << PrintReg(LI, TRI));
+    if (!containsDef(CodeBB, InnerRegion, LI) ||
+        (!IsSingleBB && (getDefInstr(LI)->getParent() == LRegion->getExit()))) {
+      // If the register simply lives through the CodeBB, we don't have
+      // to rewrite anything since the register is not defined in this
+      // part of the code.
+      DEBUG(dbgs() << "- through");
+      continue;
+    }
+    DEBUG(dbgs() << "\n");
+    unsigned Reg = LI;
+    if (/*!PHIInfo.isSource(Reg) &&*/ Reg != InnerRegion->getBBSelectRegOut()) {
+      // If the register is live out, we do want to create a phi,
+      // unless it is from the Exit block, because in that case there
+      // is already a PHI, and no need to create a new one.
+
+      // If the register is just a live out def and not part of a phi
+      // chain, we need to create a PHI node to handle the if region,
+      // and replace all uses outside of the region with the new dest
+      // register, unless it is the outgoing BB select register. We have
+      // already created phi nodes for these.
+      const TargetRegisterClass *RegClass = MRI->getRegClass(Reg);
+      unsigned PHIDestReg = MRI->createVirtualRegister(RegClass);
+      unsigned IfSourceReg = MRI->createVirtualRegister(RegClass);
+      // Create initializer, this value is never used, but is needed
+      // to satisfy SSA.
+      DEBUG(dbgs() << "Initializer for reg: " << PrintReg(Reg) << "\n");
+      TII->materializeImmediate(*IfBB, IfBB->getFirstTerminator(), DebugLoc(),
+                                IfSourceReg, 0);
+
+      InnerRegion->replaceRegisterOutsideRegion(Reg, PHIDestReg, true, MRI);
+      DEBUG(dbgs() << "Insert Non-Chained Live out PHI\n");
+      insertMergePHI(IfBB, InnerRegion->getExit(), MergeBB, PHIDestReg,
+                     IfSourceReg, Reg, true);
+    }
+  }
+
+  // Handle the chained definitions in PHIInfo, checking if this basic block
+  // is a source block for a definition.
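+  // For every PHIInfo source register defined in CodeBB, the chain is
+  // extended with a merge PHI (see insertChainedPHI above) so the value
+  // reaches the end of the linearized region.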
+  SmallVector<unsigned, 4> Sources;
+  if (PHIInfo.findSourcesFromMBB(CodeBB, Sources)) {
+    DEBUG(dbgs() << "Inserting PHI Live Out from BB#" << CodeBB->getNumber()
+                 << "\n");
+    for (auto SI : Sources) {
+      unsigned DestReg;
+      PHIInfo.findDest(SI, CodeBB, DestReg);
+      insertChainedPHI(IfBB, CodeBB, MergeBB, InnerRegion, DestReg, SI);
+    }
+    DEBUG(dbgs() << "Insertion done.\n");
+  }
+
+  DEBUG(PHIInfo.dump(MRI));
+}
+
+void AMDGPUMachineCFGStructurizer::prunePHIInfo(MachineBasicBlock *MBB) {
+  DEBUG(dbgs() << "Before PHI Prune\n");
+  DEBUG(PHIInfo.dump(MRI));
+  SmallVector<std::tuple<unsigned, unsigned, MachineBasicBlock *>, 4>
+      EliminatedSources;
+  for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+       ++DRI) {
+
+    unsigned DestReg = *DRI;
+    auto SE = PHIInfo.sources_end(DestReg);
+
+    bool MBBContainsPHISource = false;
+    // Check if there is a PHI source in this MBB
+    for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+      unsigned SourceReg = (*SRI).first;
+      MachineOperand *Def = &(*(MRI->def_begin(SourceReg)));
+      if (Def->getParent()->getParent() == MBB) {
+        MBBContainsPHISource = true;
+      }
+    }
+
+    // If so, all other sources are useless since we know this block
+    // is always executed when the region is executed.
+    if (MBBContainsPHISource) {
+      for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+        PHILinearize::PHISourceT Source = *SRI;
+        unsigned SourceReg = Source.first;
+        MachineBasicBlock *SourceMBB = Source.second;
+        MachineOperand *Def = &(*(MRI->def_begin(SourceReg)));
+        if (Def->getParent()->getParent() != MBB) {
+          EliminatedSources.push_back(
+              std::make_tuple(DestReg, SourceReg, SourceMBB));
+        }
+      }
+    }
+  }
+
+  // Remove the PHI sources that are in the given MBB
+  for (auto &SourceInfo : EliminatedSources) {
+    PHIInfo.removeSource(std::get<0>(SourceInfo), std::get<1>(SourceInfo),
+                         std::get<2>(SourceInfo));
+  }
+  DEBUG(dbgs() << "After PHI Prune\n");
+  DEBUG(PHIInfo.dump(MRI));
+}
+
+void AMDGPUMachineCFGStructurizer::createEntryPHI(LinearizedRegion *CurrentRegion,
+                                                  unsigned DestReg) {
+  MachineBasicBlock *Entry = CurrentRegion->getEntry();
+  MachineBasicBlock *Exit = CurrentRegion->getExit();
+
+  DEBUG(dbgs() << "RegionExit: " << Exit->getNumber()
+               << " Pred: " << (*(Entry->pred_begin()))->getNumber() << "\n");
+
+  int NumSources = 0;
+  auto SE = PHIInfo.sources_end(DestReg);
+
+  for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+    NumSources++;
+  }
+
+  if (NumSources == 1) {
+    auto SRI = PHIInfo.sources_begin(DestReg);
+    unsigned SourceReg = (*SRI).first;
+    replaceRegisterWith(DestReg, SourceReg);
+  } else {
+    const DebugLoc &DL = Entry->findDebugLoc(Entry->begin());
+    MachineInstrBuilder MIB = BuildMI(*Entry, Entry->instr_begin(), DL,
+                                      TII->get(TargetOpcode::PHI), DestReg);
+    DEBUG(dbgs() << "Entry PHI " << PrintReg(DestReg, TRI) << " = PHI(");
+
+    unsigned CurrentBackedgeReg = 0;
+
+    for (auto SRI = PHIInfo.sources_begin(DestReg); SRI != SE; ++SRI) {
+      unsigned SourceReg = (*SRI).first;
+
+      if (CurrentRegion->contains((*SRI).second)) {
+        if (CurrentBackedgeReg == 0) {
+          CurrentBackedgeReg = SourceReg;
+        } else {
+          MachineInstr *PHIDefInstr = getDefInstr(SourceReg);
+          MachineBasicBlock *PHIDefMBB = PHIDefInstr->getParent();
+          const TargetRegisterClass *RegClass =
+              MRI->getRegClass(CurrentBackedgeReg);
+          unsigned NewBackedgeReg = MRI->createVirtualRegister(RegClass);
+          MachineInstrBuilder BackedgePHI =
+              BuildMI(*PHIDefMBB, PHIDefMBB->instr_begin(), DL,
+                      TII->get(TargetOpcode::PHI), NewBackedgeReg);
+          BackedgePHI.addReg(CurrentBackedgeReg);
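+          // Machine-level PHIs take (value, predecessor) operand pairs:
+          // the running backedge value forms the first pair, the new
+          // region-internal source the second.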
+          BackedgePHI.addMBB(getPHIPred(*PHIDefInstr, 0));
+          BackedgePHI.addReg(getPHISourceReg(*PHIDefInstr, 1));
+          BackedgePHI.addMBB((*SRI).second);
+          DEBUG(dbgs() << "Inserting backedge PHI: "
+                       << PrintReg(NewBackedgeReg, TRI) << " = PHI("
+                       << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
+                       << getPHIPred(*PHIDefInstr, 0)->getNumber() << ", "
+                       << PrintReg(getPHISourceReg(*PHIDefInstr, 1), TRI)
+                       << ", BB#" << (*SRI).second->getNumber());
+          CurrentBackedgeReg = NewBackedgeReg;
+        }
+      } else {
+        MIB.addReg(SourceReg);
+        MIB.addMBB((*SRI).second);
+        DEBUG(dbgs() << PrintReg(SourceReg, TRI) << ", BB#"
+                     << (*SRI).second->getNumber() << ", ");
+      }
+    }
+
+    // Add the final backedge register source to the entry phi
+    if (CurrentBackedgeReg != 0) {
+      MIB.addReg(CurrentBackedgeReg);
+      MIB.addMBB(Exit);
+      DEBUG(dbgs() << PrintReg(CurrentBackedgeReg, TRI) << ", BB#"
+                   << Exit->getNumber() << ")\n");
+    } else {
+      DEBUG(dbgs() << ")\n");
+    }
+  }
+}
+
+void AMDGPUMachineCFGStructurizer::createEntryPHIs(LinearizedRegion *CurrentRegion) {
+  DEBUG(PHIInfo.dump(MRI));
+
+  for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+       ++DRI) {
+
+    unsigned DestReg = *DRI;
+    createEntryPHI(CurrentRegion, DestReg);
+  }
+  PHIInfo.clear();
+}
+
+void AMDGPUMachineCFGStructurizer::replaceRegisterWith(unsigned Register,
+                                                       unsigned NewRegister) {
+  assert(Register != NewRegister && "Cannot replace a reg with itself");
+
+  for (MachineRegisterInfo::reg_iterator I = MRI->reg_begin(Register),
+                                         E = MRI->reg_end();
+       I != E;) {
+    MachineOperand &O = *I;
+    ++I;
+    if (TargetRegisterInfo::isPhysicalRegister(NewRegister)) {
+      DEBUG(dbgs() << "Trying to substitute physical register: "
+                   << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+                   << "\n");
+      llvm_unreachable("Cannot substitute physical registers");
+      // We don't handle physical registers, but if we need to
+      // in the future, this is how we would do it:
+      // O.substPhysReg(NewRegister, *TRI);
+    } else {
+      DEBUG(dbgs() << "Replacing register: "
+                   << PrintReg(Register, MRI->getTargetRegisterInfo())
+                   << " with "
+                   << PrintReg(NewRegister, MRI->getTargetRegisterInfo())
+                   << "\n");
+      O.setReg(NewRegister);
+    }
+  }
+  PHIInfo.deleteDef(Register);
+
+  getRegionMRT()->replaceLiveOutReg(Register, NewRegister);
+
+  DEBUG(PHIInfo.dump(MRI));
+}
+
+void AMDGPUMachineCFGStructurizer::resolvePHIInfos(MachineBasicBlock *FunctionEntry) {
+  DEBUG(dbgs() << "Resolve PHI Infos\n");
+  DEBUG(PHIInfo.dump(MRI));
+  for (auto DRI = PHIInfo.dests_begin(), DE = PHIInfo.dests_end(); DRI != DE;
+       ++DRI) {
+    unsigned DestReg = *DRI;
+    DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI) << "\n");
+    auto SRI = PHIInfo.sources_begin(DestReg);
+    unsigned SourceReg = (*SRI).first;
+    DEBUG(dbgs() << "DestReg: " << PrintReg(DestReg, TRI)
+                 << " SourceReg: " << PrintReg(SourceReg, TRI) << "\n");
+
+    assert(PHIInfo.sources_end(DestReg) == ++SRI &&
+           "More than one phi source in entry node");
+    replaceRegisterWith(DestReg, SourceReg);
+  }
+}
+
+static bool isFunctionEntryBlock(MachineBasicBlock *MBB) {
+  return ((&(*(MBB->getParent()->begin()))) == MBB);
+}
+
+MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion(
+    MachineBasicBlock *MergeBB, MachineBasicBlock *CodeBB,
+    LinearizedRegion *CurrentRegion, unsigned BBSelectRegIn,
+    unsigned BBSelectRegOut) {
+  if (isFunctionEntryBlock(CodeBB) && !CurrentRegion->getHasLoop()) {
+    // Handle non-loop function entry block.
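+    // The entry block can be finished in place: rewrite its terminator to
+    // set the BB select register and fall through to the merge block.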
+ // We need to allow loops to the entry block and then + rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut); + resolvePHIInfos(CodeBB); + removeExternalCFGSuccessors(CodeBB); + CodeBB->addSuccessor(MergeBB); + CurrentRegion->addMBB(CodeBB); + return nullptr; + } + if (CurrentRegion->getEntry() == CodeBB && !CurrentRegion->getHasLoop()) { + // Handle non-loop region entry block. + MachineFunction *MF = MergeBB->getParent(); + auto MergeIter = MergeBB->getIterator(); + auto CodeBBStartIter = CodeBB->getIterator(); + auto CodeBBEndIter = ++(CodeBB->getIterator()); + if (CodeBBEndIter != MergeIter) { + MF->splice(MergeIter, CodeBBStartIter, CodeBBEndIter); + } + rewriteCodeBBTerminator(CodeBB, MergeBB, BBSelectRegOut); + prunePHIInfo(CodeBB); + createEntryPHIs(CurrentRegion); + removeExternalCFGSuccessors(CodeBB); + CodeBB->addSuccessor(MergeBB); + CurrentRegion->addMBB(CodeBB); + return nullptr; + } else { + // Handle internal block. + const TargetRegisterClass *RegClass = MRI->getRegClass(BBSelectRegIn); + unsigned CodeBBSelectReg = MRI->createVirtualRegister(RegClass); + rewriteCodeBBTerminator(CodeBB, MergeBB, CodeBBSelectReg); + bool IsRegionEntryBB = CurrentRegion->getEntry() == CodeBB; + MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeBB, CodeBB, CodeBB, + BBSelectRegIn, IsRegionEntryBB); + CurrentRegion->addMBB(IfBB); + // If this is the entry block we need to make the If block the new + // linearized region entry. + if (IsRegionEntryBB) { + CurrentRegion->setEntry(IfBB); + + if (CurrentRegion->getHasLoop()) { + MachineBasicBlock *RegionExit = CurrentRegion->getExit(); + MachineBasicBlock *ETrueBB = nullptr; + MachineBasicBlock *EFalseBB = nullptr; + SmallVector ECond; + + const DebugLoc &DL = DebugLoc(); + TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond); + TII->removeBranch(*RegionExit); + + // We need to create a backedge if there is a loop + unsigned Reg = TII->insertNE( + RegionExit, RegionExit->instr_end(), DL, + CurrentRegion->getRegionMRT()->getInnerOutputRegister(), + CurrentRegion->getRegionMRT()->getEntry()->getNumber()); + MachineOperand RegOp = + MachineOperand::CreateReg(Reg, false, false, true); + ArrayRef Cond(RegOp); + DEBUG(dbgs() << "RegionExitReg: "); + DEBUG(Cond[0].print(dbgs(), TRI)); + DEBUG(dbgs() << "\n"); + TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit, + Cond, DebugLoc()); + RegionExit->addSuccessor(CurrentRegion->getEntry()); + } + } + CurrentRegion->addMBB(CodeBB); + LinearizedRegion InnerRegion(CodeBB, MRI, TRI, PHIInfo); + + InnerRegion.setParent(CurrentRegion); + DEBUG(dbgs() << "Insert BB Select PHI (BB)\n"); + insertMergePHI(IfBB, CodeBB, MergeBB, BBSelectRegOut, BBSelectRegIn, + CodeBBSelectReg); + InnerRegion.addMBB(MergeBB); + + DEBUG(InnerRegion.print(dbgs(), TRI)); + rewriteLiveOutRegs(IfBB, CodeBB, MergeBB, &InnerRegion, CurrentRegion); + extractKilledPHIs(CodeBB); + if (IsRegionEntryBB) { + createEntryPHIs(CurrentRegion); + } + return IfBB; + } +} + +MachineBasicBlock *AMDGPUMachineCFGStructurizer::createIfRegion( + MachineBasicBlock *MergeBB, LinearizedRegion *InnerRegion, + LinearizedRegion *CurrentRegion, MachineBasicBlock *SelectBB, + unsigned BBSelectRegIn, unsigned BBSelectRegOut) { + unsigned CodeBBSelectReg = + InnerRegion->getRegionMRT()->getInnerOutputRegister(); + MachineBasicBlock *CodeEntryBB = InnerRegion->getEntry(); + MachineBasicBlock *CodeExitBB = InnerRegion->getExit(); + MachineBasicBlock *IfBB = createIfBlock(MergeBB, CodeEntryBB, CodeExitBB, + SelectBB, BBSelectRegIn, true); + 
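+  // The if-block materializes a test of BBSelectRegIn and either executes
+  // the inner region or branches straight to MergeBB around it.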
+  CurrentRegion->addMBB(IfBB);
+  bool isEntry = CurrentRegion->getEntry() == InnerRegion->getEntry();
+  if (isEntry) {
+
+    if (CurrentRegion->getHasLoop()) {
+      MachineBasicBlock *RegionExit = CurrentRegion->getExit();
+      MachineBasicBlock *ETrueBB = nullptr;
+      MachineBasicBlock *EFalseBB = nullptr;
+      SmallVector<MachineOperand, 1> ECond;
+
+      const DebugLoc &DL = DebugLoc();
+      TII->analyzeBranch(*RegionExit, ETrueBB, EFalseBB, ECond);
+      TII->removeBranch(*RegionExit);
+
+      // We need to create a backedge if there is a loop
+      unsigned Reg =
+          TII->insertNE(RegionExit, RegionExit->instr_end(), DL,
+                        CurrentRegion->getRegionMRT()->getInnerOutputRegister(),
+                        CurrentRegion->getRegionMRT()->getEntry()->getNumber());
+      MachineOperand RegOp = MachineOperand::CreateReg(Reg, false, false, true);
+      ArrayRef<MachineOperand> Cond(RegOp);
+      DEBUG(dbgs() << "RegionExitReg: ");
+      DEBUG(Cond[0].print(dbgs(), TRI));
+      DEBUG(dbgs() << "\n");
+      TII->insertBranch(*RegionExit, CurrentRegion->getEntry(), RegionExit,
+                        Cond, DebugLoc());
+      RegionExit->addSuccessor(IfBB);
+    }
+  }
+  CurrentRegion->addMBBs(InnerRegion);
+  DEBUG(dbgs() << "Insert BB Select PHI (region)\n");
+  insertMergePHI(IfBB, CodeExitBB, MergeBB, BBSelectRegOut, BBSelectRegIn,
+                 CodeBBSelectReg);
+
+  rewriteLiveOutRegs(IfBB, /* CodeEntryBB */ CodeExitBB, MergeBB, InnerRegion,
+                     CurrentRegion);
+
+  rewriteRegionEntryPHIs(InnerRegion, IfBB);
+
+  if (isEntry) {
+    CurrentRegion->setEntry(IfBB);
+  }
+
+  if (isEntry) {
+    createEntryPHIs(CurrentRegion);
+  }
+
+  return IfBB;
+}
+
+void AMDGPUMachineCFGStructurizer::splitLoopPHI(MachineInstr &PHI,
+                                                MachineBasicBlock *Entry,
+                                                MachineBasicBlock *EntrySucc,
+                                                LinearizedRegion *LRegion) {
+  SmallVector<unsigned, 2> PHIRegionIndices;
+  getPHIRegionIndices(LRegion, PHI, PHIRegionIndices);
+
+  assert(PHIRegionIndices.size() == 1);
+
+  unsigned RegionIndex = PHIRegionIndices[0];
+  unsigned RegionSourceReg = getPHISourceReg(PHI, RegionIndex);
+  MachineBasicBlock *RegionSourceMBB = getPHIPred(PHI, RegionIndex);
+  unsigned PHIDest = getPHIDestReg(PHI);
+  unsigned PHISource = PHIDest;
+  unsigned ReplaceReg;
+
+  if (shrinkPHI(PHI, PHIRegionIndices, &ReplaceReg)) {
+    PHISource = ReplaceReg;
+  }
+
+  const TargetRegisterClass *RegClass = MRI->getRegClass(PHIDest);
+  unsigned NewDestReg = MRI->createVirtualRegister(RegClass);
+  LRegion->replaceRegisterInsideRegion(PHIDest, NewDestReg, false, MRI);
+  MachineInstrBuilder MIB =
+      BuildMI(*EntrySucc, EntrySucc->instr_begin(), PHI.getDebugLoc(),
+              TII->get(TargetOpcode::PHI), NewDestReg);
+  DEBUG(dbgs() << "Split Entry PHI " << PrintReg(NewDestReg, TRI)
+               << " = PHI(");
+  MIB.addReg(PHISource);
+  MIB.addMBB(Entry);
+  DEBUG(dbgs() << PrintReg(PHISource, TRI) << ", BB#" << Entry->getNumber());
+  MIB.addReg(RegionSourceReg);
+  MIB.addMBB(RegionSourceMBB);
+  DEBUG(dbgs() << ", " << PrintReg(RegionSourceReg, TRI) << ", BB#"
+               << RegionSourceMBB->getNumber() << ")\n");
+}
+
+void AMDGPUMachineCFGStructurizer::splitLoopPHIs(MachineBasicBlock *Entry,
+                                                 MachineBasicBlock *EntrySucc,
+                                                 LinearizedRegion *LRegion) {
+  SmallVector<MachineInstr *, 2> PHIs;
+  collectPHIs(Entry, PHIs);
+
+  for (auto PHII : PHIs) {
+    splitLoopPHI(*PHII, Entry, EntrySucc, LRegion);
+  }
+}
+
+// Split the exit block so that we can insert an end-control-flow instruction
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::splitExit(LinearizedRegion *LRegion) {
+  auto MRTRegion = LRegion->getRegionMRT();
+  auto Exit = LRegion->getExit();
+  auto MF = Exit->getParent();
+  auto Succ = MRTRegion->getSucc();
+
+  auto NewExit = MF->CreateMachineBasicBlock();
+  auto AfterExitIter = Exit->getIterator();
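+  // Place the new exit immediately after the old one in the function
+  // layout so the old exit falls through into it.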
+  AfterExitIter++;
+  MF->insert(AfterExitIter, NewExit);
+  Exit->removeSuccessor(Succ);
+  Exit->addSuccessor(NewExit);
+  NewExit->addSuccessor(Succ);
+  insertUnconditionalBranch(NewExit, Succ);
+  LRegion->addMBB(NewExit);
+  LRegion->setExit(NewExit);
+
+  DEBUG(dbgs() << "Created new exit block: " << NewExit->getNumber() << "\n");
+
+  // Replace any PHI Predecessors in the successor with NewExit
+  for (auto &II : *Succ) {
+    MachineInstr &Instr = II;
+
+    // If we are past the PHI instructions we are done
+    if (!Instr.isPHI())
+      break;
+
+    int numPreds = getPHINumInputs(Instr);
+    for (int i = 0; i < numPreds; ++i) {
+      auto Pred = getPHIPred(Instr, i);
+      if (Pred == Exit) {
+        setPhiPred(Instr, i, NewExit);
+      }
+    }
+  }
+
+  return NewExit;
+}
+
+
+static MachineBasicBlock *split(MachineBasicBlock::iterator I) {
+  // Create the fall-through block.
+  MachineBasicBlock *MBB = (*I).getParent();
+  MachineFunction *MF = MBB->getParent();
+  MachineBasicBlock *SuccMBB = MF->CreateMachineBasicBlock();
+  auto MBBIter = ++(MBB->getIterator());
+  MF->insert(MBBIter, SuccMBB);
+  SuccMBB->transferSuccessorsAndUpdatePHIs(MBB);
+  MBB->addSuccessor(SuccMBB);
+
+  // Splice the code over.
+  SuccMBB->splice(SuccMBB->end(), MBB, I, MBB->end());
+
+  return SuccMBB;
+}
+
+// Split the entry block, separating the PHI nodes from the rest of the code.
+// This is needed to insert an initializer for the bb select register in
+// loop regions.
+
+MachineBasicBlock *
+AMDGPUMachineCFGStructurizer::splitEntry(LinearizedRegion *LRegion) {
+  MachineBasicBlock *Entry = LRegion->getEntry();
+  MachineBasicBlock *EntrySucc = split(Entry->getFirstNonPHI());
+  MachineBasicBlock *Exit = LRegion->getExit();
+
+  DEBUG(dbgs() << "Split BB#" << Entry->getNumber() << " to BB#"
+               << Entry->getNumber() << " -> BB#" << EntrySucc->getNumber()
+               << "\n");
+  LRegion->addMBB(EntrySucc);
+
+  // Make the backedge go to Entry Succ
+  if (Exit->isSuccessor(Entry)) {
+    Exit->removeSuccessor(Entry);
+  }
+  Exit->addSuccessor(EntrySucc);
+  MachineInstr &Branch = *(Exit->instr_rbegin());
+  for (auto &UI : Branch.uses()) {
+    if (UI.isMBB() && UI.getMBB() == Entry) {
+      UI.setMBB(EntrySucc);
+    }
+  }
+
+  splitLoopPHIs(Entry, EntrySucc, LRegion);
+
+  return EntrySucc;
+}
+
+LinearizedRegion *
+AMDGPUMachineCFGStructurizer::initLinearizedRegion(RegionMRT *Region) {
+  LinearizedRegion *LRegion = Region->getLinearizedRegion();
+  LRegion->initLiveOut(Region, MRI, TRI, PHIInfo);
+  LRegion->setEntry(Region->getEntry());
+  return LRegion;
+}
+
+static void removeOldExitPreds(RegionMRT *Region) {
+  MachineBasicBlock *Exit = Region->getSucc();
+  if (Exit == nullptr) {
+    return;
+  }
+  for (MachineBasicBlock::pred_iterator PI = Exit->pred_begin(),
+                                        E = Exit->pred_end();
+       PI != E; ++PI) {
+    if (Region->contains(*PI)) {
+      (*PI)->removeSuccessor(Exit);
+    }
+  }
+}
+
+static bool mbbHasBackEdge(MachineBasicBlock *MBB,
+                           SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
+  for (auto SI = MBB->succ_begin(), SE = MBB->succ_end(); SI != SE; ++SI) {
+    if (MBBs.count(*SI) != 0) {
+      return true;
+    }
+  }
+  return false;
+}
+
+static bool containsNewBackedge(MRT *Tree,
+                                SmallPtrSet<MachineBasicBlock *, 8> &MBBs) {
+  // Need to traverse this in reverse since it is in post order.
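+  // A backedge shows up as a successor that is already in MBBs, i.e. a
+  // block later in the post-order walk; MBBs accumulates the visited set.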
+  if (Tree == nullptr)
+    return false;
+
+  if (Tree->isMBB()) {
+    MachineBasicBlock *MBB = Tree->getMBBMRT()->getMBB();
+    MBBs.insert(MBB);
+    if (mbbHasBackEdge(MBB, MBBs)) {
+      return true;
+    }
+  } else {
+    RegionMRT *Region = Tree->getRegionMRT();
+    SetVector<MRT *> *Children = Region->getChildren();
+    for (auto CI = Children->rbegin(), CE = Children->rend(); CI != CE; ++CI) {
+      if (containsNewBackedge(*CI, MBBs))
+        return true;
+    }
+  }
+  return false;
+}
+
+static bool containsNewBackedge(RegionMRT *Region) {
+  SmallPtrSet<MachineBasicBlock *, 8> MBBs;
+  return containsNewBackedge(Region, MBBs);
+}
+
+bool AMDGPUMachineCFGStructurizer::structurizeComplexRegion(RegionMRT *Region) {
+  auto *LRegion = initLinearizedRegion(Region);
+  LRegion->setHasLoop(containsNewBackedge(Region));
+  MachineBasicBlock *LastMerge = createLinearizedExitBlock(Region);
+  MachineBasicBlock *CurrentMerge = LastMerge;
+  LRegion->addMBB(LastMerge);
+  LRegion->setExit(LastMerge);
+
+  rewriteRegionExitPHIs(Region, LastMerge, LRegion);
+  removeOldExitPreds(Region);
+
+  DEBUG(PHIInfo.dump(MRI));
+
+  SetVector<MRT *> *Children = Region->getChildren();
+  DEBUG(dbgs() << "===========If Region Start===============\n");
+  if (LRegion->getHasLoop()) {
+    DEBUG(dbgs() << "Has Backedge: Yes\n");
+  } else {
+    DEBUG(dbgs() << "Has Backedge: No\n");
+  }
+
+  unsigned BBSelectRegIn;
+  unsigned BBSelectRegOut;
+  for (auto CI = Children->begin(), CE = Children->end(); CI != CE; ++CI) {
+    DEBUG(dbgs() << "CurrentRegion: \n");
+    DEBUG(LRegion->print(dbgs(), TRI));
+
+    auto CNI = CI;
+    ++CNI;
+
+    MRT *Child = (*CI);
+
+    if (Child->isRegion()) {
+
+      LinearizedRegion *InnerLRegion =
+          Child->getRegionMRT()->getLinearizedRegion();
+      // We found that this block is the exit of an inner region; we need
+      // to put it in the current linearized region.
+
+      DEBUG(dbgs() << "Linearizing region: ");
+      DEBUG(InnerLRegion->print(dbgs(), TRI));
+      DEBUG(dbgs() << "\n");
+
+      MachineBasicBlock *InnerEntry = InnerLRegion->getEntry();
+      if ((&(*(InnerEntry->getParent()->begin()))) == InnerEntry) {
+        // Entry has already been linearized, no need to do this region.
+        unsigned OuterSelect = InnerLRegion->getBBSelectRegOut();
+        unsigned InnerSelectReg =
+            InnerLRegion->getRegionMRT()->getInnerOutputRegister();
+        replaceRegisterWith(InnerSelectReg, OuterSelect);
+        resolvePHIInfos(InnerEntry);
+        if (!InnerLRegion->getExit()->isSuccessor(CurrentMerge))
+          InnerLRegion->getExit()->addSuccessor(CurrentMerge);
+        continue;
+      }
+
+      BBSelectRegOut = Child->getBBSelectRegOut();
+      BBSelectRegIn = Child->getBBSelectRegIn();
+
+      DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI)
+                   << "\n");
+      DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI)
+                   << "\n");
+
+      MachineBasicBlock *IfEnd = CurrentMerge;
+      CurrentMerge = createIfRegion(CurrentMerge, InnerLRegion, LRegion,
+                                    Child->getRegionMRT()->getEntry(),
+                                    BBSelectRegIn, BBSelectRegOut);
+      TII->convertNonUniformIfRegion(CurrentMerge, IfEnd);
+    } else {
+      MachineBasicBlock *MBB = Child->getMBBMRT()->getMBB();
+      DEBUG(dbgs() << "Linearizing block: " << MBB->getNumber() << "\n");
+
+      if (MBB == getSingleExitNode(*(MBB->getParent()))) {
+        // If this is the exit block then we need to skip to the next.
+        // The "in" register will be transferred to "out" in the next
+        // iteration.
+ continue; + } + + BBSelectRegOut = Child->getBBSelectRegOut(); + BBSelectRegIn = Child->getBBSelectRegIn(); + + DEBUG(dbgs() << "BBSelectRegIn: " << PrintReg(BBSelectRegIn, TRI) + << "\n"); + DEBUG(dbgs() << "BBSelectRegOut: " << PrintReg(BBSelectRegOut, TRI) + << "\n"); + + MachineBasicBlock *IfEnd = CurrentMerge; + // This is a basic block that is not part of an inner region, we + // need to put it in the current linearized region. + CurrentMerge = createIfRegion(CurrentMerge, MBB, LRegion, BBSelectRegIn, + BBSelectRegOut); + if (CurrentMerge) { + TII->convertNonUniformIfRegion(CurrentMerge, IfEnd); + } + + DEBUG(PHIInfo.dump(MRI)); + } + } + + LRegion->removeFalseRegisterKills(MRI); + + if (LRegion->getHasLoop()) { + MachineBasicBlock *NewSucc = splitEntry(LRegion); + if (isFunctionEntryBlock(LRegion->getEntry())) { + resolvePHIInfos(LRegion->getEntry()); + } + const DebugLoc &DL = NewSucc->findDebugLoc(NewSucc->getFirstNonPHI()); + unsigned InReg = LRegion->getBBSelectRegIn(); + unsigned InnerSelectReg = + MRI->createVirtualRegister(MRI->getRegClass(InReg)); + unsigned NewInReg = MRI->createVirtualRegister(MRI->getRegClass(InReg)); + TII->materializeImmediate(*(LRegion->getEntry()), + LRegion->getEntry()->getFirstTerminator(), DL, + NewInReg, Region->getEntry()->getNumber()); + // Need to be careful about updating the registers inside the region. + LRegion->replaceRegisterInsideRegion(InReg, InnerSelectReg, false, MRI); + DEBUG(dbgs() << "Loop BBSelect Merge PHI:\n"); + insertMergePHI(LRegion->getEntry(), LRegion->getExit(), NewSucc, + InnerSelectReg, NewInReg, + LRegion->getRegionMRT()->getInnerOutputRegister()); + splitExit(LRegion); + TII->convertNonUniformLoopRegion(NewSucc, LastMerge); + } + + if (Region->isRoot()) { + TII->insertReturn(*LastMerge); + } + + DEBUG(Region->getEntry()->getParent()->dump()); + DEBUG(LRegion->print(dbgs(), TRI)); + DEBUG(PHIInfo.dump(MRI)); + + DEBUG(dbgs() << "===========If Region End===============\n"); + + Region->setLinearizedRegion(LRegion); + return true; +} + +bool AMDGPUMachineCFGStructurizer::structurizeRegion(RegionMRT *Region) { + if (false && regionIsSimpleIf(Region)) { + transformSimpleIfRegion(Region); + return true; + } else if (regionIsSequence(Region)) { + fixupRegionExits(Region); + return false; + } else { + structurizeComplexRegion(Region); + } + return false; +} + +static int structurize_once = 0; + +bool AMDGPUMachineCFGStructurizer::structurizeRegions(RegionMRT *Region, + bool isTopRegion) { + bool Changed = false; + + auto Children = Region->getChildren(); + for (auto CI : *Children) { + if (CI->isRegion()) { + Changed |= structurizeRegions(CI->getRegionMRT(), false); + } + } + + if (structurize_once < 2 || true) { + Changed |= structurizeRegion(Region); + structurize_once++; + } + return Changed; +} + +void AMDGPUMachineCFGStructurizer::initFallthroughMap(MachineFunction &MF) { + DEBUG(dbgs() << "Fallthrough Map:\n"); + for (auto &MBBI : MF) { + MachineBasicBlock *MBB = MBBI.getFallThrough(); + if (MBB != nullptr) { + DEBUG(dbgs() << "Fallthrough: " << MBBI.getNumber() << " -> " + << MBB->getNumber() << "\n"); + } + FallthroughMap[&MBBI] = MBB; + } +} + +void AMDGPUMachineCFGStructurizer::createLinearizedRegion(RegionMRT *Region, + unsigned SelectOut) { + LinearizedRegion *LRegion = new LinearizedRegion(); + if (SelectOut) { + LRegion->addLiveOut(SelectOut); + DEBUG(dbgs() << "Add LiveOut (BBSelect): " << PrintReg(SelectOut, TRI) + << "\n"); + } + LRegion->setRegionMRT(Region); + Region->setLinearizedRegion(LRegion); + 
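+  // Mirror the MRT parent/child structure on the linearized regions so a
+  // nested region can reach its enclosing region.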
LRegion->setParent(Region->getParent() + ? Region->getParent()->getLinearizedRegion() + : nullptr); +} + +unsigned +AMDGPUMachineCFGStructurizer::initializeSelectRegisters(MRT *MRT, unsigned SelectOut, + MachineRegisterInfo *MRI, + const SIInstrInfo *TII) { + if (MRT->isRegion()) { + RegionMRT *Region = MRT->getRegionMRT(); + Region->setBBSelectRegOut(SelectOut); + unsigned InnerSelectOut = createBBSelectReg(TII, MRI); + + // Fixme: Move linearization creation to the original spot + createLinearizedRegion(Region, SelectOut); + + for (auto CI = Region->getChildren()->begin(), + CE = Region->getChildren()->end(); + CI != CE; ++CI) { + InnerSelectOut = + initializeSelectRegisters((*CI), InnerSelectOut, MRI, TII); + } + MRT->setBBSelectRegIn(InnerSelectOut); + return InnerSelectOut; + } else { + MRT->setBBSelectRegOut(SelectOut); + unsigned NewSelectIn = createBBSelectReg(TII, MRI); + MRT->setBBSelectRegIn(NewSelectIn); + return NewSelectIn; + } +} + +static void checkRegOnlyPHIInputs(MachineFunction &MF) { + for (auto &MBBI : MF) { + for (MachineBasicBlock::instr_iterator I = MBBI.instr_begin(), + E = MBBI.instr_end(); + I != E; ++I) { + MachineInstr &Instr = *I; + if (Instr.isPHI()) { + int numPreds = getPHINumInputs(Instr); + for (int i = 0; i < numPreds; ++i) { + assert(Instr.getOperand(i * 2 + 1).isReg() && + "PHI Operand not a register"); + } + } + } + } +} + + +INITIALIZE_PASS_BEGIN(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", + "AMDGPU Machine CFG Structurizer", false, false) +INITIALIZE_PASS_DEPENDENCY(MachineRegionInfoPass) +INITIALIZE_PASS_END(AMDGPUMachineCFGStructurizer, "amdgpu-machine-cfg-structurizer", + "AMDGPU Machine CFG Structurizer", false, false) + +char AMDGPUMachineCFGStructurizerID = AMDGPUMachineCFGStructurizer::ID; + + +bool AMDGPUMachineCFGStructurizer::runOnMachineFunction(MachineFunction &MF) { + const SISubtarget &ST = MF.getSubtarget(); + const SIInstrInfo *TII = ST.getInstrInfo(); + TRI = ST.getRegisterInfo(); + MRI = &(MF.getRegInfo()); + initFallthroughMap(MF); + + checkRegOnlyPHIInputs(MF); + DEBUG(dbgs() << "----STRUCTURIZER START----\n"); + DEBUG(MF.dump()); + + Regions = &(getAnalysis().getRegionInfo()); + DEBUG(Regions->dump()); + + RegionMRT *RTree = MRT::buildMRT(MF, Regions, TII, MRI); + setRegionMRT(RTree); + initializeSelectRegisters(RTree, 0, MRI, TII); + DEBUG(RTree->dump(TRI)); + bool result = structurizeRegions(RTree, true); + delete RTree; + DEBUG(dbgs() << "----STRUCTURIZER END----\n"); + initFallthroughMap(MF); + return result; +} + +FunctionPass *llvm::createAMDGPUMachineCFGStructurizerPass() { + return new AMDGPUMachineCFGStructurizer(); +} diff --git a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp index 36dcc699d4ea..e40f39557747 100644 --- a/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp +++ b/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp @@ -397,14 +397,17 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) { // instructions. static bool canVectorizeInst(Instruction *Inst, User *User) { switch (Inst->getOpcode()) { - case Instruction::Load: + case Instruction::Load: { + LoadInst *LI = cast(Inst); + return !LI->isVolatile(); + } case Instruction::BitCast: case Instruction::AddrSpaceCast: return true; case Instruction::Store: { // Must be the stored pointer operand, not a stored value. 
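+    // For example, "store i32 %v, i32* %alloca" qualifies (the alloca is
+    // the address), while "store i32* %alloca, i32** %p" stores the
+    // pointer itself and escapes it.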
StoreInst *SI = cast(Inst); - return SI->getPointerOperand() == User; + return (SI->getPointerOperand() == User) && !SI->isVolatile(); } default: return false; diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp index 972c28579f7a..6e301b4ad527 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.cpp +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.cpp @@ -125,6 +125,9 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT, StringRef GPU, StringRef FS, HasSDWA(false), HasDPP(false), FlatAddressSpace(false), + FlatInstOffsets(false), + FlatGlobalInsts(false), + FlatScratchInsts(false), R600ALUInst(false), CaymanISA(false), diff --git a/lib/Target/AMDGPU/AMDGPUSubtarget.h b/lib/Target/AMDGPU/AMDGPUSubtarget.h index a5cda817ac11..bed7d326b3dd 100644 --- a/lib/Target/AMDGPU/AMDGPUSubtarget.h +++ b/lib/Target/AMDGPU/AMDGPUSubtarget.h @@ -145,6 +145,9 @@ protected: bool HasSDWA; bool HasDPP; bool FlatAddressSpace; + bool FlatInstOffsets; + bool FlatGlobalInsts; + bool FlatScratchInsts; bool R600ALUInst; bool CaymanISA; bool CFALUBug; @@ -380,6 +383,18 @@ public: return FlatAddressSpace; } + bool hasFlatInstOffsets() const { + return FlatInstOffsets; + } + + bool hasFlatGlobalInsts() const { + return FlatGlobalInsts; + } + + bool hasFlatScratchInsts() const { + return FlatScratchInsts; + } + bool isMesaKernel(const MachineFunction &MF) const { return isMesa3DOS() && !AMDGPU::isShader(MF.getFunction()->getCallingConv()); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index cd5bad04d0b3..386a88b0520f 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -118,6 +118,13 @@ static cl::opt EnableSIInsertWaitcntsPass( cl::desc("Use new waitcnt insertion pass"), cl::init(false)); +// Option to run late CFG structurizer +static cl::opt LateCFGStructurize( + "amdgpu-late-structurize", + cl::desc("Enable late CFG structurization"), + cl::init(false), + cl::Hidden); + extern "C" void LLVMInitializeAMDGPUTarget() { // Register the target RegisterTargetMachine X(getTheAMDGPUTarget()); @@ -702,11 +709,15 @@ bool GCNPassConfig::addPreISel() { // Merge divergent exit nodes. StructurizeCFG won't recognize the multi-exit // regions formed by them. 
addPass(&AMDGPUUnifyDivergentExitNodesID); - addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions + if (!LateCFGStructurize) { + addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions + } addPass(createSinkingPass()); addPass(createSITypeRewriter()); addPass(createAMDGPUAnnotateUniformValues()); - addPass(createSIAnnotateControlFlowPass()); + if (!LateCFGStructurize) { + addPass(createSIAnnotateControlFlowPass()); + } return false; } @@ -770,6 +781,9 @@ bool GCNPassConfig::addGlobalInstructionSelect() { #endif void GCNPassConfig::addPreRegAlloc() { + if (LateCFGStructurize) { + addPass(createAMDGPUMachineCFGStructurizerPass()); + } addPass(createSIWholeQuadModePass()); } diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp index c9482c37ec80..beafebc1284a 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp @@ -363,13 +363,22 @@ int AMDGPUTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index) { switch (Opcode) { case Instruction::ExtractElement: - case Instruction::InsertElement: + case Instruction::InsertElement: { + unsigned EltSize + = DL.getTypeSizeInBits(cast(ValTy)->getElementType()); + if (EltSize < 32) { + if (EltSize == 16 && Index == 0 && ST->has16BitInsts()) + return 0; + return BaseT::getVectorInstrCost(Opcode, ValTy, Index); + } + // Extracts are just reads of a subregister, so are free. Inserts are // considered free because we don't want to have any cost for scalarizing // operations, and we don't have to copy into a different register class. // Dynamic indexing isn't free and is best avoided. return Index == ~0u ? 2 : 0; + } default: return BaseT::getVectorInstrCost(Opcode, ValTy, Index); } @@ -479,3 +488,26 @@ bool AMDGPUTTIImpl::isSourceOfDivergence(const Value *V) const { return false; } + +unsigned AMDGPUTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp) { + if (ST->hasVOP3PInsts()) { + VectorType *VT = cast(Tp); + if (VT->getNumElements() == 2 && + DL.getTypeSizeInBits(VT->getElementType()) == 16) { + // With op_sel VOP3P instructions freely can access the low half or high + // half of a register, so any swizzle is free. 
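+      // For example, reversing a <2 x i16> value costs nothing: the use
+      // site simply reads the opposite half via op_sel.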
+ + switch (Kind) { + case TTI::SK_Broadcast: + case TTI::SK_Reverse: + case TTI::SK_PermuteSingleSrc: + return 0; + default: + break; + } + } + } + + return BaseT::getShuffleCost(Kind, Tp, Index, SubTp); +} diff --git a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h index 71d6306bc1a5..e0024e21e82b 100644 --- a/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h +++ b/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h @@ -114,6 +114,9 @@ public: } unsigned getVectorSplitCost() { return 0; } + + unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, + Type *SubTp); }; } // end namespace llvm diff --git a/lib/Target/AMDGPU/CMakeLists.txt b/lib/Target/AMDGPU/CMakeLists.txt index 7c0ef4aeac3c..cafce0164fa9 100644 --- a/lib/Target/AMDGPU/CMakeLists.txt +++ b/lib/Target/AMDGPU/CMakeLists.txt @@ -48,6 +48,7 @@ add_llvm_target(AMDGPUCodeGen AMDGPUISelDAGToDAG.cpp AMDGPULowerIntrinsics.cpp AMDGPUMCInstLower.cpp + AMDGPUMachineCFGStructurizer.cpp AMDGPUMachineFunction.cpp AMDGPUUnifyMetadata.cpp AMDGPUOpenCLImageTypeLoweringPass.cpp diff --git a/lib/Target/AMDGPU/FLATInstructions.td b/lib/Target/AMDGPU/FLATInstructions.td index b0ac0e689a0b..8ba9efd42c70 100644 --- a/lib/Target/AMDGPU/FLATInstructions.td +++ b/lib/Target/AMDGPU/FLATInstructions.td @@ -7,7 +7,7 @@ // //===----------------------------------------------------------------------===// -def FLATAtomic : ComplexPattern; +def FLATAtomic : ComplexPattern; //===----------------------------------------------------------------------===// // FLAT classes @@ -62,7 +62,9 @@ class FLAT_Real op, FLAT_Pseudo ps> : bits<8> vdst; bits<1> slc; bits<1> glc; - bits<1> tfe; + + // We don't use tfe right now, and it was removed in gfx9. + bits<1> tfe = 0; // 15-0 is reserved. let Inst{16} = !if(ps.has_glc, glc, ps.glcValue); @@ -79,8 +81,8 @@ class FLAT_Real op, FLAT_Pseudo ps> : class FLAT_Load_Pseudo : FLAT_Pseudo< opName, (outs regClass:$vdst), - (ins VReg_64:$vaddr, GLC:$glc, slc:$slc, tfe:$tfe), - " $vdst, $vaddr$glc$slc$tfe"> { + (ins VReg_64:$vaddr, GLC:$glc, slc:$slc), + " $vdst, $vaddr$glc$slc"> { let has_data = 0; let mayLoad = 1; } @@ -88,8 +90,8 @@ class FLAT_Load_Pseudo : FLAT_Pseudo< class FLAT_Store_Pseudo : FLAT_Pseudo< opName, (outs), - (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc, tfe:$tfe), - " $vaddr, $vdata$glc$slc$tfe"> { + (ins VReg_64:$vaddr, vdataClass:$vdata, GLC:$glc, slc:$slc), + " $vaddr, $vdata$glc$slc"> { let mayLoad = 0; let mayStore = 1; let has_vdst = 0; @@ -105,8 +107,8 @@ multiclass FLAT_Atomic_Pseudo< def "" : FLAT_Pseudo , AtomicNoRet { let mayLoad = 1; @@ -119,10 +121,10 @@ multiclass FLAT_Atomic_Pseudo< def _RTN : FLAT_Pseudo , + (atomic (FLATAtomic i64:$vaddr, i1:$slc), data_vt:$vdata))]>, AtomicNoRet { let mayLoad = 1; let mayStore = 1; @@ -311,30 +313,30 @@ def flat_truncstorei16 : flat_st ; // Patterns for global loads with no offset. class FlatLoadPat : Pat < (vt (node i64:$addr)), - (inst $addr, 0, 0, 0) + (inst $addr, 0, 0) >; class FlatLoadAtomicPat : Pat < (vt (node i64:$addr)), - (inst $addr, 1, 0, 0) + (inst $addr, 1, 0) >; class FlatStorePat : Pat < (node vt:$data, i64:$addr), - (inst $addr, $data, 0, 0, 0) + (inst $addr, $data, 0, 0) >; class FlatStoreAtomicPat : Pat < // atomic store follows atomic binop convention so the address comes // first. 
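+  // Compare FlatStorePat above, where the DAG operands are (data, addr).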
(node i64:$addr, vt:$data), - (inst $addr, $data, 1, 0, 0) + (inst $addr, $data, 1, 0) >; class FlatAtomicPat : Pat < (vt (node i64:$addr, data_vt:$data)), - (inst $addr, $data, 0, 0) + (inst $addr, $data, 0) >; let Predicates = [isCIVI] in { diff --git a/lib/Target/AMDGPU/GCNRegPressure.cpp b/lib/Target/AMDGPU/GCNRegPressure.cpp index bf16a8216001..8066428fe44a 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.cpp +++ b/lib/Target/AMDGPU/GCNRegPressure.cpp @@ -27,7 +27,7 @@ void llvm::printLivesAt(SlotIndex SI, unsigned Num = 0; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { const unsigned Reg = TargetRegisterInfo::index2VirtReg(I); - if (MRI.reg_nodbg_empty(Reg)) + if (!LIS.hasInterval(Reg)) continue; const auto &LI = LIS.getInterval(Reg); if (LI.hasSubRanges()) { @@ -131,13 +131,13 @@ bool GCNRegPressure::less(const SISubtarget &ST, const GCNRegPressure& O, unsigned MaxOccupancy) const { const auto SGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumSGPRs(getSGRPNum())); + ST.getOccupancyWithNumSGPRs(getSGPRNum())); const auto VGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumVGPRs(getVGRPNum())); + ST.getOccupancyWithNumVGPRs(getVGPRNum())); const auto OtherSGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumSGPRs(O.getSGRPNum())); + ST.getOccupancyWithNumSGPRs(O.getSGPRNum())); const auto OtherVGPROcc = std::min(MaxOccupancy, - ST.getOccupancyWithNumVGPRs(O.getVGRPNum())); + ST.getOccupancyWithNumVGPRs(O.getVGPRNum())); const auto Occ = std::min(SGPROcc, VGPROcc); const auto OtherOcc = std::min(OtherSGPROcc, OtherVGPROcc); @@ -167,17 +167,17 @@ bool GCNRegPressure::less(const SISubtarget &ST, return VW < OtherVW; } } - return SGPRImportant ? (getSGRPNum() < O.getSGRPNum()): - (getVGRPNum() < O.getVGRPNum()); + return SGPRImportant ? 
(getSGPRNum() < O.getSGPRNum()): + (getVGPRNum() < O.getVGPRNum()); } #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD void GCNRegPressure::print(raw_ostream &OS, const SISubtarget *ST) const { - OS << "VGPRs: " << getVGRPNum(); - if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGRPNum()) << ')'; - OS << ", SGPRs: " << getSGRPNum(); - if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGRPNum()) << ')'; + OS << "VGPRs: " << getVGPRNum(); + if (ST) OS << "(O" << ST->getOccupancyWithNumVGPRs(getVGPRNum()) << ')'; + OS << ", SGPRs: " << getSGPRNum(); + if (ST) OS << "(O" << ST->getOccupancyWithNumSGPRs(getSGPRNum()) << ')'; OS << ", LVGPR WT: " << getVGPRTuplesWeight() << ", LSGPR WT: " << getSGPRTuplesWeight(); if (ST) OS << " -> Occ: " << getOccupancy(*ST); @@ -192,7 +192,6 @@ LaneBitmask llvm::getLiveLaneMask(unsigned Reg, SlotIndex SI, const LiveIntervals &LIS, const MachineRegisterInfo &MRI) { - assert(!MRI.reg_nodbg_empty(Reg)); LaneBitmask LiveMask; const auto &LI = LIS.getInterval(Reg); if (LI.hasSubRanges()) { @@ -214,7 +213,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, GCNRPTracker::LiveRegSet LiveRegs; for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { auto Reg = TargetRegisterInfo::index2VirtReg(I); - if (MRI.reg_nodbg_empty(Reg)) + if (!LIS.hasInterval(Reg)) continue; auto LiveMask = getLiveLaneMask(Reg, SI, LIS, MRI); if (LiveMask.any()) @@ -223,13 +222,7 @@ GCNRPTracker::LiveRegSet llvm::getLiveRegs(SlotIndex SI, return LiveRegs; } -void GCNUpwardRPTracker::reset(const MachineInstr &MI) { - MRI = &MI.getParent()->getParent()->getRegInfo(); - LiveRegs = getLiveRegsAfter(MI, LIS); - MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); -} - -LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const { +LaneBitmask GCNRPTracker::getDefRegMask(const MachineOperand &MO) const { assert(MO.isDef() && MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())); @@ -241,7 +234,7 @@ LaneBitmask GCNUpwardRPTracker::getDefRegMask(const MachineOperand &MO) const { MRI->getTargetRegisterInfo()->getSubRegIndexLaneMask(MO.getSubReg()); } -LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const { +LaneBitmask GCNRPTracker::getUsedRegMask(const MachineOperand &MO) const { assert(MO.isUse() && MO.isReg() && TargetRegisterInfo::isVirtualRegister(MO.getReg())); @@ -259,6 +252,18 @@ LaneBitmask GCNUpwardRPTracker::getUsedRegMask(const MachineOperand &MO) const { return getLiveLaneMask(MO.getReg(), SI, LIS, *MRI); } +void GCNUpwardRPTracker::reset(const MachineInstr &MI, + const LiveRegSet *LiveRegsCopy) { + MRI = &MI.getParent()->getParent()->getRegInfo(); + if (LiveRegsCopy) { + if (&LiveRegs != LiveRegsCopy) + LiveRegs = *LiveRegsCopy; + } else { + LiveRegs = getLiveRegsAfter(MI, LIS); + } + MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); +} + void GCNUpwardRPTracker::recede(const MachineInstr &MI) { assert(MRI && "call reset first"); @@ -297,6 +302,100 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) { MaxPressure = max(MaxPressure, CurPressure); } +bool GCNDownwardRPTracker::reset(const MachineInstr &MI, + const LiveRegSet *LiveRegsCopy) { + MRI = &MI.getParent()->getParent()->getRegInfo(); + LastTrackedMI = nullptr; + MBBEnd = MI.getParent()->end(); + NextMI = &MI; + NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); + if (NextMI == MBBEnd) + return false; + if (LiveRegsCopy) { + if (&LiveRegs != LiveRegsCopy) + LiveRegs = *LiveRegsCopy; + } else { + LiveRegs = 
getLiveRegsBefore(*NextMI, LIS); + } + MaxPressure = CurPressure = getRegPressure(*MRI, LiveRegs); + return true; +} + +bool GCNDownwardRPTracker::advanceBeforeNext() { + assert(MRI && "call reset first"); + + NextMI = skipDebugInstructionsForward(NextMI, MBBEnd); + if (NextMI == MBBEnd) + return false; + + SlotIndex SI = LIS.getInstructionIndex(*NextMI).getBaseIndex(); + assert(SI.isValid()); + + // Remove dead registers or mask bits. + for (auto &It : LiveRegs) { + const LiveInterval &LI = LIS.getInterval(It.first); + if (LI.hasSubRanges()) { + for (const auto &S : LI.subranges()) { + if (!S.liveAt(SI)) { + auto PrevMask = It.second; + It.second &= ~S.LaneMask; + CurPressure.inc(It.first, PrevMask, It.second, *MRI); + } + } + } else if (!LI.liveAt(SI)) { + auto PrevMask = It.second; + It.second = LaneBitmask::getNone(); + CurPressure.inc(It.first, PrevMask, It.second, *MRI); + } + if (It.second.none()) + LiveRegs.erase(It.first); + } + + MaxPressure = max(MaxPressure, CurPressure); + + return true; +} + +void GCNDownwardRPTracker::advanceToNext() { + LastTrackedMI = &*NextMI++; + + // Add new registers or mask bits. + for (const auto &MO : LastTrackedMI->defs()) { + if (!MO.isReg()) + continue; + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) + continue; + auto &LiveMask = LiveRegs[Reg]; + auto PrevMask = LiveMask; + LiveMask |= getDefRegMask(MO); + CurPressure.inc(Reg, PrevMask, LiveMask, *MRI); + } + + MaxPressure = max(MaxPressure, CurPressure); +} + +bool GCNDownwardRPTracker::advance() { + // If we have just called reset live set is actual. + if ((NextMI == MBBEnd) || (LastTrackedMI && !advanceBeforeNext())) + return false; + advanceToNext(); + return true; +} + +bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator End) { + while (NextMI != End) + if (!advance()) return false; + return true; +} + +bool GCNDownwardRPTracker::advance(MachineBasicBlock::const_iterator Begin, + MachineBasicBlock::const_iterator End, + const LiveRegSet *LiveRegsCopy) { + reset(*Begin, LiveRegsCopy); + return advance(End); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) LLVM_DUMP_METHOD static void reportMismatch(const GCNRPTracker::LiveRegSet &LISLR, @@ -352,4 +451,16 @@ bool GCNUpwardRPTracker::isValid() const { return true; } +void GCNRPTracker::printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, + const MachineRegisterInfo &MRI) { + const TargetRegisterInfo *TRI = MRI.getTargetRegisterInfo(); + for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { + unsigned Reg = TargetRegisterInfo::index2VirtReg(I); + auto It = LiveRegs.find(Reg); + if (It != LiveRegs.end() && It->second.any()) + OS << ' ' << PrintVRegOrUnit(Reg, TRI) << ':' + << PrintLaneMask(It->second); + } + OS << '\n'; +} #endif diff --git a/lib/Target/AMDGPU/GCNRegPressure.h b/lib/Target/AMDGPU/GCNRegPressure.h index 82e76a7bfddc..9875ca6a6d16 100644 --- a/lib/Target/AMDGPU/GCNRegPressure.h +++ b/lib/Target/AMDGPU/GCNRegPressure.h @@ -33,19 +33,19 @@ struct GCNRegPressure { clear(); } - bool empty() const { return getSGRPNum() == 0 && getVGRPNum() == 0; } + bool empty() const { return getSGPRNum() == 0 && getVGPRNum() == 0; } void clear() { std::fill(&Value[0], &Value[TOTAL_KINDS], 0); } - unsigned getSGRPNum() const { return Value[SGPR32]; } - unsigned getVGRPNum() const { return Value[VGPR32]; } + unsigned getSGPRNum() const { return Value[SGPR32]; } + unsigned getVGPRNum() const { return Value[VGPR32]; } unsigned getVGPRTuplesWeight() const { return Value[VGPR_TUPLE]; } 
unsigned getSGPRTuplesWeight() const { return Value[SGPR_TUPLE]; } unsigned getOccupancy(const SISubtarget &ST) const { - return std::min(ST.getOccupancyWithNumSGPRs(getSGRPNum()), - ST.getOccupancyWithNumVGPRs(getVGRPNum())); + return std::min(ST.getOccupancyWithNumSGPRs(getSGPRNum()), + ST.getOccupancyWithNumVGPRs(getVGPRNum())); } void inc(unsigned Reg, @@ -92,16 +92,21 @@ public: typedef DenseMap LiveRegSet; protected: + const LiveIntervals &LIS; LiveRegSet LiveRegs; GCNRegPressure CurPressure, MaxPressure; const MachineInstr *LastTrackedMI = nullptr; mutable const MachineRegisterInfo *MRI = nullptr; - GCNRPTracker() {} + GCNRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {} + LaneBitmask getDefRegMask(const MachineOperand &MO) const; + LaneBitmask getUsedRegMask(const MachineOperand &MO) const; public: // live regs for the current state const decltype(LiveRegs) &getLiveRegs() const { return LiveRegs; } const MachineInstr *getLastTrackedMI() const { return LastTrackedMI; } + void clearMaxPressure() { MaxPressure.clear(); } + // returns MaxPressure, resetting it decltype(MaxPressure) moveMaxPressure() { auto Res = MaxPressure; @@ -111,17 +116,16 @@ public: decltype(LiveRegs) moveLiveRegs() { return std::move(LiveRegs); } + static void printLiveRegs(raw_ostream &OS, const LiveRegSet& LiveRegs, + const MachineRegisterInfo &MRI); }; class GCNUpwardRPTracker : public GCNRPTracker { - const LiveIntervals &LIS; - LaneBitmask getDefRegMask(const MachineOperand &MO) const; - LaneBitmask getUsedRegMask(const MachineOperand &MO) const; public: - GCNUpwardRPTracker(const LiveIntervals &LIS_) : LIS(LIS_) {} + GCNUpwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} // reset tracker to the point just below MI // filling live regs upon this point using LIS - void reset(const MachineInstr &MI); + void reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); // move to the state just above the MI void recede(const MachineInstr &MI); @@ -131,6 +135,41 @@ public: bool isValid() const; }; +class GCNDownwardRPTracker : public GCNRPTracker { + // Last position of reset or advanceBeforeNext + MachineBasicBlock::const_iterator NextMI; + + MachineBasicBlock::const_iterator MBBEnd; + +public: + GCNDownwardRPTracker(const LiveIntervals &LIS_) : GCNRPTracker(LIS_) {} + + const MachineBasicBlock::const_iterator getNext() const { return NextMI; } + + // Reset tracker to the point before the MI + // filling live regs upon this point using LIS. + // Returns false if block is empty except debug values. + bool reset(const MachineInstr &MI, const LiveRegSet *LiveRegs = nullptr); + + // Move to the state right before the next MI. Returns false if reached + // end of the block. + bool advanceBeforeNext(); + + // Move to the state at the MI, advanceBeforeNext has to be called first. + void advanceToNext(); + + // Move to the state at the next MI. Returns false if reached end of block. + bool advance(); + + // Advance instructions until before End. + bool advance(MachineBasicBlock::const_iterator End); + + // Reset to Begin and advance to End. 
+ bool advance(MachineBasicBlock::const_iterator Begin, + MachineBasicBlock::const_iterator End, + const LiveRegSet *LiveRegsCopy = nullptr); +}; + LaneBitmask getLiveLaneMask(unsigned Reg, SlotIndex SI, const LiveIntervals &LIS, diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/lib/Target/AMDGPU/GCNSchedStrategy.cpp index 630442625aa3..8ec46665daf5 100644 --- a/lib/Target/AMDGPU/GCNSchedStrategy.cpp +++ b/lib/Target/AMDGPU/GCNSchedStrategy.cpp @@ -316,46 +316,57 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C, MFI(*MF.getInfo()), StartingOccupancy(ST.getOccupancyWithLocalMemSize(MFI.getLDSSize(), *MF.getFunction())), - MinOccupancy(StartingOccupancy), Stage(0) { + MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) { DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n"); } void GCNScheduleDAGMILive::schedule() { + if (Stage == 0) { + // Just record regions at the first pass. + Regions.push_back(std::make_pair(RegionBegin, RegionEnd)); + return; + } + std::vector Unsched; Unsched.reserve(NumRegionInstrs); for (auto &I : *this) Unsched.push_back(&I); - std::pair PressureBefore; + GCNRegPressure PressureBefore; if (LIS) { - DEBUG(dbgs() << "Pressure before scheduling:\n"); - discoverLiveIns(); - PressureBefore = getRealRegPressure(); + PressureBefore = Pressure[RegionIdx]; + + DEBUG(dbgs() << "Pressure before scheduling:\nRegion live-ins:"; + GCNRPTracker::printLiveRegs(dbgs(), LiveIns[RegionIdx], MRI); + dbgs() << "Region live-in pressure: "; + llvm::getRegPressure(MRI, LiveIns[RegionIdx]).print(dbgs()); + dbgs() << "Region register pressure: "; + PressureBefore.print(dbgs())); } ScheduleDAGMILive::schedule(); - if (Stage == 0) - Regions.push_back(std::make_pair(RegionBegin, RegionEnd)); + Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); if (!LIS) return; // Check the results of scheduling. 
GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; - DEBUG(dbgs() << "Pressure after scheduling:\n"); auto PressureAfter = getRealRegPressure(); - LiveIns.clear(); - if (PressureAfter.first <= S.SGPRCriticalLimit && - PressureAfter.second <= S.VGPRCriticalLimit) { + DEBUG(dbgs() << "Pressure after scheduling: "; PressureAfter.print(dbgs())); + + if (PressureAfter.getSGPRNum() <= S.SGPRCriticalLimit && + PressureAfter.getVGPRNum() <= S.VGPRCriticalLimit) { + Pressure[RegionIdx] = PressureAfter; DEBUG(dbgs() << "Pressure in desired limits, done.\n"); return; } - unsigned WavesAfter = getMaxWaves(PressureAfter.first, - PressureAfter.second, MF); - unsigned WavesBefore = getMaxWaves(PressureBefore.first, - PressureBefore.second, MF); + unsigned WavesAfter = getMaxWaves(PressureAfter.getSGPRNum(), + PressureAfter.getVGPRNum(), MF); + unsigned WavesBefore = getMaxWaves(PressureBefore.getSGPRNum(), + PressureBefore.getVGPRNum(), MF); DEBUG(dbgs() << "Occupancy before scheduling: " << WavesBefore << ", after " << WavesAfter << ".\n"); @@ -368,8 +379,10 @@ void GCNScheduleDAGMILive::schedule() { << MinOccupancy << ".\n"); } - if (WavesAfter >= WavesBefore) + if (WavesAfter >= WavesBefore) { + Pressure[RegionIdx] = PressureAfter; return; + } DEBUG(dbgs() << "Attempting to revert scheduling.\n"); RegionEnd = RegionBegin; @@ -398,166 +411,139 @@ void GCNScheduleDAGMILive::schedule() { DEBUG(dbgs() << "Scheduling " << *MI); } RegionBegin = Unsched.front()->getIterator(); - if (Stage == 0) - Regions.back() = std::make_pair(RegionBegin, RegionEnd); + Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd); placeDebugValues(); } -static inline void setMask(const MachineRegisterInfo &MRI, - const SIRegisterInfo *SRI, unsigned Reg, - LaneBitmask &PrevMask, LaneBitmask NewMask, - unsigned &SGPRs, unsigned &VGPRs) { - int NewRegs = countPopulation(NewMask.getAsInteger()) - - countPopulation(PrevMask.getAsInteger()); - if (SRI->isSGPRReg(MRI, Reg)) - SGPRs += NewRegs; - if (SRI->isVGPR(MRI, Reg)) - VGPRs += NewRegs; - assert ((int)SGPRs >= 0 && (int)VGPRs >= 0); - PrevMask = NewMask; +GCNRegPressure GCNScheduleDAGMILive::getRealRegPressure() const { + GCNDownwardRPTracker RPTracker(*LIS); + RPTracker.advance(begin(), end(), &LiveIns[RegionIdx]); + return RPTracker.moveMaxPressure(); } -void GCNScheduleDAGMILive::discoverLiveIns() { - unsigned SGPRs = 0; - unsigned VGPRs = 0; +void GCNScheduleDAGMILive::computeBlockPressure(const MachineBasicBlock *MBB) { + GCNDownwardRPTracker RPTracker(*LIS); + + // If the block has the only successor then live-ins of that successor are + // live-outs of the current block. We can reuse calculated live set if the + // successor will be sent to scheduling past current block. 
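+  // The SlotIndex comparison below ensures the successor is laid out (and
+  // thus scheduled) after this block, so the cached live set is still
+  // current when the scheduler reaches it.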
+ const MachineBasicBlock *OnlySucc = nullptr; + if (MBB->succ_size() == 1 && !(*MBB->succ_begin())->empty()) { + SlotIndexes *Ind = LIS->getSlotIndexes(); + if (Ind->getMBBStartIdx(MBB) < Ind->getMBBStartIdx(*MBB->succ_begin())) + OnlySucc = *MBB->succ_begin(); + } - auto &MI = *begin()->getParent()->getFirstNonDebugInstr(); - const SIRegisterInfo *SRI = static_cast(TRI); - SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex(); - assert (SI.isValid()); - - DEBUG(dbgs() << "Region live-ins:"); - for (unsigned I = 0, E = MRI.getNumVirtRegs(); I != E; ++I) { - unsigned Reg = TargetRegisterInfo::index2VirtReg(I); - if (MRI.reg_nodbg_empty(Reg)) - continue; - const LiveInterval &LI = LIS->getInterval(Reg); - LaneBitmask LaneMask = LaneBitmask::getNone(); - if (LI.hasSubRanges()) { - for (const auto &S : LI.subranges()) - if (S.liveAt(SI)) - LaneMask |= S.LaneMask; - } else if (LI.liveAt(SI)) { - LaneMask = MRI.getMaxLaneMaskForVReg(Reg); - } + // Scheduler sends regions from the end of the block upwards. + size_t CurRegion = RegionIdx; + for (size_t E = Regions.size(); CurRegion != E; ++CurRegion) + if (Regions[CurRegion].first->getParent() != MBB) + break; + --CurRegion; + + auto I = MBB->begin(); + auto LiveInIt = MBBLiveIns.find(MBB); + if (LiveInIt != MBBLiveIns.end()) { + auto LiveIn = std::move(LiveInIt->second); + RPTracker.reset(*MBB->begin(), &LiveIn); + MBBLiveIns.erase(LiveInIt); + } else { + I = Regions[CurRegion].first; + RPTracker.reset(*I); + } - if (LaneMask.any()) { - setMask(MRI, SRI, Reg, LiveIns[Reg], LaneMask, SGPRs, VGPRs); + for ( ; ; ) { + I = RPTracker.getNext(); - DEBUG(dbgs() << ' ' << PrintVRegOrUnit(Reg, SRI) << ':' - << PrintLaneMask(LiveIns[Reg])); + if (Regions[CurRegion].first == I) { + LiveIns[CurRegion] = RPTracker.getLiveRegs(); + RPTracker.clearMaxPressure(); } - } - LiveInPressure = std::make_pair(SGPRs, VGPRs); + if (Regions[CurRegion].second == I) { + Pressure[CurRegion] = RPTracker.moveMaxPressure(); + if (CurRegion-- == RegionIdx) + break; + } + RPTracker.advanceToNext(); + RPTracker.advanceBeforeNext(); + } - DEBUG(dbgs() << "\nLive-in pressure:\nSGPR = " << SGPRs - << "\nVGPR = " << VGPRs << '\n'); + if (OnlySucc) { + if (I != MBB->end()) { + RPTracker.advanceToNext(); + RPTracker.advance(MBB->end()); + } + RPTracker.reset(*OnlySucc->begin(), &RPTracker.getLiveRegs()); + RPTracker.advanceBeforeNext(); + MBBLiveIns[OnlySucc] = RPTracker.moveLiveRegs(); + } } -std::pair -GCNScheduleDAGMILive::getRealRegPressure() const { - unsigned SGPRs, MaxSGPRs, VGPRs, MaxVGPRs; - SGPRs = MaxSGPRs = LiveInPressure.first; - VGPRs = MaxVGPRs = LiveInPressure.second; - - const SIRegisterInfo *SRI = static_cast(TRI); - DenseMap LiveRegs(LiveIns); +void GCNScheduleDAGMILive::finalizeSchedule() { + GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; + DEBUG(dbgs() << "All regions recorded, starting actual scheduling.\n"); - for (const MachineInstr &MI : *this) { - if (MI.isDebugValue()) - continue; - SlotIndex SI = LIS->getInstructionIndex(MI).getBaseIndex(); - assert (SI.isValid()); + LiveIns.resize(Regions.size()); + Pressure.resize(Regions.size()); - // Remove dead registers or mask bits. 
- for (auto &It : LiveRegs) { - if (It.second.none()) - continue; - const LiveInterval &LI = LIS->getInterval(It.first); - if (LI.hasSubRanges()) { - for (const auto &S : LI.subranges()) - if (!S.liveAt(SI)) - setMask(MRI, SRI, It.first, It.second, It.second & ~S.LaneMask, - SGPRs, VGPRs); - } else if (!LI.liveAt(SI)) { - setMask(MRI, SRI, It.first, It.second, LaneBitmask::getNone(), - SGPRs, VGPRs); - } - } + do { + Stage++; + RegionIdx = 0; + MachineBasicBlock *MBB = nullptr; - // Add new registers or mask bits. - for (const auto &MO : MI.defs()) { - if (!MO.isReg()) - continue; - unsigned Reg = MO.getReg(); - if (!TargetRegisterInfo::isVirtualRegister(Reg)) - continue; - unsigned SubRegIdx = MO.getSubReg(); - LaneBitmask LaneMask = SubRegIdx != 0 - ? TRI->getSubRegIndexLaneMask(SubRegIdx) - : MRI.getMaxLaneMaskForVReg(Reg); - LaneBitmask &LM = LiveRegs[Reg]; - setMask(MRI, SRI, Reg, LM, LM | LaneMask, SGPRs, VGPRs); - } - MaxSGPRs = std::max(MaxSGPRs, SGPRs); - MaxVGPRs = std::max(MaxVGPRs, VGPRs); - } + if (Stage > 1) { + // Retry function scheduling if we found resulting occupancy and it is + // lower than used for first pass scheduling. This will give more freedom + // to schedule low register pressure blocks. + // Code is partially copied from MachineSchedulerBase::scheduleRegions(). - DEBUG(dbgs() << "Real region's register pressure:\nSGPR = " << MaxSGPRs - << "\nVGPR = " << MaxVGPRs << '\n'); + if (!LIS || StartingOccupancy <= MinOccupancy) + break; - return std::make_pair(MaxSGPRs, MaxVGPRs); -} + DEBUG(dbgs() + << "Retrying function scheduling with lowest recorded occupancy " + << MinOccupancy << ".\n"); -void GCNScheduleDAGMILive::finalizeSchedule() { - // Retry function scheduling if we found resulting occupancy and it is - // lower than used for first pass scheduling. This will give more freedom - // to schedule low register pressure blocks. - // Code is partially copied from MachineSchedulerBase::scheduleRegions(). + S.setTargetOccupancy(MinOccupancy); + } - if (!LIS || StartingOccupancy <= MinOccupancy) - return; + for (auto Region : Regions) { + RegionBegin = Region.first; + RegionEnd = Region.second; - DEBUG(dbgs() << "Retrying function scheduling with lowest recorded occupancy " - << MinOccupancy << ".\n"); + if (RegionBegin->getParent() != MBB) { + if (MBB) finishBlock(); + MBB = RegionBegin->getParent(); + startBlock(MBB); + if (Stage == 1) + computeBlockPressure(MBB); + } - Stage++; - GCNMaxOccupancySchedStrategy &S = (GCNMaxOccupancySchedStrategy&)*SchedImpl; - S.setTargetOccupancy(MinOccupancy); + unsigned NumRegionInstrs = std::distance(begin(), end()); + enterRegion(MBB, begin(), end(), NumRegionInstrs); - MachineBasicBlock *MBB = nullptr; - for (auto Region : Regions) { - RegionBegin = Region.first; - RegionEnd = Region.second; + // Skip empty scheduling regions (0 or 1 schedulable instructions). 
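+      // A region with a single instruction has nothing to reorder, so
+      // scheduling it would be wasted work.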
+      if (begin() == end() || begin() == std::prev(end())) {
+        exitRegion();
+        continue;
+      }

-    if (RegionBegin->getParent() != MBB) {
-      if (MBB) finishBlock();
-      MBB = RegionBegin->getParent();
-      startBlock(MBB);
-    }
+      DEBUG(dbgs() << "********** MI Scheduling **********\n");
+      DEBUG(dbgs() << MF.getName()
+            << ":BB#" << MBB->getNumber() << " " << MBB->getName()
+            << "\n  From: " << *begin() << "    To: ";
+            if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
+            else dbgs() << "End";
+            dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');

-    unsigned NumRegionInstrs = std::distance(begin(), end());
-    enterRegion(MBB, begin(), end(), NumRegionInstrs);
+      schedule();

-    // Skip empty scheduling regions (0 or 1 schedulable instructions).
-    if (begin() == end() || begin() == std::prev(end())) {
       exitRegion();
-      continue;
+      ++RegionIdx;
     }

-    DEBUG(dbgs() << "********** MI Scheduling **********\n");
-    DEBUG(dbgs() << MF.getName()
-          << ":BB#" << MBB->getNumber() << " " << MBB->getName()
-          << "\n  From: " << *begin() << "    To: ";
-          if (RegionEnd != MBB->end()) dbgs() << *RegionEnd;
-          else dbgs() << "End";
-          dbgs() << " RegionInstrs: " << NumRegionInstrs << '\n');
+    finishBlock();

-    schedule();
-
-    exitRegion();
-  }
-  finishBlock();
-  LiveIns.shrink_and_clear();
+  } while (Stage < 2);
 }
diff --git a/lib/Target/AMDGPU/GCNSchedStrategy.h b/lib/Target/AMDGPU/GCNSchedStrategy.h
index 15af232704ff..3ed3cd5b3b1c 100644
--- a/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -14,6 +14,7 @@
 #ifndef LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H
 #define LLVM_LIB_TARGET_AMDGPU_GCNSCHEDSTRATEGY_H

+#include "GCNRegPressure.h"
 #include "llvm/CodeGen/MachineScheduler.h"

 namespace llvm {
@@ -74,21 +75,28 @@ class GCNScheduleDAGMILive : public ScheduleDAGMILive {
   // Scheduling stage number.
   unsigned Stage;

+  // Current region index.
+  size_t RegionIdx;
+
   // Vector of regions recorded for later rescheduling
   SmallVector<std::pair<MachineBasicBlock::iterator,
                         MachineBasicBlock::iterator>, 32> Regions;

-  // Region live-ins.
-  DenseMap<unsigned, LaneBitmask> LiveIns;
+  // Region live-in cache.
+  SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
+
+  // Region pressure cache.
+  SmallVector<GCNRegPressure, 32> Pressure;
+
+  // Temporary basic block live-in cache.
+  DenseMap<const MachineBasicBlock *, GCNRPTracker::LiveRegSet> MBBLiveIns;

-  // Number of live-ins to the current region, first SGPR then VGPR.
-  std::pair<unsigned, unsigned> LiveInPressure;
+  // Return current region pressure.
+  GCNRegPressure getRealRegPressure() const;

-  // Collect current region live-ins.
-  void discoverLiveIns();
+  // Compute and cache live-ins and pressure for all regions in block.
+  void computeBlockPressure(const MachineBasicBlock *MBB);

-  // Return current region pressure. First value is SGPR number, second is VGPR.
- std::pair getRealRegPressure() const; public: GCNScheduleDAGMILive(MachineSchedContext *C, diff --git a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp index d8cb98fe1b19..8cb35c506135 100644 --- a/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp +++ b/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp @@ -126,7 +126,7 @@ bool SIAnnotateControlFlow::doInitialization(Module &M) { Void = Type::getVoidTy(Context); Boolean = Type::getInt1Ty(Context); Int64 = Type::getInt64Ty(Context); - ReturnStruct = StructType::get(Boolean, Int64, (Type *)nullptr); + ReturnStruct = StructType::get(Boolean, Int64); BoolTrue = ConstantInt::getTrue(Context); BoolFalse = ConstantInt::getFalse(Context); diff --git a/lib/Target/AMDGPU/SIISelLowering.cpp b/lib/Target/AMDGPU/SIISelLowering.cpp index cc93c27731ff..48a14e4dbea2 100644 --- a/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/lib/Target/AMDGPU/SIISelLowering.cpp @@ -488,6 +488,7 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM, setTargetDAGCombine(ISD::FCANONICALIZE); setTargetDAGCombine(ISD::SCALAR_TO_VECTOR); setTargetDAGCombine(ISD::ZERO_EXTEND); + setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT); // All memory operations. Some folding on the pointer operand is done to help // matching the constant offsets in the addressing modes. @@ -2003,6 +2004,7 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter( break; } assert(Found); + (void)Found; // This should be before all vector instructions. BuildMI(*BB, FirstMI, DebugLoc(), TII->get(AMDGPU::S_BFE_U32), CountReg) @@ -4604,6 +4606,24 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N, return SDValue(); } +SDValue SITargetLowering::performExtractVectorEltCombine( + SDNode *N, DAGCombinerInfo &DCI) const { + SDValue Vec = N->getOperand(0); + + SelectionDAG &DAG= DCI.DAG; + if (Vec.getOpcode() == ISD::FNEG && allUsesHaveSourceMods(N)) { + SDLoc SL(N); + EVT EltVT = N->getValueType(0); + SDValue Idx = N->getOperand(1); + SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, + Vec.getOperand(0), Idx); + return DAG.getNode(ISD::FNEG, SL, EltVT, Elt); + } + + return SDValue(); +} + + unsigned SITargetLowering::getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const { @@ -4891,6 +4911,8 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N, break; } + case ISD::EXTRACT_VECTOR_ELT: + return performExtractVectorEltCombine(N, DCI); } return AMDGPUTargetLowering::PerformDAGCombine(N, DCI); } diff --git a/lib/Target/AMDGPU/SIISelLowering.h b/lib/Target/AMDGPU/SIISelLowering.h index d177777ad5ee..046e677756d1 100644 --- a/lib/Target/AMDGPU/SIISelLowering.h +++ b/lib/Target/AMDGPU/SIISelLowering.h @@ -100,6 +100,7 @@ class SITargetLowering final : public AMDGPUTargetLowering { SDValue performMinMaxCombine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performFMed3Combine(SDNode *N, DAGCombinerInfo &DCI) const; SDValue performCvtPkRTZCombine(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue performExtractVectorEltCombine(SDNode *N, DAGCombinerInfo &DCI) const; unsigned getFusedOpcode(const SelectionDAG &DAG, const SDNode *N0, const SDNode *N1) const; diff --git a/lib/Target/AMDGPU/SIInstrInfo.cpp b/lib/Target/AMDGPU/SIInstrInfo.cpp index 92e452a3d6a0..065fd09eb356 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -496,6 +496,188 @@ int SIInstrInfo::commuteOpcode(unsigned Opcode) const { return Opcode; } +void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB, + 
MachineBasicBlock::iterator MI, + const DebugLoc &DL, unsigned DestReg, + int64_t Value) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + const TargetRegisterClass *RegClass = MRI.getRegClass(DestReg); + if (RegClass == &AMDGPU::SReg_32RegClass || + RegClass == &AMDGPU::SGPR_32RegClass || + RegClass == &AMDGPU::SReg_32_XM0RegClass || + RegClass == &AMDGPU::SReg_32_XM0_XEXECRegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DestReg) + .addImm(Value); + return; + } + + if (RegClass == &AMDGPU::SReg_64RegClass || + RegClass == &AMDGPU::SGPR_64RegClass || + RegClass == &AMDGPU::SReg_64_XEXECRegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), DestReg) + .addImm(Value); + return; + } + + if (RegClass == &AMDGPU::VGPR_32RegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DestReg) + .addImm(Value); + return; + } + if (RegClass == &AMDGPU::VReg_64RegClass) { + BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B64_PSEUDO), DestReg) + .addImm(Value); + return; + } + + unsigned EltSize = 4; + unsigned Opcode = AMDGPU::V_MOV_B32_e32; + if (RI.isSGPRClass(RegClass)) { + if (RI.getRegSizeInBits(*RegClass) > 32) { + Opcode = AMDGPU::S_MOV_B64; + EltSize = 8; + } else { + Opcode = AMDGPU::S_MOV_B32; + EltSize = 4; + } + } + + ArrayRef SubIndices = RI.getRegSplitParts(RegClass, EltSize); + for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { + int64_t IdxValue = Idx == 0 ? Value : 0; + + MachineInstrBuilder Builder = BuildMI(MBB, MI, DL, + get(Opcode), RI.getSubReg(DestReg, Idx)); + Builder.addImm(IdxValue); + } +} + +const TargetRegisterClass * +SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const { + return &AMDGPU::VGPR_32RegClass; +} + +void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, unsigned DstReg, + ArrayRef Cond, + unsigned TrueReg, + unsigned FalseReg) const { + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + assert(MRI.getRegClass(DstReg) == &AMDGPU::VGPR_32RegClass && + "Not a VGPR32 reg"); + + if (Cond.size() == 1) { + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .add(Cond[0]); + } else if (Cond.size() == 2) { + assert(Cond[0].isImm() && "Cond[0] is not an immediate"); + switch (Cond[0].getImm()) { + case SIInstrInfo::SCC_TRUE: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(-1) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + break; + } + case SIInstrInfo::SCC_FALSE: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(0) + .addImm(-1); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + break; + } + case SIInstrInfo::VCCNZ: { + MachineOperand RegOp = Cond[1]; + RegOp.setImplicit(false); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .add(RegOp); + break; + } + case SIInstrInfo::VCCZ: { + MachineOperand RegOp = Cond[1]; + RegOp.setImplicit(false); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(TrueReg) + .addReg(FalseReg) + .add(RegOp); + break; + } + case SIInstrInfo::EXECNZ: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SReg2 = 
MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(-1) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + break; + } + case SIInstrInfo::EXECZ: { + unsigned SReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned SReg2 = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(MBB, I, DL, get(AMDGPU::S_OR_SAVEEXEC_B64), SReg2) + .addImm(0); + BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), SReg) + .addImm(0) + .addImm(-1); + BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e64), DstReg) + .addReg(FalseReg) + .addReg(TrueReg) + .addReg(SReg); + llvm_unreachable("Unhandled branch predicate EXECZ"); + break; + } + default: + llvm_unreachable("invalid branch predicate"); + } + } else { + llvm_unreachable("Can only handle Cond size 1 or 2"); + } +} + +unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + unsigned SrcReg, int Value) const { + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg) + .addImm(Value) + .addReg(SrcReg); + + return Reg; +} + +unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, + const DebugLoc &DL, + unsigned SrcReg, int Value) const { + MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo(); + unsigned Reg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg) + .addImm(Value) + .addReg(SrcReg); + + return Reg; +} + unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const { if (RI.getRegSizeInBits(*DstRC) == 32) { @@ -834,6 +1016,20 @@ void SIInstrInfo::insertNoop(MachineBasicBlock &MBB, insertWaitStates(MBB, MI, 1); } +void SIInstrInfo::insertReturn(MachineBasicBlock &MBB) const { + auto MF = MBB.getParent(); + SIMachineFunctionInfo *Info = MF->getInfo(); + + assert(Info->isEntryFunction()); + + if (MBB.succ_empty()) { + bool HasNoTerminator = MBB.getFirstTerminator() == MBB.end(); + if (HasNoTerminator) + BuildMI(MBB, MBB.end(), DebugLoc(), + get(Info->returnsVoid() ? AMDGPU::S_ENDPGM : AMDGPU::SI_RETURN_TO_EPILOG)); + } +} + unsigned SIInstrInfo::getNumWaitStates(const MachineInstr &MI) const { switch (MI.getOpcode()) { default: return 1; // FIXME: Do wait states equal cycles? @@ -1241,14 +1437,20 @@ bool SIInstrInfo::analyzeBranchImpl(MachineBasicBlock &MBB, return false; } - BranchPredicate Pred = getBranchPredicate(I->getOpcode()); - if (Pred == INVALID_BR) - return true; + MachineBasicBlock *CondBB = nullptr; - MachineBasicBlock *CondBB = I->getOperand(0).getMBB(); - Cond.push_back(MachineOperand::CreateImm(Pred)); - Cond.push_back(I->getOperand(1)); // Save the branch register. + if (I->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { + CondBB = I->getOperand(1).getMBB(); + Cond.push_back(I->getOperand(0)); + } else { + BranchPredicate Pred = getBranchPredicate(I->getOpcode()); + if (Pred == INVALID_BR) + return true; + CondBB = I->getOperand(0).getMBB(); + Cond.push_back(MachineOperand::CreateImm(Pred)); + Cond.push_back(I->getOperand(1)); // Save the branch register. 
+ } ++I; if (I == MBB.end()) { @@ -1351,6 +1553,13 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, return 1; } + if(Cond.size() == 1 && Cond[0].isReg()) { + BuildMI(&MBB, DL, get(AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO)) + .add(Cond[0]) + .addMBB(TBB); + return 1; + } + assert(TBB && Cond[0].isImm()); unsigned Opcode @@ -1390,9 +1599,16 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB, bool SIInstrInfo::reverseBranchCondition( SmallVectorImpl &Cond) const { - assert(Cond.size() == 2); - Cond[0].setImm(-Cond[0].getImm()); - return false; + if (Cond.size() != 2) { + return true; + } + + if (Cond[0].isImm()) { + Cond[0].setImm(-Cond[0].getImm()); + return false; + } + + return true; } bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB, @@ -3920,6 +4136,82 @@ bool SIInstrInfo::mayAccessFlatAddressSpace(const MachineInstr &MI) const { return false; } +bool SIInstrInfo::isNonUniformBranchInstr(MachineInstr &Branch) const { + return Branch.getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO; +} + +void SIInstrInfo::convertNonUniformIfRegion(MachineBasicBlock *IfEntry, + MachineBasicBlock *IfEnd) const { + MachineBasicBlock::iterator TI = IfEntry->getFirstTerminator(); + assert(TI != IfEntry->end()); + + MachineInstr *Branch = &(*TI); + MachineFunction *MF = IfEntry->getParent(); + MachineRegisterInfo &MRI = IfEntry->getParent()->getRegInfo(); + + if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { + unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstr *SIIF = + BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_IF), DstReg) + .add(Branch->getOperand(0)) + .add(Branch->getOperand(1)); + MachineInstr *SIEND = + BuildMI(*MF, Branch->getDebugLoc(), get(AMDGPU::SI_END_CF)) + .addReg(DstReg); + + IfEntry->erase(TI); + IfEntry->insert(IfEntry->end(), SIIF); + IfEnd->insert(IfEnd->getFirstNonPHI(), SIEND); + } +} + +void SIInstrInfo::convertNonUniformLoopRegion( + MachineBasicBlock *LoopEntry, MachineBasicBlock *LoopEnd) const { + MachineBasicBlock::iterator TI = LoopEnd->getFirstTerminator(); + // We expect 2 terminators, one conditional and one unconditional. 
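+  // getFirstTerminator() therefore returns the conditional backedge branch,
+  // which is rewritten into SI_IF_BREAK + SI_LOOP below.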
+ assert(TI != LoopEnd->end()); + + MachineInstr *Branch = &(*TI); + MachineFunction *MF = LoopEnd->getParent(); + MachineRegisterInfo &MRI = LoopEnd->getParent()->getRegInfo(); + + if (Branch->getOpcode() == AMDGPU::SI_NON_UNIFORM_BRCOND_PSEUDO) { + + unsigned DstReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + unsigned BackEdgeReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + MachineInstrBuilder HeaderPHIBuilder = + BuildMI(*(MF), Branch->getDebugLoc(), get(TargetOpcode::PHI), DstReg); + for (MachineBasicBlock::pred_iterator PI = LoopEntry->pred_begin(), + E = LoopEntry->pred_end(); + PI != E; ++PI) { + if (*PI == LoopEnd) { + HeaderPHIBuilder.addReg(BackEdgeReg); + } else { + MachineBasicBlock *PMBB = *PI; + unsigned ZeroReg = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass); + materializeImmediate(*PMBB, PMBB->getFirstTerminator(), DebugLoc(), + ZeroReg, 0); + HeaderPHIBuilder.addReg(ZeroReg); + } + HeaderPHIBuilder.addMBB(*PI); + } + MachineInstr *HeaderPhi = HeaderPHIBuilder; + MachineInstr *SIIFBREAK = BuildMI(*(MF), Branch->getDebugLoc(), + get(AMDGPU::SI_IF_BREAK), BackEdgeReg) + .addReg(DstReg) + .add(Branch->getOperand(0)); + MachineInstr *SILOOP = + BuildMI(*(MF), Branch->getDebugLoc(), get(AMDGPU::SI_LOOP)) + .addReg(BackEdgeReg) + .addMBB(LoopEntry); + + LoopEntry->insert(LoopEntry->begin(), HeaderPhi); + LoopEnd->erase(TI); + LoopEnd->insert(LoopEnd->end(), SIIFBREAK); + LoopEnd->insert(LoopEnd->end(), SILOOP); + } +} + ArrayRef> SIInstrInfo::getSerializableTargetIndices() const { static const std::pair TargetIndices[] = { diff --git a/lib/Target/AMDGPU/SIInstrInfo.h b/lib/Target/AMDGPU/SIInstrInfo.h index 03a5ef74b179..f6e5e8883f63 100644 --- a/lib/Target/AMDGPU/SIInstrInfo.h +++ b/lib/Target/AMDGPU/SIInstrInfo.h @@ -143,6 +143,23 @@ public: RegScavenger *RS, unsigned TmpReg, unsigned Offset, unsigned Size) const; + void materializeImmediate(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MI, + const DebugLoc &DL, + unsigned DestReg, + int64_t Value) const; + + const TargetRegisterClass *getPreferredSelectRegClass( + unsigned Size) const; + + unsigned insertNE(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned SrcReg, int Value) const; + + unsigned insertEQ(MachineBasicBlock *MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned SrcReg, int Value) const; + void storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg, bool isKill, int FrameIndex, @@ -193,7 +210,7 @@ public: bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB, MachineBasicBlock *&FBB, SmallVectorImpl &Cond, - bool AllowModify) const override; + bool AllowModify = false) const override; unsigned removeBranch(MachineBasicBlock &MBB, int *BytesRemoved = nullptr) const override; @@ -218,6 +235,11 @@ public: unsigned DstReg, ArrayRef Cond, unsigned TrueReg, unsigned FalseReg) const override; + void insertVectorSelect(MachineBasicBlock &MBB, + MachineBasicBlock::iterator I, const DebugLoc &DL, + unsigned DstReg, ArrayRef Cond, + unsigned TrueReg, unsigned FalseReg) const; + bool areMemAccessesTriviallyDisjoint(MachineInstr &MIa, MachineInstr &MIb, AliasAnalysis *AA = nullptr) const override; @@ -705,6 +727,7 @@ public: void insertNoop(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + void insertReturn(MachineBasicBlock &MBB) const; /// \brief Return the number of wait states that result from executing this /// instruction. 
unsigned getNumWaitStates(const MachineInstr &MI) const; @@ -750,6 +773,14 @@ public: bool mayAccessFlatAddressSpace(const MachineInstr &MI) const; + bool isNonUniformBranchInstr(MachineInstr &Instr) const; + + void convertNonUniformIfRegion(MachineBasicBlock *IfEntry, + MachineBasicBlock *IfEnd) const; + + void convertNonUniformLoopRegion(MachineBasicBlock *LoopEntry, + MachineBasicBlock *LoopEnd) const; + ArrayRef> getSerializableTargetIndices() const override; diff --git a/lib/Target/AMDGPU/SIInstructions.td b/lib/Target/AMDGPU/SIInstructions.td index 7ccb54f54e34..3b4bdc864253 100644 --- a/lib/Target/AMDGPU/SIInstructions.td +++ b/lib/Target/AMDGPU/SIInstructions.td @@ -174,6 +174,13 @@ def SI_MASK_BRANCH : VPseudoInstSI < let isTerminator = 1 in { + def SI_NON_UNIFORM_BRCOND_PSEUDO : CFPseudoInstSI < + (outs), + (ins SReg_64:$vcc, brtarget:$target), + [(brcond i1:$vcc, bb:$target)]> { + let Size = 12; +} + def SI_IF: CFPseudoInstSI < (outs SReg_64:$dst), (ins SReg_64:$vcc, brtarget:$target), [(set i64:$dst, (AMDGPUif i1:$vcc, bb:$target))], 1, 1> { diff --git a/lib/Target/AMDGPU/VOP2Instructions.td b/lib/Target/AMDGPU/VOP2Instructions.td index 2281f338ab45..4a11d9471f1d 100644 --- a/lib/Target/AMDGPU/VOP2Instructions.td +++ b/lib/Target/AMDGPU/VOP2Instructions.td @@ -164,8 +164,11 @@ multiclass VOP2eInst : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VCSrc_f32:$src0, VGPR_32:$src1, ImmOpType:$imm); - field string Asm32 = "$vdst, $src0, $src1, $imm"; field bit HasExt = 0; + + // Hack to stop printing _e64 + let DstRC = RegisterOperand; + field string Asm32 = " $vdst, $src0, $src1, $imm"; } def VOP_MADAK_F16 : VOP_MADAK ; @@ -174,8 +177,11 @@ def VOP_MADAK_F32 : VOP_MADAK ; class VOP_MADMK : VOPProfile <[vt, vt, vt, vt]> { field Operand ImmOpType = !if(!eq(vt.Size, 32), f32kimm, f16kimm); field dag Ins32 = (ins VCSrc_f32:$src0, ImmOpType:$imm, VGPR_32:$src1); - field string Asm32 = "$vdst, $src0, $imm, $src1"; field bit HasExt = 0; + + // Hack to stop printing _e64 + let DstRC = RegisterOperand; + field string Asm32 = " $vdst, $src0, $imm, $src1"; } def VOP_MADMK_F16 : VOP_MADMK ; @@ -298,7 +304,7 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32]> { let SubtargetPredicate = isGCN in { defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>; -def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32>; +def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, [], "">; let isCommutable = 1 in { defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>; @@ -328,7 +334,7 @@ let Constraints = "$vdst = $src2", DisableEncoding="$src2", defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>; } -def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32>; +def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, [], "">; // No patterns so that the scalar instructions are always selected. // The scalar versions will be replaced with vector when needed later. 
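The `Asm32` changes above and the `VOP2_Real_e64only_vi` multiclass in the next hunk rely on the same trick: the printed form of an instruction is assembled by concatenating a mnemonic with an operand string (visible below as `let AsmString = ps.Mnemonic # " " # ps.AsmOperands;`), so moving the leading space into the operand string and reusing the pseudo's bare mnemonic keeps an `_e64` suffix from appearing in the output. A minimal C++ sketch of that concatenation, with illustrative names rather than LLVM's actual printer API:

#include <iostream>
#include <string>

// Illustrative only: models how an instruction's printed form is derived by
// gluing a mnemonic onto an operand string that carries its own separator.
std::string buildAsmString(const std::string &Mnemonic,
                           const std::string &AsmOperands) {
  // No separator is added here; AsmOperands starts with " " by convention.
  return Mnemonic + AsmOperands;
}

int main() {
  // With the leading space inside the operand string, the base mnemonic
  // prints without any "_e64" decoration forced in between.
  std::cout << buildAsmString("v_madak_f32", " $vdst, $src0, $src1, $imm")
            << '\n';
  return 0;
}
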
@@ -383,7 +389,7 @@ defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32>; let SubtargetPredicate = isVI in { -def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16>; +def V_MADMK_F16 : VOP2_Pseudo <"v_madmk_f16", VOP_MADMK_F16, [], "">; defm V_LSHLREV_B16 : VOP2Inst <"v_lshlrev_b16", VOP_I16_I16_I16>; defm V_LSHRREV_B16 : VOP2Inst <"v_lshrrev_b16", VOP_I16_I16_I16>; defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16>; @@ -394,7 +400,7 @@ defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>; defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>; defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">; defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>; -def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16>; +def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">; defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16>; defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16>; defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">; @@ -651,6 +657,17 @@ multiclass VOP2_Real_e64_vi op> { VOP3e_vi (NAME#"_e64").Pfl>; } +multiclass VOP2_Real_e64only_vi op> { + def _e64_vi : + VOP3_Real(NAME#"_e64"), SIEncodingFamily.VI>, + VOP3e_vi (NAME#"_e64").Pfl> { + // Hack to stop printing _e64 + VOP3_Pseudo ps = !cast(NAME#"_e64"); + let OutOperandList = (outs VGPR_32:$vdst); + let AsmString = ps.Mnemonic # " " # ps.AsmOperands; + } +} + multiclass Base_VOP2be_Real_e32e64_vi op> : VOP2_Real_e32_vi { def _e64_vi : VOP3_Real(NAME#"_e64"), SIEncodingFamily.VI>, @@ -718,17 +735,17 @@ defm V_SUBBREV_U32 : VOP2be_Real_e32e64_vi <0x1e>; defm V_READLANE_B32 : VOP32_Real_vi <0x289>; defm V_WRITELANE_B32 : VOP32_Real_vi <0x28a>; -defm V_BFM_B32 : VOP2_Real_e64_vi <0x293>; -defm V_BCNT_U32_B32 : VOP2_Real_e64_vi <0x28b>; -defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64_vi <0x28c>; -defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64_vi <0x28d>; -defm V_LDEXP_F32 : VOP2_Real_e64_vi <0x288>; -defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64_vi <0x1f0>; -defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64_vi <0x294>; -defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64_vi <0x295>; -defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64_vi <0x296>; -defm V_CVT_PK_U16_U32 : VOP2_Real_e64_vi <0x297>; -defm V_CVT_PK_I16_I32 : VOP2_Real_e64_vi <0x298>; +defm V_BFM_B32 : VOP2_Real_e64only_vi <0x293>; +defm V_BCNT_U32_B32 : VOP2_Real_e64only_vi <0x28b>; +defm V_MBCNT_LO_U32_B32 : VOP2_Real_e64only_vi <0x28c>; +defm V_MBCNT_HI_U32_B32 : VOP2_Real_e64only_vi <0x28d>; +defm V_LDEXP_F32 : VOP2_Real_e64only_vi <0x288>; +defm V_CVT_PKACCUM_U8_F32 : VOP2_Real_e64only_vi <0x1f0>; +defm V_CVT_PKNORM_I16_F32 : VOP2_Real_e64only_vi <0x294>; +defm V_CVT_PKNORM_U16_F32 : VOP2_Real_e64only_vi <0x295>; +defm V_CVT_PKRTZ_F16_F32 : VOP2_Real_e64only_vi <0x296>; +defm V_CVT_PK_U16_U32 : VOP2_Real_e64only_vi <0x297>; +defm V_CVT_PK_I16_I32 : VOP2_Real_e64only_vi <0x298>; defm V_ADD_F16 : VOP2_Real_e32e64_vi <0x1f>; defm V_SUB_F16 : VOP2_Real_e32e64_vi <0x20>; diff --git a/lib/Target/AMDGPU/VOP3Instructions.td b/lib/Target/AMDGPU/VOP3Instructions.td index 217a07488853..ffa6c60d6b1f 100644 --- a/lib/Target/AMDGPU/VOP3Instructions.td +++ b/lib/Target/AMDGPU/VOP3Instructions.td @@ -232,7 +232,6 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile>; let SubtargetPredicate = isCIVI in { -def V_MQSAD_U16_U8 : VOP3Inst <"v_mqsad_u16_u8", VOP3_Profile>; def V_QSAD_PK_U16_U8 : VOP3Inst <"v_qsad_pk_u16_u8", VOP3_Profile, int_amdgcn_qsad_pk_u16_u8>; def 
V_MQSAD_U32_U8 : VOP3Inst <"v_mqsad_u32_u8", VOP3_Profile, int_amdgcn_mqsad_u32_u8>; @@ -402,7 +401,6 @@ multiclass VOP3be_Real_ci op> { } } -defm V_MQSAD_U16_U8 : VOP3_Real_ci <0x172>; defm V_QSAD_PK_U16_U8 : VOP3_Real_ci <0x172>; defm V_MQSAD_U32_U8 : VOP3_Real_ci <0x175>; defm V_MAD_U64_U32 : VOP3be_Real_ci <0x176>; @@ -426,7 +424,6 @@ multiclass VOP3be_Real_vi op> { } // End AssemblerPredicates = [isVI], DecoderNamespace = "VI" -defm V_MQSAD_U16_U8 : VOP3_Real_vi <0x172>; defm V_MAD_U64_U32 : VOP3be_Real_vi <0x1E8>; defm V_MAD_I64_I32 : VOP3be_Real_vi <0x1E9>; diff --git a/lib/Target/ARM/ARMBaseInstrInfo.h b/lib/Target/ARM/ARMBaseInstrInfo.h index 28c407f74125..dd7fe871345a 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/lib/Target/ARM/ARMBaseInstrInfo.h @@ -404,21 +404,11 @@ public: /// Returns predicate register associated with the given frame instruction. unsigned getFramePred(const MachineInstr &MI) const { assert(isFrameInstr(MI)); - if (isFrameSetup(MI)) - // Operands of ADJCALLSTACKDOWN: - // - argument declared in ADJCALLSTACKDOWN pattern: - // 0 - frame size - // 1 - predicate code (like ARMCC::AL) - // - added by predOps: - // 2 - predicate reg - return MI.getOperand(2).getReg(); - assert(MI.getOpcode() == ARM::ADJCALLSTACKUP || - MI.getOpcode() == ARM::tADJCALLSTACKUP); - // Operands of ADJCALLSTACKUP: - // - argument declared in ADJCALLSTACKUP pattern: + // Operands of ADJCALLSTACKDOWN/ADJCALLSTACKUP: + // - argument declared in the pattern: // 0 - frame size - // 1 - arg of CALLSEQ_END - // 2 - predicate code + // 1 - arg of CALLSEQ_START/CALLSEQ_END + // 2 - predicate code (like ARMCC::AL) // - added by predOps: // 3 - predicate reg return MI.getOperand(3).getReg(); diff --git a/lib/Target/ARM/ARMCallLowering.cpp b/lib/Target/ARM/ARMCallLowering.cpp index 9178c67afa6e..46ac4d0ad933 100644 --- a/lib/Target/ARM/ARMCallLowering.cpp +++ b/lib/Target/ARM/ARMCallLowering.cpp @@ -433,7 +433,7 @@ bool ARMCallLowering::lowerCall(MachineIRBuilder &MIRBuilder, // We now know the size of the stack - update the ADJCALLSTACKDOWN // accordingly. - CallSeqStart.addImm(ArgHandler.StackSize).add(predOps(ARMCC::AL)); + CallSeqStart.addImm(ArgHandler.StackSize).addImm(0).add(predOps(ARMCC::AL)); MIRBuilder.buildInstr(ARM::ADJCALLSTACKUP) .addImm(ArgHandler.StackSize) diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 56cac855620d..4f6a73b5980d 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -1949,7 +1949,7 @@ bool ARMFastISel::ProcessCallArgs(SmallVectorImpl &Args, unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown)) - .addImm(NumBytes)); + .addImm(NumBytes).addImm(0)); // Process the args. for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index e64582402fe1..f8b584db7b99 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -473,9 +473,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM, } // Use divmod compiler-rt calls for iOS 5.0 and later. 
- if (Subtarget->isTargetWatchOS() || - (Subtarget->isTargetIOS() && - !Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { + if (Subtarget->isTargetMachO() && + !(Subtarget->isTargetIOS() && + Subtarget->getTargetTriple().isOSVersionLT(5, 0))) { setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4"); setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4"); } @@ -1817,8 +1817,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!isSibCall) - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getIntPtrConstant(NumBytes, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue StackPtr = DAG.getCopyFromReg(Chain, dl, ARM::SP, getPointerTy(DAG.getDataLayout())); @@ -7365,7 +7364,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const { const TargetLowering &TLI = DAG.getTargetLoweringInfo(); // Pair of floats / doubles used to pass the result. - Type *RetTy = StructType::get(ArgTy, ArgTy, nullptr); + Type *RetTy = StructType::get(ArgTy, ArgTy); auto &DL = DAG.getDataLayout(); ArgListTy Args; @@ -13115,7 +13114,7 @@ SDValue ARMTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); - Type *RetTy = (Type*)StructType::get(Ty, Ty, nullptr); + Type *RetTy = StructType::get(Ty, Ty); if (Subtarget->isTargetWindows()) InChain = WinDBZCheckDenominator(DAG, Op.getNode(), InChain); @@ -13417,9 +13416,9 @@ Instruction* ARMTargetLowering::makeDMB(IRBuilder<> &Builder, } // Based on http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html -Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { +Instruction *ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: @@ -13428,7 +13427,7 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, case AtomicOrdering::Acquire: return nullptr; // Nothing to do case AtomicOrdering::SequentiallyConsistent: - if (!IsStore) + if (!Inst->hasAtomicStore()) return nullptr; // Nothing to do /*FALLTHROUGH*/ case AtomicOrdering::Release: @@ -13442,9 +13441,9 @@ Instruction* ARMTargetLowering::emitLeadingFence(IRBuilder<> &Builder, llvm_unreachable("Unknown fence ordering in emitLeadingFence"); } -Instruction* ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { +Instruction *ARMTargetLowering::emitTrailingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { switch (Ord) { case AtomicOrdering::NotAtomic: case AtomicOrdering::Unordered: diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index 08c51b66dfe7..875c06210ae6 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -483,10 +483,10 @@ class InstrItineraryData; void emitAtomicCmpXchgNoStoreLLBalance(IRBuilder<> &Builder) const override; - Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; - Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; + Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const 
override; + Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; unsigned getMaxSupportedInterleaveFactor() const override { return 4; } diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index a94d6048f02d..d06b7d0896f1 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -16,7 +16,8 @@ // // Type profiles. -def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_ARMCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_ARMCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; def SDT_ARMStructByVal : SDTypeProfile<0, 4, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, @@ -1968,8 +1969,8 @@ PseudoInst<(outs), (ins i32imm:$amt1, i32imm:$amt2, pred:$p), NoItinerary, [(ARMcallseq_end timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKDOWN : -PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary, - [(ARMcallseq_start timm:$amt)]>; +PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2, pred:$p), NoItinerary, + [(ARMcallseq_start timm:$amt, timm:$amt2)]>; } def HINT : AI<(outs), (ins imm0_239:$imm), MiscFrm, NoItinerary, diff --git a/lib/Target/ARM/ARMInstrThumb.td b/lib/Target/ARM/ARMInstrThumb.td index 8048c758e998..bee83dfb6f63 100644 --- a/lib/Target/ARM/ARMInstrThumb.td +++ b/lib/Target/ARM/ARMInstrThumb.td @@ -284,8 +284,8 @@ def tADJCALLSTACKUP : Requires<[IsThumb, IsThumb1Only]>; def tADJCALLSTACKDOWN : - PseudoInst<(outs), (ins i32imm:$amt), NoItinerary, - [(ARMcallseq_start imm:$amt)]>, + PseudoInst<(outs), (ins i32imm:$amt, i32imm:$amt2), NoItinerary, + [(ARMcallseq_start imm:$amt, imm:$amt2)]>, Requires<[IsThumb, IsThumb1Only]>; } diff --git a/lib/Target/ARM/ARMInstructionSelector.cpp b/lib/Target/ARM/ARMInstructionSelector.cpp index 2ac3fda9f448..8c680cdf9b47 100644 --- a/lib/Target/ARM/ARMInstructionSelector.cpp +++ b/lib/Target/ARM/ARMInstructionSelector.cpp @@ -101,14 +101,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, assert(RegBank && "Can't get reg bank for virtual register"); const unsigned DstSize = MRI.getType(DstReg).getSizeInBits(); - (void)DstSize; - unsigned SrcReg = I.getOperand(1).getReg(); - const unsigned SrcSize = RBI.getSizeInBits(SrcReg, MRI, TRI); - (void)SrcSize; - // We use copies for trunc, so it's ok for the size of the destination to be - // smaller (the higher bits will just be undefined). 
- assert(DstSize <= SrcSize && "Copy with different width?!"); - assert((RegBank->getID() == ARM::GPRRegBankID || RegBank->getID() == ARM::FPRRegBankID) && "Unsupported reg bank"); @@ -135,28 +127,6 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII, return true; } -static bool selectFAdd(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII, - MachineRegisterInfo &MRI) { - assert(TII.getSubtarget().hasVFP2() && "Can't select fp add without vfp"); - - LLT Ty = MRI.getType(MIB->getOperand(0).getReg()); - unsigned ValSize = Ty.getSizeInBits(); - - if (ValSize == 32) { - if (TII.getSubtarget().useNEONForSinglePrecisionFP()) - return false; - MIB->setDesc(TII.get(ARM::VADDS)); - } else { - assert(ValSize == 64 && "Unsupported size for floating point value"); - if (TII.getSubtarget().isFPOnlySP()) - return false; - MIB->setDesc(TII.get(ARM::VADDD)); - } - MIB.add(predOps(ARMCC::AL)); - - return true; -} - static bool selectSequence(MachineInstrBuilder &MIB, const ARMBaseInstrInfo &TII, MachineRegisterInfo &MRI, @@ -352,6 +322,7 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } break; } + case G_ANYEXT: case G_TRUNC: { // The high bits are undefined, so there's nothing special to do, just // treat it as a copy. @@ -362,12 +333,12 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { const auto &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI); if (SrcRegBank.getID() != DstRegBank.getID()) { - DEBUG(dbgs() << "G_TRUNC operands on different register banks\n"); + DEBUG(dbgs() << "G_TRUNC/G_ANYEXT operands on different register banks\n"); return false; } if (SrcRegBank.getID() != ARM::GPRRegBankID) { - DEBUG(dbgs() << "G_TRUNC on non-GPR not supported yet\n"); + DEBUG(dbgs() << "G_TRUNC/G_ANYEXT on non-GPR not supported yet\n"); return false; } @@ -393,10 +364,6 @@ bool ARMInstructionSelector::select(MachineInstr &I) const { } MIB.add(predOps(ARMCC::AL)).add(condCodeOp()); break; - case G_FADD: - if (!selectFAdd(MIB, TII, MRI)) - return false; - break; case G_FRAME_INDEX: // Add 0 to the given frame index and hope it will eventually be folded into // the user(s). 
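Selecting G_TRUNC and G_ANYEXT as plain copies is sound because, in both directions, the bits beyond the narrow width are simply undefined; any masking happens at the point of use, not at the copy. A standalone sketch of that invariant (plain C++, not the GlobalISel API):

#include <cstdint>
#include <cstdio>

// A 32-bit storage unit standing in for a GPR; narrower values live in the
// low bits and the high bits are unspecified.
struct Reg32 { uint32_t Bits; };

// Both G_TRUNC (wide -> narrow) and G_ANYEXT (narrow -> wide) can be modeled
// as bit-identical copies, since neither promises anything about high bits.
Reg32 copyBits(Reg32 Src) { return Src; }

// Consumers that need a defined 8-bit value mask at the point of use.
uint8_t useAsU8(Reg32 R) { return static_cast<uint8_t>(R.Bits & 0xffu); }

int main() {
  Reg32 Wide{0x12345678u};
  Reg32 Narrow = copyBits(Wide);      // "trunc": no instruction needed
  Reg32 WideAgain = copyBits(Narrow); // "anyext": high bits stay undefined
  std::printf("%#x\n", static_cast<unsigned>(useAsU8(WideAgain))); // 0x78
  return 0;
}
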
diff --git a/lib/Target/ARM/ARMLegalizerInfo.cpp b/lib/Target/ARM/ARMLegalizerInfo.cpp index 9b86030fdd29..5bf6c7aed6b8 100644 --- a/lib/Target/ARM/ARMLegalizerInfo.cpp +++ b/lib/Target/ARM/ARMLegalizerInfo.cpp @@ -45,9 +45,11 @@ ARMLegalizerInfo::ARMLegalizerInfo(const ARMSubtarget &ST) { setAction({Op, 1, p0}, Legal); } - for (unsigned Op : {G_ADD, G_SUB, G_MUL}) - for (auto Ty : {s1, s8, s16, s32}) - setAction({Op, Ty}, Legal); + for (unsigned Op : {G_ADD, G_SUB, G_MUL}) { + for (auto Ty : {s1, s8, s16}) + setAction({Op, Ty}, WidenScalar); + setAction({Op, s32}, Legal); + } for (unsigned Op : {G_SDIV, G_UDIV}) { for (auto Ty : {s8, s16}) diff --git a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp index 581d5fe159fd..7e4d598a6e0b 100644 --- a/lib/Target/ARM/ARMOptimizeBarriersPass.cpp +++ b/lib/Target/ARM/ARMOptimizeBarriersPass.cpp @@ -88,13 +88,15 @@ bool ARMOptimizeBarriersPass::runOnMachineFunction(MachineFunction &MF) { } } } + bool Changed = false; // Remove the tagged DMB for (auto MI : ToRemove) { MI->eraseFromParent(); ++NumDMBsRemoved; + Changed = true; } - return NumDMBsRemoved > 0; + return Changed; } /// createARMOptimizeBarriersPass - Returns an instance of the remove double diff --git a/lib/Target/ARM/ARMRegisterBankInfo.cpp b/lib/Target/ARM/ARMRegisterBankInfo.cpp index 13a32211f88c..a20997c95cd9 100644 --- a/lib/Target/ARM/ARMRegisterBankInfo.cpp +++ b/lib/Target/ARM/ARMRegisterBankInfo.cpp @@ -225,6 +225,7 @@ ARMRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const { case G_UDIV: case G_SEXT: case G_ZEXT: + case G_ANYEXT: case G_TRUNC: case G_GEP: // FIXME: We're abusing the fact that everything lives in a GPR for now; in diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index d09f3ecbaa28..5583d6148b08 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -13,7 +13,9 @@ #include "ARM.h" #include "ARMCallLowering.h" #include "ARMLegalizerInfo.h" +#ifdef LLVM_BUILD_GLOBAL_ISEL #include "ARMRegisterBankInfo.h" +#endif #include "ARMSubtarget.h" #include "ARMTargetMachine.h" #include "ARMTargetObjectFile.h" diff --git a/lib/Target/AVR/AVRFrameLowering.cpp b/lib/Target/AVR/AVRFrameLowering.cpp index c297865db820..0ec8e8b08ceb 100644 --- a/lib/Target/AVR/AVRFrameLowering.cpp +++ b/lib/Target/AVR/AVRFrameLowering.cpp @@ -375,7 +375,7 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr( DebugLoc DL = MI->getDebugLoc(); unsigned int Opcode = MI->getOpcode(); - int Amount = MI->getOperand(0).getImm(); + int Amount = TII.getFrameSize(*MI); // Adjcallstackup does not need to allocate stack space for the call, instead // we insert push instructions that will allocate the necessary stack. diff --git a/lib/Target/AVR/AVRISelLowering.cpp b/lib/Target/AVR/AVRISelLowering.cpp index f0ab6acedad1..ef9c00e4b784 100644 --- a/lib/Target/AVR/AVRISelLowering.cpp +++ b/lib/Target/AVR/AVRISelLowering.cpp @@ -361,7 +361,7 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const { SDValue Callee = DAG.getExternalSymbol(getLibcallName(LC), getPointerTy(DAG.getDataLayout())); - Type *RetTy = (Type *)StructType::get(Ty, Ty, nullptr); + Type *RetTy = (Type *)StructType::get(Ty, Ty); SDLoc dl(Op); TargetLowering::CallLoweringInfo CLI(DAG); @@ -1166,8 +1166,7 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, // Get a count of how many bytes are to be pushed on the stack. 
unsigned NumBytes = CCInfo.getNextStackOffset(); - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, DL, true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); SmallVector, 8> RegsToPass; @@ -1611,8 +1610,9 @@ AVRTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *trueMBB = MF->CreateMachineBasicBlock(LLVM_BB); MachineBasicBlock *falseMBB = MF->CreateMachineBasicBlock(LLVM_BB); - MachineFunction::iterator I = MBB->getParent()->begin(); - ++I; + MachineFunction::iterator I; + for (I = MF->begin(); I != MF->end() && &(*I) != MBB; ++I); + if (I != MF->end()) ++I; MF->insert(I, trueMBB); MF->insert(I, falseMBB); diff --git a/lib/Target/AVR/AVRInstrInfo.td b/lib/Target/AVR/AVRInstrInfo.td index 1b6547ef7795..06ad2b3ffdf8 100644 --- a/lib/Target/AVR/AVRInstrInfo.td +++ b/lib/Target/AVR/AVRInstrInfo.td @@ -17,7 +17,7 @@ include "AVRInstrFormats.td" // AVR Type Profiles //===----------------------------------------------------------------------===// -def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>; +def SDT_AVRCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; def SDT_AVRCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>; def SDT_AVRCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; def SDT_AVRWrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>; @@ -333,9 +333,9 @@ let Defs = [SP, SREG], Uses = [SP] in { def ADJCALLSTACKDOWN : Pseudo<(outs), - (ins i16imm:$amt), + (ins i16imm:$amt, i16imm:$amt2), "#ADJCALLSTACKDOWN", - [(AVRcallseq_start timm:$amt)]>; + [(AVRcallseq_start timm:$amt, timm:$amt2)]>; // R31R30 is used to update SP, since it is a scratch reg and this instruction // is placed after the function call then R31R30 should be always free. diff --git a/lib/Target/AVR/AVRRegisterInfo.cpp b/lib/Target/AVR/AVRRegisterInfo.cpp index 2813e24d2ac7..11a47bad78ba 100644 --- a/lib/Target/AVR/AVRRegisterInfo.cpp +++ b/lib/Target/AVR/AVRRegisterInfo.cpp @@ -52,7 +52,6 @@ AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF, BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const { BitVector Reserved(getNumRegs()); const AVRTargetMachine &TM = static_cast(MF.getTarget()); - const TargetFrameLowering *TFI = TM.getSubtargetImpl()->getFrameLowering(); // Reserve the intermediate result registers r1 and r2 // The result of instructions like 'mul' is always stored here. diff --git a/lib/Target/BPF/BPFISelLowering.cpp b/lib/Target/BPF/BPFISelLowering.cpp index b9b3dff95c0a..6897161c903c 100644 --- a/lib/Target/BPF/BPFISelLowering.cpp +++ b/lib/Target/BPF/BPFISelLowering.cpp @@ -257,8 +257,7 @@ SDValue BPFTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } auto PtrVT = getPointerTy(MF.getDataLayout()); - Chain = DAG.getCALLSEQ_START( - Chain, DAG.getConstant(NumBytes, CLI.DL, PtrVT, true), CLI.DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, CLI.DL); SmallVector, MaxArgs> RegsToPass; diff --git a/lib/Target/BPF/BPFInstrInfo.td b/lib/Target/BPF/BPFInstrInfo.td index 93ee24371c4d..c6c0ff587c6b 100644 --- a/lib/Target/BPF/BPFInstrInfo.td +++ b/lib/Target/BPF/BPFInstrInfo.td @@ -16,7 +16,8 @@ include "BPFInstrFormats.td" // Instruction Operands and Patterns // These are target-independent nodes, but have target-specific formats. 
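The same mechanical change repeats for every target in this patch: `callseq_start` (and the ADJCALLSTACKDOWN pseudo that carries it) grows a second immediate to match `callseq_end`, and every call site updated here passes 0 for it, e.g. `DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL)`. A minimal sketch of the new node shape, with illustrative types rather than LLVM's:

#include <cstdint>
#include <cstdio>

// Illustrative stand-in for the two immediates now carried by
// CALLSEQ_START / ADJCALLSTACKDOWN. All targets touched in this patch pass
// 0 for the second operand, mirroring CALLSEQ_END's two-immediate form.
struct CallSeqStart {
  uint64_t Amt1; // bytes of outgoing-argument stack to set up
  uint64_t Amt2; // new operand; 0 at every call site updated here
};

int main() {
  CallSeqStart Down{16, 0};
  std::printf("#ADJCALLSTACKDOWN %llu %llu\n",
              static_cast<unsigned long long>(Down.Amt1),
              static_cast<unsigned long long>(Down.Amt2));
  return 0;
}
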
-def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>; +def SDT_BPFCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>, + SDTCisVT<1, iPTR>]>; def SDT_BPFCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>; def SDT_BPFCall : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>; def SDT_BPFSetFlag : SDTypeProfile<0, 3, [SDTCisSameAs<0, 1>]>; @@ -445,9 +446,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot=0, isBarrier = 1, // ADJCALLSTACKDOWN/UP pseudo insns let Defs = [R11], Uses = [R11] in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt), - "#ADJCALLSTACKDOWN $amt", - [(BPFcallseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), + "#ADJCALLSTACKDOWN $amt1 $amt2", + [(BPFcallseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2", [(BPFcallseq_end timm:$amt1, timm:$amt2)]>; diff --git a/lib/Target/Hexagon/HexagonISelLowering.cpp b/lib/Target/Hexagon/HexagonISelLowering.cpp index 861af94f1e38..1dffebe97f2d 100644 --- a/lib/Target/Hexagon/HexagonISelLowering.cpp +++ b/lib/Target/Hexagon/HexagonISelLowering.cpp @@ -848,8 +848,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue Glue; if (!IsTailCall) { - SDValue C = DAG.getConstant(NumBytes, dl, PtrVT, true); - Chain = DAG.getCALLSEQ_START(Chain, C, dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); Glue = Chain.getValue(1); } diff --git a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp index 5a5799dbe009..e4df7ff5c200 100644 --- a/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp +++ b/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp @@ -1209,7 +1209,7 @@ bool PolynomialMultiplyRecognize::highBitsAreZero(Value *V, KnownBits Known(T->getBitWidth()); computeKnownBits(V, Known, DL); - return Known.Zero.countLeadingOnes() >= IterCount; + return Known.countMinLeadingZeros() >= IterCount; } diff --git a/lib/Target/Hexagon/HexagonPatterns.td b/lib/Target/Hexagon/HexagonPatterns.td index 32503d111c24..81b5e10c1173 100644 --- a/lib/Target/Hexagon/HexagonPatterns.td +++ b/lib/Target/Hexagon/HexagonPatterns.td @@ -714,7 +714,8 @@ def: Pat<(i1 0), (PS_false)>; def: Pat<(i1 1), (PS_true)>; // Pseudo instructions. 
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; @@ -732,8 +733,8 @@ def HexagonTCRet : SDNode<"HexagonISD::TC_RETURN", SDT_SPCall, [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>; -def: Pat<(callseq_start timm:$amt), - (ADJCALLSTACKDOWN imm:$amt)>; +def: Pat<(callseq_start timm:$amt, timm:$amt2), + (ADJCALLSTACKDOWN imm:$amt, imm:$amt2)>; def: Pat<(callseq_end timm:$amt1, timm:$amt2), (ADJCALLSTACKUP imm:$amt1, imm:$amt2)>; diff --git a/lib/Target/Hexagon/HexagonPseudo.td b/lib/Target/Hexagon/HexagonPseudo.td index 8c2caea2d5c5..0f99dfe342b8 100644 --- a/lib/Target/Hexagon/HexagonPseudo.td +++ b/lib/Target/Hexagon/HexagonPseudo.td @@ -80,7 +80,7 @@ def PS_false : InstHexagon<(outs PredRegs:$dst), (ins), "", [(set I1:$dst, 0)], "", C2_andn.Itinerary, TypeCR>; let Defs = [R29, R30], Uses = [R31, R30, R29], isPseudo = 1 in -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), ".error \"should not emit\" ", []>; let Defs = [R29, R30, R31], Uses = [R29], isPseudo = 1 in diff --git a/lib/Target/Lanai/LanaiISelLowering.cpp b/lib/Target/Lanai/LanaiISelLowering.cpp index d156294a0b0c..0a9cac2565f2 100644 --- a/lib/Target/Lanai/LanaiISelLowering.cpp +++ b/lib/Target/Lanai/LanaiISelLowering.cpp @@ -11,9 +11,9 @@ // //===----------------------------------------------------------------------===// +#include "LanaiISelLowering.h" #include "Lanai.h" #include "LanaiCondCode.h" -#include "LanaiISelLowering.h" #include "LanaiMachineFunctionInfo.h" #include "LanaiSubtarget.h" #include "LanaiTargetObjectFile.h" @@ -38,10 +38,11 @@ #include "llvm/IR/Function.h" #include "llvm/IR/GlobalValue.h" #include "llvm/Support/Casting.h" -#include "llvm/Support/CommandLine.h" #include "llvm/Support/CodeGen.h" +#include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetCallingConv.h" @@ -649,10 +650,7 @@ SDValue LanaiTargetLowering::LowerCCCCallTo( ByValArgs.push_back(FIPtr); } - Chain = DAG.getCALLSEQ_START( - Chain, - DAG.getConstant(NumBytes, DL, getPointerTy(DAG.getDataLayout()), true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); SmallVector, 4> RegsToPass; SmallVector MemOpChains; @@ -1502,3 +1500,24 @@ SDValue LanaiTargetLowering::PerformDAGCombine(SDNode *N, return SDValue(); } + +void LanaiTargetLowering::computeKnownBitsForTargetNode( + const SDValue Op, KnownBits &Known, const APInt &DemandedElts, + const SelectionDAG &DAG, unsigned Depth) const { + unsigned BitWidth = Known.getBitWidth(); + switch (Op.getOpcode()) { + default: + break; + case LanaiISD::SETCC: + Known = KnownBits(BitWidth); + Known.Zero.setBits(1, BitWidth); + break; + case LanaiISD::SELECT_CC: + KnownBits Known2; + DAG.computeKnownBits(Op->getOperand(0), Known, Depth + 1); + DAG.computeKnownBits(Op->getOperand(1), Known2, Depth + 1); + Known.Zero &= Known2.Zero; + Known.One &= Known2.One; + break; + } +} diff --git a/lib/Target/Lanai/LanaiISelLowering.h b/lib/Target/Lanai/LanaiISelLowering.h index c2fba4f9d167..49ad52a39771 100644 --- a/lib/Target/Lanai/LanaiISelLowering.h +++ b/lib/Target/Lanai/LanaiISelLowering.h @@ -106,6 +106,11 @@ public: SDValue PerformDAGCombine(SDNode *N, 
DAGCombinerInfo &DCI) const override; + void computeKnownBitsForTargetNode(const SDValue Op, KnownBits &Known, + const APInt &DemandedElts, + const SelectionDAG &DAG, + unsigned Depth = 0) const override; + private: SDValue LowerCCCCallTo(SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool IsVarArg, diff --git a/lib/Target/Lanai/LanaiInstrInfo.td b/lib/Target/Lanai/LanaiInstrInfo.td index 285fca11737d..776fee101dfe 100644 --- a/lib/Target/Lanai/LanaiInstrInfo.td +++ b/lib/Target/Lanai/LanaiInstrInfo.td @@ -22,7 +22,8 @@ include "LanaiInstrFormats.td" // -------------------------------------------------- // // These are target-independent nodes, but have target-specific formats. -def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_LanaiCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; def SDT_LanaiCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDT_LanaiCall : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>; @@ -750,9 +751,9 @@ let isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1, // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber SP. let Defs = [SP], Uses = [SP] in { - def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - "#ADJCALLSTACKDOWN $amt", - [(CallSeqStart timm:$amt)]>; + def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "#ADJCALLSTACKDOWN $amt1 $amt2", + [(CallSeqStart timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2", [(CallSeqEnd timm:$amt1, timm:$amt2)]>; @@ -770,9 +771,6 @@ let Uses = [SR] in { [(set (i32 GPR:$Rs1), (LanaiSetCC imm:$DDDI))]>; } -// SCC's output is already 1-bit so and'ing with 1 is redundant. -def : Pat<(and (LanaiSetCC imm:$DDDI), 1), (SCC imm:$DDDI)>; - // Select with hardware support let Uses = [SR], isSelect = 1 in { def SELECT : InstRR<0b111, (outs GPR:$Rd), diff --git a/lib/Target/MSP430/MSP430FrameLowering.cpp b/lib/Target/MSP430/MSP430FrameLowering.cpp index f1cb0b6c031b..b4ff8f66c55f 100644 --- a/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -236,7 +236,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr( // adjcallstackdown instruction into 'add SP, ' // TODO: consider using push / pop instead of sub + store / add MachineInstr &Old = *I; - uint64_t Amount = Old.getOperand(0).getImm(); + uint64_t Amount = TII.getFrameSize(Old); if (Amount != 0) { // We need to keep the stack aligned properly. To do this, we round the // amount of space needed for the outgoing arguments up to the next @@ -252,8 +252,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr( } else { assert(Old.getOpcode() == TII.getCallFrameDestroyOpcode()); // factor out the amount the callee already popped. - uint64_t CalleeAmt = Old.getOperand(1).getImm(); - Amount -= CalleeAmt; + Amount -= TII.getFramePoppedByCallee(Old); if (Amount) New = BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::ADD16ri), MSP430::SP) @@ -272,7 +271,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr( } else if (I->getOpcode() == TII.getCallFrameDestroyOpcode()) { // If we are performing frame pointer elimination and if the callee pops // something off the stack pointer, add it back. 
- if (uint64_t CalleeAmt = I->getOperand(1).getImm()) { + if (uint64_t CalleeAmt = TII.getFramePoppedByCallee(*I)) { MachineInstr &Old = *I; MachineInstr *New = BuildMI(MF, Old.getDebugLoc(), TII.get(MSP430::SUB16ri), MSP430::SP) diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index 40b1dd3cc2eb..cc6e64043f54 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -40,21 +40,24 @@ using namespace llvm; typedef enum { NoHWMult, - HWMultIntr, - HWMultNoIntr + HWMult16, + HWMult32, + HWMultF5 } HWMultUseMode; static cl::opt -HWMultMode("msp430-hwmult-mode", cl::Hidden, +HWMultMode("mhwmult", cl::Hidden, cl::desc("Hardware multiplier use mode"), - cl::init(HWMultNoIntr), + cl::init(NoHWMult), cl::values( - clEnumValN(NoHWMult, "no", + clEnumValN(NoHWMult, "none", "Do not use hardware multiplier"), - clEnumValN(HWMultIntr, "interrupts", - "Assume hardware multiplier can be used inside interrupts"), - clEnumValN(HWMultNoIntr, "use", - "Assume hardware multiplier cannot be used inside interrupts"))); + clEnumValN(HWMult16, "16bit", + "Use 16-bit hardware multiplier"), + clEnumValN(HWMult32, "32bit", + "Use 32-bit hardware multiplier"), + clEnumValN(HWMultF5, "f5series", + "Use F5 series hardware multiplier"))); MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, const MSP430Subtarget &STI) @@ -131,29 +134,29 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::i1, Expand); // FIXME: Implement efficiently multiplication by a constant - setOperationAction(ISD::MUL, MVT::i8, Expand); - setOperationAction(ISD::MULHS, MVT::i8, Expand); - setOperationAction(ISD::MULHU, MVT::i8, Expand); - setOperationAction(ISD::SMUL_LOHI, MVT::i8, Expand); - setOperationAction(ISD::UMUL_LOHI, MVT::i8, Expand); - setOperationAction(ISD::MUL, MVT::i16, Expand); + setOperationAction(ISD::MUL, MVT::i8, Promote); + setOperationAction(ISD::MULHS, MVT::i8, Promote); + setOperationAction(ISD::MULHU, MVT::i8, Promote); + setOperationAction(ISD::SMUL_LOHI, MVT::i8, Promote); + setOperationAction(ISD::UMUL_LOHI, MVT::i8, Promote); + setOperationAction(ISD::MUL, MVT::i16, LibCall); setOperationAction(ISD::MULHS, MVT::i16, Expand); setOperationAction(ISD::MULHU, MVT::i16, Expand); setOperationAction(ISD::SMUL_LOHI, MVT::i16, Expand); setOperationAction(ISD::UMUL_LOHI, MVT::i16, Expand); - setOperationAction(ISD::UDIV, MVT::i8, Expand); - setOperationAction(ISD::UDIVREM, MVT::i8, Expand); - setOperationAction(ISD::UREM, MVT::i8, Expand); - setOperationAction(ISD::SDIV, MVT::i8, Expand); - setOperationAction(ISD::SDIVREM, MVT::i8, Expand); - setOperationAction(ISD::SREM, MVT::i8, Expand); - setOperationAction(ISD::UDIV, MVT::i16, Expand); + setOperationAction(ISD::UDIV, MVT::i8, Promote); + setOperationAction(ISD::UDIVREM, MVT::i8, Promote); + setOperationAction(ISD::UREM, MVT::i8, Promote); + setOperationAction(ISD::SDIV, MVT::i8, Promote); + setOperationAction(ISD::SDIVREM, MVT::i8, Promote); + setOperationAction(ISD::SREM, MVT::i8, Promote); + setOperationAction(ISD::UDIV, MVT::i16, LibCall); setOperationAction(ISD::UDIVREM, MVT::i16, Expand); - setOperationAction(ISD::UREM, MVT::i16, Expand); - setOperationAction(ISD::SDIV, MVT::i16, Expand); + setOperationAction(ISD::UREM, MVT::i16, LibCall); + setOperationAction(ISD::SDIV, MVT::i16, LibCall); setOperationAction(ISD::SDIVREM, MVT::i16, Expand); - setOperationAction(ISD::SREM, MVT::i16, Expand); + 
setOperationAction(ISD::SREM, MVT::i16, LibCall); // varargs support setOperationAction(ISD::VASTART, MVT::Other, Custom); @@ -162,15 +165,183 @@ MSP430TargetLowering::MSP430TargetLowering(const TargetMachine &TM, setOperationAction(ISD::VACOPY, MVT::Other, Expand); setOperationAction(ISD::JumpTable, MVT::i16, Custom); - // Libcalls names. - if (HWMultMode == HWMultIntr) { - setLibcallName(RTLIB::MUL_I8, "__mulqi3hw"); - setLibcallName(RTLIB::MUL_I16, "__mulhi3hw"); - } else if (HWMultMode == HWMultNoIntr) { - setLibcallName(RTLIB::MUL_I8, "__mulqi3hw_noint"); - setLibcallName(RTLIB::MUL_I16, "__mulhi3hw_noint"); + // EABI Libcalls - EABI Section 6.2 + const struct { + const RTLIB::Libcall Op; + const char * const Name; + const ISD::CondCode Cond; + } LibraryCalls[] = { + // Floating point conversions - EABI Table 6 + { RTLIB::FPROUND_F64_F32, "__mspabi_cvtdf", ISD::SETCC_INVALID }, + { RTLIB::FPEXT_F32_F64, "__mspabi_cvtfd", ISD::SETCC_INVALID }, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOSINT_F64_I16, "__mspabi_fixdi", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F64_I32, "__mspabi_fixdli", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F64_I64, "__mspabi_fixdlli", ISD::SETCC_INVALID }, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOUINT_F64_I16, "__mspabi_fixdu", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I32, "__mspabi_fixdul", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F64_I64, "__mspabi_fixdull", ISD::SETCC_INVALID }, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOSINT_F32_I16, "__mspabi_fixfi", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I32, "__mspabi_fixfli", ISD::SETCC_INVALID }, + { RTLIB::FPTOSINT_F32_I64, "__mspabi_fixflli", ISD::SETCC_INVALID }, + // The following is NOT implemented in libgcc + //{ RTLIB::FPTOUINT_F32_I16, "__mspabi_fixfu", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I32, "__mspabi_fixful", ISD::SETCC_INVALID }, + { RTLIB::FPTOUINT_F32_I64, "__mspabi_fixfull", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc + //{ RTLIB::SINTTOFP_I16_F64, "__mspabi_fltid", ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F64, "__mspabi_fltlid", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc but is not in the EABI + { RTLIB::SINTTOFP_I64_F64, "__mspabi_fltllid", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc + //{ RTLIB::UINTTOFP_I16_F64, "__mspabi_fltud", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F64, "__mspabi_fltuld", ISD::SETCC_INVALID }, + // The following IS implemented in libgcc but is not in the EABI + { RTLIB::UINTTOFP_I64_F64, "__mspabi_fltulld", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc + //{ RTLIB::SINTTOFP_I16_F32, "__mspabi_fltif", ISD::SETCC_INVALID }, + { RTLIB::SINTTOFP_I32_F32, "__mspabi_fltlif", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc but is not in the EABI + { RTLIB::SINTTOFP_I64_F32, "__mspabi_fltllif", ISD::SETCC_INVALID }, + // TODO The following IS implemented in libgcc + //{ RTLIB::UINTTOFP_I16_F32, "__mspabi_fltuf", ISD::SETCC_INVALID }, + { RTLIB::UINTTOFP_I32_F32, "__mspabi_fltulf", ISD::SETCC_INVALID }, + // The following IS implemented in libgcc but is not in the EABI + { RTLIB::UINTTOFP_I64_F32, "__mspabi_fltullf", ISD::SETCC_INVALID }, + + // Floating point comparisons - EABI Table 7 + { RTLIB::OEQ_F64, "__mspabi_cmpd", ISD::SETEQ }, + { RTLIB::UNE_F64, "__mspabi_cmpd", ISD::SETNE }, + { RTLIB::OGE_F64, "__mspabi_cmpd", ISD::SETGE }, + { RTLIB::OLT_F64, 
"__mspabi_cmpd", ISD::SETLT }, + { RTLIB::OLE_F64, "__mspabi_cmpd", ISD::SETLE }, + { RTLIB::OGT_F64, "__mspabi_cmpd", ISD::SETGT }, + { RTLIB::OEQ_F32, "__mspabi_cmpf", ISD::SETEQ }, + { RTLIB::UNE_F32, "__mspabi_cmpf", ISD::SETNE }, + { RTLIB::OGE_F32, "__mspabi_cmpf", ISD::SETGE }, + { RTLIB::OLT_F32, "__mspabi_cmpf", ISD::SETLT }, + { RTLIB::OLE_F32, "__mspabi_cmpf", ISD::SETLE }, + { RTLIB::OGT_F32, "__mspabi_cmpf", ISD::SETGT }, + + // Floating point arithmetic - EABI Table 8 + { RTLIB::ADD_F64, "__mspabi_addd", ISD::SETCC_INVALID }, + { RTLIB::ADD_F32, "__mspabi_addf", ISD::SETCC_INVALID }, + { RTLIB::DIV_F64, "__mspabi_divd", ISD::SETCC_INVALID }, + { RTLIB::DIV_F32, "__mspabi_divf", ISD::SETCC_INVALID }, + { RTLIB::MUL_F64, "__mspabi_mpyd", ISD::SETCC_INVALID }, + { RTLIB::MUL_F32, "__mspabi_mpyf", ISD::SETCC_INVALID }, + { RTLIB::SUB_F64, "__mspabi_subd", ISD::SETCC_INVALID }, + { RTLIB::SUB_F32, "__mspabi_subf", ISD::SETCC_INVALID }, + // The following are NOT implemented in libgcc + // { RTLIB::NEG_F64, "__mspabi_negd", ISD::SETCC_INVALID }, + // { RTLIB::NEG_F32, "__mspabi_negf", ISD::SETCC_INVALID }, + + // TODO: SLL/SRA/SRL are in libgcc, RLL isn't + + // Universal Integer Operations - EABI Table 9 + { RTLIB::SDIV_I16, "__mspabi_divi", ISD::SETCC_INVALID }, + { RTLIB::SDIV_I32, "__mspabi_divli", ISD::SETCC_INVALID }, + { RTLIB::SDIV_I64, "__mspabi_divlli", ISD::SETCC_INVALID }, + { RTLIB::UDIV_I16, "__mspabi_divu", ISD::SETCC_INVALID }, + { RTLIB::UDIV_I32, "__mspabi_divul", ISD::SETCC_INVALID }, + { RTLIB::UDIV_I64, "__mspabi_divull", ISD::SETCC_INVALID }, + { RTLIB::SREM_I16, "__mspabi_remi", ISD::SETCC_INVALID }, + { RTLIB::SREM_I32, "__mspabi_remli", ISD::SETCC_INVALID }, + { RTLIB::SREM_I64, "__mspabi_remlli", ISD::SETCC_INVALID }, + { RTLIB::UREM_I16, "__mspabi_remu", ISD::SETCC_INVALID }, + { RTLIB::UREM_I32, "__mspabi_remul", ISD::SETCC_INVALID }, + { RTLIB::UREM_I64, "__mspabi_remull", ISD::SETCC_INVALID }, + + }; + + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + if (LC.Cond != ISD::SETCC_INVALID) + setCmpLibcallCC(LC.Op, LC.Cond); + } + + if (HWMultMode == HWMult16) { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + } LibraryCalls[] = { + // Integer Multiply - EABI Table 9 + { RTLIB::MUL_I16, "__mspabi_mpyi_hw" }, + { RTLIB::MUL_I32, "__mspabi_mpyl_hw" }, + { RTLIB::MUL_I64, "__mspabi_mpyll_hw" }, + // TODO The __mspabi_mpysl*_hw functions ARE implemented in libgcc + // TODO The __mspabi_mpyul*_hw functions ARE implemented in libgcc + }; + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + } + } else if (HWMultMode == HWMult32) { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + } LibraryCalls[] = { + // Integer Multiply - EABI Table 9 + { RTLIB::MUL_I16, "__mspabi_mpyi_hw" }, + { RTLIB::MUL_I32, "__mspabi_mpyl_hw32" }, + { RTLIB::MUL_I64, "__mspabi_mpyll_hw32" }, + // TODO The __mspabi_mpysl*_hw32 functions ARE implemented in libgcc + // TODO The __mspabi_mpyul*_hw32 functions ARE implemented in libgcc + }; + for (const auto &LC : LibraryCalls) { + setLibcallName(LC.Op, LC.Name); + } + } else if (HWMultMode == HWMultF5) { + const struct { + const RTLIB::Libcall Op; + const char * const Name; + } LibraryCalls[] = { + // Integer Multiply - EABI Table 9 + { RTLIB::MUL_I16, "__mspabi_mpyi_f5hw" }, + { RTLIB::MUL_I32, "__mspabi_mpyl_f5hw" }, + { RTLIB::MUL_I64, "__mspabi_mpyll_f5hw" }, + // TODO The __mspabi_mpysl*_f5hw functions ARE implemented in libgcc + // TODO The 
__mspabi_mpyul*_f5hw functions ARE implemented in libgcc
+    };
+    for (const auto &LC : LibraryCalls) {
+      setLibcallName(LC.Op, LC.Name);
+    }
+  } else { // NoHWMult
+    const struct {
+      const RTLIB::Libcall Op;
+      const char * const Name;
+    } LibraryCalls[] = {
+      // Integer Multiply - EABI Table 9
+      { RTLIB::MUL_I16, "__mspabi_mpyi" },
+      { RTLIB::MUL_I32, "__mspabi_mpyl" },
+      { RTLIB::MUL_I64, "__mspabi_mpyll" },
+      // The __mspabi_mpysl* functions are NOT implemented in libgcc
+      // The __mspabi_mpyul* functions are NOT implemented in libgcc
+    };
+    for (const auto &LC : LibraryCalls) {
+      setLibcallName(LC.Op, LC.Name);
+    }
+    setLibcallCallingConv(RTLIB::MUL_I64, CallingConv::MSP430_BUILTIN);
   }
 
+  // Several of the runtime library functions use a special calling conv
+  setLibcallCallingConv(RTLIB::UDIV_I64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::UREM_I64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::SDIV_I64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::SREM_I64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::ADD_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::SUB_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::MUL_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::DIV_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::OEQ_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::UNE_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::OGE_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::OLT_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::OLE_F64, CallingConv::MSP430_BUILTIN);
+  setLibcallCallingConv(RTLIB::OGT_F64, CallingConv::MSP430_BUILTIN);
+  // TODO: __mspabi_srall, __mspabi_srlll, __mspabi_sllll
+
   setMinFunctionAlignment(1);
   setPrefFunctionAlignment(2);
 }
@@ -281,10 +452,27 @@ template <typename ArgT>
 static void AnalyzeArguments(CCState &State,
                              SmallVectorImpl<CCValAssign> &ArgLocs,
                              const SmallVectorImpl<ArgT> &Args) {
-  static const MCPhysReg RegList[] = {
+  static const MCPhysReg CRegList[] = {
     MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
   };
-  static const unsigned NbRegs = array_lengthof(RegList);
+  static const unsigned CNbRegs = array_lengthof(CRegList);
+  static const MCPhysReg BuiltinRegList[] = {
+    MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
+    MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
+  };
+  static const unsigned BuiltinNbRegs = array_lengthof(BuiltinRegList);
+
+  ArrayRef<MCPhysReg> RegList;
+  unsigned NbRegs;
+
+  bool Builtin = (State.getCallingConv() == CallingConv::MSP430_BUILTIN);
+  if (Builtin) {
+    RegList = BuiltinRegList;
+    NbRegs = BuiltinNbRegs;
+  } else {
+    RegList = CRegList;
+    NbRegs = CNbRegs;
+  }
 
   if (State.isVarArg()) {
     AnalyzeVarArgs(State, Args);
@@ -294,6 +482,11 @@ static void AnalyzeArguments(CCState &State,
   SmallVector<unsigned, 4> ArgsParts;
   ParseFunctionArgs(Args, ArgsParts);
 
+  if (Builtin) {
+    assert(ArgsParts.size() == 2 &&
+           "Builtin calling convention requires two arguments");
+  }
+
   unsigned RegsLeft = NbRegs;
   bool UsedStack = false;
   unsigned ValNo = 0;
@@ -323,6 +516,11 @@ static void AnalyzeArguments(CCState &State,
 
     unsigned Parts = ArgsParts[i];
 
+    if (Builtin) {
+      assert(Parts == 4 &&
+             "Builtin calling convention requires 64-bit arguments");
+    }
+
     if (!UsedStack && Parts == 2 && RegsLeft == 1) {
       // Special case for 32-bit register split, see EABI section 3.3.3
       unsigned Reg = State.AllocateReg(RegList);
@@ -400,6 +598,7 @@ MSP430TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
   switch (CallConv) {
   default:
     llvm_unreachable("Unsupported calling convention");
+  case CallingConv::MSP430_BUILTIN:
   case CallingConv::Fast:
   case CallingConv::C:
     return LowerCCCCallTo(Chain, Callee, CallConv, isVarArg, isTailCall,
@@ -598,7 +797,6 @@ MSP430TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
 
 /// LowerCCCCallTo - functions arguments are copied from virtual regs to
 /// (physical regs)/(stack frame), CALLSEQ_START and CALLSEQ_END are emitted.
-// TODO: sret.
 SDValue MSP430TargetLowering::LowerCCCCallTo(
     SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
     bool isTailCall, const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -615,8 +813,7 @@ SDValue MSP430TargetLowering::LowerCCCCallTo(
   unsigned NumBytes = CCInfo.getNextStackOffset();
   auto PtrVT = getPointerTy(DAG.getDataLayout());
 
-  Chain = DAG.getCALLSEQ_START(Chain,
-                               DAG.getConstant(NumBytes, dl, PtrVT, true), dl);
+  Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);
 
   SmallVector<std::pair<unsigned, SDValue>, 4> RegsToPass;
   SmallVector MemOpChains;
diff --git a/lib/Target/MSP430/MSP430InstrInfo.h b/lib/Target/MSP430/MSP430InstrInfo.h
index e3259bd6a7bc..d81f17e753c5 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/lib/Target/MSP430/MSP430InstrInfo.h
@@ -85,6 +85,12 @@ public:
                         MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
                         const DebugLoc &DL,
                         int *BytesAdded = nullptr) const override;
+
+  int64_t getFramePoppedByCallee(const MachineInstr &I) const {
+    assert(isFrameInstr(I) && "Not a frame instruction");
+    assert(I.getOperand(1).getImm() >= 0 && "Size must not be negative");
+    return I.getOperand(1).getImm();
+  }
 };
 
 }
diff --git a/lib/Target/MSP430/MSP430InstrInfo.td b/lib/Target/MSP430/MSP430InstrInfo.td
index 22fc2474fae6..1cd18611e52c 100644
--- a/lib/Target/MSP430/MSP430InstrInfo.td
+++ b/lib/Target/MSP430/MSP430InstrInfo.td
@@ -23,7 +23,8 @@ class SDTCisI16<int OpNum> : SDTCisVT<OpNum, i16>;
 // Type Profiles.
 //===----------------------------------------------------------------------===//
 def SDT_MSP430Call : SDTypeProfile<0, -1, [SDTCisVT<0, iPTR>]>;
-def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>]>;
+def SDT_MSP430CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i16>,
+                                             SDTCisVT<1, i16>]>;
 def SDT_MSP430CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
 def SDT_MSP430Wrapper : SDTypeProfile<1, 1, [SDTCisSameAs<0, 1>, SDTCisPtrTy<0>]>;
@@ -113,9 +114,9 @@ def and_su : PatFrag<(ops node:$lhs, node:$rhs), (and node:$lhs, node:$rhs), [{
 // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 // sub / add which can clobber SR.
 let Defs = [SP, SR], Uses = [SP] in {
-def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt),
+def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
                               "#ADJCALLSTACKDOWN",
-                              [(MSP430callseq_start timm:$amt)]>;
+                              [(MSP430callseq_start timm:$amt1, timm:$amt2)]>;
 def ADJCALLSTACKUP : Pseudo<(outs), (ins i16imm:$amt1, i16imm:$amt2),
                             "#ADJCALLSTACKUP",
                             [(MSP430callseq_end timm:$amt1, timm:$amt2)]>;
@@ -209,7 +210,7 @@ let isCall = 1 in
   // a use to prevent stack-pointer assignments that appear immediately
   // before calls from potentially appearing dead. Uses for argument
   // registers are added manually.
- let Defs = [R12, R13, R14, R15, SR], + let Defs = [R11, R12, R13, R14, R15, SR], Uses = [SP] in { def CALLi : II16i<0x0, (outs), (ins i16imm:$dst), diff --git a/lib/Target/MSP430/MSP430RegisterInfo.cpp b/lib/Target/MSP430/MSP430RegisterInfo.cpp index 81cd9d1ad3f8..9600bc28f100 100644 --- a/lib/Target/MSP430/MSP430RegisterInfo.cpp +++ b/lib/Target/MSP430/MSP430RegisterInfo.cpp @@ -41,12 +41,12 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { const Function* F = MF->getFunction(); static const MCPhysReg CalleeSavedRegs[] = { MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7, - MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11, + MSP430::R8, MSP430::R9, MSP430::R10, 0 }; static const MCPhysReg CalleeSavedRegsFP[] = { MSP430::R5, MSP430::R6, MSP430::R7, - MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11, + MSP430::R8, MSP430::R9, MSP430::R10, 0 }; static const MCPhysReg CalleeSavedRegsIntr[] = { diff --git a/lib/Target/Mips/MipsFastISel.cpp b/lib/Target/Mips/MipsFastISel.cpp index 21c99da0922d..b83f44a74d5b 100644 --- a/lib/Target/Mips/MipsFastISel.cpp +++ b/lib/Target/Mips/MipsFastISel.cpp @@ -1133,7 +1133,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI, if (NumBytes < 16) NumBytes = 16; - emitInst(Mips::ADJCALLSTACKDOWN).addImm(16); + emitInst(Mips::ADJCALLSTACKDOWN).addImm(16).addImm(0); // Process the args. MVT firstMVT; for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) { diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index 8f39ebd42a5c..78bae6954c3c 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -2787,7 +2787,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, SDValue NextStackOffsetVal = DAG.getIntPtrConstant(NextStackOffset, DL, true); if (!IsTailCall) - Chain = DAG.getCALLSEQ_START(Chain, NextStackOffsetVal, DL); + Chain = DAG.getCALLSEQ_START(Chain, NextStackOffset, 0, DL); SDValue StackPtr = DAG.getCopyFromReg(Chain, DL, ABI.IsN64() ? 
Mips::SP_64 : Mips::SP, diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index b90077d7807d..8761946b8dbb 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -21,7 +21,7 @@ def SDT_MipsCMov : SDTypeProfile<1, 4, [SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>, SDTCisSameAs<3, 4>, SDTCisInt<4>]>; -def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_MipsCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDT_MipsCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def SDT_MFLOHI : SDTypeProfile<1, 1, [SDTCisInt<0>, SDTCisVT<1, untyped>]>; def SDT_MTLOHI : SDTypeProfile<1, 2, [SDTCisVT<0, untyped>, @@ -1719,8 +1719,8 @@ let isReturn=1, isTerminator=1, isBarrier=1, hasCtrlDep=1, isCTI=1 in { } let Defs = [SP], Uses = [SP], hasSideEffects = 1 in { -def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt), - [(callseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + [(callseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : MipsPseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), [(callseq_end timm:$amt1, timm:$amt2)]>; } diff --git a/lib/Target/Mips/MipsOptimizePICCall.cpp b/lib/Target/Mips/MipsOptimizePICCall.cpp index 68dcbdfb4211..f8d9c34556bc 100644 --- a/lib/Target/Mips/MipsOptimizePICCall.cpp +++ b/lib/Target/Mips/MipsOptimizePICCall.cpp @@ -257,7 +257,7 @@ bool OptimizePICCall::isCallViaRegister(MachineInstr &MI, unsigned &Reg, // Get the instruction that loads the function address from the GOT. Reg = MO->getReg(); - Val = (Value*)nullptr; + Val = nullptr; MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo(); MachineInstr *DefMI = MRI.getVRegDef(Reg); diff --git a/lib/Target/NVPTX/NVPTXISelLowering.cpp b/lib/Target/NVPTX/NVPTXISelLowering.cpp index 61fdda8aa109..ebaaf42bc64e 100644 --- a/lib/Target/NVPTX/NVPTXISelLowering.cpp +++ b/lib/Target/NVPTX/NVPTXISelLowering.cpp @@ -1430,8 +1430,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, return Chain; SDValue tempChain = Chain; - Chain = DAG.getCALLSEQ_START( - Chain, DAG.getIntPtrConstant(uniqueCallSite, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, uniqueCallSite, 0, dl); SDValue InFlag = Chain.getValue(1); unsigned paramCount = 0; @@ -1549,7 +1548,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, Chain = DAG.getMemIntrinsicNode( Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands, - TheStoreType, MachinePointerInfo(), EltAlign); + TheStoreType, MachinePointerInfo(), EltAlign, + /* Volatile */ false, /* ReadMem */ false, + /* WriteMem */ true, /* Size */ 0); InFlag = Chain.getValue(1); // Cleanup. 
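
Every backend touched by this patch makes the same CALLSEQ_START conversion: the node and its ADJCALLSTACKDOWN pseudo now carry two immediate operands instead of one, and getCALLSEQ_START takes the byte counts as plain integers rather than a wrapped target constant. Every call site in this patch passes 0 for the new second immediate. A minimal sketch of the conversion, using only calls that appear in the surrounding hunks (Chain, NumBytes, and dl as in any LowerCall implementation):

    // Old form: the byte count had to be wrapped in a target constant.
    // Chain = DAG.getCALLSEQ_START(
    //     Chain, DAG.getIntPtrConstant(NumBytes, dl, /*isTarget=*/true), dl);

    // New form: pass the count directly, plus the new second immediate,
    // which is 0 at every call site in this patch.
    Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl);

A target that pattern-matches callseq_start in its .td files has to widen the pseudo at the same time, as the MipsInstrInfo.td hunk above and the NVPTXInstrInfo.td hunk below both do; otherwise the two-operand SDNode no longer matches the one-operand pattern.
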
@@ -1609,7 +1610,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, theVal, InFlag }; Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype, - MachinePointerInfo()); + MachinePointerInfo(), /* Align */ 0, + /* Volatile */ false, /* ReadMem */ false, + /* WriteMem */ true, /* Size */ 0); InFlag = Chain.getValue(1); } @@ -1795,7 +1798,8 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, DAG.getConstant(Offsets[VecIdx], dl, MVT::i32), InFlag}; SDValue RetVal = DAG.getMemIntrinsicNode( Op, dl, DAG.getVTList(LoadVTs), LoadOperands, TheLoadType, - MachinePointerInfo(), EltAlign); + MachinePointerInfo(), EltAlign, /* Volatile */ false, + /* ReadMem */ true, /* WriteMem */ false, /* Size */ 0); for (unsigned j = 0; j < NumElts; ++j) { SDValue Ret = RetVal.getValue(j); @@ -2579,7 +2583,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i]; Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType, - MachinePointerInfo(), 1); + MachinePointerInfo(), /* Align */ 1, + /* Volatile */ false, /* ReadMem */ false, + /* WriteMem */ true, /* Size */ 0); // Cleanup vector state. StoreOperands.clear(); } diff --git a/lib/Target/NVPTX/NVPTXInstrInfo.td b/lib/Target/NVPTX/NVPTXInstrInfo.td index 9378b29a9d0e..b5b5ea1ed639 100644 --- a/lib/Target/NVPTX/NVPTXInstrInfo.td +++ b/lib/Target/NVPTX/NVPTXInstrInfo.td @@ -3101,7 +3101,8 @@ def : Pat<(brcond (i1 (setne Int1Regs:$a, -1)), bb:$target), (CBranchOther Int1Regs:$a, bb:$target)>; // Call -def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_NVPTXCallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; def SDT_NVPTXCallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_NVPTXCallSeqStart, @@ -3126,10 +3127,10 @@ class Pseudo pattern> : NVPTXInst; def Callseq_Start : - NVPTXInst<(outs), (ins i32imm:$amt), - "\\{ // callseq $amt\n" + NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "\\{ // callseq $amt1, $amt2\n" "\t.reg .b32 temp_param_reg;", - [(callseq_start timm:$amt)]>; + [(callseq_start timm:$amt1, timm:$amt2)]>; def Callseq_End : NVPTXInst<(outs), (ins i32imm:$amt1, i32imm:$amt2), "\\} // callseq $amt1", diff --git a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp index 12ffbfdeacc1..11d22377611b 100644 --- a/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp +++ b/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp @@ -204,6 +204,17 @@ static const unsigned G8Regs[] = { PPC::X28, PPC::X29, PPC::X30, PPC::X31 }; +static const unsigned G80Regs[] = { + PPC::ZERO8, PPC::X1, PPC::X2, PPC::X3, + PPC::X4, PPC::X5, PPC::X6, PPC::X7, + PPC::X8, PPC::X9, PPC::X10, PPC::X11, + PPC::X12, PPC::X13, PPC::X14, PPC::X15, + PPC::X16, PPC::X17, PPC::X18, PPC::X19, + PPC::X20, PPC::X21, PPC::X22, PPC::X23, + PPC::X24, PPC::X25, PPC::X26, PPC::X27, + PPC::X28, PPC::X29, PPC::X30, PPC::X31 +}; + static const unsigned QFRegs[] = { PPC::QF0, PPC::QF1, PPC::QF2, PPC::QF3, PPC::QF4, PPC::QF5, PPC::QF6, PPC::QF7, @@ -301,6 +312,12 @@ static DecodeStatus DecodeG8RCRegisterClass(MCInst &Inst, uint64_t RegNo, return decodeRegisterClass(Inst, RegNo, G8Regs); } +static DecodeStatus DecodeG8RC_NOX0RegisterClass(MCInst &Inst, uint64_t RegNo, + uint64_t Address, + const void *Decoder) { + return 
decodeRegisterClass(Inst, RegNo, G80Regs); +} + #define DecodePointerLikeRegClass0 DecodeGPRCRegisterClass #define DecodePointerLikeRegClass1 DecodeGPRC_NOR0RegisterClass diff --git a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp index 609d959c6d08..84bb9ec56800 100644 --- a/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp +++ b/lib/Target/PowerPC/InstPrinter/PPCInstPrinter.cpp @@ -95,7 +95,8 @@ void PPCInstPrinter::printInst(const MCInst *MI, raw_ostream &O, return; } - if (MI->getOpcode() == PPC::RLDICR) { + if (MI->getOpcode() == PPC::RLDICR || + MI->getOpcode() == PPC::RLDICR_32) { unsigned char SH = MI->getOperand(2).getImm(); unsigned char ME = MI->getOperand(3).getImm(); // rldicr RA, RS, SH, 63-SH == sldi RA, RS, SH diff --git a/lib/Target/PowerPC/PPCFastISel.cpp b/lib/Target/PowerPC/PPCFastISel.cpp index 9b91b9ab8f82..2fc8654deeab 100644 --- a/lib/Target/PowerPC/PPCFastISel.cpp +++ b/lib/Target/PowerPC/PPCFastISel.cpp @@ -1330,7 +1330,7 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl &Args, // Issue CALLSEQ_START. BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TII.getCallFrameSetupOpcode())) - .addImm(NumBytes); + .addImm(NumBytes).addImm(0); // Prepare to assign register arguments. Every argument uses up a // GPR protocol register even if it's passed in a floating-point @@ -2246,6 +2246,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo, } case PPC::EXTSW: + case PPC::EXTSW_32: case PPC::EXTSW_32_64: { if (VT != MVT::i32 && VT != MVT::i16 && VT != MVT::i8) return false; diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 1b0402bf003d..5fa7b2c6bfb1 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -54,6 +54,7 @@ #include "llvm/Support/raw_ostream.h" #include "llvm/Target/TargetInstrInfo.h" #include "llvm/Target/TargetRegisterInfo.h" +#include "llvm/ADT/Statistic.h" #include #include #include @@ -68,6 +69,14 @@ using namespace llvm; #define DEBUG_TYPE "ppc-codegen" +STATISTIC(NumSextSetcc, + "Number of (sext(setcc)) nodes expanded into GPR sequence."); +STATISTIC(NumZextSetcc, + "Number of (zext(setcc)) nodes expanded into GPR sequence."); +STATISTIC(SignExtensionsAdded, + "Number of sign extensions for compare inputs added."); +STATISTIC(ZeroExtensionsAdded, + "Number of zero extensions for compare inputs added."); // FIXME: Remove this once the bug has been fixed! cl::opt ANDIGlueBug("expose-ppc-andi-glue-bug", cl::desc("expose the ANDI glue bug on PPC"), cl::Hidden); @@ -252,7 +261,28 @@ namespace { #include "PPCGenDAGISel.inc" private: + // Conversion type for interpreting results of a 32-bit instruction as + // a 64-bit value or vice versa. + enum ExtOrTruncConversion { Ext, Trunc }; + + // Modifiers to guide how an ISD::SETCC node's result is to be computed + // in a GPR. 
+    // ZExtOrig - use the original condition code, zero-extend value
+    // ZExtInvert - invert the condition code, zero-extend value
+    // SExtOrig - use the original condition code, sign-extend value
+    // SExtInvert - invert the condition code, sign-extend value
+    enum SetccInGPROpts { ZExtOrig, ZExtInvert, SExtOrig, SExtInvert };
+
     bool trySETCC(SDNode *N);
+    bool tryEXTEND(SDNode *N);
+    SDValue signExtendInputIfNeeded(SDValue Input);
+    SDValue zeroExtendInputIfNeeded(SDValue Input);
+    SDValue addExtOrTrunc(SDValue NatWidthRes, ExtOrTruncConversion Conv);
+    SDValue get32BitZExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                                int64_t RHSValue, SDLoc dl);
+    SDValue get32BitSExtCompare(SDValue LHS, SDValue RHS, ISD::CondCode CC,
+                                int64_t RHSValue, SDLoc dl);
+    SDValue getSETCCInGPR(SDValue Compare, SetccInGPROpts ConvOpts);
 
     void PeepholePPC64();
     void PeepholePPC64ZExt();
@@ -2471,6 +2501,225 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
   return true;
 }
 
+/// If this node is a sign/zero extension of an integer comparison,
+/// it can usually be computed in GPRs rather than using comparison
+/// instructions and ISEL. We only do this on 64-bit targets for now
+/// as the code is specialized for 64-bit (it uses 64-bit instructions
+/// and assumes 64-bit registers).
+bool PPCDAGToDAGISel::tryEXTEND(SDNode *N) {
+  if (TM.getOptLevel() == CodeGenOpt::None || !TM.isPPC64())
+    return false;
+  assert((N->getOpcode() == ISD::ZERO_EXTEND ||
+          N->getOpcode() == ISD::SIGN_EXTEND) &&
+         "Expecting a zero/sign extend node!");
+
+  if (N->getOperand(0).getOpcode() != ISD::SETCC)
+    return false;
+
+  SDValue WideRes =
+    getSETCCInGPR(N->getOperand(0),
+                  N->getOpcode() == ISD::SIGN_EXTEND ?
+                  SetccInGPROpts::SExtOrig : SetccInGPROpts::ZExtOrig);
+
+  if (!WideRes)
+    return false;
+
+  SDLoc dl(N);
+  bool Inputs32Bit = N->getOperand(0).getOperand(0).getValueType() == MVT::i32;
+  bool Output32Bit = N->getValueType(0) == MVT::i32;
+
+  NumSextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 1 : 0;
+  NumZextSetcc += N->getOpcode() == ISD::SIGN_EXTEND ? 0 : 1;
+
+  SDValue ConvOp = WideRes;
+  if (Inputs32Bit != Output32Bit)
+    ConvOp = addExtOrTrunc(WideRes, Inputs32Bit ? ExtOrTruncConversion::Ext :
+                           ExtOrTruncConversion::Trunc);
+  ReplaceNode(N, ConvOp.getNode());
+
+  return true;
+}
+
+/// If the value isn't guaranteed to be sign-extended to 64-bits, extend it.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32-bits).
+SDValue PPCDAGToDAGISel::signExtendInputIfNeeded(SDValue Input) {
+  assert(Input.getValueType() == MVT::i32 &&
+         "Can only sign-extend 32-bit values here.");
+  unsigned Opc = Input.getOpcode();
+
+  // The value was sign extended and then truncated to 32-bits. No need to
+  // sign extend it again.
+  if (Opc == ISD::TRUNCATE &&
+      (Input.getOperand(0).getOpcode() == ISD::AssertSext ||
+       Input.getOperand(0).getOpcode() == ISD::SIGN_EXTEND))
+    return Input;
+
+  LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+  // The input is a sign-extending load. No reason to sign-extend.
+  if (InputLoad && InputLoad->getExtensionType() == ISD::SEXTLOAD)
+    return Input;
+
+  ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+  // We don't sign-extend constants and already sign-extended values.
+  if (InputConst || Opc == ISD::AssertSext || Opc == ISD::SIGN_EXTEND_INREG ||
+      Opc == ISD::SIGN_EXTEND)
+    return Input;
+
+  SDLoc dl(Input);
+  SignExtensionsAdded++;
+  return SDValue(CurDAG->getMachineNode(PPC::EXTSW_32, dl, MVT::i32, Input), 0);
+}
+
+/// If the value isn't guaranteed to be zero-extended to 64-bits, extend it.
+/// Useful when emitting comparison code for 32-bit values without using
+/// the compare instruction (which only considers the lower 32-bits).
+SDValue PPCDAGToDAGISel::zeroExtendInputIfNeeded(SDValue Input) {
+  assert(Input.getValueType() == MVT::i32 &&
+         "Can only zero-extend 32-bit values here.");
+  LoadSDNode *InputLoad = dyn_cast<LoadSDNode>(Input);
+  unsigned Opc = Input.getOpcode();
+
+  // No need to zero-extend loaded values (unless they're loaded with
+  // a sign-extending load).
+  if (InputLoad && InputLoad->getExtensionType() != ISD::SEXTLOAD)
+    return Input;
+
+  ConstantSDNode *InputConst = dyn_cast<ConstantSDNode>(Input);
+  bool InputZExtConst = InputConst && InputConst->getSExtValue() >= 0;
+  // An ISD::TRUNCATE will be lowered to an EXTRACT_SUBREG so we have
+  // to conservatively actually clear the high bits. We also don't need to
+  // zero-extend constants or values that are already zero-extended.
+  if (InputZExtConst || Opc == ISD::AssertZext || Opc == ISD::ZERO_EXTEND)
+    return Input;
+
+  SDLoc dl(Input);
+  ZeroExtensionsAdded++;
+  return SDValue(CurDAG->getMachineNode(PPC::RLDICL_32, dl, MVT::i32, Input,
+                                        getI64Imm(0, dl), getI64Imm(32, dl)),
+                 0);
+}
+
+// Handle a 32-bit value in a 64-bit register and vice-versa. These are of
+// course not actual zero/sign extensions that will generate machine code,
+// they're just a way to reinterpret a 32 bit value in a register as a
+// 64 bit value and vice-versa.
+SDValue PPCDAGToDAGISel::addExtOrTrunc(SDValue NatWidthRes,
+                                       ExtOrTruncConversion Conv) {
+  SDLoc dl(NatWidthRes);
+
+  // For reinterpreting 32-bit values as 64 bit values, we generate
+  // INSERT_SUBREG IMPLICIT_DEF:i64, <Input>, TargetConstant:i32<1>
+  if (Conv == ExtOrTruncConversion::Ext) {
+    SDValue ImDef(CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, MVT::i64), 0);
+    SDValue SubRegIdx =
+      CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+    return SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, MVT::i64,
+                                          ImDef, NatWidthRes, SubRegIdx), 0);
+  }
+
+  assert(Conv == ExtOrTruncConversion::Trunc &&
+         "Unknown conversion between 32 and 64 bit values.");
+  // For reinterpreting 64-bit values as 32-bit values, we just need to
+  // EXTRACT_SUBREG (i.e. extract the low word).
+  SDValue SubRegIdx =
+    CurDAG->getTargetConstant(PPC::sub_32, dl, MVT::i32);
+  return SDValue(CurDAG->getMachineNode(PPC::EXTRACT_SUBREG, dl, MVT::i32,
+                                        NatWidthRes, SubRegIdx), 0);
+}
+
+/// Produces a zero-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get32BitZExtCompare(SDValue LHS, SDValue RHS,
+                                             ISD::CondCode CC,
+                                             int64_t RHSValue, SDLoc dl) {
+  bool IsRHSZero = RHSValue == 0;
+  switch (CC) {
+  default: return SDValue();
+  case ISD::SETEQ: {
+    // (zext (setcc %a, %b, seteq)) -> (lshr (cntlzw (xor %a, %b)), 5)
+    // (zext (setcc %a, 0, seteq)) -> (lshr (cntlzw %a), 5)
+    SDValue Xor = IsRHSZero ? LHS :
+      SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+    SDValue Clz =
+      SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, Xor), 0);
+    SDValue ShiftOps[] = { Clz, getI32Imm(27, dl), getI32Imm(5, dl),
+                           getI32Imm(31, dl) };
+    return SDValue(CurDAG->getMachineNode(PPC::RLWINM, dl, MVT::i32,
+                                          ShiftOps), 0);
+  }
+  }
+}
+
+/// Produces a sign-extended result of comparing two 32-bit values according to
+/// the passed condition code.
+SDValue PPCDAGToDAGISel::get32BitSExtCompare(SDValue LHS, SDValue RHS,
+                                             ISD::CondCode CC,
+                                             int64_t RHSValue, SDLoc dl) {
+  bool IsRHSZero = RHSValue == 0;
+  switch (CC) {
+  default: return SDValue();
+  case ISD::SETEQ: {
+    // (sext (setcc %a, %b, seteq)) ->
+    //   (ashr (shl (ctlz (xor %a, %b)), 58), 63)
+    // (sext (setcc %a, 0, seteq)) ->
+    //   (ashr (shl (ctlz %a), 58), 63)
+    SDValue CountInput = IsRHSZero ? LHS :
+      SDValue(CurDAG->getMachineNode(PPC::XOR, dl, MVT::i32, LHS, RHS), 0);
+    SDValue Cntlzw =
+      SDValue(CurDAG->getMachineNode(PPC::CNTLZW, dl, MVT::i32, CountInput), 0);
+    SDValue SHLOps[] = { Cntlzw, getI32Imm(58, dl), getI32Imm(0, dl) };
+    SDValue Sldi =
+      SDValue(CurDAG->getMachineNode(PPC::RLDICR_32, dl, MVT::i32, SHLOps), 0);
+    return SDValue(CurDAG->getMachineNode(PPC::SRADI_32, dl, MVT::i32, Sldi,
+                                          getI32Imm(63, dl)), 0);
+  }
+  }
+}
+
+/// Returns an equivalent of a SETCC node but with the result the same width as
+/// the inputs. This can also be used for SELECT_CC if either the true or false
+/// value is a power of two while the other is zero.
+SDValue PPCDAGToDAGISel::getSETCCInGPR(SDValue Compare,
+                                       SetccInGPROpts ConvOpts) {
+  assert((Compare.getOpcode() == ISD::SETCC ||
+          Compare.getOpcode() == ISD::SELECT_CC) &&
+         "An ISD::SETCC node required here.");
+
+  SDValue LHS = Compare.getOperand(0);
+  SDValue RHS = Compare.getOperand(1);
+
+  // The condition code is operand 2 for SETCC and operand 4 for SELECT_CC.
+  int CCOpNum = Compare.getOpcode() == ISD::SELECT_CC ? 4 : 2;
+  ISD::CondCode CC =
+    cast<CondCodeSDNode>(Compare.getOperand(CCOpNum))->get();
+  EVT InputVT = LHS.getValueType();
+  if (InputVT != MVT::i32)
+    return SDValue();
+
+  SDLoc dl(Compare);
+  ConstantSDNode *RHSConst = dyn_cast<ConstantSDNode>(RHS);
+  int64_t RHSValue = RHSConst ? RHSConst->getSExtValue() : INT64_MAX;
+
+  if (ConvOpts == SetccInGPROpts::ZExtInvert ||
+      ConvOpts == SetccInGPROpts::SExtInvert)
+    CC = ISD::getSetCCInverse(CC, true);
+
+  if (ISD::isSignedIntSetCC(CC)) {
+    LHS = signExtendInputIfNeeded(LHS);
+    RHS = signExtendInputIfNeeded(RHS);
+  } else if (ISD::isUnsignedIntSetCC(CC)) {
+    LHS = zeroExtendInputIfNeeded(LHS);
+    RHS = zeroExtendInputIfNeeded(RHS);
+  }
+
+  bool IsSext = ConvOpts == SetccInGPROpts::SExtOrig ||
+                ConvOpts == SetccInGPROpts::SExtInvert;
+  if (IsSext)
+    return get32BitSExtCompare(LHS, RHS, CC, RHSValue, dl);
+  return get32BitZExtCompare(LHS, RHS, CC, RHSValue, dl);
+}
+
 void PPCDAGToDAGISel::transferMemOperands(SDNode *N, SDNode *Result) {
   // Transfer memoperands.
MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(1); @@ -2508,6 +2757,12 @@ void PPCDAGToDAGISel::Select(SDNode *N) { } break; + case ISD::ZERO_EXTEND: + case ISD::SIGN_EXTEND: + if (tryEXTEND(N)) + return; + break; + case ISD::SETCC: if (trySETCC(N)) return; diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index 685f24cb502e..17bdd595da10 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -923,6 +923,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM, setStackPointerRegisterToSaveRestore(isPPC64 ? PPC::X1 : PPC::R1); // We have target-specific dag combine patterns for the following nodes: + setTargetDAGCombine(ISD::SHL); + setTargetDAGCombine(ISD::SRA); + setTargetDAGCombine(ISD::SRL); setTargetDAGCombine(ISD::SINT_TO_FP); setTargetDAGCombine(ISD::BUILD_VECTOR); if (Subtarget.hasFPCVT()) @@ -4949,8 +4952,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be moved somewhere else @@ -5000,9 +5002,8 @@ SDValue PPCTargetLowering::LowerCall_32SVR4( Flags, DAG, dl); // This must go outside the CALLSEQ_START..END. - SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1), - SDLoc(MemcpyCall)); + SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, NumBytes, 0, + SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); Chain = CallSeqStart = NewCallSeqStart; @@ -5083,9 +5084,9 @@ SDValue PPCTargetLowering::createMemcpyOutsideCallSeq( CallSeqStart.getNode()->getOperand(0), Flags, DAG, dl); // The MEMCPY must go outside the CALLSEQ_START..END. - SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, - CallSeqStart.getNode()->getOperand(1), - SDLoc(MemcpyCall)); + int64_t FrameSize = CallSeqStart.getConstantOperandVal(1); + SDValue NewCallSeqStart = DAG.getCALLSEQ_START(MemcpyCall, FrameSize, 0, + SDLoc(MemcpyCall)); DAG.ReplaceAllUsesWith(CallSeqStart.getNode(), NewCallSeqStart.getNode()); return NewCallSeqStart; @@ -5268,8 +5269,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4( // Adjust the stack pointer for the new arguments... // These operations are automatically eliminated by the prolog/epilog pass if (!IsSibCall) - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getIntPtrConstant(NumBytes, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else @@ -5828,8 +5828,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin( // Adjust the stack pointer for the new arguments... 
// These operations are automatically eliminated by the prolog/epilog pass - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(NumBytes, dl, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SDValue CallSeqStart = Chain; // Load the return address and frame pointer so it can be move somewhere else @@ -8741,9 +8740,9 @@ static Instruction* callIntrinsic(IRBuilder<> &Builder, Intrinsic::ID Id) { // The mappings for emitLeading/TrailingFence is taken from // http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html -Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { +Instruction *PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { if (Ord == AtomicOrdering::SequentiallyConsistent) return callIntrinsic(Builder, Intrinsic::ppc_sync); if (isReleaseOrStronger(Ord)) @@ -8751,10 +8750,10 @@ Instruction* PPCTargetLowering::emitLeadingFence(IRBuilder<> &Builder, return nullptr; } -Instruction* PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, - AtomicOrdering Ord, bool IsStore, - bool IsLoad) const { - if (IsLoad && isAcquireOrStronger(Ord)) +Instruction *PPCTargetLowering::emitTrailingFence(IRBuilder<> &Builder, + Instruction *Inst, + AtomicOrdering Ord) const { + if (Inst->hasAtomicLoad() && isAcquireOrStronger(Ord)) return callIntrinsic(Builder, Intrinsic::ppc_lwsync); // FIXME: this is too conservative, a dependent branch + isync is enough. // See http://www.cl.cam.ac.uk/~pes20/cpp/cpp0xmappings.html and @@ -11316,6 +11315,12 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N, SDLoc dl(N); switch (N->getOpcode()) { default: break; + case ISD::SHL: + return combineSHL(N, DCI); + case ISD::SRA: + return combineSRA(N, DCI); + case ISD::SRL: + return combineSRL(N, DCI); case PPCISD::SHL: if (isNullConstant(N->getOperand(0))) // 0 << V -> 0. 
return N->getOperand(0); @@ -12948,3 +12953,58 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT) const { return Imm.isPosZero(); } } + +// For vector shift operation op, fold +// (op x, (and y, ((1 << numbits(x)) - 1))) -> (target op x, y) +static SDValue stripModuloOnShift(const TargetLowering &TLI, SDNode *N, + SelectionDAG &DAG) { + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + EVT VT = N0.getValueType(); + unsigned OpSizeInBits = VT.getScalarSizeInBits(); + unsigned Opcode = N->getOpcode(); + unsigned TargetOpcode; + + switch (Opcode) { + default: + llvm_unreachable("Unexpected shift operation"); + case ISD::SHL: + TargetOpcode = PPCISD::SHL; + break; + case ISD::SRL: + TargetOpcode = PPCISD::SRL; + break; + case ISD::SRA: + TargetOpcode = PPCISD::SRA; + break; + } + + if (VT.isVector() && TLI.isOperationLegal(Opcode, VT) && + N1->getOpcode() == ISD::AND) + if (ConstantSDNode *Mask = isConstOrConstSplat(N1->getOperand(1))) + if (Mask->getZExtValue() == OpSizeInBits - 1) + return DAG.getNode(TargetOpcode, SDLoc(N), VT, N0, N1->getOperand(0)); + + return SDValue(); +} + +SDValue PPCTargetLowering::combineSHL(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) + return Value; + + return SDValue(); +} + +SDValue PPCTargetLowering::combineSRA(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) + return Value; + + return SDValue(); +} + +SDValue PPCTargetLowering::combineSRL(SDNode *N, DAGCombinerInfo &DCI) const { + if (auto Value = stripModuloOnShift(*this, N, DCI.DAG)) + return Value; + + return SDValue(); +} diff --git a/lib/Target/PowerPC/PPCISelLowering.h b/lib/Target/PowerPC/PPCISelLowering.h index 32661099b79d..4fc744257262 100644 --- a/lib/Target/PowerPC/PPCISelLowering.h +++ b/lib/Target/PowerPC/PPCISelLowering.h @@ -117,9 +117,13 @@ namespace llvm { /// at function entry, used for PIC code. GlobalBaseReg, - /// These nodes represent the 32-bit PPC shifts that operate on 6-bit - /// shift amounts. These nodes are generated by the multi-precision shift - /// code. + /// These nodes represent PPC shifts. + /// + /// For scalar types, only the last `n + 1` bits of the shift amounts + /// are used, where n is log2(sizeof(element) * 8). See sld/slw, etc. + /// for exact behaviors. + /// + /// For vector types, only the last n bits are used. See vsld. 
SRL, SRA, SHL, /// The combination of sra[wd]i and addze used to implemented signed @@ -617,10 +621,10 @@ namespace llvm { return true; } - Instruction* emitLeadingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; - Instruction* emitTrailingFence(IRBuilder<> &Builder, AtomicOrdering Ord, - bool IsStore, bool IsLoad) const override; + Instruction *emitLeadingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; + Instruction *emitTrailingFence(IRBuilder<> &Builder, Instruction *Inst, + AtomicOrdering Ord) const override; MachineBasicBlock * EmitInstrWithCustomInserter(MachineInstr &MI, @@ -999,6 +1003,9 @@ namespace llvm { SDValue DAGCombineBuildVector(SDNode *N, DAGCombinerInfo &DCI) const; SDValue DAGCombineTruncBoolExt(SDNode *N, DAGCombinerInfo &DCI) const; SDValue combineFPToIntToFP(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineSHL(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineSRA(SDNode *N, DAGCombinerInfo &DCI) const; + SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const; /// ConvertSETCCToSubtract - looks at SETCC that compares ints. It replaces /// SETCC with integer subtraction when (1) there is a legal way of doing it @@ -1017,14 +1024,6 @@ namespace llvm { SDValue combineElementTruncationToVectorTruncation(SDNode *N, DAGCombinerInfo &DCI) const; - - bool supportsModuloShift(ISD::NodeType Inst, - EVT ReturnType) const override { - assert((Inst == ISD::SHL || Inst == ISD::SRA || Inst == ISD::SRL) && - "Expect a shift instruction"); - assert(isOperationLegal(Inst, ReturnType)); - return ReturnType.isVector(); - } }; namespace PPC { diff --git a/lib/Target/PowerPC/PPCInstr64Bit.td b/lib/Target/PowerPC/PPCInstr64Bit.td index 997b96ca6ec8..a8433919f0f3 100644 --- a/lib/Target/PowerPC/PPCInstr64Bit.td +++ b/lib/Target/PowerPC/PPCInstr64Bit.td @@ -634,10 +634,19 @@ let Interpretation64Bit = 1, isCodeGenOnly = 1 in defm EXTSW_32_64 : XForm_11r<31, 986, (outs g8rc:$rA), (ins gprc:$rS), "extsw", "$rA, $rS", IIC_IntSimple, [(set i64:$rA, (sext i32:$rS))]>, isPPC64; +let isCodeGenOnly = 1 in +def EXTSW_32 : XForm_11<31, 986, (outs gprc:$rA), (ins gprc:$rS), + "extsw $rA, $rS", IIC_IntSimple, + []>, isPPC64; defm SRADI : XSForm_1rc<31, 413, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH), "sradi", "$rA, $rS, $SH", IIC_IntRotateDI, [(set i64:$rA, (sra i64:$rS, (i32 imm:$SH)))]>, isPPC64; +// For fast-isel: +let isCodeGenOnly = 1 in +def SRADI_32 : XSForm_1<31, 413, (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH), + "sradi $rA, $rS, $SH", IIC_IntRotateDI, []>, isPPC64; + defm CNTLZD : XForm_11r<31, 58, (outs g8rc:$rA), (ins g8rc:$rS), "cntlzd", "$rA, $rS", IIC_IntGeneral, [(set i64:$rA, (ctlz i64:$rS))]>; @@ -721,15 +730,26 @@ defm RLDICL : MDForm_1r<30, 0, // For fast-isel: let isCodeGenOnly = 1 in def RLDICL_32_64 : MDForm_1<30, 0, - (outs g8rc:$rA), - (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), - "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, - []>, isPPC64; + (outs g8rc:$rA), + (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; // End fast-isel. 
+let isCodeGenOnly = 1 in +def RLDICL_32 : MDForm_1<30, 0, + (outs gprc:$rA), + (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicl $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; defm RLDICR : MDForm_1r<30, 1, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), "rldicr", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, []>, isPPC64; +let isCodeGenOnly = 1 in +def RLDICR_32 : MDForm_1<30, 1, + (outs gprc:$rA), (ins gprc:$rS, u6imm:$SH, u6imm:$MBE), + "rldicr $rA, $rS, $SH, $MBE", IIC_IntRotateDI, + []>, isPPC64; defm RLDIC : MDForm_1r<30, 2, (outs g8rc:$rA), (ins g8rc:$rS, u6imm:$SH, u6imm:$MBE), "rldic", "$rA, $rS, $SH, $MBE", IIC_IntRotateDI, diff --git a/lib/Target/PowerPC/PPCInstrAltivec.td b/lib/Target/PowerPC/PPCInstrAltivec.td index c380766e9f5c..e14d18fd5433 100644 --- a/lib/Target/PowerPC/PPCInstrAltivec.td +++ b/lib/Target/PowerPC/PPCInstrAltivec.td @@ -987,6 +987,12 @@ def : Pat<(v8i16 (shl v8i16:$vA, v8i16:$vB)), (v8i16 (VSLH $vA, $vB))>; def : Pat<(v4i32 (shl v4i32:$vA, v4i32:$vB)), (v4i32 (VSLW $vA, $vB))>; +def : Pat<(v16i8 (PPCshl v16i8:$vA, v16i8:$vB)), + (v16i8 (VSLB $vA, $vB))>; +def : Pat<(v8i16 (PPCshl v8i16:$vA, v8i16:$vB)), + (v8i16 (VSLH $vA, $vB))>; +def : Pat<(v4i32 (PPCshl v4i32:$vA, v4i32:$vB)), + (v4i32 (VSLW $vA, $vB))>; def : Pat<(v16i8 (srl v16i8:$vA, v16i8:$vB)), (v16i8 (VSRB $vA, $vB))>; @@ -994,6 +1000,12 @@ def : Pat<(v8i16 (srl v8i16:$vA, v8i16:$vB)), (v8i16 (VSRH $vA, $vB))>; def : Pat<(v4i32 (srl v4i32:$vA, v4i32:$vB)), (v4i32 (VSRW $vA, $vB))>; +def : Pat<(v16i8 (PPCsrl v16i8:$vA, v16i8:$vB)), + (v16i8 (VSRB $vA, $vB))>; +def : Pat<(v8i16 (PPCsrl v8i16:$vA, v8i16:$vB)), + (v8i16 (VSRH $vA, $vB))>; +def : Pat<(v4i32 (PPCsrl v4i32:$vA, v4i32:$vB)), + (v4i32 (VSRW $vA, $vB))>; def : Pat<(v16i8 (sra v16i8:$vA, v16i8:$vB)), (v16i8 (VSRAB $vA, $vB))>; @@ -1001,6 +1013,12 @@ def : Pat<(v8i16 (sra v8i16:$vA, v8i16:$vB)), (v8i16 (VSRAH $vA, $vB))>; def : Pat<(v4i32 (sra v4i32:$vA, v4i32:$vB)), (v4i32 (VSRAW $vA, $vB))>; +def : Pat<(v16i8 (PPCsra v16i8:$vA, v16i8:$vB)), + (v16i8 (VSRAB $vA, $vB))>; +def : Pat<(v8i16 (PPCsra v8i16:$vA, v8i16:$vB)), + (v8i16 (VSRAH $vA, $vB))>; +def : Pat<(v4i32 (PPCsra v4i32:$vA, v4i32:$vB)), + (v4i32 (VSRAW $vA, $vB))>; // Float to integer and integer to float conversions def : Pat<(v4i32 (fp_to_sint v4f32:$vA)), @@ -1072,14 +1090,24 @@ def:Pat<(vmrgow_swapped_shuffle v16i8:$vA, v16i8:$vB), // Vector shifts def VRLD : VX1_Int_Ty<196, "vrld", int_ppc_altivec_vrld, v2i64>; def VSLD : VXForm_1<1476, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vsld $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (shl v2i64:$vA, v2i64:$vB))]>; + "vsld $vD, $vA, $vB", IIC_VecGeneral, []>; def VSRD : VXForm_1<1732, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vsrd $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (srl v2i64:$vA, v2i64:$vB))]>; + "vsrd $vD, $vA, $vB", IIC_VecGeneral, []>; def VSRAD : VXForm_1<964, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB), - "vsrad $vD, $vA, $vB", IIC_VecGeneral, - [(set v2i64:$vD, (sra v2i64:$vA, v2i64:$vB))]>; + "vsrad $vD, $vA, $vB", IIC_VecGeneral, []>; + +def : Pat<(v2i64 (shl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSLD $vA, $vB))>; +def : Pat<(v2i64 (PPCshl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSLD $vA, $vB))>; +def : Pat<(v2i64 (srl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRD $vA, $vB))>; +def : Pat<(v2i64 (PPCsrl v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRD $vA, $vB))>; +def : Pat<(v2i64 (sra v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRAD $vA, $vB))>; +def : Pat<(v2i64 (PPCsra v2i64:$vA, v2i64:$vB)), + (v2i64 (VSRAD $vA, $vB))>; 
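
The pattern changes above pair with the combineSHL/combineSRA/combineSRL hooks added to PPCISelLowering.cpp earlier in this patch. The generic shl/srl/sra patterns keep matching ordinary DAGs, while the PPCshl/PPCsrl/PPCsra forms are only produced by stripModuloOnShift, which drops a redundant mask on the shift amount. In the arrow notation the patch's own comments use, the fold is (v4i32 shown; 31 is the splat constant stripModuloOnShift checks for):

    (shl v4i32:$x, (and v4i32:$y, <31,31,31,31>)) -> (PPCshl v4i32:$x, v4i32:$y)

The and is safe to drop because the underlying instructions (vslw and friends) already use only the low-order bits of each shift-amount element, which is exactly what the reworded SRL/SRA/SHL comment in PPCISelLowering.h above documents.
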
// Vector Integer Arithmetic Instructions let isCommutable = 1 in { diff --git a/lib/Target/PowerPC/PPCInstrInfo.td b/lib/Target/PowerPC/PPCInstrInfo.td index f004ce49cac0..1af5e7f28342 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.td +++ b/lib/Target/PowerPC/PPCInstrInfo.td @@ -33,7 +33,8 @@ def SDT_PPCVexts : SDTypeProfile<1, 2, [ SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2> ]>; -def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_PPCCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; def SDT_PPCvperm : SDTypeProfile<1, 3, [ @@ -1099,9 +1100,11 @@ multiclass AForm_3r opcode, bits<5> xo, dag OOL, dag IOL, let hasCtrlDep = 1 in { let Defs = [R1], Uses = [R1] in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt), "#ADJCALLSTACKDOWN $amt", - [(callseq_start timm:$amt)]>; -def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), "#ADJCALLSTACKUP $amt1 $amt2", +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), + "#ADJCALLSTACKDOWN $amt1 $amt2", + [(callseq_start timm:$amt1, timm:$amt2)]>; +def ADJCALLSTACKUP : Pseudo<(outs), (ins u16imm:$amt1, u16imm:$amt2), + "#ADJCALLSTACKUP $amt1 $amt2", [(callseq_end timm:$amt1, timm:$amt2)]>; } @@ -4163,6 +4166,8 @@ def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, u6imm:$n, 0 def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCLo g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>; def : InstAlias<"clrldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; +def : InstAlias<"clrldi $rA, $rS, $n", + (RLDICL_32 gprc:$rA, gprc:$rS, 0, u6imm:$n)>; def : InstAlias<"clrldi. $rA, $rS, $n", (RLDICLo g8rc:$rA, g8rc:$rS, 0, u6imm:$n)>; def RLWINMbm : PPCAsmPseudo<"rlwinm $rA, $rS, $n, $b", diff --git a/lib/Target/PowerPC/PPCInstrVSX.td b/lib/Target/PowerPC/PPCInstrVSX.td index 967557452f24..b98140fedfc0 100644 --- a/lib/Target/PowerPC/PPCInstrVSX.td +++ b/lib/Target/PowerPC/PPCInstrVSX.td @@ -1436,7 +1436,7 @@ let Predicates = [IsISA3_0, HasDirectMove] in { def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA), "mtvsrws $XT, $rA", IIC_VecGeneral, []>; - def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB), + def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc_nox0:$rA, g8rc:$rB), "mtvsrdd $XT, $rA, $rB", IIC_VecGeneral, []>, Requires<[In64BitMode]>; diff --git a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp index 0c1260a2965b..c7aa4cb78b7a 100644 --- a/lib/Target/PowerPC/PPCTLSDynamicCall.cpp +++ b/lib/Target/PowerPC/PPCTLSDynamicCall.cpp @@ -99,7 +99,8 @@ protected: // Don't really need to save data to the stack - the clobbered // registers are already saved when the SDNode (e.g. PPCaddiTlsgdLAddr) // gets translated to the pseudo instruction (e.g. ADDItlsgdLADDR). - BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0); + BuildMI(MBB, I, DL, TII->get(PPC::ADJCALLSTACKDOWN)).addImm(0) + .addImm(0); // Expand into two ops built prior to the existing instruction. 
MachineInstr *Addi = BuildMI(MBB, I, DL, TII->get(Opc1), GPR3) diff --git a/lib/Target/Sparc/SparcISelLowering.cpp b/lib/Target/Sparc/SparcISelLowering.cpp index acb34d5baaa8..9e7e3c6b705a 100644 --- a/lib/Target/Sparc/SparcISelLowering.cpp +++ b/lib/Target/Sparc/SparcISelLowering.cpp @@ -773,8 +773,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI, } } - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, dl, true), - dl); + Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, dl); SmallVector, 8> RegsToPass; SmallVector MemOpChains; @@ -1165,8 +1164,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI, // Adjust the stack pointer to make room for the arguments. // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls // with more than 6 arguments. - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL); // Collect the set of registers to pass to the function and their values. // This will be emitted as a sequence of CopyToReg nodes glued to the call @@ -2058,7 +2056,7 @@ SDValue SparcTargetLowering::LowerGlobalTLSAddress(SDValue Op, SDValue Chain = DAG.getEntryNode(); SDValue InFlag; - Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(1, DL, true), DL); + Chain = DAG.getCALLSEQ_START(Chain, 1, 0, DL); Chain = DAG.getCopyToReg(Chain, DL, SP::O0, Argument, InFlag); InFlag = Chain.getValue(1); SDValue Callee = DAG.getTargetExternalSymbol("__tls_get_addr", PtrVT); @@ -3386,7 +3384,10 @@ SparcTargetLowering::getConstraintType(StringRef Constraint) const { if (Constraint.size() == 1) { switch (Constraint[0]) { default: break; - case 'r': return C_RegisterClass; + case 'r': + case 'f': + case 'e': + return C_RegisterClass; case 'I': // SIMM13 return C_Other; } @@ -3465,6 +3466,24 @@ SparcTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI, return std::make_pair(0U, &SP::IntPairRegClass); else return std::make_pair(0U, &SP::IntRegsRegClass); + case 'f': + if (VT == MVT::f32) + return std::make_pair(0U, &SP::FPRegsRegClass); + else if (VT == MVT::f64) + return std::make_pair(0U, &SP::LowDFPRegsRegClass); + else if (VT == MVT::f128) + return std::make_pair(0U, &SP::LowQFPRegsRegClass); + llvm_unreachable("Unknown ValueType for f-register-type!"); + break; + case 'e': + if (VT == MVT::f32) + return std::make_pair(0U, &SP::FPRegsRegClass); + else if (VT == MVT::f64) + return std::make_pair(0U, &SP::DFPRegsRegClass); + else if (VT == MVT::f128) + return std::make_pair(0U, &SP::QFPRegsRegClass); + llvm_unreachable("Unknown ValueType for e-register-type!"); + break; } } else if (!Constraint.empty() && Constraint.size() <= 5 && Constraint[0] == '{' && *(Constraint.end()-1) == '}') { diff --git a/lib/Target/Sparc/SparcInstrInfo.td b/lib/Target/Sparc/SparcInstrInfo.td index 5a19c624abb5..ae45c8be6752 100644 --- a/lib/Target/Sparc/SparcInstrInfo.td +++ b/lib/Target/Sparc/SparcInstrInfo.td @@ -195,7 +195,8 @@ def SPsjlj_longjmp: SDNode<"SPISD::EH_SJLJ_LONGJMP", [SDNPHasChain, SDNPSideEffect]>; // These are target-independent nodes, but have target-specific formats. 
-def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_SPCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, SDTCisVT<1, i32> ]>; @@ -404,9 +405,9 @@ let Defs = [O7] in { } let Defs = [O6], Uses = [O6] in { -def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt), - "!ADJCALLSTACKDOWN $amt", - [(callseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), + "!ADJCALLSTACKDOWN $amt1, $amt2", + [(callseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i32imm:$amt1, i32imm:$amt2), "!ADJCALLSTACKUP $amt1", [(callseq_end timm:$amt1, timm:$amt2)]>; diff --git a/lib/Target/Sparc/SparcRegisterInfo.td b/lib/Target/Sparc/SparcRegisterInfo.td index 6ecfddfc7d66..6625eaafd992 100644 --- a/lib/Target/Sparc/SparcRegisterInfo.td +++ b/lib/Target/Sparc/SparcRegisterInfo.td @@ -346,11 +346,13 @@ def I64Regs : RegisterClass<"SP", [i64], 64, (add IntRegs)>; // Floating point register classes. def FPRegs : RegisterClass<"SP", [f32], 32, (sequence "F%u", 0, 31)>; - def DFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 31)>; - def QFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 15)>; +// The Low?FPRegs classes are used only for inline-asm constraints. +def LowDFPRegs : RegisterClass<"SP", [f64], 64, (sequence "D%u", 0, 15)>; +def LowQFPRegs : RegisterClass<"SP", [f128], 128, (sequence "Q%u", 0, 7)>; + // Floating point control register classes. def FCCRegs : RegisterClass<"SP", [i1], 1, (sequence "FCC%u", 0, 3)>; diff --git a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp index 3f91ca9035a6..efcf6696fd50 100644 --- a/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp +++ b/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp @@ -262,6 +262,9 @@ public: bool isMemDisp20(MemoryKind MemKind, RegisterKind RegKind) const { return isMem(MemKind, RegKind) && inRange(Mem.Disp, -524288, 524287); } + bool isMemDisp12Len4(RegisterKind RegKind) const { + return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x10); + } bool isMemDisp12Len8(RegisterKind RegKind) const { return isMemDisp12(BDLMem, RegKind) && inRange(Mem.Length.Imm, 1, 0x100); } @@ -347,6 +350,7 @@ public: bool isBDAddr64Disp20() const { return isMemDisp20(BDMem, ADDR64Reg); } bool isBDXAddr64Disp12() const { return isMemDisp12(BDXMem, ADDR64Reg); } bool isBDXAddr64Disp20() const { return isMemDisp20(BDXMem, ADDR64Reg); } + bool isBDLAddr64Disp12Len4() const { return isMemDisp12Len4(ADDR64Reg); } bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(ADDR64Reg); } bool isBDRAddr64Disp12() const { return isMemDisp12(BDRMem, ADDR64Reg); } bool isBDVAddr64Disp12() const { return isMemDisp12(BDVMem, ADDR64Reg); } diff --git a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp index a281a0aa6bcc..27fd70bc6092 100644 --- a/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp +++ b/lib/Target/SystemZ/Disassembler/SystemZDisassembler.cpp @@ -327,6 +327,18 @@ static DecodeStatus decodeBDXAddr20Operand(MCInst &Inst, uint64_t Field, return MCDisassembler::Success; } +static DecodeStatus decodeBDLAddr12Len4Operand(MCInst &Inst, uint64_t Field, + const unsigned *Regs) { + uint64_t Length = Field >> 16; + uint64_t Base = (Field >> 12) & 0xf; + uint64_t Disp = Field & 0xfff; + assert(Length < 16 && "Invalid BDLAddr12Len4"); + 
Inst.addOperand(MCOperand::createReg(Base == 0 ? 0 : Regs[Base])); + Inst.addOperand(MCOperand::createImm(Disp)); + Inst.addOperand(MCOperand::createImm(Length + 1)); + return MCDisassembler::Success; +} + static DecodeStatus decodeBDLAddr12Len8Operand(MCInst &Inst, uint64_t Field, const unsigned *Regs) { uint64_t Length = Field >> 16; @@ -399,6 +411,13 @@ static DecodeStatus decodeBDXAddr64Disp20Operand(MCInst &Inst, uint64_t Field, return decodeBDXAddr20Operand(Inst, Field, SystemZMC::GR64Regs); } +static DecodeStatus decodeBDLAddr64Disp12Len4Operand(MCInst &Inst, + uint64_t Field, + uint64_t Address, + const void *Decoder) { + return decodeBDLAddr12Len4Operand(Inst, Field, SystemZMC::GR64Regs); +} + static DecodeStatus decodeBDLAddr64Disp12Len8Operand(MCInst &Inst, uint64_t Field, uint64_t Address, diff --git a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp index 092eb4011adc..d188f56512ab 100644 --- a/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp +++ b/lib/Target/SystemZ/MCTargetDesc/SystemZMCCodeEmitter.cpp @@ -77,6 +77,9 @@ private: uint64_t getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; + uint64_t getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const; uint64_t getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum, SmallVectorImpl &Fixups, const MCSubtargetInfo &STI) const; @@ -219,6 +222,17 @@ getBDXAddr20Encoding(const MCInst &MI, unsigned OpNum, | ((Disp & 0xff000) >> 12); } +uint64_t SystemZMCCodeEmitter:: +getBDLAddr12Len4Encoding(const MCInst &MI, unsigned OpNum, + SmallVectorImpl &Fixups, + const MCSubtargetInfo &STI) const { + uint64_t Base = getMachineOpValue(MI, MI.getOperand(OpNum), Fixups, STI); + uint64_t Disp = getMachineOpValue(MI, MI.getOperand(OpNum + 1), Fixups, STI); + uint64_t Len = getMachineOpValue(MI, MI.getOperand(OpNum + 2), Fixups, STI) - 1; + assert(isUInt<4>(Base) && isUInt<12>(Disp) && isUInt<4>(Len)); + return (Len << 16) | (Base << 12) | Disp; +} + uint64_t SystemZMCCodeEmitter:: getBDLAddr12Len8Encoding(const MCInst &MI, unsigned OpNum, SmallVectorImpl &Fixups, diff --git a/lib/Target/SystemZ/README.txt b/lib/Target/SystemZ/README.txt index 86a1322c9e23..74cf653b9d95 100644 --- a/lib/Target/SystemZ/README.txt +++ b/lib/Target/SystemZ/README.txt @@ -63,7 +63,7 @@ via a register.) -- -We don't use ICM or STCM. +We don't use ICM, STCM, or CLM. 
-- diff --git a/lib/Target/SystemZ/SystemZFeatures.td b/lib/Target/SystemZ/SystemZFeatures.td index 716e5add8051..7bfa378aa85c 100644 --- a/lib/Target/SystemZ/SystemZFeatures.td +++ b/lib/Target/SystemZ/SystemZFeatures.td @@ -68,6 +68,11 @@ def FeaturePopulationCount : SystemZFeature< "Assume that the population-count facility is installed" >; +def FeatureMessageSecurityAssist4 : SystemZFeature< + "message-security-assist-extension4", "MessageSecurityAssist4", + "Assume that the message-security-assist extension facility 4 is installed" +>; + def Arch9NewFeatures : SystemZFeatureList<[ FeatureDistinctOps, FeatureFastSerialization, @@ -75,7 +80,8 @@ def Arch9NewFeatures : SystemZFeatureList<[ FeatureHighWord, FeatureInterlockedAccess1, FeatureLoadStoreOnCond, - FeaturePopulationCount + FeaturePopulationCount, + FeatureMessageSecurityAssist4 ]>; //===----------------------------------------------------------------------===// @@ -133,6 +139,11 @@ def FeatureLoadStoreOnCond2 : SystemZFeature< "Assume that the load/store-on-condition facility 2 is installed" >; +def FeatureMessageSecurityAssist5 : SystemZFeature< + "message-security-assist-extension5", "MessageSecurityAssist5", + "Assume that the message-security-assist extension facility 5 is installed" +>; + def FeatureVector : SystemZFeature< "vector", "Vector", "Assume that the vector facility is installed" >; @@ -142,6 +153,7 @@ def FeatureNoVector : SystemZMissingFeature<"Vector">; def Arch11NewFeatures : SystemZFeatureList<[ FeatureLoadAndZeroRightmostByte, FeatureLoadStoreOnCond2, + FeatureMessageSecurityAssist5, FeatureVector ]>; diff --git a/lib/Target/SystemZ/SystemZISelLowering.cpp b/lib/Target/SystemZ/SystemZISelLowering.cpp index 6989aabb8c6a..235e095f0010 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.cpp +++ b/lib/Target/SystemZ/SystemZISelLowering.cpp @@ -1110,9 +1110,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI, // Mark the start of the call. if (!IsTailCall) - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getConstant(NumBytes, DL, PtrVT, true), - DL); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, DL); // Copy argument values to their designated locations. SmallVector<std::pair<unsigned, SDValue>, 9> RegsToPass; @@ -6354,3 +6352,12 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter( llvm_unreachable("Unexpected instr type to insert"); } } + +// This is only used by the isel schedulers, and is needed only to prevent +// the compiler from crashing when list-ilp is used.
+const TargetRegisterClass * +SystemZTargetLowering::getRepRegClassFor(MVT VT) const { + if (VT == MVT::Untyped) + return &SystemZ::ADDR128BitRegClass; + return TargetLowering::getRepRegClassFor(VT); +} diff --git a/lib/Target/SystemZ/SystemZISelLowering.h b/lib/Target/SystemZ/SystemZISelLowering.h index 1c34dc43e8bb..79c8c4d92669 100644 --- a/lib/Target/SystemZ/SystemZISelLowering.h +++ b/lib/Target/SystemZ/SystemZISelLowering.h @@ -590,6 +590,8 @@ private: MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI, MachineBasicBlock *MBB, unsigned Opcode) const; + + const TargetRegisterClass *getRepRegClassFor(MVT VT) const override; }; } // end namespace llvm diff --git a/lib/Target/SystemZ/SystemZInstrFP.td b/lib/Target/SystemZ/SystemZInstrFP.td index bb6d27e24828..364b81f98eed 100644 --- a/lib/Target/SystemZ/SystemZInstrFP.td +++ b/lib/Target/SystemZ/SystemZInstrFP.td @@ -458,6 +458,12 @@ def DXBR : BinaryRRE<"dxbr", 0xB34D, fdiv, FP128, FP128>; def DEB : BinaryRXE<"deb", 0xED0D, fdiv, FP32, load, 4>; def DDB : BinaryRXE<"ddb", 0xED1D, fdiv, FP64, load, 8>; +// Divide to integer. +let Defs = [CC] in { + def DIEBR : TernaryRRFb<"diebr", 0xB353, FP32, FP32, FP32>; + def DIDBR : TernaryRRFb<"didbr", 0xB35B, FP64, FP64, FP64>; +} + //===----------------------------------------------------------------------===// // Comparisons //===----------------------------------------------------------------------===// @@ -469,6 +475,13 @@ let Defs = [CC], CCValues = 0xF in { def CEB : CompareRXE<"ceb", 0xED09, z_fcmp, FP32, load, 4>; def CDB : CompareRXE<"cdb", 0xED19, z_fcmp, FP64, load, 8>; + + def KEBR : CompareRRE<"kebr", 0xB308, null_frag, FP32, FP32>; + def KDBR : CompareRRE<"kdbr", 0xB318, null_frag, FP64, FP64>; + def KXBR : CompareRRE<"kxbr", 0xB348, null_frag, FP128, FP128>; + + def KEB : CompareRXE<"keb", 0xED08, null_frag, FP32, load, 4>; + def KDB : CompareRXE<"kdb", 0xED18, null_frag, FP64, load, 8>; } // Test Data Class. 
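The DIEBR/DIDBR definitions above add the "divide to integer" operations, which produce a remainder together with an integral quotient and set CC. As a loose standard-C++ analogy only (std::remquo rounds the quotient to nearest and exposes just its low-order bits, so this is not the exact ISA semantics):

#include <cmath>
#include <cstdio>

int main() {
  double Dividend = 10.5, Divisor = 3.0;
  int Quo = 0;
  // One operation yields both the remainder and a (partial) quotient.
  double Rem = std::remquo(Dividend, Divisor, &Quo);
  std::printf("quotient bits=%d remainder=%g\n", Quo, Rem);
  return 0;
}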
diff --git a/lib/Target/SystemZ/SystemZInstrFormats.td b/lib/Target/SystemZ/SystemZInstrFormats.td index c727f486087e..a37da2807854 100644 --- a/lib/Target/SystemZ/SystemZInstrFormats.td +++ b/lib/Target/SystemZ/SystemZInstrFormats.td @@ -710,6 +710,21 @@ class InstRSI op, dag outs, dag ins, string asmstr, list pattern> let Inst{15-0} = RI2; } +class InstRSLa op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<20> BDL1; + + let Inst{47-40} = op{15-8}; + let Inst{39-36} = BDL1{19-16}; + let Inst{35-32} = 0; + let Inst{31-16} = BDL1{15-0}; + let Inst{15-8} = 0; + let Inst{7-0} = op{7-0}; +} + class InstRSYa op, dag outs, dag ins, string asmstr, list pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -817,6 +832,37 @@ class InstSSa op, dag outs, dag ins, string asmstr, list pattern> let Inst{15-0} = BD2; } +class InstSSb op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<20> BDL1; + bits<20> BDL2; + + let Inst{47-40} = op; + let Inst{39-36} = BDL1{19-16}; + let Inst{35-32} = BDL2{19-16}; + let Inst{31-16} = BDL1{15-0}; + let Inst{15-0} = BDL2{15-0}; +} + +class InstSSc op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<20> BDL1; + bits<16> BD2; + bits<4> I3; + + let Inst{47-40} = op; + let Inst{39-36} = BDL1{19-16}; + let Inst{35-32} = I3; + let Inst{31-16} = BDL1{15-0}; + let Inst{15-0} = BD2; +} + class InstSSd op, dag outs, dag ins, string asmstr, list pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -850,6 +896,20 @@ class InstSSe op, dag outs, dag ins, string asmstr, list pattern> let Inst{15-0} = BD4; } +class InstSSf op, dag outs, dag ins, string asmstr, list pattern> + : InstSystemZ<6, outs, ins, asmstr, pattern> { + field bits<48> Inst; + field bits<48> SoftFail = 0; + + bits<16> BD1; + bits<24> BDL2; + + let Inst{47-40} = op; + let Inst{39-32} = BDL2{23-16}; + let Inst{31-16} = BD1; + let Inst{15-0} = BDL2{15-0}; +} + class InstSSE op, dag outs, dag ins, string asmstr, list pattern> : InstSystemZ<6, outs, ins, asmstr, pattern> { field bits<48> Inst; @@ -1567,6 +1627,9 @@ class ICV // Inherent: // One register output operand and no input operands. // +// InherentDual: +// Two register output operands and no input operands. +// // StoreInherent: // One address operand. The instruction stores to the address. // @@ -1642,8 +1705,9 @@ class ICV // Two input operands and an implicit CC output operand. // // Test: -// Two input operands and an implicit CC output operand. The second -// input operand is an "address" operand used as a test class mask. +// One or two input operands and an implicit CC output operand. If +// present, the second input operand is an "address" operand used as +// a test class mask. // // Ternary: // One register output operand and three input operands. 
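The SSb format class defined above distributes each 20-bit BDL operand (len:4 | base:4 | disp:12) across the 48-bit word: the length nibbles land in bits 39-36 and 35-32, the base+displacement halves in bits 31-16 and 15-0. A standalone sketch of that assembly, using the PACK opcode 0xF2 from later in this patch (emitSSb is an illustrative helper, not patch code):

#include <cstdint>
#include <cstdio>

// Assemble a 48-bit SSb instruction word from an opcode and two 20-bit
// BDL operand fields, following the InstSSb bit assignments.
static uint64_t emitSSb(uint8_t Op, uint32_t BDL1, uint32_t BDL2) {
  uint64_t Inst = 0;
  Inst |= (uint64_t)Op << 40;                   // Inst{47-40} = op
  Inst |= (uint64_t)((BDL1 >> 16) & 0xf) << 36; // Inst{39-36} = BDL1{19-16}
  Inst |= (uint64_t)((BDL2 >> 16) & 0xf) << 32; // Inst{35-32} = BDL2{19-16}
  Inst |= (uint64_t)(BDL1 & 0xffff) << 16;      // Inst{31-16} = BDL1{15-0}
  Inst |= (uint64_t)(BDL2 & 0xffff);            // Inst{15-0}  = BDL2{15-0}
  return Inst; // only the low 48 bits are meaningful
}

int main() {
  std::printf("%012llx\n",
              (unsigned long long)emitSSb(0xF2, 0x3F123, 0x0E456));
  return 0;
}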
@@ -1691,6 +1755,10 @@ class InherentRRE opcode, RegisterOperand cls, let R2 = 0; } +class InherentDualRRE opcode, RegisterOperand cls> + : InstRRE; + class InherentVRIa opcode, bits<16> value> : InstVRIa { let I2 = value; @@ -1714,6 +1782,12 @@ class SideEffectInherentS opcode, let BD2 = 0; } +class SideEffectInherentRRE opcode> + : InstRRE { + let R1 = 0; + let R2 = 0; +} + // Allow an optional TLS marker symbol to generate TLS call relocations. class CallRI opcode> : InstRIb rsOpcode, } } +class LoadMultipleSSe opcode, RegisterOperand cls> + : InstSSe { + let mayLoad = 1; +} + class LoadMultipleVRSa opcode> : InstVRSa { @@ -2355,6 +2436,15 @@ class UnaryRRE opcode, SDPatternOperator operator, let OpType = "reg"; } +class UnaryMemRRFc opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRFc { + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; + let M3 = 0; +} + class UnaryRI opcode, SDPatternOperator operator, RegisterOperand cls, Immediate imm> : InstRIa opcode, : InstIE; +class SideEffectBinarySI opcode, Operand imm> + : InstSI; + class SideEffectBinarySIL opcode, SDPatternOperator operator, Immediate imm> : InstSIL; +class SideEffectBinarySSa opcode> + : InstSSa; + +class SideEffectBinarySSb opcode> + : InstSSb; + +class SideEffectBinarySSf opcode> + : InstSSf; + +class SideEffectBinaryMemMemRR opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRR { + let Constraints = "$R1 = $R1src, $R2 = $R2src"; + let DisableEncoding = "$R1src, $R2src"; +} + +class SideEffectBinaryMemRRE opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRE { + let Constraints = "$R2 = $R2src"; + let DisableEncoding = "$R2src"; +} + +class SideEffectBinaryMemMemRRE opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRE { + let Constraints = "$R1 = $R1src, $R2 = $R2src"; + let DisableEncoding = "$R1src, $R2src"; +} + +class SideEffectBinaryMemMemRRFc opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRRFc { + let Constraints = "$R1 = $R1src, $R2 = $R2src"; + let DisableEncoding = "$R1src, $R2src"; + let M3 = 0; +} + class BinaryRR opcode, SDPatternOperator operator, RegisterOperand cls1, RegisterOperand cls2> : InstRR opcode, SDPatternOperator operator, let M4 = 0; } +class BinaryMemRRFc opcode, + RegisterOperand cls1, RegisterOperand cls2, Immediate imm> + : InstRRFc { + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; +} + +multiclass BinaryMemRRFcOpt opcode, + RegisterOperand cls1, RegisterOperand cls2> { + def "" : BinaryMemRRFc; + def Opt : UnaryMemRRFc; +} + class BinaryRRFe opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRRFe opcode, SDPatternOperator operator, let AccessBytes = bytes; } +class StoreBinaryRS opcode, RegisterOperand cls, + bits<5> bytes, AddressingMode mode = bdaddr12only> + : InstRSb { + let mayStore = 1; + let AccessBytes = bytes; +} + +class StoreBinaryRSY opcode, RegisterOperand cls, + bits<5> bytes, AddressingMode mode = bdaddr20only> + : InstRSYb { + let mayStore = 1; + let AccessBytes = bytes; +} + +multiclass StoreBinaryRSPair rsOpcode, + bits<16> rsyOpcode, RegisterOperand cls, + bits<5> bytes> { + let DispKey = mnemonic ## #cls in { + let DispSize = "12" in + def "" : StoreBinaryRS; + let DispSize = "20" in + def Y : StoreBinaryRSY; + } +} + class StoreBinaryVRV opcode, bits<5> bytes, Immediate index> : InstVRV rxOpcode, bits<16> rxyOpcode, } } +class CompareRS opcode, RegisterOperand cls, + bits<5> bytes, AddressingMode mode = bdaddr12only> + : InstRSb { + let mayLoad = 1; + let 
AccessBytes = bytes; +} + +class CompareRSY opcode, RegisterOperand cls, + bits<5> bytes, AddressingMode mode = bdaddr20only> + : InstRSYb { + let mayLoad = 1; + let AccessBytes = bytes; +} + +multiclass CompareRSPair rsOpcode, bits<16> rsyOpcode, + RegisterOperand cls, bits<5> bytes> { + let DispKey = mnemonic ## #cls in { + let DispSize = "12" in + def "" : CompareRS; + let DispSize = "20" in + def Y : CompareRSY; + } +} + +class CompareSSb opcode> + : InstSSb { + let isCompare = 1; + let mayLoad = 1; +} + class CompareSI opcode, SDPatternOperator operator, SDPatternOperator load, Immediate imm, AddressingMode mode = bdaddr12only> @@ -3313,18 +3529,68 @@ class TestRXE opcode, SDPatternOperator operator, let M3 = 0; } +class TestRSL opcode> + : InstRSLa { + let mayLoad = 1; +} + +class SideEffectTernarySSc opcode> + : InstSSc; + +class SideEffectTernaryMemMemMemRRFb opcode, + RegisterOperand cls1, + RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFb { + let Constraints = "$R1 = $R1src, $R2 = $R2src, $R3 = $R3src"; + let DisableEncoding = "$R1src, $R2src, $R3src"; + let M4 = 0; +} + class SideEffectTernaryRRFc opcode, RegisterOperand cls1, RegisterOperand cls2, Immediate imm> : InstRRFc; +class SideEffectTernaryMemMemRRFc opcode, + RegisterOperand cls1, RegisterOperand cls2, + Immediate imm> + : InstRRFc { + let Constraints = "$R1 = $R1src, $R2 = $R2src"; + let DisableEncoding = "$R1src, $R2src"; +} + +multiclass SideEffectTernaryMemMemRRFcOpt opcode, + RegisterOperand cls1, + RegisterOperand cls2> { + def "" : SideEffectTernaryMemMemRRFc; + def Opt : SideEffectBinaryMemMemRRFc; +} + class SideEffectTernarySSF opcode, RegisterOperand cls> : InstSSF; +class TernaryRRFb opcode, + RegisterOperand cls1, RegisterOperand cls2, + RegisterOperand cls3> + : InstRRFb { + let Constraints = "$R1 = $R1src"; + let DisableEncoding = "$R1src"; +} + class TernaryRRFe opcode, RegisterOperand cls1, RegisterOperand cls2> : InstRRFe rsOpcode, bits<16> rsyOpcode, } } +class SideEffectTernaryMemMemRS opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRSa { + let Constraints = "$R1 = $R1src, $R3 = $R3src"; + let DisableEncoding = "$R1src, $R3src"; +} + +class SideEffectTernaryMemMemRSY opcode, + RegisterOperand cls1, RegisterOperand cls2> + : InstRSYa { + let Constraints = "$R1 = $R1src, $R3 = $R3src"; + let DisableEncoding = "$R1src, $R3src"; +} + class TernaryRXF opcode, SDPatternOperator operator, RegisterOperand cls, SDPatternOperator load, bits<5> bytes> : InstRXF // another instruction to handle the excess. multiclass MemorySS opcode, SDPatternOperator sequence, SDPatternOperator loop> { - def "" : InstSSa; + def "" : SideEffectBinarySSa; let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in { def Sequence : Pseudo<(outs), (ins bdaddr12only:$dest, bdaddr12only:$src, imm64:$length), @@ -4003,13 +4285,8 @@ multiclass MemorySS opcode, // the full loop (the main instruction plus the branch on CC==3). 
multiclass StringRRE opcode, SDPatternOperator operator> { - def "" : InstRRE { - let Uses = [R0L]; - let Constraints = "$R1 = $R1src, $R2 = $R2src"; - let DisableEncoding = "$R1src, $R2src"; - } + let Uses = [R0L] in + def "" : SideEffectBinaryMemMemRRE; let usesCustomInserter = 1, hasNoSchedulingInfo = 1 in def Loop : Pseudo<(outs GR64:$end), (ins GR64:$start1, GR64:$start2, GR32:$char), diff --git a/lib/Target/SystemZ/SystemZInstrInfo.td b/lib/Target/SystemZ/SystemZInstrInfo.td index d63525f29412..fa5ecdd85243 100644 --- a/lib/Target/SystemZ/SystemZInstrInfo.td +++ b/lib/Target/SystemZ/SystemZInstrInfo.td @@ -12,8 +12,8 @@ //===----------------------------------------------------------------------===// let hasNoSchedulingInfo = 1 in { - def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt), - [(callseq_start timm:$amt)]>; + def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), + [(callseq_start timm:$amt1, timm:$amt2)]>; def ADJCALLSTACKUP : Pseudo<(outs), (ins i64imm:$amt1, i64imm:$amt2), [(callseq_end timm:$amt1, timm:$amt2)]>; } @@ -464,6 +464,11 @@ def MVGHI : StoreSIL<"mvghi", 0xE548, store, imm64sx16>; // Memory-to-memory moves. let mayLoad = 1, mayStore = 1 in defm MVC : MemorySS<"mvc", 0xD2, z_mvc, z_mvc_loop>; +let mayLoad = 1, mayStore = 1, Defs = [CC] in { + def MVCL : SideEffectBinaryMemMemRR<"mvcl", 0x0E, GR128, GR128>; + def MVCLE : SideEffectTernaryMemMemRS<"mvcle", 0xA8, GR128, GR128>; + def MVCLU : SideEffectTernaryMemMemRSY<"mvclu", 0xEB8E, GR128, GR128>; +} // String moves. let mayLoad = 1, mayStore = 1, Defs = [CC] in @@ -707,6 +712,10 @@ def : StoreGR64PC; defm : StoreGR64Pair; def : StoreGR64PC; +// Store characters under mask -- not (yet) used for codegen. +defm STCM : StoreBinaryRSPair<"stcm", 0xBE, 0xEB2D, GR32, 0>; +def STCMH : StoreBinaryRSY<"stcmh", 0xEB2C, GRH32, 0>; + //===----------------------------------------------------------------------===// // Multi-register moves //===----------------------------------------------------------------------===// @@ -715,6 +724,7 @@ def : StoreGR64PC; defm LM : LoadMultipleRSPair<"lm", 0x98, 0xEB98, GR32>; def LMG : LoadMultipleRSY<"lmg", 0xEB04, GR64>; def LMH : LoadMultipleRSY<"lmh", 0xEB96, GRH32>; +def LMD : LoadMultipleSSe<"lmd", 0xEF, GR64>; // Multi-register stores. defm STM : StoreMultipleRSPair<"stm", 0x90, 0xEB90, GR32>; @@ -742,6 +752,10 @@ def STRVH : StoreRXY<"strvh", 0xE33F, z_strvh, GR32, 2>; def STRV : StoreRXY<"strv", 0xE33E, z_strv, GR32, 4>; def STRVG : StoreRXY<"strvg", 0xE32F, z_strvg, GR64, 8>; +// Byte-swapping memory-to-memory moves. +let mayLoad = 1, mayStore = 1 in + def MVCIN : SideEffectBinarySSa<"mvcin", 0xE8>; + //===----------------------------------------------------------------------===// // Load address instructions //===----------------------------------------------------------------------===// @@ -816,6 +830,7 @@ defm : InsertMem<"inserti8", IC32Y, GR32, azextloadi8, bdxaddr20pair>; defm : InsertMem<"inserti8", IC, GR64, azextloadi8, bdxaddr12pair>; defm : InsertMem<"inserti8", ICY, GR64, azextloadi8, bdxaddr20pair>; +// Insert characters under mask -- not (yet) used for codegen. let Defs = [CC] in { defm ICM : TernaryRSPair<"icm", 0xBF, 0xEB81, GR32, 0>; def ICMH : TernaryRSY<"icmh", 0xEB80, GRH32, 0>; @@ -919,6 +934,10 @@ let Defs = [CC] in { defm AL : BinaryRXPair<"al", 0x5E, 0xE35E, addc, GR32, load, 4>; def ALGF : BinaryRXY<"algf", 0xE31A, addc, GR64, azextloadi32, 4>; def ALG : BinaryRXY<"alg", 0xE30A, addc, GR64, load, 8>; + + // Addition to memory. 
+ def ALSI : BinarySIY<"alsi", 0xEB6E, null_frag, imm32sx8>; + def ALGSI : BinarySIY<"algsi", 0xEB7E, null_frag, imm64sx8>; } defm : ZXB; @@ -1166,9 +1185,14 @@ def MSGF : BinaryRXY<"msgf", 0xE31C, mul, GR64, asextloadi32, 4>; def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>; // Multiplication of a register, producing two results. +def MR : BinaryRR <"mr", 0x1C, null_frag, GR128, GR32>; +def MLR : BinaryRRE<"mlr", 0xB996, null_frag, GR128, GR32>; def MLGR : BinaryRRE<"mlgr", 0xB986, z_umul_lohi64, GR128, GR64>; // Multiplication of memory, producing two results. +def M : BinaryRX <"m", 0x5C, null_frag, GR128, load, 4>; +def MFY : BinaryRXY<"mfy", 0xE35C, null_frag, GR128, load, 4>; +def ML : BinaryRXY<"ml", 0xE396, null_frag, GR128, load, 4>; def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>; //===----------------------------------------------------------------------===// @@ -1177,12 +1201,14 @@ def MLG : BinaryRXY<"mlg", 0xE386, z_umul_lohi64, GR128, load, 8>; let hasSideEffects = 1 in { // Do not speculatively execute. // Division and remainder, from registers. + def DR : BinaryRR <"dr", 0x1D, null_frag, GR128, GR32>; def DSGFR : BinaryRRE<"dsgfr", 0xB91D, z_sdivrem32, GR128, GR32>; def DSGR : BinaryRRE<"dsgr", 0xB90D, z_sdivrem64, GR128, GR64>; def DLR : BinaryRRE<"dlr", 0xB997, z_udivrem32, GR128, GR32>; def DLGR : BinaryRRE<"dlgr", 0xB987, z_udivrem64, GR128, GR64>; // Division and remainder, from memory. + def D : BinaryRX <"d", 0x5D, null_frag, GR128, load, 4>; def DSGF : BinaryRXY<"dsgf", 0xE31D, z_sdivrem32, GR128, load, 4>; def DSG : BinaryRXY<"dsg", 0xE30D, z_sdivrem64, GR128, load, 8>; def DL : BinaryRXY<"dl", 0xE397, z_udivrem32, GR128, load, 4>; @@ -1193,23 +1219,32 @@ let hasSideEffects = 1 in { // Do not speculatively execute. // Shifts //===----------------------------------------------------------------------===// -// Shift left. +// Logical shift left. let hasSideEffects = 0 in { defm SLL : BinaryRSAndK<"sll", 0x89, 0xEBDF, shl, GR32>; - defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>; def SLLG : BinaryRSY<"sllg", 0xEB0D, shl, GR64>; + def SLDL : BinaryRS<"sldl", 0x8D, null_frag, GR128>; +} + +// Arithmetic shift left. +let Defs = [CC] in { + defm SLA : BinaryRSAndK<"sla", 0x8B, 0xEBDD, null_frag, GR32>; + def SLAG : BinaryRSY<"slag", 0xEB0B, null_frag, GR64>; + def SLDA : BinaryRS<"slda", 0x8F, null_frag, GR128>; } // Logical shift right. let hasSideEffects = 0 in { defm SRL : BinaryRSAndK<"srl", 0x88, 0xEBDE, srl, GR32>; def SRLG : BinaryRSY<"srlg", 0xEB0C, srl, GR64>; + def SRDL : BinaryRS<"srdl", 0x8C, null_frag, GR128>; } // Arithmetic shift right. let Defs = [CC], CCValues = 0xE, CompareZeroCCMask = 0xE in { defm SRA : BinaryRSAndK<"sra", 0x8A, 0xEBDC, sra, GR32>; def SRAG : BinaryRSY<"srag", 0xEB0A, sra, GR64>; + def SRDA : BinaryRS<"srda", 0x8E, null_frag, GR128>; } // Rotate left. @@ -1351,8 +1386,12 @@ let Defs = [CC], CCValues = 0xE, IsLogical = 1 in { defm : ZXB; // Memory-to-memory comparison. -let mayLoad = 1, Defs = [CC] in +let mayLoad = 1, Defs = [CC] in { defm CLC : MemorySS<"clc", 0xD5, z_clc, z_clc_loop>; + def CLCL : SideEffectBinaryMemMemRR<"clcl", 0x0F, GR128, GR128>; + def CLCLE : SideEffectTernaryMemMemRS<"clcle", 0xA9, GR128, GR128>; + def CLCLU : SideEffectTernaryMemMemRSY<"clclu", 0xEB8F, GR128, GR128>; +} // String comparison. 
let mayLoad = 1, Defs = [CC] in @@ -1381,6 +1420,12 @@ let Defs = [CC] in { def TML : InstAlias<"tml\t$R, $I", (TMLL GR32:$R, imm32ll16:$I), 0>; def TMH : InstAlias<"tmh\t$R, $I", (TMLH GR32:$R, imm32lh16:$I), 0>; +// Compare logical characters under mask -- not (yet) used for codegen. +let Defs = [CC] in { + defm CLM : CompareRSPair<"clm", 0xBD, 0xEB21, GR32, 0>; + def CLMH : CompareRSY<"clmh", 0xEB20, GRH32, 0>; +} + //===----------------------------------------------------------------------===// // Prefetch and execution hint //===----------------------------------------------------------------------===// @@ -1580,6 +1625,115 @@ let Predicates = [FeatureInterlockedAccess1], Defs = [CC] in { def LPDG : BinarySSF<"lpdg", 0xC85, GR128>; } +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +let mayLoad = 1, mayStore = 1 in + def TR : SideEffectBinarySSa<"tr", 0xDC>; + +let mayLoad = 1, Defs = [CC, R0L, R1D] in { + def TRT : SideEffectBinarySSa<"trt", 0xDD>; + def TRTR : SideEffectBinarySSa<"trtr", 0xD0>; +} + +let mayLoad = 1, mayStore = 1, Uses = [R0L] in + def TRE : SideEffectBinaryMemMemRRE<"tre", 0xB2A5, GR128, GR64>; + +let mayLoad = 1, Uses = [R1D], Defs = [CC] in { + defm TRTE : BinaryMemRRFcOpt<"trte", 0xB9BF, GR128, GR64>; + defm TRTRE : BinaryMemRRFcOpt<"trtre", 0xB9BD, GR128, GR64>; +} + +let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { + defm TROO : SideEffectTernaryMemMemRRFcOpt<"troo", 0xB993, GR128, GR64>; + defm TROT : SideEffectTernaryMemMemRRFcOpt<"trot", 0xB992, GR128, GR64>; + defm TRTO : SideEffectTernaryMemMemRRFcOpt<"trto", 0xB991, GR128, GR64>; + defm TRTT : SideEffectTernaryMemMemRRFcOpt<"trtt", 0xB990, GR128, GR64>; +} + +let mayLoad = 1, mayStore = 1, Defs = [CC] in { + defm CU12 : SideEffectTernaryMemMemRRFcOpt<"cu12", 0xB2A7, GR128, GR128>; + defm CU14 : SideEffectTernaryMemMemRRFcOpt<"cu14", 0xB9B0, GR128, GR128>; + defm CU21 : SideEffectTernaryMemMemRRFcOpt<"cu21", 0xB2A6, GR128, GR128>; + defm CU24 : SideEffectTernaryMemMemRRFcOpt<"cu24", 0xB9B1, GR128, GR128>; + def CU41 : SideEffectBinaryMemMemRRE<"cu41", 0xB9B2, GR128, GR128>; + def CU42 : SideEffectBinaryMemMemRRE<"cu42", 0xB9B3, GR128, GR128>; + + let isAsmParserOnly = 1 in { + defm CUUTF : SideEffectTernaryMemMemRRFcOpt<"cuutf", 0xB2A6, GR128, GR128>; + defm CUTFU : SideEffectTernaryMemMemRRFcOpt<"cutfu", 0xB2A7, GR128, GR128>; + } +} + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +let mayLoad = 1, mayStore = 1, Uses = [R0L, R1D], Defs = [CC] in { + def KM : SideEffectBinaryMemMemRRE<"km", 0xB92E, GR128, GR128>; + def KMC : SideEffectBinaryMemMemRRE<"kmc", 0xB92F, GR128, GR128>; + + def KIMD : SideEffectBinaryMemRRE<"kimd", 0xB93E, GR64, GR128>; + def KLMD : SideEffectBinaryMemRRE<"klmd", 0xB93F, GR64, GR128>; + def KMAC : SideEffectBinaryMemRRE<"kmac", 0xB91E, GR64, GR128>; + + let Predicates = [FeatureMessageSecurityAssist4] in { + def KMF : SideEffectBinaryMemMemRRE<"kmf", 0xB92A, GR128, GR128>; + def KMO : SideEffectBinaryMemMemRRE<"kmo", 0xB92B, GR128, GR128>; + def KMCTR : SideEffectTernaryMemMemMemRRFb<"kmctr", 0xB92D, + GR128, GR128, GR128>; + def PCC : SideEffectInherentRRE<"pcc", 0xB92C>; + } + let Predicates = [FeatureMessageSecurityAssist5] in + def PPNO : SideEffectBinaryMemMemRRE<"ppno", 
0xB93C, GR128, GR128>; +} + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +defm CVB : BinaryRXPair<"cvb",0x4F, 0xE306, null_frag, GR32, load, 4>; +def CVBG : BinaryRXY<"cvbg", 0xE30E, null_frag, GR64, load, 8>; + +defm CVD : StoreRXPair<"cvd", 0x4E, 0xE326, null_frag, GR32, 4>; +def CVDG : StoreRXY<"cvdg", 0xE32E, null_frag, GR64, 8>; + +let mayLoad = 1, mayStore = 1 in { + def MVN : SideEffectBinarySSa<"mvn", 0xD1>; + def MVZ : SideEffectBinarySSa<"mvz", 0xD3>; + def MVO : SideEffectBinarySSb<"mvo", 0xF1>; + + def PACK : SideEffectBinarySSb<"pack", 0xF2>; + def PKA : SideEffectBinarySSf<"pka", 0xE9>; + def PKU : SideEffectBinarySSf<"pku", 0xE1>; + def UNPK : SideEffectBinarySSb<"unpk", 0xF3>; + let Defs = [CC] in { + def UNPKA : SideEffectBinarySSa<"unpka", 0xEA>; + def UNPKU : SideEffectBinarySSa<"unpku", 0xE2>; + } +} + +let mayLoad = 1, mayStore = 1 in { + let Defs = [CC] in { + def AP : SideEffectBinarySSb<"ap", 0xFA>; + def SP : SideEffectBinarySSb<"sp", 0xFB>; + def ZAP : SideEffectBinarySSb<"zap", 0xF8>; + def SRP : SideEffectTernarySSc<"srp", 0xF0>; + } + def MP : SideEffectBinarySSb<"mp", 0xFC>; + def DP : SideEffectBinarySSb<"dp", 0xFD>; + let Defs = [CC] in { + def ED : SideEffectBinarySSa<"ed", 0xDE>; + def EDMK : SideEffectBinarySSa<"edmk", 0xDF>; + } +} + +let Defs = [CC] in { + def CP : CompareSSb<"cp", 0xF9>; + def TP : TestRSL<"tp", 0xEBC0>; +} + //===----------------------------------------------------------------------===// // Access registers //===----------------------------------------------------------------------===// @@ -1712,12 +1866,39 @@ let usesCustomInserter = 1 in { // Search a block of memory for a character. let mayLoad = 1, Defs = [CC] in - defm SRST : StringRRE<"srst", 0xb25e, z_search_string>; + defm SRST : StringRRE<"srst", 0xB25E, z_search_string>; +let mayLoad = 1, Defs = [CC], Uses = [R0L] in + def SRSTU : SideEffectBinaryMemMemRRE<"srstu", 0xB9BE, GR64, GR64>; + +// Compare until substring equal. +let mayLoad = 1, Defs = [CC], Uses = [R0L, R1L] in + def CUSE : SideEffectBinaryMemMemRRE<"cuse", 0xB257, GR128, GR128>; + +// Compare and form codeword. +let mayLoad = 1, Defs = [CC, R1D, R2D, R3D], Uses = [R1D, R2D, R3D] in + def CFC : SideEffectAddressS<"cfc", 0xB21A, null_frag>; + +// Update tree. +let mayLoad = 1, mayStore = 1, Defs = [CC, R0D, R1D, R2D, R3D, R5D], + Uses = [R0D, R1D, R2D, R3D, R4D, R5D] in + def UPT : SideEffectInherentE<"upt", 0x0102>; + +// Checksum. +let mayLoad = 1, Defs = [CC] in + def CKSM : SideEffectBinaryMemMemRRE<"cksm", 0xB241, GR64, GR128>; + +// Compression call. +let mayLoad = 1, mayStore = 1, Defs = [CC, R1D], Uses = [R0L, R1D] in + def CMPSC : SideEffectBinaryMemMemRRE<"cmpsc", 0xB263, GR128, GR128>; // Supervisor call. let hasSideEffects = 1, isCall = 1, Defs = [CC] in def SVC : SideEffectUnaryI<"svc", 0x0A, imm32zx8>; +// Monitor call. +let hasSideEffects = 1, isCall = 1 in + def MC : SideEffectBinarySI<"mc", 0xAF, imm32zx8>; + // Store clock. let hasSideEffects = 1, Defs = [CC] in { def STCK : StoreInherentS<"stck", 0xB205, null_frag, 8>; @@ -1729,10 +1910,18 @@ let hasSideEffects = 1, Defs = [CC] in { let hasSideEffects = 1, Uses = [R0D], Defs = [R0D, CC] in def STFLE : StoreInherentS<"stfle", 0xB2B0, null_frag, 0>; +// Extract CPU attribute. +let hasSideEffects = 1 in + def ECAG : BinaryRSY<"ecag", 0xEB4C, null_frag, GR64>; + // Extract CPU time. 
let Defs = [R0D, R1D], hasSideEffects = 1, mayLoad = 1 in def ECTG : SideEffectTernarySSF<"ectg", 0xC81, GR64>; +// Extract PSW. +let hasSideEffects = 1, Uses = [CC] in + def EPSW : InherentDualRRE<"epsw", 0xB98D, GR32>; + // Execute. let hasSideEffects = 1 in { def EX : SideEffectBinaryRX<"ex", 0x44, GR64>; diff --git a/lib/Target/SystemZ/SystemZOperands.td b/lib/Target/SystemZ/SystemZOperands.td index 7bb4fe5afb3f..713612129d90 100644 --- a/lib/Target/SystemZ/SystemZOperands.td +++ b/lib/Target/SystemZ/SystemZOperands.td @@ -531,6 +531,7 @@ def BDAddr64Disp12 : AddressAsmOperand<"BDAddr", "64", "12">; def BDAddr64Disp20 : AddressAsmOperand<"BDAddr", "64", "20">; def BDXAddr64Disp12 : AddressAsmOperand<"BDXAddr", "64", "12">; def BDXAddr64Disp20 : AddressAsmOperand<"BDXAddr", "64", "20">; +def BDLAddr64Disp12Len4 : AddressAsmOperand<"BDLAddr", "64", "12", "Len4">; def BDLAddr64Disp12Len8 : AddressAsmOperand<"BDLAddr", "64", "12", "Len8">; def BDRAddr64Disp12 : AddressAsmOperand<"BDRAddr", "64", "12">; def BDVAddr64Disp12 : AddressAsmOperand<"BDVAddr", "64", "12">; @@ -578,6 +579,7 @@ def bdxaddr20pair : BDXMode<"BDXAddr", "64", "20", "Pair">; def dynalloc12only : BDXMode<"DynAlloc", "64", "12", "Only">; def laaddr12pair : BDXMode<"LAAddr", "64", "12", "Pair">; def laaddr20pair : BDXMode<"LAAddr", "64", "20", "Pair">; +def bdladdr12onlylen4 : BDLMode<"BDLAddr", "64", "12", "Only", "4">; def bdladdr12onlylen8 : BDLMode<"BDLAddr", "64", "12", "Only", "8">; def bdraddr12only : BDRMode<"BDRAddr", "64", "12", "Only">; def bdvaddr12only : BDVMode< "64", "12">; diff --git a/lib/Target/SystemZ/SystemZOperators.td b/lib/Target/SystemZ/SystemZOperators.td index fde26ed4e1c5..adfc69c5d4cf 100644 --- a/lib/Target/SystemZ/SystemZOperators.td +++ b/lib/Target/SystemZ/SystemZOperators.td @@ -10,7 +10,8 @@ //===----------------------------------------------------------------------===// // Type profiles //===----------------------------------------------------------------------===// -def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>]>; +def SDT_CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i64>, + SDTCisVT<1, i64>]>; def SDT_CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i64>, SDTCisVT<1, i64>]>; def SDT_ZCall : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>; diff --git a/lib/Target/SystemZ/SystemZSchedule.td b/lib/Target/SystemZ/SystemZSchedule.td index dbba8ab42b5a..1ce0168f95e9 100644 --- a/lib/Target/SystemZ/SystemZSchedule.td +++ b/lib/Target/SystemZ/SystemZSchedule.td @@ -56,12 +56,16 @@ def LSU_lat1 : SchedWrite; // Floating point unit (zEC12 and earlier) def FPU : SchedWrite; def FPU2 : SchedWrite; +def DFU : SchedWrite; +def DFU2 : SchedWrite; // Vector sub units (z13) def VecBF : SchedWrite; def VecBF2 : SchedWrite; def VecDF : SchedWrite; def VecDF2 : SchedWrite; +def VecDFX : SchedWrite; +def VecDFX2 : SchedWrite; def VecFPd : SchedWrite; // Blocking BFP div/sqrt unit. 
def VecMul : SchedWrite; def VecStr : SchedWrite; diff --git a/lib/Target/SystemZ/SystemZScheduleZ13.td b/lib/Target/SystemZ/SystemZScheduleZ13.td index 7aee6f52e9a7..612c3b6cf96e 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ13.td +++ b/lib/Target/SystemZ/SystemZScheduleZ13.td @@ -76,6 +76,8 @@ def : WriteRes { let Latency = 8; } def : WriteRes { let Latency = 9; } def : WriteRes { let Latency = 8; } def : WriteRes { let Latency = 9; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 2; } def : WriteRes; // Move character def : InstRW<[FXb, LSU, LSU, LSU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>; // Pseudo -> reg move def : InstRW<[FXa], (instregex "COPY(_TO_REGCLASS)?$")>; @@ -268,6 +271,7 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "LLG(F|T)?AT$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "STCM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Multi-register moves @@ -277,6 +281,9 @@ def : InstRW<[FXb, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; +// Load multiple disjoint +def : InstRW<[FXb, Lat30, GroupAlone], (instregex "LMD$")>; + // Store multiple (estimated average of ceil(5/2) FXb ops) def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10, GroupAlone], (instregex "STM(G|H|Y)?$")>; @@ -288,6 +295,7 @@ def : InstRW<[LSU, LSU, FXb, FXb, FXb, Lat10, def : InstRW<[FXa], (instregex "LRV(G)?R$")>; def : InstRW<[FXa, LSU, Lat5], (instregex "LRV(G|H)?$")>; def : InstRW<[FXb, LSU, Lat5], (instregex "STRV(G|H)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>; //===----------------------------------------------------------------------===// // Load address instructions @@ -345,7 +353,7 @@ def : InstRW<[FXa], (instregex "ALGF(I|R)$")>; def : InstRW<[FXa], (instregex "ALGR(K)?$")>; def : InstRW<[FXa], (instregex "ALR(K)?$")>; def : InstRW<[FXa], (instregex "AR(K)?$")>; -def : InstRW<[FXb, LSU, Lat5], (instregex "A(G)?SI$")>; +def : InstRW<[FXb, LSU, Lat5], (instregex "A(L)?(G)?SI$")>; // Logical addition with carry def : InstRW<[FXa, LSU, Lat6, GroupAlone], (instregex "ALC(G)?$")>; @@ -438,11 +446,15 @@ def : InstRW<[FXa, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXa, Lat5], (instregex "MGHI$")>; def : InstRW<[FXa, Lat5], (instregex "MHI$")>; def : InstRW<[FXa, LSU, Lat9], (instregex "MH(Y)?$")>; +def : InstRW<[FXa, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXa, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; //===----------------------------------------------------------------------===// // Division and remainder //===----------------------------------------------------------------------===// +def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DR$")>; +def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "D$")>; def : InstRW<[FXa, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; def : InstRW<[LSU, FXa, Lat30, GroupAlone], (instregex "DSG(F)?$")>; def : InstRW<[FXa2, FXa2, Lat20, GroupAlone], (instregex "DLR$")>; @@ -456,7 +468,8 @@ def : InstRW<[FXa2, FXa2, LSU, Lat30, GroupAlone], (instregex "DL(G)?$")>; def : InstRW<[FXa], (instregex "SLL(G|K)?$")>; def : InstRW<[FXa], (instregex "SRL(G|K)?$")>; def : InstRW<[FXa], (instregex "SRA(G|K)?$")>; -def : InstRW<[FXa], (instregex "SLA(K)?$")>; +def : 
InstRW<[FXa], (instregex "SLA(G|K)?$")>; +def : InstRW<[FXa, FXa, FXa, FXa, Lat8], (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXa, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -505,7 +518,7 @@ def : InstRW<[FXb, Lat2], (instregex "CGFR$")>; // Compare logical character def : InstRW<[FXb, LSU, LSU, Lat9, BeginGroup], (instregex "CLC$")>; - +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>; // Test under mask @@ -516,6 +529,9 @@ def : InstRW<[FXb], (instregex "TMHL(64)?$")>; def : InstRW<[FXb], (instregex "TMLH(64)?$")>; def : InstRW<[FXb], (instregex "TMLL(64)?$")>; +// Compare logical characters under mask +def : InstRW<[FXb, LSU, Lat5], (instregex "CLM(H|Y)?$")>; + //===----------------------------------------------------------------------===// // Prefetch and execution hint //===----------------------------------------------------------------------===// @@ -562,6 +578,42 @@ def : InstRW<[FXb, FXb, LSU, Lat6, GroupAlone], (instregex "STPQ$")>; // Load pair disjoint def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXa, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC|PPNO)$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[FXb, VecDF, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; +def : InstRW<[FXb, VecDF, FXb, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; + +def : InstRW<[FXb, VecDFX, LSU, LSU, Lat9, GroupAlone], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[FXb, VecDFX2, LSU, LSU, Lat30, GroupAlone], + (instregex "(M|D)P$")>; +def : InstRW<[FXb, FXb, VecDFX2, LSU, LSU, LSU, Lat15, GroupAlone], + (instregex "SRP$")>; +def : InstRW<[VecDFX, LSU, LSU, Lat5, GroupAlone], (instregex "CP$")>; +def : InstRW<[VecDFX, LSU, Lat4, GroupAlone], (instregex "TP$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; + //===----------------------------------------------------------------------===// // Access registers //===----------------------------------------------------------------------===// @@ -640,13 +692,30 @@ def : InstRW<[FXa], (instregex "ZEXT128_(32|64)$")>; // String instructions def : InstRW<[FXa, LSU, Lat30], (instregex "SRST$")>; +def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[LSU, Lat30, GroupAlone], (instregex 
"CFC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; // Move with key def : InstRW<[FXa, FXa, FXb, LSU, Lat8, GroupAlone], (instregex "MVCK$")>; +// Monitor call +def : InstRW<[FXb], (instregex "MC$")>; + +// Extract CPU attribute +def : InstRW<[FXb, Lat30], (instregex "ECAG$")>; + // Extract CPU Time def : InstRW<[FXa, Lat5, LSU], (instregex "ECTG$")>; +// Extract PSW +def : InstRW<[FXb, Lat30], (instregex "EPSW$")>; + // Execute def : InstRW<[FXb, GroupAlone], (instregex "EX(RL)?$")>; @@ -811,14 +880,17 @@ def : InstRW<[VecFPd, LSU], (instregex "D(E|D)B$")>; def : InstRW<[VecFPd], (instregex "D(E|D)BR$")>; def : InstRW<[VecFPd, VecFPd, GroupAlone], (instregex "DXBR$")>; +// Divide to integer +def : InstRW<[VecFPd, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; + //===----------------------------------------------------------------------===// // FP: Comparisons //===----------------------------------------------------------------------===// // Compare -def : InstRW<[VecXsPm, LSU, Lat8], (instregex "C(E|D)B$")>; -def : InstRW<[VecXsPm, Lat4], (instregex "C(E|D)BR?$")>; -def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "CXBR$")>; +def : InstRW<[VecXsPm, LSU, Lat8], (instregex "(K|C)(E|D)B$")>; +def : InstRW<[VecXsPm, Lat4], (instregex "(K|C)(E|D)BR?$")>; +def : InstRW<[VecDF, VecDF, Lat20, GroupAlone], (instregex "(K|C)XBR$")>; // Test Data Class def : InstRW<[LSU, VecXsPm, Lat9], (instregex "TC(E|D)B$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZ196.td b/lib/Target/SystemZ/SystemZScheduleZ196.td index a950e54e7601..670df8ff5541 100644 --- a/lib/Target/SystemZ/SystemZScheduleZ196.td +++ b/lib/Target/SystemZ/SystemZScheduleZ196.td @@ -59,6 +59,7 @@ def : WriteRes { let Latency = 30; let NumMicroOps = 0;} def Z196_FXUnit : ProcResource<2>; def Z196_LSUnit : ProcResource<2>; def Z196_FPUnit : ProcResource<1>; +def Z196_DFUnit : ProcResource<1>; // Subtarget specific definitions of scheduling resources. 
def : WriteRes { let Latency = 1; } @@ -66,6 +67,8 @@ def : WriteRes { let Latency = 4; } def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 8; } def : WriteRes { let Latency = 9; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 3; } // -------------------------- INSTRUCTIONS ---------------------------------- // @@ -152,6 +155,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>; // Move character def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>; // Pseudo -> reg move def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>; @@ -226,6 +230,7 @@ def : InstRW<[LSU], (instregex "LLG(C|F|H|T|FRL|HRL)$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Multi-register moves @@ -235,6 +240,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; +// Load multiple disjoint +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "LMD$")>; + // Store multiple (estimated average of 3 ops) def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], (instregex "STM(H|Y|G)?$")>; @@ -246,6 +254,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], def : InstRW<[FXU], (instregex "LRV(G)?R$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>; //===----------------------------------------------------------------------===// // Load address instructions @@ -285,7 +294,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>; // Addition //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>; def : InstRW<[FXU, FXU, LSU, Lat6, GroupAlone], (instregex "AH(Y)?$")>; def : InstRW<[FXU], (instregex "AIH$")>; def : InstRW<[FXU], (instregex "AFI(Mux)?$")>; @@ -294,15 +303,14 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>; def : InstRW<[FXU], (instregex "AGR(K)?$")>; def : InstRW<[FXU], (instregex "AHI(K)?$")>; def : InstRW<[FXU], (instregex "AHIMux(K)?$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>; def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>; def : InstRW<[FXU], (instregex "ALGHSIK$")>; def : InstRW<[FXU], (instregex "ALGF(I|R)$")>; def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; // Logical addition with carry def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>; @@ -395,11 +403,17 @@ def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; +def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; 
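The instregex patterns in these schedule files are regular expressions matched against instruction names from the name's start, so "M(L)?R$" covers MR and MLR while "M(FY|L)?$" covers exactly M, MFY and ML. A quick standalone check, assuming std::regex full-match approximates TableGen's matching for these simple patterns:

#include <cstdio>
#include <regex>

int main() {
  // regex_match requires a full match, standing in for the anchored
  // matching that instregex performs (the '$' becomes implicit here).
  const std::regex MulMem("M(FY|L)?"), MulReg("M(L)?R");
  for (const char *Op : {"M", "MFY", "ML", "MR", "MLR", "MSGF"})
    std::printf("%-5s mem:%d reg:%d\n", Op,
                (int)std::regex_match(Op, MulMem),
                (int)std::regex_match(Op, MulReg));
  return 0;
}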
//===----------------------------------------------------------------------===// // Division and remainder //===----------------------------------------------------------------------===// +def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], + (instregex "DR$")>; +def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], + (instregex "D$")>; def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone], @@ -416,7 +430,8 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; -def : InstRW<[FXU, Lat2], (instregex "SLA(K)?$")>; +def : InstRW<[FXU, Lat2], (instregex "SLA(G|K)?$")>; +def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -465,7 +480,7 @@ def : InstRW<[FXU, FXU, Lat2, GroupAlone], (instregex "CGFR$")>; // Compare logical character def : InstRW<[LSU, LSU, FXU, Lat9, GroupAlone], (instregex "CLC$")>; - +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>; // Test under mask @@ -476,6 +491,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>; def : InstRW<[FXU], (instregex "TMLH(64)?$")>; def : InstRW<[FXU], (instregex "TMLL(64)?$")>; +// Compare logical characters under mask +def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>; + //===----------------------------------------------------------------------===// // Prefetch //===----------------------------------------------------------------------===// @@ -519,6 +537,42 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>; // Load pair disjoint def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; +def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; + +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], + 
(instregex "(M|D)P$")>; +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], + (instregex "SRP$")>; +def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; + //===----------------------------------------------------------------------===// // Access registers //===----------------------------------------------------------------------===// @@ -571,13 +625,30 @@ def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; +def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; // Move with key def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>; +// Monitor call +def : InstRW<[FXU], (instregex "MC$")>; + +// Extract CPU attribute +def : InstRW<[FXU, Lat30], (instregex "ECAG$")>; + // Extract CPU Time def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>; +// Extract PSW +def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; + // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -740,14 +811,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>; def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; +// Divide to integer +def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; + //===----------------------------------------------------------------------===// // FP: Comparisons //===----------------------------------------------------------------------===// // Compare -def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>; -def : InstRW<[FPU], (instregex "C(E|D)BR$")>; -def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>; +def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>; +def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>; +def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>; // Test Data Class def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>; diff --git a/lib/Target/SystemZ/SystemZScheduleZEC12.td b/lib/Target/SystemZ/SystemZScheduleZEC12.td index 8ab6c826f1ed..1bdb8779dc72 100644 --- a/lib/Target/SystemZ/SystemZScheduleZEC12.td +++ b/lib/Target/SystemZ/SystemZScheduleZEC12.td @@ -59,6 +59,7 @@ def : WriteRes { let Latency = 30; let NumMicroOps = 0;} def ZEC12_FXUnit : ProcResource<2>; def ZEC12_LSUnit : ProcResource<2>; def ZEC12_FPUnit : ProcResource<1>; +def ZEC12_DFUnit : ProcResource<1>; def ZEC12_VBUnit : ProcResource<1>; // Subtarget specific definitions of scheduling resources. 
@@ -67,6 +68,8 @@ def : WriteRes { let Latency = 4; } def : WriteRes { let Latency = 1; } def : WriteRes { let Latency = 8; } def : WriteRes { let Latency = 9; } +def : WriteRes { let Latency = 2; } +def : WriteRes { let Latency = 3; } def : WriteRes; // Virtual Branching Unit // -------------------------- INSTRUCTIONS ---------------------------------- // @@ -155,6 +158,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "MVI(Y)?$")>; // Move character def : InstRW<[LSU, LSU, LSU, FXU, Lat8, GroupAlone], (instregex "MVC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCL(E|U)?$")>; // Pseudo -> reg move def : InstRW<[FXU], (instregex "COPY(_TO_REGCLASS)?$")>; @@ -236,6 +240,7 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "LLG(F|T)?AT$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STC(H|Y|Mux)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "STCM(H|Y)?$")>; //===----------------------------------------------------------------------===// // Multi-register moves @@ -245,6 +250,9 @@ def : InstRW<[FXU, LSU, Lat5], (instregex "STH(H|Y|RL|Mux)?$")>; def : InstRW<[LSU, LSU, LSU, LSU, LSU, Lat10, GroupAlone], (instregex "LM(H|Y|G)?$")>; +// Load multiple disjoint +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "LMD$")>; + // Store multiple (estimated average of 3 ops) def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], (instregex "STM(H|Y|G)?$")>; @@ -256,6 +264,7 @@ def : InstRW<[LSU, LSU, FXU, FXU, FXU, Lat10, GroupAlone], def : InstRW<[FXU], (instregex "LRV(G)?R$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "LRV(G|H)?$")>; def : InstRW<[FXU, LSU, Lat5], (instregex "STRV(G|H)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MVCIN$")>; //===----------------------------------------------------------------------===// // Load address instructions @@ -295,7 +304,7 @@ def : InstRW<[FXU], (instregex "IILL(64)?$")>; // Addition //===----------------------------------------------------------------------===// -def : InstRW<[FXU, LSU, Lat5], (instregex "A(Y|SI)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?(Y|SI)?$")>; def : InstRW<[FXU, LSU, Lat6], (instregex "AH(Y)?$")>; def : InstRW<[FXU], (instregex "AIH$")>; def : InstRW<[FXU], (instregex "AFI(Mux)?$")>; @@ -304,15 +313,14 @@ def : InstRW<[FXU], (instregex "AGHI(K)?$")>; def : InstRW<[FXU], (instregex "AGR(K)?$")>; def : InstRW<[FXU], (instregex "AHI(K)?$")>; def : InstRW<[FXU], (instregex "AHIMux(K)?$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "AL(Y)?$")>; def : InstRW<[FXU], (instregex "AL(FI|HSIK)$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "ALG(F)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "ALGF$")>; def : InstRW<[FXU], (instregex "ALGHSIK$")>; def : InstRW<[FXU], (instregex "ALGF(I|R)$")>; def : InstRW<[FXU], (instregex "ALGR(K)?$")>; def : InstRW<[FXU], (instregex "ALR(K)?$")>; def : InstRW<[FXU], (instregex "AR(K)?$")>; -def : InstRW<[FXU, LSU, Lat5], (instregex "AG(SI)?$")>; +def : InstRW<[FXU, LSU, Lat5], (instregex "A(L)?G(SI)?$")>; // Logical addition with carry def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "ALC(G)?$")>; @@ -405,11 +413,17 @@ def : InstRW<[FXU, Lat9, GroupAlone], (instregex "MLGR$")>; def : InstRW<[FXU, Lat5], (instregex "MGHI$")>; def : InstRW<[FXU, Lat5], (instregex "MHI$")>; def : InstRW<[FXU, LSU, Lat9], (instregex "MH(Y)?$")>; +def : InstRW<[FXU, Lat7, GroupAlone], (instregex "M(L)?R$")>; +def : InstRW<[FXU, LSU, Lat7, GroupAlone], (instregex "M(FY|L)?$")>; 
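MR and MLR, added to the ZEC12 model here as well, write their 64-bit product to an even/odd 32-bit register pair, which is why they operate on GR128. A minimal sketch of that split, assuming the usual SystemZ convention that the even register receives the high word:

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t A = 0x89ABCDEFu, B = 0x12345678u;
  uint64_t Prod = (uint64_t)A * B;        // what MLR computes (unsigned)
  uint32_t Even = (uint32_t)(Prod >> 32); // high word -> even register
  uint32_t Odd = (uint32_t)Prod;          // low word  -> odd register
  std::printf("pair = %08x:%08x\n", Even, Odd);
  return 0;
}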
//===----------------------------------------------------------------------===// // Division and remainder //===----------------------------------------------------------------------===// +def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], + (instregex "DR$")>; +def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], + (instregex "D$")>; def : InstRW<[FPU2, FPU2, FXU, FXU, FXU, FXU, Lat30, GroupAlone], (instregex "DSG(F)?R$")>; def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, Lat30, GroupAlone], @@ -426,7 +440,8 @@ def : InstRW<[FPU2, FPU2, LSU, FXU, FXU, FXU, FXU, Lat30, GroupAlone], def : InstRW<[FXU], (instregex "SLL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRL(G|K)?$")>; def : InstRW<[FXU], (instregex "SRA(G|K)?$")>; -def : InstRW<[FXU], (instregex "SLA(K)?$")>; +def : InstRW<[FXU], (instregex "SLA(G|K)?$")>; +def : InstRW<[FXU, FXU, FXU, FXU, Lat8], (instregex "S(L|R)D(A|L)$")>; // Rotate def : InstRW<[FXU, LSU, Lat6], (instregex "RLL(G)?$")>; @@ -475,7 +490,7 @@ def : InstRW<[FXU, Lat2], (instregex "CGFR$")>; // Compare logical character def : InstRW<[FXU, LSU, LSU, Lat9, GroupAlone], (instregex "CLC$")>; - +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLCL(E|U)?$")>; def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CLST$")>; // Test under mask @@ -486,6 +501,9 @@ def : InstRW<[FXU], (instregex "TMHL(64)?$")>; def : InstRW<[FXU], (instregex "TMLH(64)?$")>; def : InstRW<[FXU], (instregex "TMLL(64)?$")>; +// Compare logical characters under mask +def : InstRW<[FXU, LSU, Lat5], (instregex "CLM(H|Y)?$")>; + //===----------------------------------------------------------------------===// // Prefetch and execution hint //===----------------------------------------------------------------------===// @@ -531,6 +549,42 @@ def : InstRW<[FXU, FXU, LSU, LSU, Lat6, GroupAlone], (instregex "STPQ$")>; // Load pair disjoint def : InstRW<[LSU, LSU, Lat5, GroupAlone], (instregex "LPD(G)?$")>; +//===----------------------------------------------------------------------===// +// Translate and convert +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|TR)?(E|EOpt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "TR(T|O)(T|O)(Opt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "CU(12|14|21|24|41|42)(Opt)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(CUUTF|CUTFU)(Opt)?$")>; + +//===----------------------------------------------------------------------===// +// Message-security assist +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "KM(C|F|O|CTR)?$")>; +def : InstRW<[FXU, Lat30, GroupAlone], (instregex "(KIMD|KLMD|KMAC|PCC)$")>; + +//===----------------------------------------------------------------------===// +// Decimal arithmetic +//===----------------------------------------------------------------------===// + +def : InstRW<[FXU, DFU, LSU, Lat30, GroupAlone], (instregex "CVB(Y|G)?$")>; +def : InstRW<[FXU, DFU, FXU, Lat30, GroupAlone], (instregex "CVD(Y|G)?$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "MV(N|Z|O)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "(PACK|PKA|PKU)$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UNPK(A|U)?$")>; + +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat15, GroupAlone], + (instregex "(A|S|ZA)P$")>; +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, LSU, LSU, Lat30, GroupAlone], + (instregex 
"(M|D)P$")>; +def : InstRW<[FXU, FXU, DFU2, LSU, LSU, Lat15, GroupAlone], + (instregex "SRP$")>; +def : InstRW<[DFU2, LSU, LSU, LSU, LSU, Lat11, GroupAlone], (instregex "CP$")>; +def : InstRW<[DFU2, LSU, LSU, Lat3, GroupAlone], (instregex "TP$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "ED(MK)?$")>; + //===----------------------------------------------------------------------===// // Access registers //===----------------------------------------------------------------------===// @@ -609,13 +663,30 @@ def : InstRW<[FXU], (instregex "ZEXT128_(32|64)$")>; // String instructions def : InstRW<[FXU, LSU, Lat30], (instregex "SRST$")>; +def : InstRW<[LSU, Lat30], (instregex "SRSTU$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CUSE$")>; + +// Various complex instructions +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CFC$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "UPT$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CKSM$")>; +def : InstRW<[LSU, Lat30, GroupAlone], (instregex "CMPSC$")>; // Move with key def : InstRW<[LSU, Lat8, GroupAlone], (instregex "MVCK$")>; +// Monitor call +def : InstRW<[FXU], (instregex "MC$")>; + +// Extract CPU attribute +def : InstRW<[FXU, Lat30], (instregex "ECAG$")>; + // Extract CPU Time def : InstRW<[FXU, Lat5, LSU], (instregex "ECTG$")>; +// Extract PSW +def : InstRW<[FXU, Lat30], (instregex "EPSW$")>; + // Execute def : InstRW<[LSU, GroupAlone], (instregex "EX(RL)?$")>; @@ -778,14 +849,17 @@ def : InstRW<[FPU, LSU, Lat30], (instregex "D(E|D)B$")>; def : InstRW<[FPU, Lat30], (instregex "D(E|D)BR$")>; def : InstRW<[FPU2, FPU2, Lat30, GroupAlone], (instregex "DXBR$")>; +// Divide to integer +def : InstRW<[FPU, Lat30, GroupAlone], (instregex "DI(E|D)BR$")>; + //===----------------------------------------------------------------------===// // FP: Comparisons //===----------------------------------------------------------------------===// // Compare -def : InstRW<[FPU, LSU, Lat12], (instregex "C(E|D)B$")>; -def : InstRW<[FPU], (instregex "C(E|D)BR$")>; -def : InstRW<[FPU, FPU, Lat30], (instregex "CXBR$")>; +def : InstRW<[FPU, LSU, Lat12], (instregex "(K|C)(E|D)B$")>; +def : InstRW<[FPU], (instregex "(K|C)(E|D)BR$")>; +def : InstRW<[FPU, FPU, Lat30], (instregex "(K|C)XBR$")>; // Test Data Class def : InstRW<[FPU, LSU, Lat15], (instregex "TC(E|D)B$")>; diff --git a/lib/Target/SystemZ/SystemZSubtarget.cpp b/lib/Target/SystemZ/SystemZSubtarget.cpp index ce07ea3318a5..022679a7bc18 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.cpp +++ b/lib/Target/SystemZ/SystemZSubtarget.cpp @@ -37,12 +37,13 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU, const TargetMachine &TM) : SystemZGenSubtargetInfo(TT, CPU, FS), HasDistinctOps(false), HasLoadStoreOnCond(false), HasHighWord(false), HasFPExtension(false), - HasPopulationCount(false), HasFastSerialization(false), - HasInterlockedAccess1(false), HasMiscellaneousExtensions(false), + HasPopulationCount(false), HasMessageSecurityAssist4(false), + HasFastSerialization(false), HasInterlockedAccess1(false), + HasMiscellaneousExtensions(false), HasExecutionHint(false), HasLoadAndTrap(false), HasTransactionalExecution(false), HasProcessorAssist(false), HasVector(false), HasLoadStoreOnCond2(false), - HasLoadAndZeroRightmostByte(false), + HasLoadAndZeroRightmostByte(false), HasMessageSecurityAssist5(false), TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)), TLInfo(TM, *this), TSInfo(), FrameLowering() {} diff --git 
a/lib/Target/SystemZ/SystemZSubtarget.h b/lib/Target/SystemZ/SystemZSubtarget.h index cdb61327a16a..770dd7cd939f 100644 --- a/lib/Target/SystemZ/SystemZSubtarget.h +++ b/lib/Target/SystemZ/SystemZSubtarget.h @@ -39,6 +39,7 @@ protected: bool HasHighWord; bool HasFPExtension; bool HasPopulationCount; + bool HasMessageSecurityAssist4; bool HasFastSerialization; bool HasInterlockedAccess1; bool HasMiscellaneousExtensions; @@ -49,6 +50,7 @@ protected: bool HasVector; bool HasLoadStoreOnCond2; bool HasLoadAndZeroRightmostByte; + bool HasMessageSecurityAssist5; private: Triple TargetTriple; @@ -104,6 +106,10 @@ public: // Return true if the target has the population-count facility. bool hasPopulationCount() const { return HasPopulationCount; } + // Return true if the target has the message-security-assist + // extension facility 4. + bool hasMessageSecurityAssist4() const { return HasMessageSecurityAssist4; } + // Return true if the target has the fast-serialization facility. bool hasFastSerialization() const { return HasFastSerialization; } @@ -132,6 +138,10 @@ public: return HasLoadAndZeroRightmostByte; } + // Return true if the target has the message-security-assist + // extension facility 5. + bool hasMessageSecurityAssist5() const { return HasMessageSecurityAssist5; } + // Return true if the target has the vector facility. bool hasVector() const { return HasVector; } diff --git a/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/lib/Target/WebAssembly/WebAssemblyInstrCall.td index 73d1d4be293b..6b45839c14b0 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrCall.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrCall.td @@ -19,8 +19,8 @@ let Defs = [ARGUMENTS] in { // Call sequence markers. These have an immediate which represents the amount of // stack space to allocate or free, which is used for varargs lowering. let Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1 in { -def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt), - [(WebAssemblycallseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : I<(outs), (ins i32imm:$amt, i32imm:$amt2), + [(WebAssemblycallseq_start timm:$amt, timm:$amt2)]>; def ADJCALLSTACKUP : I<(outs), (ins i32imm:$amt, i32imm:$amt2), [(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>; } // isCodeGenOnly = 1 diff --git a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td index a601b575f579..fa2146f7db84 100644 --- a/lib/Target/WebAssembly/WebAssemblyInstrInfo.td +++ b/lib/Target/WebAssembly/WebAssemblyInstrInfo.td @@ -25,7 +25,8 @@ def HasSIMD128 : Predicate<"Subtarget->hasSIMD128()">, // WebAssembly-specific DAG Node Types. 
//===----------------------------------------------------------------------===//
 
-def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>]>;
+def SDT_WebAssemblyCallSeqStart : SDCallSeqStart<[SDTCisVT<0, iPTR>,
+                                                  SDTCisVT<1, iPTR>]>;
 def SDT_WebAssemblyCallSeqEnd :
     SDCallSeqEnd<[SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
 def SDT_WebAssemblyCall0 : SDTypeProfile<0, -1, [SDTCisPtrTy<0>]>;
diff --git a/lib/Target/X86/X86.td b/lib/Target/X86/X86.td
index 784c3a6557ff..3a421fe77392 100644
--- a/lib/Target/X86/X86.td
+++ b/lib/Target/X86/X86.td
@@ -235,6 +235,8 @@ def FeatureLEAUsesAG : SubtargetFeature<"lea-uses-ag", "LEAUsesAG", "true",
                                    "LEA instruction needs inputs at AG stage">;
 def FeatureSlowLEA : SubtargetFeature<"slow-lea", "SlowLEA", "true",
                                    "LEA instruction with certain arguments is slow">;
+def FeatureSlow3OpsLEA : SubtargetFeature<"slow-3ops-lea", "Slow3OpsLEA", "true",
+                                   "LEA instruction with 3 ops or certain registers is slow">;
 def FeatureSlowIncDec : SubtargetFeature<"slow-incdec", "SlowIncDec", "true",
                                    "INC and DEC instructions are slower than ADD and SUB">;
 def FeatureSoftFloat
@@ -480,6 +482,7 @@ def SNBFeatures : ProcessorFeatures<[], [
   FeatureXSAVE,
   FeatureXSAVEOPT,
   FeatureLAHFSAHF,
+  FeatureSlow3OpsLEA,
   FeatureFastScalarFSQRT,
   FeatureFastSHLDRotate
 ]>;
diff --git a/lib/Target/X86/X86FastISel.cpp b/lib/Target/X86/X86FastISel.cpp
index ebd179e786da..fc3b4836c178 100644
--- a/lib/Target/X86/X86FastISel.cpp
+++ b/lib/Target/X86/X86FastISel.cpp
@@ -180,44 +180,6 @@ private:
 
 } // end anonymous namespace.
 
-static std::pair<X86::CondCode, bool>
-getX86ConditionCode(CmpInst::Predicate Predicate) {
-  X86::CondCode CC = X86::COND_INVALID;
-  bool NeedSwap = false;
-  switch (Predicate) {
-  default: break;
-  // Floating-point Predicates
-  case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
-  case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
-  case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
-  case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
-  case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
-  case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
-  case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
-  case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
-  case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
-  case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
-  case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
-  case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
-  case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
-  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
-
-  // Integer Predicates
-  case CmpInst::ICMP_EQ:  CC = X86::COND_E; break;
-  case CmpInst::ICMP_NE:  CC = X86::COND_NE; break;
-  case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
-  case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
-  case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
-  case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
-  case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
-  case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
-  case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
-  case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
-  }
-
-  return std::make_pair(CC, NeedSwap);
-}
-
 static std::pair<unsigned, bool>
 getX86SSEConditionCode(CmpInst::Predicate Predicate) {
   unsigned CC;
@@ -1559,7 +1521,7 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
 
   X86::CondCode CC;
   bool SwapArgs;
-  std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+  std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
   assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
   unsigned Opc = X86::getSETFromCond(CC);
 
@@ -1697,7 +1659,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
       bool SwapArgs;
       unsigned BranchOpc;
-      std::tie(CC, SwapArgs) = getX86ConditionCode(Predicate);
+      std::tie(CC, SwapArgs) = X86::getX86ConditionCode(Predicate);
       assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
 
       BranchOpc = X86::GetCondBranchFromCond(CC);
@@ -2070,7 +2032,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
     }
 
     bool NeedSwap;
-    std::tie(CC, NeedSwap) = getX86ConditionCode(Predicate);
+    std::tie(CC, NeedSwap) = X86::getX86ConditionCode(Predicate);
     assert(CC <= X86::LAST_VALID_COND && "Unexpected condition code.");
 
     const Value *CmpLHS = CI->getOperand(0);
@@ -2319,7 +2281,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
   const auto *CI = dyn_cast<CmpInst>(Cond);
   if (CI && (CI->getParent() == I->getParent())) {
     bool NeedSwap;
-    std::tie(CC, NeedSwap) = getX86ConditionCode(CI->getPredicate());
+    std::tie(CC, NeedSwap) = X86::getX86ConditionCode(CI->getPredicate());
     if (CC > X86::LAST_VALID_COND)
       return false;
 
@@ -3293,7 +3255,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
   // Issue CALLSEQ_START
   unsigned AdjStackDown = TII.getCallFrameSetupOpcode();
   BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackDown))
-    .addImm(NumBytes).addImm(0);
+    .addImm(NumBytes).addImm(0).addImm(0);
 
   // Walk the register/memloc assignments, inserting copies/loads.
   const X86RegisterInfo *RegInfo = Subtarget->getRegisterInfo();
diff --git a/lib/Target/X86/X86FixupLEAs.cpp b/lib/Target/X86/X86FixupLEAs.cpp
index 2cd4c1a3e7b3..9f649dad8bc0 100644
--- a/lib/Target/X86/X86FixupLEAs.cpp
+++ b/lib/Target/X86/X86FixupLEAs.cpp
@@ -27,20 +27,26 @@
 #include "llvm/Target/TargetInstrInfo.h"
 using namespace llvm;
 
-#define DEBUG_TYPE "x86-fixup-LEAs"
+namespace llvm {
+void initializeFixupLEAPassPass(PassRegistry &);
+}
+
+#define FIXUPLEA_DESC "X86 LEA Fixup"
+#define FIXUPLEA_NAME "x86-fixup-LEAs"
+
+#define DEBUG_TYPE FIXUPLEA_NAME
 
 STATISTIC(NumLEAs, "Number of LEA instructions created");
 
 namespace {
 class FixupLEAPass : public MachineFunctionPass {
   enum RegUsageState { RU_NotUsed, RU_Write, RU_Read };
-  static char ID;
+
   /// \brief Loop over all of the instructions in the basic block
   /// replacing applicable instructions with LEA instructions,
   /// where appropriate.
   bool processBasicBlock(MachineFunction &MF, MachineFunction::iterator MFI);
 
-  StringRef getPassName() const override { return "X86 LEA Fixup"; }
 
   /// \brief Given a machine register, look for the instruction
   /// which writes it in the current basic block. If found,
@@ -62,6 +68,22 @@ class FixupLEAPass : public MachineFunctionPass {
   void processInstructionForSLM(MachineBasicBlock::iterator &I,
                                 MachineFunction::iterator MFI);
+
+  /// \brief Given a LEA instruction which is unprofitable
+  /// on SNB+ try to replace it with other instructions.
+  /// According to Intel's Optimization Reference Manual:
+  /// " For LEA instructions with three source operands and some specific
+  ///   situations, instruction latency has increased to 3 cycles, and must
+  ///   dispatch via port 1:
+  ///   - LEA that has all three source operands: base, index, and offset
+  ///   - LEA that uses base and index registers where the base is EBP, RBP,
+  ///     or R13
+  ///   - LEA that uses RIP relative addressing mode
+  ///   - LEA that uses 16-bit addressing mode "
+  /// This function currently handles the first 2 cases only.
+  MachineInstr *processInstrForSlow3OpLEA(MachineInstr &MI,
+                                          MachineFunction::iterator MFI);
+
   /// \brief Look for LEAs that add 1 to reg or subtract 1 from reg
   /// and convert them to INC or DEC respectively.
   bool fixupIncDec(MachineBasicBlock::iterator &I,
@@ -85,7 +107,13 @@ class FixupLEAPass : public MachineFunctionPass {
                    MachineBasicBlock::iterator &MBBI) const;
 
 public:
-  FixupLEAPass() : MachineFunctionPass(ID) {}
+  static char ID;
+
+  StringRef getPassName() const override { return FIXUPLEA_DESC; }
+
+  FixupLEAPass() : MachineFunctionPass(ID) {
+    initializeFixupLEAPassPass(*PassRegistry::getPassRegistry());
+  }
 
   /// \brief Loop over all of the basic blocks,
   /// replacing instructions by equivalent LEA instructions
@@ -104,9 +132,12 @@ private:
   bool OptIncDec;
   bool OptLEA;
 };
-char FixupLEAPass::ID = 0;
 }
 
+char FixupLEAPass::ID = 0;
+
+INITIALIZE_PASS(FixupLEAPass, FIXUPLEA_NAME, FIXUPLEA_DESC, false, false)
+
 MachineInstr *
 FixupLEAPass::postRAConvertToLEA(MachineFunction::iterator &MFI,
                                  MachineBasicBlock::iterator &MBBI) const {
@@ -168,7 +199,7 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &Func) {
   MF = &Func;
   const X86Subtarget &ST = Func.getSubtarget<X86Subtarget>();
   OptIncDec = !ST.slowIncDec() || Func.getFunction()->optForMinSize();
-  OptLEA = ST.LEAusesAG() || ST.slowLEA();
+  OptLEA = ST.LEAusesAG() || ST.slowLEA() || ST.slow3OpsLEA();
 
   if (!OptLEA && !OptIncDec)
     return false;
@@ -242,9 +273,64 @@ FixupLEAPass::searchBackwards(MachineOperand &p, MachineBasicBlock::iterator &I,
   return MachineBasicBlock::iterator();
 }
 
-static inline bool isLEA(const int opcode) {
-  return opcode == X86::LEA16r || opcode == X86::LEA32r ||
-         opcode == X86::LEA64r || opcode == X86::LEA64_32r;
+static inline bool isLEA(const int Opcode) {
+  return Opcode == X86::LEA16r || Opcode == X86::LEA32r ||
+         Opcode == X86::LEA64r || Opcode == X86::LEA64_32r;
+}
+
+static inline bool isInefficientLEAReg(unsigned int Reg) {
+  return Reg == X86::EBP || Reg == X86::RBP || Reg == X86::R13;
+}
+
+static inline bool isRegOperand(const MachineOperand &Op) {
+  return Op.isReg() && Op.getReg() != X86::NoRegister;
+}
+/// hasInefficientLEABaseReg - LEA that uses base and index registers
+/// where the base is EBP, RBP, or R13
+static inline bool hasInefficientLEABaseReg(const MachineOperand &Base,
+                                            const MachineOperand &Index) {
+  return Base.isReg() && isInefficientLEAReg(Base.getReg()) &&
+         isRegOperand(Index);
+}
+
+static inline bool hasLEAOffset(const MachineOperand &Offset) {
+  return (Offset.isImm() && Offset.getImm() != 0) || Offset.isGlobal();
+}
+
+// LEA instruction that has all three operands: offset, base and index
+static inline bool isThreeOperandsLEA(const MachineOperand &Base,
+                                      const MachineOperand &Index,
+                                      const MachineOperand &Offset) {
+  return isRegOperand(Base) && isRegOperand(Index) && hasLEAOffset(Offset);
+}
+
+static inline int getADDrrFromLEA(int LEAOpcode) {
+  switch (LEAOpcode) {
+  default:
+    llvm_unreachable("Unexpected LEA instruction");
+  case X86::LEA16r:
+    return X86::ADD16rr;
+  case X86::LEA32r:
+    return X86::ADD32rr;
+  case X86::LEA64_32r:
+  case X86::LEA64r:
+    return X86::ADD64rr;
+  }
+}
+
+static inline int getADDriFromLEA(int LEAOpcode, const MachineOperand &Offset) {
+  bool IsInt8 = Offset.isImm() && isInt<8>(Offset.getImm());
+  switch (LEAOpcode) {
+  default:
+    llvm_unreachable("Unexpected LEA instruction");
+  case X86::LEA16r:
+    return IsInt8 ? X86::ADD16ri8 : X86::ADD16ri;
+  case X86::LEA32r:
+  case X86::LEA64_32r:
+    return IsInt8 ? X86::ADD32ri8 : X86::ADD32ri;
+  case X86::LEA64r:
+    return IsInt8 ? X86::ADD64ri8 : X86::ADD64ri32;
+  }
+}
 
 /// isLEASimpleIncOrDec - Does this LEA have one of these forms:
@@ -337,8 +423,8 @@ void FixupLEAPass::seekLEAFixup(MachineOperand &p,
 void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
                                             MachineFunction::iterator MFI) {
   MachineInstr &MI = *I;
-  const int opcode = MI.getOpcode();
-  if (!isLEA(opcode))
+  const int Opcode = MI.getOpcode();
+  if (!isLEA(Opcode))
     return;
   if (MI.getOperand(5).getReg() != 0 || !MI.getOperand(4).isImm() ||
       !TII->isSafeToClobberEFLAGS(*MFI, I))
@@ -350,53 +436,142 @@ void FixupLEAPass::processInstructionForSLM(MachineBasicBlock::iterator &I,
     return;
   if (MI.getOperand(2).getImm() > 1)
     return;
-  int addrr_opcode, addri_opcode;
-  switch (opcode) {
-  default:
-    llvm_unreachable("Unexpected LEA instruction");
-  case X86::LEA16r:
-    addrr_opcode = X86::ADD16rr;
-    addri_opcode = X86::ADD16ri;
-    break;
-  case X86::LEA32r:
-    addrr_opcode = X86::ADD32rr;
-    addri_opcode = X86::ADD32ri;
-    break;
-  case X86::LEA64_32r:
-  case X86::LEA64r:
-    addrr_opcode = X86::ADD64rr;
-    addri_opcode = X86::ADD64ri32;
-    break;
-  }
   DEBUG(dbgs() << "FixLEA: Candidate to replace:"; I->dump(););
   DEBUG(dbgs() << "FixLEA: Replaced by: ";);
   MachineInstr *NewMI = nullptr;
-  const MachineOperand &Dst = MI.getOperand(0);
   // Make ADD instruction for two registers writing to LEA's destination
   if (SrcR1 != 0 && SrcR2 != 0) {
-    const MachineOperand &Src1 = MI.getOperand(SrcR1 == DstR ? 1 : 3);
-    const MachineOperand &Src2 = MI.getOperand(SrcR1 == DstR ? 3 : 1);
-    NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addrr_opcode))
-                .add(Dst)
-                .add(Src1)
-                .add(Src2);
-    MFI->insert(I, NewMI);
+    const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(Opcode));
+    const MachineOperand &Src = MI.getOperand(SrcR1 == DstR ? 3 : 1);
+    NewMI =
+        BuildMI(*MFI, I, MI.getDebugLoc(), ADDrr, DstR).addReg(DstR).add(Src);
     DEBUG(NewMI->dump(););
   }
   // Make ADD instruction for immediate
   if (MI.getOperand(4).getImm() != 0) {
+    const MCInstrDesc &ADDri =
+        TII->get(getADDriFromLEA(Opcode, MI.getOperand(4)));
     const MachineOperand &SrcR = MI.getOperand(SrcR1 == DstR ? 1 : 3);
-    NewMI = BuildMI(*MF, MI.getDebugLoc(), TII->get(addri_opcode))
-                .add(Dst)
+    NewMI = BuildMI(*MFI, I, MI.getDebugLoc(), ADDri, DstR)
                 .add(SrcR)
                 .addImm(MI.getOperand(4).getImm());
-    MFI->insert(I, NewMI);
     DEBUG(NewMI->dump(););
   }
   if (NewMI) {
     MFI->erase(I);
-    I = static_cast<MachineBasicBlock::iterator>(NewMI);
+    I = NewMI;
+  }
+}
+
+MachineInstr *
+FixupLEAPass::processInstrForSlow3OpLEA(MachineInstr &MI,
+                                        MachineFunction::iterator MFI) {
+
+  const int LEAOpcode = MI.getOpcode();
+  if (!isLEA(LEAOpcode))
+    return nullptr;
+
+  const MachineOperand &Dst = MI.getOperand(0);
+  const MachineOperand &Base = MI.getOperand(1);
+  const MachineOperand &Scale = MI.getOperand(2);
+  const MachineOperand &Index = MI.getOperand(3);
+  const MachineOperand &Offset = MI.getOperand(4);
+  const MachineOperand &Segment = MI.getOperand(5);
+
+  if (!(isThreeOperandsLEA(Base, Index, Offset) ||
+        hasInefficientLEABaseReg(Base, Index)) ||
+      !TII->isSafeToClobberEFLAGS(*MFI, MI) ||
+      Segment.getReg() != X86::NoRegister)
+    return nullptr;
+
+  unsigned int DstR = Dst.getReg();
+  unsigned int BaseR = Base.getReg();
+  unsigned int IndexR = Index.getReg();
+  unsigned SSDstR =
+      (LEAOpcode == X86::LEA64_32r) ?
getX86SubSuperRegister(DstR, 64) : DstR; + bool IsScale1 = Scale.getImm() == 1; + bool IsInefficientBase = isInefficientLEAReg(BaseR); + bool IsInefficientIndex = isInefficientLEAReg(IndexR); + + // Skip these cases since it takes more than 2 instructions + // to replace the LEA instruction. + if (IsInefficientBase && SSDstR == BaseR && !IsScale1) + return nullptr; + if (LEAOpcode == X86::LEA64_32r && IsInefficientBase && + (IsInefficientIndex || !IsScale1)) + return nullptr; + + const DebugLoc DL = MI.getDebugLoc(); + const MCInstrDesc &ADDrr = TII->get(getADDrrFromLEA(LEAOpcode)); + const MCInstrDesc &ADDri = TII->get(getADDriFromLEA(LEAOpcode, Offset)); + + DEBUG(dbgs() << "FixLEA: Candidate to replace:"; MI.dump();); + DEBUG(dbgs() << "FixLEA: Replaced by: ";); + + // First try to replace LEA with one or two (for the 3-op LEA case) + // add instructions: + // 1.lea (%base,%index,1), %base => add %index,%base + // 2.lea (%base,%index,1), %index => add %base,%index + if (IsScale1 && (DstR == BaseR || DstR == IndexR)) { + const MachineOperand &Src = DstR == BaseR ? Index : Base; + MachineInstr *NewMI = + BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Src); + DEBUG(NewMI->dump();); + // Create ADD instruction for the Offset in case of 3-Ops LEA. + if (hasLEAOffset(Offset)) { + NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset); + DEBUG(NewMI->dump();); + } + return NewMI; + } + // If the base is inefficient try switching the index and base operands, + // otherwise just break the 3-Ops LEA inst into 2-Ops LEA + ADD instruction: + // lea offset(%base,%index,scale),%dst => + // lea (%base,%index,scale); add offset,%dst + if (!IsInefficientBase || (!IsInefficientIndex && IsScale1)) { + MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode)) + .add(Dst) + .add(IsInefficientBase ? Index : Base) + .add(Scale) + .add(IsInefficientBase ? Base : Index) + .addImm(0) + .add(Segment); + DEBUG(NewMI->dump();); + // Create ADD instruction for the Offset in case of 3-Ops LEA. 
+    if (hasLEAOffset(Offset)) {
+      NewMI = BuildMI(*MFI, MI, DL, ADDri, DstR).addReg(DstR).add(Offset);
+      DEBUG(NewMI->dump(););
+    }
+    return NewMI;
+  }
+  // Handle the rest of the cases with inefficient base register:
+  assert(SSDstR != BaseR && "SSDstR == BaseR should be handled already!");
+  assert(IsInefficientBase && "efficient base should be handled already!");
+
+  // lea (%base,%index,1), %dst => mov %base,%dst; add %index,%dst
+  if (IsScale1 && !hasLEAOffset(Offset)) {
+    TII->copyPhysReg(*MFI, MI, DL, DstR, BaseR, Base.isKill());
+    DEBUG(MI.getPrevNode()->dump(););
+
+    MachineInstr *NewMI =
+        BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Index);
+    DEBUG(NewMI->dump(););
+    return NewMI;
  }
+  // lea offset(%base,%index,scale), %dst =>
+  // lea offset( ,%index,scale), %dst; add %base,%dst
+  MachineInstr *NewMI = BuildMI(*MFI, MI, DL, TII->get(LEAOpcode))
+                            .add(Dst)
+                            .addReg(0)
+                            .add(Scale)
+                            .add(Index)
+                            .add(Offset)
+                            .add(Segment);
+  DEBUG(NewMI->dump(););
+
+  NewMI = BuildMI(*MFI, MI, DL, ADDrr, DstR).addReg(DstR).add(Base);
+  DEBUG(NewMI->dump(););
+  return NewMI;
+}
+
 bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
@@ -410,8 +585,16 @@ bool FixupLEAPass::processBasicBlock(MachineFunction &MF,
     if (OptLEA) {
       if (MF.getSubtarget<X86Subtarget>().isSLM())
         processInstructionForSLM(I, MFI);
-      else
-        processInstruction(I, MFI);
+
+      else {
+        if (MF.getSubtarget<X86Subtarget>().slow3OpsLEA()) {
+          if (auto *NewMI = processInstrForSlow3OpLEA(*I, MFI)) {
+            MFI->erase(I);
+            I = NewMI;
+          }
+        } else
+          processInstruction(I, MFI);
+      }
     }
   }
   return false;
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 12a10bf3072f..c899f0fd5100 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -1178,8 +1178,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
     if (AM.IndexReg.getNode() != nullptr || AM.Scale != 1)
       break;
 
-    if (ConstantSDNode
-          *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1))) {
+    if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1))) {
       unsigned Val = CN->getZExtValue();
       // Note that we handle x<<1 as (,x,2) rather than (x,x) here so
       // that the base operand remains free for further matching. If
      // the base doesn't end up getting used, a post-processing step
       // in MatchAddress turns (,x,2) into (x,x), which is cheaper.
       if (Val == 1 || Val == 2 || Val == 3) {
         AM.Scale = 1 << Val;
-        SDValue ShVal = N.getNode()->getOperand(0);
+        SDValue ShVal = N.getOperand(0);
 
         // Okay, we know that we have a scale by now. However, if the scaled
         // value is an add of something and a constant, we can fold the
         // constant into the disp field here.
         if (CurDAG->isBaseWithConstantOffset(ShVal)) {
-          AM.IndexReg = ShVal.getNode()->getOperand(0);
-          ConstantSDNode *AddVal =
-              cast<ConstantSDNode>(ShVal.getNode()->getOperand(1));
+          AM.IndexReg = ShVal.getOperand(0);
+          ConstantSDNode *AddVal = cast<ConstantSDNode>(ShVal.getOperand(1));
           uint64_t Disp = (uint64_t)AddVal->getSExtValue() << Val;
           if (!foldOffsetIntoAddress(Disp, AM))
             return false;
@@ -1245,28 +1243,27 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
     if (AM.BaseType == X86ISelAddressMode::RegBase &&
         AM.Base_Reg.getNode() == nullptr &&
         AM.IndexReg.getNode() == nullptr) {
-      if (ConstantSDNode
-            *CN = dyn_cast<ConstantSDNode>(N.getNode()->getOperand(1)))
+      if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N.getOperand(1)))
         if (CN->getZExtValue() == 3 || CN->getZExtValue() == 5 ||
             CN->getZExtValue() == 9) {
           AM.Scale = unsigned(CN->getZExtValue())-1;
 
-          SDValue MulVal = N.getNode()->getOperand(0);
+          SDValue MulVal = N.getOperand(0);
           SDValue Reg;
 
           // Okay, we know that we have a scale by now. However, if the scaled
           // value is an add of something and a constant, we can fold the
           // constant into the disp field here.
           if (MulVal.getNode()->getOpcode() == ISD::ADD && MulVal.hasOneUse() &&
-              isa<ConstantSDNode>(MulVal.getNode()->getOperand(1))) {
-            Reg = MulVal.getNode()->getOperand(0);
+              isa<ConstantSDNode>(MulVal.getOperand(1))) {
+            Reg = MulVal.getOperand(0);
             ConstantSDNode *AddVal =
-              cast<ConstantSDNode>(MulVal.getNode()->getOperand(1));
+                cast<ConstantSDNode>(MulVal.getOperand(1));
             uint64_t Disp = AddVal->getSExtValue() * CN->getZExtValue();
             if (foldOffsetIntoAddress(Disp, AM))
-              Reg = N.getNode()->getOperand(0);
+              Reg = N.getOperand(0);
           } else {
-            Reg = N.getNode()->getOperand(0);
+            Reg = N.getOperand(0);
           }
 
           AM.IndexReg = AM.Base_Reg = Reg;
@@ -1289,7 +1286,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
 
     // Test if the LHS of the sub can be folded.
     X86ISelAddressMode Backup = AM;
-    if (matchAddressRecursively(N.getNode()->getOperand(0), AM, Depth+1)) {
+    if (matchAddressRecursively(N.getOperand(0), AM, Depth+1)) {
       AM = Backup;
       break;
     }
@@ -1300,7 +1297,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
     }
 
     int Cost = 0;
-    SDValue RHS = Handle.getValue().getNode()->getOperand(1);
+    SDValue RHS = Handle.getValue().getOperand(1);
     // If the RHS involves a register with multiple uses, this
     // transformation incurs an extra mov, due to the neg instruction
    // clobbering its operand.
@@ -1309,7 +1306,7 @@ bool X86DAGToDAGISel::matchAddressRecursively(SDValue N, X86ISelAddressMode &AM,
         RHS.getNode()->getOpcode() == ISD::TRUNCATE ||
         RHS.getNode()->getOpcode() == ISD::ANY_EXTEND ||
         (RHS.getNode()->getOpcode() == ISD::ZERO_EXTEND &&
-         RHS.getNode()->getOperand(0).getValueType() == MVT::i32))
+         RHS.getOperand(0).getValueType() == MVT::i32))
       ++Cost;
     // If the base is a register with multiple uses, this
     // transformation may save a mov.
@@ -2524,7 +2521,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
         N0.getNode()->hasOneUse() &&
         N0.getValueType() != MVT::i8 &&
         X86::isZeroNode(N1)) {
-      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getNode()->getOperand(1));
+      ConstantSDNode *C = dyn_cast<ConstantSDNode>(N0.getOperand(1));
       if (!C) break;
 
       // For example, convert "testl %eax, $8" to "testb %al, $8"
@@ -2532,7 +2529,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
           (!(C->getZExtValue() & 0x80) ||
            hasNoSignedComparisonUses(Node))) {
         SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl, MVT::i8);
-        SDValue Reg = N0.getNode()->getOperand(0);
+        SDValue Reg = N0.getOperand(0);
 
         // On x86-32, only the ABCD registers have 8-bit subregisters.
         if (!Subtarget->is64Bit()) {
@@ -2568,7 +2565,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
         // Shift the immediate right by 8 bits.
         SDValue ShiftedImm = CurDAG->getTargetConstant(C->getZExtValue() >> 8,
                                                        dl, MVT::i8);
-        SDValue Reg = N0.getNode()->getOperand(0);
+        SDValue Reg = N0.getOperand(0);
 
         // Put the value in an ABCD register.
         const TargetRegisterClass *TRC;
@@ -2605,7 +2602,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
            hasNoSignedComparisonUses(Node))) {
         SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
                                                 MVT::i16);
-        SDValue Reg = N0.getNode()->getOperand(0);
+        SDValue Reg = N0.getOperand(0);
 
         // Extract the 16-bit subregister.
         SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_16bit, dl,
@@ -2628,7 +2625,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
            hasNoSignedComparisonUses(Node))) {
         SDValue Imm = CurDAG->getTargetConstant(C->getZExtValue(), dl,
                                                 MVT::i32);
-        SDValue Reg = N0.getNode()->getOperand(0);
+        SDValue Reg = N0.getOperand(0);
 
         // Extract the 32-bit subregister.
         SDValue Subreg = CurDAG->getTargetExtractSubreg(X86::sub_32bit, dl,
diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp
index 9ee2234595f9..11c08292518a 100644
--- a/lib/Target/X86/X86ISelLowering.cpp
+++ b/lib/Target/X86/X86ISelLowering.cpp
@@ -40,6 +40,7 @@
 #include "llvm/IR/CallingConv.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
 #include "llvm/IR/Function.h"
 #include "llvm/IR/GlobalAlias.h"
 #include "llvm/IR/GlobalVariable.h"
@@ -79,6 +80,17 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
               " of the loop header PC will be 0)."),
     cl::Hidden);
 
+/// Call this when the user attempts to do something unsupported, like
+/// returning a double without SSE2 enabled on x86_64. This is not fatal, unlike
+/// report_fatal_error, so calling code should attempt to recover without
+/// crashing.
+static void errorUnsupported(SelectionDAG &DAG, const SDLoc &dl, + const char *Msg) { + MachineFunction &MF = DAG.getMachineFunction(); + DAG.getContext()->diagnose( + DiagnosticInfoUnsupported(*MF.getFunction(), Msg, dl.getDebugLoc())); +} + X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, const X86Subtarget &STI) : TargetLowering(TM), Subtarget(STI) { @@ -1381,7 +1393,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom); setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom); setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal); @@ -1445,8 +1457,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i1, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom); setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom); - setOperationAction(ISD::VSELECT, MVT::v32i16, Legal); - setOperationAction(ISD::VSELECT, MVT::v64i8, Legal); setOperationAction(ISD::TRUNCATE, MVT::v32i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v64i1, Custom); setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom); @@ -1479,7 +1489,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, for (auto VT : { MVT::v64i8, MVT::v32i16 }) { setOperationAction(ISD::BUILD_VECTOR, VT, Custom); - setOperationAction(ISD::VSELECT, VT, Legal); + setOperationAction(ISD::VSELECT, VT, Custom); setOperationAction(ISD::ABS, VT, Legal); setOperationAction(ISD::SRL, VT, Custom); setOperationAction(ISD::SHL, VT, Custom); @@ -2207,15 +2217,17 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv, // or SSE or MMX vectors. if ((ValVT == MVT::f32 || ValVT == MVT::f64 || VA.getLocReg() == X86::XMM0 || VA.getLocReg() == X86::XMM1) && - (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { - report_fatal_error("SSE register return with SSE disabled"); + (Subtarget.is64Bit() && !Subtarget.hasSSE1())) { + errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. + } else if (ValVT == MVT::f64 && + (Subtarget.is64Bit() && !Subtarget.hasSSE2())) { + // Likewise we can't return F64 values with SSE1 only. gcc does so, but + // llvm-gcc has never done it right and no one has noticed, so this + // should be OK for now. + errorUnsupported(DAG, dl, "SSE2 register return with SSE2 disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } - // Likewise we can't return F64 values with SSE1 only. gcc does so, but - // llvm-gcc has never done it right and no one has noticed, so this - // should be OK for now. - if (ValVT == MVT::f64 && - (Subtarget.is64Bit() && !Subtarget.hasSSE2())) - report_fatal_error("SSE2 register return with SSE2 disabled"); // Returns in ST0/ST1 are handled specially: these are pushed as operands to // the RET instruction and handled by the FP Stackifier. 
@@ -2528,7 +2540,8 @@ SDValue X86TargetLowering::LowerCallResult( // If this is x86-64, and we disabled SSE, we can't return FP values if ((CopyVT == MVT::f32 || CopyVT == MVT::f64 || CopyVT == MVT::f128) && ((Is64Bit || Ins[InsIndex].Flags.isInReg()) && !Subtarget.hasSSE1())) { - report_fatal_error("SSE register return with SSE disabled"); + errorUnsupported(DAG, dl, "SSE register return with SSE disabled"); + VA.convertToReg(X86::FP0); // Set reg to FP0, avoid hitting asserts. } // If we prefer to use the value in xmm registers, copy it out as f80 and @@ -3415,8 +3428,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI, } if (!IsSibcall) - Chain = DAG.getCALLSEQ_START( - Chain, DAG.getIntPtrConstant(NumBytesToPush, dl, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytesToPush, + NumBytes - NumBytesToPush, dl); SDValue RetAddrFrIdx; // Load return address for tail calls. @@ -6912,9 +6925,9 @@ X86TargetLowering::LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG) const { // for splat use " (select i1 splat_elt, all-ones, all-zeroes)" if (IsSplat) - return DAG.getNode(ISD::SELECT, dl, VT, Op.getOperand(SplatIdx), - DAG.getConstant(1, dl, VT), - DAG.getConstant(0, dl, VT)); + return DAG.getSelect(dl, VT, Op.getOperand(SplatIdx), + DAG.getConstant(1, dl, VT), + DAG.getConstant(0, dl, VT)); // insert elements one by one SDValue DstVec; @@ -8386,9 +8399,9 @@ static SDValue lowerVectorShuffleToEXPAND(const SDLoc &DL, MVT VT, Subtarget, DAG, DL); SDValue ZeroVector = getZeroVector(VT, Subtarget, DAG, DL); SDValue ExpandedVector = IsLeftZeroSide ? V2 : V1; - return DAG.getNode(ISD::VSELECT, DL, VT, VMask, - DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), - ZeroVector); + return DAG.getSelect(DL, VT, VMask, + DAG.getNode(X86ISD::EXPAND, DL, VT, ExpandedVector), + ZeroVector); } static bool matchVectorShuffleWithUNPCK(MVT VT, SDValue &V1, SDValue &V2, @@ -8748,8 +8761,9 @@ static SDValue lowerVectorShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1, V1 = DAG.getBitcast(BlendVT, V1); V2 = DAG.getBitcast(BlendVT, V2); return DAG.getBitcast( - VT, DAG.getNode(ISD::VSELECT, DL, BlendVT, - DAG.getBuildVector(BlendVT, DL, VSELECTMask), V1, V2)); + VT, + DAG.getSelect(DL, BlendVT, DAG.getBuildVector(BlendVT, DL, VSELECTMask), + V1, V2)); } case MVT::v16f32: case MVT::v8f64: @@ -13817,6 +13831,11 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { ISD::isBuildVectorOfConstantSDNodes(Op.getOperand(2).getNode())) return SDValue(); + // If this VSELECT has a vector if i1 as a mask, it will be directly matched + // with patterns on the mask registers on AVX-512. + if (Op->getOperand(0).getValueType().getScalarSizeInBits() == 1) + return Op; + // Try to lower this to a blend-style vector shuffle. This can handle all // constant condition cases. if (SDValue BlendOp = lowerVSELECTtoVectorShuffle(Op, Subtarget, DAG)) @@ -13826,10 +13845,30 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const { if (!Subtarget.hasSSE41()) return SDValue(); + SDLoc dl(Op); + MVT VT = Op.getSimpleValueType(); + + // If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition + // into an i1 condition so that we can use the mask-based 512-bit blend + // instructions. + if (VT.getSizeInBits() == 512) { + SDValue Cond = Op.getOperand(0); + // The vNi1 condition case should be handled above as it can be trivially + // lowered. 
+    assert(Cond.getValueType().getScalarSizeInBits() ==
+               VT.getScalarSizeInBits() &&
+           "Should have a size-matched integer condition!");
+    // Build a mask by testing the condition against itself (tests for zero).
+    MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
+    SDValue Mask = DAG.getNode(X86ISD::TESTM, dl, MaskVT, Cond, Cond);
+    // Now return a new VSELECT using the mask.
+    return DAG.getSelect(dl, VT, Mask, Op.getOperand(1), Op.getOperand(2));
+  }
+
   // Only some types will be legal on some subtargets. If we can emit a legal
   // VSELECT-matching blend, return Op; but if we need to expand, return
   // a null value.
-  switch (Op.getSimpleValueType().SimpleTy) {
+  switch (VT.SimpleTy) {
   default:
     // Most of the vector types have blends past SSE4.1.
     return Op;
@@ -14725,7 +14764,7 @@ X86TargetLowering::LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const {
     // location.
     SDValue Chain = DAG.getEntryNode();
     SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
-    Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, DL, true), DL);
+    Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
     SDValue Args[] = { Chain, Offset };
     Chain = DAG.getNode(X86ISD::TLSCALL, DL, NodeTys, Args);
     Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
@@ -15348,8 +15387,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
   // Get a pointer to FF if the sign bit was set, or to 0 otherwise.
   SDValue Zero = DAG.getIntPtrConstant(0, dl);
   SDValue Four = DAG.getIntPtrConstant(4, dl);
-  SDValue Offset = DAG.getNode(ISD::SELECT, dl, Zero.getValueType(), SignSet,
-                               Zero, Four);
+  SDValue Offset = DAG.getSelect(dl, Zero.getValueType(), SignSet, Zero, Four);
   FudgePtr = DAG.getNode(ISD::ADD, dl, PtrVT, FudgePtr, Offset);
 
   // Load the value out, extending it from f32 to f80.
@@ -15621,7 +15659,7 @@ static SDValue LowerZERO_EXTEND_AVX512(SDValue Op,
   SDValue Zero =
       DAG.getConstant(APInt::getNullValue(ExtVT.getScalarSizeInBits()), DL, ExtVT);
 
-  SDValue SelectedVal = DAG.getNode(ISD::VSELECT, DL, ExtVT, In, One, Zero);
+  SDValue SelectedVal = DAG.getSelect(DL, ExtVT, In, One, Zero);
   if (VT == ExtVT)
     return SelectedVal;
   return DAG.getNode(X86ISD::VTRUNC, DL, VT, SelectedVal);
@@ -16713,7 +16751,7 @@ static SDValue LowerAndToBT(SDValue And, ISD::CondCode CC,
     if (BitWidth > AndBitWidth) {
       KnownBits Known;
       DAG.computeKnownBits(Op0, Known);
-      if (Known.Zero.countLeadingOnes() < BitWidth - AndBitWidth)
+      if (Known.countMinLeadingZeros() < BitWidth - AndBitWidth)
         return SDValue();
     }
     LHS = Op1;
@@ -17455,7 +17493,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
       MVT VCmpVT = VT == MVT::f32 ? MVT::v4i32 : MVT::v2i64;
       VCmp = DAG.getBitcast(VCmpVT, VCmp);
 
-      SDValue VSel = DAG.getNode(ISD::VSELECT, DL, VecVT, VCmp, VOp1, VOp2);
+      SDValue VSel = DAG.getSelect(DL, VecVT, VCmp, VOp1, VOp2);
 
       return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VSel,
                          DAG.getIntPtrConstant(0, DL));
@@ -17483,9 +17521,8 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
     else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
       Op2Scalar = Op2.getOperand(0);
     if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
-      SDValue newSelect = DAG.getNode(ISD::SELECT, DL,
-                                      Op1Scalar.getValueType(),
-                                      Cond, Op1Scalar, Op2Scalar);
+      SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
+                                        Op1Scalar, Op2Scalar);
       if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
         return DAG.getBitcast(VT, newSelect);
       SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
@@ -17500,8 +17537,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
                         DAG.getUNDEF(MVT::v8i1), Op1, zeroConst);
       Op2 = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, MVT::v8i1,
                         DAG.getUNDEF(MVT::v8i1), Op2, zeroConst);
-      SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::v8i1,
-                                      Cond, Op1, Op2);
+      SDValue newSelect = DAG.getSelect(DL, MVT::v8i1, Cond, Op1, Op2);
       return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, newSelect, zeroConst);
     }
 
@@ -17770,7 +17806,7 @@ static SDValue LowerSIGN_EXTEND_AVX512(SDValue Op,
   } else {
     SDValue NegOne = getOnesVector(ExtVT, DAG, dl);
     SDValue Zero = getZeroVector(ExtVT, Subtarget, DAG, dl);
-    V = DAG.getNode(ISD::VSELECT, dl, ExtVT, In, NegOne, Zero);
+    V = DAG.getSelect(dl, ExtVT, In, NegOne, Zero);
     if (ExtVT == VT)
       return V;
   }
@@ -18572,7 +18608,7 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
 
   // Chain the dynamic stack allocation so that it doesn't modify the stack
   // pointer when other instructions are using the stack.
-  Chain = DAG.getCALLSEQ_START(Chain, DAG.getIntPtrConstant(0, dl, true), dl);
+  Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
 
   bool Is64Bit = Subtarget.is64Bit();
   MVT SPTy = getPointerTy(DAG.getDataLayout());
@@ -19021,8 +19057,10 @@ static SDValue getScalarMaskingNode(SDValue Op, SDValue Mask,
                                     SDValue PreservedSrc,
                                     const X86Subtarget &Subtarget,
                                     SelectionDAG &DAG) {
-  if (isAllOnesConstant(Mask))
-    return Op;
+
+  if (auto *MaskConst = dyn_cast<ConstantSDNode>(Mask))
+    if (MaskConst->getZExtValue() & 0x1)
+      return Op;
 
   MVT VT = Op.getSimpleValueType();
   SDLoc dl(Op);
@@ -19081,7 +19119,7 @@ static SDValue recoverFramePointer(SelectionDAG &DAG, const Function *Fn,
   // registration, or the .set_setframe offset.
   MCSymbol *OffsetSym =
       MF.getMMI().getContext().getOrCreateParentFrameOffsetSymbol(
-          GlobalValue::getRealLinkageName(Fn->getName()));
+          GlobalValue::dropLLVMManglingEscape(Fn->getName()));
   SDValue OffsetSymVal = DAG.getMCSymbol(OffsetSym, PtrVT);
   SDValue ParentFrameOffset =
       DAG.getNode(ISD::LOCAL_RECOVER, dl, PtrVT, OffsetSymVal);
@@ -19683,12 +19721,6 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
                         DAG.getIntPtrConstant(0, dl));
       return DAG.getBitcast(Op.getValueType(), Res);
     }
-    case CONVERT_MASK_TO_VEC: {
-      SDValue Mask = Op.getOperand(1);
-      MVT MaskVT = MVT::getVectorVT(MVT::i1, VT.getVectorNumElements());
-      SDValue VMask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
-      return DAG.getNode(IntrData->Opc0, dl, VT, VMask);
-    }
     case BRCST_SUBVEC_TO_VEC: {
       SDValue Src = Op.getOperand(1);
       SDValue Passthru = Op.getOperand(2);
@@ -19932,7 +19964,7 @@ static SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, const X86Subtarget &Subtarget
     SDValue Op1 = Op.getOperand(1);
     auto *Fn = cast<Function>(cast<GlobalAddressSDNode>(Op1)->getGlobal());
     MCSymbol *LSDASym = MF.getMMI().getContext().getOrCreateLSDASymbol(
-        GlobalValue::getRealLinkageName(Fn->getName()));
+        GlobalValue::dropLLVMManglingEscape(Fn->getName()));
 
     // Generate a simple absolute symbol reference. This intrinsic is only
     // supported on 32-bit Windows, which isn't PIC.
@@ -21741,6 +21773,14 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
     MVT ExVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
     SDValue Ex = DAG.getBitcast(ExVT, R);
 
+    // ashr(R, 63) === cmp_slt(R, 0)
+    if (ShiftAmt == 63 && Subtarget.hasSSE42()) {
+      assert((VT != MVT::v4i64 || Subtarget.hasInt256()) &&
+             "Unsupported PCMPGT op");
+      return DAG.getNode(X86ISD::PCMPGT, dl, VT,
+                         getZeroVector(VT, Subtarget, DAG, dl), R);
+    }
+
     if (ShiftAmt >= 32) {
       // Splat sign to upper i32 dst, and SRA upper i32 src to lower i32.
       SDValue Upper =
@@ -21839,10 +21879,19 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
   }
 
   // Special case in 32-bit mode, where i64 is expanded into high and low parts.
+  // TODO: Replace constant extraction with getTargetConstantBitsFromNode.
   if (!Subtarget.is64Bit() && !Subtarget.hasXOP() &&
       (VT == MVT::v2i64 || (Subtarget.hasInt256() && VT == MVT::v4i64) ||
        (Subtarget.hasAVX512() && VT == MVT::v8i64))) {
 
+    // AVX1 targets may be extracting a 128-bit vector from a 256-bit constant.
+    unsigned SubVectorScale = 1;
+    if (Amt.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
+      SubVectorScale =
+          Amt.getOperand(0).getValueSizeInBits() / Amt.getValueSizeInBits();
+      Amt = Amt.getOperand(0);
+    }
+
     // Peek through any splat that was introduced for i64 shift vectorization.
     int SplatIndex = -1;
     if (ShuffleVectorSDNode *SVN = dyn_cast<ShuffleVectorSDNode>(Amt.getNode()))
@@ -21859,7 +21908,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
       Amt = Amt.getOperand(0);
       unsigned Ratio = Amt.getSimpleValueType().getVectorNumElements() /
-                       VT.getVectorNumElements();
+                       (SubVectorScale * VT.getVectorNumElements());
       unsigned RatioInLog2 = Log2_32_Ceil(Ratio);
       uint64_t ShiftAmt = 0;
       unsigned BaseOp = (SplatIndex < 0 ?
0 : SplatIndex * Ratio); @@ -22233,23 +22282,21 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); Sel = DAG.getNode(X86ISD::CVT2MASK, dl, MaskVT, Sel); - return DAG.getBitcast(SelVT, - DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } else if (Subtarget.hasSSE41()) { // On SSE41 targets we make use of the fact that VSELECT lowers // to PBLENDVB which selects bytes based just on the sign bit. V0 = DAG.getBitcast(VT, V0); V1 = DAG.getBitcast(VT, V1); Sel = DAG.getBitcast(VT, Sel); - return DAG.getBitcast(SelVT, - DAG.getNode(ISD::VSELECT, dl, VT, Sel, V0, V1)); + return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1)); } // On pre-SSE41 targets we test for the sign bit by comparing to // zero - a negative value will set all bits of the lanes to true // and VSELECT uses that in its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue Z = getZeroVector(SelVT, Subtarget, DAG, dl); SDValue C = DAG.getNode(X86ISD::PCMPGT, dl, SelVT, Z, Sel); - return DAG.getNode(ISD::VSELECT, dl, SelVT, C, V0, V1); + return DAG.getSelect(dl, SelVT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 5; @@ -22371,15 +22418,14 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget, V0 = DAG.getBitcast(ExtVT, V0); V1 = DAG.getBitcast(ExtVT, V1); Sel = DAG.getBitcast(ExtVT, Sel); - return DAG.getBitcast( - VT, DAG.getNode(ISD::VSELECT, dl, ExtVT, Sel, V0, V1)); + return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1)); } // On pre-SSE41 targets we splat the sign bit - a negative value will // set all bits of the lanes to true and VSELECT uses that in // its OR(AND(V0,C),AND(V1,~C)) lowering. SDValue C = DAG.getNode(ISD::SRA, dl, VT, Sel, DAG.getConstant(15, dl, VT)); - return DAG.getNode(ISD::VSELECT, dl, VT, C, V0, V1); + return DAG.getSelect(dl, VT, C, V0, V1); }; // Turn 'a' into a mask suitable for VSELECT: a = a << 12; @@ -23296,9 +23342,8 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget, SDValue Callee = DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout())); - Type *RetTy = isF64 - ? (Type*)StructType::get(ArgTy, ArgTy, nullptr) - : (Type*)VectorType::get(ArgTy, 4); + Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy) + : (Type *)VectorType::get(ArgTy, 4); TargetLowering::CallLoweringInfo CLI(DAG); CLI.setDebugLoc(dl) @@ -25779,7 +25824,7 @@ X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI, // Emit CALLSEQ_START right before the instruction. unsigned AdjStackDown = TII.getCallFrameSetupOpcode(); MachineInstrBuilder CallseqStart = - BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0); + BuildMI(MF, DL, TII.get(AdjStackDown)).addImm(0).addImm(0).addImm(0); BB->insert(MachineBasicBlock::iterator(MI), CallseqStart); // Emit CALLSEQ_END right after the instruction. @@ -26517,7 +26562,7 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI, case TargetOpcode::STACKMAP: case TargetOpcode::PATCHPOINT: return emitPatchPoint(MI, BB); - + case TargetOpcode::PATCHABLE_EVENT_CALL: // Do nothing here, handle in xray instrumentation pass. 
     return BB;
@@ -29532,7 +29577,7 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
     SDValue CondNew = DAG.getNode(ISD::XOR, DL, Cond.getValueType(), Cond,
                                   DAG.getAllOnesConstant(DL, CondVT));
     // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
-    return DAG.getNode(ISD::VSELECT, DL, VT, CondNew, RHS, LHS);
+    return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
   }
 
   // To use the condition operand as a bitwise mask, it must have elements that
@@ -30015,7 +30060,7 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
         ISD::CondCode NewCC = (CC == ISD::SETLT) ? ISD::SETLE : ISD::SETGE;
         Cond = DAG.getSetCC(SDLoc(Cond), Cond.getValueType(),
                             Cond.getOperand(0), Cond.getOperand(1), NewCC);
-        return DAG.getNode(ISD::SELECT, DL, VT, Cond, LHS, RHS);
+        return DAG.getSelect(DL, VT, Cond, LHS, RHS);
       }
     }
   }
@@ -31561,20 +31606,22 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
 //   (sub (xor X, M), M)
 static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
                                             const X86Subtarget &Subtarget) {
-  assert(N->getOpcode() == ISD::OR);
+  assert(N->getOpcode() == ISD::OR && "Unexpected Opcode");
 
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
   EVT VT = N->getValueType(0);
 
-  if (!((VT == MVT::v2i64) || (VT == MVT::v4i64 && Subtarget.hasInt256())))
+  if (!((VT.is128BitVector() && Subtarget.hasSSE2()) ||
+        (VT.is256BitVector() && Subtarget.hasInt256())))
     return SDValue();
-  assert(Subtarget.hasSSE2() && "Unexpected i64 vector without SSE2!");
 
-  // Canonicalize pandn to RHS
-  if (N0.getOpcode() == X86ISD::ANDNP)
+  // Canonicalize AND to LHS.
+  if (N1.getOpcode() == ISD::AND)
     std::swap(N0, N1);
 
+  // TODO: Attempt to match against AND(XOR(-1,X),Y) as well, waiting for
+  // ANDNP combine allows other combines to happen that prevent matching.
   if (N0.getOpcode() != ISD::AND || N1.getOpcode() != X86ISD::ANDNP)
     return SDValue();
 
@@ -31596,20 +31643,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
   Y = peekThroughBitcasts(Y);
 
   EVT MaskVT = Mask.getValueType();
-
-  // Validate that the Mask operand is a vector sra node.
-  // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
-  // there is no psrai.b
   unsigned EltBits = MaskVT.getScalarSizeInBits();
-  unsigned SraAmt = ~0;
-  if (Mask.getOpcode() == ISD::SRA) {
-    if (auto *AmtBV = dyn_cast<BuildVectorSDNode>(Mask.getOperand(1)))
-      if (auto *AmtConst = AmtBV->getConstantSplatNode())
-        SraAmt = AmtConst->getZExtValue();
-  } else if (Mask.getOpcode() == X86ISD::VSRAI)
-    SraAmt = Mask.getConstantOperandVal(1);
-  if ((SraAmt + 1) != EltBits)
+  // TODO: Attempt to handle floating point cases as well?
+ if (!MaskVT.isInteger() || DAG.ComputeNumSignBits(Mask) != EltBits) return SDValue(); SDLoc DL(N); @@ -31630,7 +31667,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, // (add (xor X, M), (and M, 1)) // And further to: // (sub (xor X, M), M) - if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT) { + if (X.getValueType() == MaskVT && Y.getValueType() == MaskVT && + DAG.getTargetLoweringInfo().isOperationLegal(ISD::SUB, MaskVT)) { auto IsNegV = [](SDNode *N, SDValue V) { return N->getOpcode() == ISD::SUB && N->getOperand(1) == V && ISD::isBuildVectorAllZeros(N->getOperand(0).getNode()); @@ -31642,9 +31680,6 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, V = Y; if (V) { - if (EltBits != 8 && EltBits != 16 && EltBits != 32) - return SDValue(); - SDValue SubOp1 = DAG.getNode(ISD::XOR, DL, MaskVT, V, Mask); SDValue SubOp2 = Mask; @@ -31661,8 +31696,8 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, if (V == Y) std::swap(SubOp1, SubOp2); - return DAG.getBitcast(VT, - DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2)); + SDValue Res = DAG.getNode(ISD::SUB, DL, MaskVT, SubOp1, SubOp2); + return DAG.getBitcast(VT, Res); } } @@ -31675,7 +31710,7 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG, X = DAG.getBitcast(BlendVT, X); Y = DAG.getBitcast(BlendVT, Y); Mask = DAG.getBitcast(BlendVT, Mask); - Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, Y, X); + Mask = DAG.getSelect(DL, BlendVT, Mask, Y, X); return DAG.getBitcast(VT, Mask); } @@ -33655,8 +33690,7 @@ static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG, // If Op0 is a NaN, select Op1. Otherwise, select the max. If both operands // are NaN, the NaN value of Op1 is the result. - auto SelectOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT; - return DAG.getNode(SelectOpcode, DL, VT, IsOp0Nan, Op1, MinOrMax); + return DAG.getSelect(DL, VT, IsOp0Nan, Op1, MinOrMax); } /// Do target-specific dag combines on X86ISD::ANDNP nodes. @@ -33949,7 +33983,7 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG, if (InVT == MVT::i1) { SDValue Zero = DAG.getConstant(0, DL, VT); SDValue AllOnes = DAG.getAllOnesConstant(DL, VT); - return DAG.getNode(ISD::SELECT, DL, VT, N0, AllOnes, Zero); + return DAG.getSelect(DL, VT, N0, AllOnes, Zero); } return SDValue(); } diff --git a/lib/Target/X86/X86InstrCompiler.td b/lib/Target/X86/X86InstrCompiler.td index 3dc673e3c35a..d003d027ddb9 100644 --- a/lib/Target/X86/X86InstrCompiler.td +++ b/lib/Target/X86/X86InstrCompiler.td @@ -43,7 +43,8 @@ let hasSideEffects = 0, isNotDuplicable = 1, Uses = [ESP] in // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become // sub / add which can clobber EFLAGS. 
 let Defs = [ESP, EFLAGS], Uses = [ESP] in {
-def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+def ADJCALLSTACKDOWN32 : I<0, Pseudo, (outs),
+                           (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
                            "#ADJCALLSTACKDOWN",
                            []>,
                           Requires<[NotLP64]>;
@@ -52,8 +53,8 @@ def ADJCALLSTACKUP32 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
                           Requires<[NotLP64]>;
 }
-def : Pat<(X86callseq_start timm:$amt1),
-          (ADJCALLSTACKDOWN32 i32imm:$amt1, 0)>, Requires<[NotLP64]>;
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+          (ADJCALLSTACKDOWN32 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[NotLP64]>;
 
 
 // ADJCALLSTACKDOWN/UP implicitly use/def RSP because they may be expanded into
@@ -62,7 +63,8 @@ def : Pat<(X86callseq_start timm:$amt1),
 // Pessimistically assume ADJCALLSTACKDOWN / ADJCALLSTACKUP will become
 // sub / add which can clobber EFLAGS.
 let Defs = [RSP, EFLAGS], Uses = [RSP] in {
-def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
+def ADJCALLSTACKDOWN64 : I<0, Pseudo, (outs),
+                           (ins i32imm:$amt1, i32imm:$amt2, i32imm:$amt3),
                            "#ADJCALLSTACKDOWN",
                            []>,
                           Requires<[IsLP64]>;
@@ -71,8 +73,8 @@ def ADJCALLSTACKUP64 : I<0, Pseudo, (outs), (ins i32imm:$amt1, i32imm:$amt2),
                            [(X86callseq_end timm:$amt1, timm:$amt2)]>,
                           Requires<[IsLP64]>;
 }
-def : Pat<(X86callseq_start timm:$amt1),
-          (ADJCALLSTACKDOWN64 i32imm:$amt1, 0)>, Requires<[IsLP64]>;
+def : Pat<(X86callseq_start timm:$amt1, timm:$amt2),
+          (ADJCALLSTACKDOWN64 i32imm:$amt1, i32imm:$amt2, 0)>, Requires<[IsLP64]>;
 
 // x86-64 va_start lowering magic.
diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp
index 888daa275265..092ceb207ada 100644
--- a/lib/Target/X86/X86InstrInfo.cpp
+++ b/lib/Target/X86/X86InstrInfo.cpp
@@ -5729,6 +5729,44 @@ static X86::CondCode getSwappedCondition(X86::CondCode CC) {
   }
 }
 
+std::pair<X86::CondCode, bool>
+X86::getX86ConditionCode(CmpInst::Predicate Predicate) {
+  X86::CondCode CC = X86::COND_INVALID;
+  bool NeedSwap = false;
+  switch (Predicate) {
+  default: break;
+  // Floating-point Predicates
+  case CmpInst::FCMP_UEQ: CC = X86::COND_E; break;
+  case CmpInst::FCMP_OLT: NeedSwap = true; LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_OGT: CC = X86::COND_A; break;
+  case CmpInst::FCMP_OLE: NeedSwap = true; LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_OGE: CC = X86::COND_AE; break;
+  case CmpInst::FCMP_UGT: NeedSwap = true; LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_ULT: CC = X86::COND_B; break;
+  case CmpInst::FCMP_UGE: NeedSwap = true; LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_ULE: CC = X86::COND_BE; break;
+  case CmpInst::FCMP_ONE: CC = X86::COND_NE; break;
+  case CmpInst::FCMP_UNO: CC = X86::COND_P; break;
+  case CmpInst::FCMP_ORD: CC = X86::COND_NP; break;
+  case CmpInst::FCMP_OEQ: LLVM_FALLTHROUGH;
+  case CmpInst::FCMP_UNE: CC = X86::COND_INVALID; break;
+
+  // Integer Predicates
+  case CmpInst::ICMP_EQ:  CC = X86::COND_E; break;
+  case CmpInst::ICMP_NE:  CC = X86::COND_NE; break;
+  case CmpInst::ICMP_UGT: CC = X86::COND_A; break;
+  case CmpInst::ICMP_UGE: CC = X86::COND_AE; break;
+  case CmpInst::ICMP_ULT: CC = X86::COND_B; break;
+  case CmpInst::ICMP_ULE: CC = X86::COND_BE; break;
+  case CmpInst::ICMP_SGT: CC = X86::COND_G; break;
+  case CmpInst::ICMP_SGE: CC = X86::COND_GE; break;
+  case CmpInst::ICMP_SLT: CC = X86::COND_L; break;
+  case CmpInst::ICMP_SLE: CC = X86::COND_LE; break;
+  }
+
+  return std::make_pair(CC, NeedSwap);
+}
+
 /// Return a set opcode for the given condition and
 /// whether it has memory operand.
unsigned X86::getSETFromCond(CondCode CC, bool HasMemoryOperand) {
@@ -7589,6 +7627,13 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
     return Expand2AddrUndef(MIB,
                             get(HasAVX ? X86::VPCMPEQDrr : X86::PCMPEQDrr));
   case X86::AVX2_SETALLONES:
     return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
+  case X86::AVX1_SETALLONES: {
+    unsigned Reg = MIB->getOperand(0).getReg();
+    // VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
+    MIB->setDesc(get(X86::VCMPPSYrri));
+    MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
+    return true;
+  }
   case X86::AVX512_512_SETALLONES: {
     unsigned Reg = MIB->getOperand(0).getReg();
     MIB->setDesc(get(X86::VPTERNLOGDZrri));
@@ -8477,6 +8522,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
       Alignment = 64;
       break;
     case X86::AVX2_SETALLONES:
+    case X86::AVX1_SETALLONES:
    case X86::AVX_SET0:
     case X86::AVX512_256_SET0:
       Alignment = 32;
@@ -8522,6 +8568,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
   case X86::V_SET0:
   case X86::V_SETALLONES:
   case X86::AVX2_SETALLONES:
+  case X86::AVX1_SETALLONES:
   case X86::AVX_SET0:
   case X86::AVX512_128_SET0:
   case X86::AVX512_256_SET0:
@@ -8563,13 +8610,14 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
     else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()),16);
     else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
-             Opc == X86::AVX512_256_SET0)
+             Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 8);
     else
       Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4);
     bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
-                      Opc == X86::AVX512_512_SETALLONES);
+                      Opc == X86::AVX512_512_SETALLONES ||
+                      Opc == X86::AVX1_SETALLONES);
     const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty)
                                   : Constant::getNullValue(Ty);
     unsigned CPI = MCP.getConstantPoolIndex(C, Alignment);
diff --git a/lib/Target/X86/X86InstrInfo.h b/lib/Target/X86/X86InstrInfo.h
index 38567831b3a4..e64876073ccf 100644
--- a/lib/Target/X86/X86InstrInfo.h
+++ b/lib/Target/X86/X86InstrInfo.h
@@ -64,6 +64,10 @@ enum CondCode {
 // Turn condition code into conditional branch opcode.
 unsigned GetCondBranchFromCond(CondCode CC);
 
+/// \brief Return a pair of the condition code for the given predicate and
+/// whether the instruction operands should be swapped to match the condition
+/// code.
+std::pair<CondCode, bool> getX86ConditionCode(CmpInst::Predicate Predicate);
+
 /// \brief Return a set opcode for the given condition and whether it has
 /// a memory operand.
 unsigned getSETFromCond(CondCode CC, bool HasMemoryOperand = false);
@@ -186,6 +190,8 @@ public:
   /// setup..destroy sequence (e.g. by pushes, or inside the callee).
   int64_t getFrameAdjustment(const MachineInstr &I) const {
     assert(isFrameInstr(I));
+    if (isFrameSetup(I))
+      return I.getOperand(2).getImm();
     return I.getOperand(1).getImm();
   }
 
@@ -193,7 +199,10 @@ public:
   /// instruction.
void setFrameAdjustment(MachineInstr &I, int64_t V) const { assert(isFrameInstr(I)); - I.getOperand(1).setImm(V); + if (isFrameSetup(I)) + I.getOperand(2).setImm(V); + else + I.getOperand(1).setImm(V); } /// getSPAdjust - This returns the stack pointer adjustment made by diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 902b0c2c04e3..4d7d8ece92d9 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -84,7 +84,8 @@ def SDTLockBinaryArithWithFlags : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, def SDTX86Ret : SDTypeProfile<0, -1, [SDTCisVT<0, i32>]>; -def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>]>; +def SDT_X86CallSeqStart : SDCallSeqStart<[SDTCisVT<0, i32>, + SDTCisVT<1, i32>]>; def SDT_X86CallSeqEnd : SDCallSeqEnd<[SDTCisVT<0, i32>, SDTCisVT<1, i32>]>; @@ -2351,6 +2352,38 @@ let Predicates = [HasBMI2] in { def : Pat<(and (loadi64 addr:$src), (add (shl 1, GR8:$lz), -1)), (BZHI64rm addr:$src, (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR8:$lz, sub_8bit))>; + + // x & (-1 >> (32 - y)) + def : Pat<(and GR32:$src, (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), + (BZHI32rr GR32:$src, GR32:$lz)>; + def : Pat<(and (loadi32 addr:$src), (srl -1, (i8 (trunc (sub 32, GR32:$lz))))), + (BZHI32rm addr:$src, GR32:$lz)>; + + // x & (-1 >> (64 - y)) + def : Pat<(and GR64:$src, (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + def : Pat<(and (loadi64 addr:$src), (srl -1, (i8 (trunc (sub 64, GR32:$lz))))), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + + // x << (32 - y) >> (32 - y) + def : Pat<(srl (shl GR32:$src, (i8 (trunc (sub 32, GR32:$lz)))), + (i8 (trunc (sub 32, GR32:$lz)))), + (BZHI32rr GR32:$src, GR32:$lz)>; + def : Pat<(srl (shl (loadi32 addr:$src), (i8 (trunc (sub 32, GR32:$lz)))), + (i8 (trunc (sub 32, GR32:$lz)))), + (BZHI32rm addr:$src, GR32:$lz)>; + + // x << (64 - y) >> (64 - y) + def : Pat<(srl (shl GR64:$src, (i8 (trunc (sub 64, GR32:$lz)))), + (i8 (trunc (sub 64, GR32:$lz)))), + (BZHI64rr GR64:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; + def : Pat<(srl (shl (loadi64 addr:$src), (i8 (trunc (sub 64, GR32:$lz)))), + (i8 (trunc (sub 64, GR32:$lz)))), + (BZHI64rm addr:$src, + (INSERT_SUBREG (i64 (IMPLICIT_DEF)), GR32:$lz, sub_32bit))>; } // HasBMI2 let Predicates = [HasBMI] in { diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 48da2fa607af..f73d85e7e01b 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -486,6 +486,10 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, isPseudo = 1, SchedRW = [WriteZero] in { def V_SETALLONES : I<0, Pseudo, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>; + let Predicates = [HasAVX1Only, OptForMinSize] in { + def AVX1_SETALLONES: I<0, Pseudo, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>; + } let Predicates = [HasAVX2] in def AVX2_SETALLONES : I<0, Pseudo, (outs VR256:$dst), (ins), "", [(set VR256:$dst, (v8i32 immAllOnesV))]>; @@ -7755,14 +7759,12 @@ def VINSERTF128rm : AVXAIi8<0x18, MRMSrcMem, (outs VR256:$dst), []>, Sched<[WriteFShuffleLd, ReadAfterLd]>, VEX_4V, VEX_L; } - -// Without AVX2 we need to concat two v4i32 V_SETALLONES to create a 256-bit -// all ones value. 
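The new BZHI patterns above all compute "keep the low y bits of x". A scalar sanity check (illustrative, assuming 0 < n < 32 so every shift stays in range, matching the shift-amount forms the patterns match):

#include <cassert>
#include <cstdint>

// Reference: keep the low n bits of x (what BZHI computes).
uint32_t lowBits(uint32_t x, unsigned n) { return x & ((1u << n) - 1); }

int main() {
  for (uint32_t x : {0u, 0xDEADBEEFu, 0xFFFFFFFFu}) {
    for (unsigned n = 1; n < 32; ++n) {
      assert((x & (0xFFFFFFFFu >> (32 - n))) == lowBits(x, n)); // x & (-1 >> (32 - y))
      assert(((x << (32 - n)) >> (32 - n)) == lowBits(x, n));   // x << (32 - y) >> (32 - y)
    }
  }
}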
-let Predicates = [HasAVX1Only] in
-def : Pat<(v8i32 immAllOnesV),
-          (VINSERTF128rr
-          (INSERT_SUBREG (v8i32 (IMPLICIT_DEF)), (V_SETALLONES), sub_xmm),
-          (V_SETALLONES), 1)>;
+// To create a 256-bit all ones value, we should produce VCMPTRUEPS
+// with YMM register containing zero.
+// FIXME: Avoid producing vxorps to clear the fake inputs.
+let Predicates = [HasAVX1Only] in {
+def : Pat<(v8i32 immAllOnesV), (VCMPPSYrri (AVX_SET0), (AVX_SET0), 0xf)>;
+}
 
 multiclass vinsert_lowering {
diff --git a/lib/Target/X86/X86InstructionSelector.cpp b/lib/Target/X86/X86InstructionSelector.cpp
index d65eb1de8d09..de58d719acb4 100644
--- a/lib/Target/X86/X86InstructionSelector.cpp
+++ b/lib/Target/X86/X86InstructionSelector.cpp
@@ -56,13 +56,9 @@ private:
   bool selectImpl(MachineInstr &I) const;
 
   // TODO: remove after supported by Tablegen-erated instruction selection.
-  unsigned getFAddOp(LLT &Ty, const RegisterBank &RB) const;
-  unsigned getFSubOp(LLT &Ty, const RegisterBank &RB) const;
   unsigned getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc,
                           uint64_t Alignment) const;
 
-  bool selectBinaryOp(MachineInstr &I, MachineRegisterInfo &MRI,
-                      MachineFunction &MF) const;
   bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI,
                          MachineFunction &MF) const;
   bool selectFrameIndexOrGep(MachineInstr &I, MachineRegisterInfo &MRI,
@@ -71,6 +67,10 @@ private:
                          MachineFunction &MF) const;
   bool selectTrunc(MachineInstr &I, MachineRegisterInfo &MRI,
                    MachineFunction &MF) const;
+  bool selectZext(MachineInstr &I, MachineRegisterInfo &MRI,
+                  MachineFunction &MF) const;
+  bool selectCmp(MachineInstr &I, MachineRegisterInfo &MRI,
+                 MachineFunction &MF) const;
 
   const X86TargetMachine &TM;
   const X86Subtarget &STI;
@@ -226,13 +226,11 @@ bool X86InstructionSelector::select(MachineInstr &I) const {
          "Generic instruction has unexpected implicit operands\n");
 
   if (selectImpl(I))
-      return true;
+    return true;
 
   DEBUG(dbgs() << " C++ instruction selection: "; I.print(dbgs()));
 
   // TODO: This should be implemented by tblgen.
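The VCMPTRUEPS idiom introduced above (both the AVX1_SETALLONES expansion and the HasAVX1Only pattern) can be written with intrinsics as follows — an illustrative sketch assuming an AVX build (-mavx), not part of the patch:

#include <immintrin.h>

// Predicate 0xf (_CMP_TRUE_UQ) is true for every lane, NaNs included,
// so comparing a register with itself -- even a zeroed "fake" input --
// yields all-ones without a constant-pool load.
__m256 allOnes256() {
  __m256 z = _mm256_setzero_ps();            // the AVX_SET0 fake input
  return _mm256_cmp_ps(z, z, _CMP_TRUE_UQ);  // VCMPPS ymm, ymm, ymm, 0xf
}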
- if (selectBinaryOp(I, MRI, MF)) - return true; if (selectLoadStoreOp(I, MRI, MF)) return true; if (selectFrameIndexOrGep(I, MRI, MF)) @@ -241,109 +239,14 @@ bool X86InstructionSelector::select(MachineInstr &I) const { return true; if (selectTrunc(I, MRI, MF)) return true; + if (selectZext(I, MRI, MF)) + return true; + if (selectCmp(I, MRI, MF)) + return true; return false; } -unsigned X86InstructionSelector::getFAddOp(LLT &Ty, - const RegisterBank &RB) const { - - if (X86::VECRRegBankID != RB.getID()) - return TargetOpcode::G_FADD; - - if (Ty == LLT::scalar(32)) { - if (STI.hasAVX512()) { - return X86::VADDSSZrr; - } else if (STI.hasAVX()) { - return X86::VADDSSrr; - } else if (STI.hasSSE1()) { - return X86::ADDSSrr; - } - } else if (Ty == LLT::scalar(64)) { - if (STI.hasAVX512()) { - return X86::VADDSDZrr; - } else if (STI.hasAVX()) { - return X86::VADDSDrr; - } else if (STI.hasSSE2()) { - return X86::ADDSDrr; - } - } else if (Ty == LLT::vector(4, 32)) { - if ((STI.hasAVX512()) && (STI.hasVLX())) { - return X86::VADDPSZ128rr; - } else if (STI.hasAVX()) { - return X86::VADDPSrr; - } else if (STI.hasSSE1()) { - return X86::ADDPSrr; - } - } - - return TargetOpcode::G_FADD; -} - -unsigned X86InstructionSelector::getFSubOp(LLT &Ty, - const RegisterBank &RB) const { - - if (X86::VECRRegBankID != RB.getID()) - return TargetOpcode::G_FSUB; - - if (Ty == LLT::scalar(32)) { - if (STI.hasAVX512()) { - return X86::VSUBSSZrr; - } else if (STI.hasAVX()) { - return X86::VSUBSSrr; - } else if (STI.hasSSE1()) { - return X86::SUBSSrr; - } - } else if (Ty == LLT::scalar(64)) { - if (STI.hasAVX512()) { - return X86::VSUBSDZrr; - } else if (STI.hasAVX()) { - return X86::VSUBSDrr; - } else if (STI.hasSSE2()) { - return X86::SUBSDrr; - } - } else if (Ty == LLT::vector(4, 32)) { - if ((STI.hasAVX512()) && (STI.hasVLX())) { - return X86::VSUBPSZ128rr; - } else if (STI.hasAVX()) { - return X86::VSUBPSrr; - } else if (STI.hasSSE1()) { - return X86::SUBPSrr; - } - } - - return TargetOpcode::G_FSUB; -} - -bool X86InstructionSelector::selectBinaryOp(MachineInstr &I, - MachineRegisterInfo &MRI, - MachineFunction &MF) const { - - const unsigned DefReg = I.getOperand(0).getReg(); - LLT Ty = MRI.getType(DefReg); - const RegisterBank &RB = *RBI.getRegBank(DefReg, MRI, TRI); - - unsigned NewOpc = I.getOpcode(); - - switch (NewOpc) { - case TargetOpcode::G_FADD: - NewOpc = getFAddOp(Ty, RB); - break; - case TargetOpcode::G_FSUB: - NewOpc = getFSubOp(Ty, RB); - break; - default: - break; - } - - if (NewOpc == I.getOpcode()) - return false; - - I.setDesc(TII.get(NewOpc)); - - return constrainSelectedInstRegOperands(I, TII, TRI, RBI); -} - unsigned X86InstructionSelector::getLoadStoreOp(LLT &Ty, const RegisterBank &RB, unsigned Opc, uint64_t Alignment) const { @@ -562,6 +465,105 @@ bool X86InstructionSelector::selectTrunc(MachineInstr &I, return true; } +bool X86InstructionSelector::selectZext(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_ZEXT) + return false; + + const unsigned DstReg = I.getOperand(0).getReg(); + const unsigned SrcReg = I.getOperand(1).getReg(); + + const LLT DstTy = MRI.getType(DstReg); + const LLT SrcTy = MRI.getType(SrcReg); + + if (SrcTy == LLT::scalar(1)) { + + unsigned AndOpc; + if (DstTy == LLT::scalar(32)) + AndOpc = X86::AND32ri8; + else if (DstTy == LLT::scalar(64)) + AndOpc = X86::AND64ri8; + else + return false; + + const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI); + unsigned DefReg = + 
MRI.createVirtualRegister(getRegClassForTypeOnBank(DstTy, RegBank)); + + BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(TargetOpcode::SUBREG_TO_REG), DefReg) + .addImm(0) + .addReg(SrcReg) + .addImm(X86::sub_8bit); + + MachineInstr &AndInst = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(AndOpc), DstReg) + .addReg(DefReg) + .addImm(1); + + constrainSelectedInstRegOperands(AndInst, TII, TRI, RBI); + + I.eraseFromParent(); + return true; + } + + return false; +} + +bool X86InstructionSelector::selectCmp(MachineInstr &I, + MachineRegisterInfo &MRI, + MachineFunction &MF) const { + if (I.getOpcode() != TargetOpcode::G_ICMP) + return false; + + X86::CondCode CC; + bool SwapArgs; + std::tie(CC, SwapArgs) = X86::getX86ConditionCode( + (CmpInst::Predicate)I.getOperand(1).getPredicate()); + unsigned OpSet = X86::getSETFromCond(CC); + + unsigned LHS = I.getOperand(2).getReg(); + unsigned RHS = I.getOperand(3).getReg(); + + if (SwapArgs) + std::swap(LHS, RHS); + + unsigned OpCmp; + LLT Ty = MRI.getType(LHS); + + switch (Ty.getSizeInBits()) { + default: + return false; + case 8: + OpCmp = X86::CMP8rr; + break; + case 16: + OpCmp = X86::CMP16rr; + break; + case 32: + OpCmp = X86::CMP32rr; + break; + case 64: + OpCmp = X86::CMP64rr; + break; + } + + MachineInstr &CmpInst = + *BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(OpCmp)) + .addReg(LHS) + .addReg(RHS); + + MachineInstr &SetInst = *BuildMI(*I.getParent(), I, I.getDebugLoc(), + TII.get(OpSet), I.getOperand(0).getReg()); + + constrainSelectedInstRegOperands(CmpInst, TII, TRI, RBI); + constrainSelectedInstRegOperands(SetInst, TII, TRI, RBI); + + I.eraseFromParent(); + return true; +} + InstructionSelector * llvm::createX86InstructionSelector(const X86TargetMachine &TM, X86Subtarget &Subtarget, diff --git a/lib/Target/X86/X86IntrinsicsInfo.h b/lib/Target/X86/X86IntrinsicsInfo.h index 2a40399ba571..bc73bb1ae8c5 100644 --- a/lib/Target/X86/X86IntrinsicsInfo.h +++ b/lib/Target/X86/X86IntrinsicsInfo.h @@ -36,7 +36,7 @@ enum IntrinsicType : uint16_t { TRUNCATE_TO_MEM_VI8, TRUNCATE_TO_MEM_VI16, TRUNCATE_TO_MEM_VI32, EXPAND_FROM_MEM, TERLOG_OP_MASK, TERLOG_OP_MASKZ, BROADCASTM, KUNPCK, FIXUPIMM, FIXUPIMM_MASKZ, FIXUPIMMS, - FIXUPIMMS_MASKZ, CONVERT_MASK_TO_VEC, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP, + FIXUPIMMS_MASKZ, CONVERT_TO_MASK, GATHER_AVX2, MASK_BINOP, }; struct IntrinsicData { diff --git a/lib/Target/X86/X86LegalizerInfo.cpp b/lib/Target/X86/X86LegalizerInfo.cpp index 4f5e70414aa9..cf26238c0239 100644 --- a/lib/Target/X86/X86LegalizerInfo.cpp +++ b/lib/Target/X86/X86LegalizerInfo.cpp @@ -87,10 +87,16 @@ void X86LegalizerInfo::setLegalizerInfo32bit() { setAction({G_ZEXT, s32}, Legal); setAction({G_SEXT, s32}, Legal); - for (auto Ty : {s8, s16}) { + for (auto Ty : {s1, s8, s16}) { setAction({G_ZEXT, 1, Ty}, Legal); setAction({G_SEXT, 1, Ty}, Legal); } + + // Comparison + setAction({G_ICMP, s1}, Legal); + + for (auto Ty : {s8, s16, s32, p0}) + setAction({G_ICMP, 1, Ty}, Legal); } void X86LegalizerInfo::setLegalizerInfo64bit() { @@ -139,10 +145,16 @@ void X86LegalizerInfo::setLegalizerInfo64bit() { setAction({G_SEXT, Ty}, Legal); } - for (auto Ty : {s8, s16, s32}) { + for (auto Ty : {s1, s8, s16, s32}) { setAction({G_ZEXT, 1, Ty}, Legal); setAction({G_SEXT, 1, Ty}, Legal); } + + // Comparison + setAction({G_ICMP, s1}, Legal); + + for (auto Ty : {s8, s16, s32, s64, p0}) + setAction({G_ICMP, 1, Ty}, Legal); } void X86LegalizerInfo::setLegalizerInfoSSE1() { diff --git a/lib/Target/X86/X86RegisterInfo.cpp 
b/lib/Target/X86/X86RegisterInfo.cpp
index cf2ceef8013a..7e4cba1c8345 100644
--- a/lib/Target/X86/X86RegisterInfo.cpp
+++ b/lib/Target/X86/X86RegisterInfo.cpp
@@ -320,14 +320,14 @@ X86RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
   case CallingConv::X86_RegCall:
     if (Is64Bit) {
       if (IsWin64) {
-        return (HasSSE ? CSR_Win64_RegCall_SaveList : 
+        return (HasSSE ? CSR_Win64_RegCall_SaveList :
                          CSR_Win64_RegCall_NoSSE_SaveList);
       } else {
-        return (HasSSE ? CSR_SysV64_RegCall_SaveList : 
+        return (HasSSE ? CSR_SysV64_RegCall_SaveList :
                          CSR_SysV64_RegCall_NoSSE_SaveList);
       }
     } else {
-      return (HasSSE ? CSR_32_RegCall_SaveList : 
+      return (HasSSE ? CSR_32_RegCall_SaveList :
                        CSR_32_RegCall_NoSSE_SaveList);
     }
   case CallingConv::Cold:
@@ -435,15 +435,15 @@ X86RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
     return CSR_64_HHVM_RegMask;
   case CallingConv::X86_RegCall:
     if (Is64Bit) {
-      if (IsWin64) { 
-        return (HasSSE ? CSR_Win64_RegCall_RegMask : 
+      if (IsWin64) {
+        return (HasSSE ? CSR_Win64_RegCall_RegMask :
                          CSR_Win64_RegCall_NoSSE_RegMask);
       } else {
-        return (HasSSE ? CSR_SysV64_RegCall_RegMask : 
+        return (HasSSE ? CSR_SysV64_RegCall_RegMask :
                          CSR_SysV64_RegCall_NoSSE_RegMask);
       }
     } else {
-      return (HasSSE ? CSR_32_RegCall_RegMask : 
+      return (HasSSE ? CSR_32_RegCall_RegMask :
                        CSR_32_RegCall_NoSSE_RegMask);
     }
   case CallingConv::Cold:
diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h
index de1514243aeb..02be95e2e556 100644
--- a/lib/Target/X86/X86Subtarget.h
+++ b/lib/Target/X86/X86Subtarget.h
@@ -253,6 +253,11 @@ protected:
   /// True if the LEA instruction with certain arguments is slow
   bool SlowLEA;
 
+  /// True if the LEA instruction has all three source operands: base, index,
+  /// and offset, or if the LEA instruction uses base and index registers where
+  /// the base is EBP, RBP, or R13
+  bool Slow3OpsLEA;
+
   /// True if INC and DEC instructions are slow when writing to flags
   bool SlowIncDec;
 
@@ -490,6 +495,7 @@ public:
   bool callRegIndirect() const { return CallRegIndirect; }
   bool LEAusesAG() const { return LEAUsesAG; }
   bool slowLEA() const { return SlowLEA; }
+  bool slow3OpsLEA() const { return Slow3OpsLEA; }
   bool slowIncDec() const { return SlowIncDec; }
   bool hasCDI() const { return HasCDI; }
   bool hasPFI() const { return HasPFI; }
diff --git a/lib/Target/X86/X86TargetMachine.cpp b/lib/Target/X86/X86TargetMachine.cpp
index 086f55dd60b5..c6a90725d89c 100644
--- a/lib/Target/X86/X86TargetMachine.cpp
+++ b/lib/Target/X86/X86TargetMachine.cpp
@@ -61,6 +61,7 @@ static cl::opt<bool> EnableMachineCombinerPass("x86-machine-combiner",
 namespace llvm {
 
 void initializeWinEHStatePassPass(PassRegistry &);
+void initializeFixupLEAPassPass(PassRegistry &);
 void initializeX86ExecutionDepsFixPass(PassRegistry &);
 
 } // end namespace llvm
@@ -75,6 +76,7 @@ extern "C" void LLVMInitializeX86Target() {
   initializeWinEHStatePassPass(PR);
   initializeFixupBWInstPassPass(PR);
   initializeEvexToVexInstPassPass(PR);
+  initializeFixupLEAPassPass(PR);
   initializeX86ExecutionDepsFixPass(PR);
 }
 
@@ -87,7 +89,7 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
   if (TT.isOSFreeBSD())
     return llvm::make_unique<X86FreeBSDTargetObjectFile>();
-  if (TT.isOSLinux() || TT.isOSNaCl())
+  if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU())
     return llvm::make_unique<X86LinuxNaClTargetObjectFile>();
   if (TT.isOSFuchsia())
     return llvm::make_unique<X86FuchsiaTargetObjectFile>();
diff --git a/lib/Target/X86/X86TargetTransformInfo.cpp b/lib/Target/X86/X86TargetTransformInfo.cpp
index f3b619a2956a..80e18161a94b 100644
--- a/lib/Target/X86/X86TargetTransformInfo.cpp
+++
b/lib/Target/X86/X86TargetTransformInfo.cpp @@ -247,35 +247,38 @@ int X86TTIImpl::getArithmeticInstrCost( } static const CostTblEntry SSE2UniformConstCostTable[] = { - { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. - { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. - { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. - - { ISD::SHL, MVT::v32i8, 4 }, // 2*(psllw + pand). - { ISD::SRL, MVT::v32i8, 4 }, // 2*(psrlw + pand). - { ISD::SRA, MVT::v32i8, 8 }, // 2*(psrlw, pand, pxor, psubb). - - { ISD::SDIV, MVT::v16i16, 12 }, // pmulhw sequence - { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence - { ISD::UDIV, MVT::v16i16, 12 }, // pmulhuw sequence - { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence - { ISD::SDIV, MVT::v8i32, 38 }, // pmuludq sequence - { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence - { ISD::UDIV, MVT::v8i32, 30 }, // pmuludq sequence - { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence + { ISD::SHL, MVT::v16i8, 2 }, // psllw + pand. + { ISD::SRL, MVT::v16i8, 2 }, // psrlw + pand. + { ISD::SRA, MVT::v16i8, 4 }, // psrlw, pand, pxor, psubb. + + { ISD::SHL, MVT::v32i8, 4+2 }, // 2*(psllw + pand) + split. + { ISD::SRL, MVT::v32i8, 4+2 }, // 2*(psrlw + pand) + split. + { ISD::SRA, MVT::v32i8, 8+2 }, // 2*(psrlw, pand, pxor, psubb) + split. + + { ISD::SDIV, MVT::v16i16, 12+2 }, // 2*pmulhw sequence + split. + { ISD::SDIV, MVT::v8i16, 6 }, // pmulhw sequence + { ISD::UDIV, MVT::v16i16, 12+2 }, // 2*pmulhuw sequence + split. + { ISD::UDIV, MVT::v8i16, 6 }, // pmulhuw sequence + { ISD::SDIV, MVT::v8i32, 38+2 }, // 2*pmuludq sequence + split. + { ISD::SDIV, MVT::v4i32, 19 }, // pmuludq sequence + { ISD::UDIV, MVT::v8i32, 30+2 }, // 2*pmuludq sequence + split. + { ISD::UDIV, MVT::v4i32, 15 }, // pmuludq sequence }; if (Op2Info == TargetTransformInfo::OK_UniformConstantValue && ST->hasSSE2()) { // pmuldq sequence. if (ISD == ISD::SDIV && LT.second == MVT::v8i32 && ST->hasAVX()) - return LT.first * 30; + return LT.first * 32; if (ISD == ISD::SDIV && LT.second == MVT::v4i32 && ST->hasSSE41()) return LT.first * 15; - if (const auto *Entry = CostTableLookup(SSE2UniformConstCostTable, ISD, - LT.second)) - return LT.first * Entry->Cost; + // XOP has faster vXi8 shifts. + if ((ISD != ISD::SHL && ISD != ISD::SRL && ISD != ISD::SRA) || + !ST->hasXOP()) + if (const auto *Entry = + CostTableLookup(SSE2UniformConstCostTable, ISD, LT.second)) + return LT.first * Entry->Cost; } static const CostTblEntry AVX2UniformCostTable[] = { @@ -430,18 +433,18 @@ int X86TTIImpl::getArithmeticInstrCost( { ISD::SRL, MVT::v2i64, 2 }, { ISD::SRA, MVT::v2i64, 2 }, // 256bit shifts require splitting if AVX2 didn't catch them above. - { ISD::SHL, MVT::v32i8, 2 }, - { ISD::SRL, MVT::v32i8, 4 }, - { ISD::SRA, MVT::v32i8, 4 }, - { ISD::SHL, MVT::v16i16, 2 }, - { ISD::SRL, MVT::v16i16, 4 }, - { ISD::SRA, MVT::v16i16, 4 }, - { ISD::SHL, MVT::v8i32, 2 }, - { ISD::SRL, MVT::v8i32, 4 }, - { ISD::SRA, MVT::v8i32, 4 }, - { ISD::SHL, MVT::v4i64, 2 }, - { ISD::SRL, MVT::v4i64, 4 }, - { ISD::SRA, MVT::v4i64, 4 }, + { ISD::SHL, MVT::v32i8, 2+2 }, + { ISD::SRL, MVT::v32i8, 4+2 }, + { ISD::SRA, MVT::v32i8, 4+2 }, + { ISD::SHL, MVT::v16i16, 2+2 }, + { ISD::SRL, MVT::v16i16, 4+2 }, + { ISD::SRA, MVT::v16i16, 4+2 }, + { ISD::SHL, MVT::v8i32, 2+2 }, + { ISD::SRL, MVT::v8i32, 4+2 }, + { ISD::SRA, MVT::v8i32, 4+2 }, + { ISD::SHL, MVT::v4i64, 2+2 }, + { ISD::SRL, MVT::v4i64, 4+2 }, + { ISD::SRA, MVT::v4i64, 4+2 }, }; // Look for XOP lowering tricks. 
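A note on the recurring "+2" terms in these cost tables (a reading of the table comments above, not new data): pre-AVX2 targets execute these 256-bit operations as two 128-bit halves, so each entry is twice the 128-bit cost plus roughly two extra instructions to split and rejoin the vector. For example, the v32i8 SHL entry of 4+2 means two psllw+pand pairs (cost 2 each) plus the split/merge overhead, so such a target is charged 6 rather than 4.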
@@ -451,23 +454,28 @@ int X86TTIImpl::getArithmeticInstrCost( static const CostTblEntry SSE2UniformShiftCostTable[] = { // Uniform splats are cheaper for the following instructions. - { ISD::SHL, MVT::v16i16, 2 }, // psllw. - { ISD::SHL, MVT::v8i32, 2 }, // pslld - { ISD::SHL, MVT::v4i64, 2 }, // psllq. - - { ISD::SRL, MVT::v16i16, 2 }, // psrlw. - { ISD::SRL, MVT::v8i32, 2 }, // psrld. - { ISD::SRL, MVT::v4i64, 2 }, // psrlq. - - { ISD::SRA, MVT::v16i16, 2 }, // psraw. - { ISD::SRA, MVT::v8i32, 2 }, // psrad. - { ISD::SRA, MVT::v2i64, 4 }, // 2 x psrad + shuffle. - { ISD::SRA, MVT::v4i64, 8 }, // 2 x psrad + shuffle. + { ISD::SHL, MVT::v16i16, 2+2 }, // 2*psllw + split. + { ISD::SHL, MVT::v8i32, 2+2 }, // 2*pslld + split. + { ISD::SHL, MVT::v4i64, 2+2 }, // 2*psllq + split. + + { ISD::SRL, MVT::v16i16, 2+2 }, // 2*psrlw + split. + { ISD::SRL, MVT::v8i32, 2+2 }, // 2*psrld + split. + { ISD::SRL, MVT::v4i64, 2+2 }, // 2*psrlq + split. + + { ISD::SRA, MVT::v16i16, 2+2 }, // 2*psraw + split. + { ISD::SRA, MVT::v8i32, 2+2 }, // 2*psrad + split. + { ISD::SRA, MVT::v2i64, 4 }, // 2*psrad + shuffle. + { ISD::SRA, MVT::v4i64, 8+2 }, // 2*(2*psrad + shuffle) + split. }; if (ST->hasSSE2() && ((Op2Info == TargetTransformInfo::OK_UniformConstantValue) || (Op2Info == TargetTransformInfo::OK_UniformValue))) { + + // Handle AVX2 uniform v4i64 ISD::SRA, it's not worth a table. + if (ISD == ISD::SRA && LT.second == MVT::v4i64 && ST->hasAVX2()) + return LT.first * 4; // 2*psrad + shuffle. + if (const auto *Entry = CostTableLookup(SSE2UniformShiftCostTable, ISD, LT.second)) return LT.first * Entry->Cost; @@ -581,28 +589,28 @@ int X86TTIImpl::getArithmeticInstrCost( return LT.first * Entry->Cost; static const CostTblEntry SSE41CostTable[] = { - { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. - { ISD::SHL, MVT::v32i8, 2*11 }, // pblendvb sequence. - { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. - { ISD::SHL, MVT::v16i16, 2*14 }, // pblendvb sequence. - { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld - { ISD::SHL, MVT::v8i32, 2*4 }, // pslld/paddd/cvttps2dq/pmulld - - { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. - { ISD::SRL, MVT::v32i8, 2*12 }, // pblendvb sequence. - { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence. - { ISD::SRL, MVT::v16i16, 2*14 }, // pblendvb sequence. - { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend. - { ISD::SRL, MVT::v8i32, 2*11 }, // Shift each lane + blend. - - { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence. - { ISD::SRA, MVT::v32i8, 2*24 }, // pblendvb sequence. - { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence. - { ISD::SRA, MVT::v16i16, 2*14 }, // pblendvb sequence. - { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. - { ISD::SRA, MVT::v8i32, 2*12 }, // Shift each lane + blend. - - { ISD::MUL, MVT::v4i32, 1 } // pmulld + { ISD::SHL, MVT::v16i8, 11 }, // pblendvb sequence. + { ISD::SHL, MVT::v32i8, 2*11+2 }, // pblendvb sequence + split. + { ISD::SHL, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SHL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. + { ISD::SHL, MVT::v4i32, 4 }, // pslld/paddd/cvttps2dq/pmulld + { ISD::SHL, MVT::v8i32, 2*4+2 }, // pslld/paddd/cvttps2dq/pmulld + split + + { ISD::SRL, MVT::v16i8, 12 }, // pblendvb sequence. + { ISD::SRL, MVT::v32i8, 2*12+2 }, // pblendvb sequence + split. + { ISD::SRL, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SRL, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. + { ISD::SRL, MVT::v4i32, 11 }, // Shift each lane + blend. 
+ { ISD::SRL, MVT::v8i32, 2*11+2 }, // Shift each lane + blend + split. + + { ISD::SRA, MVT::v16i8, 24 }, // pblendvb sequence. + { ISD::SRA, MVT::v32i8, 2*24+2 }, // pblendvb sequence + split. + { ISD::SRA, MVT::v8i16, 14 }, // pblendvb sequence. + { ISD::SRA, MVT::v16i16, 2*14+2 }, // pblendvb sequence + split. + { ISD::SRA, MVT::v4i32, 12 }, // Shift each lane + blend. + { ISD::SRA, MVT::v8i32, 2*12+2 }, // Shift each lane + blend + split. + + { ISD::MUL, MVT::v4i32, 1 } // pmulld }; if (ST->hasSSE41()) @@ -612,33 +620,33 @@ int X86TTIImpl::getArithmeticInstrCost( static const CostTblEntry SSE2CostTable[] = { // We don't correctly identify costs of casts because they are marked as // custom. - { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. - { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. - { ISD::SHL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. - - { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. - { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. - { ISD::SRL, MVT::v4i64, 2*4 }, // splat+shuffle sequence. - - { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. - { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. - { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. - { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. - { ISD::SRA, MVT::v4i64, 2*12 }, // srl/xor/sub sequence. - - { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. - { ISD::MUL, MVT::v8i16, 1 }, // pmullw - { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle - { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add - - { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ - { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ - { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ - { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ + { ISD::SHL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SHL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SHL, MVT::v4i32, 2*5 }, // We optimized this using mul. + { ISD::SHL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SHL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split. + + { ISD::SRL, MVT::v16i8, 26 }, // cmpgtb sequence. + { ISD::SRL, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRL, MVT::v4i32, 16 }, // Shift each lane + blend. + { ISD::SRL, MVT::v2i64, 4 }, // splat+shuffle sequence. + { ISD::SRL, MVT::v4i64, 2*4+2 }, // splat+shuffle sequence + split. + + { ISD::SRA, MVT::v16i8, 54 }, // unpacked cmpgtb sequence. + { ISD::SRA, MVT::v8i16, 32 }, // cmpgtb sequence. + { ISD::SRA, MVT::v4i32, 16 }, // Shift each lane + blend. + { ISD::SRA, MVT::v2i64, 12 }, // srl/xor/sub sequence. + { ISD::SRA, MVT::v4i64, 2*12+2 }, // srl/xor/sub sequence+split. + + { ISD::MUL, MVT::v16i8, 12 }, // extend/pmullw/trunc sequence. + { ISD::MUL, MVT::v8i16, 1 }, // pmullw + { ISD::MUL, MVT::v4i32, 6 }, // 3*pmuludq/4*shuffle + { ISD::MUL, MVT::v2i64, 8 }, // 3*pmuludq/3*shift/2*add + + { ISD::FDIV, MVT::f32, 23 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::v4f32, 39 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::f64, 38 }, // Pentium IV from http://www.agner.org/ + { ISD::FDIV, MVT::v2f64, 69 }, // Pentium IV from http://www.agner.org/ // It is not a good idea to vectorize division. 
We have to scalarize it and // in the process we will often end up having to spilling regular diff --git a/lib/Target/X86/X86WinEHState.cpp b/lib/Target/X86/X86WinEHState.cpp index 500b26b3be17..3ee14a0ff7b1 100644 --- a/lib/Target/X86/X86WinEHState.cpp +++ b/lib/Target/X86/X86WinEHState.cpp @@ -398,7 +398,7 @@ Function *WinEHStatePass::generateLSDAInEAXThunk(Function *ParentFunc) { /*isVarArg=*/false); Function *Trampoline = Function::Create(TrampolineTy, GlobalValue::InternalLinkage, - Twine("__ehhandler$") + GlobalValue::getRealLinkageName( + Twine("__ehhandler$") + GlobalValue::dropLLVMManglingEscape( ParentFunc->getName()), TheModule); BasicBlock *EntryBB = BasicBlock::Create(Context, "entry", Trampoline); diff --git a/lib/Target/XCore/XCoreISelLowering.cpp b/lib/Target/XCore/XCoreISelLowering.cpp index b8742683a0c8..1da189c5cd31 100644 --- a/lib/Target/XCore/XCoreISelLowering.cpp +++ b/lib/Target/XCore/XCoreISelLowering.cpp @@ -409,7 +409,7 @@ static bool isWordAligned(SDValue Value, SelectionDAG &DAG) { KnownBits Known; DAG.computeKnownBits(Value, Known); - return Known.Zero.countTrailingOnes() >= 2; + return Known.countMinTrailingZeros() >= 2; } SDValue XCoreTargetLowering:: @@ -1131,8 +1131,7 @@ SDValue XCoreTargetLowering::LowerCCCCallTo( unsigned NumBytes = RetCCInfo.getNextStackOffset(); auto PtrVT = getPointerTy(DAG.getDataLayout()); - Chain = DAG.getCALLSEQ_START(Chain, - DAG.getConstant(NumBytes, dl, PtrVT, true), dl); + Chain = DAG.getCALLSEQ_START(Chain, NumBytes, 0, dl); SmallVector, 4> RegsToPass; SmallVector MemOpChains; diff --git a/lib/Target/XCore/XCoreInstrInfo.td b/lib/Target/XCore/XCoreInstrInfo.td index f1d52d5a191f..b87ba6548962 100644 --- a/lib/Target/XCore/XCoreInstrInfo.td +++ b/lib/Target/XCore/XCoreInstrInfo.td @@ -73,9 +73,10 @@ def XCoreLdwsp : SDNode<"XCoreISD::LDWSP", SDT_XCoreLdwsp, [SDNPHasChain, SDNPMayLoad]>; // These are target-independent nodes, but have target-specific formats. -def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32> ]>; +def SDT_XCoreCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>, + SDTCisVT<1, i32> ]>; def SDT_XCoreCallSeqEnd : SDCallSeqEnd<[ SDTCisVT<0, i32>, - SDTCisVT<1, i32> ]>; + SDTCisVT<1, i32> ]>; def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_XCoreCallSeqStart, [SDNPHasChain, SDNPOutGlue]>; @@ -323,9 +324,9 @@ class F2R_np opc, string OpcStr> : //===----------------------------------------------------------------------===// let Defs = [SP], Uses = [SP] in { -def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt), - "# ADJCALLSTACKDOWN $amt", - [(callseq_start timm:$amt)]>; +def ADJCALLSTACKDOWN : PseudoInstXCore<(outs), (ins i32imm:$amt, i32imm:$amt2), + "# ADJCALLSTACKDOWN $amt, $amt2", + [(callseq_start timm:$amt, timm:$amt2)]>; def ADJCALLSTACKUP : PseudoInstXCore<(outs), (ins i32imm:$amt1, i32imm:$amt2), "# ADJCALLSTACKUP $amt1", [(callseq_end timm:$amt1, timm:$amt2)]>; diff --git a/lib/ToolDrivers/CMakeLists.txt b/lib/ToolDrivers/CMakeLists.txt new file mode 100644 index 000000000000..ad458450fda3 --- /dev/null +++ b/lib/ToolDrivers/CMakeLists.txt @@ -0,0 +1 @@ +add_subdirectory(llvm-lib) diff --git a/lib/ToolDrivers/LLVMBuild.txt b/lib/ToolDrivers/LLVMBuild.txt new file mode 100644 index 000000000000..7da9a5c01005 --- /dev/null +++ b/lib/ToolDrivers/LLVMBuild.txt @@ -0,0 +1,24 @@ +;===- ./lib/ToolDrivers/LLVMBuild.txt --------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. 
See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[common] +subdirectories = llvm-lib + +[component_0] +type = Group +name = ToolDrivers +parent = Libraries diff --git a/lib/ToolDrivers/llvm-lib/CMakeLists.txt b/lib/ToolDrivers/llvm-lib/CMakeLists.txt new file mode 100644 index 000000000000..ab53a6843446 --- /dev/null +++ b/lib/ToolDrivers/llvm-lib/CMakeLists.txt @@ -0,0 +1,8 @@ +set(LLVM_TARGET_DEFINITIONS Options.td) +tablegen(LLVM Options.inc -gen-opt-parser-defs) +add_public_tablegen_target(LibOptionsTableGen) + +add_llvm_library(LLVMLibDriver + LibDriver.cpp + ) +add_dependencies(LLVMLibDriver LibOptionsTableGen) diff --git a/lib/ToolDrivers/llvm-lib/LLVMBuild.txt b/lib/ToolDrivers/llvm-lib/LLVMBuild.txt new file mode 100644 index 000000000000..799dc997c0bb --- /dev/null +++ b/lib/ToolDrivers/llvm-lib/LLVMBuild.txt @@ -0,0 +1,22 @@ +;===- ./lib/LibDriver/LLVMBuild.txt ----------------------------*- Conf -*--===; +; +; The LLVM Compiler Infrastructure +; +; This file is distributed under the University of Illinois Open Source +; License. See LICENSE.TXT for details. +; +;===------------------------------------------------------------------------===; +; +; This is an LLVMBuild description file for the components in this subdirectory. +; +; For more information on the LLVMBuild system, please see: +; +; http://llvm.org/docs/LLVMBuild.html +; +;===------------------------------------------------------------------------===; + +[component_0] +type = Library +name = LibDriver +parent = Libraries +required_libraries = Object Option Support diff --git a/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/lib/ToolDrivers/llvm-lib/LibDriver.cpp new file mode 100644 index 000000000000..3bae3826d62e --- /dev/null +++ b/lib/ToolDrivers/llvm-lib/LibDriver.cpp @@ -0,0 +1,171 @@ +//===- LibDriver.cpp - lib.exe-compatible driver --------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// Defines an interface to a lib.exe-compatible driver that also understands +// bitcode files. Used by llvm-lib and lld-link /lib. 
+// +//===----------------------------------------------------------------------===// + +#include "llvm/ToolDrivers/llvm-lib/LibDriver.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/Object/ArchiveWriter.h" +#include "llvm/Option/Arg.h" +#include "llvm/Option/ArgList.h" +#include "llvm/Option/Option.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/StringSaver.h" +#include "llvm/Support/Path.h" +#include "llvm/Support/Process.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +namespace { + +enum { + OPT_INVALID = 0, +#define OPTION(_1, _2, ID, _4, _5, _6, _7, _8, _9, _10, _11) OPT_##ID, +#include "Options.inc" +#undef OPTION +}; + +#define PREFIX(NAME, VALUE) const char *const NAME[] = VALUE; +#include "Options.inc" +#undef PREFIX + +static const llvm::opt::OptTable::Info infoTable[] = { +#define OPTION(X1, X2, ID, KIND, GROUP, ALIAS, X6, X7, X8, X9, X10) \ + { \ + X1, X2, X9, X10, OPT_##ID, llvm::opt::Option::KIND##Class, X8, X7, \ + OPT_##GROUP, OPT_##ALIAS, X6 \ + }, +#include "Options.inc" +#undef OPTION +}; + +class LibOptTable : public llvm::opt::OptTable { +public: + LibOptTable() : OptTable(infoTable, true) {} +}; + +} + +static std::string getOutputPath(llvm::opt::InputArgList *Args, + const llvm::NewArchiveMember &FirstMember) { + if (auto *Arg = Args->getLastArg(OPT_out)) + return Arg->getValue(); + SmallString<128> Val = StringRef(FirstMember.Buf->getBufferIdentifier()); + llvm::sys::path::replace_extension(Val, ".lib"); + return Val.str(); +} + +static std::vector getSearchPaths(llvm::opt::InputArgList *Args, + StringSaver &Saver) { + std::vector Ret; + // Add current directory as first item of the search path. + Ret.push_back(""); + + // Add /libpath flags. + for (auto *Arg : Args->filtered(OPT_libpath)) + Ret.push_back(Arg->getValue()); + + // Add $LIB. + Optional EnvOpt = sys::Process::GetEnv("LIB"); + if (!EnvOpt.hasValue()) + return Ret; + StringRef Env = Saver.save(*EnvOpt); + while (!Env.empty()) { + StringRef Path; + std::tie(Path, Env) = Env.split(';'); + Ret.push_back(Path); + } + return Ret; +} + +static Optional findInputFile(StringRef File, + ArrayRef Paths) { + for (auto Dir : Paths) { + SmallString<128> Path = Dir; + sys::path::append(Path, File); + if (sys::fs::exists(Path)) + return Path.str().str(); + } + return Optional(); +} + +int llvm::libDriverMain(llvm::ArrayRef ArgsArr) { + SmallVector NewArgs(ArgsArr.begin(), ArgsArr.end()); + BumpPtrAllocator Alloc; + StringSaver Saver(Alloc); + cl::ExpandResponseFiles(Saver, cl::TokenizeWindowsCommandLine, NewArgs); + ArgsArr = NewArgs; + + LibOptTable Table; + unsigned MissingIndex; + unsigned MissingCount; + llvm::opt::InputArgList Args = + Table.ParseArgs(ArgsArr.slice(1), MissingIndex, MissingCount); + if (MissingCount) { + llvm::errs() << "missing arg value for \"" + << Args.getArgString(MissingIndex) << "\", expected " + << MissingCount + << (MissingCount == 1 ? " argument.\n" : " arguments.\n"); + return 1; + } + for (auto *Arg : Args.filtered(OPT_UNKNOWN)) + llvm::errs() << "ignoring unknown argument: " << Arg->getSpelling() << "\n"; + + if (!Args.hasArgNoClaim(OPT_INPUT)) { + // No input files. To match lib.exe, silently do nothing. 
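An illustrative invocation (hypothetical paths, not from the patch) tying the helpers above together: running llvm-lib /libpath:build\obj /out:mylib.lib a.obj b.obj with LIB=C:\tools\lib set in the environment makes getSearchPaths return "" (the current directory), build\obj, and C:\tools\lib, in that order; findInputFile then resolves each input against those directories. Had /out been omitted, getOutputPath would derive the archive name from the first member, turning a.obj into a.lib.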
+ return 0; + } + + std::vector SearchPaths = getSearchPaths(&Args, Saver); + + std::vector Members; + for (auto *Arg : Args.filtered(OPT_INPUT)) { + Optional Path = findInputFile(Arg->getValue(), SearchPaths); + if (!Path.hasValue()) { + llvm::errs() << Arg->getValue() << ": no such file or directory\n"; + return 1; + } + Expected MOrErr = + NewArchiveMember::getFile(Saver.save(*Path), /*Deterministic=*/true); + if (!MOrErr) { + handleAllErrors(MOrErr.takeError(), [&](const llvm::ErrorInfoBase &EIB) { + llvm::errs() << Arg->getValue() << ": " << EIB.message() << "\n"; + }); + return 1; + } + sys::fs::file_magic Magic = + sys::fs::identify_magic(MOrErr->Buf->getBuffer()); + if (Magic != sys::fs::file_magic::coff_object && + Magic != sys::fs::file_magic::bitcode && + Magic != sys::fs::file_magic::windows_resource) { + llvm::errs() << Arg->getValue() + << ": not a COFF object, bitcode or resource file\n"; + return 1; + } + Members.emplace_back(std::move(*MOrErr)); + } + + std::pair Result = + llvm::writeArchive(getOutputPath(&Args, Members[0]), Members, + /*WriteSymtab=*/true, object::Archive::K_GNU, + /*Deterministic*/ true, Args.hasArg(OPT_llvmlibthin)); + + if (Result.second) { + if (Result.first.empty()) + Result.first = ArgsArr[0]; + llvm::errs() << Result.first << ": " << Result.second.message() << "\n"; + return 1; + } + + return 0; +} diff --git a/lib/ToolDrivers/llvm-lib/Options.td b/lib/ToolDrivers/llvm-lib/Options.td new file mode 100644 index 000000000000..5a56ef7468d4 --- /dev/null +++ b/lib/ToolDrivers/llvm-lib/Options.td @@ -0,0 +1,25 @@ +include "llvm/Option/OptParser.td" + +// lib.exe accepts options starting with either a dash or a slash. + +// Flag that takes no arguments. +class F : Flag<["/", "-", "-?"], name>; + +// Flag that takes one argument after ":". +class P : + Joined<["/", "-", "-?"], name#":">, HelpText; + +def libpath: P<"libpath", "Object file search path">; +def out : P<"out", "Path to file to write output">; + +def llvmlibthin : F<"llvmlibthin">; + +//============================================================================== +// The flags below do nothing. They are defined only for lib.exe compatibility. +//============================================================================== + +class QF : Joined<["/", "-", "-?"], name#":">; + +def ignore : QF<"ignore">; +def machine: QF<"machine">; +def nologo : F<"nologo">; diff --git a/lib/Transforms/Coroutines/CoroFrame.cpp b/lib/Transforms/Coroutines/CoroFrame.cpp index 19e6789dfa74..4480220f2cd4 100644 --- a/lib/Transforms/Coroutines/CoroFrame.cpp +++ b/lib/Transforms/Coroutines/CoroFrame.cpp @@ -177,7 +177,7 @@ SuspendCrossingInfo::SuspendCrossingInfo(Function &F, coro::Shape &Shape) // consume. Note, that crossing coro.save also requires a spill, as any code // between coro.save and coro.suspend may resume the coroutine and all of the // state needs to be saved by that time. - auto markSuspendBlock = [&](IntrinsicInst* BarrierInst) { + auto markSuspendBlock = [&](IntrinsicInst *BarrierInst) { BasicBlock *SuspendBlock = BarrierInst->getParent(); auto &B = getBlockData(SuspendBlock); B.Suspend = true; @@ -495,6 +495,78 @@ static Instruction *insertSpills(SpillInfo &Spills, coro::Shape &Shape) { return FramePtr; } +// Sets the unwind edge of an instruction to a particular successor. 
+static void setUnwindEdgeTo(TerminatorInst *TI, BasicBlock *Succ) { + if (auto *II = dyn_cast(TI)) + II->setUnwindDest(Succ); + else if (auto *CS = dyn_cast(TI)) + CS->setUnwindDest(Succ); + else if (auto *CR = dyn_cast(TI)) + CR->setUnwindDest(Succ); + else + llvm_unreachable("unexpected terminator instruction"); +} + +// Replaces all uses of OldPred with the NewPred block in all PHINodes in a +// block. +static void updatePhiNodes(BasicBlock *DestBB, BasicBlock *OldPred, + BasicBlock *NewPred, + PHINode *LandingPadReplacement) { + unsigned BBIdx = 0; + for (BasicBlock::iterator I = DestBB->begin(); isa(I); ++I) { + PHINode *PN = cast(I); + + // We manually update the LandingPadReplacement PHINode and it is the last + // PHI Node. So, if we find it, we are done. + if (LandingPadReplacement == PN) + break; + + // Reuse the previous value of BBIdx if it lines up. In cases where we + // have multiple phi nodes with *lots* of predecessors, this is a speed + // win because we don't have to scan the PHI looking for TIBB. This + // happens because the BB list of PHI nodes are usually in the same + // order. + if (PN->getIncomingBlock(BBIdx) != OldPred) + BBIdx = PN->getBasicBlockIndex(OldPred); + + assert(BBIdx != (unsigned)-1 && "Invalid PHI Index!"); + PN->setIncomingBlock(BBIdx, NewPred); + } +} + +// Uses SplitEdge unless the successor block is an EHPad, in which case do EH +// specific handling. +static BasicBlock *ehAwareSplitEdge(BasicBlock *BB, BasicBlock *Succ, + LandingPadInst *OriginalPad, + PHINode *LandingPadReplacement) { + auto *PadInst = Succ->getFirstNonPHI(); + if (!LandingPadReplacement && !PadInst->isEHPad()) + return SplitEdge(BB, Succ); + + auto *NewBB = BasicBlock::Create(BB->getContext(), "", BB->getParent(), Succ); + setUnwindEdgeTo(BB->getTerminator(), NewBB); + updatePhiNodes(Succ, BB, NewBB, LandingPadReplacement); + + if (LandingPadReplacement) { + auto *NewLP = OriginalPad->clone(); + auto *Terminator = BranchInst::Create(Succ, NewBB); + NewLP->insertBefore(Terminator); + LandingPadReplacement->addIncoming(NewLP, NewBB); + return NewBB; + } + Value *ParentPad = nullptr; + if (auto *FuncletPad = dyn_cast(PadInst)) + ParentPad = FuncletPad->getParentPad(); + else if (auto *CatchSwitch = dyn_cast(PadInst)) + ParentPad = CatchSwitch->getParentPad(); + else + llvm_unreachable("handling for other EHPads not implemented yet"); + + auto *NewCleanupPad = CleanupPadInst::Create(ParentPad, {}, "", NewBB); + CleanupReturnInst::Create(NewCleanupPad, Succ, NewBB); + return NewBB; +} + static void rewritePHIs(BasicBlock &BB) { // For every incoming edge we will create a block holding all // incoming values in a single PHI nodes. @@ -502,7 +574,7 @@ static void rewritePHIs(BasicBlock &BB) { // loop: // %n.val = phi i32[%n, %entry], [%inc, %loop] // - // It will create: + // It will create: // // loop.from.entry: // %n.loop.pre = phi i32 [%n, %entry] @@ -517,9 +589,22 @@ static void rewritePHIs(BasicBlock &BB) { // TODO: Simplify PHINodes in the basic block to remove duplicate // predecessors. + LandingPadInst *LandingPad = nullptr; + PHINode *ReplPHI = nullptr; + if ((LandingPad = dyn_cast_or_null(BB.getFirstNonPHI()))) { + // ehAwareSplitEdge will clone the LandingPad in all the edge blocks. + // We replace the original landing pad with a PHINode that will collect the + // results from all of them. 
+    ReplPHI = PHINode::Create(LandingPad->getType(), 1, "", LandingPad);
+    ReplPHI->takeName(LandingPad);
+    LandingPad->replaceAllUsesWith(ReplPHI);
+    // We will erase the original landing pad at the end of this function
+    // after ehAwareSplitEdge cloned it in the transition blocks.
+  }
+
   SmallVector<BasicBlock *, 8> Preds(pred_begin(&BB), pred_end(&BB));
   for (BasicBlock *Pred : Preds) {
-    auto *IncomingBB = SplitEdge(Pred, &BB);
+    auto *IncomingBB = ehAwareSplitEdge(Pred, &BB, LandingPad, ReplPHI);
     IncomingBB->setName(BB.getName() + Twine(".from.") + Pred->getName());
     auto *PN = cast<PHINode>(&BB.front());
     do {
@@ -531,7 +616,14 @@ static void rewritePHIs(BasicBlock &BB) {
       InputV->addIncoming(V, Pred);
       PN->setIncomingValue(Index, InputV);
       PN = dyn_cast<PHINode>(PN->getNextNode());
-    } while (PN);
+    } while (PN != ReplPHI); // ReplPHI is either null or the PHI that replaced
+                             // the landing pad.
+  }
+
+  if (LandingPad) {
+    // Calls to ehAwareSplitEdge cloned the original landing pad; it is no
+    // longer needed.
+    LandingPad->eraseFromParent();
   }
 }
 
diff --git a/lib/Transforms/IPO/FunctionImport.cpp b/lib/Transforms/IPO/FunctionImport.cpp
index 7ed07d63c627..231487923fad 100644
--- a/lib/Transforms/IPO/FunctionImport.cpp
+++ b/lib/Transforms/IPO/FunctionImport.cpp
@@ -610,8 +610,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
       return true;
 
     // Lookup the linkage recorded in the summaries during global analysis.
-    const auto &GS = DefinedGlobals.find(GV.getGUID());
-    GlobalValue::LinkageTypes Linkage;
+    auto GS = DefinedGlobals.find(GV.getGUID());
     if (GS == DefinedGlobals.end()) {
       // Must have been promoted (possibly conservatively). Find original
       // name so that we can access the correct summary and see if it can
@@ -623,7 +622,7 @@ void llvm::thinLTOInternalizeModule(Module &TheModule,
       std::string OrigId = GlobalValue::getGlobalIdentifier(
           OrigName, GlobalValue::InternalLinkage,
           TheModule.getSourceFileName());
-      const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
+      GS = DefinedGlobals.find(GlobalValue::getGUID(OrigId));
       if (GS == DefinedGlobals.end()) {
         // Also check the original non-promoted non-globalized name. In some
         // cases a preempted weak value is linked in as a local copy because
         // In that case, since it was originally not a local value, it was
         // recorded in the index using the original name.
         // FIXME: This may not be needed once PR27866 is fixed.
-        const auto &GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
+        GS = DefinedGlobals.find(GlobalValue::getGUID(OrigName));
         assert(GS != DefinedGlobals.end());
-        Linkage = GS->second->linkage();
-      } else {
-        Linkage = GS->second->linkage();
       }
-    } else
-      Linkage = GS->second->linkage();
-    return !GlobalValue::isLocalLinkage(Linkage);
+    }
+    return !GlobalValue::isLocalLinkage(GS->second->linkage());
   };
 
   // FIXME: See if we can just internalize directly here via linkage changes
diff --git a/lib/Transforms/IPO/Inliner.cpp b/lib/Transforms/IPO/Inliner.cpp
index 6c83c99ae3be..673d3af0ab52 100644
--- a/lib/Transforms/IPO/Inliner.cpp
+++ b/lib/Transforms/IPO/Inliner.cpp
@@ -502,7 +502,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
     std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
 
   InlinedArrayAllocasTy InlinedArrayAllocas;
-  InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache);
+  InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache, PSI);
 
   // Now that we have all of the call sites, loop over them and inline them if
   // it looks profitable to do so.
@@ -872,7 +872,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
       // Setup the data structure used to plumb customization into the
       // `InlineFunction` routine.
       InlineFunctionInfo IFI(
-          /*cg=*/nullptr, &GetAssumptionCache,
+          /*cg=*/nullptr, &GetAssumptionCache, PSI,
           &FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),
           &FAM.getResult<BlockFrequencyAnalysis>(Callee));
 
diff --git a/lib/Transforms/IPO/PartialInlining.cpp b/lib/Transforms/IPO/PartialInlining.cpp
index 2db47b3b5622..8dff2fb3be8a 100644
--- a/lib/Transforms/IPO/PartialInlining.cpp
+++ b/lib/Transforms/IPO/PartialInlining.cpp
@@ -16,6 +16,7 @@
 #include "llvm/ADT/Statistic.h"
 #include "llvm/Analysis/BlockFrequencyInfo.h"
 #include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/CodeMetrics.h"
 #include "llvm/Analysis/InlineCost.h"
 #include "llvm/Analysis/LoopInfo.h"
 #include "llvm/Analysis/OptimizationDiagnosticInfo.h"
@@ -42,6 +43,11 @@ STATISTIC(NumPartialInlined,
 static cl::opt<bool> DisablePartialInlining("disable-partial-inlining",
                                             cl::init(false), cl::Hidden,
                                             cl::desc("Disable partial inlining"));
+// This is an option used by testing:
+static cl::opt<bool> SkipCostAnalysis("skip-partial-inlining-cost-analysis",
+                                      cl::init(false), cl::ZeroOrMore,
+                                      cl::ReallyHidden,
+                                      cl::desc("Skip Cost Analysis"));
 
 static cl::opt<unsigned> MaxNumInlineBlocks(
     "max-num-inline-blocks", cl::init(5), cl::Hidden,
@@ -53,6 +59,15 @@ static cl::opt<int> MaxNumPartialInlining(
     "max-partial-inlining", cl::init(-1), cl::Hidden, cl::ZeroOrMore,
     cl::desc("Max number of partial inlining. The default is unlimited"));
 
+// Used only when PGO or user annotated branch data is absent. It is
+// the least value that is used to weigh the outline region. If BFI
+// produces a larger value, the BFI value will be used.
+static cl::opt<unsigned>
+    OutlineRegionFreqPercent("outline-region-freq-percent", cl::init(75),
+                             cl::Hidden, cl::ZeroOrMore,
+                             cl::desc("Relative frequency of outline region to "
+                                      "the entry block"));
+
 namespace {
 
 struct FunctionOutliningInfo {
@@ -84,8 +99,6 @@ struct PartialInlinerImpl {
   bool run(Module &M);
   Function *unswitchFunction(Function *F);
 
-  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
-
 private:
   int NumPartialInlining = 0;
   std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
@@ -93,11 +106,84 @@ private:
   Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
   ProfileSummaryInfo *PSI;
 
-  bool shouldPartialInline(CallSite CS, OptimizationRemarkEmitter &ORE);
+  // Return the frequency of the OutliningBB relative to F's entry point.
+  // The result is no larger than 1 and is represented using BP.
+  // (Note that the outlined region's 'head' block can only have incoming
+  // edges from the guarding entry blocks).
+  BranchProbability getOutliningCallBBRelativeFreq(Function *F,
+                                                   FunctionOutliningInfo *OI,
+                                                   Function *DuplicateFunction,
+                                                   BlockFrequencyInfo *BFI,
+                                                   BasicBlock *OutliningCallBB);
+
+  // Return true if the callee of CS should be partially inlined with
+  // profit.
+  bool shouldPartialInline(CallSite CS, Function *F, FunctionOutliningInfo *OI,
+                           BlockFrequencyInfo *CalleeBFI,
+                           BasicBlock *OutliningCallBB,
+                           int OutliningCallOverhead,
+                           OptimizationRemarkEmitter &ORE);
+
+  // Try to inline DuplicateFunction (cloned from F, with a call to
+  // the OutlinedFunction) into its callers. Return true
+  // if there is any successful inlining.
+  bool tryPartialInline(Function *DuplicateFunction,
+                        Function *F, /*original function*/
+                        FunctionOutliningInfo *OI, Function *OutlinedFunction,
+                        BlockFrequencyInfo *CalleeBFI);
+
+  // Compute the mapping from use site of DuplicateFunction to the enclosing
+  // BB's profile count.
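As far as the surrounding hunks show, the map declared just above exists so that after a callee is partially inlined into some of its call sites, the profile counts of the blocks enclosing the remaining call sites are known and the callee's entry count can be adjusted accordingly: each surviving use of the duplicate function is keyed to the profile count of the block containing it.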
+  void computeCallsiteToProfCountMap(Function *DuplicateFunction,
+                                     DenseMap<User *, uint64_t> &SiteCountMap);
+
   bool IsLimitReached() {
     return (MaxNumPartialInlining != -1 &&
             NumPartialInlining >= MaxNumPartialInlining);
   }
+
+  CallSite getCallSite(User *U) {
+    CallSite CS;
+    if (CallInst *CI = dyn_cast<CallInst>(U))
+      CS = CallSite(CI);
+    else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
+      CS = CallSite(II);
+    else
+      llvm_unreachable("All uses must be calls");
+    return CS;
+  }
+
+  CallSite getOneCallSiteTo(Function *F) {
+    User *User = *F->user_begin();
+    return getCallSite(User);
+  }
+
+  std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
+    CallSite CS = getOneCallSiteTo(F);
+    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
+    BasicBlock *Block = CS.getParent();
+    return std::make_tuple(DLoc, Block);
+  }
+
+  // Returns the costs associated with function outlining:
+  // - The first value is the non-weighted runtime cost for making the call
+  //   to the outlined function 'OutlinedFunction', including the additional
+  //   setup cost in the outlined function itself;
+  // - The second value is the estimated size of the new call sequence in
+  //   basic block 'OutliningCallBB';
+  // - The third value is the estimated size of the original code from
+  //   function 'F' that is extracted into the outlined function.
+  std::tuple<int, int, int>
+  computeOutliningCosts(Function *F, const FunctionOutliningInfo *OutliningInfo,
+                        Function *OutlinedFunction,
+                        BasicBlock *OutliningCallBB);
+  // Compute the 'InlineCost' of block BB. InlineCost is a proxy used to
+  // approximate both the size and runtime cost (Note that in the current
+  // inline cost analysis, there is no clear distinction there either).
+  int computeBBInlineCost(BasicBlock *BB);
+
+  std::unique_ptr<FunctionOutliningInfo> computeOutliningInfo(Function *F);
+
 };
 
 struct PartialInlinerLegacyPass : public ModulePass {
@@ -157,7 +243,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
     return isa<ReturnInst>(TI);
   };
 
-  auto GetReturnBlock = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+  auto GetReturnBlock = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
     if (IsReturnBlock(Succ1))
       return std::make_tuple(Succ1, Succ2);
     if (IsReturnBlock(Succ2))
@@ -167,7 +253,7 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
   };
 
   // Detect a triangular shape:
-  auto GetCommonSucc = [=](BasicBlock *Succ1, BasicBlock *Succ2) {
+  auto GetCommonSucc = [&](BasicBlock *Succ1, BasicBlock *Succ2) {
     if (IsSuccessor(Succ1, Succ2))
       return std::make_tuple(Succ1, Succ2);
     if (IsSuccessor(Succ2, Succ1))
@@ -223,7 +309,8 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
   // Do sanity check of the entries: there should not
   // be any successors (not in the entry set) other than
   // {ReturnBlock, NonReturnBlock}
-  assert(OutliningInfo->Entries[0] == &F->front());
+  assert(OutliningInfo->Entries[0] == &F->front() &&
+         "Function Entry must be the first in Entries vector");
   DenseSet<BasicBlock *> Entries;
   for (BasicBlock *E : OutliningInfo->Entries)
     Entries.insert(E);
@@ -289,10 +376,54 @@ PartialInlinerImpl::computeOutliningInfo(Function *F) {
   return OutliningInfo;
 }
 
-bool PartialInlinerImpl::shouldPartialInline(CallSite CS,
-                                             OptimizationRemarkEmitter &ORE) {
-  // TODO : more sharing with shouldInline in Inliner.cpp
+// Check if there is PGO data or user annotated branch data:
+static bool hasProfileData(Function *F, FunctionOutliningInfo *OI) {
+  if (F->getEntryCount())
+    return true;
+  // Now check if any of the entry blocks has MD_prof data:
+  for (auto *E : OI->Entries) {
+    BranchInst *BR = dyn_cast<BranchInst>(E->getTerminator());
+    if (!BR || BR->isUnconditional())
+      continue;
+
uint64_t T, F; + if (BR->extractProfMetadata(T, F)) + return true; + } + return false; +} + +BranchProbability PartialInlinerImpl::getOutliningCallBBRelativeFreq( + Function *F, FunctionOutliningInfo *OI, Function *DuplicateFunction, + BlockFrequencyInfo *BFI, BasicBlock *OutliningCallBB) { + + auto EntryFreq = + BFI->getBlockFreq(&DuplicateFunction->getEntryBlock()); + auto OutliningCallFreq = BFI->getBlockFreq(OutliningCallBB); + + auto OutlineRegionRelFreq = + BranchProbability::getBranchProbability(OutliningCallFreq.getFrequency(), + EntryFreq.getFrequency()); + + if (hasProfileData(F, OI)) + return OutlineRegionRelFreq; + + // When profile data is not available, we need to be very + // conservative in estimating the overall savings. We need to make sure + // the outline region relative frequency is not below the threshold + // specified by the option. + OutlineRegionRelFreq = std::max(OutlineRegionRelFreq, BranchProbability(OutlineRegionFreqPercent, 100)); + + return OutlineRegionRelFreq; +} + +bool PartialInlinerImpl::shouldPartialInline( + CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI, + BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB, + int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) { using namespace ore; + if (SkipCostAnalysis) + return true; + Instruction *Call = CS.getInstruction(); Function *Callee = CS.getCalledFunction(); Function *Caller = CS.getCaller(); @@ -302,36 +433,170 @@ bool PartialInlinerImpl::shouldPartialInline(CallSite CS, if (IC.isAlways()) { ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call) - << NV("Callee", Callee) + << NV("Callee", F) << " should always be fully inlined, not partially"); return false; } if (IC.isNever()) { ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call) - << NV("Callee", Callee) << " not partially inlined into " + << NV("Callee", F) << " not partially inlined into " << NV("Caller", Caller) << " because it should never be inlined (cost=never)"); return false; } if (!IC) { - ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call) - << NV("Callee", Callee) << " not partially inlined into " + ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call) + << NV("Callee", F) << " not partially inlined into " << NV("Caller", Caller) << " because too costly to inline (cost=" << NV("Cost", IC.getCost()) << ", threshold=" << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")"); return false; } + const DataLayout &DL = Caller->getParent()->getDataLayout(); + // The savings of eliminating the call: + int NonWeightedSavings = getCallsiteCost(CS, DL); + BlockFrequency NormWeightedSavings(NonWeightedSavings); + + auto RelativeFreq = + getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB); + auto NormWeightedRcost = + BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq; + + // Weighted saving is smaller than weighted cost, return false + if (NormWeightedSavings < NormWeightedRcost) { + ORE.emit( + OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call) + << NV("Callee", F) << " not partially inlined into " + << NV("Caller", Caller) << " runtime overhead (overhead=" + << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency()) + << ", savings=" + << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")" + << " of making the outlined call is too high"); + + return false; + } ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call) - << NV("Callee", Callee) << " can be 
+bool PartialInlinerImpl::shouldPartialInline(
+    CallSite CS, Function *F /* Original Callee */, FunctionOutliningInfo *OI,
+    BlockFrequencyInfo *CalleeBFI, BasicBlock *OutliningCallBB,
+    int NonWeightedOutliningRcost, OptimizationRemarkEmitter &ORE) {
   using namespace ore;
+  if (SkipCostAnalysis)
+    return true;
+
   Instruction *Call = CS.getInstruction();
   Function *Callee = CS.getCalledFunction();
   Function *Caller = CS.getCaller();
@@ -302,36 +433,170 @@ bool PartialInlinerImpl::shouldPartialInline(CallSite CS,

   if (IC.isAlways()) {
     ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
-             << NV("Callee", Callee)
+             << NV("Callee", F)
              << " should always be fully inlined, not partially");
     return false;
   }

   if (IC.isNever()) {
     ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
-             << NV("Callee", Callee) << " not partially inlined into "
+             << NV("Callee", F) << " not partially inlined into "
              << NV("Caller", Caller)
              << " because it should never be inlined (cost=never)");
     return false;
   }

   if (!IC) {
-    ORE.emit(OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
-             << NV("Callee", Callee) << " not partially inlined into "
+    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+             << NV("Callee", F) << " not partially inlined into "
              << NV("Caller", Caller) << " because too costly to inline (cost="
              << NV("Cost", IC.getCost()) << ", threshold="
              << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
     return false;
   }
+  const DataLayout &DL = Caller->getParent()->getDataLayout();
+  // The savings of eliminating the call:
+  int NonWeightedSavings = getCallsiteCost(CS, DL);
+  BlockFrequency NormWeightedSavings(NonWeightedSavings);
+
+  auto RelativeFreq =
+      getOutliningCallBBRelativeFreq(F, OI, Callee, CalleeBFI, OutliningCallBB);
+  auto NormWeightedRcost =
+      BlockFrequency(NonWeightedOutliningRcost) * RelativeFreq;
+
+  // If the weighted savings are smaller than the weighted cost, return false:
+  if (NormWeightedSavings < NormWeightedRcost) {
+    ORE.emit(
+        OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh", Call)
+        << NV("Callee", F) << " not partially inlined into "
+        << NV("Caller", Caller) << " because the runtime overhead (overhead="
+        << NV("Overhead", (unsigned)NormWeightedRcost.getFrequency())
+        << ", savings="
+        << NV("Savings", (unsigned)NormWeightedSavings.getFrequency()) << ")"
+        << " of making the outlined call is too high");
+
+    return false;
+  }

   ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
-           << NV("Callee", Callee) << " can be partially inlined into "
+           << NV("Callee", F) << " can be partially inlined into "
            << NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
            << " (threshold="
            << NV("Threshold", IC.getCostDelta() + IC.getCost()) << ")");
   return true;
 }

+// TODO: Ideally we should share Inliner's InlineCost Analysis code.
+// For now use a simplified version. The returned 'InlineCost' will be used
+// to estimate the size cost as well as the runtime cost of the BB.
+int PartialInlinerImpl::computeBBInlineCost(BasicBlock *BB) {
+  int InlineCost = 0;
+  const DataLayout &DL = BB->getParent()->getParent()->getDataLayout();
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+    if (isa<DbgInfoIntrinsic>(I))
+      continue;
+
+    if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      InlineCost += getCallsiteCost(CallSite(CI), DL);
+      continue;
+    }
+
+    if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+      InlineCost += getCallsiteCost(CallSite(II), DL);
+      continue;
+    }
+
+    if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+      InlineCost += (SI->getNumCases() + 1) * InlineConstants::InstrCost;
+      continue;
+    }
+    InlineCost += InlineConstants::InstrCost;
+  }
+  return InlineCost;
+}
+
+std::tuple<int, int, int> PartialInlinerImpl::computeOutliningCosts(
+    Function *F, const FunctionOutliningInfo *OI, Function *OutlinedFunction,
+    BasicBlock *OutliningCallBB) {
+  // First compute the cost of the outlined region 'OI' in the original
+  // function 'F':
+  int OutlinedRegionCost = 0;
+  for (BasicBlock &BB : *F) {
+    if (&BB != OI->ReturnBlock &&
+        // Assuming Entry set is small -- do a linear search here:
+        std::find(OI->Entries.begin(), OI->Entries.end(), &BB) ==
+            OI->Entries.end()) {
+      OutlinedRegionCost += computeBBInlineCost(&BB);
+    }
+  }
+
+  // Now compute the cost of the call sequence to the outlined function
+  // 'OutlinedFunction' in BB 'OutliningCallBB':
+  int OutliningFuncCallCost = computeBBInlineCost(OutliningCallBB);
+
+  // Now compute the cost of the extracted/outlined function itself:
+  int OutlinedFunctionCost = 0;
+  for (BasicBlock &BB : *OutlinedFunction) {
+    OutlinedFunctionCost += computeBBInlineCost(&BB);
+  }
+
+  assert(OutlinedFunctionCost >= OutlinedRegionCost &&
+         "Outlined function cost should be no less than the outlined region");
+  int OutliningRuntimeOverhead =
+      OutliningFuncCallCost + (OutlinedFunctionCost - OutlinedRegionCost);
+
+  return std::make_tuple(OutliningFuncCallCost, OutliningRuntimeOverhead,
+                         OutlinedRegionCost);
+}
+
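As a worked instance of the tuple returned above (numbers invented for illustration): if the extracted region cost 40 in 'F', the outlined function costs 55 (the region plus 15 of argument and return plumbing added by extraction), and the new call sequence costs 10, then the runtime overhead is 10 + (55 - 40) = 25, weighed later against a size saving of 40 versus 10. The formula in isolation:

    // Sketch of the overhead computed by computeOutliningCosts(): the new
    // call sequence plus whatever the code extractor added on top of the
    // original region.
    int outliningRuntimeOverhead(int callSeqCost, int outlinedFnCost,
                                 int regionCost) {
      return callSeqCost + (outlinedFnCost - regionCost);
    }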
+// Create the callsite to profile count map, which is used to update the
+// original function's entry count after the function is partially inlined
+// into the callsite.
+void PartialInlinerImpl::computeCallsiteToProfCountMap(
+    Function *DuplicateFunction,
+    DenseMap<User *, uint64_t> &CallSiteToProfCountMap) {
+  std::vector<User *> Users(DuplicateFunction->user_begin(),
+                            DuplicateFunction->user_end());
+  Function *CurrentCaller = nullptr;
+  BlockFrequencyInfo *CurrentCallerBFI = nullptr;
+
+  auto ComputeCurrBFI = [&,this](Function *Caller) {
+    // For the old pass manager:
+    if (!GetBFI) {
+      if (CurrentCallerBFI)
+        delete CurrentCallerBFI;
+      DominatorTree DT(*Caller);
+      LoopInfo LI(DT);
+      BranchProbabilityInfo BPI(*Caller, LI);
+      CurrentCallerBFI = new BlockFrequencyInfo(*Caller, BPI, LI);
+    } else {
+      // New pass manager:
+      CurrentCallerBFI = &(*GetBFI)(*Caller);
+    }
+  };
+
+  for (User *User : Users) {
+    CallSite CS = getCallSite(User);
+    Function *Caller = CS.getCaller();
+    if (CurrentCaller != Caller) {
+      CurrentCaller = Caller;
+      ComputeCurrBFI(Caller);
+    } else {
+      assert(CurrentCallerBFI && "CallerBFI is not set");
+    }
+    BasicBlock *CallBB = CS.getInstruction()->getParent();
+    auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
+    if (Count)
+      CallSiteToProfCountMap[User] = *Count;
+    else
+      CallSiteToProfCountMap[User] = 0;
+  }
+  if (!GetBFI) {
+    if (CurrentCallerBFI)
+      delete CurrentCallerBFI;
+  }
+}
+
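The ComputeCurrBFI lambda above highlights a pass-manager split: under the legacy pass manager the function builds a fresh BlockFrequencyInfo per caller and owns its lifetime, while under the new pass manager it borrows a cached analysis result. The per-call-site lookup itself reduces to this (a sketch using the LLVM API named in the patch; treat the helper name as mine):

    #include "llvm/Analysis/BlockFrequencyInfo.h"
    #include "llvm/IR/Instruction.h"

    // Sketch: the count cached for each call site is just the profile count
    // of the block containing the call, or 0 when unavailable.
    uint64_t callSiteProfCount(llvm::BlockFrequencyInfo &BFI,
                               llvm::Instruction *Call) {
      auto Count = BFI.getBlockProfileCount(Call->getParent());
      return Count ? *Count : 0;
    }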
 Function *PartialInlinerImpl::unswitchFunction(Function *F) {

   if (F->hasAddressTaken())
@@ -347,21 +612,21 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
   if (PSI->isFunctionEntryCold(F))
     return nullptr;

-  std::unique_ptr<FunctionOutliningInfo> OutliningInfo =
-      computeOutliningInfo(F);
+  if (F->user_begin() == F->user_end())
+    return nullptr;
+
+  std::unique_ptr<FunctionOutliningInfo> OI = computeOutliningInfo(F);

-  if (!OutliningInfo)
+  if (!OI)
     return nullptr;

   // Clone the function, so that we can hack away on it.
   ValueToValueMapTy VMap;
   Function *DuplicateFunction = CloneFunction(F, VMap);
-  BasicBlock *NewReturnBlock =
-      cast<BasicBlock>(VMap[OutliningInfo->ReturnBlock]);
-  BasicBlock *NewNonReturnBlock =
-      cast<BasicBlock>(VMap[OutliningInfo->NonReturnBlock]);
+  BasicBlock *NewReturnBlock = cast<BasicBlock>(VMap[OI->ReturnBlock]);
+  BasicBlock *NewNonReturnBlock = cast<BasicBlock>(VMap[OI->NonReturnBlock]);
   DenseSet<BasicBlock *> NewEntries;
-  for (BasicBlock *BB : OutliningInfo->Entries) {
+  for (BasicBlock *BB : OI->Entries) {
     NewEntries.insert(cast<BasicBlock>(VMap[BB]));
   }

@@ -390,7 +655,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
   BasicBlock *PreReturn = NewReturnBlock;
   // only split block when necessary:
   PHINode *FirstPhi = getFirstPHI(PreReturn);
-  unsigned NumPredsFromEntries = OutliningInfo->ReturnBlockPreds.size();
+  unsigned NumPredsFromEntries = OI->ReturnBlockPreds.size();

   if (FirstPhi && FirstPhi->getNumIncomingValues() > NumPredsFromEntries + 1) {
     NewReturnBlock = NewReturnBlock->splitBasicBlock(
@@ -408,14 +673,14 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
       Ins = NewReturnBlock->getFirstNonPHI();
       RetPhi->addIncoming(&*I, PreReturn);

-      for (BasicBlock *E : OutliningInfo->ReturnBlockPreds) {
+      for (BasicBlock *E : OI->ReturnBlockPreds) {
         BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
         RetPhi->addIncoming(OldPhi->getIncomingValueForBlock(NewE), NewE);
         OldPhi->removeIncomingValue(NewE);
       }
       ++I;
     }
-    for (auto E : OutliningInfo->ReturnBlockPreds) {
+    for (auto E : OI->ReturnBlockPreds) {
       BasicBlock *NewE = cast<BasicBlock>(VMap[E]);
       NewE->getTerminator()->replaceUsesOfWith(PreReturn, NewReturnBlock);
     }
@@ -423,7 +688,7 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {

   // Returns true if the block is to be partially inlined into the caller
   // (i.e. not to be extracted to the out-of-line function)
-  auto ToBeInlined = [=](BasicBlock *BB) {
+  auto ToBeInlined = [&](BasicBlock *BB) {
     return BB == NewReturnBlock || NewEntries.count(BB);
   };
   // Gather up the blocks that we're going to extract.
@@ -443,50 +708,113 @@ Function *PartialInlinerImpl::unswitchFunction(Function *F) {
   BlockFrequencyInfo BFI(*DuplicateFunction, BPI, LI);

   // Extract the body of the if.
-  Function *ExtractedFunction =
+  Function *OutlinedFunction =
       CodeExtractor(ToExtract, &DT, /*AggregateArgs*/ false, &BFI, &BPI)
           .extractCodeRegion();

-  // Inline the top-level if test into all callers.
+  bool AnyInline =
+      tryPartialInline(DuplicateFunction, F, OI.get(), OutlinedFunction, &BFI);
+
+  // Ditch the duplicate, since we're done with it, and rewrite all remaining
+  // users (function pointers, etc.) back to the original function.
+  DuplicateFunction->replaceAllUsesWith(F);
+  DuplicateFunction->eraseFromParent();
+
+  if (AnyInline)
+    return OutlinedFunction;
+
+  // Remove the function that was speculatively created:
+  if (OutlinedFunction)
+    OutlinedFunction->eraseFromParent();
+
+  return nullptr;
+}
+
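tryPartialInline below consumes the call-site count map: each time a call site is inlined, its profile count is subtracted from the callee's remaining entry count, saturating at zero. In isolation the bookkeeping is (my own sketch, not patch code):

    #include <algorithm>
    #include <cstdint>
    #include <vector>

    // Sketch: remaining entry count of the original callee after partially
    // inlining into call sites with the given profile counts.
    uint64_t remainingEntryCount(uint64_t calleeCount,
                                 const std::vector<uint64_t> &inlinedSites) {
      for (uint64_t SiteCount : inlinedSites)
        calleeCount -= std::min(calleeCount, SiteCount);
      return calleeCount;
    }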
+bool PartialInlinerImpl::tryPartialInline(Function *DuplicateFunction,
+                                          Function *F,
+                                          FunctionOutliningInfo *OI,
+                                          Function *OutlinedFunction,
+                                          BlockFrequencyInfo *CalleeBFI) {
+  if (OutlinedFunction == nullptr)
+    return false;
+
+  int NonWeightedRcost;
+  int SizeCost;
+  int OutlinedRegionSizeCost;
+
+  auto OutliningCallBB =
+      getOneCallSiteTo(OutlinedFunction).getInstruction()->getParent();
+
+  std::tie(SizeCost, NonWeightedRcost, OutlinedRegionSizeCost) =
+      computeOutliningCosts(F, OI, OutlinedFunction, OutliningCallBB);
+
+  // If the call sequence to the outlined function is larger than the
+  // original outlined region, outlining does not increase the chances of
+  // inlining 'F' (the inliner uses the size increase to model the cost of
+  // inlining a callee).
+  if (!SkipCostAnalysis && OutlinedRegionSizeCost < SizeCost) {
+    OptimizationRemarkEmitter ORE(F);
+    DebugLoc DLoc;
+    BasicBlock *Block;
+    std::tie(DLoc, Block) = getOneDebugLoc(DuplicateFunction);
+    ORE.emit(OptimizationRemarkAnalysis(DEBUG_TYPE, "OutlineRegionTooSmall",
+                                        DLoc, Block)
+             << ore::NV("Function", F)
+             << " not partially inlined into callers (Original Size = "
+             << ore::NV("OutlinedRegionOriginalSize", OutlinedRegionSizeCost)
+             << ", Size of call sequence to outlined function = "
+             << ore::NV("NewSize", SizeCost) << ")");
+    return false;
+  }
+
+  assert(F->user_begin() == F->user_end() &&
+         "F's users should all be replaced!");
   std::vector<User *> Users(DuplicateFunction->user_begin(),
                             DuplicateFunction->user_end());

+  DenseMap<User *, uint64_t> CallSiteToProfCountMap;
+  if (F->getEntryCount())
+    computeCallsiteToProfCountMap(DuplicateFunction, CallSiteToProfCountMap);
+
+  auto CalleeEntryCount = F->getEntryCount();
+  uint64_t CalleeEntryCountV = (CalleeEntryCount ? *CalleeEntryCount : 0);
+  bool AnyInline = false;
   for (User *User : Users) {
-    CallSite CS;
-    if (CallInst *CI = dyn_cast<CallInst>(User))
-      CS = CallSite(CI);
-    else if (InvokeInst *II = dyn_cast<InvokeInst>(User))
-      CS = CallSite(II);
-    else
-      llvm_unreachable("All uses must be calls");
+    CallSite CS = getCallSite(User);

     if (IsLimitReached())
       continue;

     OptimizationRemarkEmitter ORE(CS.getCaller());
-    if (!shouldPartialInline(CS, ORE))
+
+    if (!shouldPartialInline(CS, F, OI, CalleeBFI, OutliningCallBB,
+                             NonWeightedRcost, ORE))
       continue;

-    DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
-    BasicBlock *Block = CS.getParent();
-    ORE.emit(OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", DLoc, Block)
-             << ore::NV("Callee", F) << " partially inlined into "
-             << ore::NV("Caller", CS.getCaller()));
+    ORE.emit(
+        OptimizationRemark(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction())
+        << ore::NV("Callee", F) << " partially inlined into "
+        << ore::NV("Caller", CS.getCaller()));

-    InlineFunctionInfo IFI(nullptr, GetAssumptionCache);
+    InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
     InlineFunction(CS, IFI);
+
+    // Now update the entry count:
+    if (CalleeEntryCountV && CallSiteToProfCountMap.count(User)) {
+      uint64_t CallSiteCount = CallSiteToProfCountMap[User];
+      CalleeEntryCountV -= std::min(CalleeEntryCountV, CallSiteCount);
+    }
+
+    AnyInline = true;
     NumPartialInlining++;
-    // update stats
+    // Update the stats:
     NumPartialInlined++;
   }

-  // Ditch the duplicate, since we're done with it, and rewrite all remaining
-  // users (function pointers, etc.) back to the original function.
-  DuplicateFunction->replaceAllUsesWith(F);
-  DuplicateFunction->eraseFromParent();
-
+  if (AnyInline && CalleeEntryCount)
+    F->setEntryCount(CalleeEntryCountV);

-  return ExtractedFunction;
+  return AnyInline;
 }

 bool PartialInlinerImpl::run(Module &M) {
diff --git a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
index d3a3c24ce7b4..659cb9df00a2 100644
--- a/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
+++ b/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp
@@ -16,6 +16,7 @@
 #include "llvm/Analysis/BasicAliasAnalysis.h"
 #include "llvm/Analysis/ModuleSummaryAnalysis.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/TypeMetadataUtils.h"
 #include "llvm/Bitcode/BitcodeWriter.h"
 #include "llvm/IR/Constants.h"
@@ -178,7 +179,7 @@ void filterModule(
   else
     GO = new GlobalVariable(
         *M, GA->getValueType(), false, GlobalValue::ExternalLinkage,
-        (Constant *)nullptr, "", (GlobalVariable *)nullptr,
+        nullptr, "", nullptr,
         GA->getThreadLocalMode(), GA->getType()->getAddressSpace());
   GO->takeName(GA);
   GA->replaceAllUsesWith(GO);
@@ -320,7 +321,8 @@ void splitAndWriteThinLTOBitcode(

   // FIXME: Try to re-use BSI and PFI from the original module here.
-  ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, nullptr);
+  ProfileSummaryInfo PSI(M);
+  ModuleSummaryIndex Index = buildModuleSummaryIndex(M, nullptr, &PSI);

   SmallVector<char, 0> Buffer;

diff --git a/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index 153a186d5ed4..0ca62b7ae40c 100644
--- a/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -847,92 +847,6 @@ Value *FAddCombine::createAddendVal(const FAddend &Opnd, bool &NeedNeg) {
   return createFMul(OpndVal, Coeff.getValue(Instr->getType()));
 }

-/// \brief Return true if we can prove that adding the two values of the
-/// knownbits will not overflow.
-/// Otherwise return false.
-static bool checkRippleForAdd(const KnownBits &LHSKnown, - const KnownBits &RHSKnown) { - // Addition of two 2's complement numbers having opposite signs will never - // overflow. - if ((LHSKnown.isNegative() && RHSKnown.isNonNegative()) || - (LHSKnown.isNonNegative() && RHSKnown.isNegative())) - return true; - - // If either of the values is known to be non-negative, adding them can only - // overflow if the second is also non-negative, so we can assume that. - // Two non-negative numbers will only overflow if there is a carry to the - // sign bit, so we can check if even when the values are as big as possible - // there is no overflow to the sign bit. - if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) { - APInt MaxLHS = ~LHSKnown.Zero; - MaxLHS.clearSignBit(); - APInt MaxRHS = ~RHSKnown.Zero; - MaxRHS.clearSignBit(); - APInt Result = std::move(MaxLHS) + std::move(MaxRHS); - return Result.isSignBitClear(); - } - - // If either of the values is known to be negative, adding them can only - // overflow if the second is also negative, so we can assume that. - // Two negative number will only overflow if there is no carry to the sign - // bit, so we can check if even when the values are as small as possible - // there is overflow to the sign bit. - if (LHSKnown.isNegative() || RHSKnown.isNegative()) { - APInt MinLHS = LHSKnown.One; - MinLHS.clearSignBit(); - APInt MinRHS = RHSKnown.One; - MinRHS.clearSignBit(); - APInt Result = std::move(MinLHS) + std::move(MinRHS); - return Result.isSignBitSet(); - } - - // If we reached here it means that we know nothing about the sign bits. - // In this case we can't know if there will be an overflow, since by - // changing the sign bits any two values can be made to overflow. - return false; -} - -/// Return true if we can prove that: -/// (sext (add LHS, RHS)) === (add (sext LHS), (sext RHS)) -/// This basically requires proving that the add in the original type would not -/// overflow to change the sign bit or have a carry out. -bool InstCombiner::WillNotOverflowSignedAdd(Value *LHS, Value *RHS, - Instruction &CxtI) { - // There are different heuristics we can use for this. Here are some simple - // ones. - - // If LHS and RHS each have at least two sign bits, the addition will look - // like - // - // XX..... + - // YY..... - // - // If the carry into the most significant position is 0, X and Y can't both - // be 1 and therefore the carry out of the addition is also 0. - // - // If the carry into the most significant position is 1, X and Y can't both - // be 0 and therefore the carry out of the addition is also 1. - // - // Since the carry into the most significant position is always equal to - // the carry out of the addition, there is no signed overflow. - if (ComputeNumSignBits(LHS, 0, &CxtI) > 1 && - ComputeNumSignBits(RHS, 0, &CxtI) > 1) - return true; - - unsigned BitWidth = LHS->getType()->getScalarSizeInBits(); - KnownBits LHSKnown(BitWidth); - computeKnownBits(LHS, LHSKnown, 0, &CxtI); - - KnownBits RHSKnown(BitWidth); - computeKnownBits(RHS, RHSKnown, 0, &CxtI); - - // Check if carry bit of addition will not cause overflow. 
- if (checkRippleForAdd(LHSKnown, RHSKnown)) - return true; - - return false; -} - /// \brief Return true if we can prove that: /// (sub LHS, RHS) === (sub nsw LHS, RHS) /// This basically requires proving that the add in the original type would not @@ -968,13 +882,9 @@ bool InstCombiner::WillNotOverflowSignedSub(Value *LHS, Value *RHS, bool InstCombiner::WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI) { // If the LHS is negative and the RHS is non-negative, no unsigned wrap. - bool LHSKnownNonNegative, LHSKnownNegative; - bool RHSKnownNonNegative, RHSKnownNegative; - ComputeSignBit(LHS, LHSKnownNonNegative, LHSKnownNegative, /*Depth=*/0, - &CxtI); - ComputeSignBit(RHS, RHSKnownNonNegative, RHSKnownNegative, /*Depth=*/0, - &CxtI); - if (LHSKnownNegative && RHSKnownNonNegative) + KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI); + KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI); + if (LHSKnown.isNegative() && RHSKnown.isNonNegative()) return true; return false; @@ -1041,6 +951,57 @@ static Value *checkForNegativeOperand(BinaryOperator &I, return nullptr; } +static Instruction *foldAddWithConstant(BinaryOperator &Add, + InstCombiner::BuilderTy &Builder) { + Value *Op0 = Add.getOperand(0), *Op1 = Add.getOperand(1); + const APInt *C; + if (!match(Op1, m_APInt(C))) + return nullptr; + + if (C->isSignMask()) { + // If wrapping is not allowed, then the addition must set the sign bit: + // X + (signmask) --> X | signmask + if (Add.hasNoSignedWrap() || Add.hasNoUnsignedWrap()) + return BinaryOperator::CreateOr(Op0, Op1); + + // If wrapping is allowed, then the addition flips the sign bit of LHS: + // X + (signmask) --> X ^ signmask + return BinaryOperator::CreateXor(Op0, Op1); + } + + Value *X; + const APInt *C2; + Type *Ty = Add.getType(); + + // Is this add the last step in a convoluted sext? + // add(zext(xor i16 X, -32768), -32768) --> sext X + if (match(Op0, m_ZExt(m_Xor(m_Value(X), m_APInt(C2)))) && + C2->isMinSignedValue() && C2->sext(Ty->getScalarSizeInBits()) == *C) + return CastInst::Create(Instruction::SExt, X, Ty); + + // (add (zext (add nuw X, C2)), C) --> (zext (add nuw X, C2 + C)) + // FIXME: This should check hasOneUse to not increase the instruction count? 
+ if (C->isNegative() && + match(Op0, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C2)))) && + C->sge(-C2->sext(C->getBitWidth()))) { + Constant *NewC = + ConstantInt::get(X->getType(), *C2 + C->trunc(C2->getBitWidth())); + return new ZExtInst(Builder.CreateNUWAdd(X, NewC), Ty); + } + + // Shifts and add used to flip and mask off the low bit: + // add (ashr (shl i32 X, 31), 31), 1 --> and (not X), 1 + const APInt *C3; + if (*C == 1 && match(Op0, m_OneUse(m_AShr(m_Shl(m_Value(X), m_APInt(C2)), + m_APInt(C3)))) && + C2 == C3 && *C2 == Ty->getScalarSizeInBits() - 1) { + Value *NotX = Builder.CreateNot(X); + return BinaryOperator::CreateAnd(NotX, ConstantInt::get(Ty, 1)); + } + + return nullptr; +} + Instruction *InstCombiner::visitAdd(BinaryOperator &I) { bool Changed = SimplifyAssociativeOrCommutative(I); Value *LHS = I.getOperand(0), *RHS = I.getOperand(1); @@ -1056,41 +1017,11 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (Value *V = SimplifyUsingDistributiveLaws(I)) return replaceInstUsesWith(I, V); - const APInt *RHSC; - if (match(RHS, m_APInt(RHSC))) { - if (RHSC->isSignMask()) { - // If wrapping is not allowed, then the addition must set the sign bit: - // X + (signmask) --> X | signmask - if (I.hasNoSignedWrap() || I.hasNoUnsignedWrap()) - return BinaryOperator::CreateOr(LHS, RHS); - - // If wrapping is allowed, then the addition flips the sign bit of LHS: - // X + (signmask) --> X ^ signmask - return BinaryOperator::CreateXor(LHS, RHS); - } - - // Is this add the last step in a convoluted sext? - Value *X; - const APInt *C; - if (match(LHS, m_ZExt(m_Xor(m_Value(X), m_APInt(C)))) && - C->isMinSignedValue() && - C->sext(LHS->getType()->getScalarSizeInBits()) == *RHSC) { - // add(zext(xor i16 X, -32768), -32768) --> sext X - return CastInst::Create(Instruction::SExt, X, LHS->getType()); - } - - if (RHSC->isNegative() && - match(LHS, m_ZExt(m_NUWAdd(m_Value(X), m_APInt(C)))) && - RHSC->sge(-C->sext(RHSC->getBitWidth()))) { - // (add (zext (add nuw X, C)), Val) -> (zext (add nuw X, C+Val)) - Constant *NewC = - ConstantInt::get(X->getType(), *C + RHSC->trunc(C->getBitWidth())); - return new ZExtInst(Builder->CreateNUWAdd(X, NewC), I.getType()); - } - } + if (Instruction *X = foldAddWithConstant(I, *Builder)) + return X; - // FIXME: Use the match above instead of dyn_cast to allow these transforms - // for splat vectors. + // FIXME: This should be moved into the above helper function to allow these + // transforms for splat vectors. if (ConstantInt *CI = dyn_cast(RHS)) { // zext(bool) + C -> bool ? C + 1 : C if (ZExtInst *ZI = dyn_cast(LHS)) @@ -1285,8 +1216,7 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { Constant *CI = ConstantExpr::getTrunc(RHSC, LHSConv->getOperand(0)->getType()); if (ConstantExpr::getZExt(CI, I.getType()) == RHSC && - computeOverflowForUnsignedAdd(LHSConv->getOperand(0), CI, &I) == - OverflowResult::NeverOverflows) { + willNotOverflowUnsignedAdd(LHSConv->getOperand(0), CI, I)) { // Insert the new, smaller add. Value *NewAdd = Builder->CreateNUWAdd(LHSConv->getOperand(0), CI, "addconv"); @@ -1303,9 +1233,8 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { if (LHSConv->getOperand(0)->getType() == RHSConv->getOperand(0)->getType() && (LHSConv->hasOneUse() || RHSConv->hasOneUse()) && - computeOverflowForUnsignedAdd(LHSConv->getOperand(0), - RHSConv->getOperand(0), - &I) == OverflowResult::NeverOverflows) { + willNotOverflowUnsignedAdd(LHSConv->getOperand(0), + RHSConv->getOperand(0), I)) { // Insert the new integer add. 
Value *NewAdd = Builder->CreateNUWAdd( LHSConv->getOperand(0), RHSConv->getOperand(0), "addconv"); @@ -1347,15 +1276,13 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) { } // TODO(jingyue): Consider WillNotOverflowSignedAdd and - // WillNotOverflowUnsignedAdd to reduce the number of invocations of + // willNotOverflowUnsignedAdd to reduce the number of invocations of // computeKnownBits. if (!I.hasNoSignedWrap() && WillNotOverflowSignedAdd(LHS, RHS, I)) { Changed = true; I.setHasNoSignedWrap(true); } - if (!I.hasNoUnsignedWrap() && - computeOverflowForUnsignedAdd(LHS, RHS, &I) == - OverflowResult::NeverOverflows) { + if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedAdd(LHS, RHS, I)) { Changed = true; I.setHasNoUnsignedWrap(true); } diff --git a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index b114801cc1c0..82dc88f1b3ad 100644 --- a/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -23,21 +23,6 @@ using namespace PatternMatch; #define DEBUG_TYPE "instcombine" -static inline Value *dyn_castNotVal(Value *V) { - // If this is not(not(x)) don't return that this is a not: we want the two - // not's to be folded first. - if (BinaryOperator::isNot(V)) { - Value *Operand = BinaryOperator::getNotArgument(V); - if (!IsFreeToInvert(Operand, Operand->hasOneUse())) - return Operand; - } - - // Constants can be considered to be not'ed values... - if (ConstantInt *C = dyn_cast(V)) - return ConstantInt::get(C->getType(), ~C->getValue()); - return nullptr; -} - /// Similar to getICmpCode but for FCmpInst. This encodes a fcmp predicate into /// a four bit mask. static unsigned getFCmpCode(FCmpInst::Predicate CC) { @@ -713,9 +698,8 @@ Value *InstCombiner::simplifyRangeCheck(ICmpInst *Cmp0, ICmpInst *Cmp1, } // This simplification is only valid if the upper range is not negative. - bool IsNegative, IsNotNegative; - ComputeSignBit(RangeEnd, IsNotNegative, IsNegative, /*Depth=*/0, Cmp1); - if (!IsNotNegative) + KnownBits Known = computeKnownBits(RangeEnd, /*Depth=*/0, Cmp1); + if (!Known.isNonNegative()) return nullptr; if (Inverted) @@ -1013,26 +997,22 @@ Value *InstCombiner::FoldAndOfFCmps(FCmpInst *LHS, FCmpInst *RHS) { /// (~A & ~B) == (~(A | B)) /// (~A | ~B) == (~(A & B)) static Instruction *matchDeMorgansLaws(BinaryOperator &I, - InstCombiner::BuilderTy *Builder) { + InstCombiner::BuilderTy &Builder) { auto Opcode = I.getOpcode(); assert((Opcode == Instruction::And || Opcode == Instruction::Or) && "Trying to match De Morgan's Laws with something other than and/or"); + // Flip the logic operation. - if (Opcode == Instruction::And) - Opcode = Instruction::Or; - else - Opcode = Instruction::And; + Opcode = (Opcode == Instruction::And) ? Instruction::Or : Instruction::And; - Value *Op0 = I.getOperand(0); - Value *Op1 = I.getOperand(1); - // TODO: Use pattern matchers instead of dyn_cast. 
- if (Value *Op0NotVal = dyn_castNotVal(Op0)) - if (Value *Op1NotVal = dyn_castNotVal(Op1)) - if (Op0->hasOneUse() && Op1->hasOneUse()) { - Value *LogicOp = Builder->CreateBinOp(Opcode, Op0NotVal, Op1NotVal, - I.getName() + ".demorgan"); - return BinaryOperator::CreateNot(LogicOp); - } + Value *A, *B; + if (match(I.getOperand(0), m_OneUse(m_Not(m_Value(A)))) && + match(I.getOperand(1), m_OneUse(m_Not(m_Value(B)))) && + !IsFreeToInvert(A, A->hasOneUse()) && + !IsFreeToInvert(B, B->hasOneUse())) { + Value *AndOr = Builder.CreateBinOp(Opcode, A, B, I.getName() + ".demorgan"); + return BinaryOperator::CreateNot(AndOr); + } return nullptr; } @@ -1376,7 +1356,7 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) { if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I)) return FoldedLogic; - if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) + if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder)) return DeMorgan; { @@ -2005,18 +1985,6 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (Value *V = SimplifyBSwap(I)) return replaceInstUsesWith(I, V); - if (ConstantInt *RHS = dyn_cast(Op1)) { - ConstantInt *C1 = nullptr; Value *X = nullptr; - // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2) - if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) && - Op0->hasOneUse()) { - Value *Or = Builder->CreateOr(X, RHS); - Or->takeName(Op0); - return BinaryOperator::CreateXor(Or, - Builder->getInt(C1->getValue() & ~RHS->getValue())); - } - } - if (isa(Op1)) if (Instruction *FoldedLogic = foldOpWithConstantIntoOperand(I)) return FoldedLogic; @@ -2167,7 +2135,7 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) { if (match(Op0, m_And(m_Or(m_Specific(Op1), m_Value(C)), m_Value(A)))) return BinaryOperator::CreateOr(Op1, Builder->CreateAnd(A, C)); - if (Instruction *DeMorgan = matchDeMorgansLaws(I, Builder)) + if (Instruction *DeMorgan = matchDeMorgansLaws(I, *Builder)) return DeMorgan; // Canonicalize xor to the RHS. @@ -2399,27 +2367,44 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) { } // Is this a 'not' (~) fed by a binary operator? 
- BinaryOperator *NotOp; - if (match(&I, m_Not(m_BinOp(NotOp)))) { - if (NotOp->getOpcode() == Instruction::And || - NotOp->getOpcode() == Instruction::Or) { + BinaryOperator *NotVal; + if (match(&I, m_Not(m_BinOp(NotVal)))) { + if (NotVal->getOpcode() == Instruction::And || + NotVal->getOpcode() == Instruction::Or) { // Apply DeMorgan's Law when inverts are free: // ~(X & Y) --> (~X | ~Y) // ~(X | Y) --> (~X & ~Y) - if (IsFreeToInvert(NotOp->getOperand(0), - NotOp->getOperand(0)->hasOneUse()) && - IsFreeToInvert(NotOp->getOperand(1), - NotOp->getOperand(1)->hasOneUse())) { - Value *NotX = Builder->CreateNot(NotOp->getOperand(0), "notlhs"); - Value *NotY = Builder->CreateNot(NotOp->getOperand(1), "notrhs"); - if (NotOp->getOpcode() == Instruction::And) + if (IsFreeToInvert(NotVal->getOperand(0), + NotVal->getOperand(0)->hasOneUse()) && + IsFreeToInvert(NotVal->getOperand(1), + NotVal->getOperand(1)->hasOneUse())) { + Value *NotX = Builder->CreateNot(NotVal->getOperand(0), "notlhs"); + Value *NotY = Builder->CreateNot(NotVal->getOperand(1), "notrhs"); + if (NotVal->getOpcode() == Instruction::And) return BinaryOperator::CreateOr(NotX, NotY); return BinaryOperator::CreateAnd(NotX, NotY); } - } else if (NotOp->getOpcode() == Instruction::AShr) { - // ~(~X >>s Y) --> (X >>s Y) - if (Value *Op0NotVal = dyn_castNotVal(NotOp->getOperand(0))) - return BinaryOperator::CreateAShr(Op0NotVal, NotOp->getOperand(1)); + } + + // ~(~X >>s Y) --> (X >>s Y) + if (match(NotVal, m_AShr(m_Not(m_Value(X)), m_Value(Y)))) + return BinaryOperator::CreateAShr(X, Y); + + // If we are inverting a right-shifted constant, we may be able to eliminate + // the 'not' by inverting the constant and using the opposite shift type. + // Canonicalization rules ensure that only a negative constant uses 'ashr', + // but we must check that in case that transform has not fired yet. + const APInt *C; + if (match(NotVal, m_AShr(m_APInt(C), m_Value(Y))) && C->isNegative()) { + // ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits) + Constant *NotC = ConstantInt::get(I.getType(), ~(*C)); + return BinaryOperator::CreateLShr(NotC, Y); + } + + if (match(NotVal, m_LShr(m_APInt(C), m_Value(Y))) && C->isNonNegative()) { + // ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits) + Constant *NotC = ConstantInt::get(I.getType(), ~(*C)); + return BinaryOperator::CreateAShr(NotC, Y); } } diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index 6989d67f0060..face7abcc95f 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -1384,10 +1384,10 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) { // Create a mask for bits above (ctlz) or below (cttz) the first known one. bool IsTZ = II.getIntrinsicID() == Intrinsic::cttz; - unsigned PossibleZeros = IsTZ ? Known.One.countTrailingZeros() - : Known.One.countLeadingZeros(); - unsigned DefiniteZeros = IsTZ ? Known.Zero.countTrailingOnes() - : Known.Zero.countLeadingOnes(); + unsigned PossibleZeros = IsTZ ? Known.countMaxTrailingZeros() + : Known.countMaxLeadingZeros(); + unsigned DefiniteZeros = IsTZ ? Known.countMinTrailingZeros() + : Known.countMinLeadingZeros(); // If all bits above (ctlz) or below (cttz) the first known one are known // zero, this value is constant. 
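The cttz/ctlz hunk above rephrases the old countTrailingOnes/countLeadingOnes arithmetic in terms of the new KnownBits helpers; when the minimum and maximum possible zero counts agree, the intrinsic folds to a constant. A condensed sketch of that check (assuming the KnownBits API as used in the hunk; the helper name is mine):

    #include "llvm/Support/KnownBits.h"

    // Sketch: cttz(X) is a known constant when the count of bits known zero
    // below the first one matches the count up to the first known one.
    bool cttzIsConstant(const llvm::KnownBits &Known, unsigned &Result) {
      unsigned DefiniteZeros = Known.countMinTrailingZeros();
      unsigned PossibleZeros = Known.countMaxTrailingZeros();
      if (DefiniteZeros != PossibleZeros)
        return false;
      Result = DefiniteZeros; // e.g. known low bits ...?100 give exactly 2
      return true;
    }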
diff --git a/lib/Transforms/InstCombine/InstCombineCasts.cpp b/lib/Transforms/InstCombine/InstCombineCasts.cpp index 312d9baae43a..001a4bcf16f3 100644 --- a/lib/Transforms/InstCombine/InstCombineCasts.cpp +++ b/lib/Transforms/InstCombine/InstCombineCasts.cpp @@ -559,6 +559,9 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { return new ICmpInst(ICmpInst::ICMP_NE, Src, Zero); } + // FIXME: Maybe combine the next two transforms to handle the no cast case + // more efficiently. Support vector types. Cleanup code by using m_OneUse. + // Transform trunc(lshr (zext A), Cst) to eliminate one type conversion. Value *A = nullptr; ConstantInt *Cst = nullptr; if (Src->hasOneUse() && @@ -588,15 +591,20 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) { // the sign bit of the original value; performing ashr instead of lshr // generates bits of the same value as the sign bit. if (Src->hasOneUse() && - match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst))) && - cast(Src)->getOperand(0)->hasOneUse()) { + match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst)))) { + Value *SExt = cast(Src)->getOperand(0); + const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits(); const unsigned ASize = A->getType()->getPrimitiveSizeInBits(); + unsigned ShiftAmt = Cst->getZExtValue(); // This optimization can be only performed when zero bits generated by // the original lshr aren't pulled into the value after truncation, so we - // can only shift by values smaller than the size of destination type (in - // bits). - if (Cst->getValue().ult(ASize)) { - Value *Shift = Builder->CreateAShr(A, Cst->getZExtValue()); + // can only shift by values no larger than the number of extension bits. + // FIXME: Instead of bailing when the shift is too large, use and to clear + // the extra bits. + if (SExt->hasOneUse() && ShiftAmt <= SExtSize - ASize) { + // If shifting by the size of the original value in bits or more, it is + // being filled with the sign bit, so shift by ASize-1 to avoid ub. + Value *Shift = Builder->CreateAShr(A, std::min(ShiftAmt, ASize-1)); Shift->takeName(Src); return CastInst::CreateIntegerCast(Shift, CI.getType(), true); } @@ -1180,9 +1188,8 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) { // If we know that the value being extended is positive, we can use a zext // instead. - bool KnownZero, KnownOne; - ComputeSignBit(Src, KnownZero, KnownOne, 0, &CI); - if (KnownZero) { + KnownBits Known = computeKnownBits(Src, 0, &CI); + if (Known.isNonNegative()) { Value *ZExt = Builder->CreateZExt(Src, DestTy); return replaceInstUsesWith(CI, ZExt); } diff --git a/lib/Transforms/InstCombine/InstCombineCompares.cpp b/lib/Transforms/InstCombine/InstCombineCompares.cpp index 34ce235b3fe2..60ed4057cedd 100644 --- a/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -2785,6 +2785,9 @@ Instruction *InstCombiner::foldICmpInstWithConstantNotInt(ICmpInst &I) { } /// Try to fold icmp (binop), X or icmp X, (binop). +/// TODO: A large part of this logic is duplicated in InstSimplify's +/// simplifyICmpWithBinOp(). We should be able to share that and avoid the code +/// duplication. 
Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1); @@ -2794,7 +2797,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { if (!BO0 && !BO1) return nullptr; - CmpInst::Predicate Pred = I.getPredicate(); + const CmpInst::Predicate Pred = I.getPredicate(); bool NoOp0WrapProblem = false, NoOp1WrapProblem = false; if (BO0 && isa(BO0)) NoOp0WrapProblem = @@ -3029,21 +3032,20 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { case Instruction::Sub: case Instruction::Xor: if (I.isEquality()) // a+x icmp eq/ne b+x --> a icmp b - return new ICmpInst(I.getPredicate(), BO0->getOperand(0), - BO1->getOperand(0)); + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); // icmp u/s (a ^ signmask), (b ^ signmask) --> icmp s/u a, b if (ConstantInt *CI = dyn_cast(BO0->getOperand(1))) { if (CI->getValue().isSignMask()) { - ICmpInst::Predicate Pred = + ICmpInst::Predicate NewPred = I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate(); - return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0)); } if (BO0->getOpcode() == Instruction::Xor && CI->isMaxValue(true)) { - ICmpInst::Predicate Pred = + ICmpInst::Predicate NewPred = I.isSigned() ? I.getUnsignedPredicate() : I.getSignedPredicate(); - Pred = I.getSwappedPredicate(Pred); - return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + NewPred = I.getSwappedPredicate(NewPred); + return new ICmpInst(NewPred, BO0->getOperand(0), BO1->getOperand(0)); } } break; @@ -3062,21 +3064,27 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { AP.getBitWidth() - AP.countTrailingZeros())); Value *And1 = Builder->CreateAnd(BO0->getOperand(0), Mask); Value *And2 = Builder->CreateAnd(BO1->getOperand(0), Mask); - return new ICmpInst(I.getPredicate(), And1, And2); + return new ICmpInst(Pred, And1, And2); } } break; + case Instruction::UDiv: case Instruction::LShr: - if (I.isSigned()) + if (I.isSigned() || !BO0->isExact() || !BO1->isExact()) break; - LLVM_FALLTHROUGH; + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + case Instruction::SDiv: + if (!I.isEquality() || !BO0->isExact() || !BO1->isExact()) + break; + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + case Instruction::AShr: if (!BO0->isExact() || !BO1->isExact()) break; - return new ICmpInst(I.getPredicate(), BO0->getOperand(0), - BO1->getOperand(0)); + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); + case Instruction::Shl: { bool NUW = BO0->hasNoUnsignedWrap() && BO1->hasNoUnsignedWrap(); bool NSW = BO0->hasNoSignedWrap() && BO1->hasNoSignedWrap(); @@ -3084,8 +3092,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { break; if (!NSW && I.isSigned()) break; - return new ICmpInst(I.getPredicate(), BO0->getOperand(0), - BO1->getOperand(0)); + return new ICmpInst(Pred, BO0->getOperand(0), BO1->getOperand(0)); } } } @@ -3096,7 +3103,7 @@ Instruction *InstCombiner::foldICmpBinOp(ICmpInst &I) { auto BitwiseAnd = m_CombineOr(m_And(m_Value(), LSubOne), m_And(LSubOne, m_Value())); - if (match(BO0, BitwiseAnd) && I.getPredicate() == ICmpInst::ICMP_ULT) { + if (match(BO0, BitwiseAnd) && Pred == ICmpInst::ICMP_ULT) { auto *Zero = Constant::getNullValue(BO0->getType()); return new ICmpInst(ICmpInst::ICMP_NE, Op1, Zero); } diff --git a/lib/Transforms/InstCombine/InstCombineInternal.h b/lib/Transforms/InstCombine/InstCombineInternal.h index 3be6419a129a..1424f61fe701 100644 --- 
a/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/lib/Transforms/InstCombine/InstCombineInternal.h @@ -30,6 +30,7 @@ #include "llvm/IR/PatternMatch.h" #include "llvm/Pass.h" #include "llvm/Support/Dwarf.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Transforms/InstCombine/InstCombineWorklist.h" #include "llvm/Transforms/Utils/Local.h" @@ -388,10 +389,21 @@ private: bool DoTransform = true); Instruction *transformSExtICmp(ICmpInst *ICI, Instruction &CI); - bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI); + bool WillNotOverflowSignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) { + return computeOverflowForSignedAdd(LHS, RHS, &CxtI) == + OverflowResult::NeverOverflows; + }; + bool willNotOverflowUnsignedAdd(Value *LHS, Value *RHS, Instruction &CxtI) { + return computeOverflowForUnsignedAdd(LHS, RHS, &CxtI) == + OverflowResult::NeverOverflows; + }; bool WillNotOverflowSignedSub(Value *LHS, Value *RHS, Instruction &CxtI); bool WillNotOverflowUnsignedSub(Value *LHS, Value *RHS, Instruction &CxtI); bool WillNotOverflowSignedMul(Value *LHS, Value *RHS, Instruction &CxtI); + bool willNotOverflowUnsignedMul(Value *LHS, Value *RHS, Instruction &CxtI) { + return computeOverflowForUnsignedMul(LHS, RHS, &CxtI) == + OverflowResult::NeverOverflows; + }; Value *EmitGEPOffset(User *GEP); Instruction *scalarizePHI(ExtractElementInst &EI, PHINode *PN); Value *EvaluateInDifferentElementOrder(Value *V, ArrayRef Mask); @@ -492,7 +504,11 @@ public: void computeKnownBits(Value *V, KnownBits &Known, unsigned Depth, Instruction *CxtI) const { - return llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT); + llvm::computeKnownBits(V, Known, DL, Depth, &AC, CxtI, &DT); + } + KnownBits computeKnownBits(Value *V, unsigned Depth, + Instruction *CxtI) const { + return llvm::computeKnownBits(V, DL, Depth, &AC, CxtI, &DT); } bool MaskedValueIsZero(Value *V, const APInt &Mask, unsigned Depth = 0, @@ -503,11 +519,6 @@ public: Instruction *CxtI = nullptr) const { return llvm::ComputeNumSignBits(Op, DL, Depth, &AC, CxtI, &DT); } - void ComputeSignBit(Value *V, bool &KnownZero, bool &KnownOne, - unsigned Depth = 0, Instruction *CxtI = nullptr) const { - return llvm::ComputeSignBit(V, KnownZero, KnownOne, DL, Depth, &AC, CxtI, - &DT); - } OverflowResult computeOverflowForUnsignedMul(Value *LHS, Value *RHS, const Instruction *CxtI) { return llvm::computeOverflowForUnsignedMul(LHS, RHS, DL, &AC, CxtI, &DT); @@ -516,6 +527,11 @@ public: const Instruction *CxtI) { return llvm::computeOverflowForUnsignedAdd(LHS, RHS, DL, &AC, CxtI, &DT); } + OverflowResult computeOverflowForSignedAdd(const Value *LHS, + const Value *RHS, + const Instruction *CxtI) const { + return llvm::computeOverflowForSignedAdd(LHS, RHS, DL, &AC, CxtI, &DT); + } /// Maximum size of array considered when transforming. uint64_t MaxArraySizeForCombine; diff --git a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp index 675553017838..a4d84ae81aa0 100644 --- a/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp +++ b/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp @@ -885,10 +885,8 @@ static bool canReplaceGEPIdxWithZero(InstCombiner &IC, GetElementPtrInst *GEPI, // first non-zero index. 
auto IsAllNonNegative = [&]() { for (unsigned i = Idx+1, e = GEPI->getNumOperands(); i != e; ++i) { - bool KnownNonNegative, KnownNegative; - IC.ComputeSignBit(GEPI->getOperand(i), KnownNonNegative, - KnownNegative, 0, MemI); - if (KnownNonNegative) + KnownBits Known = IC.computeKnownBits(GEPI->getOperand(i), 0, MemI); + if (Known.isNonNegative()) continue; return false; } diff --git a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index face9d9237ae..2a35259f2103 100644 --- a/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -162,11 +162,9 @@ bool InstCombiner::WillNotOverflowSignedMul(Value *LHS, Value *RHS, // product is exactly the minimum negative number. // E.g. mul i16 with 17 sign bits: 0xff00 * 0xff80 = 0x8000 // For simplicity we just check if at least one side is not negative. - bool LHSNonNegative, LHSNegative; - bool RHSNonNegative, RHSNegative; - ComputeSignBit(LHS, LHSNonNegative, LHSNegative, /*Depth=*/0, &CxtI); - ComputeSignBit(RHS, RHSNonNegative, RHSNegative, /*Depth=*/0, &CxtI); - if (LHSNonNegative || RHSNonNegative) + KnownBits LHSKnown = computeKnownBits(LHS, /*Depth=*/0, &CxtI); + KnownBits RHSKnown = computeKnownBits(RHS, /*Depth=*/0, &CxtI); + if (LHSKnown.isNonNegative() || RHSKnown.isNonNegative()) return true; } return false; @@ -422,8 +420,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { Constant *CI = ConstantExpr::getTrunc(Op1C, Op0Conv->getOperand(0)->getType()); if (ConstantExpr::getZExt(CI, I.getType()) == Op1C && - computeOverflowForUnsignedMul(Op0Conv->getOperand(0), CI, &I) == - OverflowResult::NeverOverflows) { + willNotOverflowUnsignedMul(Op0Conv->getOperand(0), CI, I)) { // Insert the new, smaller mul. Value *NewMul = Builder->CreateNUWMul(Op0Conv->getOperand(0), CI, "mulconv"); @@ -440,9 +437,8 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { if (Op0Conv->getOperand(0)->getType() == Op1Conv->getOperand(0)->getType() && (Op0Conv->hasOneUse() || Op1Conv->hasOneUse()) && - computeOverflowForUnsignedMul(Op0Conv->getOperand(0), - Op1Conv->getOperand(0), - &I) == OverflowResult::NeverOverflows) { + willNotOverflowUnsignedMul(Op0Conv->getOperand(0), + Op1Conv->getOperand(0), I)) { // Insert the new integer mul. 
Value *NewMul = Builder->CreateNUWMul( Op0Conv->getOperand(0), Op1Conv->getOperand(0), "mulconv"); @@ -456,9 +452,7 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) { I.setHasNoSignedWrap(true); } - if (!I.hasNoUnsignedWrap() && - computeOverflowForUnsignedMul(Op0, Op1, &I) == - OverflowResult::NeverOverflows) { + if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedMul(Op0, Op1, I)) { Changed = true; I.setHasNoUnsignedWrap(true); } diff --git a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp index 05b01774cd5e..4028a92771a4 100644 --- a/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp +++ b/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp @@ -611,7 +611,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask, SimplifyDemandedBits(I, 1, AllOnes, Known2, Depth + 1)) return I; - unsigned Leaders = Known2.Zero.countLeadingOnes(); + unsigned Leaders = Known2.countMinLeadingZeros(); Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask; break; } diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 1792cb585f87..65b1148cb03b 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -2212,9 +2212,9 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { // Canonicalize fcmp_one -> fcmp_oeq FCmpInst::Predicate FPred; Value *Y; - if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), - TrueDest, FalseDest)) && - BI.getCondition()->hasOneUse()) + if (match(&BI, m_Br(m_OneUse(m_FCmp(FPred, m_Value(X), m_Value(Y))), + TrueDest, FalseDest))) { + // TODO: Why are we only transforming these 3 predicates? if (FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE || FPred == FCmpInst::FCMP_OGE) { FCmpInst *Cond = cast(BI.getCondition()); @@ -2225,12 +2225,12 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { Worklist.Add(Cond); return &BI; } + } // Canonicalize icmp_ne -> icmp_eq ICmpInst::Predicate IPred; - if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)), - TrueDest, FalseDest)) && - BI.getCondition()->hasOneUse()) + if (match(&BI, m_Br(m_OneUse(m_ICmp(IPred, m_Value(X), m_Value(Y))), + TrueDest, FalseDest))) { if (IPred == ICmpInst::ICMP_NE || IPred == ICmpInst::ICMP_ULE || IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE || IPred == ICmpInst::ICMP_SGE) { @@ -2241,6 +2241,7 @@ Instruction *InstCombiner::visitBranchInst(BranchInst &BI) { Worklist.Add(Cond); return &BI; } + } return nullptr; } @@ -2264,8 +2265,8 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) { unsigned BitWidth = cast(Cond->getType())->getBitWidth(); KnownBits Known(BitWidth); computeKnownBits(Cond, Known, 0, &SI); - unsigned LeadingKnownZeros = Known.Zero.countLeadingOnes(); - unsigned LeadingKnownOnes = Known.One.countLeadingOnes(); + unsigned LeadingKnownZeros = Known.countMinLeadingZeros(); + unsigned LeadingKnownOnes = Known.countMinLeadingOnes(); // Compute the number of leading bits we can ignore. // TODO: A better way to determine this would use ComputeNumSignBits(). @@ -3141,7 +3142,7 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, // Lower dbg.declare intrinsics otherwise their value may be clobbered // by instcombiner. - bool DbgDeclaresChanged = LowerDbgDeclare(F); + bool MadeIRChange = LowerDbgDeclare(F); // Iterate while there is work to do. 
int Iteration = 0; @@ -3150,18 +3151,17 @@ combineInstructionsOverFunction(Function &F, InstCombineWorklist &Worklist, DEBUG(dbgs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " << F.getName() << "\n"); - bool Changed = prepareICWorklistFromFunction(F, DL, &TLI, Worklist); + MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist); InstCombiner IC(Worklist, &Builder, F.optForMinSize(), ExpensiveCombines, AA, AC, TLI, DT, DL, LI); IC.MaxArraySizeForCombine = MaxArraySize; - Changed |= IC.run(); - if (!Changed) + if (!IC.run()) break; } - return DbgDeclaresChanged || Iteration > 1; + return MadeIRChange || Iteration > 1; } PreservedAnalyses InstCombinePass::run(Function &F, diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp index b034ccc46933..7eea44d6aca0 100644 --- a/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -613,7 +613,15 @@ public: bool UseGlobalsGC = true) : ModulePass(ID), CompileKernel(CompileKernel || ClEnableKasan), Recover(Recover || ClRecover), - UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC) {} + UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC), + // Not a typo: ClWithComdat is almost completely pointless without + // ClUseGlobalsGC (because then it only works on modules without + // globals, which are rare); it is a prerequisite for ClUseGlobalsGC; + // and both suffer from gold PR19002 for which UseGlobalsGC constructor + // argument is designed as workaround. Therefore, disable both + // ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to + // do globals-gc. + UseCtorComdat(UseGlobalsGC && ClWithComdat) {} bool runOnModule(Module &M) override; static char ID; // Pass identification, replacement for typeid StringRef getPassName() const override { return "AddressSanitizerModule"; } @@ -656,6 +664,7 @@ private: bool CompileKernel; bool Recover; bool UseGlobalsGC; + bool UseCtorComdat; Type *IntptrTy; LLVMContext *C; Triple TargetTriple; @@ -1677,7 +1686,7 @@ AddressSanitizerModule::CreateMetadataGlobal(Module &M, Constant *Initializer, : GlobalVariable::PrivateLinkage; GlobalVariable *Metadata = new GlobalVariable( M, Initializer->getType(), false, Linkage, Initializer, - Twine("__asan_global_") + GlobalValue::getRealLinkageName(OriginalName)); + Twine("__asan_global_") + GlobalValue::dropLLVMManglingEscape(OriginalName)); Metadata->setSection(getGlobalMetadataSection()); return Metadata; } @@ -1782,7 +1791,7 @@ void AddressSanitizerModule::InstrumentGlobalsMachO( // On recent Mach-O platforms, use a structure which binds the liveness of // the global variable to the metadata struct. Keep the list of "Liveness" GV // created to be added to llvm.compiler.used - StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy, nullptr); + StructType *LivenessTy = StructType::get(IntptrTy, IntptrTy); SmallVector LivenessGlobals(ExtendedGlobals.size()); for (size_t i = 0; i < ExtendedGlobals.size(); i++) { @@ -1793,9 +1802,9 @@ void AddressSanitizerModule::InstrumentGlobalsMachO( // On recent Mach-O platforms, we emit the global metadata in a way that // allows the linker to properly strip dead globals. 
- auto LivenessBinder = ConstantStruct::get( - LivenessTy, Initializer->getAggregateElement(0u), - ConstantExpr::getPointerCast(Metadata, IntptrTy), nullptr); + auto LivenessBinder = + ConstantStruct::get(LivenessTy, Initializer->getAggregateElement(0u), + ConstantExpr::getPointerCast(Metadata, IntptrTy)); GlobalVariable *Liveness = new GlobalVariable( M, LivenessTy, false, GlobalVariable::InternalLinkage, LivenessBinder, Twine("__asan_binder_") + G->getName()); @@ -1893,7 +1902,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool // We initialize an array of such structures and pass it to a run-time call. StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy, IntptrTy, IntptrTy, IntptrTy, - IntptrTy, IntptrTy, IntptrTy, nullptr); + IntptrTy, IntptrTy, IntptrTy); SmallVector NewGlobals(n); SmallVector Initializers(n); @@ -1929,10 +1938,9 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0); Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); - StructType *NewTy = StructType::get(Ty, RightRedZoneTy, nullptr); - Constant *NewInitializer = - ConstantStruct::get(NewTy, G->getInitializer(), - Constant::getNullValue(RightRedZoneTy), nullptr); + StructType *NewTy = StructType::get(Ty, RightRedZoneTy); + Constant *NewInitializer = ConstantStruct::get( + NewTy, G->getInitializer(), Constant::getNullValue(RightRedZoneTy)); // Create a new global variable with enough space for a redzone. GlobalValue::LinkageTypes Linkage = G->getLinkage(); @@ -2013,7 +2021,7 @@ bool AddressSanitizerModule::InstrumentGlobals(IRBuilder<> &IRB, Module &M, bool ConstantExpr::getPointerCast(Name, IntptrTy), ConstantExpr::getPointerCast(ModuleName, IntptrTy), ConstantInt::get(IntptrTy, MD.IsDynInit), SourceLoc, - ConstantExpr::getPointerCast(ODRIndicator, IntptrTy), nullptr); + ConstantExpr::getPointerCast(ODRIndicator, IntptrTy)); if (ClInitializers && MD.IsDynInit) HasDynamicallyInitializedGlobals = true; @@ -2073,7 +2081,7 @@ bool AddressSanitizerModule::runOnModule(Module &M) { // Put the constructor and destructor in comdat if both // (1) global instrumentation is not TU-specific // (2) target is ELF. 
- if (ClWithComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) { + if (UseCtorComdat && TargetTriple.isOSBinFormatELF() && CtorComdat) { AsanCtorFunction->setComdat(M.getOrInsertComdat(kAsanModuleCtorName)); appendToGlobalCtors(M, AsanCtorFunction, kAsanCtorAndDtorPriority, AsanCtorFunction); diff --git a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 8786781933ea..e2e3cbdbc295 100644 --- a/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -388,7 +388,7 @@ FunctionType *DataFlowSanitizer::getArgsFunctionType(FunctionType *T) { ArgTypes.push_back(ShadowPtrTy); Type *RetType = T->getReturnType(); if (!RetType->isVoidTy()) - RetType = StructType::get(RetType, ShadowTy, (Type *)nullptr); + RetType = StructType::get(RetType, ShadowTy); return FunctionType::get(RetType, ArgTypes, T->isVarArg()); } @@ -476,16 +476,14 @@ bool DataFlowSanitizer::doInitialization(Module &M) { GetArgTLS = ConstantExpr::getIntToPtr( ConstantInt::get(IntptrTy, uintptr_t(GetArgTLSPtr)), PointerType::getUnqual( - FunctionType::get(PointerType::getUnqual(ArgTLSTy), - (Type *)nullptr))); + FunctionType::get(PointerType::getUnqual(ArgTLSTy), false))); } if (GetRetvalTLSPtr) { RetvalTLS = nullptr; GetRetvalTLS = ConstantExpr::getIntToPtr( ConstantInt::get(IntptrTy, uintptr_t(GetRetvalTLSPtr)), PointerType::getUnqual( - FunctionType::get(PointerType::getUnqual(ShadowTy), - (Type *)nullptr))); + FunctionType::get(PointerType::getUnqual(ShadowTy), false))); } ColdCallWeights = MDBuilder(*Ctx).createBranchWeights(1, 1000); diff --git a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp index 7dea1dee756a..e89384c559fe 100644 --- a/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp +++ b/lib/Transforms/Instrumentation/EfficiencySanitizer.cpp @@ -398,8 +398,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV( // u64 *ArrayCounter; // }; auto *StructInfoTy = - StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy, - Int8PtrPtrTy, Int64PtrTy, Int64PtrTy, nullptr); + StructType::get(Int8PtrTy, Int32Ty, Int32Ty, Int32PtrTy, Int32PtrTy, + Int8PtrPtrTy, Int64PtrTy, Int64PtrTy); auto *StructInfoPtrTy = StructInfoTy->getPointerTo(); // This structure should be kept consistent with the CacheFragInfo struct // in the runtime library. @@ -408,8 +408,7 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV( // u32 NumStructs; // StructInfo *Structs; // }; - auto *CacheFragInfoTy = - StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy, nullptr); + auto *CacheFragInfoTy = StructType::get(Int8PtrTy, Int32Ty, StructInfoPtrTy); std::vector Vec = M.getIdentifiedStructTypes(); unsigned NumStructs = 0; @@ -457,24 +456,23 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV( ArrayCounterIdx[0] = ConstantInt::get(Int32Ty, 0); ArrayCounterIdx[1] = ConstantInt::get(Int32Ty, getArrayCounterIdx(StructTy)); - Initializers.push_back( - ConstantStruct::get( - StructInfoTy, - ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy), - ConstantInt::get(Int32Ty, - DL.getStructLayout(StructTy)->getSizeInBytes()), - ConstantInt::get(Int32Ty, StructTy->getNumElements()), - Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) : - ConstantExpr::getPointerCast(Offset, Int32PtrTy), - Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) : - ConstantExpr::getPointerCast(Size, Int32PtrTy), - TypeName == nullptr ? 
ConstantPointerNull::get(Int8PtrPtrTy) : - ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy), - ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, - FieldCounterIdx), - ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, - ArrayCounterIdx), - nullptr)); + Initializers.push_back(ConstantStruct::get( + StructInfoTy, + ConstantExpr::getPointerCast(StructCounterName, Int8PtrTy), + ConstantInt::get(Int32Ty, + DL.getStructLayout(StructTy)->getSizeInBytes()), + ConstantInt::get(Int32Ty, StructTy->getNumElements()), + Offset == nullptr ? ConstantPointerNull::get(Int32PtrTy) + : ConstantExpr::getPointerCast(Offset, Int32PtrTy), + Size == nullptr ? ConstantPointerNull::get(Int32PtrTy) + : ConstantExpr::getPointerCast(Size, Int32PtrTy), + TypeName == nullptr + ? ConstantPointerNull::get(Int8PtrPtrTy) + : ConstantExpr::getPointerCast(TypeName, Int8PtrPtrTy), + ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, + FieldCounterIdx), + ConstantExpr::getGetElementPtr(CounterArrayTy, Counters, + ArrayCounterIdx))); } // Structs. Constant *StructInfo; @@ -491,11 +489,8 @@ GlobalVariable *EfficiencySanitizer::createCacheFragInfoGV( auto *CacheFragInfoGV = new GlobalVariable( M, CacheFragInfoTy, true, GlobalVariable::InternalLinkage, - ConstantStruct::get(CacheFragInfoTy, - UnitName, - ConstantInt::get(Int32Ty, NumStructs), - StructInfo, - nullptr)); + ConstantStruct::get(CacheFragInfoTy, UnitName, + ConstantInt::get(Int32Ty, NumStructs), StructInfo)); return CacheFragInfoGV; } diff --git a/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/lib/Transforms/Instrumentation/MemorySanitizer.cpp index 15333a5317dd..ff753c20a94a 100644 --- a/lib/Transforms/Instrumentation/MemorySanitizer.cpp +++ b/lib/Transforms/Instrumentation/MemorySanitizer.cpp @@ -1576,13 +1576,16 @@ struct MemorySanitizerVisitor : public InstVisitor { Value *CreateShadowCast(IRBuilder<> &IRB, Value *V, Type *dstTy, bool Signed = false) { Type *srcTy = V->getType(); + size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy); + size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy); + if (srcSizeInBits > 1 && dstSizeInBits == 1) + return IRB.CreateICmpNE(V, getCleanShadow(V)); + if (dstTy->isIntegerTy() && srcTy->isIntegerTy()) return IRB.CreateIntCast(V, dstTy, Signed); if (dstTy->isVectorTy() && srcTy->isVectorTy() && dstTy->getVectorNumElements() == srcTy->getVectorNumElements()) return IRB.CreateIntCast(V, dstTy, Signed); - size_t srcSizeInBits = VectorOrPrimitiveTypeSizeInBits(srcTy); - size_t dstSizeInBits = VectorOrPrimitiveTypeSizeInBits(dstTy); Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits)); Value *V2 = IRB.CreateIntCast(V1, Type::getIntNTy(*MS.C, dstSizeInBits), Signed); diff --git a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp index 3f1a77b49a44..ee493a8ec7e1 100644 --- a/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp +++ b/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp @@ -442,9 +442,8 @@ static bool processAdd(BinaryOperator *AddOp, LazyValueInfo *LVI) { bool Changed = false; if (!NUW) { - ConstantRange NUWRange = - LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange, - OBO::NoUnsignedWrap); + ConstantRange NUWRange = ConstantRange::makeGuaranteedNoWrapRegion( + BinaryOperator::Add, LRange, OBO::NoUnsignedWrap); if (!NUWRange.isEmptySet()) { bool NewNUW = NUWRange.contains(LazyRRange()); AddOp->setHasNoUnsignedWrap(NewNUW); @@ -452,9 +451,8 @@ static bool processAdd(BinaryOperator *AddOp, 
                          LazyValueInfo *LVI) {
     }
   }
 
   if (!NSW) {
-    ConstantRange NSWRange =
-        LRange.makeGuaranteedNoWrapRegion(BinaryOperator::Add, LRange,
-                                          OBO::NoSignedWrap);
+    ConstantRange NSWRange = ConstantRange::makeGuaranteedNoWrapRegion(
+        BinaryOperator::Add, LRange, OBO::NoSignedWrap);
     if (!NSWRange.isEmptySet()) {
       bool NewNSW = NSWRange.contains(LazyRRange());
       AddOp->setHasNoSignedWrap(NewNSW);
diff --git a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 48d5ae88cda9..6693a26e8890 100644
--- a/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -144,6 +144,10 @@ private:
   bool recognizePopcount();
   void transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst,
                                PHINode *CntPhi, Value *Var);
+  bool recognizeAndInsertCTLZ();
+  void transformLoopToCountable(BasicBlock *PreCondBB, Instruction *CntInst,
+                                PHINode *CntPhi, Value *Var, const DebugLoc DL,
+                                bool ZeroCheck, bool IsCntPhiUsedOutsideLoop);
 
   /// @}
 };
@@ -994,7 +998,7 @@ bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
 }
 
 bool LoopIdiomRecognize::runOnNoncountableLoop() {
-  return recognizePopcount();
+  return recognizePopcount() || recognizeAndInsertCTLZ();
 }
 
 /// Check if the given conditional branch is based on the comparison between
@@ -1159,6 +1163,167 @@ static bool detectPopcountIdiom(Loop *CurLoop, BasicBlock *PreCondBB,
   return true;
 }
 
+/// Return true if the CTLZ idiom is detected in the loop.
+///
+/// Additionally:
+/// 1) \p CntInst is set to the instruction counting the leading zeros (CTLZ),
+///    or nullptr if there is no such instruction.
+/// 2) \p CntPhi is set to the corresponding phi node,
+///    or nullptr if there is no such node.
+/// 3) \p Var is set to the value whose CTLZ could be used.
+/// 4) \p DefX is set to the instruction calculating the loop exit condition.
+///
+/// The core idiom we are trying to detect is:
+/// \code
+///    if (x0 == 0)
+///      goto loop-exit // the precondition of the loop
+///    cnt0 = init-val;
+///    do {
+///       x = phi(x0, x.next);       // PhiX
+///       cnt = phi(cnt0, cnt.next);
+///
+///       cnt.next = cnt + 1;
+///        ...
+///       x.next = x >> 1;           // DefX
+///        ...
+///    } while (x.next != 0);
+///
+/// loop-exit:
+/// \endcode
+static bool detectCTLZIdiom(Loop *CurLoop, PHINode *&PhiX,
+                            Instruction *&CntInst, PHINode *&CntPhi,
+                            Instruction *&DefX) {
+  BasicBlock *LoopEntry;
+  Value *VarX = nullptr;
+
+  DefX = nullptr;
+  PhiX = nullptr;
+  CntInst = nullptr;
+  CntPhi = nullptr;
+  LoopEntry = *(CurLoop->block_begin());
+
+  // step 1: Check if the loop-back branch is in the desirable form.
+  if (Value *T = matchCondition(
+          dyn_cast<BranchInst>(LoopEntry->getTerminator()), LoopEntry))
+    DefX = dyn_cast<Instruction>(T);
+  else
+    return false;
+
+  // step 2: Detect the instruction corresponding to "x.next = x >> 1".
+  if (!DefX || DefX->getOpcode() != Instruction::AShr)
+    return false;
+  ConstantInt *Shft = dyn_cast<ConstantInt>(DefX->getOperand(1));
+  if (!Shft || !Shft->isOne())
+    return false;
+  VarX = DefX->getOperand(0);
+
+  // step 3: Check the recurrence of variable X.
+  PhiX = dyn_cast<PHINode>(VarX);
+  if (!PhiX || (PhiX->getOperand(0) != DefX && PhiX->getOperand(1) != DefX))
+    return false;
+
+  // step 4: Find the instruction which counts the CTLZ: cnt.next = cnt + 1.
+  //         TODO: We can skip this step. If the loop trip count is known
+  //         (CTLZ), then all uses of "cnt.next" could be optimized to the
+  //         trip count plus "cnt0". Currently it is not optimized.
+  //         This step could also be used to detect the POPCNT instruction:
+  //         cnt.next = cnt + (x.next & 1)
+  for (BasicBlock::iterator Iter = LoopEntry->getFirstNonPHI()->getIterator(),
+                            IterE = LoopEntry->end();
+       Iter != IterE; Iter++) {
+    Instruction *Inst = &*Iter;
+    if (Inst->getOpcode() != Instruction::Add)
+      continue;
+
+    ConstantInt *Inc = dyn_cast<ConstantInt>(Inst->getOperand(1));
+    if (!Inc || !Inc->isOne())
+      continue;
+
+    PHINode *Phi = dyn_cast<PHINode>(Inst->getOperand(0));
+    if (!Phi || Phi->getParent() != LoopEntry)
+      continue;
+
+    CntInst = Inst;
+    CntPhi = Phi;
+    break;
+  }
+  if (!CntInst)
+    return false;
+
+  return true;
+}
+
+/// Recognize a CTLZ idiom in a non-countable loop and convert the loop
+/// to a countable one (with a CTLZ trip count).
+/// Returns true if the CTLZ trip count was inserted; otherwise returns false.
+bool LoopIdiomRecognize::recognizeAndInsertCTLZ() {
+  // Give up if the loop has multiple blocks or multiple backedges.
+  if (CurLoop->getNumBackEdges() != 1 || CurLoop->getNumBlocks() != 1)
+    return false;
+
+  Instruction *CntInst, *DefX;
+  PHINode *CntPhi, *PhiX;
+  if (!detectCTLZIdiom(CurLoop, PhiX, CntInst, CntPhi, DefX))
+    return false;
+
+  bool IsCntPhiUsedOutsideLoop = false;
+  for (User *U : CntPhi->users())
+    if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+      IsCntPhiUsedOutsideLoop = true;
+      break;
+    }
+  bool IsCntInstUsedOutsideLoop = false;
+  for (User *U : CntInst->users())
+    if (!CurLoop->contains(dyn_cast<Instruction>(U))) {
+      IsCntInstUsedOutsideLoop = true;
+      break;
+    }
+  // If both CntInst and CntPhi are used outside the loop the profitability
+  // is questionable.
+  if (IsCntInstUsedOutsideLoop && IsCntPhiUsedOutsideLoop)
+    return false;
+
+  // For some CPUs the result of the CTLZ(X) intrinsic is undefined when X is
+  // 0. If we cannot guarantee X != 0, we need to check for this when we
+  // expand the intrinsic.
+  bool ZeroCheck = false;
+  // It is safe to assume the Preheader exists, as it was checked in the
+  // parent function runOnLoop.
+  BasicBlock *PH = CurLoop->getLoopPreheader();
+  Value *InitX = PhiX->getIncomingValueForBlock(PH);
+  // If we check X != 0 before entering the loop we don't need a zero
+  // check in the CTLZ intrinsic.
+  if (BasicBlock *PreCondBB = PH->getSinglePredecessor())
+    if (BranchInst *PreCondBr =
+            dyn_cast<BranchInst>(PreCondBB->getTerminator())) {
+      if (matchCondition(PreCondBr, PH) == InitX)
+        ZeroCheck = true;
+    }
+
+  // Check if the CTLZ intrinsic is profitable. Assume it is always profitable
+  // if we delete the loop (the remaining loop consists of only these 6
+  // instructions):
+  //  %n.addr.0 = phi [ %n, %entry ], [ %shr, %while.cond ]
+  //  %i.0 = phi [ %i0, %entry ], [ %inc, %while.cond ]
+  //  %shr = ashr %n.addr.0, 1
+  //  %tobool = icmp eq %shr, 0
+  //  %inc = add nsw %i.0, 1
+  //  br i1 %tobool
+
+  IRBuilder<> Builder(PH->getTerminator());
+  SmallVector<Value *, 2> Ops =
+      {InitX, ZeroCheck ? Builder.getTrue() : Builder.getFalse()};
+  ArrayRef<Value *> Args(Ops);
+  if (CurLoop->getHeader()->size() != 6 &&
+      TTI->getIntrinsicCost(Intrinsic::ctlz, InitX->getType(), Args) >
+          TargetTransformInfo::TCC_Basic)
+    return false;
+
+  const DebugLoc DL = DefX->getDebugLoc();
+  transformLoopToCountable(PH, CntInst, CntPhi, InitX, DL, ZeroCheck,
+                           IsCntPhiUsedOutsideLoop);
+  return true;
+}
+
 /// Recognizes a population count idiom in a non-countable loop.
/// /// If detected, transforms the relevant code to issue the popcount intrinsic @@ -1222,6 +1387,134 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val, return CI; } +static CallInst *createCTLZIntrinsic(IRBuilder<> &IRBuilder, Value *Val, + const DebugLoc &DL, bool ZeroCheck) { + Value *Ops[] = {Val, ZeroCheck ? IRBuilder.getTrue() : IRBuilder.getFalse()}; + Type *Tys[] = {Val->getType()}; + + Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent(); + Value *Func = Intrinsic::getDeclaration(M, Intrinsic::ctlz, Tys); + CallInst *CI = IRBuilder.CreateCall(Func, Ops); + CI->setDebugLoc(DL); + + return CI; +} + +/// Transform the following loop: +/// loop: +/// CntPhi = PHI [Cnt0, CntInst] +/// PhiX = PHI [InitX, DefX] +/// CntInst = CntPhi + 1 +/// DefX = PhiX >> 1 +// LOOP_BODY +/// Br: loop if (DefX != 0) +/// Use(CntPhi) or Use(CntInst) +/// +/// Into: +/// If CntPhi used outside the loop: +/// CountPrev = BitWidth(InitX) - CTLZ(InitX >> 1) +/// Count = CountPrev + 1 +/// else +/// Count = BitWidth(InitX) - CTLZ(InitX) +/// loop: +/// CntPhi = PHI [Cnt0, CntInst] +/// PhiX = PHI [InitX, DefX] +/// PhiCount = PHI [Count, Dec] +/// CntInst = CntPhi + 1 +/// DefX = PhiX >> 1 +/// Dec = PhiCount - 1 +/// LOOP_BODY +/// Br: loop if (Dec != 0) +/// Use(CountPrev + Cnt0) // Use(CntPhi) +/// or +/// Use(Count + Cnt0) // Use(CntInst) +/// +/// If LOOP_BODY is empty the loop will be deleted. +/// If CntInst and DefX are not used in LOOP_BODY they will be removed. +void LoopIdiomRecognize::transformLoopToCountable( + BasicBlock *Preheader, Instruction *CntInst, PHINode *CntPhi, Value *InitX, + const DebugLoc DL, bool ZeroCheck, bool IsCntPhiUsedOutsideLoop) { + BranchInst *PreheaderBr = dyn_cast(Preheader->getTerminator()); + + // Step 1: Insert the CTLZ instruction at the end of the preheader block + // Count = BitWidth - CTLZ(InitX); + // If there are uses of CntPhi create: + // CountPrev = BitWidth - CTLZ(InitX >> 1); + IRBuilder<> Builder(PreheaderBr); + Builder.SetCurrentDebugLocation(DL); + Value *CTLZ, *Count, *CountPrev, *NewCount, *InitXNext; + + if (IsCntPhiUsedOutsideLoop) + InitXNext = Builder.CreateAShr(InitX, + ConstantInt::get(InitX->getType(), 1)); + else + InitXNext = InitX; + CTLZ = createCTLZIntrinsic(Builder, InitXNext, DL, ZeroCheck); + Count = Builder.CreateSub( + ConstantInt::get(CTLZ->getType(), + CTLZ->getType()->getIntegerBitWidth()), + CTLZ); + if (IsCntPhiUsedOutsideLoop) { + CountPrev = Count; + Count = Builder.CreateAdd( + CountPrev, + ConstantInt::get(CountPrev->getType(), 1)); + } + if (IsCntPhiUsedOutsideLoop) + NewCount = Builder.CreateZExtOrTrunc(CountPrev, + cast(CntInst->getType())); + else + NewCount = Builder.CreateZExtOrTrunc(Count, + cast(CntInst->getType())); + + // If the CTLZ counter's initial value is not zero, insert Add Inst. + Value *CntInitVal = CntPhi->getIncomingValueForBlock(Preheader); + ConstantInt *InitConst = dyn_cast(CntInitVal); + if (!InitConst || !InitConst->isZero()) + NewCount = Builder.CreateAdd(NewCount, CntInitVal); + + // Step 2: Insert new IV and loop condition: + // loop: + // ... + // PhiCount = PHI [Count, Dec] + // ... + // Dec = PhiCount - 1 + // ... 
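
// For intuition about the rewrite documented above: the matched loop counts
// how many arithmetic right shifts reach zero, which for a positive initial
// value is BitWidth - CTLZ(InitX). A hand-written, source-level sketch of
// the before/after shapes under the pass's assumptions (single-block loop,
// x > 0); the __builtin_clzll call stands in for the ctlz intrinsic:
#include <cassert>
#include <cstdint>

static unsigned originalLoop(uint64_t X, uint64_t &Work) {
  unsigned Cnt = 0;
  do {
    Cnt += 1;         // cnt.next = cnt + 1 (CntInst)
    Work += X;        // stand-in for LOOP_BODY
    X >>= 1;          // x.next = x >> 1 (DefX)
  } while (X != 0);   // non-countable: the exit depends on the shifted value
  return Cnt;
}

static unsigned transformedLoop(uint64_t InitX, uint64_t &Work) {
  // Count = BitWidth(InitX) - CTLZ(InitX), computed once in the preheader.
  unsigned Count = 64 - static_cast<unsigned>(__builtin_clzll(InitX));
  unsigned Cnt = 0;
  uint64_t X = InitX;
  for (unsigned Dec = Count; Dec != 0; --Dec) { // countable: br if (Dec != 0)
    Cnt += 1;
    Work += X;
    X >>= 1;
  }
  return Cnt; // uses outside the loop could take Count + cnt0 directly
}

int main() {
  uint64_t W1 = 0, W2 = 0;
  for (uint64_t X : {1ULL, 7ULL, 1024ULL, 123456789ULL}) {
    assert(originalLoop(X, W1) == transformedLoop(X, W2));
    assert(W1 == W2);
  }
  return 0;
}
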
+ // Br: loop if (Dec != 0) + BasicBlock *Body = *(CurLoop->block_begin()); + auto *LbBr = dyn_cast(Body->getTerminator()); + ICmpInst *LbCond = cast(LbBr->getCondition()); + Type *Ty = Count->getType(); + + PHINode *TcPhi = PHINode::Create(Ty, 2, "tcphi", &Body->front()); + + Builder.SetInsertPoint(LbCond); + Instruction *TcDec = cast( + Builder.CreateSub(TcPhi, ConstantInt::get(Ty, 1), + "tcdec", false, true)); + + TcPhi->addIncoming(Count, Preheader); + TcPhi->addIncoming(TcDec, Body); + + CmpInst::Predicate Pred = + (LbBr->getSuccessor(0) == Body) ? CmpInst::ICMP_NE : CmpInst::ICMP_EQ; + LbCond->setPredicate(Pred); + LbCond->setOperand(0, TcDec); + LbCond->setOperand(1, ConstantInt::get(Ty, 0)); + + // Step 3: All the references to the original counter outside + // the loop are replaced with the NewCount -- the value returned from + // __builtin_ctlz(x). + if (IsCntPhiUsedOutsideLoop) + CntPhi->replaceUsesOutsideBlock(NewCount, Body); + else + CntInst->replaceUsesOutsideBlock(NewCount, Body); + + // step 4: Forget the "non-computable" trip-count SCEV associated with the + // loop. The loop would otherwise not be deleted even if it becomes empty. + SE->forgetLoop(CurLoop); +} + void LoopIdiomRecognize::transformLoopToPopcount(BasicBlock *PreCondBB, Instruction *CntInst, PHINode *CntPhi, Value *Var) { diff --git a/lib/Transforms/Scalar/NewGVN.cpp b/lib/Transforms/Scalar/NewGVN.cpp index 3c9850b156ac..5e0a705782ea 100644 --- a/lib/Transforms/Scalar/NewGVN.cpp +++ b/lib/Transforms/Scalar/NewGVN.cpp @@ -283,7 +283,6 @@ public: // Forward propagation info const Expression *getDefiningExpr() const { return DefiningExpr; } - void setDefiningExpr(const Expression *E) { DefiningExpr = E; } // Value member set bool empty() const { return Members.empty(); } @@ -317,6 +316,9 @@ public: --StoreCount; } + // True if this class has no memory members. + bool definesNoMemory() const { return StoreCount == 0 && memory_empty(); } + // Return true if two congruence classes are equivalent to each other. This // means // that every field but the ID number and the dead field are equivalent. @@ -401,9 +403,12 @@ class NewGVN { MemorySSAWalker *MSSAWalker; const DataLayout &DL; std::unique_ptr PredInfo; - BumpPtrAllocator ExpressionAllocator; - ArrayRecycler ArgRecycler; - TarjanSCC SCCFinder; + + // These are the only two things the create* functions should have + // side-effects on due to allocating memory. + mutable BumpPtrAllocator ExpressionAllocator; + mutable ArrayRecycler ArgRecycler; + mutable TarjanSCC SCCFinder; const SimplifyQuery SQ; // Number of function arguments, used by ranking @@ -430,11 +435,12 @@ class NewGVN { // In order to correctly ensure propagation, we must keep track of what // comparisons we used, so that when the values of the comparisons change, we // propagate the information to the places we used the comparison. - DenseMap> PredicateToUsers; - // Mapping from MemoryAccess we used to the MemoryAccess we used it with. Has + mutable DenseMap> + PredicateToUsers; // the same reasoning as PredicateToUsers. When we skip MemoryAccesses for // stores, we no longer can rely solely on the def-use chains of MemorySSA. - DenseMap> MemoryToUsers; + mutable DenseMap> + MemoryToUsers; // A table storing which memorydefs/phis represent a memory state provably // equivalent to another memory state. 
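
// On the `mutable` members introduced above: NewGVN's create*/performSymbolic*
// helpers are logically const (value numbering queries), yet they must
// allocate expressions and record cache entries as side effects. A small,
// self-contained sketch of the same pattern (hypothetical class, not the
// NewGVN interface):
#include <cassert>
#include <cstdint>
#include <map>

class Analyzer {
  // The cache is an implementation detail, so it is mutable and may be
  // updated from const query functions.
  mutable std::map<uint64_t, uint64_t> ResultCache;

public:
  uint64_t analyze(uint64_t N) const {
    auto It = ResultCache.find(N);
    if (It != ResultCache.end())
      return It->second;       // memoized: no recomputation
    uint64_t R = N * N + 1;    // stand-in for an expensive computation
    ResultCache.emplace(N, R); // legal in a const member: the field is mutable
    return R;
  }
};

int main() {
  const Analyzer A; // even a const object can fill its internal caches
  assert(A.analyze(7) == 50);
  assert(A.analyze(7) == 50); // second call is a cache hit
  return 0;
}
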
@@ -457,7 +463,7 @@ class NewGVN { DenseMap MemoryPhiState; enum PhiCycleState { PCS_Unknown, PCS_CycleFree, PCS_Cycle }; - DenseMap PhiCycleState; + mutable DenseMap PhiCycleState; // Expression to class mapping. using ExpressionClassMap = DenseMap; ExpressionClassMap ExpressionToClass; @@ -511,21 +517,24 @@ public: private: // Expression handling. - const Expression *createExpression(Instruction *); - const Expression *createBinaryExpression(unsigned, Type *, Value *, Value *); - PHIExpression *createPHIExpression(Instruction *, bool &HasBackedge, - bool &AllConstant); - const VariableExpression *createVariableExpression(Value *); - const ConstantExpression *createConstantExpression(Constant *); - const Expression *createVariableOrConstant(Value *V); - const UnknownExpression *createUnknownExpression(Instruction *); + const Expression *createExpression(Instruction *) const; + const Expression *createBinaryExpression(unsigned, Type *, Value *, + Value *) const; + PHIExpression *createPHIExpression(Instruction *, bool &HasBackEdge, + bool &AllConstant) const; + const VariableExpression *createVariableExpression(Value *) const; + const ConstantExpression *createConstantExpression(Constant *) const; + const Expression *createVariableOrConstant(Value *V) const; + const UnknownExpression *createUnknownExpression(Instruction *) const; const StoreExpression *createStoreExpression(StoreInst *, - const MemoryAccess *); + const MemoryAccess *) const; LoadExpression *createLoadExpression(Type *, Value *, LoadInst *, - const MemoryAccess *); - const CallExpression *createCallExpression(CallInst *, const MemoryAccess *); - const AggregateValueExpression *createAggregateValueExpression(Instruction *); - bool setBasicExpressionInfo(Instruction *, BasicExpression *); + const MemoryAccess *) const; + const CallExpression *createCallExpression(CallInst *, + const MemoryAccess *) const; + const AggregateValueExpression * + createAggregateValueExpression(Instruction *) const; + bool setBasicExpressionInfo(Instruction *, BasicExpression *) const; // Congruence class handling. CongruenceClass *createCongruenceClass(Value *Leader, const Expression *E) { @@ -560,17 +569,18 @@ private: // Symbolic evaluation. 
const Expression *checkSimplificationResults(Expression *, Instruction *, - Value *); - const Expression *performSymbolicEvaluation(Value *); + Value *) const; + const Expression *performSymbolicEvaluation(Value *) const; const Expression *performSymbolicLoadCoercion(Type *, Value *, LoadInst *, - Instruction *, MemoryAccess *); - const Expression *performSymbolicLoadEvaluation(Instruction *); - const Expression *performSymbolicStoreEvaluation(Instruction *); - const Expression *performSymbolicCallEvaluation(Instruction *); - const Expression *performSymbolicPHIEvaluation(Instruction *); - const Expression *performSymbolicAggrValueEvaluation(Instruction *); - const Expression *performSymbolicCmpEvaluation(Instruction *); - const Expression *performSymbolicPredicateInfoEvaluation(Instruction *); + Instruction *, + MemoryAccess *) const; + const Expression *performSymbolicLoadEvaluation(Instruction *) const; + const Expression *performSymbolicStoreEvaluation(Instruction *) const; + const Expression *performSymbolicCallEvaluation(Instruction *) const; + const Expression *performSymbolicPHIEvaluation(Instruction *) const; + const Expression *performSymbolicAggrValueEvaluation(Instruction *) const; + const Expression *performSymbolicCmpEvaluation(Instruction *) const; + const Expression *performSymbolicPredicateInfoEvaluation(Instruction *) const; // Congruence finding. bool someEquivalentDominates(const Instruction *, const Instruction *) const; @@ -620,8 +630,8 @@ private: void markPredicateUsersTouched(Instruction *); void markValueLeaderChangeTouched(CongruenceClass *CC); void markMemoryLeaderChangeTouched(CongruenceClass *CC); - void addPredicateUsers(const PredicateBase *, Instruction *); - void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U); + void addPredicateUsers(const PredicateBase *, Instruction *) const; + void addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const; // Main loop of value numbering void iterateTouchedInstructions(); @@ -634,7 +644,7 @@ private: void verifyIterationSettled(Function &F); bool singleReachablePHIPath(const MemoryAccess *, const MemoryAccess *) const; BasicBlock *getBlockForValue(Value *V) const; - void deleteExpression(const Expression *E); + void deleteExpression(const Expression *E) const; unsigned InstrToDFSNum(const Value *V) const { assert(isa(V) && "This should not be used for MemoryAccesses"); return InstrDFS.lookup(V); @@ -654,7 +664,7 @@ private: ? InstrToDFSNum(cast(MA)->getMemoryInst()) : InstrDFS.lookup(MA); } - bool isCycleFree(const PHINode *PN); + bool isCycleFree(const PHINode *PN) const; template T *getMinDFSOfRange(const Range &) const; // Debug counter info. When verifying, we have to reset the value numbering // debug counter to the same state it started in to get the same results. @@ -702,7 +712,7 @@ BasicBlock *NewGVN::getBlockForValue(Value *V) const { // Delete a definitely dead expression, so it can be reused by the expression // allocator. Some of these are not in creation functions, so we have to accept // const versions. 
-void NewGVN::deleteExpression(const Expression *E) { +void NewGVN::deleteExpression(const Expression *E) const { assert(isa(E)); auto *BE = cast(E); const_cast(BE)->deallocateOperands(ArgRecycler); @@ -710,7 +720,7 @@ void NewGVN::deleteExpression(const Expression *E) { } PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, - bool &AllConstant) { + bool &AllConstant) const { BasicBlock *PHIBlock = I->getParent(); auto *PN = cast(I); auto *E = @@ -722,30 +732,46 @@ PHIExpression *NewGVN::createPHIExpression(Instruction *I, bool &HasBackedge, unsigned PHIRPO = RPOOrdering.lookup(DT->getNode(PHIBlock)); + // NewGVN assumes the operands of a PHI node are in a consistent order across + // PHIs. LLVM doesn't seem to always guarantee this. While we need to fix + // this in LLVM at some point we don't want GVN to find wrong congruences. + // Therefore, here we sort uses in predecessor order. + // We're sorting the values by pointer. In theory this might be cause of + // non-determinism, but here we don't rely on the ordering for anything + // significant, e.g. we don't create new instructions based on it so we're + // fine. + SmallVector PHIOperands; + for (const Use &U : PN->operands()) + PHIOperands.push_back(&U); + std::sort(PHIOperands.begin(), PHIOperands.end(), + [&](const Use *U1, const Use *U2) { + return PN->getIncomingBlock(*U1) < PN->getIncomingBlock(*U2); + }); + // Filter out unreachable phi operands. - auto Filtered = make_filter_range(PN->operands(), [&](const Use &U) { - return ReachableEdges.count({PN->getIncomingBlock(U), PHIBlock}); + auto Filtered = make_filter_range(PHIOperands, [&](const Use *U) { + return ReachableEdges.count({PN->getIncomingBlock(*U), PHIBlock}); }); std::transform(Filtered.begin(), Filtered.end(), op_inserter(E), - [&](const Use &U) -> Value * { - auto *BB = PN->getIncomingBlock(U); + [&](const Use *U) -> Value * { + auto *BB = PN->getIncomingBlock(*U); auto *DTN = DT->getNode(BB); if (RPOOrdering.lookup(DTN) >= PHIRPO) HasBackedge = true; - AllConstant &= isa(U) || isa(U); + AllConstant &= isa(*U) || isa(*U); // Don't try to transform self-defined phis. - if (U == PN) + if (*U == PN) return PN; - return lookupOperandLeader(U); + return lookupOperandLeader(*U); }); return E; } // Set basic expression info (Arguments, type, opcode) for Expression // E from Instruction I in block B. -bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) { +bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) const { bool AllConstant = true; if (auto *GEP = dyn_cast(I)) E->setType(GEP->getSourceElementType()); @@ -766,7 +792,8 @@ bool NewGVN::setBasicExpressionInfo(Instruction *I, BasicExpression *E) { } const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, - Value *Arg1, Value *Arg2) { + Value *Arg1, + Value *Arg2) const { auto *E = new (ExpressionAllocator) BasicExpression(2); E->setType(T); @@ -795,7 +822,8 @@ const Expression *NewGVN::createBinaryExpression(unsigned Opcode, Type *T, // TODO: Once finished, this should not take an Instruction, we only // use it for printing. 
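
// The createPHIExpression hunk above sorts a PHI's operands by incoming
// block before they are hashed, so two PHIs that differ only in operand
// order are still found congruent. A standalone sketch of that
// canonicalization (toy types; the real code sorts llvm::Use pointers by
// predecessor):
#include <algorithm>
#include <cassert>
#include <utility>
#include <vector>

using PhiOperand = std::pair<int /*BlockId*/, int /*ValueId*/>;

static std::vector<PhiOperand> canonicalize(std::vector<PhiOperand> Ops) {
  // Sort by predecessor block, mirroring the std::sort on incoming blocks.
  // The sort key must not depend on operand position, or congruences are
  // missed.
  std::sort(Ops.begin(), Ops.end(),
            [](const PhiOperand &A, const PhiOperand &B) {
              return A.first < B.first;
            });
  return Ops;
}

int main() {
  // The same PHI written with its incoming edges in two different orders.
  std::vector<PhiOperand> P1 = {{1, 42}, {2, 7}};
  std::vector<PhiOperand> P2 = {{2, 7}, {1, 42}};
  assert(canonicalize(P1) == canonicalize(P2)); // now compare/hash equal
  return 0;
}
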
const Expression *NewGVN::checkSimplificationResults(Expression *E, - Instruction *I, Value *V) { + Instruction *I, + Value *V) const { if (!V) return nullptr; if (auto *C = dyn_cast(V)) { @@ -827,7 +855,7 @@ const Expression *NewGVN::checkSimplificationResults(Expression *E, return nullptr; } -const Expression *NewGVN::createExpression(Instruction *I) { +const Expression *NewGVN::createExpression(Instruction *I) const { auto *E = new (ExpressionAllocator) BasicExpression(I->getNumOperands()); bool AllConstant = setBasicExpressionInfo(I, E); @@ -913,7 +941,7 @@ const Expression *NewGVN::createExpression(Instruction *I) { } const AggregateValueExpression * -NewGVN::createAggregateValueExpression(Instruction *I) { +NewGVN::createAggregateValueExpression(Instruction *I) const { if (auto *II = dyn_cast(I)) { auto *E = new (ExpressionAllocator) AggregateValueExpression(I->getNumOperands(), II->getNumIndices()); @@ -932,32 +960,32 @@ NewGVN::createAggregateValueExpression(Instruction *I) { llvm_unreachable("Unhandled type of aggregate value operation"); } -const VariableExpression *NewGVN::createVariableExpression(Value *V) { +const VariableExpression *NewGVN::createVariableExpression(Value *V) const { auto *E = new (ExpressionAllocator) VariableExpression(V); E->setOpcode(V->getValueID()); return E; } -const Expression *NewGVN::createVariableOrConstant(Value *V) { +const Expression *NewGVN::createVariableOrConstant(Value *V) const { if (auto *C = dyn_cast(V)) return createConstantExpression(C); return createVariableExpression(V); } -const ConstantExpression *NewGVN::createConstantExpression(Constant *C) { +const ConstantExpression *NewGVN::createConstantExpression(Constant *C) const { auto *E = new (ExpressionAllocator) ConstantExpression(C); E->setOpcode(C->getValueID()); return E; } -const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) { +const UnknownExpression *NewGVN::createUnknownExpression(Instruction *I) const { auto *E = new (ExpressionAllocator) UnknownExpression(I); E->setOpcode(I->getOpcode()); return E; } -const CallExpression *NewGVN::createCallExpression(CallInst *CI, - const MemoryAccess *MA) { +const CallExpression * +NewGVN::createCallExpression(CallInst *CI, const MemoryAccess *MA) const { // FIXME: Add operand bundles for calls. 
auto *E = new (ExpressionAllocator) CallExpression(CI->getNumOperands(), CI, MA); @@ -1017,9 +1045,8 @@ Value *NewGVN::lookupOperandLeader(Value *V) const { const MemoryAccess *NewGVN::lookupMemoryLeader(const MemoryAccess *MA) const { auto *CC = getMemoryClass(MA); assert(CC->getMemoryLeader() && - "Every MemoryAccess should be mapped to a " - "congruence class with a represenative memory " - "access"); + "Every MemoryAccess should be mapped to a congruence class with a " + "representative memory access"); return CC->getMemoryLeader(); } @@ -1032,7 +1059,7 @@ bool NewGVN::isMemoryAccessTop(const MemoryAccess *MA) const { LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp, LoadInst *LI, - const MemoryAccess *MA) { + const MemoryAccess *MA) const { auto *E = new (ExpressionAllocator) LoadExpression(1, LI, lookupMemoryLeader(MA)); E->allocateOperands(ArgRecycler, ExpressionAllocator); @@ -1050,8 +1077,8 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp, return E; } -const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI, - const MemoryAccess *MA) { +const StoreExpression * +NewGVN::createStoreExpression(StoreInst *SI, const MemoryAccess *MA) const { auto *StoredValueLeader = lookupOperandLeader(SI->getValueOperand()); auto *E = new (ExpressionAllocator) StoreExpression(SI->getNumOperands(), SI, StoredValueLeader, MA); @@ -1068,7 +1095,7 @@ const StoreExpression *NewGVN::createStoreExpression(StoreInst *SI, return E; } -const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) const { // Unlike loads, we never try to eliminate stores, so we do not check if they // are simple and avoid value numbering them. auto *SI = cast(I); @@ -1126,7 +1153,7 @@ const Expression *NewGVN::performSymbolicStoreEvaluation(Instruction *I) { const Expression * NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, LoadInst *LI, Instruction *DepInst, - MemoryAccess *DefiningAccess) { + MemoryAccess *DefiningAccess) const { assert((!LI || LI->isSimple()) && "Not a simple load"); if (auto *DepSI = dyn_cast(DepInst)) { // Can't forward from non-atomic to atomic without violating memory model. @@ -1201,7 +1228,7 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr, return nullptr; } -const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) const { auto *LI = cast(I); // We can eliminate in favor of non-simple loads, but we won't be able to @@ -1239,7 +1266,7 @@ const Expression *NewGVN::performSymbolicLoadEvaluation(Instruction *I) { } const Expression * -NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) { +NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) const { auto *PI = PredInfo->getPredicateInfoFor(I); if (!PI) return nullptr; @@ -1284,7 +1311,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) { return nullptr; if (CopyOf != Cmp->getOperand(0) && CopyOf != Cmp->getOperand(1)) { - DEBUG(dbgs() << "Copy is not of any condition operands!"); + DEBUG(dbgs() << "Copy is not of any condition operands!\n"); return nullptr; } Value *FirstOp = lookupOperandLeader(Cmp->getOperand(0)); @@ -1329,7 +1356,7 @@ NewGVN::performSymbolicPredicateInfoEvaluation(Instruction *I) { } // Evaluate read only and pure calls, and create an expression result. 
-const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicCallEvaluation(Instruction *I) const { auto *CI = cast(I); if (auto *II = dyn_cast(I)) { // Instrinsics with the returned attribute are copies of arguments. @@ -1366,8 +1393,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From, DEBUG(dbgs() << "Setting " << *From); DEBUG(dbgs() << " equivalent to congruence class "); DEBUG(dbgs() << NewClass->getID() << " with current MemoryAccess leader "); - DEBUG(dbgs() << *NewClass->getMemoryLeader()); - DEBUG(dbgs() << "\n"); + DEBUG(dbgs() << *NewClass->getMemoryLeader() << "\n"); auto LookupResult = MemoryAccessToClass.find(From); bool Changed = false; @@ -1381,7 +1407,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From, NewClass->memory_insert(MP); // This may have killed the class if it had no non-memory members if (OldClass->getMemoryLeader() == From) { - if (OldClass->memory_empty()) { + if (OldClass->definesNoMemory()) { OldClass->setMemoryLeader(nullptr); } else { OldClass->setMemoryLeader(getNextMemoryLeader(OldClass)); @@ -1406,7 +1432,7 @@ bool NewGVN::setMemoryClass(const MemoryAccess *From, // Determine if a phi is cycle-free. That means the values in the phi don't // depend on any expressions that can change value as a result of the phi. // For example, a non-cycle free phi would be v = phi(0, v+1). -bool NewGVN::isCycleFree(const PHINode *PN) { +bool NewGVN::isCycleFree(const PHINode *PN) const { // In order to compute cycle-freeness, we do SCC finding on the phi, and see // what kind of SCC it ends up in. If it is a singleton, it is cycle-free. // If it is not in a singleton, it is only cycle free if the other members are @@ -1436,7 +1462,7 @@ bool NewGVN::isCycleFree(const PHINode *PN) { } // Evaluate PHI nodes symbolically, and create an expression result. -const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) const { // True if one of the incoming phi edges is a backedge. bool HasBackedge = false; // All constant tracks the state of whether all the *original* phi operands @@ -1510,7 +1536,8 @@ const Expression *NewGVN::performSymbolicPHIEvaluation(Instruction *I) { return E; } -const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) { +const Expression * +NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) const { if (auto *EI = dyn_cast(I)) { auto *II = dyn_cast(EI->getAggregateOperand()); if (II && EI->getNumIndices() == 1 && *EI->idx_begin() == 0) { @@ -1548,7 +1575,7 @@ const Expression *NewGVN::performSymbolicAggrValueEvaluation(Instruction *I) { return createAggregateValueExpression(I); } -const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) { +const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) const { auto *CI = dyn_cast(I); // See if our operands are equal to those of a previous predicate, and if so, // if it implies true or false. @@ -1663,7 +1690,7 @@ const Expression *NewGVN::performSymbolicCmpEvaluation(Instruction *I) { } // Substitute and symbolize the value before value numbering. 
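
// isCycleFree above asks whether a phi's value can change as a result of
// the phi itself: v = phi(0, v + 1) is cyclic, while a phi over
// phi-independent values is not. A toy sketch of the underlying
// reachability question on a def-use graph; the real pass answers it with
// SCC finding rather than this plain DFS:
#include <cassert>
#include <vector>

// Node I's operands are indices into the same array.
static bool dependsOnSelf(const std::vector<std::vector<int>> &Operands,
                          int Phi) {
  std::vector<bool> Seen(Operands.size(), false);
  std::vector<int> Work(Operands[Phi].begin(), Operands[Phi].end());
  while (!Work.empty()) {
    int N = Work.back();
    Work.pop_back();
    if (N == Phi)
      return true; // the phi feeds its own operands: a value cycle
    if (Seen[N])
      continue;
    Seen[N] = true;
    Work.insert(Work.end(), Operands[N].begin(), Operands[N].end());
  }
  return false;
}

int main() {
  // Node 0: v = phi(n1, n2); node 1: the constant 0; node 2: v + 1.
  std::vector<std::vector<int>> G = {{1, 2}, {}, {0}};
  assert(dependsOnSelf(G, 0));  // v = phi(0, v + 1) is not cycle-free
  G[2] = {1};                   // make the add independent of the phi
  assert(!dependsOnSelf(G, 0)); // now the phi is cycle-free
  return 0;
}
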
-const Expression *NewGVN::performSymbolicEvaluation(Value *V) { +const Expression *NewGVN::performSymbolicEvaluation(Value *V) const { const Expression *E = nullptr; if (auto *C = dyn_cast(V)) E = createConstantExpression(C); @@ -1749,7 +1776,7 @@ void NewGVN::markUsersTouched(Value *V) { } } -void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) { +void NewGVN::addMemoryUsers(const MemoryAccess *To, MemoryAccess *U) const { DEBUG(dbgs() << "Adding memory user " << *U << " to " << *To << "\n"); MemoryToUsers[To].insert(U); } @@ -1772,7 +1799,7 @@ void NewGVN::markMemoryUsersTouched(const MemoryAccess *MA) { } // Add I to the set of users of a given predicate. -void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) { +void NewGVN::addPredicateUsers(const PredicateBase *PB, Instruction *I) const { if (auto *PBranch = dyn_cast(PB)) PredicateToUsers[PBranch->Condition].insert(I); else if (auto *PAssume = dyn_cast(PB)) @@ -1825,8 +1852,7 @@ const MemoryAccess *NewGVN::getNextMemoryLeader(CongruenceClass *CC) const { // TODO: If this ends up to slow, we can maintain a next memory leader like we // do for regular leaders. // Make sure there will be a leader to find - assert((CC->getStoreCount() > 0 || !CC->memory_empty()) && - "Can't get next leader if there is none"); + assert(!CC->definesNoMemory() && "Can't get next leader if there is none"); if (CC->getStoreCount() > 0) { if (auto *NL = dyn_cast_or_null(CC->getNextLeader().first)) return MSSA->getMemoryAccess(NL); @@ -1898,7 +1924,7 @@ void NewGVN::moveMemoryToNewCongruenceClass(Instruction *I, setMemoryClass(InstMA, NewClass); // Now, fixup the old class if necessary if (OldClass->getMemoryLeader() == InstMA) { - if (OldClass->getStoreCount() != 0 || !OldClass->memory_empty()) { + if (!OldClass->definesNoMemory()) { OldClass->setMemoryLeader(getNextMemoryLeader(OldClass)); DEBUG(dbgs() << "Memory class leader change for class " << OldClass->getID() << " to " @@ -1956,10 +1982,9 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E, if (NewClass->getStoreCount() == 0 && !NewClass->getStoredValue()) { // If it's a store expression we are using, it means we are not equivalent // to something earlier. - if (isa(E)) { - assert(lookupOperandLeader(SI->getValueOperand()) != - NewClass->getLeader()); - NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand())); + if (auto *SE = dyn_cast(E)) { + assert(SE->getStoredValue() != NewClass->getLeader()); + NewClass->setStoredValue(SE->getStoredValue()); markValueLeaderChangeTouched(NewClass); // Shift the new class leader to be the store DEBUG(dbgs() << "Changing leader of congruence class " @@ -1985,7 +2010,7 @@ void NewGVN::moveValueToNewCongruenceClass(Instruction *I, const Expression *E, // See if we destroyed the class or need to swap leaders. 
if (OldClass->empty() && OldClass != TOPClass) { if (OldClass->getDefiningExpr()) { - DEBUG(dbgs() << "Erasing expression " << OldClass->getDefiningExpr() + DEBUG(dbgs() << "Erasing expression " << *OldClass->getDefiningExpr() << " from table\n"); ExpressionToClass.erase(OldClass->getDefiningExpr()); } @@ -2064,7 +2089,7 @@ void NewGVN::performCongruenceFinding(Instruction *I, const Expression *E) { } else if (const auto *SE = dyn_cast(E)) { StoreInst *SI = SE->getStoreInst(); NewClass->setLeader(SI); - NewClass->setStoredValue(lookupOperandLeader(SI->getValueOperand())); + NewClass->setStoredValue(SE->getStoredValue()); // The RepMemoryAccess field will be filled in properly by the // moveValueToNewCongruenceClass call. } else { @@ -2523,6 +2548,19 @@ void NewGVN::verifyMemoryCongruency() const { return false; if (auto *MemDef = dyn_cast(Pair.first)) return !isInstructionTriviallyDead(MemDef->getMemoryInst()); + + // We could have phi nodes which operands are all trivially dead, + // so we don't process them. + if (auto *MemPHI = dyn_cast(Pair.first)) { + for (auto &U : MemPHI->incoming_values()) { + if (Instruction *I = dyn_cast(U.get())) { + if (!isInstructionTriviallyDead(I)) + return true; + } + } + return false; + } + return true; }; diff --git a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index fb1b47c48276..4f608c97147d 100644 --- a/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -55,7 +55,7 @@ static void replaceLoopUsesWithConstant(Loop &L, Value &LIC, /// Update the dominator tree after removing one exiting predecessor of a loop /// exit block. static void updateLoopExitIDom(BasicBlock *LoopExitBB, Loop &L, - DominatorTree &DT) { + DominatorTree &DT) { assert(pred_begin(LoopExitBB) != pred_end(LoopExitBB) && "Cannot have empty predecessors of the loop exit block if we split " "off a block to unswitch!"); @@ -137,6 +137,98 @@ static void updateDTAfterUnswitch(BasicBlock *UnswitchedBB, BasicBlock *OldPH, } } +/// Check that all the LCSSA PHI nodes in the loop exit block have trivial +/// incoming values along this edge. +static bool areLoopExitPHIsLoopInvariant(Loop &L, BasicBlock &ExitingBB, + BasicBlock &ExitBB) { + for (Instruction &I : ExitBB) { + auto *PN = dyn_cast(&I); + if (!PN) + // No more PHIs to check. + return true; + + // If the incoming value for this edge isn't loop invariant the unswitch + // won't be trivial. + if (!L.isLoopInvariant(PN->getIncomingValueForBlock(&ExitingBB))) + return false; + } + llvm_unreachable("Basic blocks should never be empty!"); +} + +/// Rewrite the PHI nodes in an unswitched loop exit basic block. +/// +/// Requires that the loop exit and unswitched basic block are the same, and +/// that the exiting block was a unique predecessor of that block. Rewrites the +/// PHI nodes in that block such that what were LCSSA PHI nodes become trivial +/// PHI nodes from the old preheader that now contains the unswitched +/// terminator. +static void rewritePHINodesForUnswitchedExitBlock(BasicBlock &UnswitchedBB, + BasicBlock &OldExitingBB, + BasicBlock &OldPH) { + for (Instruction &I : UnswitchedBB) { + auto *PN = dyn_cast(&I); + if (!PN) + // No more PHIs to check. + break; + + // When the loop exit is directly unswitched we just need to update the + // incoming basic block. We loop to handle weird cases with repeated + // incoming blocks, but expect to typically only have one operand here. 
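
// Trivial unswitching, as gated by areLoopExitPHIsLoopInvariant above,
// hoists a loop-invariant exit test out of the loop entirely. A
// hand-written, source-level illustration of the effect (the pass performs
// this on IR and additionally rewrites the LCSSA PHIs in the exit block):
#include <cassert>

static int beforeUnswitch(bool C, int N) {
  int Sum = 0;
  for (int I = 0; I < N; ++I) {
    if (C)        // loop-invariant condition, tested every iteration
      return Sum; // loop exit
    Sum += I;
  }
  return Sum;
}

static int afterUnswitch(bool C, int N) {
  int Sum = 0;
  if (C)          // tested once, in what used to be the preheader
    return Sum;
  for (int I = 0; I < N; ++I)
    Sum += I;     // the loop body no longer branches on C
  return Sum;
}

int main() {
  for (bool C : {false, true})
    for (int N : {0, 1, 5})
      assert(beforeUnswitch(C, N) == afterUnswitch(C, N));
  return 0;
}
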
+ for (auto i : llvm::seq(0, PN->getNumOperands())) { + assert(PN->getIncomingBlock(i) == &OldExitingBB && + "Found incoming block different from unique predecessor!"); + PN->setIncomingBlock(i, &OldPH); + } + } +} + +/// Rewrite the PHI nodes in the loop exit basic block and the split off +/// unswitched block. +/// +/// Because the exit block remains an exit from the loop, this rewrites the +/// LCSSA PHI nodes in it to remove the unswitched edge and introduces PHI +/// nodes into the unswitched basic block to select between the value in the +/// old preheader and the loop exit. +static void rewritePHINodesForExitAndUnswitchedBlocks(BasicBlock &ExitBB, + BasicBlock &UnswitchedBB, + BasicBlock &OldExitingBB, + BasicBlock &OldPH) { + assert(&ExitBB != &UnswitchedBB && + "Must have different loop exit and unswitched blocks!"); + Instruction *InsertPt = &*UnswitchedBB.begin(); + for (Instruction &I : ExitBB) { + auto *PN = dyn_cast(&I); + if (!PN) + // No more PHIs to check. + break; + + auto *NewPN = PHINode::Create(PN->getType(), /*NumReservedValues*/ 2, + PN->getName() + ".split", InsertPt); + + // Walk backwards over the old PHI node's inputs to minimize the cost of + // removing each one. We have to do this weird loop manually so that we + // create the same number of new incoming edges in the new PHI as we expect + // each case-based edge to be included in the unswitched switch in some + // cases. + // FIXME: This is really, really gross. It would be much cleaner if LLVM + // allowed us to create a single entry for a predecessor block without + // having separate entries for each "edge" even though these edges are + // required to produce identical results. + for (int i = PN->getNumIncomingValues() - 1; i >= 0; --i) { + if (PN->getIncomingBlock(i) != &OldExitingBB) + continue; + + Value *Incoming = PN->removeIncomingValue(i); + NewPN->addIncoming(Incoming, &OldPH); + } + + // Now replace the old PHI with the new one and wire the old one in as an + // input to the new one. + PN->replaceAllUsesWith(NewPN); + NewPN->addIncoming(PN, &ExitBB); + } +} + /// Unswitch a trivial branch if the condition is loop invariant. /// /// This routine should only be called when loop code leading to the branch has @@ -187,10 +279,8 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, assert(L.contains(ContinueBB) && "Cannot have both successors exit and still be in the loop!"); - // If the loop exit block contains phi nodes, this isn't trivial. - // FIXME: We should examine the PHI to determine whether or not we can handle - // it trivially. - if (isa(LoopExitBB->begin())) + auto *ParentBB = BI.getParent(); + if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, *LoopExitBB)) return false; DEBUG(dbgs() << " unswitching trivial branch when: " << CondVal @@ -209,14 +299,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, BasicBlock *UnswitchedBB; if (BasicBlock *PredBB = LoopExitBB->getUniquePredecessor()) { (void)PredBB; - assert(PredBB == BI.getParent() && "A branch's parent is't a predecessor!"); + assert(PredBB == BI.getParent() && + "A branch's parent isn't a predecessor!"); UnswitchedBB = LoopExitBB; } else { UnswitchedBB = SplitBlock(LoopExitBB, &LoopExitBB->front(), &DT, &LI); } - BasicBlock *ParentBB = BI.getParent(); - // Now splice the branch to gate reaching the new preheader and re-point its // successors. 
OldPH->getInstList().splice(std::prev(OldPH->end()), @@ -229,6 +318,13 @@ static bool unswitchTrivialBranch(Loop &L, BranchInst &BI, DominatorTree &DT, // terminator. BranchInst::Create(ContinueBB, ParentBB); + // Rewrite the relevant PHI nodes. + if (UnswitchedBB == LoopExitBB) + rewritePHINodesForUnswitchedExitBlock(*UnswitchedBB, *ParentBB, *OldPH); + else + rewritePHINodesForExitAndUnswitchedBlocks(*LoopExitBB, *UnswitchedBB, + *ParentBB, *OldPH); + // Now we need to update the dominator tree. updateDTAfterUnswitch(UnswitchedBB, OldPH, DT); // But if we split something off of the loop exit block then we also removed @@ -278,6 +374,8 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, if (!L.isLoopInvariant(LoopCond)) return false; + auto *ParentBB = SI.getParent(); + // FIXME: We should compute this once at the start and update it! SmallVector ExitBlocks; L.getExitBlocks(ExitBlocks); @@ -287,12 +385,13 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, SmallVector ExitCaseIndices; for (auto Case : SI.cases()) { auto *SuccBB = Case.getCaseSuccessor(); - if (ExitBlockSet.count(SuccBB) && !isa(SuccBB->begin())) + if (ExitBlockSet.count(SuccBB) && + areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB)) ExitCaseIndices.push_back(Case.getCaseIndex()); } BasicBlock *DefaultExitBB = nullptr; if (ExitBlockSet.count(SI.getDefaultDest()) && - !isa(SI.getDefaultDest()->begin()) && + areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) && !isa(SI.getDefaultDest()->getTerminator())) DefaultExitBB = SI.getDefaultDest(); else if (ExitCaseIndices.empty()) @@ -330,7 +429,6 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, if (CommonSuccBB) { SI.setDefaultDest(CommonSuccBB); } else { - BasicBlock *ParentBB = SI.getParent(); BasicBlock *UnreachableBB = BasicBlock::Create( ParentBB->getContext(), Twine(ParentBB->getName()) + ".unreachable_default", @@ -358,30 +456,44 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, // Now add the unswitched switch. auto *NewSI = SwitchInst::Create(LoopCond, NewPH, ExitCases.size(), OldPH); - // Split any exit blocks with remaining in-loop predecessors. We walk in - // reverse so that we split in the same order as the cases appeared. This is - // purely for convenience of reading the resulting IR, but it doesn't cost - // anything really. + // Rewrite the IR for the unswitched basic blocks. This requires two steps. + // First, we split any exit blocks with remaining in-loop predecessors. Then + // we update the PHIs in one of two ways depending on if there was a split. + // We walk in reverse so that we split in the same order as the cases + // appeared. This is purely for convenience of reading the resulting IR, but + // it doesn't cost anything really. + SmallPtrSet UnswitchedExitBBs; SmallDenseMap SplitExitBBMap; // Handle the default exit if necessary. // FIXME: It'd be great if we could merge this with the loop below but LLVM's // ranges aren't quite powerful enough yet. 
- if (DefaultExitBB && !pred_empty(DefaultExitBB)) { - auto *SplitBB = - SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI); - updateLoopExitIDom(DefaultExitBB, L, DT); - DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; + if (DefaultExitBB) { + if (pred_empty(DefaultExitBB)) { + UnswitchedExitBBs.insert(DefaultExitBB); + rewritePHINodesForUnswitchedExitBlock(*DefaultExitBB, *ParentBB, *OldPH); + } else { + auto *SplitBB = + SplitBlock(DefaultExitBB, &DefaultExitBB->front(), &DT, &LI); + rewritePHINodesForExitAndUnswitchedBlocks(*DefaultExitBB, *SplitBB, + *ParentBB, *OldPH); + updateLoopExitIDom(DefaultExitBB, L, DT); + DefaultExitBB = SplitExitBBMap[DefaultExitBB] = SplitBB; + } } // Note that we must use a reference in the for loop so that we update the // container. for (auto &CasePair : reverse(ExitCases)) { // Grab a reference to the exit block in the pair so that we can update it. - BasicBlock *&ExitBB = CasePair.second; + BasicBlock *ExitBB = CasePair.second; // If this case is the last edge into the exit block, we can simply reuse it // as it will no longer be a loop exit. No mapping necessary. - if (pred_empty(ExitBB)) + if (pred_empty(ExitBB)) { + // Only rewrite once. + if (UnswitchedExitBBs.insert(ExitBB).second) + rewritePHINodesForUnswitchedExitBlock(*ExitBB, *ParentBB, *OldPH); continue; + } // Otherwise we need to split the exit block so that we retain an exit // block from the loop and a target for the unswitched condition. @@ -389,9 +501,12 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT, if (!SplitExitBB) { // If this is the first time we see this, do the split and remember it. SplitExitBB = SplitBlock(ExitBB, &ExitBB->front(), &DT, &LI); + rewritePHINodesForExitAndUnswitchedBlocks(*ExitBB, *SplitExitBB, + *ParentBB, *OldPH); updateLoopExitIDom(ExitBB, L, DT); } - ExitBB = SplitExitBB; + // Update the case pair to point to the split block. + CasePair.second = SplitExitBB; } // Now add the unswitched cases. We do this in reverse order as we built them diff --git a/lib/Transforms/Scalar/SpeculativeExecution.cpp b/lib/Transforms/Scalar/SpeculativeExecution.cpp index a0fc966cee2c..a7c308b59877 100644 --- a/lib/Transforms/Scalar/SpeculativeExecution.cpp +++ b/lib/Transforms/Scalar/SpeculativeExecution.cpp @@ -208,6 +208,47 @@ bool SpeculativeExecutionPass::runOnBasicBlock(BasicBlock &B) { return false; } +static unsigned ComputeSpeculationCost(const Instruction *I, + const TargetTransformInfo &TTI) { + switch (Operator::getOpcode(I)) { + case Instruction::GetElementPtr: + case Instruction::Add: + case Instruction::Mul: + case Instruction::And: + case Instruction::Or: + case Instruction::Select: + case Instruction::Shl: + case Instruction::Sub: + case Instruction::LShr: + case Instruction::AShr: + case Instruction::Xor: + case Instruction::ZExt: + case Instruction::SExt: + case Instruction::Call: + case Instruction::BitCast: + case Instruction::PtrToInt: + case Instruction::IntToPtr: + case Instruction::AddrSpaceCast: + case Instruction::FPToUI: + case Instruction::FPToSI: + case Instruction::UIToFP: + case Instruction::SIToFP: + case Instruction::FPExt: + case Instruction::FPTrunc: + case Instruction::FAdd: + case Instruction::FSub: + case Instruction::FMul: + case Instruction::FDiv: + case Instruction::FRem: + case Instruction::ICmp: + case Instruction::FCmp: + return TTI.getUserCost(I); + + default: + return UINT_MAX; // Disallow anything not whitelisted. 
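
// ComputeSpeculationCost above whitelists the opcodes that are reasonable
// candidates for speculative hoisting and prices everything else at
// UINT_MAX. The same gate, reduced to a standalone sketch (toy opcode enum
// and unit costs; the real code asks TTI.getUserCost for listed opcodes):
#include <cassert>
#include <climits>

enum class Opcode { Add, Mul, Select, Call, Unknown };

static unsigned speculationCost(Opcode Op) {
  switch (Op) {
  case Opcode::Add:
  case Opcode::Mul:
  case Opcode::Select:
  case Opcode::Call:
    return 1;        // whitelisted: use the (stand-in) target cost
  default:
    return UINT_MAX; // disallow anything not whitelisted
  }
}

// Hoisting is only considered while the accumulated cost stays bounded.
static bool mayHoist(Opcode Op, unsigned Budget) {
  unsigned Cost = speculationCost(Op);
  return Cost != UINT_MAX && Cost <= Budget;
}

int main() {
  assert(mayHoist(Opcode::Add, 4));
  assert(!mayHoist(Opcode::Unknown, 4)); // never speculated
  return 0;
}
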
+ } +} + bool SpeculativeExecutionPass::considerHoistingFromTo( BasicBlock &FromBlock, BasicBlock &ToBlock) { SmallSet NotHoisted; @@ -223,7 +264,7 @@ bool SpeculativeExecutionPass::considerHoistingFromTo( unsigned TotalSpeculationCost = 0; for (auto& I : FromBlock) { - const unsigned Cost = TTI->getUserCost(&I); + const unsigned Cost = ComputeSpeculationCost(&I, *TTI); if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) && AllPrecedingUsesFromBlockHoisted(&I)) { TotalSpeculationCost += Cost; diff --git a/lib/Transforms/Utils/BypassSlowDivision.cpp b/lib/Transforms/Utils/BypassSlowDivision.cpp index 7ffdad597a9b..83ec7f55d1af 100644 --- a/lib/Transforms/Utils/BypassSlowDivision.cpp +++ b/lib/Transforms/Utils/BypassSlowDivision.cpp @@ -261,10 +261,10 @@ ValueRange FastDivInsertionTask::getValueRange(Value *V, computeKnownBits(V, Known, DL); - if (Known.Zero.countLeadingOnes() >= HiBits) + if (Known.countMinLeadingZeros() >= HiBits) return VALRNG_KNOWN_SHORT; - if (Known.One.countLeadingZeros() < HiBits) + if (Known.countMaxLeadingZeros() < HiBits) return VALRNG_LIKELY_LONG; // Long integer divisions are often used in hashtable implementations. It's diff --git a/lib/Transforms/Utils/CloneFunction.cpp b/lib/Transforms/Utils/CloneFunction.cpp index d5124ac89016..4aa26fd14fee 100644 --- a/lib/Transforms/Utils/CloneFunction.cpp +++ b/lib/Transforms/Utils/CloneFunction.cpp @@ -41,6 +41,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap, const Twine &NameSuffix, Function *F, ClonedCodeInfo *CodeInfo) { + DenseMap Cache; BasicBlock *NewBB = BasicBlock::Create(BB->getContext(), "", F); if (BB->hasName()) NewBB->setName(BB->getName()+NameSuffix); @@ -50,6 +51,9 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, for (BasicBlock::const_iterator II = BB->begin(), IE = BB->end(); II != IE; ++II) { Instruction *NewInst = II->clone(); + if (F && F->getSubprogram()) + DebugLoc::reparentDebugInfo(*NewInst, BB->getParent()->getSubprogram(), + F->getSubprogram(), Cache); if (II->hasName()) NewInst->setName(II->getName()+NameSuffix); NewBB->getInstList().push_back(NewInst); @@ -120,12 +124,28 @@ void llvm::CloneFunctionInto(Function *NewFunc, const Function *OldFunc, SmallVector, 1> MDs; OldFunc->getAllMetadata(MDs); - for (auto MD : MDs) - NewFunc->addMetadata( - MD.first, - *MapMetadata(MD.second, VMap, - ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, - TypeMapper, Materializer)); + for (auto MD : MDs) { + MDNode *NewMD; + bool MustCloneSP = + (MD.first == LLVMContext::MD_dbg && OldFunc->getParent() && + OldFunc->getParent() == NewFunc->getParent()); + if (MustCloneSP) { + auto *SP = cast(MD.second); + NewMD = DISubprogram::getDistinct( + NewFunc->getContext(), SP->getScope(), SP->getName(), + NewFunc->getName(), SP->getFile(), SP->getLine(), SP->getType(), + SP->isLocalToUnit(), SP->isDefinition(), SP->getScopeLine(), + SP->getContainingType(), SP->getVirtuality(), SP->getVirtualIndex(), + SP->getThisAdjustment(), SP->getFlags(), SP->isOptimized(), + SP->getUnit(), SP->getTemplateParams(), SP->getDeclaration(), + SP->getVariables(), SP->getThrownTypes()); + } else + NewMD = + MapMetadata(MD.second, VMap, + ModuleLevelChanges ? RF_None : RF_NoModuleLevelChanges, + TypeMapper, Materializer); + NewFunc->addMetadata(MD.first, *NewMD); + } // Loop over all of the basic blocks in the function, cloning them as // appropriate. 
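
// The BypassSlowDivision hunk above replaces direct reads of Known.Zero and
// Known.One with the countMinLeadingZeros/countMaxLeadingZeros helpers,
// which wrap the same two masks. A self-contained sketch of what the
// helpers mean (toy struct mirroring, not reproducing, llvm::KnownBits):
#include <cassert>
#include <cstdint>

struct ToyKnownBits {
  uint32_t Zero = 0; // bits known to be 0
  uint32_t One = 0;  // bits known to be 1

  // Minimum leading zeros of the value == leading ones in the Zero mask.
  // ("| 1" conservatively guards the __builtin_clz(0) undefined case.)
  unsigned countMinLeadingZeros() const {
    return static_cast<unsigned>(__builtin_clz(~Zero | 1));
  }
  // Maximum leading zeros of the value == leading zeros in the One mask.
  unsigned countMaxLeadingZeros() const {
    return One ? static_cast<unsigned>(__builtin_clz(One)) : 32;
  }
};

int main() {
  ToyKnownBits K;
  K.Zero = 0xFFFF0000u; // the top 16 bits are known zero:
  assert(K.countMinLeadingZeros() == 16); // value fits in 16 bits, so a
                                          // short division suffices
  K.One = 0x00008000u;  // bit 15 is known set:
  assert(K.countMaxLeadingZeros() == 16); // value is at least 2^15
  return 0;
}
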
Note that we save BE this way in order to handle cloning of diff --git a/lib/Transforms/Utils/CloneModule.cpp b/lib/Transforms/Utils/CloneModule.cpp index 4e9d67252d6c..5444b752de82 100644 --- a/lib/Transforms/Utils/CloneModule.cpp +++ b/lib/Transforms/Utils/CloneModule.cpp @@ -96,7 +96,7 @@ std::unique_ptr llvm::CloneModule( else GV = new GlobalVariable( *New, I->getValueType(), false, GlobalValue::ExternalLinkage, - (Constant *)nullptr, I->getName(), (GlobalVariable *)nullptr, + nullptr, I->getName(), nullptr, I->getThreadLocalMode(), I->getType()->getAddressSpace()); VMap[&*I] = GV; // We do not copy attributes (mainly because copying between different diff --git a/lib/Transforms/Utils/EscapeEnumerator.cpp b/lib/Transforms/Utils/EscapeEnumerator.cpp index 8c2386554da5..78d7474e5b95 100644 --- a/lib/Transforms/Utils/EscapeEnumerator.cpp +++ b/lib/Transforms/Utils/EscapeEnumerator.cpp @@ -67,8 +67,7 @@ IRBuilder<> *EscapeEnumerator::Next() { // Create a cleanup block. LLVMContext &C = F.getContext(); BasicBlock *CleanupBB = BasicBlock::Create(C, CleanupBBName, &F); - Type *ExnTy = - StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C), nullptr); + Type *ExnTy = StructType::get(Type::getInt8PtrTy(C), Type::getInt32Ty(C)); if (!F.hasPersonalityFn()) { Constant *PersFn = getDefaultPersonalityFn(F.getParent()); F.setPersonalityFn(PersFn); diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 6d56e08af99f..9cb4762b683c 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -1302,41 +1302,6 @@ static bool hasLifetimeMarkers(AllocaInst *AI) { return false; } -/// Rebuild the entire inlined-at chain for this instruction so that the top of -/// the chain now is inlined-at the new call site. -static DebugLoc -updateInlinedAtInfo(const DebugLoc &DL, DILocation *InlinedAtNode, - LLVMContext &Ctx, - DenseMap &IANodes) { - SmallVector InlinedAtLocations; - DILocation *Last = InlinedAtNode; - DILocation *CurInlinedAt = DL; - - // Gather all the inlined-at nodes - while (DILocation *IA = CurInlinedAt->getInlinedAt()) { - // Skip any we've already built nodes for - if (DILocation *Found = IANodes[IA]) { - Last = Found; - break; - } - - InlinedAtLocations.push_back(IA); - CurInlinedAt = IA; - } - - // Starting from the top, rebuild the nodes to point to the new inlined-at - // location (then rebuilding the rest of the chain behind it) and update the - // map of already-constructed inlined-at nodes. - for (const DILocation *MD : reverse(InlinedAtLocations)) { - Last = IANodes[MD] = DILocation::getDistinct( - Ctx, MD->getLine(), MD->getColumn(), MD->getScope(), Last); - } - - // And finally create the normal location for this instruction, referring to - // the new inlined-at chain. - return DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), Last); -} - /// Return the result of AI->isStaticAlloca() if AI were moved to the entry /// block. Allocas used in inalloca calls and allocas of dynamic array size /// cannot be static. @@ -1364,14 +1329,16 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI, // Cache the inlined-at nodes as they're built so they are reused, without // this every instruction's inlined-at chain would become distinct from each // other. 
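
// The updateInlinedAtInfo body deleted above (its job moves to
// DebugLoc::appendInlinedAt in the next hunk) re-roots an inlined-at chain
// at the new call site, caching rebuilt nodes so locations that share a
// suffix also share the rebuilt nodes. The shape of that algorithm on a toy
// singly-linked chain (hypothetical types, not the DILocation API):
#include <cassert>
#include <map>
#include <memory>
#include <vector>

struct Loc {
  int Line;
  const Loc *InlinedAt; // nullptr at the end of the chain
};

static const Loc *reparent(const Loc *DL, const Loc *NewRoot,
                           std::vector<std::unique_ptr<Loc>> &Storage,
                           std::map<const Loc *, const Loc *> &Cache) {
  if (!DL)
    return NewRoot; // bottom of the chain: attach the new call site
  auto It = Cache.find(DL);
  if (It != Cache.end())
    return It->second; // already rebuilt for an earlier instruction
  Storage.push_back(std::make_unique<Loc>(
      Loc{DL->Line, reparent(DL->InlinedAt, NewRoot, Storage, Cache)}));
  return Cache[DL] = Storage.back().get();
}

int main() {
  Loc A{10, nullptr}, B{20, &A}; // chain: B inlined at A
  Loc CallSite{99, nullptr};
  std::vector<std::unique_ptr<Loc>> Storage;
  std::map<const Loc *, const Loc *> Cache;
  const Loc *NewB = reparent(&B, &CallSite, Storage, Cache);
  assert(NewB->Line == 20 && NewB->InlinedAt->Line == 10 &&
         NewB->InlinedAt->InlinedAt == &CallSite); // re-rooted at CallSite
  assert(reparent(&B, &CallSite, Storage, Cache) == NewB); // cache hit
  return 0;
}
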
- DenseMap IANodes; + DenseMap IANodes; for (; FI != Fn->end(); ++FI) { for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); BI != BE; ++BI) { if (DebugLoc DL = BI->getDebugLoc()) { - BI->setDebugLoc( - updateInlinedAtInfo(DL, InlinedAtNode, BI->getContext(), IANodes)); + auto IA = DebugLoc::appendInlinedAt(DL, InlinedAtNode, BI->getContext(), + IANodes); + auto IDL = DebugLoc::get(DL.getLine(), DL.getCol(), DL.getScope(), IA); + BI->setDebugLoc(IDL); continue; } @@ -1429,11 +1396,12 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock, /// Update the branch metadata for cloned call instructions. static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap, const Optional &CalleeEntryCount, - const Instruction *TheCall) { + const Instruction *TheCall, + ProfileSummaryInfo *PSI) { if (!CalleeEntryCount.hasValue() || CalleeEntryCount.getValue() < 1) return; Optional CallSiteCount = - ProfileSummaryInfo::getProfileCount(TheCall, nullptr); + PSI ? PSI->getProfileCount(TheCall, nullptr) : None; uint64_t CallCount = std::min(CallSiteCount.hasValue() ? CallSiteCount.getValue() : 0, CalleeEntryCount.getValue()); @@ -1456,16 +1424,16 @@ static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap, /// The callsite's block count is subtracted from the callee's function entry /// count. static void updateCalleeCount(BlockFrequencyInfo *CallerBFI, BasicBlock *CallBB, - Instruction *CallInst, Function *Callee) { + Instruction *CallInst, Function *Callee, + ProfileSummaryInfo *PSI) { // If the callee has a original count of N, and the estimated count of // callsite is M, the new callee count is set to N - M. M is estimated from // the caller's entry count, its entry block frequency and the block frequency // of the callsite. Optional CalleeCount = Callee->getEntryCount(); - if (!CalleeCount.hasValue()) + if (!CalleeCount.hasValue() || !PSI) return; - Optional CallCount = - ProfileSummaryInfo::getProfileCount(CallInst, CallerBFI); + Optional CallCount = PSI->getProfileCount(CallInst, CallerBFI); if (!CallCount.hasValue()) return; // Since CallSiteCount is an estimate, it could exceed the original callee @@ -1668,9 +1636,10 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI, updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI, CalledFunc->front()); - updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall); + updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall, + IFI.PSI); // Update the profile count of callee. - updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc); + updateCalleeCount(IFI.CallerBFI, OrigBB, TheCall, CalledFunc, IFI.PSI); // Inject byval arguments initialization. 
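
// updateCalleeCount above maintains the invariant "new callee entry count =
// original count minus what the inlined call site is estimated to consume",
// clamping because the call-site count is only an estimate derived from the
// caller's block frequencies. The arithmetic in isolation:
#include <cassert>
#include <cstdint>

static uint64_t updatedCalleeCount(uint64_t CalleeEntryCount,
                                   uint64_t EstimatedCallSiteCount) {
  // The estimate may exceed the callee's real entry count; clamp rather
  // than letting the subtraction wrap.
  uint64_t Consumed = EstimatedCallSiteCount < CalleeEntryCount
                          ? EstimatedCallSiteCount
                          : CalleeEntryCount;
  return CalleeEntryCount - Consumed;
}

int main() {
  assert(updatedCalleeCount(1000, 400) == 600); // normal case: N - M
  assert(updatedCalleeCount(1000, 5000) == 0);  // overshoot: clamp to zero
  return 0;
}
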
for (std::pair &Init : ByValInit) diff --git a/lib/Transforms/Utils/InstructionNamer.cpp b/lib/Transforms/Utils/InstructionNamer.cpp index 8a1973d1db05..53b432fcafd4 100644 --- a/lib/Transforms/Utils/InstructionNamer.cpp +++ b/lib/Transforms/Utils/InstructionNamer.cpp @@ -26,16 +26,15 @@ namespace { InstNamer() : FunctionPass(ID) { initializeInstNamerPass(*PassRegistry::getPassRegistry()); } - + void getAnalysisUsage(AnalysisUsage &Info) const override { Info.setPreservesAll(); } bool runOnFunction(Function &F) override { - for (Function::arg_iterator AI = F.arg_begin(), AE = F.arg_end(); - AI != AE; ++AI) - if (!AI->hasName() && !AI->getType()->isVoidTy()) - AI->setName("arg"); + for (auto &Arg : F.args()) + if (!Arg.hasName()) + Arg.setName("arg"); for (BasicBlock &BB : F) { if (!BB.hasName()) @@ -48,11 +47,11 @@ namespace { return true; } }; - + char InstNamer::ID = 0; } -INITIALIZE_PASS(InstNamer, "instnamer", +INITIALIZE_PASS(InstNamer, "instnamer", "Assign names to anonymous instructions", false, false) char &llvm::InstructionNamerID = InstNamer::ID; //===----------------------------------------------------------------------===// diff --git a/lib/Transforms/Utils/Local.cpp b/lib/Transforms/Utils/Local.cpp index ce6b703f3528..1ca509472b5f 100644 --- a/lib/Transforms/Utils/Local.cpp +++ b/lib/Transforms/Utils/Local.cpp @@ -1041,7 +1041,7 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign, KnownBits Known(BitWidth); computeKnownBits(V, Known, DL, 0, AC, CxtI, DT); - unsigned TrailZ = Known.Zero.countTrailingOnes(); + unsigned TrailZ = Known.countMinTrailingZeros(); // Avoid trouble with ridiculously large TrailZ values, such as // those computed from a null pointer. @@ -1105,8 +1105,9 @@ static bool PhiHasDebugValue(DILocalVariable *DIVar, void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, StoreInst *SI, DIBuilder &Builder) { auto *DIVar = DDI->getVariable(); - auto *DIExpr = DDI->getExpression(); assert(DIVar && "Missing variable"); + auto *DIExpr = DDI->getExpression(); + Value *DV = SI->getOperand(0); // If an argument is zero extended then use argument directly. The ZExt // may be zapped by an optimization pass in future. @@ -1116,34 +1117,28 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgDeclareInst *DDI, if (SExtInst *SExt = dyn_cast(SI->getOperand(0))) ExtendedArg = dyn_cast(SExt->getOperand(0)); if (ExtendedArg) { - // We're now only describing a subset of the variable. The fragment we're - // describing will always be smaller than the variable size, because - // VariableSize == Size of Alloca described by DDI. Since SI stores - // to the alloca described by DDI, if it's first operand is an extend, - // we're guaranteed that before extension, the value was narrower than - // the size of the alloca, hence the size of the described variable. - SmallVector Ops; - unsigned FragmentOffset = 0; - // If this already is a bit fragment, we drop the bit fragment from the - // expression and record the offset. - auto Fragment = DIExpr->getFragmentInfo(); - if (Fragment) { - Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()-3); - FragmentOffset = Fragment->OffsetInBits; - } else { - Ops.append(DIExpr->elements_begin(), DIExpr->elements_end()); + // If this DDI was already describing only a fragment of a variable, ensure + // that fragment is appropriately narrowed here. 
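
// The getOrEnforceKnownAlignment hunk above derives alignment from
// countMinTrailingZeros: if the low K bits of a pointer are known zero,
// the pointer is 2^K aligned. The clamp matters because a known-null
// pointer has every trailing bit known zero and the shift would overflow.
// A sketch of that computation (illustrative constants):
#include <cassert>
#include <cstdint>

static uint64_t alignFromTrailingZeros(unsigned KnownTrailingZeros) {
  // Avoid trouble with ridiculously large TrailZ values, such as those
  // computed from a null pointer (this mirrors the pass's clamp).
  const unsigned MaxShift = 31;
  unsigned Shift =
      KnownTrailingZeros < MaxShift ? KnownTrailingZeros : MaxShift;
  return uint64_t(1) << Shift;
}

int main() {
  assert(alignFromTrailingZeros(4) == 16); // low nibble zero: 16-byte aligned
  assert(alignFromTrailingZeros(64) == (uint64_t(1) << 31)); // clamped
  return 0;
}
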
+ // But if a fragment wasn't used, describe the value as the original + // argument (rather than the zext or sext) so that it remains described even + // if the sext/zext is optimized away. This widens the variable description, + // leaving it up to the consumer to know how the smaller value may be + // represented in a larger register. + if (auto Fragment = DIExpr->getFragmentInfo()) { + unsigned FragmentOffset = Fragment->OffsetInBits; + SmallVector Ops(DIExpr->elements_begin(), + DIExpr->elements_end() - 3); + Ops.push_back(dwarf::DW_OP_LLVM_fragment); + Ops.push_back(FragmentOffset); + const DataLayout &DL = DDI->getModule()->getDataLayout(); + Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); + DIExpr = Builder.createExpression(Ops); } - Ops.push_back(dwarf::DW_OP_LLVM_fragment); - Ops.push_back(FragmentOffset); - const DataLayout &DL = DDI->getModule()->getDataLayout(); - Ops.push_back(DL.getTypeSizeInBits(ExtendedArg->getType())); - auto NewDIExpr = Builder.createExpression(Ops); - if (!LdStHasDebugValue(DIVar, NewDIExpr, SI)) - Builder.insertDbgValueIntrinsic(ExtendedArg, 0, DIVar, NewDIExpr, - DDI->getDebugLoc(), SI); - } else if (!LdStHasDebugValue(DIVar, DIExpr, SI)) - Builder.insertDbgValueIntrinsic(SI->getOperand(0), 0, DIVar, DIExpr, - DDI->getDebugLoc(), SI); + DV = ExtendedArg; + } + if (!LdStHasDebugValue(DIVar, DIExpr, SI)) + Builder.insertDbgValueIntrinsic(DV, 0, DIVar, DIExpr, DDI->getDebugLoc(), + SI); } /// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value @@ -1781,44 +1776,43 @@ void llvm::combineMetadataForCSE(Instruction *K, const Instruction *J) { combineMetadata(K, J, KnownIDs); } -unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, - DominatorTree &DT, - const BasicBlockEdge &Root) { +template +static unsigned replaceDominatedUsesWith(Value *From, Value *To, + const RootType &Root, + const DominatesFn &Dominates) { assert(From->getType() == To->getType()); - + unsigned Count = 0; for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); - UI != UE; ) { + UI != UE;) { Use &U = *UI++; - if (DT.dominates(Root, U)) { - U.set(To); - DEBUG(dbgs() << "Replace dominated use of '" - << From->getName() << "' as " - << *To << " in " << *U << "\n"); - ++Count; - } + if (!Dominates(Root, U)) + continue; + U.set(To); + DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " + << *To << " in " << *U << "\n"); + ++Count; } return Count; } unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, DominatorTree &DT, - const BasicBlock *BB) { - assert(From->getType() == To->getType()); + const BasicBlockEdge &Root) { + auto Dominates = [&DT](const BasicBlockEdge &Root, const Use &U) { + return DT.dominates(Root, U); + }; + return ::replaceDominatedUsesWith(From, To, Root, Dominates); +} - unsigned Count = 0; - for (Value::use_iterator UI = From->use_begin(), UE = From->use_end(); - UI != UE;) { - Use &U = *UI++; - auto *I = cast(U.getUser()); - if (DT.properlyDominates(BB, I->getParent())) { - U.set(To); - DEBUG(dbgs() << "Replace dominated use of '" << From->getName() << "' as " - << *To << " in " << *U << "\n"); - ++Count; - } - } - return Count; +unsigned llvm::replaceDominatedUsesWith(Value *From, Value *To, + DominatorTree &DT, + const BasicBlock *BB) { + auto ProperlyDominates = [&DT](const BasicBlock *BB, const Use &U) { + auto *I = cast(U.getUser())->getParent(); + return DT.properlyDominates(BB, I); + }; + return ::replaceDominatedUsesWith(From, To, BB, ProperlyDominates); } bool 
llvm::callsGCLeafFunction(ImmutableCallSite CS) { diff --git a/lib/Transforms/Utils/LoopUtils.cpp b/lib/Transforms/Utils/LoopUtils.cpp index 175d013a011d..81f033e7d51a 100644 --- a/lib/Transforms/Utils/LoopUtils.cpp +++ b/lib/Transforms/Utils/LoopUtils.cpp @@ -18,6 +18,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/LoopInfo.h" #include "llvm/Analysis/LoopPass.h" +#include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h" #include "llvm/Analysis/ScalarEvolutionExpander.h" @@ -1112,3 +1113,203 @@ Optional llvm::getLoopEstimatedTripCount(Loop *L) { else return (FalseVal + (TrueVal / 2)) / TrueVal; } + +/// \brief Adds a 'fast' flag to floating point operations. +static Value *addFastMathFlag(Value *V) { + if (isa(V)) { + FastMathFlags Flags; + Flags.setUnsafeAlgebra(); + cast(V)->setFastMathFlags(Flags); + } + return V; +} + +// Helper to generate a log2 shuffle reduction. +Value * +llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op, + RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind, + ArrayRef RedOps) { + unsigned VF = Src->getType()->getVectorNumElements(); + // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles + // and vector ops, reducing the set of values being computed by half each + // round. + assert(isPowerOf2_32(VF) && + "Reduction emission only supported for pow2 vectors!"); + Value *TmpVec = Src; + SmallVector ShuffleMask(VF, nullptr); + for (unsigned i = VF; i != 1; i >>= 1) { + // Move the upper half of the vector to the lower half. + for (unsigned j = 0; j != i / 2; ++j) + ShuffleMask[j] = Builder.getInt32(i / 2 + j); + + // Fill the rest of the mask with undef. + std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), + UndefValue::get(Builder.getInt32Ty())); + + Value *Shuf = Builder.CreateShuffleVector( + TmpVec, UndefValue::get(TmpVec->getType()), + ConstantVector::get(ShuffleMask), "rdx.shuf"); + + if (Op != Instruction::ICmp && Op != Instruction::FCmp) { + // Floating point operations had to be 'fast' to enable the reduction. + TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op, + TmpVec, Shuf, "bin.rdx")); + } else { + assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid && + "Invalid min/max"); + TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, TmpVec, + Shuf); + } + if (!RedOps.empty()) + propagateIRFlags(TmpVec, RedOps); + } + // The result is in the first element of the vector. + return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); +} + +/// Create a simple vector reduction specified by an opcode and some +/// flags (if generating min/max reductions). +Value *llvm::createSimpleTargetReduction( + IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode, + Value *Src, TargetTransformInfo::ReductionFlags Flags, + ArrayRef RedOps) { + assert(isa(Src->getType()) && "Type must be a vector"); + + Value *ScalarUdf = UndefValue::get(Src->getType()->getVectorElementType()); + std::function BuildFunc; + using RD = RecurrenceDescriptor; + RD::MinMaxRecurrenceKind MinMaxKind = RD::MRK_Invalid; + // TODO: Support creating ordered reductions. 
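The loop in getShuffleReduction above is easiest to follow by writing out the masks it builds. A stand-alone C++ sketch (illustrative only; it prints the lane indices, with 'u' marking an undef filler lane) of the mask produced in each of the log2(VF) rounds:

#include <cstdio>

int main() {
  const unsigned VF = 8; // must be a power of two, as asserted above
  for (unsigned i = VF; i != 1; i >>= 1) {
    std::printf("width %u: [", i);
    for (unsigned j = 0; j != VF; ++j) {
      if (j != 0)
        std::printf(", ");
      if (j < i / 2)
        std::printf("%u", i / 2 + j); // upper half of live lanes moved down
      else
        std::printf("u");             // rest of the mask filled with undef
    }
    std::printf("]\n");
  }
  return 0;
}

This prints [4, 5, 6, 7, u, u, u, u], then [2, 3, u, ...], then [1, u, ...]: each shuffle moves the upper half of the surviving lanes down, a binary op combines it with the previous vector, and after the last round the reduction sits in lane 0, which CreateExtractElement reads out.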
+ FastMathFlags FMFUnsafe; + FMFUnsafe.setUnsafeAlgebra(); + + switch (Opcode) { + case Instruction::Add: + BuildFunc = [&]() { return Builder.CreateAddReduce(Src); }; + break; + case Instruction::Mul: + BuildFunc = [&]() { return Builder.CreateMulReduce(Src); }; + break; + case Instruction::And: + BuildFunc = [&]() { return Builder.CreateAndReduce(Src); }; + break; + case Instruction::Or: + BuildFunc = [&]() { return Builder.CreateOrReduce(Src); }; + break; + case Instruction::Xor: + BuildFunc = [&]() { return Builder.CreateXorReduce(Src); }; + break; + case Instruction::FAdd: + BuildFunc = [&]() { + auto Rdx = Builder.CreateFAddReduce(ScalarUdf, Src); + cast(Rdx)->setFastMathFlags(FMFUnsafe); + return Rdx; + }; + break; + case Instruction::FMul: + BuildFunc = [&]() { + auto Rdx = Builder.CreateFMulReduce(ScalarUdf, Src); + cast(Rdx)->setFastMathFlags(FMFUnsafe); + return Rdx; + }; + break; + case Instruction::ICmp: + if (Flags.IsMaxOp) { + MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMax : RD::MRK_UIntMax; + BuildFunc = [&]() { + return Builder.CreateIntMaxReduce(Src, Flags.IsSigned); + }; + } else { + MinMaxKind = Flags.IsSigned ? RD::MRK_SIntMin : RD::MRK_UIntMin; + BuildFunc = [&]() { + return Builder.CreateIntMinReduce(Src, Flags.IsSigned); + }; + } + break; + case Instruction::FCmp: + if (Flags.IsMaxOp) { + MinMaxKind = RD::MRK_FloatMax; + BuildFunc = [&]() { return Builder.CreateFPMaxReduce(Src, Flags.NoNaN); }; + } else { + MinMaxKind = RD::MRK_FloatMin; + BuildFunc = [&]() { return Builder.CreateFPMinReduce(Src, Flags.NoNaN); }; + } + break; + default: + llvm_unreachable("Unhandled opcode"); + break; + } + if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags)) + return BuildFunc(); + return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps); +} + +/// Create a vector reduction using a given recurrence descriptor. +Value *llvm::createTargetReduction(IRBuilder<> &Builder, + const TargetTransformInfo *TTI, + RecurrenceDescriptor &Desc, Value *Src, + bool NoNaN) { + // TODO: Support in-order reductions based on the recurrence descriptor. 
+  RecurrenceDescriptor::RecurrenceKind RecKind = Desc.getRecurrenceKind();
+  TargetTransformInfo::ReductionFlags Flags;
+  Flags.NoNaN = NoNaN;
+  auto getSimpleRdx = [&](unsigned Opc) {
+    return createSimpleTargetReduction(Builder, TTI, Opc, Src, Flags);
+  };
+  switch (RecKind) {
+  case RecurrenceDescriptor::RK_FloatAdd:
+    return getSimpleRdx(Instruction::FAdd);
+  case RecurrenceDescriptor::RK_FloatMult:
+    return getSimpleRdx(Instruction::FMul);
+  case RecurrenceDescriptor::RK_IntegerAdd:
+    return getSimpleRdx(Instruction::Add);
+  case RecurrenceDescriptor::RK_IntegerMult:
+    return getSimpleRdx(Instruction::Mul);
+  case RecurrenceDescriptor::RK_IntegerAnd:
+    return getSimpleRdx(Instruction::And);
+  case RecurrenceDescriptor::RK_IntegerOr:
+    return getSimpleRdx(Instruction::Or);
+  case RecurrenceDescriptor::RK_IntegerXor:
+    return getSimpleRdx(Instruction::Xor);
+  case RecurrenceDescriptor::RK_IntegerMinMax: {
+    switch (Desc.getMinMaxRecurrenceKind()) {
+    case RecurrenceDescriptor::MRK_SIntMax:
+      Flags.IsSigned = true;
+      Flags.IsMaxOp = true;
+      break;
+    case RecurrenceDescriptor::MRK_UIntMax:
+      Flags.IsMaxOp = true;
+      break;
+    case RecurrenceDescriptor::MRK_SIntMin:
+      Flags.IsSigned = true;
+      break;
+    case RecurrenceDescriptor::MRK_UIntMin:
+      break;
+    default:
+      llvm_unreachable("Unhandled MRK");
+    }
+    return getSimpleRdx(Instruction::ICmp);
+  }
+  case RecurrenceDescriptor::RK_FloatMinMax: {
+    Flags.IsMaxOp =
+        Desc.getMinMaxRecurrenceKind() == RecurrenceDescriptor::MRK_FloatMax;
+    return getSimpleRdx(Instruction::FCmp);
+  }
+  default:
+    llvm_unreachable("Unhandled RecKind");
+  }
+}
+
+void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL) {
+  if (auto *VecOp = dyn_cast<Instruction>(I)) {
+    if (auto *I0 = dyn_cast<Instruction>(VL[0])) {
+      // VecOp is initialized to the 0th scalar, so start counting from index
+      // '1'.
+      VecOp->copyIRFlags(I0);
+      for (int i = 1, e = VL.size(); i < e; ++i) {
+        if (auto *Scalar = dyn_cast<Instruction>(VL[i]))
+          VecOp->andIRFlags(Scalar);
+      }
+    }
+  }
+}
diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp
index 29d334f2968f..2ef3d6336ae2 100644
--- a/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/lib/Transforms/Utils/ModuleUtils.cpp
@@ -35,7 +35,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
   // Upgrade a 2-field global array type to the new 3-field format if needed.
   if (Data && OldEltTy->getNumElements() < 3)
     EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
-                            IRB.getInt8PtrTy(), nullptr);
+                            IRB.getInt8PtrTy());
   else
     EltTy = OldEltTy;
   if (Constant *Init = GVCtor->getInitializer()) {
@@ -44,10 +44,10 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
     for (unsigned i = 0; i != n; ++i) {
       auto Ctor = cast<Constant>(Init->getOperand(i));
       if (EltTy != OldEltTy)
-        Ctor = ConstantStruct::get(
-            EltTy, Ctor->getAggregateElement((unsigned)0),
-            Ctor->getAggregateElement(1),
-            Constant::getNullValue(IRB.getInt8PtrTy()), nullptr);
+        Ctor =
+            ConstantStruct::get(EltTy, Ctor->getAggregateElement((unsigned)0),
+                                Ctor->getAggregateElement(1),
+                                Constant::getNullValue(IRB.getInt8PtrTy()));
       CurrentCtors.push_back(Ctor);
     }
   }
@@ -55,7 +55,7 @@ static void appendToGlobalArray(const char *Array, Module &M, Function *F,
   } else {
     // Use the new three-field struct if there isn't one already.
     EltTy = StructType::get(IRB.getInt32Ty(), PointerType::getUnqual(FnTy),
-                            IRB.getInt8PtrTy(), nullptr);
+                            IRB.getInt8PtrTy());
   }
 
   // Build a 2 or 3 field global_ctor entry. We don't take a comdat key.
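propagateIRFlags, now defined above in LoopUtils, keeps a flag on the vector instruction only when every scalar being replaced carried it: copy the flags of the first scalar, then intersect with each of the rest. A toy stand-alone model of that copy-then-and pattern, using plain bitmasks rather than the llvm::Instruction flag API:

#include <cstdio>

int main() {
  enum : unsigned { NSW = 1, NUW = 2, Exact = 4 }; // toy flag stand-ins
  unsigned Scalars[] = {NSW | NUW, NSW, NSW | Exact};

  unsigned VecFlags = Scalars[0]; // copyIRFlags from the 0th scalar
  for (int i = 1; i < 3; ++i)
    VecFlags &= Scalars[i];       // andIRFlags with each later scalar

  // Only nsw survives: it is the one flag common to all three scalars.
  std::printf("nsw=%d nuw=%d exact=%d\n", !!(VecFlags & NSW),
              !!(VecFlags & NUW), !!(VecFlags & Exact));
  return 0;
}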
diff --git a/lib/Transforms/Utils/SimplifyLibCalls.cpp b/lib/Transforms/Utils/SimplifyLibCalls.cpp index 9e71d746de34..1de579ed41b0 100644 --- a/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1450,11 +1450,11 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg, // x86_64 can't use {float, float} since that would be returned in both // xmm0 and xmm1, which isn't what a real struct would do. ResTy = T.getArch() == Triple::x86_64 - ? static_cast(VectorType::get(ArgTy, 2)) - : static_cast(StructType::get(ArgTy, ArgTy, nullptr)); + ? static_cast(VectorType::get(ArgTy, 2)) + : static_cast(StructType::get(ArgTy, ArgTy)); } else { Name = "__sincospi_stret"; - ResTy = StructType::get(ArgTy, ArgTy, nullptr); + ResTy = StructType::get(ArgTy, ArgTy); } Module *M = OrigCallee->getParent(); diff --git a/lib/Transforms/Utils/VNCoercion.cpp b/lib/Transforms/Utils/VNCoercion.cpp index 83bd29dbca65..60d9ede2c487 100644 --- a/lib/Transforms/Utils/VNCoercion.cpp +++ b/lib/Transforms/Utils/VNCoercion.cpp @@ -303,6 +303,15 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy, const DataLayout &DL) { LLVMContext &Ctx = SrcVal->getType()->getContext(); + // If two pointers are in the same address space, they have the same size, + // so we don't need to do any truncation, etc. This avoids introducing + // ptrtoint instructions for pointers that may be non-integral. + if (SrcVal->getType()->isPointerTy() && LoadTy->isPointerTy() && + cast(SrcVal->getType())->getAddressSpace() == + cast(LoadTy)->getAddressSpace()) { + return SrcVal; + } + uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8; uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8; // Compute which bits of the stored value are being used by the load. 
Convert diff --git a/lib/Transforms/Utils/ValueMapper.cpp b/lib/Transforms/Utils/ValueMapper.cpp index 84d89f103a2f..930972924c3c 100644 --- a/lib/Transforms/Utils/ValueMapper.cpp +++ b/lib/Transforms/Utils/ValueMapper.cpp @@ -949,11 +949,10 @@ void Mapper::mapAppendingVariable(GlobalVariable &GV, Constant *InitPrefix, Constant *NewV; if (IsOldCtorDtor) { auto *S = cast(V); - auto *E1 = mapValue(S->getOperand(0)); - auto *E2 = mapValue(S->getOperand(1)); - Value *Null = Constant::getNullValue(VoidPtrTy); - NewV = - ConstantStruct::get(cast(EltTy), E1, E2, Null, nullptr); + auto *E1 = cast(mapValue(S->getOperand(0))); + auto *E2 = cast(mapValue(S->getOperand(1))); + Constant *Null = Constant::getNullValue(VoidPtrTy); + NewV = ConstantStruct::get(cast(EltTy), E1, E2, Null); } else { NewV = cast_or_null(mapValue(V)); } diff --git a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp index 97dcb40a1d72..9cf66382b581 100644 --- a/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp +++ b/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp @@ -346,7 +346,7 @@ bool Vectorizer::isConsecutiveAccess(Value *A, Value *B) { if (!Safe) { KnownBits Known(BitWidth); computeKnownBits(OpA, Known, DL, 0, nullptr, OpA, &DT); - if (Known.Zero.countTrailingZeros() < (BitWidth - 1)) + if (Known.countMaxTrailingOnes() < (BitWidth - 1)) Safe = true; } diff --git a/lib/Transforms/Vectorize/LoopVectorize.cpp b/lib/Transforms/Vectorize/LoopVectorize.cpp index 3fde0a453962..516ab7d03a88 100644 --- a/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -391,13 +391,14 @@ public: TripCount(nullptr), VectorTripCount(nullptr), Legal(LVL), Cost(CM), AddedSafetyChecks(false) {} - // Perform the actual loop widening (vectorization). - void vectorize() { - // Create a new empty loop. Unlink the old loop and connect the new one. - createEmptyLoop(); - // Widen each instruction in the old loop to a new one in the new loop. - vectorizeLoop(); - } + /// Create a new empty loop. Unlink the old loop and connect the new one. + void createVectorizedLoopSkeleton(); + + /// Vectorize a single instruction within the innermost loop. + void vectorizeInstruction(Instruction &I); + + /// Fix the vectorized code, taking care of header phi's, live-outs, and more. + void fixVectorizedLoop(); // Return true if any runtime check is added. bool areSafetyChecksAdded() { return AddedSafetyChecks; } @@ -425,9 +426,6 @@ protected: EdgeMaskCacheTy; typedef DenseMap BlockMaskCacheTy; - /// Create an empty loop, based on the loop ranges of the old loop. - void createEmptyLoop(); - /// Set up the values of the IVs correctly when exiting the vector loop. void fixupIVUsers(PHINode *OrigPhi, const InductionDescriptor &II, Value *CountRoundDown, Value *EndValue, @@ -436,8 +434,6 @@ protected: /// Create a new induction variable inside L. PHINode *createInductionVariable(Loop *L, Value *Start, Value *End, Value *Step, Instruction *DL); - /// Copy and widen the instructions from the old loop. - virtual void vectorizeLoop(); /// Handle all cross-iteration phis in the header. void fixCrossIterationPHIs(); @@ -450,10 +446,10 @@ protected: /// vectorizing this phi node. void fixReduction(PHINode *Phi); - /// \brief The Loop exit block may have single value PHI nodes where the - /// incoming value is 'Undef'. While vectorizing we only handled real values - /// that were defined inside the loop. Here we fix the 'undef case'. - /// See PR14725. 
+ /// \brief The Loop exit block may have single value PHI nodes with some + /// incoming value. While vectorizing we only handled real values + /// that were defined inside the loop and we should have one value for + /// each predecessor of its parent basic block. See PR14725. void fixLCSSAPHIs(); /// Iteratively sink the scalarized operands of a predicated instruction into @@ -464,11 +460,6 @@ protected: /// respective conditions. void predicateInstructions(); - /// Collect the instructions from the original loop that would be trivially - /// dead in the vectorized loop if generated. - void collectTriviallyDeadInstructions( - SmallPtrSetImpl &DeadInstructions); - /// Shrinks vector element sizes to the smallest bitwidth they can be legally /// represented as. void truncateToMinimalBitwidths(); @@ -481,10 +472,6 @@ protected: /// and DST. VectorParts createEdgeMask(BasicBlock *Src, BasicBlock *Dst); - /// A helper function to vectorize a single instruction within the innermost - /// loop. - void vectorizeInstruction(Instruction &I); - /// Vectorize a single PHINode in a block. This method handles the induction /// variable canonicalization. It supports both VF = 1 for unrolled loops and /// arbitrary length vectors. @@ -1700,6 +1687,9 @@ public: /// access that can be widened. bool memoryInstructionCanBeWidened(Instruction *I, unsigned VF = 1); + // Returns true if the NoNaN attribute is set on the function. + bool hasFunNoNaNAttr() const { return HasFunNoNaNAttr; } + private: /// Check if a single basic block loop is vectorizable. /// At this point we know that this is a loop with a constant trip count @@ -2185,7 +2175,10 @@ public: /// passed Legality checks. class LoopVectorizationPlanner { public: - LoopVectorizationPlanner(LoopVectorizationCostModel &CM) : CM(CM) {} + LoopVectorizationPlanner(Loop *OrigLoop, LoopInfo *LI, + LoopVectorizationLegality *Legal, + LoopVectorizationCostModel &CM) + : OrigLoop(OrigLoop), LI(LI), Legal(Legal), CM(CM) {} ~LoopVectorizationPlanner() {} @@ -2193,7 +2186,25 @@ public: LoopVectorizationCostModel::VectorizationFactor plan(bool OptForSize, unsigned UserVF); + /// Generate the IR code for the vectorized loop. + void executePlan(InnerLoopVectorizer &ILV); + +protected: + /// Collect the instructions from the original loop that would be trivially + /// dead in the vectorized loop if generated. + void collectTriviallyDeadInstructions( + SmallPtrSetImpl &DeadInstructions); + private: + /// The loop that we evaluate. + Loop *OrigLoop; + + /// Loop Info analysis. + LoopInfo *LI; + + /// The legality analysis. + LoopVectorizationLegality *Legal; + /// The profitablity analysis. LoopVectorizationCostModel &CM; }; @@ -3361,7 +3372,7 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) { LVer->prepareNoAliasMetadata(); } -void InnerLoopVectorizer::createEmptyLoop() { +void InnerLoopVectorizer::createVectorizedLoopSkeleton() { /* In this function we generate a new loop. The new loop will contain the vectorized instructions while the old loop will continue to run the @@ -3883,36 +3894,7 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() { } } -void InnerLoopVectorizer::vectorizeLoop() { - //===------------------------------------------------===// - // - // Notice: any optimization or new instruction that go - // into the code below should be also be implemented in - // the cost-model. 
- // - //===------------------------------------------------===// - - // Collect instructions from the original loop that will become trivially dead - // in the vectorized loop. We don't need to vectorize these instructions. For - // example, original induction update instructions can become dead because we - // separately emit induction "steps" when generating code for the new loop. - // Similarly, we create a new latch condition when setting up the structure - // of the new loop, so the old one can become dead. - SmallPtrSet DeadInstructions; - collectTriviallyDeadInstructions(DeadInstructions); - - // Scan the loop in a topological order to ensure that defs are vectorized - // before users. - LoopBlocksDFS DFS(OrigLoop); - DFS.perform(LI); - - // Vectorize all instructions in the original loop that will not become - // trivially dead when vectorized. - for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) - for (Instruction &I : *BB) - if (!DeadInstructions.count(&I)) - vectorizeInstruction(I); - +void InnerLoopVectorizer::fixVectorizedLoop() { // Insert truncates and extends for any truncated instructions as hints to // InstCombine. if (VF > 1) @@ -4049,8 +4031,11 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) { // Set the insertion point after the previous value if it is an instruction. // Note that the previous value may have been constant-folded so it is not - // guaranteed to be an instruction in the vector loop. - if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1])) + // guaranteed to be an instruction in the vector loop. Also, if the previous + // value is a phi node, we should insert after all the phi nodes to avoid + // breaking basic block verification. + if (LI->getLoopFor(LoopVectorBody)->isLoopInvariant(PreviousParts[UF - 1]) || + isa(PreviousParts[UF - 1])) Builder.SetInsertPoint(&*LoopVectorBody->getFirstInsertionPt()); else Builder.SetInsertPoint( @@ -4258,39 +4243,9 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) { } if (VF > 1) { - // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles - // and vector ops, reducing the set of values being computed by half each - // round. - assert(isPowerOf2_32(VF) && - "Reduction emission only supported for pow2 vectors!"); - Value *TmpVec = ReducedPartRdx; - SmallVector ShuffleMask(VF, nullptr); - for (unsigned i = VF; i != 1; i >>= 1) { - // Move the upper half of the vector to the lower half. - for (unsigned j = 0; j != i / 2; ++j) - ShuffleMask[j] = Builder.getInt32(i / 2 + j); - - // Fill the rest of the mask with undef. - std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), - UndefValue::get(Builder.getInt32Ty())); - - Value *Shuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), - ConstantVector::get(ShuffleMask), "rdx.shuf"); - - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - // Floating point operations had to be 'fast' to enable the reduction. - TmpVec = addFastMathFlag(Builder.CreateBinOp( - (Instruction::BinaryOps)Op, TmpVec, Shuf, "bin.rdx")); - else - TmpVec = RecurrenceDescriptor::createMinMaxOp(Builder, MinMaxKind, - TmpVec, Shuf); - } - - // The result is in the first element of the vector. 
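The replacement code just below threads a NoNaN flag, taken from a function-level no-NaNs attribute, into createTargetReduction so FP min/max reductions may ignore NaN ordering. The body of hasFunNoNaNAttr is not part of this hunk; in LLVM of this period such queries are typically string-attribute comparisons along the lines of F.getFnAttribute("no-nans-fp-math").getValueAsString() == "true" (an assumption here, shown only for illustration). A pure-C++ toy of that string test:

#include <cstdio>
#include <cstring>

// Hypothetical stand-in: AttrValue models the attribute's string value; the
// real query goes through llvm::Function, not a raw char pointer.
static bool hasFunNoNaNAttr(const char *AttrValue) {
  return AttrValue && std::strcmp(AttrValue, "true") == 0;
}

int main() {
  std::printf("%d\n", hasFunNoNaNAttr("true"));  // 1: min/max may ignore NaNs
  std::printf("%d\n", hasFunNoNaNAttr("false")); // 0: must stay NaN-correct
  return 0;
}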
+ bool NoNaN = Legal->hasFunNoNaNAttr(); ReducedPartRdx = - Builder.CreateExtractElement(TmpVec, Builder.getInt32(0)); - + createTargetReduction(Builder, TTI, RdxDesc, ReducedPartRdx, NoNaN); // If the reduction can be performed in a smaller type, we need to extend // the reduction to the wider type before we branch to the original loop. if (Phi->getType() != RdxDesc.getRecurrenceType()) @@ -4345,33 +4300,11 @@ void InnerLoopVectorizer::fixLCSSAPHIs() { auto *LCSSAPhi = dyn_cast(&LEI); if (!LCSSAPhi) break; - if (LCSSAPhi->getNumIncomingValues() == 1) - LCSSAPhi->addIncoming(UndefValue::get(LCSSAPhi->getType()), - LoopMiddleBlock); - } -} - -void InnerLoopVectorizer::collectTriviallyDeadInstructions( - SmallPtrSetImpl &DeadInstructions) { - BasicBlock *Latch = OrigLoop->getLoopLatch(); - - // We create new control-flow for the vectorized loop, so the original - // condition will be dead after vectorization if it's only used by the - // branch. - auto *Cmp = dyn_cast(Latch->getTerminator()->getOperand(0)); - if (Cmp && Cmp->hasOneUse()) - DeadInstructions.insert(Cmp); - - // We create new "steps" for induction variable updates to which the original - // induction variables map. An original update instruction will be dead if - // all its users except the induction variable are dead. - for (auto &Induction : *Legal->getInductionVars()) { - PHINode *Ind = Induction.first; - auto *IndUpdate = cast(Ind->getIncomingValueForBlock(Latch)); - if (all_of(IndUpdate->users(), [&](User *U) -> bool { - return U == Ind || DeadInstructions.count(cast(U)); - })) - DeadInstructions.insert(IndUpdate); + if (LCSSAPhi->getNumIncomingValues() == 1) { + assert(OrigLoop->isLoopInvariant(LCSSAPhi->getIncomingValue(0)) && + "Incoming value isn't loop invariant"); + LCSSAPhi->addIncoming(LCSSAPhi->getIncomingValue(0), LoopMiddleBlock); + } } } @@ -7577,6 +7510,72 @@ LoopVectorizationPlanner::plan(bool OptForSize, unsigned UserVF) { return CM.selectVectorizationFactor(MaxVF); } +void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV) { + // Perform the actual loop transformation. + + // 1. Create a new empty loop. Unlink the old loop and connect the new one. + ILV.createVectorizedLoopSkeleton(); + + //===------------------------------------------------===// + // + // Notice: any optimization or new instruction that go + // into the code below should also be implemented in + // the cost-model. + // + //===------------------------------------------------===// + + // 2. Copy and widen instructions from the old loop into the new loop. + + // Collect instructions from the original loop that will become trivially dead + // in the vectorized loop. We don't need to vectorize these instructions. For + // example, original induction update instructions can become dead because we + // separately emit induction "steps" when generating code for the new loop. + // Similarly, we create a new latch condition when setting up the structure + // of the new loop, so the old one can become dead. + SmallPtrSet DeadInstructions; + collectTriviallyDeadInstructions(DeadInstructions); + + // Scan the loop in a topological order to ensure that defs are vectorized + // before users. + LoopBlocksDFS DFS(OrigLoop); + DFS.perform(LI); + + // Vectorize all instructions in the original loop that will not become + // trivially dead when vectorized. + for (BasicBlock *BB : make_range(DFS.beginRPO(), DFS.endRPO())) + for (Instruction &I : *BB) + if (!DeadInstructions.count(&I)) + ILV.vectorizeInstruction(I); + + // 3. 
Fix the vectorized code: take care of header phi's, live-outs, + // predication, updating analyses. + ILV.fixVectorizedLoop(); +} + +void LoopVectorizationPlanner::collectTriviallyDeadInstructions( + SmallPtrSetImpl &DeadInstructions) { + BasicBlock *Latch = OrigLoop->getLoopLatch(); + + // We create new control-flow for the vectorized loop, so the original + // condition will be dead after vectorization if it's only used by the + // branch. + auto *Cmp = dyn_cast(Latch->getTerminator()->getOperand(0)); + if (Cmp && Cmp->hasOneUse()) + DeadInstructions.insert(Cmp); + + // We create new "steps" for induction variable updates to which the original + // induction variables map. An original update instruction will be dead if + // all its users except the induction variable are dead. + for (auto &Induction : *Legal->getInductionVars()) { + PHINode *Ind = Induction.first; + auto *IndUpdate = cast(Ind->getIncomingValueForBlock(Latch)); + if (all_of(IndUpdate->users(), [&](User *U) -> bool { + return U == Ind || DeadInstructions.count(cast(U)); + })) + DeadInstructions.insert(IndUpdate); + } +} + void InnerLoopUnroller::vectorizeMemoryInstruction(Instruction *Instr) { auto *SI = dyn_cast(Instr); bool IfPredicateInstr = (SI && Legal->blockNeedsPredication(SI->getParent())); @@ -7759,7 +7758,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { CM.collectValuesToIgnore(); // Use the planner for vectorization. - LoopVectorizationPlanner LVP(CM); + LoopVectorizationPlanner LVP(L, LI, &LVL, CM); // Get user vectorization factor. unsigned UserVF = Hints.getWidth(); @@ -7853,7 +7852,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // interleave it. InnerLoopUnroller Unroller(L, PSE, LI, DT, TLI, TTI, AC, ORE, IC, &LVL, &CM); - Unroller.vectorize(); + LVP.executePlan(Unroller); ORE->emit(OptimizationRemark(LV_NAME, "Interleaved", L->getStartLoc(), L->getHeader()) @@ -7863,7 +7862,7 @@ bool LoopVectorizePass::processLoop(Loop *L) { // If we decided that it is *legal* to vectorize the loop, then do it. InnerLoopVectorizer LB(L, PSE, LI, DT, TLI, TTI, AC, ORE, VF.Width, IC, &LVL, &CM); - LB.vectorize(); + LVP.executePlan(LB); ++LoopsVectorized; // Add metadata to disable runtime unrolling a scalar loop when there are diff --git a/lib/Transforms/Vectorize/SLPVectorizer.cpp b/lib/Transforms/Vectorize/SLPVectorizer.cpp index f112c555205c..64013d6d687d 100644 --- a/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -40,7 +40,9 @@ #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/GraphWriter.h" +#include "llvm/Support/KnownBits.h" #include "llvm/Support/raw_ostream.h" +#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Vectorize.h" #include #include @@ -212,23 +214,6 @@ static unsigned getSameOpcode(ArrayRef VL) { return Opcode; } -/// Get the intersection (logical and) of all of the potential IR flags -/// of each scalar operation (VL) that will be converted into a vector (I). -/// Flag set: NSW, NUW, exact, and all of fast-math. -static void propagateIRFlags(Value *I, ArrayRef VL) { - if (auto *VecOp = dyn_cast(I)) { - if (auto *I0 = dyn_cast(VL[0])) { - // VecOVp is initialized to the 0th scalar, so start counting from index - // '1'. - VecOp->copyIRFlags(I0); - for (int i = 1, e = VL.size(); i < e; ++i) { - if (auto *Scalar = dyn_cast(VL[i])) - VecOp->andIRFlags(Scalar); - } - } - } -} - /// \returns true if all of the values in \p VL have the same type or false /// otherwise. 
static bool allSameType(ArrayRef VL) { @@ -315,10 +300,10 @@ public: BoUpSLP(Function *Func, ScalarEvolution *Se, TargetTransformInfo *Tti, TargetLibraryInfo *TLi, AliasAnalysis *Aa, LoopInfo *Li, DominatorTree *Dt, AssumptionCache *AC, DemandedBits *DB, - const DataLayout *DL) + const DataLayout *DL, OptimizationRemarkEmitter *ORE) : NumLoadsWantToKeepOrder(0), NumLoadsWantToChangeOrder(0), F(Func), SE(Se), TTI(Tti), TLI(TLi), AA(Aa), LI(Li), DT(Dt), AC(AC), DB(DB), - DL(DL), Builder(Se->getContext()) { + DL(DL), ORE(ORE), Builder(Se->getContext()) { CodeMetrics::collectEphemeralValues(F, AC, EphValues); // Use the vector register size specified by the target unless overridden // by a command-line option. @@ -331,7 +316,10 @@ public: else MaxVecRegSize = TTI->getRegisterBitWidth(true); - MinVecRegSize = MinVectorRegSizeOption; + if (MinVectorRegSizeOption.getNumOccurrences()) + MinVecRegSize = MinVectorRegSizeOption; + else + MinVecRegSize = TTI->getMinVectorRegisterBitWidth(); } /// \brief Vectorize the tree that starts with the elements in \p VL. @@ -377,6 +365,8 @@ public: MinBWs.clear(); } + unsigned getTreeSize() const { return VectorizableTree.size(); } + /// \brief Perform LICM and CSE on the newly generated gather sequences. void optimizeGatherSequence(); @@ -415,6 +405,8 @@ public: /// vectorizable. We do not vectorize such trees. bool isTreeTinyAndNotFullyVectorizable(); + OptimizationRemarkEmitter *getORE() { return ORE; } + private: struct TreeEntry; @@ -944,6 +936,8 @@ private: AssumptionCache *AC; DemandedBits *DB; const DataLayout *DL; + OptimizationRemarkEmitter *ORE; + unsigned MaxVecRegSize; // This is set by TTI or overridden by cl::opt. unsigned MinVecRegSize; // Set by cl::opt (default: 128). /// Instruction builder to construct the vectorized tree. @@ -1835,11 +1829,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) { CInt->getValue().isPowerOf2()) Op2VP = TargetTransformInfo::OP_PowerOf2; - int ScalarCost = VecTy->getNumElements() * - TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, - Op2VK, Op1VP, Op2VP); + SmallVector Operands(VL0->operand_values()); + int ScalarCost = + VecTy->getNumElements() * + TTI->getArithmeticInstrCost(Opcode, ScalarTy, Op1VK, Op2VK, Op1VP, + Op2VP, Operands); int VecCost = TTI->getArithmeticInstrCost(Opcode, VecTy, Op1VK, Op2VK, - Op1VP, Op2VP); + Op1VP, Op2VP, Operands); return VecCost - ScalarCost; } case Instruction::GetElementPtr: { @@ -3703,10 +3699,8 @@ void BoUpSLP::computeMinimumValueSizes() { // Determine if the sign bit of all the roots is known to be zero. If not, // IsKnownPositive is set to False. 
IsKnownPositive = all_of(TreeRoot, [&](Value *R) { - bool KnownZero = false; - bool KnownOne = false; - ComputeSignBit(R, KnownZero, KnownOne, *DL); - return KnownZero; + KnownBits Known = computeKnownBits(R, *DL); + return Known.isNonNegative(); }); // Determine the maximum number of bits required to store the scalar @@ -3786,8 +3780,9 @@ struct SLPVectorizer : public FunctionPass { auto *DT = &getAnalysis().getDomTree(); auto *AC = &getAnalysis().getAssumptionCache(F); auto *DB = &getAnalysis().getDemandedBits(); + auto *ORE = &getAnalysis().getORE(); - return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); + return Impl.runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); } void getAnalysisUsage(AnalysisUsage &AU) const override { @@ -3799,6 +3794,7 @@ struct SLPVectorizer : public FunctionPass { AU.addRequired(); AU.addRequired(); AU.addRequired(); + AU.addRequired(); AU.addPreserved(); AU.addPreserved(); AU.addPreserved(); @@ -3817,8 +3813,9 @@ PreservedAnalyses SLPVectorizerPass::run(Function &F, FunctionAnalysisManager &A auto *DT = &AM.getResult(F); auto *AC = &AM.getResult(F); auto *DB = &AM.getResult(F); + auto *ORE = &AM.getResult(F); - bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB); + bool Changed = runImpl(F, SE, TTI, TLI, AA, LI, DT, AC, DB, ORE); if (!Changed) return PreservedAnalyses::all(); @@ -3833,7 +3830,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, TargetTransformInfo *TTI_, TargetLibraryInfo *TLI_, AliasAnalysis *AA_, LoopInfo *LI_, DominatorTree *DT_, - AssumptionCache *AC_, DemandedBits *DB_) { + AssumptionCache *AC_, DemandedBits *DB_, + OptimizationRemarkEmitter *ORE_) { SE = SE_; TTI = TTI_; TLI = TLI_; @@ -3861,7 +3859,7 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_, // Use the bottom up slp vectorizer to construct chains that start with // store instructions. - BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL); + BoUpSLP R(&F, SE, TTI, TLI, AA, LI, DT, AC, DB, DL, ORE_); // A general note: the vectorizer must use BoUpSLP::eraseInstruction() to // delete instructions. @@ -3950,6 +3948,13 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef Chain, BoUpSLP &R, DEBUG(dbgs() << "SLP: Found cost=" << Cost << " for VF=" << VF << "\n"); if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Decided to vectorize cost=" << Cost << "\n"); + using namespace ore; + R.getORE()->emit(OptimizationRemark(SV_NAME, "StoresVectorized", + cast(Chain[i])) + << "Stores SLP vectorized with cost " << NV("Cost", Cost) + << " and with tree size " + << NV("TreeSize", R.getTreeSize())); + R.vectorizeTree(); // Move to the next bundle. @@ -4163,6 +4168,12 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef VL, BoUpSLP &R, if (Cost < -SLPCostThreshold) { DEBUG(dbgs() << "SLP: Vectorizing list at cost:" << Cost << ".\n"); + R.getORE()->emit(OptimizationRemark(SV_NAME, "VectorizedList", + cast(Ops[0])) + << "SLP vectorized with cost " << ore::NV("Cost", Cost) + << " and with tree size " + << ore::NV("TreeSize", R.getTreeSize())); + Value *VectorizedRoot = R.vectorizeTree(); // Reconstruct the build vector by extracting the vectorized root. This @@ -4506,6 +4517,12 @@ public: DEBUG(dbgs() << "SLP: Vectorizing horizontal reduction at cost:" << Cost << ". 
(HorRdx)\n"); + auto *I0 = cast(VL[0]); + V.getORE()->emit( + OptimizationRemark(SV_NAME, "VectorizedHorizontalReduction", I0) + << "Vectorized horizontal reduction with cost " + << ore::NV("Cost", Cost) << " and with tree size " + << ore::NV("TreeSize", V.getTreeSize())); // Vectorize a tree. DebugLoc Loc = cast(ReducedVals[i])->getDebugLoc(); @@ -4513,7 +4530,7 @@ public: // Emit a reduction. Value *ReducedSubTree = - emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps); + emitReduction(VectorizedRoot, Builder, ReduxWidth, ReductionOps, TTI); if (VectorizedTree) { Builder.SetCurrentDebugLocation(Loc); VectorizedTree = Builder.CreateBinOp(ReductionOpcode, VectorizedTree, @@ -4583,33 +4600,31 @@ private: /// \brief Emit a horizontal reduction of the vectorized value. Value *emitReduction(Value *VectorizedValue, IRBuilder<> &Builder, - unsigned ReduxWidth, ArrayRef RedOps) { + unsigned ReduxWidth, ArrayRef RedOps, + const TargetTransformInfo *TTI) { assert(VectorizedValue && "Need to have a vectorized tree node"); assert(isPowerOf2_32(ReduxWidth) && "We only handle power-of-two reductions for now"); + if (!IsPairwiseReduction) + return createSimpleTargetReduction( + Builder, TTI, ReductionOpcode, VectorizedValue, + TargetTransformInfo::ReductionFlags(), RedOps); + Value *TmpVec = VectorizedValue; for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) { - if (IsPairwiseReduction) { - Value *LeftMask = + Value *LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true, Builder); - Value *RightMask = + Value *RightMask = createRdxShuffleMask(ReduxWidth, i, true, false, Builder); - Value *LeftShuf = Builder.CreateShuffleVector( + Value *LeftShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l"); - Value *RightShuf = Builder.CreateShuffleVector( + Value *RightShuf = Builder.CreateShuffleVector( TmpVec, UndefValue::get(TmpVec->getType()), (RightMask), "rdx.shuf.r"); - TmpVec = Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, - "bin.rdx"); - } else { - Value *UpperHalf = - createRdxShuffleMask(ReduxWidth, i, false, false, Builder); - Value *Shuf = Builder.CreateShuffleVector( - TmpVec, UndefValue::get(TmpVec->getType()), UpperHalf, "rdx.shuf"); - TmpVec = Builder.CreateBinOp(ReductionOpcode, TmpVec, Shuf, "bin.rdx"); - } + TmpVec = + Builder.CreateBinOp(ReductionOpcode, LeftShuf, RightShuf, "bin.rdx"); propagateIRFlags(TmpVec, RedOps); } @@ -5162,6 +5177,7 @@ INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopSimplify) INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass) +INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass) INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false) namespace llvm { diff --git a/lib/XRay/Trace.cpp b/lib/XRay/Trace.cpp index d2984697c8a9..6677063f944f 100644 --- a/lib/XRay/Trace.cpp +++ b/lib/XRay/Trace.cpp @@ -115,6 +115,7 @@ struct FDRState { uint16_t CPUId; uint16_t ThreadId; uint64_t BaseTSC; + /// Encode some of the state transitions for the FDR log reader as explicit /// checks. These are expectations for the next Record in the stream. enum class Token { @@ -123,8 +124,10 @@ struct FDRState { NEW_CPU_ID_RECORD, FUNCTION_SEQUENCE, SCAN_TO_END_OF_THREAD_BUF, + CUSTOM_EVENT_DATA, }; Token Expects; + // Each threads buffer may have trailing garbage to scan over, so we track our // progress. 
  uint64_t CurrentBufferSize;
@@ -143,6 +146,8 @@ Twine fdrStateToTwine(const FDRState::Token &state) {
     return "FUNCTION_SEQUENCE";
   case FDRState::Token::SCAN_TO_END_OF_THREAD_BUF:
     return "SCAN_TO_END_OF_THREAD_BUF";
+  case FDRState::Token::CUSTOM_EVENT_DATA:
+    return "CUSTOM_EVENT_DATA";
   }
   return "UNKNOWN";
 }
@@ -212,13 +217,32 @@ Error processFDRWallTimeRecord(FDRState &State, uint8_t RecordFirstByte,
   return Error::success();
 }
 
+/// State transition when a CustomEventMarker is encountered.
+Error processCustomEventMarker(FDRState &State, uint8_t RecordFirstByte,
+                               DataExtractor &RecordExtractor,
+                               size_t &RecordSize) {
+  // We can encounter a CustomEventMarker anywhere in the log, so we can handle
+  // it regardless of the expectation. However, we do set the expectation to
+  // read a set number of fixed bytes, as described in the metadata.
+  uint32_t OffsetPtr = 1; // Read after the first byte.
+  uint32_t DataSize = RecordExtractor.getU32(&OffsetPtr);
+  uint64_t TSC = RecordExtractor.getU64(&OffsetPtr);
+
+  // FIXME: Actually represent the record through the API. For now we only skip
+  // through the data.
+  (void)TSC;
+  RecordSize = 16 + DataSize;
+  return Error::success();
+}
+
 /// Advances the state machine for reading the FDR record type by reading one
 /// Metadata Record and updating the State appropriately based on the kind of
 /// record encountered. The RecordKind is encoded in the first byte of the
 /// Record, which the caller should pass in because they have already read it
 /// to determine that this is a metadata record as opposed to a function record.
 Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
-                               DataExtractor &RecordExtractor) {
+                               DataExtractor &RecordExtractor,
+                               size_t &RecordSize) {
   // The remaining 7 bits are the RecordKind enum.
   uint8_t RecordKind = RecordFirstByte >> 1;
   switch (RecordKind) {
@@ -247,6 +271,11 @@ Error processFDRMetadataRecord(FDRState &State, uint8_t RecordFirstByte,
             processFDRWallTimeRecord(State, RecordFirstByte, RecordExtractor))
       return E;
     break;
+  case 5: // CustomEventMarker
+    if (auto E = processCustomEventMarker(State, RecordFirstByte,
+                                          RecordExtractor, RecordSize))
+      return E;
+    break;
   default:
     // Widen the record type to uint16_t to prevent conversion to char.
     return make_error<StringError>(
@@ -400,7 +429,8 @@ Error loadFDRLog(StringRef Data, XRayFileHeader &FileHeader,
     bool isMetadataRecord = BitField & 0x01uL;
     if (isMetadataRecord) {
       RecordSize = 16;
-      if (auto E = processFDRMetadataRecord(State, BitField, RecordExtractor))
+      if (auto E = processFDRMetadataRecord(State, BitField, RecordExtractor,
+                                            RecordSize))
         return E;
       State.CurrentBufferConsumed += RecordSize;
     } else { // Process Function Record
diff --git a/projects/CMakeLists.txt b/projects/CMakeLists.txt
index 79d8fc7df99b..9102efbdcb46 100644
--- a/projects/CMakeLists.txt
+++ b/projects/CMakeLists.txt
@@ -22,7 +22,9 @@ endforeach(entry)
 if(${LLVM_BUILD_RUNTIME})
   # MSVC isn't quite working with libc++ yet, disable it until issues are
   # fixed.
-  if(NOT MSVC)
+  # FIXME: LLVM_FORCE_BUILD_RUNTIME is currently used by libc++ to force
+  # enable the in-tree build when targeting clang-cl.
+  if(NOT MSVC OR LLVM_FORCE_BUILD_RUNTIME)
     # Add the projects in reverse order of their dependencies so that the
     # dependent projects can see the target names of their dependencies.
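The custom-event handling added above implies a concrete record layout: one type/kind byte, a little-endian u32 payload size, a u64 TSC, padding out to the fixed 16-byte metadata record, then DataSize trailing payload bytes, hence RecordSize = 16 + DataSize. A stand-alone sketch of that size computation (plain memcpy reads, assuming a little-endian host; not the DataExtractor API):

#include <cstdint>
#include <cstdio>
#include <cstring>

static uint64_t customEventRecordSize(const uint8_t *Rec) {
  uint32_t DataSize;
  uint64_t TSC;
  std::memcpy(&DataSize, Rec + 1, sizeof(DataSize)); // after the first byte
  std::memcpy(&TSC, Rec + 5, sizeof(TSC));           // event timestamp
  (void)TSC; // the reader currently only skips over the payload
  return 16 + DataSize; // fixed header plus variable payload
}

int main() {
  // First byte 0x0b = (5 << 1) | 1: metadata bit set, RecordKind 5
  // (CustomEventMarker), matching the RecordFirstByte >> 1 decoding above.
  uint8_t Rec[16] = {0x0b, 8, 0, 0, 0};
  std::printf("%llu\n", (unsigned long long)customEventRecordSize(Rec)); // 24
  return 0;
}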
add_llvm_external_project(libunwind) diff --git a/test/Analysis/BasicAA/cs-cs-arm.ll b/test/Analysis/BasicAA/cs-cs-arm.ll new file mode 100644 index 000000000000..1580af9ea826 --- /dev/null +++ b/test/Analysis/BasicAA/cs-cs-arm.ll @@ -0,0 +1,34 @@ +; RUN: opt < %s -basicaa -aa-eval -print-all-alias-modref-info -disable-output 2>&1 | FileCheck %s +; REQUIRES: arm + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32" +target triple = "arm-apple-ios" + +declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly +declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind + +define <8 x i16> @test1(i8* %p, <8 x i16> %y) { +entry: + %q = getelementptr i8, i8* %p, i64 16 + %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind + call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) + %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind + %c = add <8 x i16> %a, %b + ret <8 x i16> %c + +; CHECK-LABEL: Function: test1: + +; CHECK: NoAlias: i8* %p, i8* %q +; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) +; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +} diff --git a/test/Analysis/BasicAA/cs-cs.ll b/test/Analysis/BasicAA/cs-cs.ll index 0f74dbd92bbd..870794c25165 100644 --- a/test/Analysis/BasicAA/cs-cs.ll +++ b/test/Analysis/BasicAA/cs-cs.ll @@ -2,41 +2,12 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32" target triple = "arm-apple-ios" -declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly -declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind - declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind declare void @a_readonly_func(i8 *) noinline nounwind readonly declare 
void @a_writeonly_func(i8 *) noinline nounwind writeonly -define <8 x i16> @test1(i8* %p, <8 x i16> %y) { -entry: - %q = getelementptr i8, i8* %p, i64 16 - %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind - call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) - %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind - %c = add <8 x i16> %a, %b - ret <8 x i16> %c - -; CHECK-LABEL: Function: test1: - -; CHECK: NoAlias: i8* %p, i8* %q -; CHECK: Just Ref: Ptr: i8* %p <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: Ptr: i8* %q <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: Ptr: i8* %p <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -; CHECK: Both ModRef: Ptr: i8* %q <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -; CHECK: Just Ref: Ptr: i8* %p <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: Ptr: i8* %q <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -; CHECK: NoModRef: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) <-> %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) -; CHECK: NoModRef: %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) #{{[0-9]+}} <-> call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -} - define void @test2(i8* %P, i8* %Q) nounwind ssp { tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %P, i8* %Q, i64 12, i32 1, i1 false) @@ -247,9 +218,9 @@ define void @test7(i8* %P) nounwind ssp { ; CHECK: Just Ref: call void @a_readonly_func(i8* %P) <-> call void @a_writeonly_func(i8* %P) } -declare void @an_inaccessiblememonly_func() nounwind inaccessiblememonly -declare void @an_inaccessibleorargmemonly_func(i8 *) nounwind inaccessiblemem_or_argmemonly -declare void @an_argmemonly_func(i8 *) nounwind argmemonly +declare void @an_inaccessiblememonly_func() nounwind inaccessiblememonly +declare void @an_inaccessibleorargmemonly_func(i8 *) nounwind inaccessiblemem_or_argmemonly +declare void @an_argmemonly_func(i8 *) nounwind argmemonly define void @test8(i8* %p) { entry: @@ -260,7 +231,7 @@ entry: call void @an_inaccessiblememonly_func() call void @an_inaccessibleorargmemonly_func(i8* %q) call void @an_argmemonly_func(i8* %q) - ret void + ret void ; CHECK-LABEL: Function: test8 ; CHECK: NoModRef: Ptr: i8* %p <-> call void @an_inaccessiblememonly_func() diff --git a/test/Analysis/BasicAA/intrinsics-arm.ll b/test/Analysis/BasicAA/intrinsics-arm.ll new file mode 100644 index 000000000000..e15ce1c65c64 --- /dev/null +++ b/test/Analysis/BasicAA/intrinsics-arm.ll @@ -0,0 +1,31 @@ +; RUN: opt -basicaa -gvn -S < 
%s | FileCheck %s +; REQUIRES: arm + +target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32-f32:32:32-f64:32:32-v64:32:64-v128:32:128-a0:0:32-n32" + +; BasicAA should prove that these calls don't interfere, since we've +; specifically special cased exactly these two intrinsics in +; MemoryLocation::getForArgument. + +; CHECK: define <8 x i16> @test1(i8* %p, <8 x i16> %y) { +; CHECK-NEXT: entry: +; CHECK-NEXT: %q = getelementptr i8, i8* %p, i64 16 +; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR:#[0-9]+]] +; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK-NEXT: %c = add <8 x i16> %a, %a +define <8 x i16> @test1(i8* %p, <8 x i16> %y) { +entry: + %q = getelementptr i8, i8* %p, i64 16 + %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind + call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) + %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind + %c = add <8 x i16> %a, %b + ret <8 x i16> %c +} + +declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly +declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind + +; CHECK: attributes #0 = { argmemonly nounwind readonly } +; CHECK: attributes #1 = { argmemonly nounwind } +; CHECK: attributes [[ATTR]] = { nounwind } diff --git a/test/Analysis/BasicAA/intrinsics.ll b/test/Analysis/BasicAA/intrinsics.ll index 526a039ef7ac..68e59862bcc1 100644 --- a/test/Analysis/BasicAA/intrinsics.ll +++ b/test/Analysis/BasicAA/intrinsics.ll @@ -5,38 +5,22 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32- ; BasicAA should prove that these calls don't interfere, since they are ; IntrArgReadMem and have noalias pointers. 
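The no-interference claim these tests check has a rough C++ analogue: __restrict (a common compiler extension, like C's restrict) makes the same no-alias promise that the IR-level noalias arguments make here, allowing the second load to be forwarded from the first; that is exactly the %c = add %a, %a folding the CHECK lines look for. An illustrative analogue, not a translation of the test:

#include <cstdint>
#include <cstdio>

static int16_t forward(int16_t *__restrict p, int16_t *__restrict q,
                       int16_t y) {
  int16_t a = *p;
  *q = y;         // cannot touch *p, by the no-alias promise
  int16_t b = *p; // a compiler may reuse 'a' instead of reloading
  return a + b;
}

int main() {
  int16_t x = 3, z = 0;
  std::printf("%d\n", forward(&x, &z, 7)); // prints 6
  return 0;
}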
-; CHECK: define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) { +; CHECK: define <8 x i16> @test0(<8 x i16>* noalias %p, <8 x i16>* noalias %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) { ; CHECK-NEXT: entry: -; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR:#[0-9]+]] -; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK-NEXT: %a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) [[ATTR:#[0-9]+]] +; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %y, <8 x i16>* %q, i32 16, <8 x i1> %m) ; CHECK-NEXT: %c = add <8 x i16> %a, %a -define <8 x i16> @test0(i8* noalias %p, i8* noalias %q, <8 x i16> %y) { +define <8 x i16> @test0(<8 x i16>* noalias %p, <8 x i16>* noalias %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) { entry: - %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind - call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) - %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind + %a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %y, <8 x i16>* %q, i32 16, <8 x i1> %m) + %b = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind %c = add <8 x i16> %a, %b ret <8 x i16> %c } -; CHECK: define <8 x i16> @test1(i8* %p, <8 x i16> %y) { -; CHECK-NEXT: entry: -; CHECK-NEXT: %q = getelementptr i8, i8* %p, i64 16 -; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[ATTR]] -; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) -; CHECK-NEXT: %c = add <8 x i16> %a, %a -define <8 x i16> @test1(i8* %p, <8 x i16> %y) { -entry: - %q = getelementptr i8, i8* %p, i64 16 - %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind - call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) - %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind - %c = add <8 x i16> %a, %b - ret <8 x i16> %c -} - -declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly -declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) nounwind readonly +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) nounwind ; CHECK: attributes #0 = { argmemonly nounwind readonly } ; CHECK: attributes #1 = { argmemonly nounwind } diff --git a/test/Analysis/BranchProbabilityInfo/basic.ll b/test/Analysis/BranchProbabilityInfo/basic.ll index 84936b7761ca..7bee1bd57373 100644 --- a/test/Analysis/BranchProbabilityInfo/basic.ll +++ b/test/Analysis/BranchProbabilityInfo/basic.ll @@ -452,7 +452,7 @@ entry: i32 3, label %case_d i32 4, label %case_e ], !prof !8 ; CHECK: edge entry -> case_a probability is 0x00000800 / 0x80000000 = 0.00% -; CHECK: edge entry -> case_b probability is 0x07fffe01 / 0x80000000 = 6.25% +; CHECK: edge entry -> case_b probability is 0x07fffdff / 0x80000000 = 6.25% ; CHECK: edge entry -> case_c probability is 0x67fffdff / 0x80000000 = 81.25% [HOT edge] ; CHECK: edge entry -> case_d probability is 0x07fffdff / 0x80000000 = 6.25% ; CHECK: edge entry -> case_e probability is 0x07fffdff / 0x80000000 = 6.25% @@ -495,7 +495,7 @@ entry: i32 4, label %case_e ], !prof !9 ; CHECK: edge 
entry -> case_a probability is 0x00000400 / 0x80000000 = 0.00% ; CHECK: edge entry -> case_b probability is 0x00000400 / 0x80000000 = 0.00% -; CHECK: edge entry -> case_c probability is 0x6aaaa800 / 0x80000000 = 83.33% [HOT edge] +; CHECK: edge entry -> case_c probability is 0x6aaaa7ff / 0x80000000 = 83.33% [HOT edge] ; CHECK: edge entry -> case_d probability is 0x0aaaa7ff / 0x80000000 = 8.33% ; CHECK: edge entry -> case_e probability is 0x0aaaa7ff / 0x80000000 = 8.33% @@ -535,7 +535,7 @@ entry: i32 4, label %case_e ], !prof !10 ; CHECK: edge entry -> case_a probability is 0x00000000 / 0x80000000 = 0.00% ; CHECK: edge entry -> case_b probability is 0x00000400 / 0x80000000 = 0.00% -; CHECK: edge entry -> case_c probability is 0x6e08fa2e / 0x80000000 = 85.96% [HOT edge] +; CHECK: edge entry -> case_c probability is 0x6e08fa2d / 0x80000000 = 85.96% [HOT edge] ; CHECK: edge entry -> case_d probability is 0x08fb80e9 / 0x80000000 = 7.02% ; CHECK: edge entry -> case_e probability is 0x08fb80e9 / 0x80000000 = 7.02% diff --git a/test/Analysis/CostModel/AArch64/free-widening-casts.ll b/test/Analysis/CostModel/AArch64/free-widening-casts.ll new file mode 100644 index 000000000000..07f32d1d8ba2 --- /dev/null +++ b/test/Analysis/CostModel/AArch64/free-widening-casts.ll @@ -0,0 +1,622 @@ +; RUN: opt < %s -mtriple=aarch64--linux-gnu -cost-model -analyze | FileCheck %s --check-prefix=COST +; RUN: llc < %s -mtriple=aarch64--linux-gnu | FileCheck %s --check-prefix=CODE + +; COST-LABEL: uaddl_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16> +; CODE-LABEL: uaddl_8h +; CODE: uaddl v0.8h, v0.8b, v1.8b +define <8 x i16> @uaddl_8h(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = zext <8 x i8> %b to <8 x i16> + %tmp2 = add <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: uaddl_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32> +; CODE-LABEL: uaddl_4s +; CODE: uaddl v0.4s, v0.4h, v1.4h +define <4 x i32> @uaddl_4s(<4 x i16> %a, <4 x i16> %b) { + %tmp0 = zext <4 x i16> %a to <4 x i32> + %tmp1 = zext <4 x i16> %b to <4 x i32> + %tmp2 = add <4 x i32> %tmp0, %tmp1 + ret <4 x i32> %tmp2 +} + +; COST-LABEL: uaddl_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64> +; CODE-LABEL: uaddl_2d +; CODE: uaddl v0.2d, v0.2s, v1.2s +define <2 x i64> @uaddl_2d(<2 x i32> %a, <2 x i32> %b) { + %tmp0 = zext <2 x i32> %a to <2 x i64> + %tmp1 = zext <2 x i32> %b to <2 x i64> + %tmp2 = add <2 x i64> %tmp0, %tmp1 + ret <2 x i64> %tmp2 +} + +; COST-LABEL: uaddl2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16> +; CODE-LABEL: uaddl2_8h +; CODE: uaddl2 v2.8h, v0.16b, v1.16b +; CODE-NEXT: uaddl v0.8h, v0.8b, v1.8b +define <16 x i16> @uaddl2_8h(<16 x i8> %a, <16 x i8> %b) { + %tmp0 = zext <16 x i8> %a to <16 x i16> + %tmp1 = zext <16 x i8> %b to <16 x i16> + %tmp2 = add <16 x i16> %tmp0, %tmp1 + ret <16 x i16> 
%tmp2 +} + +; COST-LABEL: uaddl2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32> +; CODE-LABEL: uaddl2_4s +; CODE: uaddl2 v2.4s, v0.8h, v1.8h +; CODE-NEXT: uaddl v0.4s, v0.4h, v1.4h +define <8 x i32> @uaddl2_4s(<8 x i16> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i16> %a to <8 x i32> + %tmp1 = zext <8 x i16> %b to <8 x i32> + %tmp2 = add <8 x i32> %tmp0, %tmp1 + ret <8 x i32> %tmp2 +} + +; COST-LABEL: uaddl2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64> +; CODE-LABEL: uaddl2_2d +; CODE: uaddl2 v2.2d, v0.4s, v1.4s +; CODE-NEXT: uaddl v0.2d, v0.2s, v1.2s +define <4 x i64> @uaddl2_2d(<4 x i32> %a, <4 x i32> %b) { + %tmp0 = zext <4 x i32> %a to <4 x i64> + %tmp1 = zext <4 x i32> %b to <4 x i64> + %tmp2 = add <4 x i64> %tmp0, %tmp1 + ret <4 x i64> %tmp2 +} + +; COST-LABEL: saddl_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16> +; CODE-LABEL: saddl_8h +; CODE: saddl v0.8h, v0.8b, v1.8b +define <8 x i16> @saddl_8h(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + %tmp1 = sext <8 x i8> %b to <8 x i16> + %tmp2 = add <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: saddl_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32> +; CODE-LABEL: saddl_4s +; CODE: saddl v0.4s, v0.4h, v1.4h +define <4 x i32> @saddl_4s(<4 x i16> %a, <4 x i16> %b) { + %tmp0 = sext <4 x i16> %a to <4 x i32> + %tmp1 = sext <4 x i16> %b to <4 x i32> + %tmp2 = add <4 x i32> %tmp0, %tmp1 + ret <4 x i32> %tmp2 +} + +; COST-LABEL: saddl_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64> +; CODE-LABEL: saddl_2d +; CODE: saddl v0.2d, v0.2s, v1.2s +define <2 x i64> @saddl_2d(<2 x i32> %a, <2 x i32> %b) { + %tmp0 = sext <2 x i32> %a to <2 x i64> + %tmp1 = sext <2 x i32> %b to <2 x i64> + %tmp2 = add <2 x i64> %tmp0, %tmp1 + ret <2 x i64> %tmp2 +} + +; COST-LABEL: saddl2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16> +; CODE-LABEL: saddl2_8h +; CODE: saddl2 v2.8h, v0.16b, v1.16b +; CODE-NEXT: saddl v0.8h, v0.8b, v1.8b +define <16 x i16> @saddl2_8h(<16 x i8> %a, <16 x i8> %b) { + %tmp0 = sext <16 x i8> %a to <16 x i16> + %tmp1 = sext <16 x i8> %b to <16 x i16> + %tmp2 = add <16 x i16> %tmp0, %tmp1 + ret <16 x i16> %tmp2 +} + +; COST-LABEL: saddl2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32> +; CODE-LABEL: saddl2_4s +; CODE: saddl2 v2.4s, v0.8h, v1.8h +; CODE-NEXT: saddl v0.4s, v0.4h, v1.4h 
+define <8 x i32> @saddl2_4s(<8 x i16> %a, <8 x i16> %b) { + %tmp0 = sext <8 x i16> %a to <8 x i32> + %tmp1 = sext <8 x i16> %b to <8 x i32> + %tmp2 = add <8 x i32> %tmp0, %tmp1 + ret <8 x i32> %tmp2 +} + +; COST-LABEL: saddl2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64> +; CODE-LABEL: saddl2_2d +; CODE: saddl2 v2.2d, v0.4s, v1.4s +; CODE-NEXT: saddl v0.2d, v0.2s, v1.2s +define <4 x i64> @saddl2_2d(<4 x i32> %a, <4 x i32> %b) { + %tmp0 = sext <4 x i32> %a to <4 x i64> + %tmp1 = sext <4 x i32> %b to <4 x i64> + %tmp2 = add <4 x i64> %tmp0, %tmp1 + ret <4 x i64> %tmp2 +} + +; COST-LABEL: usubl_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16> +; CODE-LABEL: usubl_8h +; CODE: usubl v0.8h, v0.8b, v1.8b +define <8 x i16> @usubl_8h(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = zext <8 x i8> %b to <8 x i16> + %tmp2 = sub <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: usubl_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32> +; CODE-LABEL: usubl_4s +; CODE: usubl v0.4s, v0.4h, v1.4h +define <4 x i32> @usubl_4s(<4 x i16> %a, <4 x i16> %b) { + %tmp0 = zext <4 x i16> %a to <4 x i32> + %tmp1 = zext <4 x i16> %b to <4 x i32> + %tmp2 = sub <4 x i32> %tmp0, %tmp1 + ret <4 x i32> %tmp2 +} + +; COST-LABEL: usubl_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <2 x i32> %b to <2 x i64> +; CODE-LABEL: usubl_2d +; CODE: usubl v0.2d, v0.2s, v1.2s +define <2 x i64> @usubl_2d(<2 x i32> %a, <2 x i32> %b) { + %tmp0 = zext <2 x i32> %a to <2 x i64> + %tmp1 = zext <2 x i32> %b to <2 x i64> + %tmp2 = sub <2 x i64> %tmp0, %tmp1 + ret <2 x i64> %tmp2 +} + +; COST-LABEL: usubl2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <16 x i8> %b to <16 x i16> +; CODE-LABEL: usubl2_8h +; CODE: usubl2 v2.8h, v0.16b, v1.16b +; CODE-NEXT: usubl v0.8h, v0.8b, v1.8b +define <16 x i16> @usubl2_8h(<16 x i8> %a, <16 x i8> %b) { + %tmp0 = zext <16 x i8> %a to <16 x i16> + %tmp1 = zext <16 x i8> %b to <16 x i16> + %tmp2 = sub <16 x i16> %tmp0, %tmp1 + ret <16 x i16> %tmp2 +} + +; COST-LABEL: usubl2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i16> %b to <8 x i32> +; CODE-LABEL: usubl2_4s +; CODE: usubl2 v2.4s, v0.8h, v1.8h +; CODE-NEXT: usubl v0.4s, v0.4h, v1.4h +define <8 x i32> @usubl2_4s(<8 x i16> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i16> %a to <8 x i32> + %tmp1 = zext <8 x i16> %b to <8 x i32> + %tmp2 = sub <8 x i32> %tmp0, %tmp1 + ret <8 x i32> %tmp2 +} + +; COST-LABEL: usubl2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64> +; COST-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i32> %b to <4 x i64> +; CODE-LABEL: usubl2_2d +; CODE: usubl2 v2.2d, v0.4s, v1.4s +; CODE-NEXT: usubl v0.2d, v0.2s, v1.2s +define <4 x i64> @usubl2_2d(<4 x i32> %a, <4 x i32> %b) { + %tmp0 = zext <4 x i32> %a to <4 x i64> + %tmp1 = zext <4 x i32> %b to <4 x i64> + %tmp2 = sub <4 x i64> %tmp0, %tmp1 + ret <4 x i64> %tmp2 +} + +; COST-LABEL: ssubl_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i8> %b to <8 x i16> +; CODE-LABEL: ssubl_8h +; CODE: ssubl v0.8h, v0.8b, v1.8b +define <8 x i16> @ssubl_8h(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + %tmp1 = sext <8 x i8> %b to <8 x i16> + %tmp2 = sub <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: ssubl_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i16> %b to <4 x i32> +; CODE-LABEL: ssubl_4s +; CODE: ssubl v0.4s, v0.4h, v1.4h +define <4 x i32> @ssubl_4s(<4 x i16> %a, <4 x i16> %b) { + %tmp0 = sext <4 x i16> %a to <4 x i32> + %tmp1 = sext <4 x i16> %b to <4 x i32> + %tmp2 = sub <4 x i32> %tmp0, %tmp1 + ret <4 x i32> %tmp2 +} + +; COST-LABEL: ssubl_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <2 x i32> %b to <2 x i64> +; CODE-LABEL: ssubl_2d +; CODE: ssubl v0.2d, v0.2s, v1.2s +define <2 x i64> @ssubl_2d(<2 x i32> %a, <2 x i32> %b) { + %tmp0 = sext <2 x i32> %a to <2 x i64> + %tmp1 = sext <2 x i32> %b to <2 x i64> + %tmp2 = sub <2 x i64> %tmp0, %tmp1 + ret <2 x i64> %tmp2 +} + +; COST-LABEL: ssubl2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <16 x i8> %b to <16 x i16> +; CODE-LABEL: ssubl2_8h +; CODE: ssubl2 v2.8h, v0.16b, v1.16b +; CODE-NEXT: ssubl v0.8h, v0.8b, v1.8b +define <16 x i16> @ssubl2_8h(<16 x i8> %a, <16 x i8> %b) { + %tmp0 = sext <16 x i8> %a to <16 x i16> + %tmp1 = sext <16 x i8> %b to <16 x i16> + %tmp2 = sub <16 x i16> %tmp0, %tmp1 + ret <16 x i16> %tmp2 +} + +; COST-LABEL: ssubl2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <8 x i16> %b to <8 x i32> +; CODE-LABEL: ssubl2_4s +; CODE: ssubl2 v2.4s, v0.8h, v1.8h +; CODE-NEXT: ssubl v0.4s, v0.4h, v1.4h +define <8 x i32> @ssubl2_4s(<8 x i16> %a, <8 x i16> %b) { + %tmp0 = sext <8 x i16> %a to <8 x i32> + %tmp1 = sext <8 x i16> %b to <8 x i32> + %tmp2 = sub <8 x i32> %tmp0, %tmp1 + ret <8 x i32> %tmp2 +} + +; COST-LABEL: ssubl2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = sext <4 x i32> %b to <4 x i64> +; CODE-LABEL: ssubl2_2d +; CODE: ssubl2 v2.2d, v0.4s, v1.4s +; CODE-NEXT: ssubl v0.2d, v0.2s, v1.2s +define <4 x i64> @ssubl2_2d(<4 x i32> %a, <4 x i32> %b) { + %tmp0 = sext <4 x i32> %a to <4 x i64> + %tmp1 = sext <4 x i32> %b to <4 x i64> + %tmp2 = sub <4 x i64> %tmp0, 
%tmp1 + ret <4 x i64> %tmp2 +} + +; COST-LABEL: uaddw_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +; CODE-LABEL: uaddw_8h +; CODE: uaddw v0.8h, v1.8h, v0.8b +define <8 x i16> @uaddw_8h(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = add <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: uaddw_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32> +; CODE-LABEL: uaddw_4s +; CODE: uaddw v0.4s, v1.4s, v0.4h +define <4 x i32> @uaddw_4s(<4 x i16> %a, <4 x i32> %b) { + %tmp0 = zext <4 x i16> %a to <4 x i32> + %tmp1 = add <4 x i32> %b, %tmp0 + ret <4 x i32> %tmp1 +} + +; COST-LABEL: uaddw_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64> +; CODE-LABEL: uaddw_2d +; CODE: uaddw v0.2d, v1.2d, v0.2s +define <2 x i64> @uaddw_2d(<2 x i32> %a, <2 x i64> %b) { + %tmp0 = zext <2 x i32> %a to <2 x i64> + %tmp1 = add <2 x i64> %b, %tmp0 + ret <2 x i64> %tmp1 +} + +; COST-LABEL: uaddw2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16> +; CODE-LABEL: uaddw2_8h +; CODE: uaddw2 v2.8h, v2.8h, v0.16b +; CODE-NEXT: uaddw v0.8h, v1.8h, v0.8b +define <16 x i16> @uaddw2_8h(<16 x i8> %a, <16 x i16> %b) { + %tmp0 = zext <16 x i8> %a to <16 x i16> + %tmp1 = add <16 x i16> %b, %tmp0 + ret <16 x i16> %tmp1 +} + +; COST-LABEL: uaddw2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32> +; CODE-LABEL: uaddw2_4s +; CODE: uaddw2 v2.4s, v2.4s, v0.8h +; CODE-NEXT: uaddw v0.4s, v1.4s, v0.4h +define <8 x i32> @uaddw2_4s(<8 x i16> %a, <8 x i32> %b) { + %tmp0 = zext <8 x i16> %a to <8 x i32> + %tmp1 = add <8 x i32> %b, %tmp0 + ret <8 x i32> %tmp1 +} + +; COST-LABEL: uaddw2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64> +; CODE-LABEL: uaddw2_2d +; CODE: uaddw2 v2.2d, v2.2d, v0.4s +; CODE-NEXT: uaddw v0.2d, v1.2d, v0.2s +define <4 x i64> @uaddw2_2d(<4 x i32> %a, <4 x i64> %b) { + %tmp0 = zext <4 x i32> %a to <4 x i64> + %tmp1 = add <4 x i64> %b, %tmp0 + ret <4 x i64> %tmp1 +} + +; COST-LABEL: saddw_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; CODE-LABEL: saddw_8h +; CODE: saddw v0.8h, v1.8h, v0.8b +define <8 x i16> @saddw_8h(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + %tmp1 = add <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: saddw_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32> +; CODE-LABEL: saddw_4s +; CODE: saddw v0.4s, v1.4s, v0.4h +define <4 x i32> @saddw_4s(<4 x i16> %a, <4 x i32> %b) { + %tmp0 = sext <4 x i16> %a to <4 x i32> + %tmp1 = add <4 x i32> %b, %tmp0 + ret <4 x i32> %tmp1 +} + +; COST-LABEL: saddw_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64> +; CODE-LABEL: saddw_2d +; CODE: saddw v0.2d, v1.2d, v0.2s +define <2 x i64> @saddw_2d(<2 x i32> %a, <2 x i64> %b) { + %tmp0 = sext <2 x i32> %a to <2 x i64> + %tmp1 = add <2 x i64> %b, %tmp0 + ret <2 x i64> %tmp1 +} + +; COST-LABEL: saddw2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16> +; CODE-LABEL: saddw2_8h +; CODE: saddw2 v2.8h, v2.8h, v0.16b +; 
CODE-NEXT: saddw v0.8h, v1.8h, v0.8b +define <16 x i16> @saddw2_8h(<16 x i8> %a, <16 x i16> %b) { + %tmp0 = sext <16 x i8> %a to <16 x i16> + %tmp1 = add <16 x i16> %b, %tmp0 + ret <16 x i16> %tmp1 +} + +; COST-LABEL: saddw2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32> +; CODE-LABEL: saddw2_4s +; CODE: saddw2 v2.4s, v2.4s, v0.8h +; CODE-NEXT: saddw v0.4s, v1.4s, v0.4h +define <8 x i32> @saddw2_4s(<8 x i16> %a, <8 x i32> %b) { + %tmp0 = sext <8 x i16> %a to <8 x i32> + %tmp1 = add <8 x i32> %b, %tmp0 + ret <8 x i32> %tmp1 +} + +; COST-LABEL: saddw2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64> +; CODE-LABEL: saddw2_2d +; CODE: saddw2 v2.2d, v2.2d, v0.4s +; CODE-NEXT: saddw v0.2d, v1.2d, v0.2s +define <4 x i64> @saddw2_2d(<4 x i32> %a, <4 x i64> %b) { + %tmp0 = sext <4 x i32> %a to <4 x i64> + %tmp1 = add <4 x i64> %b, %tmp0 + ret <4 x i64> %tmp1 +} + +; COST-LABEL: usubw_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +; CODE-LABEL: usubw_8h +; CODE: usubw v0.8h, v1.8h, v0.8b +define <8 x i16> @usubw_8h(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = sub <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: usubw_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i32> +; CODE-LABEL: usubw_4s +; CODE: usubw v0.4s, v1.4s, v0.4h +define <4 x i32> @usubw_4s(<4 x i16> %a, <4 x i32> %b) { + %tmp0 = zext <4 x i16> %a to <4 x i32> + %tmp1 = sub <4 x i32> %b, %tmp0 + ret <4 x i32> %tmp1 +} + +; COST-LABEL: usubw_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <2 x i32> %a to <2 x i64> +; CODE-LABEL: usubw_2d +; CODE: usubw v0.2d, v1.2d, v0.2s +define <2 x i64> @usubw_2d(<2 x i32> %a, <2 x i64> %b) { + %tmp0 = zext <2 x i32> %a to <2 x i64> + %tmp1 = sub <2 x i64> %b, %tmp0 + ret <2 x i64> %tmp1 +} + +; COST-LABEL: usubw2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <16 x i8> %a to <16 x i16> +; CODE-LABEL: usubw2_8h +; CODE: usubw2 v2.8h, v2.8h, v0.16b +; CODE-NEXT: usubw v0.8h, v1.8h, v0.8b +define <16 x i16> @usubw2_8h(<16 x i8> %a, <16 x i16> %b) { + %tmp0 = zext <16 x i8> %a to <16 x i16> + %tmp1 = sub <16 x i16> %b, %tmp0 + ret <16 x i16> %tmp1 +} + +; COST-LABEL: usubw2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <8 x i16> %a to <8 x i32> +; CODE-LABEL: usubw2_4s +; CODE: usubw2 v2.4s, v2.4s, v0.8h +; CODE-NEXT: usubw v0.4s, v1.4s, v0.4h +define <8 x i32> @usubw2_4s(<8 x i16> %a, <8 x i32> %b) { + %tmp0 = zext <8 x i16> %a to <8 x i32> + %tmp1 = sub <8 x i32> %b, %tmp0 + ret <8 x i32> %tmp1 +} + +; COST-LABEL: usubw2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = zext <4 x i32> %a to <4 x i64> +; CODE-LABEL: usubw2_2d +; CODE: usubw2 v2.2d, v2.2d, v0.4s +; CODE-NEXT: usubw v0.2d, v1.2d, v0.2s +define <4 x i64> @usubw2_2d(<4 x i32> %a, <4 x i64> %b) { + %tmp0 = zext <4 x i32> %a to <4 x i64> + %tmp1 = sub <4 x i64> %b, %tmp0 + ret <4 x i64> %tmp1 +} + +; COST-LABEL: ssubw_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; CODE-LABEL: ssubw_8h +; CODE: ssubw v0.8h, v1.8h, v0.8b +define <8 x i16> @ssubw_8h(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + 
%tmp1 = sub <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: ssubw_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i16> %a to <4 x i32> +; CODE-LABEL: ssubw_4s +; CODE: ssubw v0.4s, v1.4s, v0.4h +define <4 x i32> @ssubw_4s(<4 x i16> %a, <4 x i32> %b) { + %tmp0 = sext <4 x i16> %a to <4 x i32> + %tmp1 = sub <4 x i32> %b, %tmp0 + ret <4 x i32> %tmp1 +} + +; COST-LABEL: ssubw_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <2 x i32> %a to <2 x i64> +; CODE-LABEL: ssubw_2d +; CODE: ssubw v0.2d, v1.2d, v0.2s +define <2 x i64> @ssubw_2d(<2 x i32> %a, <2 x i64> %b) { + %tmp0 = sext <2 x i32> %a to <2 x i64> + %tmp1 = sub <2 x i64> %b, %tmp0 + ret <2 x i64> %tmp1 +} + +; COST-LABEL: ssubw2_8h +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <16 x i8> %a to <16 x i16> +; CODE-LABEL: ssubw2_8h +; CODE: ssubw2 v2.8h, v2.8h, v0.16b +; CODE-NEXT: ssubw v0.8h, v1.8h, v0.8b +define <16 x i16> @ssubw2_8h(<16 x i8> %a, <16 x i16> %b) { + %tmp0 = sext <16 x i8> %a to <16 x i16> + %tmp1 = sub <16 x i16> %b, %tmp0 + ret <16 x i16> %tmp1 +} + +; COST-LABEL: ssubw2_4s +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <8 x i16> %a to <8 x i32> +; CODE-LABEL: ssubw2_4s +; CODE: ssubw2 v2.4s, v2.4s, v0.8h +; CODE-NEXT: ssubw v0.4s, v1.4s, v0.4h +define <8 x i32> @ssubw2_4s(<8 x i16> %a, <8 x i32> %b) { + %tmp0 = sext <8 x i16> %a to <8 x i32> + %tmp1 = sub <8 x i32> %b, %tmp0 + ret <8 x i32> %tmp1 +} + +; COST-LABEL: ssubw2_2d +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp0 = sext <4 x i32> %a to <4 x i64> +; CODE-LABEL: ssubw2_2d +; CODE: ssubw2 v2.2d, v2.2d, v0.4s +; CODE-NEXT: ssubw v0.2d, v1.2d, v0.2s +define <4 x i64> @ssubw2_2d(<4 x i32> %a, <4 x i64> %b) { + %tmp0 = sext <4 x i32> %a to <4 x i64> + %tmp1 = sub <4 x i64> %b, %tmp0 + ret <4 x i64> %tmp1 +} + +; COST-LABEL: neg_wrong_operand_order +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +define <8 x i16> @neg_wrong_operand_order(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = sub <8 x i16> %tmp0, %b + ret <8 x i16> %tmp1 +} + +; COST-LABEL: neg_non_widening_op +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <8 x i8> %a to <8 x i16> +define <8 x i16> @neg_non_widening_op(<8 x i8> %a, <8 x i16> %b) { + %tmp0 = zext <8 x i8> %a to <8 x i16> + %tmp1 = udiv <8 x i16> %b, %tmp0 + ret <8 x i16> %tmp1 +} + +; COST-LABEL: neg_dissimilar_operand_kind_0 +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = sext <8 x i8> %a to <8 x i16> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <8 x i8> %b to <8 x i16> +define <8 x i16> @neg_dissimilar_operand_kind_0(<8 x i8> %a, <8 x i8> %b) { + %tmp0 = sext <8 x i8> %a to <8 x i16> + %tmp1 = zext <8 x i8> %b to <8 x i16> + %tmp2 = add <8 x i16> %tmp0, %tmp1 + ret <8 x i16> %tmp2 +} + +; COST-LABEL: neg_dissimilar_operand_kind_1 +; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <4 x i8> %a to <4 x i32> +; COST-NEXT: Cost Model: Found an estimated cost of 0 for instruction: %tmp1 = zext <4 x i16> %b to <4 x i32> +define <4 x i32> @neg_dissimilar_operand_kind_1(<4 x i8> %a, <4 x i16> %b) { + %tmp0 = zext <4 x i8> %a to <4 x i32> + %tmp1 = zext <4 x i16> %b to <4 x i32> + %tmp2 = add <4 x i32> %tmp0, %tmp1 + 
ret <4 x i32> %tmp2
+}
+
+; COST-LABEL: neg_illegal_vector_type_0
+; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <16 x i4> %a to <16 x i8>
+define <16 x i8> @neg_illegal_vector_type_0(<16 x i4> %a, <16 x i8> %b) {
+ %tmp0 = zext <16 x i4> %a to <16 x i8>
+ %tmp1 = sub <16 x i8> %b, %tmp0
+ ret <16 x i8> %tmp1
+}
+
+; COST-LABEL: neg_illegal_vector_type_1
+; COST-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %tmp0 = zext <1 x i16> %a to <1 x i32>
+define <1 x i32> @neg_illegal_vector_type_1(<1 x i16> %a, <1 x i32> %b) {
+ %tmp0 = zext <1 x i16> %a to <1 x i32>
+ %tmp1 = add <1 x i32> %b, %tmp0
+ ret <1 x i32> %tmp1
+}
+
+; COST-LABEL: neg_illegal_vector_type_2
+; COST-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = zext <4 x i16> %a to <4 x i64>
+define <4 x i64> @neg_illegal_vector_type_2(<4 x i16> %a, <4 x i64> %b) {
+ %tmp0 = zext <4 x i16> %a to <4 x i64>
+ %tmp1 = add <4 x i64> %b, %tmp0
+ ret <4 x i64> %tmp1
+}
+
+; COST-LABEL: neg_illegal_vector_type_3
+; COST-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %tmp0 = zext <3 x i34> %a to <3 x i68>
+define <3 x i68> @neg_illegal_vector_type_3(<3 x i34> %a, <3 x i68> %b) {
+ %tmp0 = zext <3 x i34> %a to <3 x i68>
+ %tmp1 = add <3 x i68> %b, %tmp0
+ ret <3 x i68> %tmp1
+}
diff --git a/test/Analysis/CostModel/AMDGPU/extractelement.ll b/test/Analysis/CostModel/AMDGPU/extractelement.ll
index 1efbb5873acb..54c8b6c52365 100644
--- a/test/Analysis/CostModel/AMDGPU/extractelement.ll
+++ b/test/Analysis/CostModel/AMDGPU/extractelement.ll
@@ -1,7 +1,9 @@
-; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,VI %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX9 %s
-; CHECK: 'extractelement_v2i32'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i32>
+; GCN: 'extractelement_v2i32'
+; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i32>
 define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) {
 %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr
 %elt = extractelement <2 x i32> %vec, i32 1
@@ -9,8 +11,8 @@ define amdgpu_kernel void @extractelement_v2i32(i32 addrspace(1)* %out, <2 x i32
 ret void
 }
-; CHECK: 'extractelement_v2f32'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x float>
+; GCN: 'extractelement_v2f32'
+; GCN: estimated cost of 0 for {{.*}} extractelement <2 x float>
 define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x float> addrspace(1)* %vaddr) {
 %vec = load <2 x float>, <2 x float> addrspace(1)* %vaddr
 %elt = extractelement <2 x float> %vec, i32 1
@@ -18,8 +20,8 @@ define amdgpu_kernel void @extractelement_v2f32(float addrspace(1)* %out, <2 x f
 ret void
 }
-; CHECK: 'extractelement_v3i32'
-; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i32>
+; GCN: 'extractelement_v3i32'
+; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i32>
 define amdgpu_kernel void @extractelement_v3i32(i32 addrspace(1)* %out,
<3 x i32 ret void } -; CHECK: 'extractelement_v4i32' -; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i32> +; GCN: 'extractelement_v4i32' +; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i32> define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32> addrspace(1)* %vaddr) { %vec = load <4 x i32>, <4 x i32> addrspace(1)* %vaddr %elt = extractelement <4 x i32> %vec, i32 1 @@ -36,8 +38,8 @@ define amdgpu_kernel void @extractelement_v4i32(i32 addrspace(1)* %out, <4 x i32 ret void } -; CHECK: 'extractelement_v8i32' -; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i32> +; GCN: 'extractelement_v8i32' +; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i32> define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr) { %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr %elt = extractelement <8 x i32> %vec, i32 1 @@ -46,8 +48,8 @@ define amdgpu_kernel void @extractelement_v8i32(i32 addrspace(1)* %out, <8 x i32 } ; FIXME: Should be non-0 -; CHECK: 'extractelement_v8i32_dynindex' -; CHECK: estimated cost of 2 for {{.*}} extractelement <8 x i32> +; GCN: 'extractelement_v8i32_dynindex' +; GCN: estimated cost of 2 for {{.*}} extractelement <8 x i32> define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, <8 x i32> addrspace(1)* %vaddr, i32 %idx) { %vec = load <8 x i32>, <8 x i32> addrspace(1)* %vaddr %elt = extractelement <8 x i32> %vec, i32 %idx @@ -55,8 +57,8 @@ define amdgpu_kernel void @extractelement_v8i32_dynindex(i32 addrspace(1)* %out, ret void } -; CHECK: 'extractelement_v2i64' -; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i64> +; GCN: 'extractelement_v2i64' +; GCN: estimated cost of 0 for {{.*}} extractelement <2 x i64> define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64> addrspace(1)* %vaddr) { %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr %elt = extractelement <2 x i64> %vec, i64 1 @@ -64,8 +66,8 @@ define amdgpu_kernel void @extractelement_v2i64(i64 addrspace(1)* %out, <2 x i64 ret void } -; CHECK: 'extractelement_v3i64' -; CHECK: estimated cost of 0 for {{.*}} extractelement <3 x i64> +; GCN: 'extractelement_v3i64' +; GCN: estimated cost of 0 for {{.*}} extractelement <3 x i64> define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64> addrspace(1)* %vaddr) { %vec = load <3 x i64>, <3 x i64> addrspace(1)* %vaddr %elt = extractelement <3 x i64> %vec, i64 1 @@ -73,8 +75,8 @@ define amdgpu_kernel void @extractelement_v3i64(i64 addrspace(1)* %out, <3 x i64 ret void } -; CHECK: 'extractelement_v4i64' -; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i64> +; GCN: 'extractelement_v4i64' +; GCN: estimated cost of 0 for {{.*}} extractelement <4 x i64> define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64> addrspace(1)* %vaddr) { %vec = load <4 x i64>, <4 x i64> addrspace(1)* %vaddr %elt = extractelement <4 x i64> %vec, i64 1 @@ -82,8 +84,8 @@ define amdgpu_kernel void @extractelement_v4i64(i64 addrspace(1)* %out, <4 x i64 ret void } -; CHECK: 'extractelement_v8i64' -; CHECK: estimated cost of 0 for {{.*}} extractelement <8 x i64> +; GCN: 'extractelement_v8i64' +; GCN: estimated cost of 0 for {{.*}} extractelement <8 x i64> define amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64> addrspace(1)* %vaddr) { %vec = load <8 x i64>, <8 x i64> addrspace(1)* %vaddr %elt = extractelement <8 x i64> %vec, i64 1 @@ -91,8 +93,8 @@ define 
amdgpu_kernel void @extractelement_v8i64(i64 addrspace(1)* %out, <8 x i64 ret void } -; CHECK: 'extractelement_v4i8' -; CHECK: estimated cost of 0 for {{.*}} extractelement <4 x i8> +; GCN: 'extractelement_v4i8' +; GCN: estimated cost of 1 for {{.*}} extractelement <4 x i8> define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> addrspace(1)* %vaddr) { %vec = load <4 x i8>, <4 x i8> addrspace(1)* %vaddr %elt = extractelement <4 x i8> %vec, i8 1 @@ -100,11 +102,31 @@ define amdgpu_kernel void @extractelement_v4i8(i8 addrspace(1)* %out, <4 x i8> a ret void } -; CHECK: 'extractelement_v2i16' -; CHECK: estimated cost of 0 for {{.*}} extractelement <2 x i16> -define amdgpu_kernel void @extractelement_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { +; GCN: 'extractelement_0_v2i16': +; CI: estimated cost of 1 for {{.*}} extractelement <2 x i16> %vec, i16 0 +; VI: estimated cost of 0 for {{.*}} extractelement <2 x i16> +; GFX9: estimated cost of 0 for {{.*}} extractelement <2 x i16> +define amdgpu_kernel void @extractelement_0_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr + %elt = extractelement <2 x i16> %vec, i16 0 + store i16 %elt, i16 addrspace(1)* %out + ret void +} + +; GCN: 'extractelement_1_v2i16': +; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16> +define amdgpu_kernel void @extractelement_1_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) { %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr %elt = extractelement <2 x i16> %vec, i16 1 store i16 %elt, i16 addrspace(1)* %out ret void } + +; GCN: 'extractelement_var_v2i16' +; GCN: estimated cost of 1 for {{.*}} extractelement <2 x i16> +define amdgpu_kernel void @extractelement_var_v2i16(i16 addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr, i32 %idx) { + %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr + %elt = extractelement <2 x i16> %vec, i32 %idx + store i16 %elt, i16 addrspace(1)* %out + ret void +} diff --git a/test/Analysis/CostModel/AMDGPU/insertelement.ll b/test/Analysis/CostModel/AMDGPU/insertelement.ll index 6f296a3e7a34..67ab2607acd5 100644 --- a/test/Analysis/CostModel/AMDGPU/insertelement.ll +++ b/test/Analysis/CostModel/AMDGPU/insertelement.ll @@ -1,37 +1,50 @@ -; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa < %s | FileCheck %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa %s | FileCheck -check-prefixes=GCN,CI %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=GCN,VI %s +; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GCN,GFX9 %s -; CHECK: 'insertelement_v2i32' -; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i32> +; GCN-LABEL: 'insertelement_v2i32' +; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i32> define amdgpu_kernel void @insertelement_v2i32(<2 x i32> addrspace(1)* %out, <2 x i32> addrspace(1)* %vaddr) { %vec = load <2 x i32>, <2 x i32> addrspace(1)* %vaddr - %insert = insertelement <2 x i32> %vec, i32 1, i32 123 + %insert = insertelement <2 x i32> %vec, i32 123, i32 1 store <2 x i32> %insert, <2 x i32> addrspace(1)* %out ret void } -; CHECK: 'insertelement_v2i64' -; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i64> +; GCN-LABEL: 'insertelement_v2i64' +; GCN: estimated cost of 0 for {{.*}} insertelement <2 x i64> define amdgpu_kernel void @insertelement_v2i64(<2 x i64> addrspace(1)* %out, <2 x 
i64> addrspace(1)* %vaddr) {
 %vec = load <2 x i64>, <2 x i64> addrspace(1)* %vaddr
- %insert = insertelement <2 x i64> %vec, i64 1, i64 123
+ %insert = insertelement <2 x i64> %vec, i64 123, i64 1
 store <2 x i64> %insert, <2 x i64> addrspace(1)* %out
 ret void
 }
-; CHECK: 'insertelement_v2i16'
-; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i16>
-define amdgpu_kernel void @insertelement_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+; GCN-LABEL: 'insertelement_0_v2i16'
+; CI: estimated cost of 1 for {{.*}} insertelement <2 x i16>
+; VI: estimated cost of 0 for {{.*}} insertelement <2 x i16>
+; GFX9: estimated cost of 0 for {{.*}} insertelement <2 x i16>
+define amdgpu_kernel void @insertelement_0_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
 %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
- %insert = insertelement <2 x i16> %vec, i16 1, i16 123
+ %insert = insertelement <2 x i16> %vec, i16 123, i16 0
 store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
 ret void
 }
-; CHECK: 'insertelement_v2i8'
-; CHECK: estimated cost of 0 for {{.*}} insertelement <2 x i8>
-define amdgpu_kernel void @insertelement_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
+; GCN-LABEL: 'insertelement_1_v2i16'
+; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i16>
+define amdgpu_kernel void @insertelement_1_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %insert = insertelement <2 x i16> %vec, i16 123, i16 1
+ store <2 x i16> %insert, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN-LABEL: 'insertelement_1_v2i8'
+; GCN: estimated cost of 1 for {{.*}} insertelement <2 x i8>
+define amdgpu_kernel void @insertelement_1_v2i8(<2 x i8> addrspace(1)* %out, <2 x i8> addrspace(1)* %vaddr) {
 %vec = load <2 x i8>, <2 x i8> addrspace(1)* %vaddr
- %insert = insertelement <2 x i8> %vec, i8 1, i8 123
+ %insert = insertelement <2 x i8> %vec, i8 123, i8 1
 store <2 x i8> %insert, <2 x i8> addrspace(1)* %out
 ret void
 }
diff --git a/test/Analysis/CostModel/AMDGPU/shufflevector.ll b/test/Analysis/CostModel/AMDGPU/shufflevector.ll
new file mode 100644
index 000000000000..cc756c82fed3
--- /dev/null
+++ b/test/Analysis/CostModel/AMDGPU/shufflevector.ll
@@ -0,0 +1,43 @@
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 %s | FileCheck -check-prefixes=GFX9,GCN %s
+; RUN: opt -cost-model -analyze -mtriple=amdgcn-unknown-amdhsa -mcpu=fiji %s | FileCheck -check-prefixes=VI,GCN %s
+
+; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
+define amdgpu_kernel void @shufflevector_00_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> zeroinitializer
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
+define amdgpu_kernel void @shufflevector_01_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 0, i32 1>
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+define amdgpu_kernel void
@shufflevector_10_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 0>
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GFX9: estimated cost of 0 for {{.*}} shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+define amdgpu_kernel void @shufflevector_11_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr) {
+ %vec = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr
+ %shuf = shufflevector <2 x i16> %vec, <2 x i16> undef, <2 x i32> <i32 1, i32 1>
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
+
+; GCN: estimated cost of 2 for {{.*}} shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+define amdgpu_kernel void @shufflevector_02_v2i16(<2 x i16> addrspace(1)* %out, <2 x i16> addrspace(1)* %vaddr0, <2 x i16> addrspace(1)* %vaddr1) {
+ %vec0 = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr0
+ %vec1 = load <2 x i16>, <2 x i16> addrspace(1)* %vaddr1
+ %shuf = shufflevector <2 x i16> %vec0, <2 x i16> %vec1, <2 x i32> <i32 0, i32 2>
+ store <2 x i16> %shuf, <2 x i16> addrspace(1)* %out
+ ret void
+}
diff --git a/test/Analysis/CostModel/X86/div.ll b/test/Analysis/CostModel/X86/div.ll
index 0ac06ff75ebe..dabaaef3596a 100644
--- a/test/Analysis/CostModel/X86/div.ll
+++ b/test/Analysis/CostModel/X86/div.ll
@@ -139,14 +139,14 @@ define i32 @sdiv_uniformconst() {
 ; SSE2: cost of 38 {{.*}} %V8i32 = sdiv
 ; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv
 ; SSE42: cost of 30 {{.*}} %V8i32 = sdiv
- ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv
+ ; AVX1: cost of 32 {{.*}} %V8i32 = sdiv
 ; AVX2: cost of 15 {{.*}} %V8i32 = sdiv
 ; AVX512: cost of 15 {{.*}} %V8i32 = sdiv
 %V8i32 = sdiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 ; SSE2: cost of 76 {{.*}} %V16i32 = sdiv
 ; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv
 ; SSE42: cost of 60 {{.*}} %V16i32 = sdiv
- ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv
+ ; AVX1: cost of 64 {{.*}} %V16i32 = sdiv
 ; AVX2: cost of 30 {{.*}} %V16i32 = sdiv
 ; AVX512: cost of 15 {{.*}} %V16i32 = sdiv
 %V16i32 = sdiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
@@ -157,12 +157,12 @@ define i32 @sdiv_uniformconst() {
 ; AVX: cost of 6 {{.*}} %V8i16 = sdiv
 %V8i16 = sdiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE: cost of 12 {{.*}} %V16i16 = sdiv
- ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv
+ ; AVX1: cost of 14 {{.*}} %V16i16 = sdiv
 ; AVX2: cost of 6 {{.*}} %V16i16 = sdiv
 ; AVX512: cost of 6 {{.*}} %V16i16 = sdiv
 %V16i16 = sdiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE: cost of 24 {{.*}} %V32i16 = sdiv
- ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv
+ ; AVX1: cost of 28 {{.*}} %V32i16 = sdiv
 ; AVX2: cost of 12 {{.*}} %V32i16 = sdiv
 ; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv
 ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv
@@ -203,12 +203,12 @@ define i32 @udiv_uniformconst() {
 ; AVX: cost of 15 {{.*}} %V4i32 = udiv
 %V4i32 = udiv <4 x i32> undef, <i32 7, i32 7, i32 7, i32 7>
 ; SSE: cost of 30 {{.*}} %V8i32 = udiv
- ; AVX1: cost of 30 {{.*}} %V8i32 = udiv
+ ; AVX1: cost of 32 {{.*}} %V8i32 = udiv
 ; AVX2: cost of 15 {{.*}} %V8i32 = udiv
 ; AVX512: cost of 15 {{.*}} %V8i32 = udiv
 %V8i32 = udiv <8 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
 ; SSE: cost of 60 {{.*}} %V16i32 = udiv
- ; AVX1: cost of 60 {{.*}} %V16i32 = udiv
+ ; AVX1: cost of 64 {{.*}} %V16i32 = udiv
 ; AVX2: cost of 30 {{.*}} %V16i32 = udiv
 ; AVX512: cost of 15 {{.*}} %V16i32 = udiv
 %V16i32 = udiv <16 x i32> undef, <i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7, i32 7>
@@ -219,12 +219,12 @@ define i32 @udiv_uniformconst() {
 ; AVX: cost of 6 {{.*}} %V8i16 = udiv
 %V8i16 = udiv <8 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE: cost of 12 {{.*}} %V16i16 = udiv
- ; AVX1: cost of 12 {{.*}} %V16i16 = udiv
+ ; AVX1: cost of 14
{{.*}} %V16i16 = udiv
 ; AVX2: cost of 6 {{.*}} %V16i16 = udiv
 ; AVX512: cost of 6 {{.*}} %V16i16 = udiv
 %V16i16 = udiv <16 x i16> undef, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
 ; SSE: cost of 24 {{.*}} %V32i16 = udiv
- ; AVX1: cost of 24 {{.*}} %V32i16 = udiv
+ ; AVX1: cost of 28 {{.*}} %V32i16 = udiv
 ; AVX2: cost of 12 {{.*}} %V32i16 = udiv
 ; AVX512F: cost of 12 {{.*}} %V32i16 = udiv
 ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv
@@ -269,14 +269,14 @@ define i32 @sdiv_uniformconstpow2() {
 ; SSE2: cost of 38 {{.*}} %V8i32 = sdiv
 ; SSSE3: cost of 38 {{.*}} %V8i32 = sdiv
 ; SSE42: cost of 30 {{.*}} %V8i32 = sdiv
- ; AVX1: cost of 30 {{.*}} %V8i32 = sdiv
+ ; AVX1: cost of 32 {{.*}} %V8i32 = sdiv
 ; AVX2: cost of 15 {{.*}} %V8i32 = sdiv
 ; AVX512: cost of 15 {{.*}} %V8i32 = sdiv
 %V8i32 = sdiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; SSE2: cost of 76 {{.*}} %V16i32 = sdiv
 ; SSSE3: cost of 76 {{.*}} %V16i32 = sdiv
 ; SSE42: cost of 60 {{.*}} %V16i32 = sdiv
- ; AVX1: cost of 60 {{.*}} %V16i32 = sdiv
+ ; AVX1: cost of 64 {{.*}} %V16i32 = sdiv
 ; AVX2: cost of 30 {{.*}} %V16i32 = sdiv
 ; AVX512: cost of 15 {{.*}} %V16i32 = sdiv
 %V16i32 = sdiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -287,12 +287,12 @@ define i32 @sdiv_uniformconstpow2() {
 ; AVX: cost of 6 {{.*}} %V8i16 = sdiv
 %V8i16 = sdiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; SSE: cost of 12 {{.*}} %V16i16 = sdiv
- ; AVX1: cost of 12 {{.*}} %V16i16 = sdiv
+ ; AVX1: cost of 14 {{.*}} %V16i16 = sdiv
 ; AVX2: cost of 6 {{.*}} %V16i16 = sdiv
 ; AVX512: cost of 6 {{.*}} %V16i16 = sdiv
 %V16i16 = sdiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; SSE: cost of 24 {{.*}} %V32i16 = sdiv
- ; AVX1: cost of 24 {{.*}} %V32i16 = sdiv
+ ; AVX1: cost of 28 {{.*}} %V32i16 = sdiv
 ; AVX2: cost of 12 {{.*}} %V32i16 = sdiv
 ; AVX512F: cost of 12 {{.*}} %V32i16 = sdiv
 ; AVX512BW: cost of 6 {{.*}} %V32i16 = sdiv
@@ -333,12 +333,12 @@ define i32 @udiv_uniformconstpow2() {
 ; AVX: cost of 15 {{.*}} %V4i32 = udiv
 %V4i32 = udiv <4 x i32> undef, <i32 16, i32 16, i32 16, i32 16>
 ; SSE: cost of 30 {{.*}} %V8i32 = udiv
- ; AVX1: cost of 30 {{.*}} %V8i32 = udiv
+ ; AVX1: cost of 32 {{.*}} %V8i32 = udiv
 ; AVX2: cost of 15 {{.*}} %V8i32 = udiv
 ; AVX512: cost of 15 {{.*}} %V8i32 = udiv
 %V8i32 = udiv <8 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; SSE: cost of 60 {{.*}} %V16i32 = udiv
- ; AVX1: cost of 60 {{.*}} %V16i32 = udiv
+ ; AVX1: cost of 64 {{.*}} %V16i32 = udiv
 ; AVX2: cost of 30 {{.*}} %V16i32 = udiv
 ; AVX512: cost of 15 {{.*}} %V16i32 = udiv
 %V16i32 = udiv <16 x i32> undef, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
@@ -349,12 +349,12 @@ define i32 @udiv_uniformconstpow2() {
 ; AVX: cost of 6 {{.*}} %V8i16 = udiv
 %V8i16 = udiv <8 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; SSE: cost of 12 {{.*}} %V16i16 = udiv
- ; AVX1: cost of 12 {{.*}} %V16i16 = udiv
+ ; AVX1: cost of 14 {{.*}} %V16i16 = udiv
 ; AVX2: cost of 6 {{.*}} %V16i16 = udiv
 ; AVX512: cost of 6 {{.*}} %V16i16 = udiv
 %V16i16 = udiv <16 x i16> undef, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
 ; SSE: cost of 24 {{.*}} %V32i16 = udiv
- ; AVX1: cost of 24 {{.*}} %V32i16 = udiv
+ ; AVX1: cost of 28 {{.*}} %V32i16 = udiv
 ; AVX2: cost of 12 {{.*}} %V32i16 = udiv
 ; AVX512F: cost of 12 {{.*}} %V32i16 = udiv
 ; AVX512BW: cost of 6 {{.*}} %V32i16 = udiv
diff --git a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
index a23b13fb2e25..eabc2330ddc6 100644
--- a/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
+++ b/test/Analysis/CostModel/X86/vshift-ashr-cost.ll
@@ -33,10 +33,10 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) {
 ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i64':
 ; SSE2: Found an estimated cost of 24 for instruction: %shift
 ; SSE41: Found an estimated cost of 24 for instruction: %shift
-; AVX: Found an estimated cost of 24 for instruction: %shift
+;
AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = ashr <4 x i64> %a, %b ret <4 x i64> %shift } @@ -45,10 +45,10 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64': ; SSE2: Found an estimated cost of 48 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 8 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = ashr <8 x i64> %a, %b ret <8 x i64> %shift } @@ -70,10 +70,10 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift -; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <8 x i32> %a, %b ret <8 x i32> %shift @@ -83,10 +83,10 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i32> %a, %b ret <16 x i32> %shift @@ -109,11 +109,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 28 for instruction: %shift -; AVX: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 30 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; AVX512F: Found an estimated cost of 10 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -122,11 +122,11 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': ; SSE2: Found an estimated cost of 
128 for instruction: %shift ; SSE41: Found an estimated cost of 56 for instruction: %shift -; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 60 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = ashr <32 x i16> %a, %b ret <32 x i16> %shift } @@ -147,11 +147,11 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i8': ; SSE2: Found an estimated cost of 108 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 50 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift ; AVX512F: Found an estimated cost of 24 for instruction: %shift ; AVX512BW: Found an estimated cost of 24 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift } @@ -160,11 +160,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': ; SSE2: Found an estimated cost of 216 for instruction: %shift ; SSE41: Found an estimated cost of 96 for instruction: %shift -; AVX: Found an estimated cost of 96 for instruction: %shift +; AVX: Found an estimated cost of 100 for instruction: %shift ; AVX2: Found an estimated cost of 48 for instruction: %shift ; AVX512F: Found an estimated cost of 48 for instruction: %shift ; AVX512BW: Found an estimated cost of 24 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = ashr <64 x i8> %a, %b ret <64 x i8> %shift } @@ -191,11 +191,10 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i64': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift -; AVX2: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift +; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift -; XOPAVX2: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %insert = insertelement <4 x i64> undef, i64 %b, i32 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer %shift = ashr <4 x i64> %a, %splat @@ -206,11 +205,10 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift -; AVX2: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift +; AVX2: Found 
an estimated cost of 8 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift -; XOPAVX2: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %insert = insertelement <8 x i64> undef, i64 %b, i32 0 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer %shift = ashr <8 x i64> %a, %splat @@ -235,10 +233,10 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <8 x i32> undef, i32 %b, i32 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer @@ -250,10 +248,10 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <16 x i32> undef, i32 %b, i32 0 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer @@ -279,10 +277,10 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i16': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <16 x i16> undef, i16 %b, i32 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer @@ -294,11 +292,11 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost 
of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -324,10 +322,10 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i8': ; SSE2: Found an estimated cost of 108 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 50 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift ; AVX512: Found an estimated cost of 24 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %insert = insertelement <32 x i8> undef, i8 %b, i32 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer %shift = ashr <32 x i8> %a, %splat @@ -338,11 +336,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': ; SSE2: Found an estimated cost of 216 for instruction: %shift ; SSE41: Found an estimated cost of 96 for instruction: %shift -; AVX: Found an estimated cost of 96 for instruction: %shift +; AVX: Found an estimated cost of 100 for instruction: %shift ; AVX2: Found an estimated cost of 48 for instruction: %shift ; AVX512F: Found an estimated cost of 48 for instruction: %shift ; AVX512BW: Found an estimated cost of 24 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer %shift = ashr <64 x i8> %a, %splat @@ -369,10 +367,10 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i64': ; SSE2: Found an estimated cost of 24 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift -; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = ashr <4 x i64> %a, ret <4 x i64> %shift } @@ -381,10 +379,10 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': ; SSE2: Found an estimated cost of 48 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 8 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = ashr <8 x i64> %a, ret <8 x i64> %shift } @@ -406,10 +404,10 @@ 
define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift -; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <8 x i32> %a, ret <8 x i32> %shift @@ -419,10 +417,10 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i32> %a, ret <16 x i32> %shift @@ -445,11 +443,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 28 for instruction: %shift -; AVX: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 30 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; AVX512F: Found an estimated cost of 10 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = ashr <16 x i16> %a, ret <16 x i16> %shift } @@ -458,11 +456,11 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i16': ; SSE2: Found an estimated cost of 128 for instruction: %shift ; SSE41: Found an estimated cost of 56 for instruction: %shift -; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 60 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = ashr <32 x i16> %a, ret <32 x i16> %shift } @@ -483,10 +481,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i8': ; SSE2: Found an estimated cost of 108 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 50 for instruction: %shift ; AVX2: Found an estimated cost of 24 for instruction: %shift ; AVX512: Found an estimated cost of 24 for instruction: 
%shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = ashr <32 x i8> %a, ret <32 x i8> %shift } @@ -495,11 +493,11 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': ; SSE2: Found an estimated cost of 216 for instruction: %shift ; SSE41: Found an estimated cost of 96 for instruction: %shift -; AVX: Found an estimated cost of 96 for instruction: %shift +; AVX: Found an estimated cost of 100 for instruction: %shift ; AVX2: Found an estimated cost of 48 for instruction: %shift ; AVX512F: Found an estimated cost of 48 for instruction: %shift ; AVX512BW: Found an estimated cost of 24 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = ashr <64 x i8> %a, ret <64 x i8> %shift } @@ -524,10 +522,11 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i64': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift +; XOPAVX2: Found an estimated cost of 4 for instruction: %shift %shift = ashr <4 x i64> %a, ret <4 x i64> %shift } @@ -536,10 +535,11 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 8 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift +; XOPAVX2: Found an estimated cost of 8 for instruction: %shift %shift = ashr <8 x i64> %a, ret <8 x i64> %shift } @@ -560,10 +560,10 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i32': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <8 x i32> %a, ret <8 x i32> %shift @@ -573,10 +573,10 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated 
cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i32> %a, ret <16 x i32> %shift @@ -598,10 +598,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i16': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = ashr <16 x i16> %a, ret <16 x i16> %shift @@ -611,11 +611,11 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = ashr <32 x i16> %a, ret <32 x i16> %shift @@ -628,7 +628,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512: Found an estimated cost of 4 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 2 for instruction: %shift %shift = ashr <16 x i8> %a, ret <16 x i8> %shift } @@ -637,10 +637,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512: Found an estimated cost of 4 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 4 for instruction: %shift %shift = ashr <32 x i8> %a, ret <32 x i8> %shift @@ -650,11 +650,11 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift 
-; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 8 for instruction: %shift ; AVX512F: Found an estimated cost of 8 for instruction: %shift ; AVX512BW: Found an estimated cost of 4 for instruction: %shift -; XOPAVX: Found an estimated cost of 16 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 8 for instruction: %shift %shift = ashr <64 x i8> %a, ret <64 x i8> %shift diff --git a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll index 546b2bb50f26..6e890369d677 100644 --- a/test/Analysis/CostModel/X86/vshift-lshr-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-lshr-cost.ll @@ -34,10 +34,10 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i64': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, %b ret <4 x i64> %shift @@ -47,10 +47,10 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i64> %a, %b ret <8 x i64> %shift @@ -73,10 +73,10 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 22 for instruction: %shift -; AVX: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, %b ret <8 x i32> %shift @@ -86,10 +86,10 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 44 for instruction: %shift -; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 2 for 
instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i32> %a, %b ret <16 x i32> %shift @@ -112,11 +112,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 28 for instruction: %shift -; AVX: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 30 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; AVX512F: Found an estimated cost of 10 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -125,11 +125,11 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': ; SSE2: Found an estimated cost of 128 for instruction: %shift ; SSE41: Found an estimated cost of 56 for instruction: %shift -; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 60 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = lshr <32 x i16> %a, %b ret <32 x i16> %shift } @@ -150,10 +150,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift -; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; AVX512: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift } @@ -162,11 +162,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': ; SSE2: Found an estimated cost of 104 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift ; AVX512BW: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = lshr <64 x i8> %a, %b ret <64 x i8> %shift } @@ -193,10 +193,10 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i64': ; SSE2: Found an estimated cost of 2 for instruction: 
%shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <4 x i64> undef, i64 %b, i32 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer @@ -208,10 +208,10 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <8 x i64> undef, i64 %b, i32 0 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer @@ -237,10 +237,10 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <8 x i32> undef, i32 %b, i32 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer @@ -252,10 +252,10 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <16 x i32> undef, i32 %b, i32 0 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer @@ -281,10 +281,10 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i16': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: 
%shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <16 x i16> undef, i16 %b, i32 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer @@ -296,11 +296,11 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -326,10 +326,10 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift -; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; AVX512: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %insert = insertelement <32 x i8> undef, i8 %b, i32 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer %shift = lshr <32 x i8> %a, %splat @@ -340,11 +340,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': ; SSE2: Found an estimated cost of 104 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift ; AVX512BW: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer %shift = lshr <64 x i8> %a, %splat @@ -372,10 +372,10 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i64': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for 
instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, ret <4 x i64> %shift @@ -385,10 +385,10 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <8 x i64> %a, ret <8 x i64> %shift @@ -411,10 +411,10 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i32': ; SSE2: Found an estimated cost of 32 for instruction: %shift ; SSE41: Found an estimated cost of 22 for instruction: %shift -; AVX: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, ret <8 x i32> %shift @@ -424,10 +424,10 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i32': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 44 for instruction: %shift -; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i32> %a, ret <16 x i32> %shift @@ -450,11 +450,11 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 28 for instruction: %shift -; AVX: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 30 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; AVX512F: Found an estimated cost of 10 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = lshr <16 x i16> %a, ret <16 x i16> %shift } @@ -463,11 +463,11 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; CHECK: 'Cost Model Analysis' for 
function 'constant_shift_v32i16': ; SSE2: Found an estimated cost of 128 for instruction: %shift ; SSE41: Found an estimated cost of 56 for instruction: %shift -; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 60 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = lshr <32 x i16> %a, ret <32 x i16> %shift } @@ -488,10 +488,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift ; SSE41: Found an estimated cost of 24 for instruction: %shift -; AVX: Found an estimated cost of 24 for instruction: %shift +; AVX: Found an estimated cost of 26 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; AVX512: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 6 for instruction: %shift %shift = lshr <32 x i8> %a, ret <32 x i8> %shift } @@ -500,11 +500,11 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': ; SSE2: Found an estimated cost of 104 for instruction: %shift ; SSE41: Found an estimated cost of 48 for instruction: %shift -; AVX: Found an estimated cost of 48 for instruction: %shift +; AVX: Found an estimated cost of 52 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift ; AVX512BW: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 8 for instruction: %shift +; XOP: Found an estimated cost of 12 for instruction: %shift %shift = lshr <64 x i8> %a, ret <64 x i8> %shift } @@ -529,10 +529,10 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i64': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <4 x i64> %a, ret <4 x i64> %shift @@ -542,10 +542,10 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated 
cost of 2 for instruction: %shift %shift = lshr <8 x i64> %a, ret <8 x i64> %shift @@ -567,10 +567,10 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i32': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <8 x i32> %a, ret <8 x i32> %shift @@ -580,10 +580,10 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <16 x i32> %a, ret <16 x i32> %shift @@ -605,10 +605,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i16': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = lshr <16 x i16> %a, ret <16 x i16> %shift @@ -618,11 +618,11 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <32 x i16> %a, ret <32 x i16> %shift @@ -644,10 +644,10 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for 
instruction: %shift +; AVX: Found an estimated cost of 6 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 2 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 6 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = lshr <32 x i8> %a, ret <32 x i8> %shift @@ -657,11 +657,11 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512F: Found an estimated cost of 4 for instruction: %shift ; AVX512BW: Found an estimated cost of 2 for instruction: %shift -; XOPAVX: Found an estimated cost of 8 for instruction: %shift +; XOPAVX: Found an estimated cost of 12 for instruction: %shift ; XOPAVX2: Found an estimated cost of 4 for instruction: %shift %shift = lshr <64 x i8> %a, ret <64 x i8> %shift diff --git a/test/Analysis/CostModel/X86/vshift-shl-cost.ll b/test/Analysis/CostModel/X86/vshift-shl-cost.ll index 90356f5ce8be..5e604bb7983e 100644 --- a/test/Analysis/CostModel/X86/vshift-shl-cost.ll +++ b/test/Analysis/CostModel/X86/vshift-shl-cost.ll @@ -35,10 +35,10 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v4i64': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, %b ret <4 x i64> %shift @@ -48,10 +48,10 @@ define <8 x i64> @var_shift_v8i64(<8 x i64> %a, <8 x i64> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i64': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <8 x i64> %a, %b ret <8 x i64> %shift @@ -74,10 +74,10 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v8i32': ; SSE2: Found an estimated cost of 20 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for 
instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, %b ret <8 x i32> %shift @@ -87,10 +87,10 @@ define <16 x i32> @var_shift_v16i32(<16 x i32> %a, <16 x i32> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i32': ; SSE2: Found an estimated cost of 40 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <16 x i32> %a, %b ret <16 x i32> %shift @@ -113,11 +113,11 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v16i16': ; SSE2: Found an estimated cost of 64 for instruction: %shift ; SSE41: Found an estimated cost of 28 for instruction: %shift -; AVX: Found an estimated cost of 28 for instruction: %shift +; AVX: Found an estimated cost of 30 for instruction: %shift ; AVX2: Found an estimated cost of 10 for instruction: %shift ; AVX512F: Found an estimated cost of 10 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift } @@ -126,11 +126,11 @@ define <32 x i16> @var_shift_v32i16(<32 x i16> %a, <32 x i16> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i16': ; SSE2: Found an estimated cost of 128 for instruction: %shift ; SSE41: Found an estimated cost of 56 for instruction: %shift -; AVX: Found an estimated cost of 56 for instruction: %shift +; AVX: Found an estimated cost of 60 for instruction: %shift ; AVX2: Found an estimated cost of 20 for instruction: %shift ; AVX512F: Found an estimated cost of 20 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift %shift = shl <32 x i16> %a, %b ret <32 x i16> %shift } @@ -151,10 +151,10 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift ; SSE41: Found an estimated cost of 22 for instruction: %shift -; AVX: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; AVX512: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift %shift = shl <32 x i8> %a, %b ret <32 x i8> %shift } @@ -163,11 +163,11 @@ define <64 x i8> @var_shift_v64i8(<64 x i8> %a, <64 x i8> %b) { ; CHECK: 'Cost Model Analysis' for function 'var_shift_v64i8': ; SSE2: Found an estimated cost of 104 for instruction: %shift ; SSE41: Found an estimated cost of 44 for instruction: %shift -; 
AVX: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift ; AVX512BW: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift %shift = shl <64 x i8> %a, %b ret <64 x i8> %shift } @@ -194,10 +194,10 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, i64 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v4i64': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <4 x i64> undef, i64 %b, i32 0 %splat = shufflevector <4 x i64> %insert, <4 x i64> undef, <4 x i32> zeroinitializer @@ -209,10 +209,10 @@ define <8 x i64> @splatvar_shift_v8i64(<8 x i64> %a, i64 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i64': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <8 x i64> undef, i64 %b, i32 0 %splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer @@ -238,10 +238,10 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, i32 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v8i32': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <8 x i32> undef, i32 %b, i32 0 %splat = shufflevector <8 x i32> %insert, <8 x i32> undef, <8 x i32> zeroinitializer @@ -253,10 +253,10 @@ define <16 x i32> @splatvar_shift_v16i32(<16 x i32> %a, i32 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i32': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 
for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <16 x i32> undef, i32 %b, i32 0 %splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer @@ -282,10 +282,10 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, i16 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v16i16': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %insert = insertelement <16 x i16> undef, i16 %b, i32 0 %splat = shufflevector <16 x i16> %insert, <16 x i16> undef, <16 x i32> zeroinitializer @@ -297,11 +297,11 @@ define <32 x i16> @splatvar_shift_v32i16(<32 x i16> %a, i16 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i16': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %insert = insertelement <32 x i16> undef, i16 %b, i32 0 %splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer @@ -327,10 +327,10 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, i8 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift ; SSE41: Found an estimated cost of 22 for instruction: %shift -; AVX: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; AVX512: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift %insert = insertelement <32 x i8> undef, i8 %b, i32 0 %splat = shufflevector <32 x i8> %insert, <32 x i8> undef, <32 x i32> zeroinitializer %shift = shl <32 x i8> %a, %splat @@ -341,11 +341,11 @@ define <64 x i8> @splatvar_shift_v64i8(<64 x i8> %a, i8 %b) { ; CHECK: 'Cost Model Analysis' for function 'splatvar_shift_v64i8': ; SSE2: Found an estimated cost of 104 for instruction: %shift ; SSE41: Found an estimated cost of 44 for instruction: %shift -; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift ; AVX512BW: Found an estimated cost of 11 for 
instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift %insert = insertelement <64 x i8> undef, i8 %b, i32 0 %splat = shufflevector <64 x i8> %insert, <64 x i8> undef, <64 x i32> zeroinitializer %shift = shl <64 x i8> %a, %splat @@ -373,10 +373,10 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v4i64': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 10 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, ret <4 x i64> %shift @@ -386,10 +386,10 @@ define <8 x i64> @constant_shift_v8i64(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v8i64': ; SSE2: Found an estimated cost of 16 for instruction: %shift ; SSE41: Found an estimated cost of 16 for instruction: %shift -; AVX: Found an estimated cost of 16 for instruction: %shift +; AVX: Found an estimated cost of 20 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <8 x i64> %a, ret <8 x i64> %shift @@ -415,7 +415,7 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) { ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, ret <8 x i32> %shift @@ -428,7 +428,7 @@ define <16 x i32> @constant_shift_v16i32(<16 x i32> %a) { ; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <16 x i32> %a, ret <16 x i32> %shift @@ -453,7 +453,7 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) { ; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -467,7 +467,7 @@ define <32 x i16> @constant_shift_v32i16(<32 x i16> %a) { ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: 
Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -489,10 +489,10 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v32i8': ; SSE2: Found an estimated cost of 52 for instruction: %shift ; SSE41: Found an estimated cost of 22 for instruction: %shift -; AVX: Found an estimated cost of 22 for instruction: %shift +; AVX: Found an estimated cost of 24 for instruction: %shift ; AVX2: Found an estimated cost of 11 for instruction: %shift ; AVX512: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 4 for instruction: %shift %shift = shl <32 x i8> %a, ret <32 x i8> %shift } @@ -501,11 +501,11 @@ define <64 x i8> @constant_shift_v64i8(<64 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'constant_shift_v64i8': ; SSE2: Found an estimated cost of 104 for instruction: %shift ; SSE41: Found an estimated cost of 44 for instruction: %shift -; AVX: Found an estimated cost of 44 for instruction: %shift +; AVX: Found an estimated cost of 48 for instruction: %shift ; AVX2: Found an estimated cost of 22 for instruction: %shift ; AVX512F: Found an estimated cost of 22 for instruction: %shift ; AVX512BW: Found an estimated cost of 11 for instruction: %shift -; XOP: Found an estimated cost of 4 for instruction: %shift +; XOP: Found an estimated cost of 8 for instruction: %shift %shift = shl <64 x i8> %a, ret <64 x i8> %shift } @@ -531,10 +531,10 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v4i64': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <4 x i64> %a, ret <4 x i64> %shift @@ -544,10 +544,10 @@ define <8 x i64> @splatconstant_shift_v8i64(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i64': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <8 x i64> %a, ret <8 x i64> %shift @@ -570,10 +570,10 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v8i32': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift 
-; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <8 x i32> %a, ret <8 x i32> %shift @@ -583,10 +583,10 @@ define <16 x i32> @splatconstant_shift_v16i32(<16 x i32> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i32': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <16 x i32> %a, ret <16 x i32> %shift @@ -608,10 +608,10 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v16i16': ; SSE2: Found an estimated cost of 2 for instruction: %shift ; SSE41: Found an estimated cost of 2 for instruction: %shift -; AVX: Found an estimated cost of 2 for instruction: %shift +; AVX: Found an estimated cost of 4 for instruction: %shift ; AVX2: Found an estimated cost of 1 for instruction: %shift ; AVX512: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 2 for instruction: %shift +; XOPAVX: Found an estimated cost of 4 for instruction: %shift ; XOPAVX2: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i16> %a, ret <16 x i16> %shift @@ -621,11 +621,11 @@ define <32 x i16> @splatconstant_shift_v32i16(<32 x i16> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i16': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 8 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512F: Found an estimated cost of 2 for instruction: %shift ; AVX512BW: Found an estimated cost of 1 for instruction: %shift -; XOPAVX: Found an estimated cost of 4 for instruction: %shift +; XOPAVX: Found an estimated cost of 8 for instruction: %shift ; XOPAVX2: Found an estimated cost of 2 for instruction: %shift %shift = shl <32 x i16> %a, ret <32 x i16> %shift @@ -638,7 +638,7 @@ define <16 x i8> @splatconstant_shift_v16i8(<16 x i8> %a) { ; AVX: Found an estimated cost of 2 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 2 for instruction: %shift -; XOP: Found an estimated cost of 2 for instruction: %shift +; XOP: Found an estimated cost of 1 for instruction: %shift %shift = shl <16 x i8> %a, ret <16 x i8> %shift } @@ -647,7 +647,7 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v32i8': ; SSE2: Found an estimated cost of 4 for instruction: %shift ; SSE41: Found an estimated cost of 4 for 
instruction: %shift -; AVX: Found an estimated cost of 4 for instruction: %shift +; AVX: Found an estimated cost of 6 for instruction: %shift ; AVX2: Found an estimated cost of 2 for instruction: %shift ; AVX512: Found an estimated cost of 2 for instruction: %shift ; XOPAVX: Found an estimated cost of 4 for instruction: %shift @@ -660,7 +660,7 @@ define <64 x i8> @splatconstant_shift_v64i8(<64 x i8> %a) { ; CHECK: 'Cost Model Analysis' for function 'splatconstant_shift_v64i8': ; SSE2: Found an estimated cost of 8 for instruction: %shift ; SSE41: Found an estimated cost of 8 for instruction: %shift -; AVX: Found an estimated cost of 8 for instruction: %shift +; AVX: Found an estimated cost of 12 for instruction: %shift ; AVX2: Found an estimated cost of 4 for instruction: %shift ; AVX512F: Found an estimated cost of 4 for instruction: %shift ; AVX512BW: Found an estimated cost of 2 for instruction: %shift @@ -761,7 +761,7 @@ define <16 x i16> @test6(<16 x i16> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shl ; AVX: Found an estimated cost of 4 for instruction: %shl ; AVX2: Found an estimated cost of 1 for instruction: %shl -; XOPAVX: Found an estimated cost of 2 for instruction: %shl +; XOPAVX: Found an estimated cost of 4 for instruction: %shl ; XOPAVX2: Found an estimated cost of 1 for instruction: %shl @@ -778,7 +778,7 @@ define <8 x i32> @test7(<8 x i32> %a) { ; SSE41: Found an estimated cost of 2 for instruction: %shl ; AVX: Found an estimated cost of 4 for instruction: %shl ; AVX2: Found an estimated cost of 1 for instruction: %shl -; XOPAVX: Found an estimated cost of 2 for instruction: %shl +; XOPAVX: Found an estimated cost of 4 for instruction: %shl ; XOPAVX2: Found an estimated cost of 1 for instruction: %shl @@ -794,9 +794,9 @@ define <4 x i64> @test8(<4 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'test8': ; SSE2: Found an estimated cost of 8 for instruction: %shl ; SSE41: Found an estimated cost of 8 for instruction: %shl -; AVX: Found an estimated cost of 8 for instruction: %shl +; AVX: Found an estimated cost of 10 for instruction: %shl ; AVX2: Found an estimated cost of 1 for instruction: %shl -; XOPAVX: Found an estimated cost of 2 for instruction: %shl +; XOPAVX: Found an estimated cost of 4 for instruction: %shl ; XOPAVX2: Found an estimated cost of 1 for instruction: %shl @@ -811,7 +811,7 @@ define <32 x i16> @test9(<32 x i16> %a) { ; SSE41: Found an estimated cost of 4 for instruction: %shl ; AVX: Found an estimated cost of 8 for instruction: %shl ; AVX2: Found an estimated cost of 2 for instruction: %shl -; XOPAVX: Found an estimated cost of 4 for instruction: %shl +; XOPAVX: Found an estimated cost of 8 for instruction: %shl ; XOPAVX2: Found an estimated cost of 2 for instruction: %shl @@ -826,7 +826,7 @@ define <16 x i32> @test10(<16 x i32> %a) { ; SSE41: Found an estimated cost of 4 for instruction: %shl ; AVX: Found an estimated cost of 8 for instruction: %shl ; AVX2: Found an estimated cost of 2 for instruction: %shl -; XOPAVX: Found an estimated cost of 4 for instruction: %shl +; XOPAVX: Found an estimated cost of 8 for instruction: %shl ; XOPAVX2: Found an estimated cost of 2 for instruction: %shl @@ -842,7 +842,7 @@ define <8 x i64> @test11(<8 x i64> %a) { ; CHECK: 'Cost Model Analysis' for function 'test11': ; SSE2: Found an estimated cost of 16 for instruction: %shl ; SSE41: Found an estimated cost of 16 for instruction: %shl -; AVX: Found an estimated cost of 16 for instruction: %shl +; AVX: Found an estimated cost of 20 for 
instruction: %shl ; AVX2: Found an estimated cost of 2 for instruction: %shl -; XOPAVX: Found an estimated cost of 4 for instruction: %shl +; XOPAVX: Found an estimated cost of 8 for instruction: %shl ; XOPAVX2: Found an estimated cost of 2 for instruction: %shl diff --git a/test/Analysis/ScalarEvolution/different-loops-recs.ll b/test/Analysis/ScalarEvolution/different-loops-recs.ll new file mode 100644 index 000000000000..ad3d1e0bd110 --- /dev/null +++ b/test/Analysis/ScalarEvolution/different-loops-recs.ll @@ -0,0 +1,454 @@ +; RUN: opt -analyze -scalar-evolution < %s | FileCheck %s + +; This test set ensures that we can correctly operate with recurrencies from +; different loops. + +; Check that we can evaluate a sum of phis from two different loops in any +; order. + +define void @test_00() { + +; CHECK-LABEL: Classifying expressions for: @test_00 +; CHECK: %sum1 = add i32 %phi1, %phi2 +; CHECK-NEXT: --> {14,+,3}<%loop1> +; CHECK: %sum2 = add i32 %sum1, %phi3 +; CHECK-NEXT: --> {20,+,6}<%loop1> +; CHECK: %sum3 = add i32 %phi4, %phi5 +; CHECK-NEXT: --> {116,+,3}<%loop2> +; CHECK: %sum4 = add i32 %sum3, %phi6 +; CHECK-NEXT: --> {159,+,6}<%loop2> +; CHECK: %s1 = add i32 %phi1, %phi4 +; CHECK-NEXT: --> {{{{}}73,+,1}<%loop1>,+,1}<%loop2> +; CHECK: %s2 = add i32 %phi5, %phi2 +; CHECK-NEXT: --> {{{{}}57,+,2}<%loop1>,+,2}<%loop2> +; CHECK: %s3 = add i32 %sum1, %sum3 +; CHECK-NEXT: --> {{{{}}130,+,3}<%loop1>,+,3}<%loop2> +; CHECK: %s4 = add i32 %sum4, %sum2 +; CHECK-NEXT: --> {{{{}}179,+,6}<%loop1>,+,6}<%loop2> +; CHECK: %s5 = add i32 %phi3, %sum3 +; CHECK-NEXT: --> {{{{}}122,+,3}<%loop1>,+,3}<%loop2> +; CHECK: %s6 = add i32 %sum2, %phi6 +; CHECK-NEXT: --> {{{{}}63,+,6}<%loop1>,+,3}<%loop2> + +entry: + br label %loop1 + +loop1: + %phi1 = phi i32 [ 10, %entry ], [ %phi1.inc, %loop1 ] + %phi2 = phi i32 [ 4, %entry ], [ %phi2.inc, %loop1 ] + %phi3 = phi i32 [ 6, %entry ], [ %phi3.inc, %loop1 ] + %phi1.inc = add i32 %phi1, 1 + %phi2.inc = add i32 %phi2, 2 + %phi3.inc = add i32 %phi3, 3 + %sum1 = add i32 %phi1, %phi2 + %sum2 = add i32 %sum1, %phi3 + %cond1 = icmp ult i32 %sum2, 1000 + br i1 %cond1, label %loop1, label %loop2 + +loop2: + %phi4 = phi i32 [ 63, %loop1 ], [ %phi4.inc, %loop2 ] + %phi5 = phi i32 [ 53, %loop1 ], [ %phi5.inc, %loop2 ] + %phi6 = phi i32 [ 43, %loop1 ], [ %phi6.inc, %loop2 ] + %phi4.inc = add i32 %phi4, 1 + %phi5.inc = add i32 %phi5, 2 + %phi6.inc = add i32 %phi6, 3 + %sum3 = add i32 %phi4, %phi5 + %sum4 = add i32 %sum3, %phi6 + %cond2 = icmp ult i32 %sum4, 1000 + br i1 %cond2, label %loop2, label %exit + +exit: + %s1 = add i32 %phi1, %phi4 + %s2 = add i32 %phi5, %phi2 + %s3 = add i32 %sum1, %sum3 + %s4 = add i32 %sum4, %sum2 + %s5 = add i32 %phi3, %sum3 + %s6 = add i32 %sum2, %phi6 + ret void +} + +; Check that we can evaluate a sum of phis+invariants from two different loops +; in any order. 
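+; (For reference, the addrec arithmetic behind the checks above: two affine
+; recurrences of the same loop add component-wise, e.g. in @test_00
+;   {10,+,1}<%loop1> + {4,+,2}<%loop1> = {14,+,3}<%loop1>,
+; while recurrences of different loops nest, the inner chrec becoming the
+; start of the outer one, e.g. %s1 above is
+;   {10,+,1}<%loop1> + {63,+,1}<%loop2> = {{73,+,1}<%loop1>,+,1}<%loop2>.)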
+ +define void @test_01(i32 %a, i32 %b) { + +; CHECK-LABEL: Classifying expressions for: @test_01 +; CHECK: %sum1 = add i32 %phi1, %phi2 +; CHECK-NEXT: --> {(%a + %b),+,3}<%loop1> +; CHECK: %sum2 = add i32 %sum1, %phi3 +; CHECK-NEXT: --> {(6 + %a + %b),+,6}<%loop1> +; CHECK: %is1 = add i32 %sum2, %a +; CHECK-NEXT: --> {(6 + (2 * %a) + %b),+,6}<%loop1> +; CHECK: %sum3 = add i32 %phi4, %phi5 +; CHECK-NEXT: --> {116,+,3}<%loop2> +; CHECK: %sum4 = add i32 %sum3, %phi6 +; CHECK-NEXT: --> {159,+,6}<%loop2> +; CHECK: %is2 = add i32 %sum4, %b +; CHECK-NEXT: --> {(159 + %b),+,6}<%loop2> +; CHECK: %ec2 = add i32 %is1, %is2 +; CHECK-NEXT: --> {{{{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2> +; CHECK: %s1 = add i32 %phi1, %is1 +; CHECK-NEXT: --> {(6 + (3 * %a) + %b),+,7}<%loop1> +; CHECK: %s2 = add i32 %is2, %phi4 +; CHECK-NEXT: --> {(222 + %b),+,7}<%loop2> +; CHECK: %s3 = add i32 %is1, %phi5 +; CHECK-NEXT: --> {{{{}}(59 + (2 * %a) + %b),+,6}<%loop1>,+,2}<%loop2> +; CHECK: %s4 = add i32 %phi2, %is2 +; CHECK-NEXT: --> {{{{}}(159 + (2 * %b)),+,2}<%loop1>,+,6}<%loop2> +; CHECK: %s5 = add i32 %is1, %is2 +; CHECK-NEXT: --> {{{{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2> +; CHECK: %s6 = add i32 %is2, %is1 +; CHECK-NEXT: --> {{{{}}(165 + (2 * %a) + (2 * %b)),+,6}<%loop1>,+,6}<%loop2> + +entry: + br label %loop1 + +loop1: + %phi1 = phi i32 [ %a, %entry ], [ %phi1.inc, %loop1 ] + %phi2 = phi i32 [ %b, %entry ], [ %phi2.inc, %loop1 ] + %phi3 = phi i32 [ 6, %entry ], [ %phi3.inc, %loop1 ] + %phi1.inc = add i32 %phi1, 1 + %phi2.inc = add i32 %phi2, 2 + %phi3.inc = add i32 %phi3, 3 + %sum1 = add i32 %phi1, %phi2 + %sum2 = add i32 %sum1, %phi3 + %is1 = add i32 %sum2, %a + %cond1 = icmp ult i32 %is1, 1000 + br i1 %cond1, label %loop1, label %loop2 + +loop2: + %phi4 = phi i32 [ 63, %loop1 ], [ %phi4.inc, %loop2 ] + %phi5 = phi i32 [ 53, %loop1 ], [ %phi5.inc, %loop2 ] + %phi6 = phi i32 [ 43, %loop1 ], [ %phi6.inc, %loop2 ] + %phi4.inc = add i32 %phi4, 1 + %phi5.inc = add i32 %phi5, 2 + %phi6.inc = add i32 %phi6, 3 + %sum3 = add i32 %phi4, %phi5 + %sum4 = add i32 %sum3, %phi6 + %is2 = add i32 %sum4, %b + %ec2 = add i32 %is1, %is2 + %cond2 = icmp ult i32 %ec2, 1000 + br i1 %cond2, label %loop2, label %exit + +exit: + %s1 = add i32 %phi1, %is1 + %s2 = add i32 %is2, %phi4 + %s3 = add i32 %is1, %phi5 + %s4 = add i32 %phi2, %is2 + %s5 = add i32 %is1, %is2 + %s6 = add i32 %is2, %is1 + ret void +} + +; Check that we can correctly evaluate a sum of phis+variants from two different +; loops in any order. 
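+; (In @test_01 the loop invariants simply fold into the chrec coefficients:
+; %sum1 is {%a,+,1}<%loop1> + {%b,+,2}<%loop1> = {(%a + %b),+,3}<%loop1>, and
+; adding the invariant %a once more only bumps the start, giving
+; %is1 = {(6 + (2 * %a) + %b),+,6}<%loop1>.)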
+ +define void @test_02(i32 %a, i32 %b, i32* %p) { + +; CHECK-LABEL: Classifying expressions for: @test_02 +; CHECK: %sum1 = add i32 %phi1, %phi2 +; CHECK-NEXT: --> {(%a + %b),+,3}<%loop1> +; CHECK: %sum2 = add i32 %sum1, %phi3 +; CHECK-NEXT: --> {(6 + %a + %b),+,6}<%loop1> +; CHECK: %is1 = add i32 %sum2, %v1 +; CHECK-NEXT: --> ({(6 + %a + %b),+,6}<%loop1> + %v1) +; CHECK: %sum3 = add i32 %phi4, %phi5 +; CHECK-NEXT: --> {(%a + %b),+,3}<%loop2> +; CHECK: %sum4 = add i32 %sum3, %phi6 +; CHECK-NEXT: --> {(43 + %a + %b),+,6}<%loop2> +; CHECK: %is2 = add i32 %sum4, %v2 +; CHECK-NEXT: --> ({(43 + %a + %b),+,6}<%loop2> + %v2) +; CHECK: %is3 = add i32 %v1, %sum2 +; CHECK-NEXT: --> ({(6 + %a + %b),+,6}<%loop1> + %v1) +; CHECK: %ec2 = add i32 %is1, %is3 +; CHECK-NEXT: --> (2 * ({(6 + %a + %b),+,6}<%loop1> + %v1)) +; CHECK: %s1 = add i32 %phi1, %is1 +; CHECK-NEXT: --> ({(6 + (2 * %a) + %b),+,7}<%loop1> + %v1) +; CHECK: %s2 = add i32 %is2, %phi4 +; CHECK-NEXT: --> ({(43 + (2 * %a) + %b),+,7}<%loop2> + %v2) +; CHECK: %s3 = add i32 %is1, %phi5 +; CHECK-NEXT: --> {({(6 + (2 * %b) + %a),+,6}<%loop1> + %v1),+,2}<%loop2> +; CHECK: %s4 = add i32 %phi2, %is2 +; CHECK-NEXT: --> ({{{{}}(43 + (2 * %b) + %a),+,2}<%loop1>,+,6}<%loop2> + %v2) +; CHECK: %s5 = add i32 %is1, %is2 +; CHECK-NEXT: --> ({({(49 + (2 * %a) + (2 * %b)),+,6}<%loop1> + %v1),+,6}<%loop2> + %v2) +; CHECK: %s6 = add i32 %is2, %is1 +; CHECK-NEXT: --> ({({(49 + (2 * %a) + (2 * %b)),+,6}<%loop1> + %v1),+,6}<%loop2> + %v2) + +entry: + br label %loop1 + +loop1: + %phi1 = phi i32 [ %a, %entry ], [ %phi1.inc, %loop1 ] + %phi2 = phi i32 [ %b, %entry ], [ %phi2.inc, %loop1 ] + %phi3 = phi i32 [ 6, %entry ], [ %phi3.inc, %loop1 ] + %phi1.inc = add i32 %phi1, 1 + %phi2.inc = add i32 %phi2, 2 + %phi3.inc = add i32 %phi3, 3 + %v1 = load i32, i32* %p + %sum1 = add i32 %phi1, %phi2 + %sum2 = add i32 %sum1, %phi3 + %is1 = add i32 %sum2, %v1 + %cond1 = icmp ult i32 %is1, 1000 + br i1 %cond1, label %loop1, label %loop2 + +loop2: + %phi4 = phi i32 [ %a, %loop1 ], [ %phi4.inc, %loop2 ] + %phi5 = phi i32 [ %b, %loop1 ], [ %phi5.inc, %loop2 ] + %phi6 = phi i32 [ 43, %loop1 ], [ %phi6.inc, %loop2 ] + %phi4.inc = add i32 %phi4, 1 + %phi5.inc = add i32 %phi5, 2 + %phi6.inc = add i32 %phi6, 3 + %v2 = load i32, i32* %p + %sum3 = add i32 %phi4, %phi5 + %sum4 = add i32 %sum3, %phi6 + %is2 = add i32 %sum4, %v2 + %is3 = add i32 %v1, %sum2 + %ec2 = add i32 %is1, %is3 + %cond2 = icmp ult i32 %ec2, 1000 + br i1 %cond2, label %loop2, label %exit + +exit: + %s1 = add i32 %phi1, %is1 + %s2 = add i32 %is2, %phi4 + %s3 = add i32 %is1, %phi5 + %s4 = add i32 %phi2, %is2 + %s5 = add i32 %is1, %is2 + %s6 = add i32 %is2, %is1 + ret void +} + +; Mix of previous use cases that demonstrates %s3 can be incorrectly treated as +; a recurrence of loop1 because of operands order if we pick recurrencies in an +; incorrect order. 
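+; (Unlike the invariants in @test_01, the loaded values %v1 and %v2 in
+; @test_02 vary from iteration to iteration, so they cannot be folded into a
+; chrec and stay as symbolic addends outside it, e.g. %is1 is
+; ({(6 + %a + %b),+,6}<%loop1> + %v1). And since %is1 and %is3 are the same
+; expression, their sum %ec2 folds to (2 * (...)).)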
+ +define void @test_03(i32 %a, i32 %b, i32 %c, i32* %p) { + +; CHECK-LABEL: Classifying expressions for: @test_03 +; CHECK: %v1 = load i32, i32* %p +; CHECK-NEXT: --> %v1 +; CHECK: %s1 = add i32 %phi1, %v1 +; CHECK-NEXT: --> {(%a + %v1),+,1}<%loop1> +; CHECK: %s2 = add i32 %s1, %b +; CHECK-NEXT: --> {(%a + %b + %v1),+,1}<%loop1> +; CHECK: %s3 = add i32 %s2, %phi2 +; CHECK-NEXT: --> ({{{{}}((2 * %a) + %b),+,1}<%loop1>,+,2}<%loop2> + %v1) + +entry: + br label %loop1 + +loop1: + %phi1 = phi i32 [ %a, %entry ], [ %phi1.inc, %loop1 ] + %phi1.inc = add i32 %phi1, 1 + %cond1 = icmp ult i32 %phi1, %c + br i1 %cond1, label %loop1, label %loop2 + +loop2: + %phi2 = phi i32 [ %a, %loop1 ], [ %phi2.inc, %loop2 ] + %phi2.inc = add i32 %phi2, 2 + %v1 = load i32, i32* %p + %s1 = add i32 %phi1, %v1 + %s2 = add i32 %s1, %b + %s3 = add i32 %s2, %phi2 + %cond2 = icmp ult i32 %s3, %c + br i1 %cond2, label %loop2, label %exit + +exit: + + ret void +} + +; Another mix of previous use cases that demonstrates that incorrect picking of +; a loop for a recurrence may cause a crash of SCEV analysis. +define void @test_04() { + +; CHECK-LABEL: Classifying expressions for: @test_04 +; CHECK: %tmp = phi i64 [ 2, %bb ], [ %tmp4, %bb3 ] +; CHECK-NEXT: --> {2,+,1}<%loop1> +; CHECK: %tmp2 = trunc i64 %tmp to i32 +; CHECK-NEXT: --> {2,+,1}<%loop1> +; CHECK: %tmp4 = add nuw nsw i64 %tmp, 1 +; CHECK-NEXT: --> {3,+,1}<%loop1> +; CHECK: %tmp7 = phi i64 [ %tmp15, %loop2 ], [ 2, %loop1 ] +; CHECK-NEXT: --> {2,+,1}<%loop2> +; CHECK: %tmp10 = sub i64 %tmp9, %tmp7 +; CHECK-NEXT: --> ((sext i8 %tmp8 to i64) + {-2,+,-1}<%loop2>) +; CHECK: %tmp11 = add i64 %tmp10, undef +; CHECK-NEXT: --> ((sext i8 %tmp8 to i64) + {(-2 + undef),+,-1}<%loop2>) +; CHECK: %tmp13 = trunc i64 %tmp11 to i32 +; CHECK-NEXT: --> ((sext i8 %tmp8 to i32) + {(trunc i64 (-2 + undef) to i32),+,-1}<%loop2>) +; CHECK: %tmp14 = sub i32 %tmp13, %tmp2 +; CHECK-NEXT: --> ((sext i8 %tmp8 to i32) + {{{{}}(-2 + (trunc i64 (-2 + undef) to i32)),+,-1}<%loop1>,+,-1}<%loop2>) +; CHECK: %tmp15 = add nuw nsw i64 %tmp7, 1 +; CHECK-NEXT: --> {3,+,1}<%loop2> + +bb: + br label %loop1 + +loop1: + %tmp = phi i64 [ 2, %bb ], [ %tmp4, %bb3 ] + %tmp2 = trunc i64 %tmp to i32 + br i1 undef, label %loop2, label %bb3 + +bb3: + %tmp4 = add nuw nsw i64 %tmp, 1 + br label %loop1 + +bb5: + ret void + +loop2: + %tmp7 = phi i64 [ %tmp15, %loop2 ], [ 2, %loop1 ] + %tmp8 = load i8, i8 addrspace(1)* undef, align 1 + %tmp9 = sext i8 %tmp8 to i64 + %tmp10 = sub i64 %tmp9, %tmp7 + %tmp11 = add i64 %tmp10, undef + %tmp13 = trunc i64 %tmp11 to i32 + %tmp14 = sub i32 %tmp13, %tmp2 + %tmp15 = add nuw nsw i64 %tmp7, 1 + %tmp16 = icmp slt i64 %tmp15, %tmp + br i1 %tmp16, label %loop2, label %bb5 +} + +@A = weak global [1000 x i32] zeroinitializer, align 32 + +; Demonstrate a situation when we can add two recs with different degrees from +; the same loop. 
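+; (Below, %SQ = %i.0 * %i.0 is the quadratic chrec {4,+,5,+,2}<%bb3>: its
+; values 4, 9, 16, 25, ... for %i.0 = 2, 3, 4, 5, ... have first differences
+; 5, 7, 9, ... which start at 5 and themselves step by 2. Subtracting the
+; linear {4,+,2}<%bb3> is again component-wise and leaves {0,+,3,+,2}<%bb3>.)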
+define void @test_05(i32 %N) { + +; CHECK-LABEL: Classifying expressions for: @test_05 +; CHECK: %SQ = mul i32 %i.0, %i.0 +; CHECK-NEXT: --> {4,+,5,+,2}<%bb3> +; CHECK: %tmp4 = mul i32 %i.0, 2 +; CHECK-NEXT: --> {4,+,2}<%bb3> +; CHECK: %tmp5 = sub i32 %SQ, %tmp4 +; CHECK-NEXT: --> {0,+,3,+,2}<%bb3> + +entry: + %"alloca point" = bitcast i32 0 to i32 ; [#uses=0] + br label %bb3 + +bb: ; preds = %bb3 + %tmp = getelementptr [1000 x i32], [1000 x i32]* @A, i32 0, i32 %i.0 ; [#uses=1] + store i32 123, i32* %tmp + %tmp2 = add i32 %i.0, 1 ; [#uses=1] + br label %bb3 + +bb3: ; preds = %bb, %entry + %i.0 = phi i32 [ 2, %entry ], [ %tmp2, %bb ] ; [#uses=3] + %SQ = mul i32 %i.0, %i.0 + %tmp4 = mul i32 %i.0, 2 + %tmp5 = sub i32 %SQ, %tmp4 + %tmp3 = icmp sle i32 %tmp5, 9999 ; [#uses=1] + br i1 %tmp3, label %bb, label %bb5 + +bb5: ; preds = %bb3 + br label %return + +return: ; preds = %bb5 + ret void +} + +; Check that we can add Phis from different loops with different nesting, nested +; loop comes first. +define void @test_06() { + +; CHECK-LABEL: Classifying expressions for: @test_06 +; CHECK: %s1 = add i32 %phi1, %phi2 +; CHECK-NEXT: --> {{{{}}30,+,1}<%loop1>,+,2}<%loop2> +; CHECK: %s2 = add i32 %phi2, %phi1 +; CHECK-NEXT: --> {{{{}}30,+,1}<%loop1>,+,2}<%loop2> +; CHECK: %s3 = add i32 %phi1, %phi3 +; CHECK-NEXT: --> {{{{}}40,+,1}<%loop1>,+,3}<%loop3> +; CHECK: %s4 = add i32 %phi3, %phi1 +; CHECK-NEXT: --> {{{{}}40,+,1}<%loop1>,+,3}<%loop3> +; CHECK: %s5 = add i32 %phi2, %phi3 +; CHECK-NEXT: --> {{{{}}50,+,2}<%loop2>,+,3}<%loop3> +; CHECK: %s6 = add i32 %phi3, %phi2 +; CHECK-NEXT: --> {{{{}}50,+,2}<%loop2>,+,3}<%loop3> + +entry: + br label %loop1 + +loop1: + %phi1 = phi i32 [ 10, %entry ], [ %phi1.inc, %loop1.exit ] + br label %loop2 + +loop2: + %phi2 = phi i32 [ 20, %loop1 ], [ %phi2.inc, %loop2 ] + %phi2.inc = add i32 %phi2, 2 + %cond2 = icmp ult i32 %phi2.inc, 1000 + br i1 %cond2, label %loop2, label %loop1.exit + +loop1.exit: + %phi1.inc = add i32 %phi1, 1 + %cond1 = icmp ult i32 %phi1.inc, 1000 + br i1 %cond1, label %loop1, label %loop3 + +loop3: + %phi3 = phi i32 [ 30, %loop1.exit ], [ %phi3.inc, %loop3 ] + %phi3.inc = add i32 %phi3, 3 + %cond3 = icmp ult i32 %phi3.inc, 1000 + br i1 %cond3, label %loop3, label %exit + +exit: + %s1 = add i32 %phi1, %phi2 + %s2 = add i32 %phi2, %phi1 + %s3 = add i32 %phi1, %phi3 + %s4 = add i32 %phi3, %phi1 + %s5 = add i32 %phi2, %phi3 + %s6 = add i32 %phi3, %phi2 + ret void +} + +; Check that we can add Phis from different loops with different nesting, nested +; loop comes second. 
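+; (@test_07 is @test_06 with %loop3 moved in front of the %loop1/%loop2 nest,
+; so the sum of the same two phis prints with the other loop outermost: %s3 is
+; {{40,+,1}<%loop1>,+,3}<%loop3> above but {{40,+,3}<%loop3>,+,1}<%loop1>
+; below; the start, 10 + 30 = 40, is unchanged.)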
+define void @test_07() { + +; CHECK-LABEL: Classifying expressions for: @test_07 +; CHECK: %s1 = add i32 %phi1, %phi2 +; CHECK-NEXT: --> {{{{}}30,+,1}<%loop1>,+,2}<%loop2> +; CHECK: %s2 = add i32 %phi2, %phi1 +; CHECK-NEXT: --> {{{{}}30,+,1}<%loop1>,+,2}<%loop2> +; CHECK: %s3 = add i32 %phi1, %phi3 +; CHECK-NEXT: --> {{{{}}40,+,3}<%loop3>,+,1}<%loop1> +; CHECK: %s4 = add i32 %phi3, %phi1 +; CHECK-NEXT: --> {{{{}}40,+,3}<%loop3>,+,1}<%loop1> +; CHECK: %s5 = add i32 %phi2, %phi3 +; CHECK-NEXT: --> {{{{}}50,+,3}<%loop3>,+,2}<%loop2> +; CHECK: %s6 = add i32 %phi3, %phi2 +; CHECK-NEXT: --> {{{{}}50,+,3}<%loop3>,+,2}<%loop2> + +entry: + br label %loop3 + +loop3: + %phi3 = phi i32 [ 30, %entry ], [ %phi3.inc, %loop3 ] + %phi3.inc = add i32 %phi3, 3 + %cond3 = icmp ult i32 %phi3.inc, 1000 + br i1 %cond3, label %loop3, label %loop1 + +loop1: + %phi1 = phi i32 [ 10, %loop3 ], [ %phi1.inc, %loop1.exit ] + br label %loop2 + +loop2: + %phi2 = phi i32 [ 20, %loop1 ], [ %phi2.inc, %loop2 ] + %phi2.inc = add i32 %phi2, 2 + %cond2 = icmp ult i32 %phi2.inc, 1000 + br i1 %cond2, label %loop2, label %loop1.exit + +loop1.exit: + %phi1.inc = add i32 %phi1, 1 + %cond1 = icmp ult i32 %phi1.inc, 1000 + br i1 %cond1, label %exit, label %loop1 + +exit: + %s1 = add i32 %phi1, %phi2 + %s2 = add i32 %phi2, %phi1 + %s3 = add i32 %phi1, %phi3 + %s4 = add i32 %phi3, %phi1 + %s5 = add i32 %phi2, %phi3 + %s6 = add i32 %phi3, %phi2 + ret void +} diff --git a/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll index eab314eaa9c2..655d4558a5e1 100644 --- a/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll +++ b/test/Analysis/TypeBasedAliasAnalysis/intrinsics.ll @@ -5,22 +5,22 @@ target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:32- ; TBAA should prove that these calls don't interfere, since they are ; IntrArgReadMem and have TBAA metadata. 
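; Concretely, with the TBAA tags proving that the store to %q cannot alias the
; loads from %p, the second load is redundant and should be removed; the CHECK
; lines below therefore expect %c = add %a, %a rather than add %a, %b.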
-; CHECK: define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) { +; CHECK: define <8 x i16> @test0(<8 x i16>* %p, <8 x i16>* %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) { ; CHECK-NEXT: entry: -; CHECK-NEXT: %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) [[NUW:#[0-9]+]] -; CHECK-NEXT: call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16) +; CHECK-NEXT: %a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) [[NUW:#[0-9]+]] +; CHECK-NEXT: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %y, <8 x i16>* %q, i32 16, <8 x i1> %m) ; CHECK-NEXT: %c = add <8 x i16> %a, %a -define <8 x i16> @test0(i8* %p, i8* %q, <8 x i16> %y) { +define <8 x i16> @test0(<8 x i16>* %p, <8 x i16>* %q, <8 x i16> %y, <8 x i1> %m, <8 x i16> %pt) { entry: - %a = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind, !tbaa !2 - call void @llvm.arm.neon.vst1.p0i8.v8i16(i8* %q, <8 x i16> %y, i32 16), !tbaa !1 - %b = call <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8* %p, i32 16) nounwind, !tbaa !2 + %a = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind, !tbaa !2 + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %y, <8 x i16>* %q, i32 16, <8 x i1> %m), !tbaa !1 + %b = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %p, i32 16, <8 x i1> %m, <8 x i16> %pt) nounwind, !tbaa !2 %c = add <8 x i16> %a, %b ret <8 x i16> %c } -declare <8 x i16> @llvm.arm.neon.vld1.v8i16.p0i8(i8*, i32) nounwind readonly -declare void @llvm.arm.neon.vst1.p0i8.v8i16(i8*, <8 x i16>, i32) nounwind +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32, <8 x i1>, <8 x i16>) nounwind readonly +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32, <8 x i1>) nounwind ; CHECK: attributes #0 = { argmemonly nounwind readonly } ; CHECK: attributes #1 = { argmemonly nounwind } diff --git a/test/Assembler/globalvariable-attributes.ll b/test/Assembler/globalvariable-attributes.ll new file mode 100644 index 000000000000..64227a451c25 --- /dev/null +++ b/test/Assembler/globalvariable-attributes.ll @@ -0,0 +1,19 @@ +; RUN: llvm-as < %s | llvm-dis | llvm-as | llvm-dis | FileCheck %s + +@g1 = global i32 7 "key" = "value" "key2" = "value2" +@g2 = global i32 2, align 4 "key3" = "value3" +@g3 = global i32 2 #0 +@g4 = global i32 2, align 4 "key5" = "value5" #0 + +attributes #0 = { "string" = "value" nobuiltin norecurse } + +; CHECK: @g1 = global i32 7 #0 +; CHECK: @g2 = global i32 2, align 4 #1 +; CHECK: @g3 = global i32 2 #2 +; CHECK: @g4 = global i32 2, align 4 #3 + +; CHECK: attributes #0 = { "key"="value" "key2"="value2" } +; CHECK: attributes #1 = { "key3"="value3" } +; CHECK: attributes #2 = { nobuiltin norecurse "string"="value" } +; CHECK: attributes #3 = { nobuiltin norecurse "key5"="value5" "string"="value" } + diff --git a/test/Bitcode/globalvariable-attributes.ll b/test/Bitcode/globalvariable-attributes.ll new file mode 100644 index 000000000000..cbab3b71e58a --- /dev/null +++ b/test/Bitcode/globalvariable-attributes.ll @@ -0,0 +1,19 @@ +; RUN: llvm-as < %s | llvm-dis | FileCheck %s + +@g1 = global i32 7 "key" = "value" "key2" = "value2" +@g2 = global i32 2, align 4 "key3" = "value3" +@g3 = global i32 2 #0 +@g4 = global i32 2, align 4 "key5" = "value5" #0 + +attributes #0 = { "string" = "value" nobuiltin norecurse } + +; CHECK: @g1 = global i32 7 #0 +; CHECK: @g2 = global i32 2, align 4 #1 +; CHECK: @g3 = global i32 2 #2 +; CHECK: @g4 = global i32 2, align 
4 #3 + +; CHECK: attributes #0 = { "key"="value" "key2"="value2" } +; CHECK: attributes #1 = { "key3"="value3" } +; CHECK: attributes #2 = { nobuiltin norecurse "string"="value" } +; CHECK: attributes #3 = { nobuiltin norecurse "key5"="value5" "string"="value" } + diff --git a/test/Bitcode/ptest-old.ll b/test/Bitcode/ptest-old.ll index c1e1cae37368..53ffef900b57 100644 --- a/test/Bitcode/ptest-old.ll +++ b/test/Bitcode/ptest-old.ll @@ -1,5 +1,6 @@ ; RUN: llvm-as < %s | llvm-dis | FileCheck %s ; RUN: verify-uselistorder < %s +; REQUIRES: x86 define i32 @foo(<4 x float> %bar) nounwind { entry: diff --git a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll index 982bb5cb7e53..b64d5bd52bfc 100644 --- a/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll +++ b/test/Bitcode/thinlto-function-summary-callgraph-profile-summary.ll @@ -29,7 +29,7 @@ ; CHECK-NEXT: ; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123 -; CHECK-NEXT: +; CHECK-NEXT: ; CHECK-NEXT: ; CHECK: +; op4=hot1 op6=cold op8=hot2 op10=hot4 op12=none1 op14=hot3 op16=none2 op18=none3 op20=123 +; CHECK-NEXT: +; CHECK-NEXT: + +; CHECK: +; COMBINED_NEXT: + + +; ModuleID = 'thinlto-function-summary-callgraph.ll' +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +; This function have high profile count, so entry block is hot. +define void @hot_function(i1 %a, i1 %a2) !prof !20 { +entry: + call void @hot1() + br i1 %a, label %Cold, label %Hot, !prof !41 +Cold: ; 1/1000 goes here + call void @cold() + call void @hot2() + call void @hot4(), !prof !15 + call void @none1() + br label %exit +Hot: ; 999/1000 goes here + call void @hot2() + call void @hot3() + br i1 %a2, label %None1, label %None2, !prof !42 +None1: ; half goes here + call void @none1() + call void @none2() + br label %exit +None2: ; half goes here + call void @none3() + br label %exit +exit: + ret void +} + +declare void @hot1() #1 +declare void @hot2() #1 +declare void @hot3() #1 +declare void @hot4() #1 +declare void @cold() #1 +declare void @none1() #1 +declare void @none2() #1 +declare void @none3() #1 + + +!41 = !{!"branch_weights", i32 1, i32 1000} +!42 = !{!"branch_weights", i32 1, i32 1} + + + +!llvm.module.flags = !{!1} +!20 = !{!"function_entry_count", i64 110, i64 123} + +!1 = !{i32 1, !"ProfileSummary", !2} +!2 = !{!3, !4, !5, !6, !7, !8, !9, !10} +!3 = !{!"ProfileFormat", !"SampleProfile"} +!4 = !{!"TotalCount", i64 10000} +!5 = !{!"MaxCount", i64 10} +!6 = !{!"MaxInternalCount", i64 1} +!7 = !{!"MaxFunctionCount", i64 1000} +!8 = !{!"NumCounts", i64 3} +!9 = !{!"NumFunctions", i64 3} +!10 = !{!"DetailedSummary", !11} +!11 = !{!12, !13, !14} +!12 = !{i32 10000, i64 100, i32 1} +!13 = !{i32 999000, i64 100, i32 1} +!14 = !{i32 999999, i64 1, i32 2} +!15 = !{!"branch_weights", i32 100} diff --git a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir index 739fdd5cb4c5..0f054f1d940c 100644 --- a/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir +++ b/test/CodeGen/AArch64/GlobalISel/arm64-regbankselect.mir @@ -74,6 +74,21 @@ %res = bitcast <2 x i32> %vres to i64 ret i64 %res } + + define i64 @floatingPointLoad(i64 %arg1, double* %addr) { + %varg1 = bitcast i64 %arg1 to double + %varg2 = load double, double* %addr + %vres = fadd double %varg1, %varg2 + %res = bitcast double %vres to i64 + ret i64 %res + } + + 
define void @floatingPointStore(i64 %arg1, double* %addr) { + %varg1 = bitcast i64 %arg1 to double + %vres = fadd double %varg1, %varg1 + store double %vres, double* %addr + ret void + } ... --- @@ -650,3 +665,84 @@ body: | RET_ReallyLR implicit %x0 ... + +--- +# Make sure we map what looks like floating point +# loads to floating point register bank. +# CHECK-LABEL: name: floatingPointLoad +name: floatingPointLoad +legalized: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: fpr } +# CHECK-NEXT: - { id: 3, class: fpr } +# CHECK-NEXT: - { id: 4, class: fpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } + +# No repairing should be necessary for both modes. +# CHECK: %0(s64) = COPY %x0 +# CHECK-NEXT: %1(p0) = COPY %x1 +# CHECK-NEXT: %2(s64) = G_LOAD %1(p0) :: (load 8 from %ir.addr) +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %4(s64) = COPY %0 +# CHECK-NEXT: %3(s64) = G_FADD %4, %2 +# CHECK-NEXT: %x0 = COPY %3(s64) +# CHECK-NEXT: RET_ReallyLR implicit %x0 + +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(p0) = COPY %x1 + %2(s64) = G_LOAD %1(p0) :: (load 8 from %ir.addr) + %3(s64) = G_FADD %0, %2 + %x0 = COPY %3(s64) + RET_ReallyLR implicit %x0 + +... + +--- +# Make sure we map what looks like floating point +# stores to floating point register bank. +# CHECK-LABEL: name: floatingPointStore +name: floatingPointStore +legalized: true + +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: fpr } +# CHECK-NEXT: - { id: 3, class: fpr } +# CHECK-NEXT: - { id: 4, class: fpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + +# CHECK: %0(s64) = COPY %x0 +# CHECK-NEXT: %1(p0) = COPY %x1 +# %0 has been mapped to GPR, we need to repair to match FPR. +# CHECK-NEXT: %3(s64) = COPY %0 +# CHECK-NEXT: %4(s64) = COPY %0 +# CHECK-NEXT: %2(s64) = G_FADD %3, %4 +# CHECK-NEXT: G_STORE %2(s64), %1(p0) :: (store 8 into %ir.addr) +# CHECK-NEXT: RET_ReallyLR + +body: | + bb.0: + liveins: %x0, %x1 + + %0(s64) = COPY %x0 + %1(p0) = COPY %x1 + %2(s64) = G_FADD %0, %0 + G_STORE %2(s64), %1(p0) :: (store 8 into %ir.addr) + RET_ReallyLR + +... 
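+# (A note on the repairing above: RegBankSelect does not retroactively change
+# the bank of a vreg, so when an operand's current bank (GPR, coming from the
+# COPY of %x0) does not match the bank the chosen instruction mapping requires
+# (FPR for G_FADD), it repairs the site by inserting a cross-bank COPY into a
+# fresh vreg; those are the extra copies the CHECK lines expect.)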
diff --git a/test/CodeGen/AArch64/GlobalISel/call-translator.ll b/test/CodeGen/AArch64/GlobalISel/call-translator.ll index f8d95c88cc8f..44705a9c9f65 100644 --- a/test/CodeGen/AArch64/GlobalISel/call-translator.ll +++ b/test/CodeGen/AArch64/GlobalISel/call-translator.ll @@ -1,7 +1,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnu -O0 -stop-after=irtranslator -global-isel -verify-machineinstrs %s -o - 2>&1 | FileCheck %s ; CHECK-LABEL: name: test_trivial_call -; CHECK: ADJCALLSTACKDOWN 0, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def %sp, implicit %sp ; CHECK: BL @trivial_callee, csr_aarch64_aapcs, implicit-def %lr ; CHECK: ADJCALLSTACKUP 0, 0, implicit-def %sp, implicit %sp declare void @trivial_callee() @@ -186,7 +186,7 @@ define void @test_stack_slots([8 x i64], i64 %lhs, i64 %rhs, i64* %addr) { ; CHECK: [[C42:%[0-9]+]](s64) = G_CONSTANT i64 42 ; CHECK: [[C12:%[0-9]+]](s64) = G_CONSTANT i64 12 ; CHECK: [[PTR:%[0-9]+]](p0) = G_CONSTANT i64 0 -; CHECK: ADJCALLSTACKDOWN 24, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 24, 0, implicit-def %sp, implicit %sp ; CHECK: [[SP:%[0-9]+]](p0) = COPY %sp ; CHECK: [[C42_OFFS:%[0-9]+]](s64) = G_CONSTANT i64 0 ; CHECK: [[C42_LOC:%[0-9]+]](p0) = G_GEP [[SP]], [[C42_OFFS]](s64) diff --git a/test/CodeGen/AArch64/arm64-ccmp.ll b/test/CodeGen/AArch64/arm64-ccmp.ll index 2682fa7dcce1..fc1aeb7b37d9 100644 --- a/test/CodeGen/AArch64/arm64-ccmp.ll +++ b/test/CodeGen/AArch64/arm64-ccmp.ll @@ -378,11 +378,11 @@ define i64 @select_noccmp1(i64 %v1, i64 %v2, i64 %v3, i64 %r) { ; CHECK-NEXT: cmp x0, #13 ; CHECK-NOT: ccmp ; CHECK-NEXT: cset [[REG1:w[0-9]+]], gt +; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]] ; CHECK-NEXT: cmp x2, #2 ; CHECK-NEXT: cset [[REG2:w[0-9]+]], lt ; CHECK-NEXT: cmp x2, #4 ; CHECK-NEXT: cset [[REG3:w[0-9]+]], gt -; CHECK-NEXT: and [[REG4:w[0-9]+]], [[REG0]], [[REG1]] ; CHECK-NEXT: and [[REG5:w[0-9]+]], [[REG2]], [[REG3]] ; CHECK-NEXT: orr [[REG6:w[0-9]+]], [[REG4]], [[REG5]] ; CHECK-NEXT: cmp [[REG6]], #0 diff --git a/test/CodeGen/AArch64/arm64-fml-combines.ll b/test/CodeGen/AArch64/arm64-fml-combines.ll index 840d1dcbf060..f97498825279 100644 --- a/test/CodeGen/AArch64/arm64-fml-combines.ll +++ b/test/CodeGen/AArch64/arm64-fml-combines.ll @@ -1,4 +1,6 @@ -; RUN: llc < %s -O=3 -mtriple=arm64-apple-ios -mcpu=cyclone -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -O3 -mtriple=arm64-apple-ios -enable-unsafe-fp-math | FileCheck %s +; RUN: llc < %s -O3 -mtriple=arm64-apple-ios -fp-contract=fast | FileCheck %s + define void @foo_2d(double* %src) { entry: %arrayidx1 = getelementptr inbounds double, double* %src, i64 5 @@ -126,3 +128,23 @@ for.body: ; preds = %for.body, %entry for.end: ; preds = %for.body ret void } + +; CHECK-LABEL: test1: +; CHECK: fnmadd s0, s0, s1, s2 +define float @test1(float %a, float %b, float %c) { +entry: + %0 = fmul float %a, %b + %mul = fsub float -0.000000e+00, %0 + %sub1 = fsub float %mul, %c + ret float %sub1 +} + +; CHECK-LABEL: test2: +; CHECK: fnmadd d0, d0, d1, d2 +define double @test2(double %a, double %b, double %c) { +entry: + %0 = fmul double %a, %b + %mul = fsub double -0.000000e+00, %0 + %sub1 = fsub double %mul, %c + ret double %sub1 +} diff --git a/test/CodeGen/AArch64/arm64-hello.ll b/test/CodeGen/AArch64/arm64-hello.ll index caaf8615cd4a..a8d1c2482520 100644 --- a/test/CodeGen/AArch64/arm64-hello.ll +++ b/test/CodeGen/AArch64/arm64-hello.ll @@ -6,8 +6,8 @@ ; CHECK-NEXT: stp x29, x30, [sp, #16] ; CHECK-NEXT: add x29, sp, #16 ; CHECK-NEXT: stur wzr, [x29, 
#-4] -; CHECK: adrp x0, L_.str@PAGE -; CHECK: add x0, x0, L_.str@PAGEOFF +; CHECK: adrp x0, l_.str@PAGE +; CHECK: add x0, x0, l_.str@PAGEOFF ; CHECK-NEXT: bl _puts ; CHECK-NEXT: ldp x29, x30, [sp, #16] ; CHECK-NEXT: add sp, sp, #32 diff --git a/test/CodeGen/AArch64/arm64-misched-multimmo.ll b/test/CodeGen/AArch64/arm64-misched-multimmo.ll index 3593668e0156..4c0195b93a44 100644 --- a/test/CodeGen/AArch64/arm64-misched-multimmo.ll +++ b/test/CodeGen/AArch64/arm64-misched-multimmo.ll @@ -12,7 +12,7 @@ ; CHECK: Successors: ; CHECK-NOT: ch SU(4) ; CHECK: SU(3) -; CHECK: SU(4): STRWui %WZR, %X{{[0-9]+}} +; CHECK: SU(5): STRWui %WZR, %X{{[0-9]+}} define i32 @foo() { entry: %0 = load i32, i32* getelementptr inbounds ([100 x i32], [100 x i32]* @G2, i64 0, i64 0), align 4 diff --git a/test/CodeGen/AArch64/macho-global-symbols.ll b/test/CodeGen/AArch64/macho-global-symbols.ll new file mode 100644 index 000000000000..d68abad57ccd --- /dev/null +++ b/test/CodeGen/AArch64/macho-global-symbols.ll @@ -0,0 +1,17 @@ +; RUN: llc -mtriple=arm64-apple-ios %s -o - | FileCheck %s + +; All global symbols must be at-most linker-private for AArch64 because we don't +; use section-relative relocations in MachO. + +define i8* @private_sym() { +; CHECK-LABEL: private_sym: +; CHECK: adrp [[HIBITS:x[0-9]+]], l_var@PAGE +; CHECK: add x0, [[HIBITS]], l_var@PAGEOFF + + ret i8* getelementptr([2 x i8], [2 x i8]* @var, i32 0, i32 0) +} + +; CHECK: .section __TEXT,__cstring +; CHECK: l_var: +; CHECK: .asciz "\002" +@var = private unnamed_addr constant [2 x i8] [i8 2, i8 0] diff --git a/test/CodeGen/AArch64/misched-fusion-aes.ll b/test/CodeGen/AArch64/misched-fusion-aes.ll index f29dfb3a9802..4c682e594e66 100644 --- a/test/CodeGen/AArch64/misched-fusion-aes.ll +++ b/test/CodeGen/AArch64/misched-fusion-aes.ll @@ -1,4 +1,5 @@ ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a57 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA57 +; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=cortex-a72 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKA72 ; RUN: llc %s -o - -mtriple=aarch64-unknown -mcpu=exynos-m1 | FileCheck %s --check-prefix=CHECK --check-prefix=CHECKM1 declare <16 x i8> @llvm.aarch64.crypto.aese(<16 x i8> %d, <16 x i8> %k) @@ -87,6 +88,22 @@ define void @aesea(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VG]] ; CHECKA57: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} ; CHECKA57-NEXT: aesmc {{v[0-7].16b}}, [[VH]] +; CHECKA72: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VA]] +; CHECKA72: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VB]] +; CHECKA72: aese [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VC]] +; CHECKA72: aese [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VD]] +; CHECKA72: aese [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VE]] +; CHECKA72: aese [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VF]] +; CHECKA72: aese [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VG]] +; CHECKA72: aese [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesmc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aese [[VA:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1: aesmc {{v[0-7].16b}}, [[VA]] ; CHECKM1: aese [[VB:v[0-7].16b]], {{v[0-7].16b}} @@ -187,6 +204,22 @@ define void @aesda(<16 x i8>* %a0, <16 x i8>* %b0, <16 x i8>* %c0, <16 x i8> %d, ; CHECKA57-NEXT: aesimc 
{{v[0-7].16b}}, [[VG]] ; CHECKA57: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} ; CHECKA57-NEXT: aesimc {{v[0-7].16b}}, [[VH]] +; CHECKA72: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VA]] +; CHECKA72: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VB]] +; CHECKA72: aesd [[VC:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VC]] +; CHECKA72: aesd [[VD:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VD]] +; CHECKA72: aesd [[VE:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VE]] +; CHECKA72: aesd [[VF:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VF]] +; CHECKA72: aesd [[VG:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VG]] +; CHECKA72: aesd [[VH:v[0-7].16b]], {{v[0-7].16b}} +; CHECKA72-NEXT: aesimc {{v[0-7].16b}}, [[VH]] ; CHECKM1: aesd [[VA:v[0-7].16b]], {{v[0-7].16b}} ; CHECKM1: aesimc {{v[0-7].16b}}, [[VA]] ; CHECKM1: aesd [[VB:v[0-7].16b]], {{v[0-7].16b}} diff --git a/test/CodeGen/AArch64/stackmap-frame-setup.ll b/test/CodeGen/AArch64/stackmap-frame-setup.ll index 5646703fa403..677ff8dc2530 100644 --- a/test/CodeGen/AArch64/stackmap-frame-setup.ll +++ b/test/CodeGen/AArch64/stackmap-frame-setup.ll @@ -7,11 +7,11 @@ entry: store i64 11, i64* %metadata store i64 12, i64* %metadata store i64 13, i64* %metadata -; ISEL: ADJCALLSTACKDOWN 0, implicit-def +; ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def ; ISEL-NEXT: STACKMAP ; ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata) -; FAST-ISEL: ADJCALLSTACKDOWN 0, implicit-def +; FAST-ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def ; FAST-ISEL-NEXT: STACKMAP ; FAST-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def ret void diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir index 56a9e7022db9..2a3d3887ed69 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-load-flat.mir @@ -14,7 +14,7 @@ regBankSelected: true # GCN: global_addrspace # GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1 -# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0, 0 +# GCN: FLAT_LOAD_DWORD [[PTR]], 0, 0 body: | bb.0: diff --git a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir index ea435725bf25..89be3bde94a8 100644 --- a/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir +++ b/test/CodeGen/AMDGPU/GlobalISel/inst-select-store-flat.mir @@ -15,7 +15,7 @@ regBankSelected: true # GCN: global_addrspace # GCN: [[PTR:%[0-9]+]] = COPY %vgpr0_vgpr1 # GCN: [[VAL:%[0-9]+]] = COPY %vgpr2 -# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0, 0 +# GCN: FLAT_STORE_DWORD [[PTR]], [[VAL]], 0, 0 body: | bb.0: diff --git a/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir new file mode 100644 index 000000000000..8839ba8e0ab2 --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/legalize-constant.mir @@ -0,0 +1,20 @@ +# RUN: llc -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -O0 -run-pass=legalizer -global-isel %s -o - | FileCheck %s + +--- | + define void @test_constant() { + entry: + ret void + } +... + +--- +name: test_constant +registers: + - { id: 0, class: _ } +body: | + bb.0.entry: + ; CHECK-LABEL: name: test_constant + ; CHECK: %0(s32) = G_CONSTANT i32 5 + + %0(s32) = G_CONSTANT i32 5 +... 
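+# (Nothing should be rewritten here: a 32-bit G_CONSTANT is already legal for
+# AMDGPU, so the legalizer is expected to pass the instruction through
+# untouched, which is exactly what the CHECK line verifies.)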
diff --git a/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg b/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg new file mode 100644 index 000000000000..e99d1bb8446c --- /dev/null +++ b/test/CodeGen/AMDGPU/GlobalISel/lit.local.cfg @@ -0,0 +1,2 @@ +if not 'global-isel' in config.root.available_features: + config.unsupported = True diff --git a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir index 62b47beb1251..bc992ed77ffd 100644 --- a/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir +++ b/test/CodeGen/AMDGPU/constant-fold-imm-immreg.mir @@ -219,19 +219,19 @@ body: | %34 = V_MOV_B32_e32 63, implicit %exec %27 = V_AND_B32_e64 %26, %24, implicit %exec - FLAT_STORE_DWORD %37, %27, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %27, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %28 = V_AND_B32_e64 %24, %26, implicit %exec - FLAT_STORE_DWORD %37, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %29 = V_AND_B32_e32 %26, %24, implicit %exec - FLAT_STORE_DWORD %37, %29, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %29, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %30 = V_AND_B32_e64 %26, %26, implicit %exec - FLAT_STORE_DWORD %37, %30, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %30, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %31 = V_AND_B32_e64 %34, %34, implicit %exec - FLAT_STORE_DWORD %37, %31, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %37, %31, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) S_ENDPGM @@ -407,34 +407,34 @@ body: | %27 = S_MOV_B32 -4 %11 = V_LSHLREV_B32_e64 12, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %12 = V_LSHLREV_B32_e64 %7, 12, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %13 = V_LSHL_B32_e64 %7, 12, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %14 = V_LSHL_B32_e64 12, %7, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %15 = V_LSHL_B32_e64 12, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %22 = V_LSHL_B32_e64 %6, 12, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD 
%20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %23 = V_LSHL_B32_e64 %6, 32, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %25 = V_LSHL_B32_e32 %6, %6, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %26 = V_LSHLREV_B32_e32 11, %24, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %28 = V_LSHL_B32_e32 %27, %6, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) S_ENDPGM @@ -615,34 +615,34 @@ body: | %35 = V_MOV_B32_e32 2, implicit %exec %11 = V_ASHRREV_I32_e64 8, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %12 = V_ASHRREV_I32_e64 %8, %10, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %13 = V_ASHR_I32_e64 %7, 3, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %14 = V_ASHR_I32_e64 7, %32, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %15 = V_ASHR_I32_e64 %27, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %22 = V_ASHR_I32_e64 %6, 4, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %23 = V_ASHR_I32_e64 %6, %33, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %25 = V_ASHR_I32_e32 %34, %34, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %26 = V_ASHRREV_I32_e32 11, %10, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 
0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %28 = V_ASHR_I32_e32 %27, %35, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) S_ENDPGM @@ -824,34 +824,34 @@ body: | %35 = V_MOV_B32_e32 2, implicit %exec %11 = V_LSHRREV_B32_e64 8, %10, implicit %exec - FLAT_STORE_DWORD %20, %11, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %11, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %12 = V_LSHRREV_B32_e64 %8, %10, implicit %exec - FLAT_STORE_DWORD %20, %12, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %12, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %13 = V_LSHR_B32_e64 %7, 3, implicit %exec - FLAT_STORE_DWORD %20, %13, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %13, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %14 = V_LSHR_B32_e64 7, %32, implicit %exec - FLAT_STORE_DWORD %20, %14, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %14, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %15 = V_LSHR_B32_e64 %27, %24, implicit %exec - FLAT_STORE_DWORD %20, %15, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %15, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %22 = V_LSHR_B32_e64 %6, 4, implicit %exec - FLAT_STORE_DWORD %20, %22, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %22, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %23 = V_LSHR_B32_e64 %6, %33, implicit %exec - FLAT_STORE_DWORD %20, %23, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %23, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %25 = V_LSHR_B32_e32 %34, %34, implicit %exec - FLAT_STORE_DWORD %20, %25, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %25, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %26 = V_LSHRREV_B32_e32 11, %10, implicit %exec - FLAT_STORE_DWORD %20, %26, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %26, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) %28 = V_LSHR_B32_e32 %27, %35, implicit %exec - FLAT_STORE_DWORD %20, %28, 0, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) + FLAT_STORE_DWORD %20, %28, 0, 0, implicit %exec, implicit %flat_scr :: (volatile store 4 into %ir.gep.out) S_ENDPGM diff --git a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll index 0831d250b9e7..8611cd080e15 100644 --- a/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll +++ b/test/CodeGen/AMDGPU/constant-fold-mi-operands.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @fold_mi_s_and_0(i32 addrspace(1)* %out, i32 %x) #0 { } ; GCN-LABEL: {{^}}fold_mi_v_or_0: -; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] +; GCN: 
v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @fold_mi_v_or_0(i32 addrspace(1)* %out) { @@ -50,7 +50,7 @@ define amdgpu_kernel void @fold_mi_s_or_0(i32 addrspace(1)* %out, i32 %x) #0 { } ; GCN-LABEL: {{^}}fold_mi_v_xor_0: -; GCN: v_mbcnt_lo_u32_b32_e64 [[RESULT:v[0-9]+]] +; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]] ; GCN-NOT: [[RESULT]] ; GCN: buffer_store_dword [[RESULT]] define amdgpu_kernel void @fold_mi_v_xor_0(i32 addrspace(1)* %out) { @@ -86,8 +86,8 @@ define amdgpu_kernel void @fold_mi_s_not_0(i32 addrspace(1)* %out, i32 %x) #0 { } ; GCN-LABEL: {{^}}fold_mi_v_not_0: -; GCN: v_bcnt_u32_b32_e64 v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} -; GCN: v_bcnt_u32_b32_e{{[0-9]+}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} +; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} ; GCN-NEXT: v_not_b32_e32 v[[RESULT_LO]] ; GCN-NEXT: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], -1{{$}} ; GCN-NEXT: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} @@ -104,8 +104,8 @@ define amdgpu_kernel void @fold_mi_v_not_0(i64 addrspace(1)* %out) { ; GCN: buffer_load_dwordx2 ; GCN: buffer_load_dwordx2 v{{\[}}[[VREG1_LO:[0-9]+]]:[[VREG1_HI:[0-9]+]]{{\]}} -; GCN: v_bcnt_u32_b32_e64 v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} -; GCN: v_bcnt_u32_b32_e{{[0-9]+}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, 0{{$}} +; GCN: v_bcnt_u32_b32{{(_e32)*(_e64)*}} v[[RESULT_LO:[0-9]+]], v{{[0-9]+}}, v[[RESULT_LO]]{{$}} ; GCN-DAG: v_not_b32_e32 v[[RESULT_LO]], v[[RESULT_LO]] ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO]], v[[VREG1_LO]], v[[RESULT_LO]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], v[[VREG1_HI]] diff --git a/test/CodeGen/AMDGPU/ctpop.ll b/test/CodeGen/AMDGPU/ctpop.ll index a29e72ea57cb..aa913ad406d2 100644 --- a/test/CodeGen/AMDGPU/ctpop.ll +++ b/test/CodeGen/AMDGPU/ctpop.ll @@ -25,7 +25,7 @@ define amdgpu_kernel void @s_ctpop_i32(i32 addrspace(1)* noalias %out, i32 %val) ; XXX - Why 0 in register? 
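; (v_bcnt_u32_b32{{(_e64)*}} in the checks below is a FileCheck regex that
; matches the mnemonic with or without an explicit _e64 encoding suffix,
; keeping the test independent of which VOP encoding the printer emits.)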
; FUNC-LABEL: {{^}}v_ctpop_i32: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 0 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 0 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -40,9 +40,9 @@ define amdgpu_kernel void @v_ctpop_i32(i32 addrspace(1)* noalias %out, i32 addrs ; FUNC-LABEL: {{^}}v_ctpop_add_chain_i32: ; GCN: buffer_load_dword [[VAL1:v[0-9]+]], ; GCN: buffer_load_dword [[VAL0:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], [[VAL1]], 0 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] +; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL0]], [[MIDRESULT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -61,7 +61,7 @@ define amdgpu_kernel void @v_ctpop_add_chain_i32(i32 addrspace(1)* noalias %out, ; FUNC-LABEL: {{^}}v_ctpop_add_sgpr_i32: ; GCN: buffer_load_dword [[VAL0:v[0-9]+]], ; GCN: s_waitcnt -; GCN-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} +; GCN-NEXT: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL0]], s{{[0-9]+}} ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in0, i32 addrspace(1)* noalias %in1, i32 %sval) nounwind { @@ -73,8 +73,8 @@ define amdgpu_kernel void @v_ctpop_add_sgpr_i32(i32 addrspace(1)* noalias %out, } ; FUNC-LABEL: {{^}}v_ctpop_v2i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} ; GCN: s_endpgm ; EG: BCNT_INT @@ -87,10 +87,10 @@ define amdgpu_kernel void @v_ctpop_v2i32(<2 x i32> addrspace(1)* noalias %out, < } ; FUNC-LABEL: {{^}}v_ctpop_v4i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} ; GCN: s_endpgm ; EG: BCNT_INT @@ -105,14 +105,14 @@ define amdgpu_kernel void @v_ctpop_v4i32(<4 x i32> addrspace(1)* noalias %out, < } ; FUNC-LABEL: {{^}}v_ctpop_v8i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} ; GCN: s_endpgm ; EG: BCNT_INT @@ -131,22 +131,22 @@ define amdgpu_kernel void @v_ctpop_v8i32(<8 x i32> addrspace(1)* noalias %out, < } ; FUNC-LABEL: {{^}}v_ctpop_v16i32: -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 -; GCN: v_bcnt_u32_b32_e64 +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: 
v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} +; GCN: v_bcnt_u32_b32{{(_e64)*}} ; GCN: s_endpgm ; EG: BCNT_INT @@ -174,7 +174,7 @@ define amdgpu_kernel void @v_ctpop_v16i32(<16 x i32> addrspace(1)* noalias %out, ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -189,7 +189,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant(i32 addrspace(1)* noa ; FUNC-LABEL: {{^}}v_ctpop_i32_add_inline_constant_inv: ; GCN: buffer_load_dword [[VAL:v[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], 4 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], 4 ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -206,7 +206,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_inline_constant_inv(i32 addrspace(1)* ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: v_mov_b32_e32 [[LIT:v[0-9]+]], 0x1869f ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] +; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[LIT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %out, i32 addrspace(1)* noalias %in) nounwind { @@ -220,7 +220,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_literal(i32 addrspace(1)* noalias %ou ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var: ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -236,7 +236,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var(i32 addrspace(1)* noalias %out, i ; FUNC-LABEL: {{^}}v_ctpop_i32_add_var_inv: ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], ; GCN-DAG: s_load_dword [[VAR:s[0-9]+]], -; GCN: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm @@ -253,7 +253,7 @@ define amdgpu_kernel void @v_ctpop_i32_add_var_inv(i32 addrspace(1)* noalias %ou ; GCN-DAG: buffer_load_dword [[VAL:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], {{0$}} ; GCN-DAG: buffer_load_dword [[VAR:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:16 ; SI: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] -; VI: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] +; VI: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], [[VAL]], [[VAR]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm diff --git a/test/CodeGen/AMDGPU/ctpop64.ll b/test/CodeGen/AMDGPU/ctpop64.ll index 2610684ad9ee..f18bd9fd8174 100644 --- a/test/CodeGen/AMDGPU/ctpop64.ll +++ b/test/CodeGen/AMDGPU/ctpop64.ll @@ -26,9 +26,9 @@ define amdgpu_kernel void @s_ctpop_i64(i32 addrspace(1)* noalias %out, i64 %val) ; FUNC-LABEL: {{^}}v_ctpop_i64: ; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 +; GCN: v_bcnt_u32_b32{{(_e64)*}} 
[[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN: buffer_store_dword [[RESULT]], ; GCN: s_endpgm define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrspace(1)* noalias %in) nounwind { @@ -41,9 +41,9 @@ define amdgpu_kernel void @v_ctpop_i64(i32 addrspace(1)* noalias %out, i64 addrs ; FUNC-LABEL: {{^}}v_ctpop_i64_user: ; GCN: buffer_load_dwordx2 v{{\[}}[[LOVAL:[0-9]+]]:[[HIVAL:[0-9]+]]{{\]}}, -; GCN: v_bcnt_u32_b32_e64 [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 +; GCN: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT:v[0-9]+]], v[[LOVAL]], 0 ; SI-NEXT: v_bcnt_u32_b32_e32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] -; VI-NEXT: v_bcnt_u32_b32_e64 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] +; VI-NEXT: v_bcnt_u32_b32 [[RESULT:v[0-9]+]], v[[HIVAL]], [[MIDRESULT]] ; GCN-DAG: v_or_b32_e32 v[[RESULT_LO:[0-9]+]], s{{[0-9]+}}, [[RESULT]] ; GCN-DAG: v_mov_b32_e32 v[[RESULT_HI:[0-9]+]], s{{[0-9]+}} ; GCN: buffer_store_dwordx2 v{{\[}}[[RESULT_LO]]:[[RESULT_HI]]{{\]}} @@ -171,11 +171,11 @@ define amdgpu_kernel void @s_ctpop_i65(i32 addrspace(1)* noalias %out, i65 %val) ; FUNC-LABEL: {{^}}v_ctpop_i128: ; GCN: buffer_load_dwordx4 v{{\[}}[[VAL0:[0-9]+]]:[[VAL3:[0-9]+]]{{\]}}, off, s{{\[[0-9]+:[0-9]+\]}}, 0{{$}} -; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0 -; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] +; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT0:v[0-9]+]], v{{[0-9]+}}, 0 +; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT1:v[0-9]+]], v[[VAL3]], [[MIDRESULT0]] -; GCN-DAG: v_bcnt_u32_b32_e64 [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0 -; GCN-DAG: v_bcnt_u32_b32{{_e32|_e64}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]] +; GCN-DAG: v_bcnt_u32_b32{{(_e64)*}} [[MIDRESULT2:v[0-9]+]], v[[VAL0]], 0 +; GCN-DAG: v_bcnt_u32_b32{{(_e32)*(_e64)*}} [[MIDRESULT3:v[0-9]+]], v{{[0-9]+}}, [[MIDRESULT2]] ; GCN: v_add_i32_e32 [[RESULT:v[0-9]+]], vcc, [[MIDRESULT1]], [[MIDRESULT2]] diff --git a/test/CodeGen/AMDGPU/fneg-combines.ll b/test/CodeGen/AMDGPU/fneg-combines.ll index 1c0e9a2f13ce..66bf9d0ffb00 100644 --- a/test/CodeGen/AMDGPU/fneg-combines.ll +++ b/test/CodeGen/AMDGPU/fneg-combines.ll @@ -1471,11 +1471,10 @@ define amdgpu_kernel void @v_fneg_mul_legacy_store_use_mul_legacy_f32(float addr ; GCN-LABEL: {{^}}v_fneg_mul_legacy_multi_use_mul_legacy_f32: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN-DAG: v_mul_legacy_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] -; GCN-DAG: v_xor_b32_e32 [[NEG_MUL_LEGACY:v[0-9]+]], 0x80000000, [[ADD]] -; GCN: v_mul_legacy_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] -; GCN-NEXT: buffer_store_dword [[NEG_MUL_LEGACY]] -; GCN: buffer_store_dword [[MUL]] +; GCN: v_mul_legacy_f32_e64 [[ADD:v[0-9]+]], [[A]], -[[B]] +; GCN-NEXT: v_mul_legacy_f32_e64 [[MUL:v[0-9]+]], -[[ADD]], 4.0 +; GCN-NEXT: buffer_store_dword [[ADD]] +; GCN-NEXT: buffer_store_dword [[MUL]] define amdgpu_kernel void @v_fneg_mul_legacy_multi_use_mul_legacy_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/test/CodeGen/AMDGPU/fneg.f16.ll b/test/CodeGen/AMDGPU/fneg.f16.ll index 626a0b50cce8..ed36666db807 100644 --- a/test/CodeGen/AMDGPU/fneg.f16.ll +++ 
b/test/CodeGen/AMDGPU/fneg.f16.ll
@@ -1,6 +1,6 @@
; RUN: llc -march=amdgcn -mcpu=kaveri -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=CI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN %s
-; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN %s
+; RUN: llc -march=amdgcn -mcpu=tonga -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=VI -check-prefix=CIVI -check-prefix=GCN -check-prefix=GFX89 %s
+; RUN: llc -march=amdgcn -mcpu=gfx901 -mtriple=amdgcn--amdhsa -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 -check-prefix=GCN -check-prefix=GFX89 %s
; FIXME: Should be able to do scalar op
; GCN-LABEL: {{^}}s_fneg_f16:
@@ -129,6 +129,41 @@ define amdgpu_kernel void @v_fneg_fold_v2f16(<2 x half> addrspace(1)* %out, <2 x
ret void
}
+; GCN-LABEL: {{^}}v_extract_fneg_fold_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; CI-DAG: v_mul_f32_e32 v{{[0-9]+}}, -4.0, v{{[0-9]+}}
+; CI-DAG: v_sub_f32_e32 v{{[0-9]+}}, 2.0, v{{[0-9]+}}
+
+; GFX89: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[VAL]]
+; GFX89-DAG: v_mul_f16_e32 v{{[0-9]+}}, -4.0, [[VAL]]
+; GFX89-DAG: v_sub_f16_e32 v{{[0-9]+}}, 2.0, [[ELT1]]
+define amdgpu_kernel void @v_extract_fneg_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %in
+ %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
+ %elt0 = extractelement <2 x half> %fneg, i32 0
+ %elt1 = extractelement <2 x half> %fneg, i32 1
+
+ %fmul0 = fmul half %elt0, 4.0
+ %fadd1 = fadd half %elt1, 2.0
+ store volatile half %fmul0, half addrspace(1)* undef
+ store volatile half %fadd1, half addrspace(1)* undef
+ ret void
+}
+
+; GCN-LABEL: {{^}}v_extract_fneg_no_fold_v2f16:
+; GCN: flat_load_dword [[VAL:v[0-9]+]]
+; GCN: v_xor_b32_e32 [[NEG:v[0-9]+]], 0x80008000, [[VAL]]
+; GCN: v_lshrrev_b32_e32 [[ELT1:v[0-9]+]], 16, [[NEG]]
+define amdgpu_kernel void @v_extract_fneg_no_fold_v2f16(<2 x half> addrspace(1)* %in) #0 {
+ %val = load <2 x half>, <2 x half> addrspace(1)* %in
+ %fneg = fsub <2 x half> <half -0.0, half -0.0>, %val
+ %elt0 = extractelement <2 x half> %fneg, i32 0
+ %elt1 = extractelement <2 x half> %fneg, i32 1
+ store volatile half %elt0, half addrspace(1)* undef
+ store volatile half %elt1, half addrspace(1)* undef
+ ret void
+}
+
declare i32 @llvm.amdgcn.workitem.id.x() #1
attributes #0 = { nounwind }
diff --git a/test/CodeGen/AMDGPU/inserted-wait-states.mir b/test/CodeGen/AMDGPU/inserted-wait-states.mir
index c6fe6debd225..ff9fcd1c693f 100644
--- a/test/CodeGen/AMDGPU/inserted-wait-states.mir
+++ b/test/CodeGen/AMDGPU/inserted-wait-states.mir
@@ -246,15 +246,15 @@ body: |
S_BRANCH %bb.1
bb.1:
- FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX2 %vgpr0_vgpr1, %vgpr2_vgpr3, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
- FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX3 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
- FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, 0, implicit %exec, implicit %flat_scr
+ FLAT_STORE_DWORDX4 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr
%vgpr3 = V_MOV_B32_e32 0, implicit %exec
-
FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr + FLAT_ATOMIC_CMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, implicit %exec, implicit %flat_scr %vgpr3 = V_MOV_B32_e32 0, implicit %exec - FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, 0, implicit %exec, implicit %flat_scr + FLAT_ATOMIC_FCMPSWAP_X2 %vgpr0_vgpr1, %vgpr2_vgpr3_vgpr4_vgpr5, 0, implicit %exec, implicit %flat_scr %vgpr3 = V_MOV_B32_e32 0, implicit %exec S_ENDPGM diff --git a/test/CodeGen/AMDGPU/limit-coalesce.mir b/test/CodeGen/AMDGPU/limit-coalesce.mir index 106a96e32dc3..a0d2d6c097a2 100644 --- a/test/CodeGen/AMDGPU/limit-coalesce.mir +++ b/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -57,15 +57,15 @@ body: | %4.sub1 = COPY %3.sub0 undef %5.sub0 = COPY %4.sub1 %5.sub1 = COPY %4.sub0 - FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORDX2 %vgpr0_vgpr1, killed %5, 0, 0, implicit %exec, implicit %flat_scr %6 = IMPLICIT_DEF undef %7.sub0_sub1 = COPY %6 %7.sub2 = COPY %3.sub0 - FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORDX3 %vgpr0_vgpr1, killed %7, 0, 0, implicit %exec, implicit %flat_scr %8 = IMPLICIT_DEF undef %9.sub0_sub1_sub2 = COPY %8 %9.sub3 = COPY %3.sub0 - FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORDX4 %vgpr0_vgpr1, killed %9, 0, 0, implicit %exec, implicit %flat_scr ... diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll index b92eb34750d9..7179d02fc6dd 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -7,7 +7,7 @@ ; GCN-DAG: s_load_dword [[SY:s[0-9]+]], s[0:1], 0x{{c|30}} ; GCN: v_mov_b32_e32 [[VY:v[0-9]+]], [[SY]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[X]], [[VY]] -; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[VY]] +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, [[X]], [[VY]] define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float %x, float %y) #0 { %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %y) store <2 x half> %result, <2 x half> addrspace(1)* %out @@ -16,7 +16,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, ; GCN-LABEL: {{^}}s_cvt_pkrtz_samereg_v2f16_f32: ; GCN: s_load_dword [[X:s[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[X]], [[X]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, [[X]], [[X]] define amdgpu_kernel void @s_cvt_pkrtz_samereg_v2f16_f32(<2 x half> addrspace(1)* %out, float %x) #0 { %result = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %x, float %x) store <2 x half> %result, <2 x half> addrspace(1)* %out @@ -39,7 +39,7 @@ define amdgpu_kernel void @s_cvt_pkrtz_undef_undef(<2 x half> addrspace(1)* %out ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, [[A]], [[B]] -; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], [[B]] +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, [[A]], [[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -55,7 +55,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32(<2 x half> addrspace(1)* %out, ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_reg_imm: ; 
GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], 1.0 +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], 1.0 define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -70,7 +70,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_reg_imm(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_imm_reg: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; SI: v_cvt_pkrtz_f16_f32_e32 v{{[0-9]+}}, 1.0, [[A]] -; GFX89: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, 1.0, [[A]] +; GFX89: v_cvt_pkrtz_f16_f32 v{{[0-9]+}}, 1.0, [[A]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -85,7 +85,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_imm_reg(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], [[B]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], [[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -103,7 +103,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_hi: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, [[A]], -[[B]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, [[A]], -[[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -121,7 +121,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_hi(<2 x half> addrspace(1) ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_lo_hi: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -[[A]], -[[B]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, -[[A]], -[[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 @@ -140,7 +140,7 @@ define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_lo_hi(<2 x half> addrspace ; GCN-LABEL: {{^}}v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi: ; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] ; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] -; GCN: v_cvt_pkrtz_f16_f32_e64 v{{[0-9]+}}, -|[[A]]|, -[[B]] +; GCN: v_cvt_pkrtz_f16_f32{{(_e64)*}} v{{[0-9]+}}, -|[[A]]|, -[[B]] define amdgpu_kernel void @v_cvt_pkrtz_v2f16_f32_fneg_fabs_lo_fneg_hi(<2 x half> addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { %tid = call i32 @llvm.amdgcn.workitem.id.x() %tid.ext = sext i32 %tid to i64 diff --git a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll index ab76c870796b..144c8f428ab0 100644 --- a/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll +++ 
b/test/CodeGen/AMDGPU/llvm.amdgcn.mbcnt.ll @@ -2,9 +2,9 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s ; GCN-LABEL: {{^}}mbcnt_intrinsics: -; GCN: v_mbcnt_lo_u32_b32_e64 [[LO:v[0-9]+]], -1, 0 +; GCN: v_mbcnt_lo_u32_b32{{(_e64)*}} [[LO:v[0-9]+]], -1, 0 ; SI: v_mbcnt_hi_u32_b32_e32 {{v[0-9]+}}, -1, [[LO]] -; VI: v_mbcnt_hi_u32_b32_e64 {{v[0-9]+}}, -1, [[LO]] +; VI: v_mbcnt_hi_u32_b32 {{v[0-9]+}}, -1, [[LO]] define amdgpu_ps void @mbcnt_intrinsics(<16 x i8> addrspace(2)* inreg %arg, <16 x i8> addrspace(2)* inreg %arg1, <32 x i8> addrspace(2)* inreg %arg2, i32 inreg %arg3) { main_body: %lo = call i32 @llvm.amdgcn.mbcnt.lo(i32 -1, i32 0) #0 diff --git a/test/CodeGen/AMDGPU/madak.ll b/test/CodeGen/AMDGPU/madak.ll index eb4066a2a0a8..5f1fb0e2d732 100644 --- a/test/CodeGen/AMDGPU/madak.ll +++ b/test/CodeGen/AMDGPU/madak.ll @@ -9,7 +9,7 @@ declare float @llvm.fabs.f32(float) nounwind readnone ; GCN-LABEL: {{^}}madak_f32: ; GCN: buffer_load_dword [[VA:v[0-9]+]] ; GCN: buffer_load_dword [[VB:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 +; GCN: v_madak_f32 {{v[0-9]+}}, [[VA]], [[VB]], 0x41200000 define amdgpu_kernel void @madak_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a, float addrspace(1)* noalias %in.b) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -63,7 +63,7 @@ define amdgpu_kernel void @madak_2_use_f32(float addrspace(1)* noalias %out, flo ; GCN-LABEL: {{^}}madak_m_inline_imm_f32: ; GCN: buffer_load_dword [[VA:v[0-9]+]] -; GCN: v_madak_f32_e32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 +; GCN: v_madak_f32 {{v[0-9]+}}, 4.0, [[VA]], 0x41200000 define amdgpu_kernel void @madak_m_inline_imm_f32(float addrspace(1)* noalias %out, float addrspace(1)* noalias %in.a) nounwind { %tid = tail call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %in.a.gep = getelementptr float, float addrspace(1)* %in.a, i32 %tid @@ -198,7 +198,7 @@ define amdgpu_kernel void @no_madak_src1_modifier_f32(float addrspace(1)* noalia ; GCN: s_load_dword [[SGPR0:s[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, {{0xa|0x28}} ; GCN: v_mov_b32_e32 [[SGPR0_VCOPY:v[0-9]+]], [[SGPR0]] ; GCN: buffer_load_dword [[VGPR:v[0-9]+]] -; GCN: v_madak_f32_e32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 +; GCN: v_madak_f32 [[MADAK:v[0-9]+]], 0.5, [[SGPR0_VCOPY]], 0x42280000 ; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], [[VGPR]], [[MADAK]] ; GCN: buffer_store_dword [[MUL]] define amdgpu_kernel void @madak_constant_bus_violation(i32 %arg1, float %sgpr0, float %sgpr1) #0 { diff --git a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll index 9c43a6dc60f4..d7655993a2d9 100644 --- a/test/CodeGen/AMDGPU/promote-alloca-volatile.ll +++ b/test/CodeGen/AMDGPU/promote-alloca-volatile.ll @@ -1,26 +1,26 @@ ; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -amdgpu-promote-alloca < %s | FileCheck %s ; CHECK-LABEL: @volatile_load( -; CHECK: alloca [5 x i32] +; CHECK: alloca [4 x i32] ; CHECK: load volatile i32, i32* define amdgpu_kernel void @volatile_load(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [4 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp + %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32]* %stack, i32 0, i32 
%tmp %load = load volatile i32, i32* %arrayidx1 store i32 %load, i32 addrspace(1)* %out ret void } ; CHECK-LABEL: @volatile_store( -; CHECK: alloca [5 x i32] +; CHECK: alloca [4 x i32] ; CHECK: store volatile i32 %tmp, i32* define amdgpu_kernel void @volatile_store(i32 addrspace(1)* nocapture %out, i32 addrspace(1)* nocapture %in) { entry: - %stack = alloca [5 x i32], align 4 + %stack = alloca [4 x i32], align 4 %tmp = load i32, i32 addrspace(1)* %in, align 4 - %arrayidx1 = getelementptr inbounds [5 x i32], [5 x i32]* %stack, i32 0, i32 %tmp + %arrayidx1 = getelementptr inbounds [4 x i32], [4 x i32]* %stack, i32 0, i32 %tmp store volatile i32 %tmp, i32* %arrayidx1 ret void } diff --git a/test/CodeGen/AMDGPU/v_madak_f16.ll b/test/CodeGen/AMDGPU/v_madak_f16.ll index bfb10503aaea..0148ff470b78 100644 --- a/test/CodeGen/AMDGPU/v_madak_f16.ll +++ b/test/CodeGen/AMDGPU/v_madak_f16.ll @@ -4,7 +4,7 @@ ; GCN-LABEL: {{^}}madak_f16 ; GCN: buffer_load_ushort v[[A_F16:[0-9]+]] ; GCN: buffer_load_ushort v[[B_F16:[0-9]+]] -; VI: v_madak_f16_e32 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}} +; VI: v_madak_f16 v[[R_F16:[0-9]+]], v[[A_F16]], v[[B_F16]], 0x4900{{$}} ; VI: buffer_store_short v[[R_F16]] ; GCN: s_endpgm define amdgpu_kernel void @madak_f16( diff --git a/test/CodeGen/AMDGPU/waitcnt.mir b/test/CodeGen/AMDGPU/waitcnt.mir index 38662e83b359..f754415dccb4 100644 --- a/test/CodeGen/AMDGPU/waitcnt.mir +++ b/test/CodeGen/AMDGPU/waitcnt.mir @@ -51,21 +51,21 @@ name: flat_zero_waitcnt body: | bb.0: successors: %bb.1 - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.global4) - %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16) + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.global4) + %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16) %vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec S_BRANCH %bb.1 bb.1: successors: %bb.2 - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr - %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16) + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr + %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.global16) %vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec S_BRANCH %bb.2 bb.2: - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.flat4) - %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.flat16) + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr :: (load 4 from %ir.flat4) + %vgpr3_vgpr4_vgpr5_vgpr6 = FLAT_LOAD_DWORDX4 %vgpr7_vgpr8, 0, 0, implicit %exec, implicit %flat_scr :: (load 16 from %ir.flat16) %vgpr0 = V_MOV_B32_e32 %vgpr1, implicit %exec S_ENDPGM ... 
@@ -86,11 +86,11 @@ name: single_fallthrough_successor_no_end_block_wait body: | bb.0: successors: %bb.1 - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr bb.1: %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec - FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM ... --- @@ -114,15 +114,15 @@ name: single_branch_successor_not_next_block body: | bb.0: successors: %bb.2 - %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, 0, implicit %exec, implicit %flat_scr + %vgpr0 = FLAT_LOAD_DWORD %vgpr1_vgpr2, 0, 0, implicit %exec, implicit %flat_scr S_BRANCH %bb.2 bb.1: - FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORD %vgpr8_vgpr9, %vgpr10, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM bb.2: %vgpr3_vgpr4 = V_LSHLREV_B64 4, %vgpr7_vgpr8, implicit %exec - FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, 0, implicit %exec, implicit %flat_scr + FLAT_STORE_DWORD %vgpr3_vgpr4, %vgpr0, 0, 0, implicit %exec, implicit %flat_scr S_ENDPGM ... diff --git a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir index 83ab2659ef4a..72c3b715d36e 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-instruction-select.mir @@ -4,6 +4,8 @@ define void @test_sext_s1() { ret void } define void @test_sext_s8() { ret void } define void @test_zext_s16() { ret void } + define void @test_anyext_s8() { ret void } + define void @test_anyext_s16() { ret void } define void @test_trunc_s32_16() { ret void } @@ -149,6 +151,58 @@ body: | ; CHECK: BX_RET 14, _, implicit %r0 ... --- +name: test_anyext_s8 +# CHECK-LABEL: name: test_anyext_s8 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: %r0 + + %0(s8) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = G_ANYEXT %0(s8) + ; CHECK: [[VREGEXT:%[0-9]+]] = COPY [[VREGX]] + + %r0 = COPY %1(s32) + ; CHECK: %r0 = COPY [[VREGEXT]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... +--- +name: test_anyext_s16 +# CHECK-LABEL: name: test_anyext_s16 +legalized: true +regBankSelected: true +selected: false +# CHECK: selected: true +registers: + - { id: 0, class: gprb } + - { id: 1, class: gprb } +body: | + bb.0: + liveins: %r0 + + %0(s16) = COPY %r0 + ; CHECK: [[VREGX:%[0-9]+]] = COPY %r0 + + %1(s32) = G_ANYEXT %0(s16) + ; CHECK: [[VREGEXT:%[0-9]+]] = COPY [[VREGX]] + + %r0 = COPY %1(s32) + ; CHECK: %r0 = COPY [[VREGEXT]] + + BX_RET 14, _, implicit %r0 + ; CHECK: BX_RET 14, _, implicit %r0 +... 
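Illustrative aside (not part of the imported patch): the test_anyext_s8 and test_anyext_s16 cases added above check that instruction selection lowers G_ANYEXT to a plain COPY, because an s8 or s16 value already occupies a full 32-bit GPR on ARM. A minimal IR-level sketch of where such a G_ANYEXT arises, assuming the usual AAPCS handling of a plain (non-signext/zeroext) narrow return value; the function name is hypothetical:

; Returning i8 through r0 leads GlobalISel to insert a G_ANYEXT to s32
; at the ABI boundary; the selector exercised above folds it to a COPY.
define i8 @anyext_demo(i8 %x) {
entry:
  ret i8 %x
}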
+--- name: test_trunc_s32_16 # CHECK-LABEL: name: test_trunc_s32_16 legalized: true @@ -187,9 +241,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } # CHECK-DAG: id: 0, class: gpr # CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gpr +# CHECK-DAG: id: 3, class: gpr +# CHECK-DAG: id: 4, class: gpr +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -200,11 +260,20 @@ body: | %1(s8) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s8) = G_ADD %0, %1 - ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s8) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s8) - ; CHECK: %r0 = COPY [[VREGSUM]] + %3(s32) = G_ANYEXT %1(s8) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_ADD %2, %3 + ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s8) = G_TRUNC %4(s32) + ; CHECK: [[VREGSUMTR:%[0-9]+]] = COPY [[VREGSUM]] + + %r0 = COPY %5(s8) + ; CHECK: %r0 = COPY [[VREGSUMTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -220,9 +289,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } # CHECK-DAG: id: 0, class: gpr # CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gpr +# CHECK-DAG: id: 3, class: gpr +# CHECK-DAG: id: 4, class: gpr +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -233,11 +308,20 @@ body: | %1(s16) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s16) = G_ADD %0, %1 - ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s16) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s16) - ; CHECK: %r0 = COPY [[VREGSUM]] + %3(s32) = G_ANYEXT %1(s16) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_ADD %2, %3 + ; CHECK: [[VREGSUM:%[0-9]+]] = ADDrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s16) = G_TRUNC %4(s32) + ; CHECK: [[VREGSUMTR:%[0-9]+]] = COPY [[VREGSUM]] + + %r0 = COPY %5(s16) + ; CHECK: %r0 = COPY [[VREGSUMTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -352,9 +436,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } # CHECK-DAG: id: 0, class: gpr # CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gpr +# CHECK-DAG: id: 3, class: gpr +# CHECK-DAG: id: 4, class: gpr +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -365,11 +455,20 @@ body: | %1(s8) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s8) = G_SUB %0, %1 - ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s8) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s8) - ; CHECK: %r0 = COPY [[VREGRES]] + %3(s32) = G_ANYEXT %1(s8) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_SUB %2, %3 + ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s8) = G_TRUNC %4(s32) + ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]] + + %r0 = COPY %5(s8) + ; CHECK: %r0 = COPY [[VREGRESTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -385,9 +484,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } + - { id: 3, class: gprb } + - { id: 4, class: 
gprb } + - { id: 5, class: gprb } # CHECK-DAG: id: 0, class: gpr # CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gpr +# CHECK-DAG: id: 3, class: gpr +# CHECK-DAG: id: 4, class: gpr +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -398,11 +503,20 @@ body: | %1(s16) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s16) = G_SUB %0, %1 - ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s16) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s16) - ; CHECK: %r0 = COPY [[VREGRES]] + %3(s32) = G_ANYEXT %1(s16) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_SUB %2, %3 + ; CHECK: [[VREGRES:%[0-9]+]] = SUBrr [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s16) = G_TRUNC %4(s32) + ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]] + + %r0 = COPY %5(s16) + ; CHECK: %r0 = COPY [[VREGRESTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -451,9 +565,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } -# CHECK-DAG: id: 0, class: gprnopc -# CHECK-DAG: id: 1, class: gprnopc + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } +# CHECK-DAG: id: 0, class: gpr +# CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gprnopc +# CHECK-DAG: id: 3, class: gprnopc +# CHECK-DAG: id: 4, class: gprnopc +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -464,11 +584,20 @@ body: | %1(s8) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s8) = G_MUL %0, %1 - ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s8) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s8) - ; CHECK: %r0 = COPY [[VREGRES]] + %3(s32) = G_ANYEXT %1(s8) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_MUL %2, %3 + ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s8) = G_TRUNC %4(s32) + ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]] + + %r0 = COPY %5(s8) + ; CHECK: %r0 = COPY [[VREGRESTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 @@ -484,9 +613,15 @@ registers: - { id: 0, class: gprb } - { id: 1, class: gprb } - { id: 2, class: gprb } -# CHECK-DAG: id: 0, class: gprnopc -# CHECK-DAG: id: 1, class: gprnopc + - { id: 3, class: gprb } + - { id: 4, class: gprb } + - { id: 5, class: gprb } +# CHECK-DAG: id: 0, class: gpr +# CHECK-DAG: id: 1, class: gpr # CHECK-DAG: id: 2, class: gprnopc +# CHECK-DAG: id: 3, class: gprnopc +# CHECK-DAG: id: 4, class: gprnopc +# CHECK-DAG: id: 5, class: gpr body: | bb.0: liveins: %r0, %r1 @@ -497,11 +632,20 @@ body: | %1(s16) = COPY %r1 ; CHECK: [[VREGY:%[0-9]+]] = COPY %r1 - %2(s16) = G_MUL %0, %1 - ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGX]], [[VREGY]], 14, _, _ + %2(s32) = G_ANYEXT %0(s16) + ; CHECK: [[VREGXEXT:%[0-9]+]] = COPY [[VREGX]] - %r0 = COPY %2(s16) - ; CHECK: %r0 = COPY [[VREGRES]] + %3(s32) = G_ANYEXT %1(s16) + ; CHECK: [[VREGYEXT:%[0-9]+]] = COPY [[VREGY]] + + %4(s32) = G_MUL %2, %3 + ; CHECK: [[VREGRES:%[0-9]+]] = MUL [[VREGXEXT]], [[VREGYEXT]], 14, _, _ + + %5(s16) = G_TRUNC %4(s32) + ; CHECK: [[VREGRESTR:%[0-9]+]] = COPY [[VREGRES]] + + %r0 = COPY %5(s16) + ; CHECK: %r0 = COPY [[VREGRESTR]] BX_RET 14, _, implicit %r0 ; CHECK: BX_RET 14, _, implicit %r0 diff --git a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll index 44fe7410b42c..53577dbd76f6 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll +++ 
b/test/CodeGen/ARM/GlobalISel/arm-irtranslator.ll @@ -421,7 +421,7 @@ entry: define arm_aapcscc void @test_indirect_call(void() *%fptr) { ; CHECK-LABEL: name: test_indirect_call ; CHECK: [[FPTR:%[0-9]+]](p0) = COPY %r0 -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK: BLX [[FPTR]](p0), csr_aapcs, implicit-def %lr, implicit %sp ; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp entry: @@ -433,7 +433,7 @@ declare arm_aapcscc void @call_target() define arm_aapcscc void @test_direct_call() { ; CHECK-LABEL: name: test_direct_call -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK: BLX @call_target, csr_aapcs, implicit-def %lr, implicit %sp ; CHECK: ADJCALLSTACKUP 0, 0, 14, _, implicit-def %sp, implicit %sp entry: @@ -447,7 +447,7 @@ define arm_aapcscc i32* @test_call_simple_reg_params(i32 *%a, i32 %b) { ; CHECK-LABEL: name: test_call_simple_reg_params ; CHECK-DAG: [[AVREG:%[0-9]+]](p0) = COPY %r0 ; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r1 -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK-DAG: %r0 = COPY [[BVREG]] ; CHECK-DAG: %r1 = COPY [[AVREG]] ; CHECK: BLX @simple_reg_params_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit %r1, implicit-def %r0 @@ -466,7 +466,7 @@ define arm_aapcscc i32* @test_call_simple_stack_params(i32 *%a, i32 %b) { ; CHECK-LABEL: name: test_call_simple_stack_params ; CHECK-DAG: [[AVREG:%[0-9]+]](p0) = COPY %r0 ; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r1 -; CHECK: ADJCALLSTACKDOWN 8, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 8, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK-DAG: %r0 = COPY [[BVREG]] ; CHECK-DAG: %r1 = COPY [[AVREG]] ; CHECK-DAG: %r2 = COPY [[BVREG]] @@ -496,7 +496,7 @@ define arm_aapcscc signext i16 @test_call_ext_params(i8 %a, i16 %b, i1 %c) { ; CHECK-DAG: [[AVREG:%[0-9]+]](s8) = COPY %r0 ; CHECK-DAG: [[BVREG:%[0-9]+]](s16) = COPY %r1 ; CHECK-DAG: [[CVREG:%[0-9]+]](s1) = COPY %r2 -; CHECK: ADJCALLSTACKDOWN 20, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 20, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK: [[SEXTA:%[0-9]+]](s32) = G_SEXT [[AVREG]](s8) ; CHECK: %r0 = COPY [[SEXTA]] ; CHECK: [[ZEXTA:%[0-9]+]](s32) = G_ZEXT [[AVREG]](s8) @@ -547,7 +547,7 @@ define arm_aapcs_vfpcc double @test_call_vfpcc_fp_params(double %a, float %b) { ; CHECK-LABEL: name: test_call_vfpcc_fp_params ; CHECK-DAG: [[AVREG:%[0-9]+]](s64) = COPY %d0 ; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %s2 -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK-DAG: %s0 = COPY [[BVREG]] ; CHECK-DAG: %d1 = COPY [[AVREG]] ; CHECK: BLX @vfpcc_fp_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %s0, implicit %d1, implicit-def %d0 @@ -569,7 +569,7 @@ define arm_aapcscc double @test_call_aapcs_fp_params(double %a, float %b) { ; LITTLE-DAG: [[AVREG:%[0-9]+]](s64) = G_SEQUENCE [[A1]](s32), 0, [[A2]](s32), 32 ; BIG-DAG: [[AVREG:%[0-9]+]](s64) = G_SEQUENCE [[A2]](s32), 0, [[A1]](s32), 32 ; CHECK-DAG: [[BVREG:%[0-9]+]](s32) = COPY %r2 -; CHECK: ADJCALLSTACKDOWN 16, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 16, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK-DAG: %r0 = COPY [[BVREG]] ; CHECK-DAG: 
[[A1:%[0-9]+]](s32) = G_EXTRACT [[AVREG]](s64), 0 ; CHECK-DAG: [[A2:%[0-9]+]](s32) = G_EXTRACT [[AVREG]](s64), 32 @@ -608,7 +608,7 @@ declare arm_aapcscc float @different_call_conv_target(float) define arm_aapcs_vfpcc float @test_call_different_call_conv(float %x) { ; CHECK-LABEL: name: test_call_different_call_conv ; CHECK: [[X:%[0-9]+]](s32) = COPY %s0 -; CHECK: ADJCALLSTACKDOWN 0, 14, _, implicit-def %sp, implicit %sp +; CHECK: ADJCALLSTACKDOWN 0, 0, 14, _, implicit-def %sp, implicit %sp ; CHECK: %r0 = COPY [[X]] ; CHECK: BLX @different_call_conv_target, csr_aapcs, implicit-def %lr, implicit %sp, implicit %r0, implicit-def %r0 ; CHECK: [[R:%[0-9]+]](s32) = COPY %r0 diff --git a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir index 625d35acf17b..f6ac92597cb2 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-legalizer.mir @@ -91,8 +91,9 @@ body: | %0(s8) = COPY %r0 %1(s8) = COPY %r1 %2(s8) = G_ADD %0, %1 - ; G_ADD with s8 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}} + ; G_ADD with s8 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_ADD {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) BX_RET 14, _, implicit %r0 ... @@ -115,8 +116,9 @@ body: | %0(s16) = COPY %r0 %1(s16) = COPY %r1 %2(s16) = G_ADD %0, %1 - ; G_ADD with s16 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}} + ; G_ADD with s16 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_ADD {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_ADD {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) BX_RET 14, _, implicit %r0 @@ -165,8 +167,9 @@ body: | %0(s8) = COPY %r0 %1(s8) = COPY %r1 %2(s8) = G_SUB %0, %1 - ; G_SUB with s8 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}} + ; G_SUB with s8 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_SUB {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) BX_RET 14, _, implicit %r0 ... @@ -189,8 +192,9 @@ body: | %0(s16) = COPY %r0 %1(s16) = COPY %r1 %2(s16) = G_SUB %0, %1 - ; G_SUB with s16 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}} + ; G_SUB with s16 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_SUB {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_SUB {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) BX_RET 14, _, implicit %r0 @@ -239,8 +243,9 @@ body: | %0(s8) = COPY %r0 %1(s8) = COPY %r1 %2(s8) = G_MUL %0, %1 - ; G_MUL with s8 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}} + ; G_MUL with s8 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s8) = G_MUL {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s8) BX_RET 14, _, implicit %r0 ... 
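Illustrative aside (not part of the imported patch): the legalizer hunks above change s8/s16 G_ADD and G_SUB (and G_MUL below) from legal to widened, so narrow arithmetic is rewritten as extend, 32-bit op, truncate. A minimal source-level sketch of the pattern this affects; the function name is hypothetical:

; After this change, the s8 G_ADD for this function is legalized as
;   G_ANYEXT %a, G_ANYEXT %b -> s32 G_ADD -> G_TRUNC
; rather than being selected directly at 8 bits.
define i8 @widen_demo(i8 %a, i8 %b) {
entry:
  %sum = add i8 %a, %b
  ret i8 %sum
}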
@@ -263,8 +268,9 @@ body: | %0(s16) = COPY %r0 %1(s16) = COPY %r1 %2(s16) = G_MUL %0, %1 - ; G_MUL with s16 is legal, so we should find it unchanged in the output - ; CHECK: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}} + ; G_MUL with s16 should widen + ; CHECK: {{%[0-9]+}}(s32) = G_MUL {{%[0-9]+, %[0-9]+}} + ; CHECK-NOT: {{%[0-9]+}}(s16) = G_MUL {{%[0-9]+, %[0-9]+}} %r0 = COPY %2(s16) BX_RET 14, _, implicit %r0 diff --git a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir index 4e94fb4e3481..dfccc47c277c 100644 --- a/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir +++ b/test/CodeGen/ARM/GlobalISel/arm-regbankselect.mir @@ -25,6 +25,9 @@ define void @test_constants() { ret void } + define void @test_anyext_s8_32() { ret void } + define void @test_anyext_s16_32() { ret void } + define void @test_trunc_s32_16() { ret void } define void @test_fadd_s32() #0 { ret void } @@ -71,19 +74,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s16) = COPY %r0 %1(s16) = COPY %r1 - %2(s16) = G_ADD %0, %1 - %r0 = COPY %2(s16) + %2(s32) = G_ANYEXT %0(s16) + %3(s32) = G_ANYEXT %1(s16) + %4(s32) = G_ADD %2, %3 + %5(s16) = G_TRUNC %4(s32) + %r0 = COPY %5(s16) BX_RET 14, _, implicit %r0 ... @@ -97,19 +109,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s8) = COPY %r0 %1(s8) = COPY %r1 - %2(s8) = G_ADD %0, %1 - %r0 = COPY %2(s8) + %2(s32) = G_ANYEXT %0(s8) + %3(s32) = G_ANYEXT %1(s8) + %4(s32) = G_ADD %2, %3 + %5(s8) = G_TRUNC %4(s32) + %r0 = COPY %5(s8) BX_RET 14, _, implicit %r0 ... @@ -123,19 +144,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s1) = COPY %r0 %1(s1) = COPY %r1 - %2(s1) = G_ADD %0, %1 - %r0 = COPY %2(s1) + %2(s32) = G_ANYEXT %0(s1) + %3(s32) = G_ANYEXT %1(s1) + %4(s32) = G_ADD %2, %3 + %5(s1) = G_TRUNC %4(s32) + %r0 = COPY %5(s1) BX_RET 14, _, implicit %r0 ... 
@@ -175,19 +205,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s16) = COPY %r0 %1(s16) = COPY %r1 - %2(s16) = G_SUB %0, %1 - %r0 = COPY %2(s16) + %2(s32) = G_ANYEXT %0(s16) + %3(s32) = G_ANYEXT %1(s16) + %4(s32) = G_SUB %2, %3 + %5(s16) = G_TRUNC %4(s32) + %r0 = COPY %5(s16) BX_RET 14, _, implicit %r0 ... @@ -201,19 +240,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s8) = COPY %r0 %1(s8) = COPY %r1 - %2(s8) = G_SUB %0, %1 - %r0 = COPY %2(s8) + %2(s32) = G_ANYEXT %0(s8) + %3(s32) = G_ANYEXT %1(s8) + %4(s32) = G_SUB %2, %3 + %5(s8) = G_TRUNC %4(s32) + %r0 = COPY %5(s8) BX_RET 14, _, implicit %r0 ... @@ -253,19 +301,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s16) = COPY %r0 %1(s16) = COPY %r1 - %2(s16) = G_MUL %0, %1 - %r0 = COPY %2(s16) + %2(s32) = G_ANYEXT %0(s16) + %3(s32) = G_ANYEXT %1(s16) + %4(s32) = G_MUL %2, %3 + %5(s16) = G_TRUNC %4(s32) + %r0 = COPY %5(s16) BX_RET 14, _, implicit %r0 ... @@ -279,19 +336,28 @@ selected: false # CHECK: - { id: 0, class: gprb } # CHECK: - { id: 1, class: gprb } # CHECK: - { id: 2, class: gprb } +# CHECK: - { id: 3, class: gprb } +# CHECK: - { id: 4, class: gprb } +# CHECK: - { id: 5, class: gprb } registers: - { id: 0, class: _ } - { id: 1, class: _ } - { id: 2, class: _ } + - { id: 3, class: _ } + - { id: 4, class: _ } + - { id: 5, class: _ } body: | bb.0: liveins: %r0, %r1 %0(s8) = COPY %r0 %1(s8) = COPY %r1 - %2(s8) = G_MUL %0, %1 - %r0 = COPY %2(s8) + %2(s32) = G_ANYEXT %0(s8) + %3(s32) = G_ANYEXT %1(s8) + %4(s32) = G_MUL %2, %3 + %5(s8) = G_TRUNC %4(s32) + %r0 = COPY %5(s8) BX_RET 14, _, implicit %r0 ... @@ -500,6 +566,48 @@ body: | BX_RET 14, _, implicit %r0 ... --- +name: test_anyext_s8_32 +# CHECK-LABEL: name: test_anyext_s8_32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: %r0 + + %0(s8) = COPY %r0 + %1(s32) = G_ANYEXT %0(s8) + %r0 = COPY %1(s32) + BX_RET 14, _, implicit %r0 +... 
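Illustrative aside (not part of the imported patch): the regbankselect tests are updated to the same widened shape and check that every virtual register in the new G_ANYEXT/op/G_TRUNC chain is assigned to the gprb bank. A sketch tying the three GlobalISel stages together on one hypothetical function:

; Expected pipeline for a narrow multiply after this patch:
;   legalizer:     s16 G_MUL -> two G_ANYEXTs, s32 G_MUL, G_TRUNC
;   regbankselect: all of the new vregs land on gprb (as checked above)
;   selection:     the extends and truncates become plain COPYs
define i16 @mul_demo(i16 %a, i16 %b) {
entry:
  %p = mul i16 %a, %b
  ret i16 %p
}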
+--- +name: test_anyext_s16_32 +# CHECK-LABEL: name: test_anyext_s16_32 +legalized: true +regBankSelected: false +selected: false +# CHECK: registers: +# CHECK: - { id: 0, class: gprb } +# CHECK: - { id: 1, class: gprb } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } +body: | + bb.0: + liveins: %r0 + + %0(s16) = COPY %r0 + %1(s32) = G_ANYEXT %0(s16) + %r0 = COPY %1(s32) + BX_RET 14, _, implicit %r0 +... +--- name: test_trunc_s32_16 # CHECK-LABEL: name: test_trunc_s32_16 legalized: true diff --git a/test/CodeGen/ARM/divmod-eabi.ll b/test/CodeGen/ARM/divmod-eabi.ll index ce5a1df05e3f..77ffc46e6a69 100644 --- a/test/CodeGen/ARM/divmod-eabi.ll +++ b/test/CodeGen/ARM/divmod-eabi.ll @@ -16,17 +16,15 @@ ; RUN: llc -mtriple armv7-linux-gnueabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=EABI ; RUN: llc -mtriple armv7-linux-musleabi %s -o - | FileCheck %s --check-prefix=EABI ; RUN: llc -mtriple armv7-linux-musleabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=EABI -; RUN: llc -mtriple armv7-apple-darwin %s -o - | FileCheck %s --check-prefixes=DARWIN,DARWIN-DEFAULT -; RUN: llc -mtriple armv7-apple-darwin %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=DARWIN,DARWIN-O0 -; FIXME: long-term, we will use "-apple-macho" and won't need this exception: -; RUN: llc -mtriple armv7-apple-darwin-eabi %s -o - | FileCheck %s --check-prefixes=DARWIN,DARWIN-DEFAULT -; RUN: llc -mtriple armv7-apple-darwin-eabi %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=DARWIN,DARWIN-O0 +; RUN: llc -mtriple armv7-apple-darwin %s -o - | FileCheck %s --check-prefixes=DARWIN +; RUN: llc -mtriple armv7-apple-darwin %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefix=DARWIN-O0 ; RUN: llc -mtriple thumbv7-windows %s -o - | FileCheck %s --check-prefixes=WINDOWS,WINDOWS-DEFAULT ; RUN: llc -mtriple thumbv7-windows %s -o - -O0 -optimize-regalloc | FileCheck %s --check-prefixes=WINDOWS,WINDOWS-O0 define signext i16 @f16(i16 signext %a, i16 signext %b) { ; EABI-LABEL: f16: ; DARWIN-LABEL: f16: +; DARWIN-O0-LABEL: f16: ; WINDOWS-LABEL: f16: entry: %conv = sext i16 %a to i32 @@ -36,11 +34,9 @@ entry: ; EABI: __aeabi_idivmod ; EABI: mov [[div:r[0-9]+]], r0 ; EABI: mov [[rem:r[0-9]+]], r1 -; DARWIN: ___divsi3 -; DARWIN: mov [[div:r[0-9]+]], r0 -; DARWIN: __modsi3 -; DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]] -; DARWIN-O0: mov [[rem:r[0-9]+]], r0 +; DARWIN: __divmodsi4 +; DARWIN-O0: __divsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: __rt_sdiv ; WINDOWS-DEFAULT: add [[sum:r[0-9]+]], r1 @@ -48,16 +44,13 @@ entry: %rem8 = srem i32 %conv1, %conv ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem, %div %add13 = add nsw i32 %add, %rem8 %conv14 = trunc i32 %add13 to i16 ; EABI: add r0{{.*}}r1 ; EABI: sxth r0, r0 -; DARWIN-DEFAULT: add [[res:r[0-9]+]], [[sum]], r0 -; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]] -; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0 -; DARWIN: sxth r0, [[res]] ; WINDOWS-DEFAULT: adds [[sum1:r[0-9]+]], [[sum]], r1 ; WINDOWS-O0: adds [[sum:r[0-9]+]], [[rem]], ; WINDOWS-O0: add [[sum1:r[0-9]+]], r1 @@ -68,6 +61,7 @@ entry: define i32 @f32(i32 %a, i32 %b) { ; EABI-LABEL: f32: ; DARWIN-LABEL: f32: +; DARWIN-O0-LABEL: f32: ; WINDOWS-LABEL: f32: entry: %div = sdiv i32 %a, %b @@ -75,11 +69,9 @@ entry: ; EABI: __aeabi_idivmod ; EABI: mov [[div:r[0-9]+]], r0 ; EABI: mov [[rem:r[0-9]+]], r1 -; DARWIN: ___divsi3 -; DARWIN: mov [[div:r[0-9]+]], r0 -; DARWIN: __modsi3 -; 
DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]] -; DARWIN-O0: mov [[rem:r[0-9]+]], r0 +; DARWIN: ___divmodsi4 +; DARWIN-O0: __divsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[div:r[0-9]+]], r0 ; WINDOWS: __rt_sdiv @@ -87,13 +79,11 @@ entry: %rem1 = srem i32 %b, %a ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem, %div %add2 = add nsw i32 %add, %rem1 ; EABI: add r0{{.*}}r1 -; DARWIN-DEFAULT: add r0, [[sum]], r0 -; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]] -; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0 ; WINDOWS-DEFAULT: adds r0, [[div]], r1 ; WINDOWS-O0: adds [[sum:r[0-9]+]], [[rem]], [[div]] ; WINDOWS-O0: add [[sum]], r1 @@ -103,16 +93,15 @@ entry: define i32 @uf(i32 %a, i32 %b) { ; EABI-LABEL: uf: ; DARWIN-LABEL: uf: +; DARWIN-O0-LABEL: uf: ; WINDOWS-LABEL: uf: entry: %div = udiv i32 %a, %b %rem = urem i32 %a, %b ; EABI: __aeabi_uidivmod -; DARWIN: ___udivsi3 -; DARWIN: mov [[div:r[0-9]+]], r0 -; DARWIN: __umodsi3 -; DARWIN-DEFAULT: add [[sum:r[0-9]+]], r0, [[div]] -; DARWIN-O0: mov [[rem:r[0-9]+]], r0 +; DARWIN: __udivmodsi4 +; DARWIN-O0: __udivsi3 +; DARWIN-O0: __umodsi3 ; WINDOWS: __rt_udiv ; WINDOWS: mov [[div:r[0-9]+]], r0 ; WINDOWS: __rt_udiv @@ -120,13 +109,11 @@ entry: %rem1 = urem i32 %b, %a ; EABI: __aeabi_uidivmod ; DARWIN: __umodsi3 +; DARWIN-O0: __umodsi3 ; WINDOWS: __rt_udiv %add = add nuw i32 %rem, %div %add2 = add nuw i32 %add, %rem1 ; EABI: add r0{{.*}}r1 -; DARWIN-DEFAULT: add r0, [[sum]], r0 -; DARWIN-O0: add [[sum:r[0-9]+]], [[rem]], [[div]] -; DARWIN-O0: add [[res:r[0-9]+]], [[sum]], r0 ; WINDOWS-DEFAULT: adds [[sum:r[0-9]+]], [[div]], r1 ; WINDOWS-O0: adds [[sum:r[0-9]+]], ; WINDOWS-O0: add [[sum]], r1 @@ -136,6 +123,7 @@ entry: define i64 @longf(i64 %a, i64 %b) { ; EABI-LABEL: longf: ; DARWIN-LABEL: longf: +; DARWIN-O0-LABEL: longf: ; WINDOWS-LABEL: longf: entry: %div = sdiv i64 %a, %b @@ -148,6 +136,8 @@ entry: ; DARWIN: mov [[div1:r[0-9]+]], r0 ; DARWIN: mov [[div2:r[0-9]+]], r1 ; DARWIN: __moddi3 +; DARWIN-O0: __divdi3 +; DARWIN-O0: __moddi3 ; WINDOWS: __rt_sdiv64 %add = add nsw i64 %rem, %div ; DARWIN: adds r0{{.*}}[[div1]] @@ -160,20 +150,19 @@ entry: define i16 @shortf(i16 %a, i16 %b) { ; EABI-LABEL: shortf: ; DARWIN-LABEL: shortf: +; DARWIN-O0-LABEL: shortf: ; WINDOWS-LABEL: shortf: entry: %div = sdiv i16 %a, %b %rem = srem i16 %a, %b ; EABI: __aeabi_idivmod -; DARWIN: ___divsi3 -; DARWIN: mov [[div1:r[0-9]+]], r0 -; DARWIN: __modsi3 +; DARWIN: ___divmodsi4 +; DARWIN-O0: __divmodsi4 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[div:r[0-9]+]], r0 ; WINDOWS: __rt_sdiv %add = add nsw i16 %rem, %div ; EABI: add r0, r1 -; DARWIN: add r0{{.*}}[[div1]] ; WINDOWS: adds r0, r1, [[div]] ret i16 %add } @@ -181,20 +170,20 @@ entry: define i32 @g1(i32 %a, i32 %b) { ; EABI-LABEL: g1: ; DARWIN-LABEL: g1: +; DARWIN-O0-LABEL: g1: ; WINDOWS-LABEL: g1: entry: %div = sdiv i32 %a, %b %rem = srem i32 %a, %b ; EABI: __aeabi_idivmod -; DARWIN: ___divsi3 -; DARWIN: mov [[sum:r[0-9]+]], r0 -; DARWIN: __modsi3 +; DARWIN: ___divmodsi4 +; DARWIN-O0: __divsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[div:r[0-9]+]], r0 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem, %div ; EABI: add r0{{.*}}r1 -; DARWIN: add r0{{.*}}[[sum]] ; WINDOWS: adds r0, r1, [[div]] ret i32 %add } @@ -203,11 +192,13 @@ entry: define i32 @g2(i32 %a, i32 %b) { ; EABI-LABEL: g2: ; DARWIN-LABEL: g2: +; DARWIN-O0-LABEL: g2: ; WINDOWS-LABEL: g2: entry: %rem = srem i32 %a, %b ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; 
DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ret i32 %rem ; EABI: mov r0, r1 @@ -217,6 +208,7 @@ entry: define i32 @g3(i32 %a, i32 %b) { ; EABI-LABEL: g3: ; DARWIN-LABEL: g3: +; DARWIN-O0-LABEL: g3: ; WINDOWS-LABEL: g3: entry: %rem = srem i32 %a, %b @@ -224,11 +216,13 @@ entry: ; EABI: mov [[mod:r[0-9]+]], r1 ; DARWIN: __modsi3 ; DARWIN: mov [[sum:r[0-9]+]], r0 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[rem:r[0-9]+]], r1 %rem1 = srem i32 %b, %rem ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem1, %rem ; EABI: add r0, r1, [[mod]] @@ -240,6 +234,7 @@ entry: define i32 @g4(i32 %a, i32 %b) { ; EABI-LABEL: g4: ; DARWIN-LABEL: g4: +; DARWIN-O0-LABEL: g4: ; WINDOWS-LABEL: g4: entry: %div = sdiv i32 %a, %b @@ -247,11 +242,13 @@ entry: ; EABI: mov [[div:r[0-9]+]], r0 ; DARWIN: ___divsi3 ; DARWIN: mov [[sum:r[0-9]+]], r0 +; DARWIN-O0: __divsi3 ; WINDOWS: __rt_sdiv ; WINDOWS: mov [[div:r[0-9]+]], r0 %rem = srem i32 %b, %div ; EABI: __aeabi_idivmod ; DARWIN: __modsi3 +; DARWIN-O0: __modsi3 ; WINDOWS: __rt_sdiv %add = add nsw i32 %rem, %div ; EABI: add r0, r1, [[div]] diff --git a/test/CodeGen/ARM/divmod.ll b/test/CodeGen/ARM/divmod.ll index 9336d0c477d1..ffc1ed09cbf0 100644 --- a/test/CodeGen/ARM/divmod.ll +++ b/test/CodeGen/ARM/divmod.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8 ; RUN: llc < %s -mtriple=arm-apple-ios5.0 -mcpu=swift | FileCheck %s -check-prefix=SWIFT +; RUN: llc < %s -mtriple=thumbv7-apple-macho -mcpu=cortex-a8 | FileCheck %s -check-prefix=A8 ; rdar://12481395 diff --git a/test/CodeGen/AVR/select-mbb-placement-bug.ll b/test/CodeGen/AVR/select-mbb-placement-bug.ll new file mode 100644 index 000000000000..ca7ec1ab831c --- /dev/null +++ b/test/CodeGen/AVR/select-mbb-placement-bug.ll @@ -0,0 +1,35 @@ +; RUN: llc -mcpu=atmega328p < %s -march=avr | FileCheck %s + +; CHECK-LABEL: loopy +define internal fastcc void @loopy() { + +; In this case, when we expand `Select8`/`Select16`, we should be +; replacing the existing MBB instead of adding a new one. 
+;
+; https://github.com/avr-rust/rust/issues/49
+
+; CHECK: LBB0_1:
+; CHECK: LBB0_2:
+; CHECK-NOT: LBB0_3:
+start:
+ br label %bb7.preheader
+
+bb7.preheader: ; preds = %bb10, %start
+ %i = phi i8 [ 0, %start ], [ %j, %bb10 ]
+ %j = phi i8 [ 1, %start ], [ %next, %bb10 ]
+ br label %bb10
+
+bb4: ; preds = %bb10
+ ret void
+
+bb10: ; preds = %bb7.preheader
+ tail call fastcc void @observe(i8 %i, i8 1)
+ %0 = icmp ult i8 %j, 20
+ %1 = zext i1 %0 to i8
+ %next = add i8 %j, %1
+ br i1 %0, label %bb7.preheader, label %bb4
+
+}
+
+declare void @observe(i8, i8);
+
diff --git a/test/CodeGen/Generic/expand-experimental-reductions.ll b/test/CodeGen/Generic/expand-experimental-reductions.ll
new file mode 100644
index 000000000000..ef813fa7205b
--- /dev/null
+++ b/test/CodeGen/Generic/expand-experimental-reductions.ll
@@ -0,0 +1,210 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -expand-reductions -S | FileCheck %s
+; Tests without a target which should expand all reductions
+declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>)
+
+declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64>)
+
+declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double>)
+declare double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double>)
+
+
+define i64 @add_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @add_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = add <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @mul_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @mul_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = mul <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @and_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @and_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = and <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @or_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @or_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64>
[[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = or <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @xor_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @xor_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = xor <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[BIN_RDX]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define float @fadd_f32(<4 x float> %vec) {
+; CHECK-LABEL: @fadd_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[VEC:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fadd fast <4 x float> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fadd fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret float [[TMP0]]
+;
+entry:
+ %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
+ ret float %r
+}
+
+define float @fadd_f32_strict(<4 x float> %vec) {
+; CHECK-LABEL: @fadd_f32_strict(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[R:%.*]] = call float @llvm.experimental.vector.reduce.fadd.f32.f32.v4f32(float undef, <4 x float> [[VEC:%.*]])
+; CHECK-NEXT: ret float [[R]]
+;
+entry:
+ %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
+ ret float %r
+}
+
+define float @fmul_f32(<4 x float> %vec) {
+; CHECK-LABEL: @fmul_f32(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[VEC:%.*]], <4 x float> undef, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX:%.*]] = fmul fast <4 x float> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[BIN_RDX]], <4 x float> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT: [[BIN_RDX2:%.*]] = fmul fast <4 x float> [[BIN_RDX]], [[RDX_SHUF1]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <4 x float> [[BIN_RDX2]], i32 0
+; CHECK-NEXT: ret float [[TMP0]]
+;
+entry:
+ %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
+ ret float %r
+}
+
+define i64 @smax_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @smax_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp sgt <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @smin_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @smin_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp slt <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x
i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @umax_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @umax_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ugt <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define i64 @umin_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @umin_i64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x i64> [[VEC:%.*]], <2 x i64> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = icmp ult <2 x i64> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x i64> [[VEC]], <2 x i64> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x i64> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret i64 [[TMP0]]
+;
+entry:
+ %r = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> %vec)
+ ret i64 %r
+}
+
+define double @fmax_f64(<2 x double> %vec) {
+; CHECK-LABEL: @fmax_f64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <2 x double> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret double [[TMP0]]
+;
+entry:
+ %r = call double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %vec)
+ ret double %r
+}
+
+define double @fmin_f64(<2 x double> %vec) {
+; CHECK-LABEL: @fmin_f64(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <2 x double> [[VEC:%.*]], <2 x double> undef, <2 x i32> <i32 1, i32 undef>
+; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <2 x double> [[VEC]], [[RDX_SHUF]]
+; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <2 x i1> [[RDX_MINMAX_CMP]], <2 x double> [[VEC]], <2 x double> [[RDX_SHUF]]
+; CHECK-NEXT: [[TMP0:%.*]] = extractelement <2 x double> [[RDX_MINMAX_SELECT]], i32 0
+; CHECK-NEXT: ret double [[TMP0]]
+;
+entry:
+ %r = call double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double> %vec)
+ ret double %r
+}
diff --git a/test/CodeGen/Hexagon/regalloc-bad-undef.mir b/test/CodeGen/Hexagon/regalloc-bad-undef.mir
index d8fbb92b0d50..a541e766f593 100644
--- a/test/CodeGen/Hexagon/regalloc-bad-undef.mir
+++ b/test/CodeGen/Hexagon/regalloc-bad-undef.mir
@@ -161,17 +161,17 @@ body: |
bb.1.for.body:
successors: %bb.3.for.end, %bb.2.if.end82
- ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29
+ ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29
J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def
dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3, implicit-def %r0 ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29 undef %29.isub_lo = COPY killed %r0 %29.isub_hi = S2_asr_i_r %29.isub_lo, 31 - ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 + ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3, implicit-def %r0 ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29 %32.isub_lo = COPY killed %r0 %7 = S2_extractup %32, 22, 9 - ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 + ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, 
implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3, implicit-def %r0 ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29 undef %43.isub_lo = COPY killed %r0 @@ -179,7 +179,7 @@ body: | %16 = S2_extractup %43, 6, 25 %18 = A2_tfrpi -1 %18 = S2_asl_r_p_acc %18, %47, %16.isub_lo - ADJCALLSTACKDOWN 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 + ADJCALLSTACKDOWN 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit %r31, implicit %r30, implicit %r29 J2_call @lrand48, implicit-def dead %d0, implicit-def dead %d1, implicit-def dead %d2, implicit-def dead %d3, implicit-def dead %d4, implicit-def dead %d5, implicit-def dead %d6, implicit-def dead %d7, implicit-def dead %r28, implicit-def dead %r31, implicit-def dead %p0, implicit-def dead %p1, implicit-def dead %p2, implicit-def dead %p3, implicit-def dead %m0, implicit-def dead %m1, implicit-def dead %lc0, implicit-def dead %lc1, implicit-def dead %sa0, implicit-def dead %sa1, implicit-def dead %usr, implicit-def %usr_ovf, implicit-def dead %cs0, implicit-def dead %cs1, implicit-def dead %w0, implicit-def dead %w1, implicit-def dead %w2, implicit-def dead %w3, implicit-def dead %w4, implicit-def dead %w5, implicit-def dead %w6, implicit-def dead %w7, implicit-def dead %w8, implicit-def dead %w9, implicit-def dead %w10, implicit-def dead %w11, implicit-def dead %w12, implicit-def dead %w13, implicit-def dead %w14, implicit-def dead %w15, implicit-def dead %q0, implicit-def dead %q1, implicit-def dead %q2, implicit-def dead %q3 ADJCALLSTACKUP 0, 0, implicit-def dead %r29, implicit-def dead %r30, implicit-def dead %r31, implicit %r29 %22 = S2_asl_r_p %18, %8.isub_lo diff --git a/test/CodeGen/Lanai/masking_setccs.ll b/test/CodeGen/Lanai/masking_setccs.ll new file mode 100644 index 000000000000..48136fd42574 --- /dev/null +++ b/test/CodeGen/Lanai/masking_setccs.ll @@ -0,0 +1,48 @@ +; RUN: llc < %s | FileCheck %s + +; Test that unnecessary masking with 0x1 is not inserted. + +target datalayout = "E-m:e-p:32:32-i64:64-a:0:32-n32-S64" +target triple = "lanai" + +; CHECK-LABEL: masking: +; CHECK-NOT: mov 1 +define i32 @masking(i32 inreg %a, i32 inreg %b, i32 inreg %c, i32 inreg %d) { +entry: + %cmp = icmp ne i32 %a, 0 + %cmp1 = icmp ult i32 %a, %b + %or.cond = and i1 %cmp, %cmp1 + br i1 %or.cond, label %return, label %if.end + +if.end: ; preds = %entry + %cmp2 = icmp ne i32 %b, 0 + %cmp4 = icmp ult i32 %b, %c + %or.cond29 = and i1 %cmp2, %cmp4 + br i1 %or.cond29, label %return, label %if.end6 + +if.end6: ; preds = %if.end + %cmp7 = icmp ne i32 %c, 0 + %cmp9 = icmp ult i32 %c, %d + %or.cond30 = and i1 %cmp7, %cmp9 + br i1 %or.cond30, label %return, label %if.end11 + +if.end11: ; preds = %if.end6 + %cmp12 = icmp ne i32 %d, 0 + %cmp14 = icmp ult i32 %d, %a + %or.cond31 = and i1 %cmp12, %cmp14 + %b. = select i1 %or.cond31, i32 %b, i32 21 + ret i32 %b. 
+ +return: ; preds = %if.end6, %if.end, %entry + %retval.0 = phi i32 [ %c, %entry ], [ %d, %if.end ], [ %a, %if.end6 ] + ret i32 %retval.0 +} + +; CHECK-LABEL: notnot: +; CHECK-NOT: mov 1 +define i32 @notnot(i32 %x) { +entry: + %tobool = icmp ne i32 %x, 0 + %lnot.ext = zext i1 %tobool to i32 + ret i32 %lnot.ext +} diff --git a/test/CodeGen/Lanai/peephole-compare.mir b/test/CodeGen/Lanai/peephole-compare.mir index 5056a05ed1f6..51133b5e58e3 100644 --- a/test/CodeGen/Lanai/peephole-compare.mir +++ b/test/CodeGen/Lanai/peephole-compare.mir @@ -644,7 +644,7 @@ body: | bb.1.if.then: successors: %bb.2.while.body - ADJCALLSTACKDOWN 0, implicit-def dead %sp, implicit %sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp CALL @g, csr, implicit-def dead %rca, implicit %sp, implicit-def %sp, implicit-def %rv ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp @@ -663,7 +663,7 @@ body: | bb.4.if.then4: successors: %bb.5.while.body6 - ADJCALLSTACKDOWN 0, implicit-def dead %sp, implicit %sp + ADJCALLSTACKDOWN 0, 0, implicit-def dead %sp, implicit %sp CALL @g, csr, implicit-def dead %rca, implicit %sp, implicit-def %sp, implicit-def %rv ADJCALLSTACKUP 0, 0, implicit-def dead %sp, implicit %sp diff --git a/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir b/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir new file mode 100644 index 000000000000..96801f5b0a37 --- /dev/null +++ b/test/CodeGen/MIR/ARM/PR32721_ifcvt_triangle_unanalyzable.mir @@ -0,0 +1,24 @@ +# RUN: llc -mtriple=arm-apple-ios -run-pass=if-converter %s -o - | FileCheck %s +--- +name: foo +body: | + bb.0: + B %bb.2 + + bb.1: + BX_RET 14, 0 + + bb.2: + Bcc %bb.1, 1, %cpsr + + bb.3: + B %bb.1 + +... + +# We should get a single block containing the BX_RET, with no successors at all + +# CHECK: body: +# CHECK-NEXT: bb.0: +# CHECK-NEXT: BX_RET + diff --git a/test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir b/test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir new file mode 100644 index 000000000000..5a1583f7a9be --- /dev/null +++ b/test/CodeGen/MIR/ARM/ifcvt_canFallThroughTo.mir @@ -0,0 +1,64 @@ +# RUN: llc -mtriple=arm-apple-ios -o - %s -run-pass if-converter | FileCheck %s +--- +name: f1 +body: | + bb.0: + successors: %bb.1 + + B %bb.1 + + bb.1: + successors: %bb.2, %bb.4 + + Bcc %bb.4, 1, %cpsr + + bb.2: + successors: %bb.3, %bb.5 + + Bcc %bb.5, 1, %cpsr + + bb.3: + successors: %bb.5 + + B %bb.5 + + bb.4: + successors: + + bb.5: + successors: %bb.1, %bb.6 + + Bcc %bb.1, 1, %cpsr + + bb.6: + BX_RET 14, _ + +... + +# IfConversion.cpp/canFallThroughTo thought there was a fallthrough from +# bb.4 to bb.5 even though the successor list was empty. +# bb.4 is empty, so it certainly looks like it could fall through, but this is +# what happens for a bb that just contains an "unreachable". 
+ +#CHECK: body: | +#CHECK: bb.0: +#CHECK: successors: %bb.1 + +#CHECK: bb.1: +#CHECK: successors: %bb.3({{.*}}), %bb.2 + +# The original conditional branch from bb.1, jumping to the empty bb +#CHECK: Bcc %bb.2 +#CHECK: B %bb.3 + +# Empty bb.2, which originally contained "unreachable" and thus has no successors +#CHECK: bb.2: +#CHECK-NOT: successors + +#CHECK: bb.3: +#CHECK: successors: %bb.1 + +# Conditional BX_RET and then a loop back to bb.1 +#CHECK: BX_RET 0 +#CHECK: B %bb.1 + diff --git a/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir b/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir index 2d5347e5d30d..14bb5db5a51d 100644 --- a/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir +++ b/test/CodeGen/MIR/X86/frame-info-save-restore-points.mir @@ -60,7 +60,7 @@ body: | liveins: %eax MOV32mr %stack.0.tmp, 1, _, 0, _, killed %eax - ADJCALLSTACKDOWN64 0, 0, implicit-def %rsp, implicit-def dead %eflags, implicit %rsp + ADJCALLSTACKDOWN64 0, 0, 0, implicit-def %rsp, implicit-def dead %eflags, implicit %rsp %rsi = LEA64r %stack.0.tmp, 1, _, 0, _ %edi = MOV32r0 implicit-def dead %eflags CALL64pcrel32 @doSomething, csr_64, implicit %rsp, implicit %edi, implicit %rsi, implicit-def %rsp, implicit-def %eax diff --git a/test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll b/test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll deleted file mode 100644 index dce9d25ca87a..000000000000 --- a/test/CodeGen/MSP430/2009-11-05-8BitLibcalls.ll +++ /dev/null @@ -1,22 +0,0 @@ -; RUN: llc < %s | FileCheck %s - -target datalayout = "e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8" -target triple = "msp430-elf" - -@g_29 = common global i8 0, align 1 ; [#uses=0] - -define signext i8 @foo(i8 signext %_si1, i8 signext %_si2) nounwind readnone { -entry: -; CHECK-LABEL: foo: -; CHECK: call #__mulqi3 - %mul = mul i8 %_si2, %_si1 ; [#uses=1] - ret i8 %mul -} - -define void @uint81(i16* nocapture %p_32) nounwind { -entry: - %call = tail call i16 @bar(i8* bitcast (i8 (i8, i8)* @foo to i8*)) nounwind ; [#uses=0] - ret void -} - -declare i16 @bar(i8*) diff --git a/test/CodeGen/MSP430/hwmult16.ll b/test/CodeGen/MSP430/hwmult16.ll new file mode 100644 index 000000000000..b23f1ad37d81 --- /dev/null +++ b/test/CodeGen/MSP430/hwmult16.ll @@ -0,0 +1,43 @@ +; RUN: llc -O0 -mhwmult=16bit < %s | FileCheck %s + +target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" +target triple = "msp430---elf" + +@g_i32 = global i32 123, align 8 +@g_i64 = global i64 456, align 8 +@g_i16 = global i16 789, align 8 + +define i16 @mpyi() #0 { 
+entry: +; CHECK: mpyi: + +; CHECK: call #__mspabi_mpyi_hw + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = mul i16 %0, %0 + + ret i16 %1 +} + +define i32 @mpyli() #0 { +entry: +; CHECK: mpyli: + +; CHECK: call #__mspabi_mpyl_hw32 + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = mul i32 %0, %0 + + ret i32 %1 +} + +define i64 @mpylli() #0 { +entry: +; CHECK: mpylli: + +; CHECK: call #__mspabi_mpyll_hw32 + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = mul i64 %0, %0 + + ret i64 %1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/MSP430/hwmultf5.ll b/test/CodeGen/MSP430/hwmultf5.ll new file mode 100644 index 000000000000..51ca4be4a654 --- /dev/null +++ b/test/CodeGen/MSP430/hwmultf5.ll @@ -0,0 +1,43 @@ +; RUN: llc -O0 -mhwmult=f5series < %s | FileCheck %s + +target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" +target triple = "msp430---elf" + +@g_i32 = global i32 123, align 8 +@g_i64 = global i64 456, align 8 +@g_i16 = global i16 789, align 8 + +define i16 @mpyi() #0 { +entry: +; CHECK: mpyi: + +; CHECK: call #__mspabi_mpyi_f5hw + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = mul i16 %0, %0 + + ret i16 %1 +} + +define i32 @mpyli() #0 { +entry: +; CHECK: mpyli: + +; CHECK: call #__mspabi_mpyl_f5hw + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = mul i32 %0, %0 + + ret i32 %1 +} + +define i64 @mpylli() #0 { +entry: +; CHECK: mpylli: + +; CHECK: call #__mspabi_mpyll_f5hw + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = mul i64 %0, %0 + + ret i64 %1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/MSP430/jumptable.ll b/test/CodeGen/MSP430/jumptable.ll index 5ccdbb701db1..b4366251698b 100644 --- a/test/CodeGen/MSP430/jumptable.ll +++ b/test/CodeGen/MSP430/jumptable.ll @@ -12,7 +12,7 @@ entry: store i16 %i, i16* %i.addr, align 2 %0 = load i16, i16* %i.addr, align 2 ; CHECK: mov.w #2, r13 -; CHECK: call #__mulhi3hw_noint +; CHECK: call #__mspabi_mpyi ; CHECK: br .LJTI0_0(r12) switch i16 %0, label %sw.default [ i16 0, label %sw.bb diff --git a/test/CodeGen/MSP430/libcalls.ll b/test/CodeGen/MSP430/libcalls.ll new file mode 100644 index 000000000000..950ed6c17e2c --- /dev/null +++ b/test/CodeGen/MSP430/libcalls.ll @@ -0,0 +1,595 @@ +; RUN: llc -O0 < %s | FileCheck %s + +target datalayout = "e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16-a0:16:16" +target triple = "msp430---elf" + +@g_double = global double 123.0, align 8 +@g_float = global float 123.0, align 8 +@g_i32 = global i32 123, align 8 +@g_i64 = global i64 456, align 8 +@g_i16 = global i16 789, align 8 + +define float @d2f() #0 { +entry: +; CHECK: d2f: + +; CHECK: call #__mspabi_cvtdf + %0 = load volatile double, double* @g_double, align 8 + %1 = fptrunc double %0 to float + + ret float %1 +} + +define double @f2d() #0 { +entry: +; CHECK: f2d: + +; CHECK: call #__mspabi_cvtfd + %0 = load volatile float, float* @g_float, align 8 + %1 = fpext float %0 to double + + ret double %1 +} + +define i32 @d2l() #0 { +entry: +; CHECK: d2l: + +; CHECK: call #__mspabi_fixdli + %0 = load volatile double, double* @g_double, align 8 + %1 = fptosi double %0 to i32 + + ret i32 %1 +} + +define i64 @d2ll() #0 { +entry: +; CHECK: d2ll: + +; CHECK: call #__mspabi_fixdlli + %0 = load volatile double, double* @g_double, align 8 + %1 = fptosi double %0 to i64 + + ret i64 %1 +} + +define i32 @d2ul() #0 { +entry: +; CHECK: d2ul: + +; CHECK: call #__mspabi_fixdul + %0 = load volatile double, double* @g_double, align 8 + %1 = fptoui double %0 to i32 + + ret i32 %1 +} + +define i64 @d2ull() #0 { 
+entry: +; CHECK: d2ull: + +; CHECK: call #__mspabi_fixdull + %0 = load volatile double, double* @g_double, align 8 + %1 = fptoui double %0 to i64 + + ret i64 %1 +} + +define i32 @f2l() #0 { +entry: +; CHECK: f2l: + +; CHECK: call #__mspabi_fixfli + %0 = load volatile float, float* @g_float, align 8 + %1 = fptosi float %0 to i32 + + ret i32 %1 +} + +define i64 @f2ll() #0 { +entry: +; CHECK: f2ll: + +; CHECK: call #__mspabi_fixflli + %0 = load volatile float, float* @g_float, align 8 + %1 = fptosi float %0 to i64 + + ret i64 %1 +} + +define i32 @f2ul() #0 { +entry: +; CHECK: f2ul: + +; CHECK: call #__mspabi_fixful + %0 = load volatile float, float* @g_float, align 8 + %1 = fptoui float %0 to i32 + + ret i32 %1 +} + +define i64 @f2ull() #0 { +entry: +; CHECK: f2ull: + +; CHECK: call #__mspabi_fixfull + %0 = load volatile float, float* @g_float, align 8 + %1 = fptoui float %0 to i64 + + ret i64 %1 +} + +define double @l2d() #0 { +entry: +; CHECK: l2d: + +; CHECK: call #__mspabi_fltlid + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = sitofp i32 %0 to double + + ret double %1 +} + +define double @ll2d() #0 { +entry: +; CHECK: ll2d: + +; CHECK: call #__mspabi_fltllid + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = sitofp i64 %0 to double + + ret double %1 +} + +define double @ul2d() #0 { +entry: +; CHECK: ul2d: + +; CHECK: call #__mspabi_fltuld + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = uitofp i32 %0 to double + + ret double %1 +} + +define double @ull2d() #0 { +entry: +; CHECK: ull2d: + +; CHECK: call #__mspabi_fltulld + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = uitofp i64 %0 to double + + ret double %1 +} + +define float @l2f() #0 { +entry: +; CHECK: l2f: + +; CHECK: call #__mspabi_fltlif + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = sitofp i32 %0 to float + + ret float %1 +} + +define float @ll2f() #0 { +entry: +; CHECK: ll2f: + +; CHECK: call #__mspabi_fltllif + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = sitofp i64 %0 to float + + ret float %1 +} + +define float @ul2f() #0 { +entry: +; CHECK: ul2f: + +; CHECK: call #__mspabi_fltulf + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = uitofp i32 %0 to float + + ret float %1 +} + +define float @ull2f() #0 { +entry: +; CHECK: ull2f: + +; CHECK: call #__mspabi_fltullf + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = uitofp i64 %0 to float + + ret float %1 +} + +define i1 @cmpd_oeq() #0 { +entry: +; CHECK: cmpd_oeq: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp oeq double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_une() #0 { +entry: +; CHECK: cmpd_une: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp une double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_oge() #0 { +entry: +; CHECK: cmpd_oge: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp oge double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_olt() #0 { +entry: +; CHECK: cmpd_olt: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp olt double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_ole() #0 { +entry: +; CHECK: cmpd_ole: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp ole double %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpd_ogt() #0 { +entry: +; CHECK: cmpd_ogt: + +; CHECK: call #__mspabi_cmpd + %0 = load volatile double, double* @g_double, align 8 + %1 = fcmp ogt double %0, 
123.0 + + ret i1 %1 +} + +define i1 @cmpf_oeq() #0 { +entry: +; CHECK: cmpf_oeq: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp oeq float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_une() #0 { +entry: +; CHECK: cmpf_une: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp une float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_oge() #0 { +entry: +; CHECK: cmpf_oge: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp oge float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_olt() #0 { +entry: +; CHECK: cmpf_olt: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp olt float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_ole() #0 { +entry: +; CHECK: cmpf_ole: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp ole float %0, 123.0 + + ret i1 %1 +} + +define i1 @cmpf_ogt() #0 { +entry: +; CHECK: cmpf_ogt: + +; CHECK: call #__mspabi_cmpf + %0 = load volatile float, float* @g_float, align 8 + %1 = fcmp ogt float %0, 123.0 + + ret i1 %1 +} + +define double @addd() #0 { +entry: +; CHECK: addd: + +; CHECK: call #__mspabi_addd + %0 = load volatile double, double* @g_double, align 8 + %1 = fadd double %0, 123.0 + + ret double %1 +} + +define float @addf() #0 { +entry: +; CHECK: addf: + +; CHECK: call #__mspabi_addf + %0 = load volatile float, float* @g_float, align 8 + %1 = fadd float %0, 123.0 + + ret float %1 +} + +define double @divd() #0 { +entry: +; CHECK: divd: + +; CHECK: call #__mspabi_divd + %0 = load volatile double, double* @g_double, align 8 + %1 = fdiv double %0, 123.0 + + ret double %1 +} + +define float @divf() #0 { +entry: +; CHECK: divf: + +; CHECK: call #__mspabi_divf + %0 = load volatile float, float* @g_float, align 8 + %1 = fdiv float %0, 123.0 + + ret float %1 +} + +define double @mpyd() #0 { +entry: +; CHECK: mpyd: + +; CHECK: call #__mspabi_mpyd + %0 = load volatile double, double* @g_double, align 8 + %1 = fmul double %0, 123.0 + + ret double %1 +} + +define float @mpyf() #0 { +entry: +; CHECK: mpyf: + +; CHECK: call #__mspabi_mpyf + %0 = load volatile float, float* @g_float, align 8 + %1 = fmul float %0, 123.0 + + ret float %1 +} + +define double @subd() #0 { +entry: +; CHECK: subd: + +; CHECK: call #__mspabi_subd + %0 = load volatile double, double* @g_double, align 8 + %1 = fsub double %0, %0 + + ret double %1 +} + +define float @subf() #0 { +entry: +; CHECK: subf: + +; CHECK: call #__mspabi_subf + %0 = load volatile float, float* @g_float, align 8 + %1 = fsub float %0, %0 + + ret float %1 +} + +define i16 @divi() #0 { +entry: +; CHECK: divi: + +; CHECK: call #__mspabi_divi + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = sdiv i16 %0, %0 + + ret i16 %1 +} + +define i32 @divli() #0 { +entry: +; CHECK: divli: + +; CHECK: call #__mspabi_divli + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = sdiv i32 %0, %0 + + ret i32 %1 +} + +define i64 @divlli() #0 { +entry: +; CHECK: divlli: + +; CHECK: call #__mspabi_divlli + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = sdiv i64 %0, %0 + + ret i64 %1 +} + +define i16 @divu() #0 { +entry: +; CHECK: divu: + +; CHECK: call #__mspabi_divu + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = udiv i16 %0, %0 + + ret i16 %1 +} + +define i32 @divul() #0 { +entry: +; CHECK: divul: + +; CHECK: call #__mspabi_divul + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = udiv i32 %0, %0 + + ret i32 %1 +} + 
+define i64 @divull() #0 { +entry: +; CHECK: divull: + +; CHECK: call #__mspabi_divull + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = udiv i64 %0, %0 + + ret i64 %1 +} + +define i16 @remi() #0 { +entry: +; CHECK: remi: + +; CHECK: call #__mspabi_remi + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = srem i16 %0, %0 + + ret i16 %1 +} + +define i32 @remli() #0 { +entry: +; CHECK: remli: + +; CHECK: call #__mspabi_remli + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = srem i32 %0, %0 + + ret i32 %1 +} + +define i64 @remlli() #0 { +entry: +; CHECK: remlli: + +; CHECK: call #__mspabi_remlli + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = srem i64 %0, %0 + + ret i64 %1 +} + +define i16 @remu() #0 { +entry: +; CHECK: remu: + +; CHECK: call #__mspabi_remu + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = urem i16 %0, %0 + + ret i16 %1 +} + +define i32 @remul() #0 { +entry: +; CHECK: remul: + +; CHECK: call #__mspabi_remul + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = urem i32 %0, %0 + + ret i32 %1 +} + +define i64 @remull() #0 { +entry: +; CHECK: remull: + +; CHECK: call #__mspabi_remull + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = urem i64 %0, %0 + + ret i64 %1 +} + +define i16 @mpyi() #0 { +entry: +; CHECK: mpyi: + +; CHECK: call #__mspabi_mpyi + %0 = load volatile i16, i16* @g_i16, align 8 + %1 = mul i16 %0, %0 + + ret i16 %1 +} + +define i32 @mpyli() #0 { +entry: +; CHECK: mpyli: + +; CHECK: call #__mspabi_mpyl + %0 = load volatile i32, i32* @g_i32, align 8 + %1 = mul i32 %0, %0 + + ret i32 %1 +} + +define i64 @mpylli() #0 { +entry: +; CHECK: mpylli: + +; CHECK: call #__mspabi_mpyll + %0 = load volatile i64, i64* @g_i64, align 8 + %1 = mul i64 %0, %0 + + ret i64 %1 +} + +attributes #0 = { nounwind } diff --git a/test/CodeGen/MSP430/promote-i8-mul.ll b/test/CodeGen/MSP430/promote-i8-mul.ll new file mode 100644 index 000000000000..0e05e3978b1e --- /dev/null +++ b/test/CodeGen/MSP430/promote-i8-mul.ll @@ -0,0 +1,22 @@ +; RUN: llc < %s | FileCheck %s + +target datalayout = "e-p:16:8:8-i8:8:8-i16:8:8-i32:8:8" +target triple = "msp430-elf" + +@g_29 = common global i8 0, align 1 ; [#uses=0] + +define signext i8 @foo(i8 signext %_si1, i8 signext %_si2) nounwind readnone { +entry: +; CHECK-LABEL: foo: +; CHECK: call #__mspabi_mpyi + %mul = mul i8 %_si2, %_si1 ; [#uses=1] + ret i8 %mul +} + +define void @uint81(i16* nocapture %p_32) nounwind { +entry: + %call = tail call i16 @bar(i8* bitcast (i8 (i8, i8)* @foo to i8*)) nounwind ; [#uses=0] + ret void +} + +declare i16 @bar(i8*) diff --git a/test/CodeGen/NVPTX/bug17709.ll b/test/CodeGen/NVPTX/bug17709.ll index 076c44684579..6d747f09d8a7 100644 --- a/test/CodeGen/NVPTX/bug17709.ll +++ b/test/CodeGen/NVPTX/bug17709.ll @@ -1,26 +1,26 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - -; ModuleID = '__kernelgen_main_module' -target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" -target triple = "nvptx64-nvidia-cuda" - -define private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) { -entry: - ;unreachable - %t0 = insertvalue {double, double} undef, double 1.0, 0 - %t1 = insertvalue {double, double} %t0, double 1.0, 1 - ret { double, double } %t1 -} - -%struct.descriptor_dimension.0.52 = type { i64, i64, i64 } -%"struct.array2_complex(kind=8).37.18.70" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] } 
-%"struct.array2_complex(kind=8).43.5.57" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] } -@replacementOfAlloca8 = private global %"struct.array2_complex(kind=8).37.18.70" zeroinitializer, align 4096 - -; CHECK: .visible .entry __kernelgen_main -define ptx_kernel void @__kernelgen_main(i32* nocapture %args, i32*) { -entry: - %1 = tail call ptx_device { double, double } bitcast ({ double, double } (%"struct.array2_complex(kind=8).43.5.57"*)* @__utils1_MOD_trace to { double, double } (%"struct.array2_complex(kind=8).37.18.70"*)*)(%"struct.array2_complex(kind=8).37.18.70"* noalias @replacementOfAlloca8) - ret void -} - +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s + +; ModuleID = '__kernelgen_main_module' +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" +target triple = "nvptx64-nvidia-cuda" + +define private ptx_device { double, double } @__utils1_MOD_trace(%"struct.array2_complex(kind=8).43.5.57"* noalias %m) { +entry: + ;unreachable + %t0 = insertvalue {double, double} undef, double 1.0, 0 + %t1 = insertvalue {double, double} %t0, double 1.0, 1 + ret { double, double } %t1 +} + +%struct.descriptor_dimension.0.52 = type { i64, i64, i64 } +%"struct.array2_complex(kind=8).37.18.70" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] } +%"struct.array2_complex(kind=8).43.5.57" = type { i8*, i64, i64, [2 x %struct.descriptor_dimension.0.52] } +@replacementOfAlloca8 = private global %"struct.array2_complex(kind=8).37.18.70" zeroinitializer, align 4096 + +; CHECK: .visible .entry __kernelgen_main +define ptx_kernel void @__kernelgen_main(i32* nocapture %args, i32*) { +entry: + %1 = tail call ptx_device { double, double } bitcast ({ double, double } (%"struct.array2_complex(kind=8).43.5.57"*)* @__utils1_MOD_trace to { double, double } (%"struct.array2_complex(kind=8).37.18.70"*)*)(%"struct.array2_complex(kind=8).37.18.70"* noalias @replacementOfAlloca8) + ret void +} + diff --git a/test/CodeGen/NVPTX/ctlz.ll b/test/CodeGen/NVPTX/ctlz.ll index 005958bd938a..7aa29fe811dd 100644 --- a/test/CodeGen/NVPTX/ctlz.ll +++ b/test/CodeGen/NVPTX/ctlz.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/test/CodeGen/NVPTX/ctpop.ll b/test/CodeGen/NVPTX/ctpop.ll index b961d4d27bdd..69a4f879a8d8 100644 --- a/test/CodeGen/NVPTX/ctpop.ll +++ b/test/CodeGen/NVPTX/ctpop.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git a/test/CodeGen/NVPTX/cttz.ll b/test/CodeGen/NVPTX/cttz.ll index 124ba9d1e9a7..0bfe0139bcdf 100644 --- a/test/CodeGen/NVPTX/cttz.ll +++ b/test/CodeGen/NVPTX/cttz.ll @@ -1,5 +1,4 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" diff --git 
a/test/CodeGen/NVPTX/f16-instructions.ll b/test/CodeGen/NVPTX/f16-instructions.ll index 3d4140820794..08a2ee14e8bd 100644 --- a/test/CodeGen/NVPTX/f16-instructions.ll +++ b/test/CodeGen/NVPTX/f16-instructions.ll @@ -1,1078 +1,1079 @@ -; ## Full FP16 support enabled by default. -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ -; RUN: -O0 -disable-post-ra -disable-fp-elim \ -; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s -; ## FP16 support explicitly disabled. -; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \ -; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \ -; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s -; ## FP16 is not supported by hardware. -; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ -; RUN: -disable-post-ra -disable-fp-elim \ -; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s - -target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" - -; CHECK-LABEL: test_ret_const( -; CHECK: mov.b16 [[R:%h[0-9]+]], 0x3C00; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_ret_const() #0 { - ret half 1.0 -} - -; CHECK-LABEL: test_fadd( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_param_1]; -; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fadd(half %a, half %b) #0 { - %r = fadd half %a, %b - ret half %r -} - -; CHECK-LABEL: test_fadd_v1f16( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_v1f16_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_v1f16_param_1]; -; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 { - %r = fadd <1 x half> %a, %b - ret <1 x half> %r -} - -; Check that we can lower fadd with immediate arguments. 
-; CHECK-LABEL: test_fadd_imm_0( -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_0_param_0]; -; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00; -; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fadd_imm_0(half %b) #0 { - %r = fadd half 1.0, %b - ret half %r -} - -; CHECK-LABEL: test_fadd_imm_1( -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_1_param_0]; -; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00; -; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fadd_imm_1(half %a) #0 { - %r = fadd half %a, 1.0 - ret half %r -} - -; CHECK-LABEL: test_fsub( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fsub_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fsub_param_1]; -; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fsub(half %a, half %b) #0 { - %r = fsub half %a, %b - ret half %r -} - -; CHECK-LABEL: test_fneg( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fneg_param_0]; -; CHECK-F16-NEXT: mov.b16 [[Z:%h[0-9]+]], 0x0000 -; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[Z]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000; -; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[Z]], [[A32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fneg(half %a) #0 { - %r = fsub half 0.0, %a - ret half %r -} - -; CHECK-LABEL: test_fmul( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmul_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmul_param_1]; -; CHECK-F16-NEXT: mul.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]] -; CHECK-NOF16-NEXT: mul.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]]; -; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]] -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fmul(half %a, half %b) #0 { - %r = fmul half %a, %b - ret half %r -} - -; CHECK-LABEL: test_fdiv( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fdiv_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fdiv_param_1]; -; CHECK-DAG: cvt.f32.f16 [[F0:%f[0-9]+]], [[A]]; -; CHECK-DAG: cvt.f32.f16 [[F1:%f[0-9]+]], [[B]]; -; CHECK-NEXT: div.rn.f32 [[FR:%f[0-9]+]], [[F0]], [[F1]]; -; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[FR]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_fdiv(half %a, half %b) #0 { - %r = fdiv half %a, %b - ret half %r -} - -; CHECK-LABEL: test_frem( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], 
[test_frem_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_frem_param_1]; -; CHECK-DAG: cvt.f32.f16 [[FA:%f[0-9]+]], [[A]]; -; CHECK-DAG: cvt.f32.f16 [[FB:%f[0-9]+]], [[B]]; -; CHECK-NEXT: div.rn.f32 [[D:%f[0-9]+]], [[FA]], [[FB]]; -; CHECK-NEXT: cvt.rmi.f32.f32 [[DI:%f[0-9]+]], [[D]]; -; CHECK-NEXT: mul.f32 [[RI:%f[0-9]+]], [[DI]], [[FB]]; -; CHECK-NEXT: sub.f32 [[RF:%f[0-9]+]], [[FA]], [[RI]]; -; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_frem(half %a, half %b) #0 { - %r = frem half %a, %b - ret half %r -} - -; CHECK-LABEL: test_store( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_store_param_0]; -; CHECK-DAG: ld.param.u64 %[[PTR:rd[0-9]+]], [test_store_param_1]; -; CHECK-NEXT: st.b16 [%[[PTR]]], [[A]]; -; CHECK-NEXT: ret; -define void @test_store(half %a, half* %b) #0 { - store half %a, half* %b - ret void -} - -; CHECK-LABEL: test_load( -; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [test_load_param_0]; -; CHECK-NEXT: ld.b16 [[R:%h[0-9]+]], [%[[PTR]]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_load(half* %a) #0 { - %r = load half, half* %a - ret half %r -} - -; CHECK-LABEL: .visible .func test_halfp0a1( -; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0]; -; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1]; -; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]] -; CHECK-DAG: st.u8 [%[[TO]]], [[B0]] -; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1] -; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]] -; CHECK: ret -define void @test_halfp0a1(half * noalias readonly %from, half * %to) { - %1 = load half, half * %from , align 1 - store half %1, half * %to , align 1 - ret void -} - -declare half @test_callee(half %a, half %b) #0 - -; CHECK-LABEL: test_call( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_param_1]; -; CHECK: { -; CHECK-DAG: .param .b32 param0; -; CHECK-DAG: .param .b32 param1; -; CHECK-DAG: st.param.b16 [param0+0], [[A]]; -; CHECK-DAG: st.param.b16 [param1+0], [[B]]; -; CHECK-DAG: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_call(half %a, half %b) #0 { - %r = call half @test_callee(half %a, half %b) - ret half %r -} - -; CHECK-LABEL: test_call_flipped( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_flipped_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .b32 param0; -; CHECK-DAG: .param .b32 param1; -; CHECK-DAG: st.param.b16 [param0+0], [[B]]; -; CHECK-DAG: st.param.b16 [param1+0], [[A]]; -; CHECK-DAG: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_call_flipped(half %a, half %b) #0 { - %r = call half @test_callee(half %b, half %a) - ret half %r -} - -; CHECK-LABEL: test_tailcall_flipped( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_tailcall_flipped_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_tailcall_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .b32 param0; -; CHECK-DAG: .param .b32 param1; -; CHECK-DAG: 
st.param.b16 [param0+0], [[B]]; -; CHECK-DAG: st.param.b16 [param1+0], [[A]]; -; CHECK-DAG: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_tailcall_flipped(half %a, half %b) #0 { - %r = tail call half @test_callee(half %b, half %a) - ret half %r -} - -; CHECK-LABEL: test_select( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1]; -; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; -; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_select(half %a, half %b, i1 zeroext %c) #0 { - %r = select i1 %c, half %a, half %b - ret half %r -} - -; CHECK-LABEL: test_select_cc( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_param_1]; -; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_param_2]; -; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_param_3]; -; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]]; -; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]] -; CHECK: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_select_cc(half %a, half %b, half %c, half %d) #0 { - %cc = fcmp une half %c, %d - %r = select i1 %cc, half %a, half %b - ret half %r -} - -; CHECK-LABEL: test_select_cc_f32_f16( -; CHECK-DAG: ld.param.f32 [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0]; -; CHECK-DAG: ld.param.f32 [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1]; -; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_f32_f16_param_2]; -; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_f32_f16_param_3]; -; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]]; -; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]] -; CHECK-NEXT: selp.f32 [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.f32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 { - %cc = fcmp une half %c, %d - %r = select i1 %cc, float %a, float %b - ret float %r -} - -; CHECK-LABEL: test_select_cc_f16_f32( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_f16_f32_param_0]; -; CHECK-DAG: ld.param.f32 [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2]; -; CHECK-DAG: ld.param.f32 [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3]; -; CHECK-DAG: setp.neu.f32 [[PRED:%p[0-9]+]], [[C]], [[D]] -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_f16_f32_param_1]; -; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 { - %cc = fcmp une float %c, %d - %r = select i1 %cc, half %a, half %b - ret half %r -} - -; CHECK-LABEL: test_fcmp_une( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_une_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_une_param_1]; -; CHECK-F16: 
setp.neu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_une(half %a, half %b) #0 { - %r = fcmp une half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ueq( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ueq_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ueq_param_1]; -; CHECK-F16: setp.equ.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.equ.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ueq(half %a, half %b) #0 { - %r = fcmp ueq half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ugt( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ugt_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ugt_param_1]; -; CHECK-F16: setp.gtu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.gtu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ugt(half %a, half %b) #0 { - %r = fcmp ugt half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_uge( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uge_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uge_param_1]; -; CHECK-F16: setp.geu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.geu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_uge(half %a, half %b) #0 { - %r = fcmp uge half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ult( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ult_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ult_param_1]; -; CHECK-F16: setp.ltu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.ltu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ult(half %a, half %b) #0 { - %r = fcmp ult half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ule( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ule_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ule_param_1]; -; CHECK-F16: setp.leu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.leu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ule(half %a, half %b) #0 { - %r = fcmp ule half %a, %b - ret i1 
%r -} - - -; CHECK-LABEL: test_fcmp_uno( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uno_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uno_param_1]; -; CHECK-F16: setp.nan.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.nan.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_uno(half %a, half %b) #0 { - %r = fcmp uno half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_one( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_one_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_one_param_1]; -; CHECK-F16: setp.ne.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.ne.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_one(half %a, half %b) #0 { - %r = fcmp one half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_oeq( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oeq_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oeq_param_1]; -; CHECK-F16: setp.eq.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.eq.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_oeq(half %a, half %b) #0 { - %r = fcmp oeq half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_ogt( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ogt_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ogt_param_1]; -; CHECK-F16: setp.gt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.gt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_ogt(half %a, half %b) #0 { - %r = fcmp ogt half %a, %b - ret i1 %r -} - -; CHECK-LABEL: test_fcmp_oge( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oge_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oge_param_1]; -; CHECK-F16: setp.ge.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.ge.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i1 @test_fcmp_oge(half %a, half %b) #0 { - %r = fcmp oge half %a, %b - ret i1 %r -} - -; XCHECK-LABEL: test_fcmp_olt( -; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_olt_param_0]; -; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_olt_param_1]; -; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]]; -; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]] -; CHECK-NEXT: selp.u32 
[[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_olt(half %a, half %b) #0 {
- %r = fcmp olt half %a, %b
- ret i1 %r
-}
-
-; XCHECK-LABEL: test_fcmp_ole(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ole_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ole_param_1];
-; CHECK-F16: setp.le.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.le.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_ole(half %a, half %b) #0 {
- %r = fcmp ole half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_fcmp_ord(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ord_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ord_param_1];
-; CHECK-F16: setp.num.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.num.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define i1 @test_fcmp_ord(half %a, half %b) #0 {
- %r = fcmp ord half %a, %b
- ret i1 %r
-}
-
-; CHECK-LABEL: test_br_cc(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_br_cc_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_br_cc_param_1];
-; CHECK-DAG: ld.param.u64 %[[C:rd[0-9]+]], [test_br_cc_param_2];
-; CHECK-DAG: ld.param.u64 %[[D:rd[0-9]+]], [test_br_cc_param_3];
-; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
-; CHECK-NEXT: @[[PRED]] bra [[LABEL:LBB.*]];
-; CHECK: st.u32 [%[[C]]],
-; CHECK: [[LABEL]]:
-; CHECK: st.u32 [%[[D]]],
-; CHECK: ret;
-define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 {
- %c = fcmp uge half %a, %b
- br i1 %c, label %then, label %else
-then:
- store i32 0, i32* %p1
- ret void
-else:
- store i32 0, i32* %p2
- ret void
-}
-
-; CHECK-LABEL: test_phi(
-; CHECK: ld.param.u64 %[[P1:rd[0-9]+]], [test_phi_param_0];
-; CHECK: ld.b16 {{%h[0-9]+}}, [%[[P1]]];
-; CHECK: [[LOOP:LBB[0-9_]+]]:
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[AB:%h[0-9]+]];
-; CHECK: ld.b16 [[AB:%h[0-9]+]], [%[[P1]]];
-; CHECK: {
-; CHECK: st.param.b64 [param0+0], %[[P1]];
-; CHECK: call.uni (retval0),
-; CHECK-NEXT: test_dummy
-; CHECK: }
-; CHECK: setp.eq.b32 [[PRED:%p[0-9]+]], %r{{[0-9]+}}, 1;
-; CHECK: @[[PRED]] bra [[LOOP]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_phi(half* %p1) #0 {
-entry:
- %a = load half, half* %p1
- br label %loop
-loop:
- %r = phi half [%a, %entry], [%b, %loop]
- %b = load half, half* %p1
- %c = call i1 @test_dummy(half* %p1)
- br i1 %c, label %loop, label %return
-return:
- ret half %r
-}
-declare i1 @test_dummy(half* %p1) #0
-
-; CHECK-LABEL: test_fptosi_i32(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i32_param_0];
-; CHECK: cvt.rzi.s32.f16 [[R:%r[0-9]+]], [[A]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i32 @test_fptosi_i32(half %a) #0 {
- %r = fptosi half %a to i32
- ret i32 %r
-}
-
-; CHECK-LABEL: test_fptosi_i64(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i64_param_0];
-; CHECK: cvt.rzi.s64.f16 [[R:%rd[0-9]+]], [[A]];
-; CHECK: st.param.b64 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i64 @test_fptosi_i64(half %a) #0 {
- %r = fptosi half %a to i64
- ret i64 %r
-}
-
-; CHECK-LABEL: test_fptoui_i32(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i32_param_0];
-; CHECK: cvt.rzi.u32.f16 [[R:%r[0-9]+]], [[A]];
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i32 @test_fptoui_i32(half %a) #0 {
- %r = fptoui half %a to i32
- ret i32 %r
-}
-
-; CHECK-LABEL: test_fptoui_i64(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i64_param_0];
-; CHECK: cvt.rzi.u64.f16 [[R:%rd[0-9]+]], [[A]];
-; CHECK: st.param.b64 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i64 @test_fptoui_i64(half %a) #0 {
- %r = fptoui half %a to i64
- ret i64 %r
-}
-
-; CHECK-LABEL: test_uitofp_i32(
-; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0];
-; CHECK: cvt.rn.f16.u32 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_uitofp_i32(i32 %a) #0 {
- %r = uitofp i32 %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_uitofp_i64(
-; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0];
-; CHECK: cvt.rn.f16.u64 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_uitofp_i64(i64 %a) #0 {
- %r = uitofp i64 %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_sitofp_i32(
-; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0];
-; CHECK: cvt.rn.f16.s32 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sitofp_i32(i32 %a) #0 {
- %r = sitofp i32 %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_sitofp_i64(
-; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0];
-; CHECK: cvt.rn.f16.s64 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sitofp_i64(i64 %a) #0 {
- %r = sitofp i64 %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_uitofp_i32_fadd(
-; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0];
-; CHECK-DAG: cvt.rn.f16.u32 [[C:%h[0-9]+]], [[A]];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_uitofp_i32_fadd_param_1];
-; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
- %c = uitofp i32 %a to half
- %r = fadd half %b, %c
- ret half %r
-}
-
-; CHECK-LABEL: test_sitofp_i32_fadd(
-; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0];
-; CHECK-DAG: cvt.rn.f16.s32 [[C:%h[0-9]+]], [[A]];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_sitofp_i32_fadd_param_1];
-; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
-; XCHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; XCHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
-; XCHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
-; XCHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
- %c = sitofp i32 %a to half
- %r = fadd half %b, %c
- ret half %r
-}
-
-; CHECK-LABEL: test_fptrunc_float(
-; CHECK: ld.param.f32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_fptrunc_float(float %a) #0 {
- %r = fptrunc float %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_fptrunc_double(
-; CHECK: ld.param.f64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
-; CHECK: cvt.rn.f16.f64 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_fptrunc_double(double %a) #0 {
- %r = fptrunc double %a to half
- ret half %r
-}
-
-; CHECK-LABEL: test_fpext_float(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_float_param_0];
-; CHECK: cvt.f32.f16 [[R:%f[0-9]+]], [[A]];
-; CHECK: st.param.f32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define float @test_fpext_float(half %a) #0 {
- %r = fpext half %a to float
- ret float %r
-}
-
-; CHECK-LABEL: test_fpext_double(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_double_param_0];
-; CHECK: cvt.f64.f16 [[R:%fd[0-9]+]], [[A]];
-; CHECK: st.param.f64 [func_retval0+0], [[R]];
-; CHECK: ret;
-define double @test_fpext_double(half %a) #0 {
- %r = fpext half %a to double
- ret double %r
-}
-
-
-; CHECK-LABEL: test_bitcast_halftoi16(
-; CHECK: ld.param.b16 [[AH:%h[0-9]+]], [test_bitcast_halftoi16_param_0];
-; CHECK: mov.b16 [[AS:%rs[0-9]+]], [[AH]]
-; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[AS]]
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define i16 @test_bitcast_halftoi16(half %a) #0 {
- %r = bitcast half %a to i16
- ret i16 %r
-}
-
-; CHECK-LABEL: test_bitcast_i16tohalf(
-; CHECK: ld.param.u16 [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0];
-; CHECK: mov.b16 [[AH:%h[0-9]+]], [[AS]]
-; CHECK: st.param.b16 [func_retval0+0], [[AH]];
-; CHECK: ret;
-define half @test_bitcast_i16tohalf(i16 %a) #0 {
- %r = bitcast i16 %a to half
- ret half %r
-}
-
-
-declare half @llvm.sqrt.f16(half %a) #0
-declare half @llvm.powi.f16(half %a, i32 %b) #0
-declare half @llvm.sin.f16(half %a) #0
-declare half @llvm.cos.f16(half %a) #0
-declare half @llvm.pow.f16(half %a, half %b) #0
-declare half @llvm.exp.f16(half %a) #0
-declare half @llvm.exp2.f16(half %a) #0
-declare half @llvm.log.f16(half %a) #0
-declare half @llvm.log10.f16(half %a) #0
-declare half @llvm.log2.f16(half %a) #0
-declare half @llvm.fma.f16(half %a, half %b, half %c) #0
-declare half @llvm.fabs.f16(half %a) #0
-declare half @llvm.minnum.f16(half %a, half %b) #0
-declare half @llvm.maxnum.f16(half %a, half %b) #0
-declare half @llvm.copysign.f16(half %a, half %b) #0
-declare half @llvm.floor.f16(half %a) #0
-declare half @llvm.ceil.f16(half %a) #0
-declare half @llvm.trunc.f16(half %a) #0
-declare half @llvm.rint.f16(half %a) #0
-declare half @llvm.nearbyint.f16(half %a) #0
-declare half @llvm.round.f16(half %a) #0
-declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
-
-; CHECK-LABEL: test_sqrt(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sqrt_param_0];
-; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK: sqrt.rn.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sqrt(half %a) #0 {
- %r = call half @llvm.sqrt.f16(half %a)
- ret half %r
-}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_powi(
-;define half @test_powi(half %a, i32 %b) #0 {
-; %r = call half @llvm.powi.f16(half %a, i32 %b)
-; ret half %r
-;}
-
-; CHECK-LABEL: test_sin(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sin_param_0];
-; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK: sin.approx.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_sin(half %a) #0 #1 {
- %r = call half @llvm.sin.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_cos(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_cos_param_0];
-; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK: cos.approx.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_cos(half %a) #0 #1 {
- %r = call half @llvm.cos.f16(half %a)
- ret half %r
-}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_pow(
-;define half @test_pow(half %a, half %b) #0 {
-; %r = call half @llvm.pow.f16(half %a, half %b)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_exp(
-;define half @test_exp(half %a) #0 {
-; %r = call half @llvm.exp.f16(half %a)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_exp2(
-;define half @test_exp2(half %a) #0 {
-; %r = call half @llvm.exp2.f16(half %a)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log(
-;define half @test_log(half %a) #0 {
-; %r = call half @llvm.log.f16(half %a)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log10(
-;define half @test_log10(half %a) #0 {
-; %r = call half @llvm.log10.f16(half %a)
-; ret half %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log2(
-;define half @test_log2(half %a) #0 {
-; %r = call half @llvm.log2.f16(half %a)
-; ret half %r
-;}
-
-; CHECK-LABEL: test_fma(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fma_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fma_param_1];
-; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fma_param_2];
-; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret
-define half @test_fma(half %a, half %b, half %c) #0 {
- %r = call half @llvm.fma.f16(half %a, half %b, half %c)
- ret half %r
-}
-
-; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fabs_param_0];
-; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK: abs.f32 [[RF:%f[0-9]+]], [[AF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_fabs(half %a) #0 {
- %r = call half @llvm.fabs.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_minnum(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_minnum_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_minnum_param_1];
-; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK: min.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_minnum(half %a, half %b) #0 {
- %r = call half @llvm.minnum.f16(half %a, half %b)
- ret half %r
-}
-
-; CHECK-LABEL: test_maxnum(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_maxnum_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_maxnum_param_1];
-; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
-; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
-; CHECK: max.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
-; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_maxnum(half %a, half %b) #0 {
- %r = call half @llvm.maxnum.f16(half %a, half %b)
- ret half %r
-}
-
-; CHECK-LABEL: test_copysign(
-; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_param_0];
-; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_param_1];
-; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
-; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
-; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
-; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
-; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_copysign(half %a, half %b) #0 {
- %r = call half @llvm.copysign.f16(half %a, half %b)
- ret half %r
-}
-
-; CHECK-LABEL: test_copysign_f32(
-; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f32_param_0];
-; CHECK-DAG: ld.param.f32 [[BF:%f[0-9]+]], [test_copysign_f32_param_1];
-; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
-; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]];
-; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
-; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648;
-; CHECK-DAG: shr.u32 [[BX1:%r[0-9]+]], [[BX0]], 16;
-; CHECK-DAG: cvt.u16.u32 [[BX2:%rs[0-9]+]], [[BX1]];
-; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_copysign_f32(half %a, float %b) #0 {
- %tb = fptrunc float %b to half
- %r = call half @llvm.copysign.f16(half %a, half %tb)
- ret half %r
-}
-
-; CHECK-LABEL: test_copysign_f64(
-; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f64_param_0];
-; CHECK-DAG: ld.param.f64 [[BD:%fd[0-9]+]], [test_copysign_f64_param_1];
-; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
-; CHECK-DAG: mov.b64 [[B:%rd[0-9]+]], [[BD]];
-; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
-; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808;
-; CHECK-DAG: shr.u64 [[BX1:%rd[0-9]+]], [[BX0]], 48;
-; CHECK-DAG: cvt.u16.u64 [[BX2:%rs[0-9]+]], [[BX1]];
-; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_copysign_f64(half %a, double %b) #0 {
- %tb = fptrunc double %b to half
- %r = call half @llvm.copysign.f16(half %a, half %tb)
- ret half %r
-}
-
-; CHECK-LABEL: test_copysign_extended(
-; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_extended_param_0];
-; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_extended_param_1];
-; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
-; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
-; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
-; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
-; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
-; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
-; CHECK: cvt.f32.f16 [[XR:%f[0-9]+]], [[R]];
-; CHECK: st.param.f32 [func_retval0+0], [[XR]];
-; CHECK: ret;
-define float @test_copysign_extended(half %a, half %b) #0 {
- %r = call half @llvm.copysign.f16(half %a, half %b)
- %xr = fpext half %r to float
- ret float %xr
-}
-
-; CHECK-LABEL: test_floor(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_floor_param_0];
-; CHECK: cvt.rmi.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_floor(half %a) #0 {
- %r = call half @llvm.floor.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_ceil(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_ceil_param_0];
-; CHECK: cvt.rpi.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_ceil(half %a) #0 {
- %r = call half @llvm.ceil.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_trunc(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_trunc_param_0];
-; CHECK: cvt.rzi.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_trunc(half %a) #0 {
- %r = call half @llvm.trunc.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_rint(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_rint_param_0];
-; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_rint(half %a) #0 {
- %r = call half @llvm.rint.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_nearbyint(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_nearbyint_param_0];
-; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_nearbyint(half %a) #0 {
- %r = call half @llvm.nearbyint.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_round(
-; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_round_param_0];
-; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_round(half %a) #0 {
- %r = call half @llvm.round.f16(half %a)
- ret half %r
-}
-
-; CHECK-LABEL: test_fmuladd(
-; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmuladd_param_0];
-; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmuladd_param_1];
-; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fmuladd_param_2];
-; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
-; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
-; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
-; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
-; CHECK: st.param.b16 [func_retval0+0], [[R]];
-; CHECK: ret;
-define half @test_fmuladd(half %a, half %b, half %c) #0 {
- %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
- ret half %r
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { "unsafe-fp-math" = "true" }
+; ## Full FP16 support enabled by default.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
+; ## FP16 support explicitly disabled.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
+; RUN: -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+; ## FP16 is not supported by hardware.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN: -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: test_ret_const(
+; CHECK: mov.b16 [[R:%h[0-9]+]], 0x3C00;
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_ret_const() #0 {
+ ret half 1.0
+}
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_param_1];
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fadd(half %a, half %b) #0 {
+ %r = fadd half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fadd_v1f16(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fadd_v1f16_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_v1f16_param_1];
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <1 x half> @test_fadd_v1f16(<1 x half> %a, <1 x half> %b) #0 {
+ %r = fadd <1 x half> %a, %b
+ ret <1 x half> %r
+}
+
+; Check that we can lower fadd with immediate arguments.
+; CHECK-LABEL: test_fadd_imm_0(
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_0_param_0];
+; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00;
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fadd_imm_0(half %b) #0 {
+ %r = fadd half 1.0, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fadd_imm_1(
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fadd_imm_1_param_0];
+; CHECK-F16-DAG: mov.b16 [[A:%h[0-9]+]], 0x3C00;
+; CHECK-F16-NEXT: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], 0f3F800000;
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fadd_imm_1(half %a) #0 {
+ %r = fadd half %a, 1.0
+ ret half %r
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fsub_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fsub_param_1];
+; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fsub(half %a, half %b) #0 {
+ %r = fsub half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fneg(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fneg_param_0];
+; CHECK-F16-NEXT: mov.b16 [[Z:%h[0-9]+]], 0x0000
+; CHECK-F16-NEXT: sub.rn.f16 [[R:%h[0-9]+]], [[Z]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
+; CHECK-NOF16-NEXT: sub.rn.f32 [[R32:%f[0-9]+]], [[Z]], [[A32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fneg(half %a) #0 {
+ %r = fsub half 0.0, %a
+ ret half %r
+}
+
+; CHECK-LABEL: test_fmul(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmul_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmul_param_1];
+; CHECK-F16-NEXT: mul.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-NEXT: mul.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fmul(half %a, half %b) #0 {
+ %r = fmul half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG: cvt.f32.f16 [[F0:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[F1:%f[0-9]+]], [[B]];
+; CHECK-NEXT: div.rn.f32 [[FR:%f[0-9]+]], [[F0]], [[F1]];
+; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[FR]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_fdiv(half %a, half %b) #0 {
+ %r = fdiv half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_frem(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_frem_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_frem_param_1];
+; CHECK-DAG: cvt.f32.f16 [[FA:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[FB:%f[0-9]+]], [[B]];
+; CHECK-NEXT: div.rn.f32 [[D:%f[0-9]+]], [[FA]], [[FB]];
+; CHECK-NEXT: cvt.rmi.f32.f32 [[DI:%f[0-9]+]], [[D]];
+; CHECK-NEXT: mul.f32 [[RI:%f[0-9]+]], [[DI]], [[FB]];
+; CHECK-NEXT: sub.f32 [[RF:%f[0-9]+]], [[FA]], [[RI]];
+; CHECK-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_frem(half %a, half %b) #0 {
+ %r = frem half %a, %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_store(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_store_param_0];
+; CHECK-DAG: ld.param.u64 %[[PTR:rd[0-9]+]], [test_store_param_1];
+; CHECK-NEXT: st.b16 [%[[PTR]]], [[A]];
+; CHECK-NEXT: ret;
+define void @test_store(half %a, half* %b) #0 {
+ store half %a, half* %b
+ ret void
+}
+
+; CHECK-LABEL: test_load(
+; CHECK: ld.param.u64 %[[PTR:rd[0-9]+]], [test_load_param_0];
+; CHECK-NEXT: ld.b16 [[R:%h[0-9]+]], [%[[PTR]]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_load(half* %a) #0 {
+ %r = load half, half* %a
+ ret half %r
+}
+
+; CHECK-LABEL: .visible .func test_halfp0a1(
+; CHECK-DAG: ld.param.u64 %[[FROM:rd?[0-9]+]], [test_halfp0a1_param_0];
+; CHECK-DAG: ld.param.u64 %[[TO:rd?[0-9]+]], [test_halfp0a1_param_1];
+; CHECK-DAG: ld.u8 [[B0:%r[sd]?[0-9]+]], [%[[FROM]]]
+; CHECK-DAG: st.u8 [%[[TO]]], [[B0]]
+; CHECK-DAG: ld.u8 [[B1:%r[sd]?[0-9]+]], [%[[FROM]]+1]
+; CHECK-DAG: st.u8 [%[[TO]]+1], [[B1]]
+; CHECK: ret
+define void @test_halfp0a1(half * noalias readonly %from, half * %to) {
+ %1 = load half, half * %from , align 1
+ store half %1, half * %to , align 1
+ ret void
+}
+
+declare half @test_callee(half %a, half %b) #0
+
+; CHECK-LABEL: test_call(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_param_1];
+; CHECK: {
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 param1;
+; CHECK-DAG: st.param.b16 [param0+0], [[A]];
+; CHECK-DAG: st.param.b16 [param1+0], [[B]];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_call(half %a, half %b) #0 {
+ %r = call half @test_callee(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_call_flipped(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_call_flipped_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_call_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 param1;
+; CHECK-DAG: st.param.b16 [param0+0], [[B]];
+; CHECK-DAG: st.param.b16 [param1+0], [[A]];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_call_flipped(half %a, half %b) #0 {
+ %r = call half @test_callee(half %b, half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_tailcall_flipped(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_tailcall_flipped_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_tailcall_flipped_param_1];
+; CHECK: {
+; CHECK-DAG: .param .b32 param0;
+; CHECK-DAG: .param .b32 param1;
+; CHECK-DAG: st.param.b16 [param0+0], [[B]];
+; CHECK-DAG: st.param.b16 [param1+0], [[A]];
+; CHECK-DAG: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_callee,
+; CHECK: );
+; CHECK-NEXT: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK-NEXT: }
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_tailcall_flipped(half %a, half %b) #0 {
+ %r = tail call half @test_callee(half %b, half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_select(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_param_1];
+; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1;
+; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_select(half %a, half %b, i1 zeroext %c) #0 {
+ %r = select i1 %c, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_select_cc(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_param_2];
+; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_param_3];
+; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
+; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]]
+; CHECK: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_select_cc(half %a, half %b, half %c, half %d) #0 {
+ %cc = fcmp une half %c, %d
+ %r = select i1 %cc, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_select_cc_f32_f16(
+; CHECK-DAG: ld.param.f32 [[A:%f[0-9]+]], [test_select_cc_f32_f16_param_0];
+; CHECK-DAG: ld.param.f32 [[B:%f[0-9]+]], [test_select_cc_f32_f16_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_select_cc_f32_f16_param_2];
+; CHECK-DAG: ld.param.b16 [[D:%h[0-9]+]], [test_select_cc_f32_f16_param_3];
+; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[C]], [[D]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[DF:%f[0-9]+]], [[D]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[CF:%f[0-9]+]], [[C]];
+; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[CF]], [[DF]]
+; CHECK-NEXT: selp.f32 [[R:%f[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define float @test_select_cc_f32_f16(float %a, float %b, half %c, half %d) #0 {
+ %cc = fcmp une half %c, %d
+ %r = select i1 %cc, float %a, float %b
+ ret float %r
+}
+
+; CHECK-LABEL: test_select_cc_f16_f32(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_select_cc_f16_f32_param_0];
+; CHECK-DAG: ld.param.f32 [[C:%f[0-9]+]], [test_select_cc_f16_f32_param_2];
+; CHECK-DAG: ld.param.f32 [[D:%f[0-9]+]], [test_select_cc_f16_f32_param_3];
+; CHECK-DAG: setp.neu.f32 [[PRED:%p[0-9]+]], [[C]], [[D]]
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_select_cc_f16_f32_param_1];
+; CHECK-NEXT: selp.b16 [[R:%h[0-9]+]], [[A]], [[B]], [[PRED]];
+; CHECK-NEXT: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define half @test_select_cc_f16_f32(half %a, half %b, float %c, float %d) #0 {
+ %cc = fcmp une float %c, %d
+ %r = select i1 %cc, half %a, half %b
+ ret half %r
+}
+
+; CHECK-LABEL: test_fcmp_une(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_une_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_une_param_1];
+; CHECK-F16: setp.neu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.neu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_une(half %a, half %b) #0 {
+ %r = fcmp une half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ueq(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ueq_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ueq_param_1];
+; CHECK-F16: setp.equ.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.equ.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ueq(half %a, half %b) #0 {
+ %r = fcmp ueq half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ugt(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ugt_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ugt_param_1];
+; CHECK-F16: setp.gtu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.gtu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ugt(half %a, half %b) #0 {
+ %r = fcmp ugt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_uge(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uge_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uge_param_1];
+; CHECK-F16: setp.geu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.geu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_uge(half %a, half %b) #0 {
+ %r = fcmp uge half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ult(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ult_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ult_param_1];
+; CHECK-F16: setp.ltu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.ltu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ult(half %a, half %b) #0 {
+ %r = fcmp ult half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ule(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ule_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ule_param_1];
+; CHECK-F16: setp.leu.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.leu.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ule(half %a, half %b) #0 {
+ %r = fcmp ule half %a, %b
+ ret i1 %r
+}
+
+
+; CHECK-LABEL: test_fcmp_uno(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_uno_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_uno_param_1];
+; CHECK-F16: setp.nan.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.nan.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_uno(half %a, half %b) #0 {
+ %r = fcmp uno half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_one(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_one_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_one_param_1];
+; CHECK-F16: setp.ne.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.ne.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_one(half %a, half %b) #0 {
+ %r = fcmp one half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oeq(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oeq_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oeq_param_1];
+; CHECK-F16: setp.eq.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.eq.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_oeq(half %a, half %b) #0 {
+ %r = fcmp oeq half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ogt(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ogt_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ogt_param_1];
+; CHECK-F16: setp.gt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.gt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ogt(half %a, half %b) #0 {
+ %r = fcmp ogt half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_oge(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_oge_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_oge_param_1];
+; CHECK-F16: setp.ge.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.ge.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_oge(half %a, half %b) #0 {
+ %r = fcmp oge half %a, %b
+ ret i1 %r
+}
+
+; XCHECK-LABEL: test_fcmp_olt(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_olt_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_olt_param_1];
+; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_olt(half %a, half %b) #0 {
+ %r = fcmp olt half %a, %b
+ ret i1 %r
+}
+
+; XCHECK-LABEL: test_fcmp_ole(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ole_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ole_param_1];
+; CHECK-F16: setp.le.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.le.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ole(half %a, half %b) #0 {
+ %r = fcmp ole half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_fcmp_ord(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fcmp_ord_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fcmp_ord_param_1];
+; CHECK-F16: setp.num.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.num.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: selp.u32 [[R:%r[0-9]+]], 1, 0, [[PRED]];
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i1 @test_fcmp_ord(half %a, half %b) #0 {
+ %r = fcmp ord half %a, %b
+ ret i1 %r
+}
+
+; CHECK-LABEL: test_br_cc(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_br_cc_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_br_cc_param_1];
+; CHECK-DAG: ld.param.u64 %[[C:rd[0-9]+]], [test_br_cc_param_2];
+; CHECK-DAG: ld.param.u64 %[[D:rd[0-9]+]], [test_br_cc_param_3];
+; CHECK-F16: setp.lt.f16 [[PRED:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK-NOF16: setp.lt.f32 [[PRED:%p[0-9]+]], [[AF]], [[BF]]
+; CHECK-NEXT: @[[PRED]] bra [[LABEL:LBB.*]];
+; CHECK: st.u32 [%[[C]]],
+; CHECK: [[LABEL]]:
+; CHECK: st.u32 [%[[D]]],
+; CHECK: ret;
+define void @test_br_cc(half %a, half %b, i32* %p1, i32* %p2) #0 {
+ %c = fcmp uge half %a, %b
+ br i1 %c, label %then, label %else
+then:
+ store i32 0, i32* %p1
+ ret void
+else:
+ store i32 0, i32* %p2
+ ret void
+}
+
+; CHECK-LABEL: test_phi(
+; CHECK: ld.param.u64 %[[P1:rd[0-9]+]], [test_phi_param_0];
+; CHECK: ld.b16 {{%h[0-9]+}}, [%[[P1]]];
+; CHECK: [[LOOP:LBB[0-9_]+]]:
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[AB:%h[0-9]+]];
+; CHECK: ld.b16 [[AB:%h[0-9]+]], [%[[P1]]];
+; CHECK: {
+; CHECK: st.param.b64 [param0+0], %[[P1]];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_dummy
+; CHECK: }
+; CHECK: setp.eq.b32 [[PRED:%p[0-9]+]], %r{{[0-9]+}}, 1;
+; CHECK: @[[PRED]] bra [[LOOP]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_phi(half* %p1) #0 {
+entry:
+ %a = load half, half* %p1
+ br label %loop
+loop:
+ %r = phi half [%a, %entry], [%b, %loop]
+ %b = load half, half* %p1
+ %c = call i1 @test_dummy(half* %p1)
+ br i1 %c, label %loop, label %return
+return:
+ ret half %r
+}
+declare i1 @test_dummy(half* %p1) #0
+
+; CHECK-LABEL: test_fptosi_i32(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i32_param_0];
+; CHECK: cvt.rzi.s32.f16 [[R:%r[0-9]+]], [[A]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i32 @test_fptosi_i32(half %a) #0 {
+ %r = fptosi half %a to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_fptosi_i64(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptosi_i64_param_0];
+; CHECK: cvt.rzi.s64.f16 [[R:%rd[0-9]+]], [[A]];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i64 @test_fptosi_i64(half %a) #0 {
+ %r = fptosi half %a to i64
+ ret i64 %r
+}
+
+; CHECK-LABEL: test_fptoui_i32(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i32_param_0];
+; CHECK: cvt.rzi.u32.f16 [[R:%r[0-9]+]], [[A]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i32 @test_fptoui_i32(half %a) #0 {
+ %r = fptoui half %a to i32
+ ret i32 %r
+}
+
+; CHECK-LABEL: test_fptoui_i64(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fptoui_i64_param_0];
+; CHECK: cvt.rzi.u64.f16 [[R:%rd[0-9]+]], [[A]];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i64 @test_fptoui_i64(half %a) #0 {
+ %r = fptoui half %a to i64
+ ret i64 %r
+}
+
+; CHECK-LABEL: test_uitofp_i32(
+; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_param_0];
+; CHECK: cvt.rn.f16.u32 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_uitofp_i32(i32 %a) #0 {
+ %r = uitofp i32 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_uitofp_i64(
+; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_uitofp_i64_param_0];
+; CHECK: cvt.rn.f16.u64 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_uitofp_i64(i64 %a) #0 {
+ %r = uitofp i64 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i32(
+; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_param_0];
+; CHECK: cvt.rn.f16.s32 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sitofp_i32(i32 %a) #0 {
+ %r = sitofp i32 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i64(
+; CHECK: ld.param.u64 [[A:%rd[0-9]+]], [test_sitofp_i64_param_0];
+; CHECK: cvt.rn.f16.s64 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sitofp_i64(i64 %a) #0 {
+ %r = sitofp i64 %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_uitofp_i32_fadd(
+; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_uitofp_i32_fadd_param_0];
+; CHECK-DAG: cvt.rn.f16.u32 [[C:%h[0-9]+]], [[A]];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_uitofp_i32_fadd_param_1];
+; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_uitofp_i32_fadd(i32 %a, half %b) #0 {
+ %c = uitofp i32 %a to half
+ %r = fadd half %b, %c
+ ret half %r
+}
+
+; CHECK-LABEL: test_sitofp_i32_fadd(
+; CHECK-DAG: ld.param.u32 [[A:%r[0-9]+]], [test_sitofp_i32_fadd_param_0];
+; CHECK-DAG: cvt.rn.f16.s32 [[C:%h[0-9]+]], [[A]];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_sitofp_i32_fadd_param_1];
+; CHECK-F16: add.rn.f16 [[R:%h[0-9]+]], [[B]], [[C]];
+; XCHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; XCHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; XCHECK-NOF16-NEXT: add.rn.f32 [[R32:%f[0-9]+]], [[B32]], [[C32]];
+; XCHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sitofp_i32_fadd(i32 %a, half %b) #0 {
+ %c = sitofp i32 %a to half
+ %r = fadd half %b, %c
+ ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_float(
+; CHECK: ld.param.f32 [[A:%f[0-9]+]], [test_fptrunc_float_param_0];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fptrunc_float(float %a) #0 {
+ %r = fptrunc float %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fptrunc_double(
+; CHECK: ld.param.f64 [[A:%fd[0-9]+]], [test_fptrunc_double_param_0];
+; CHECK: cvt.rn.f16.f64 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fptrunc_double(double %a) #0 {
+ %r = fptrunc double %a to half
+ ret half %r
+}
+
+; CHECK-LABEL: test_fpext_float(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_float_param_0];
+; CHECK: cvt.f32.f16 [[R:%f[0-9]+]], [[A]];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define float @test_fpext_float(half %a) #0 {
+ %r = fpext half %a to float
+ ret float %r
+}
+
+; CHECK-LABEL: test_fpext_double(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fpext_double_param_0];
+; CHECK: cvt.f64.f16 [[R:%fd[0-9]+]], [[A]];
+; CHECK: st.param.f64 [func_retval0+0], [[R]];
+; CHECK: ret;
+define double @test_fpext_double(half %a) #0 {
+ %r = fpext half %a to double
+ ret double %r
+}
+
+
+; CHECK-LABEL: test_bitcast_halftoi16(
+; CHECK: ld.param.b16 [[AH:%h[0-9]+]], [test_bitcast_halftoi16_param_0];
+; CHECK: mov.b16 [[AS:%rs[0-9]+]], [[AH]]
+; CHECK: cvt.u32.u16 [[R:%r[0-9]+]], [[AS]]
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define i16 @test_bitcast_halftoi16(half %a) #0 {
+ %r = bitcast half %a to i16
+ ret i16 %r
+}
+
+; CHECK-LABEL: test_bitcast_i16tohalf(
+; CHECK: ld.param.u16 [[AS:%rs[0-9]+]], [test_bitcast_i16tohalf_param_0];
+; CHECK: mov.b16 [[AH:%h[0-9]+]], [[AS]]
+; CHECK: st.param.b16 [func_retval0+0], [[AH]];
+; CHECK: ret;
+define half @test_bitcast_i16tohalf(i16 %a) #0 {
+ %r = bitcast i16 %a to half
+ ret half %r
+}
+
+
+declare half @llvm.sqrt.f16(half %a) #0
+declare half @llvm.powi.f16(half %a, i32 %b) #0
+declare half @llvm.sin.f16(half %a) #0
+declare half @llvm.cos.f16(half %a) #0
+declare half @llvm.pow.f16(half %a, half %b) #0
+declare half @llvm.exp.f16(half %a) #0
+declare half @llvm.exp2.f16(half %a) #0
+declare half @llvm.log.f16(half %a) #0
+declare half @llvm.log10.f16(half %a) #0
+declare half @llvm.log2.f16(half %a) #0
+declare half @llvm.fma.f16(half %a, half %b, half %c) #0
+declare half @llvm.fabs.f16(half %a) #0
+declare half @llvm.minnum.f16(half %a, half %b) #0
+declare half @llvm.maxnum.f16(half %a, half %b) #0
+declare half @llvm.copysign.f16(half %a, half %b) #0
+declare half @llvm.floor.f16(half %a) #0
+declare half @llvm.ceil.f16(half %a) #0
+declare half @llvm.trunc.f16(half %a) #0
+declare half @llvm.rint.f16(half %a) #0
+declare half @llvm.nearbyint.f16(half %a) #0
+declare half @llvm.round.f16(half %a) #0
+declare half @llvm.fmuladd.f16(half %a, half %b, half %c) #0
+
+; CHECK-LABEL: test_sqrt(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sqrt_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: sqrt.rn.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sqrt(half %a) #0 {
+ %r = call half @llvm.sqrt.f16(half %a)
+ ret half %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_powi(
+;define half @test_powi(half %a, i32 %b) #0 {
+; %r = call half @llvm.powi.f16(half %a, i32 %b)
+; ret half %r
+;}
+
+; CHECK-LABEL: test_sin(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_sin_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: sin.approx.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_sin(half %a) #0 #1 {
+ %r = call half @llvm.sin.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_cos(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_cos_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: cos.approx.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_cos(half %a) #0 #1 {
+ %r = call half @llvm.cos.f16(half %a)
+ ret half %r
+}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_pow(
+;define half @test_pow(half %a, half %b) #0 {
+; %r = call half @llvm.pow.f16(half %a, half %b)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp(
+;define half @test_exp(half %a) #0 {
+; %r = call half @llvm.exp.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_exp2(
+;define half @test_exp2(half %a) #0 {
+; %r = call half @llvm.exp2.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log(
+;define half @test_log(half %a) #0 {
+; %r = call half @llvm.log.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log10(
+;define half @test_log10(half %a) #0 {
+; %r = call half @llvm.log10.f16(half %a)
+; ret half %r
+;}
+
+;;; Can't do this yet: requires libcall.
+; XCHECK-LABEL: test_log2(
+;define half @test_log2(half %a) #0 {
+; %r = call half @llvm.log2.f16(half %a)
+; ret half %r
+;}
+
+; CHECK-LABEL: test_fma(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fma_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fma_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fma_param_2];
+; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret
+define half @test_fma(half %a, half %b, half %c) #0 {
+ %r = call half @llvm.fma.f16(half %a, half %b, half %c)
+ ret half %r
+}
+
+; CHECK-LABEL: test_fabs(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_fabs_param_0];
+; CHECK: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK: abs.f32 [[RF:%f[0-9]+]], [[AF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fabs(half %a) #0 {
+ %r = call half @llvm.fabs.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_minnum(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_minnum_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_minnum_param_1];
+; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK: min.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_minnum(half %a, half %b) #0 {
+ %r = call half @llvm.minnum.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_maxnum(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_maxnum_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_maxnum_param_1];
+; CHECK-DAG: cvt.f32.f16 [[AF:%f[0-9]+]], [[A]];
+; CHECK-DAG: cvt.f32.f16 [[BF:%f[0-9]+]], [[B]];
+; CHECK: max.f32 [[RF:%f[0-9]+]], [[AF]], [[BF]];
+; CHECK: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[RF]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_maxnum(half %a, half %b) #0 {
+ %r = call half @llvm.maxnum.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_param_0];
+; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_param_1];
+; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
+; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_copysign(half %a, half %b) #0 {
+ %r = call half @llvm.copysign.f16(half %a, half %b)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign_f32(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f32_param_0];
+; CHECK-DAG: ld.param.f32 [[BF:%f[0-9]+]], [test_copysign_f32_param_1];
+; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b32 [[B:%r[0-9]+]], [[BF]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
+; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[B]], -2147483648;
+; CHECK-DAG: shr.u32 [[BX1:%r[0-9]+]], [[BX0]], 16;
+; CHECK-DAG: cvt.u16.u32 [[BX2:%rs[0-9]+]], [[BX1]];
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_copysign_f32(half %a, float %b) #0 {
+ %tb = fptrunc float %b to half
+ %r = call half @llvm.copysign.f16(half %a, half %tb)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign_f64(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_f64_param_0];
+; CHECK-DAG: ld.param.f64 [[BD:%fd[0-9]+]], [test_copysign_f64_param_1];
+; CHECK-DAG: mov.b16 [[A:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b64 [[B:%rd[0-9]+]], [[BD]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[A]], 32767;
+; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[B]], -9223372036854775808;
+; CHECK-DAG: shr.u64 [[BX1:%rd[0-9]+]], [[BX0]], 48;
+; CHECK-DAG: cvt.u16.u64 [[BX2:%rs[0-9]+]], [[BX1]];
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX2]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_copysign_f64(half %a, double %b) #0 {
+ %tb = fptrunc double %b to half
+ %r = call half @llvm.copysign.f16(half %a, half %tb)
+ ret half %r
+}
+
+; CHECK-LABEL: test_copysign_extended(
+; CHECK-DAG: ld.param.b16 [[AH:%h[0-9]+]], [test_copysign_extended_param_0];
+; CHECK-DAG: ld.param.b16 [[BH:%h[0-9]+]], [test_copysign_extended_param_1];
+; CHECK-DAG: mov.b16 [[AS:%rs[0-9]+]], [[AH]];
+; CHECK-DAG: mov.b16 [[BS:%rs[0-9]+]], [[BH]];
+; CHECK-DAG: and.b16 [[AX:%rs[0-9]+]], [[AS]], 32767;
+; CHECK-DAG: and.b16 [[BX:%rs[0-9]+]], [[BS]], -32768;
+; CHECK: or.b16 [[RX:%rs[0-9]+]], [[AX]], [[BX]];
+; CHECK: mov.b16 [[R:%h[0-9]+]], [[RX]];
+; CHECK: cvt.f32.f16 [[XR:%f[0-9]+]], [[R]];
+; CHECK: st.param.f32 [func_retval0+0], [[XR]];
+; CHECK: ret;
+define float @test_copysign_extended(half %a, half %b) #0 {
+ %r = call half @llvm.copysign.f16(half %a, half %b)
+ %xr = fpext half %r to float
+ ret float %xr
+}
+
+; CHECK-LABEL: test_floor(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_floor_param_0];
+; CHECK: cvt.rmi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_floor(half %a) #0 {
+ %r = call half @llvm.floor.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_ceil(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_ceil_param_0];
+; CHECK: cvt.rpi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_ceil(half %a) #0 {
+ %r = call half @llvm.ceil.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_trunc(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_trunc_param_0];
+; CHECK: cvt.rzi.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_trunc(half %a) #0 {
+ %r = call half @llvm.trunc.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_rint(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_rint_param_0];
+; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_rint(half %a) #0 {
+ %r = call half @llvm.rint.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_nearbyint(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_nearbyint_param_0];
+; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_nearbyint(half %a) #0 {
+ %r = call half @llvm.nearbyint.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_round(
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_round_param_0];
+; CHECK: cvt.rni.f16.f16 [[R:%h[0-9]+]], [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_round(half %a) #0 {
+ %r = call half @llvm.round.f16(half %a)
+ ret half %r
+}
+
+; CHECK-LABEL: test_fmuladd(
+; CHECK-DAG: ld.param.b16 [[A:%h[0-9]+]], [test_fmuladd_param_0];
+; CHECK-DAG: ld.param.b16 [[B:%h[0-9]+]], [test_fmuladd_param_1];
+; CHECK-DAG: ld.param.b16 [[C:%h[0-9]+]], [test_fmuladd_param_2];
+; CHECK-F16: fma.rn.f16 [[R:%h[0-9]+]], [[A]], [[B]], [[C]];
+; CHECK-NOF16-DAG: cvt.f32.f16 [[A32:%f[0-9]+]], [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[B32:%f[0-9]+]], [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[C32:%f[0-9]+]], [[C]]
+; CHECK-NOF16-NEXT: fma.rn.f32 [[R32:%f[0-9]+]], [[A32]], [[B32]], [[C32]];
+; CHECK-NOF16-NEXT: cvt.rn.f16.f32 [[R:%h[0-9]+]], [[R32]]
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_fmuladd(half %a, half %b, half %c) #0 {
+ %r = call half @llvm.fmuladd.f16(half %a, half %b, half %c)
+ ret half %r
+}
+
+attributes #0 = { nounwind }
+attributes #1 = { "unsafe-fp-math" = "true" }
diff --git a/test/CodeGen/NVPTX/f16x2-instructions.ll b/test/CodeGen/NVPTX/f16x2-instructions.ll
index 33bb616d895c..5dc796ada37f 100644
--- a/test/CodeGen/NVPTX/f16x2-instructions.ll
+++ b/test/CodeGen/NVPTX/f16x2-instructions.ll
@@ -1,1426 +1,1427 @@
-; ## Full FP16 support enabled by default.
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
-; RUN: -O0 -disable-post-ra -disable-fp-elim \
-; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
-; ## FP16 support explicitly disabled.
-; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
-; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
-; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
-; ## FP16 is not supported by hardware.
-; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \ -; RUN: -disable-post-ra -disable-fp-elim \ -; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s - -target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" - -; CHECK-LABEL: test_ret_const( -; CHECK: mov.u32 [[T:%r[0-9+]]], 1073757184; -; CHECK: mov.b32 [[R:%hh[0-9+]]], [[T]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_ret_const() #0 { - ret <2 x half> -} - -; CHECK-LABEL: test_extract_0( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_0_param_0]; -; CHECK: mov.b32 {[[R:%h[0-9]+]], %tmp_hi}, [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_extract_0(<2 x half> %a) #0 { - %e = extractelement <2 x half> %a, i32 0 - ret half %e -} - -; CHECK-LABEL: test_extract_1( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_1_param_0]; -; CHECK: mov.b32 {%tmp_lo, [[R:%h[0-9]+]]}, [[A]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_extract_1(<2 x half> %a) #0 { - %e = extractelement <2 x half> %a, i32 1 - ret half %e -} - -; CHECK-LABEL: test_extract_i( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_i_param_0]; -; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1]; -; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0; -; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[A]]; -; CHECK: selp.b16 [[R:%h[0-9]+]], [[E0]], [[E1]], [[PRED]]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK: ret; -define half @test_extract_i(<2 x half> %a, i64 %idx) #0 { - %e = extractelement <2 x half> %a, i64 %idx - ret half %e -} - -; CHECK-LABEL: test_fadd( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_param_1]; -; -; CHECK-F16-NEXT: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]]; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 { - %r = fadd <2 x half> %a, %b - ret <2 x half> %r -} - -; Check that we can lower fadd with immediate arguments. 
-; Check that we can lower fadd with immediate arguments.
-; CHECK-LABEL: test_fadd_imm_0(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0];
-;
-; CHECK-F16: mov.u32 [[I:%r[0-9+]]], 1073757184;
-; CHECK-F16: mov.b32 [[IHH:%hh[0-9+]]], [[I]];
-; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[IHH]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
-; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
-  %r = fadd <2 x half> <half 1.0, half 2.0>, %a
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fadd_imm_1(
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0];
-;
-; CHECK-F16: mov.u32 [[I:%r[0-9+]]], 1073757184;
-; CHECK-F16: mov.b32 [[IHH:%hh[0-9+]]], [[I]];
-; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[IHH]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
-; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
-  %r = fadd <2 x half> %a, <half 1.0, half 2.0>
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fsub(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fsub_param_0];
-;
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fsub_param_1];
-; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fsub(<2 x half> %a, <2 x half> %b) #0 {
-  %r = fsub <2 x half> %a, %b
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fneg(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fneg_param_0];
-;
-; CHECK-F16: mov.u32 [[I0:%r[0-9+]]], 0;
-; CHECK-F16: mov.b32 [[IHH0:%hh[0-9+]]], [[I0]];
-; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[IHH0]], [[A]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
-; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[Z]], [[FA0]];
-; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[Z]], [[FA1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fneg(<2 x half> %a) #0 {
-  %r = fsub <2 x half> <half 0.0, half 0.0>, %a
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fmul(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmul_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmul_param_1];
-; CHECK-F16-NEXT: mul.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-NOF16-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
-  %r = fmul <2 x half> %a, %b
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fdiv(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fdiv_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fdiv_param_1];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
-; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
-; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]];
-; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
-; CHECK-NEXT: ret;
-define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
-  %r = fdiv <2 x half> %a, %b
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_frem(
-; -- Load two 16x2 inputs and split them into f16 elements
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_frem_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_frem_param_1];
-; -- Split into elements
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; -- promote to f32.
-; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
-; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
-; -- frem(a[0],b[0]).
-; CHECK-DAG: div.rn.f32 [[FD0:%f[0-9]+]], [[FA0]], [[FB0]];
-; CHECK-DAG: cvt.rmi.f32.f32 [[DI0:%f[0-9]+]], [[FD0]];
-; CHECK-DAG: mul.f32 [[RI0:%f[0-9]+]], [[DI0]], [[FB0]];
-; CHECK-DAG: sub.f32 [[RF0:%f[0-9]+]], [[FA0]], [[RI0]];
-; -- frem(a[1],b[1]).
-; CHECK-DAG: div.rn.f32 [[FD1:%f[0-9]+]], [[FA1]], [[FB1]]; -; CHECK-DAG: cvt.rmi.f32.f32 [[DI1:%f[0-9]+]], [[FD1]]; -; CHECK-DAG: mul.f32 [[RI1:%f[0-9]+]], [[DI1]], [[FB1]]; -; CHECK-DAG: sub.f32 [[RF1:%f[0-9]+]], [[FA1]], [[RI1]]; -; -- convert back to f16. -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; -- merge into f16x2 and return it. -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 { - %r = frem <2 x half> %a, %b - ret <2 x half> %r -} - -; CHECK-LABEL: .func test_ldst_v2f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1]; -; CHECK-DAG: ld.b32 [[E:%hh[0-9]+]], [%[[A]]] -; CHECK: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[E]]; -; CHECK-DAG: st.v2.b16 [%[[B]]], {[[E0]], [[E1]]}; -; CHECK: ret; -define void @test_ldst_v2f16(<2 x half>* %a, <2 x half>* %b) { - %t1 = load <2 x half>, <2 x half>* %a - store <2 x half> %t1, <2 x half>* %b, align 16 - ret void -} - -; CHECK-LABEL: .func test_ldst_v3f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1]; -; -- v3 is inconvenient to capture as it's lowered as ld.b64 + fair -; number of bitshifting instructions that may change at llvm's whim. -; So we only verify that we only issue correct number of writes using -; correct offset, but not the values we write. -; CHECK-DAG: ld.u64 -; CHECK-DAG: st.u32 [%[[B]]], -; CHECK-DAG: st.b16 [%[[B]]+4], -; CHECK: ret; -define void @test_ldst_v3f16(<3 x half>* %a, <3 x half>* %b) { - %t1 = load <3 x half>, <3 x half>* %a - store <3 x half> %t1, <3 x half>* %b, align 16 - ret void -} - -; CHECK-LABEL: .func test_ldst_v4f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1]; -; CHECK-DAG: ld.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [%[[A]]]; -; CHECK-DAG: st.v4.b16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: ret; -define void @test_ldst_v4f16(<4 x half>* %a, <4 x half>* %b) { - %t1 = load <4 x half>, <4 x half>* %a - store <4 x half> %t1, <4 x half>* %b, align 16 - ret void -} - -; CHECK-LABEL: .func test_ldst_v8f16( -; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0]; -; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1]; -; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]]; -; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: ret; -define void @test_ldst_v8f16(<8 x half>* %a, <8 x half>* %b) { - %t1 = load <8 x half>, <8 x half>* %a - store <8 x half> %t1, <8 x half>* %b, align 16 - ret void -} - -declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0 - -; CHECK-LABEL: test_call( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0+0], [[A]]; -; CHECK-DAG: st.param.b32 [param1+0], [[B]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; 
-; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) - ret <2 x half> %r -} - -; CHECK-LABEL: test_call_flipped( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_flipped_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0+0], [[B]]; -; CHECK-DAG: st.param.b32 [param1+0], [[A]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_tailcall_flipped( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_tailcall_flipped_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_tailcall_flipped_param_1]; -; CHECK: { -; CHECK-DAG: .param .align 4 .b8 param0[4]; -; CHECK-DAG: .param .align 4 .b8 param1[4]; -; CHECK-DAG: st.param.b32 [param0+0], [[B]]; -; CHECK-DAG: st.param.b32 [param1+0], [[A]]; -; CHECK-DAG: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_callee, -; CHECK: ); -; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; -; CHECK-NEXT: } -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 { - %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_select( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_param_1]; -; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] -; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; -; CHECK-NEXT: selp.b32 [[R:%hh[0-9]+]], [[A]], [[B]], [[PRED]]; -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 { - %r = select i1 %c, <2 x half> %a, <2 x half> %b - ret <2 x half> %r -} - -; CHECK-LABEL: test_select_cc( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_param_2]; -; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_param_3]; -; -; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] -; -; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; -; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] -; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; -; 
CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 { - %cc = fcmp une <2 x half> %c, %d - %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b - ret <2 x half> %r -} - -; CHECK-LABEL: test_select_cc_f32_f16( -; CHECK-DAG: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0]; -; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2]; -; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3]; -; -; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] -; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; -; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; -; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] -; -; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, - <2 x half> %c, <2 x half> %d) #0 { - %cc = fcmp une <2 x half> %c, %d - %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b - ret <2 x float> %r -} - -; CHECK-LABEL: test_select_cc_f16_f32( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_f16_f32_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_f16_f32_param_1]; -; CHECK-DAG: ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2]; -; CHECK-DAG: ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3]; -; CHECK-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[C0]], [[D0]] -; CHECK-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[C1]], [[D1]] -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; -; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, - <2 x float> %c, <2 x float> %d) #0 { - %cc = fcmp une <2 x float> %c, %d - %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b - ret <2 x half> %r -} - -; CHECK-LABEL: test_fcmp_une( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_une_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_une_param_1]; -; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 
[[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp une <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ueq( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ueq_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ueq_param_1]; -; CHECK-F16: setp.equ.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ueq <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ugt( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ugt_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ugt_param_1]; -; CHECK-F16: setp.gtu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ugt <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_uge( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uge_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uge_param_1]; -; CHECK-F16: setp.geu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; 
CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp uge <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ult( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ult_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ult_param_1]; -; CHECK-F16: setp.ltu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ult <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ule( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ule_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ule_param_1]; -; CHECK-F16: setp.leu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ule <2 x half> %a, %b - ret <2 x i1> %r -} - - -; CHECK-LABEL: test_fcmp_uno( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uno_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uno_param_1]; -; CHECK-F16: setp.nan.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp uno <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_one( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_one_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_one_param_1]; -; CHECK-F16: setp.ne.f16x2 
[[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp one <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_oeq( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oeq_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oeq_param_1]; -; CHECK-F16: setp.eq.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp oeq <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ogt( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ogt_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ogt_param_1]; -; CHECK-F16: setp.gt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ogt <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_oge( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oge_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oge_param_1]; -; CHECK-F16: setp.ge.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; 
CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp oge <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_olt( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_olt_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_olt_param_1]; -; CHECK-F16: setp.lt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp olt <2 x half> %a, %b - ret <2 x i1> %r -} - -; XCHECK-LABEL: test_fcmp_ole( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ole_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ole_param_1]; -; CHECK-F16: setp.le.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ole <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fcmp_ord( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ord_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ord_param_1]; -; CHECK-F16: setp.num.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] -; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] -; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; -; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; -; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], 
[[R1]]}; -; CHECK-NEXT: ret; -define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 { - %r = fcmp ord <2 x half> %a, %b - ret <2 x i1> %r -} - -; CHECK-LABEL: test_fptosi_i32( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i32_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 { - %r = fptosi <2 x half> %a to <2 x i32> - ret <2 x i32> %r -} - -; CHECK-LABEL: test_fptosi_i64( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i64_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 { - %r = fptosi <2 x half> %a to <2 x i64> - ret <2 x i64> %r -} - -; CHECK-LABEL: test_fptoui_2xi32( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi32_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 { - %r = fptoui <2 x half> %a to <2 x i32> - ret <2 x i32> %r -} - -; CHECK-LABEL: test_fptoui_2xi64( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi64_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]]; -; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 { - %r = fptoui <2 x half> %a to <2 x i64> - ret <2 x i64> %r -} - -; CHECK-LABEL: test_uitofp_2xi32( -; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0]; -; CHECK-DAG: cvt.rn.f16.u32 [[R0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.u32 [[R1:%h[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 { - %r = uitofp <2 x i32> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_uitofp_2xi64( -; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0]; -; CHECK-DAG: cvt.rn.f32.u64 [[F0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f32.u64 [[F1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 { - %r = uitofp <2 x i64> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_sitofp_2xi32( -; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0]; -; CHECK-DAG: cvt.rn.f16.s32 [[R0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.s32 [[R1:%h[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 { - %r = sitofp <2 x i32> %a to <2 x half> - ret <2 x half> %r 
-} - -; CHECK-LABEL: test_sitofp_2xi64( -; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0]; -; CHECK-DAG: cvt.rn.f32.s64 [[F0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f32.s64 [[F1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 { - %r = sitofp <2 x i64> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_uitofp_2xi32_fadd( -; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_uitofp_2xi32_fadd_param_1]; -; CHECK-DAG: cvt.rn.f16.u32 [[C0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.u32 [[C1:%h[0-9]+]], [[A1]]; - -; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} -; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { - %c = uitofp <2 x i32> %a to <2 x half> - %r = fadd <2 x half> %b, %c - ret <2 x half> %r -} - -; CHECK-LABEL: test_sitofp_2xi32_fadd( -; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_sitofp_2xi32_fadd_param_1]; -; CHECK-DAG: cvt.rn.f16.s32 [[C0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.s32 [[C1:%h[0-9]+]], [[A1]]; -; -; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} -; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] -; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; -; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] -; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] -; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { - %c = sitofp <2 x i32> %a to <2 x half> - %r = fadd <2 x half> %b, %c - ret <2 x half> %r -} - -; CHECK-LABEL: test_fptrunc_2xfloat( -; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define 
<2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { - %r = fptrunc <2 x float> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_fptrunc_2xdouble( -; CHECK: ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0]; -; CHECK-DAG: cvt.rn.f16.f64 [[R0:%h[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.rn.f16.f64 [[R1:%h[0-9]+]], [[A1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 { - %r = fptrunc <2 x double> %a to <2 x half> - ret <2 x half> %r -} - -; CHECK-LABEL: test_fpext_2xfloat( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xfloat_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[R0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[R1:%f[0-9]+]], [[A1]]; -; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK: ret; -define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 { - %r = fpext <2 x half> %a to <2 x float> - ret <2 x float> %r -} - -; CHECK-LABEL: test_fpext_2xdouble( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xdouble_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f64.f16 [[R0:%fd[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f64.f16 [[R1:%fd[0-9]+]], [[A1]]; -; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK: ret; -define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { - %r = fpext <2 x half> %a to <2 x double> - ret <2 x double> %r -} - - -; CHECK-LABEL: test_bitcast_2xhalf_to_2xi16( -; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0]; -; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]] -; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16 -; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]] -; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]} -; CHECK: ret; -define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { - %r = bitcast <2 x half> %a to <2 x i16> - ret <2 x i16> %r -} - -; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf( -; CHECK: ld.param.v2.u16 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [test_bitcast_2xi16_to_2xhalf_param_0]; -; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RS0]]; -; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RS1]]; -; CHECK-DAG: shl.b32 [[R1H:%r[0-9]+]], [[R1]], 16; -; CHECK-DAG: or.b32 [[R1H0L:%r[0-9]+]], [[R0]], [[R1H]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], [[R1H0L]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 { - %r = bitcast <2 x i16> %a to <2 x half> - ret <2 x half> %r -} - - -declare <2 x half> @llvm.sqrt.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) #0 -declare <2 x half> @llvm.sin.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.cos.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) #0 -declare <2 x half> @llvm.exp.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.exp2.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.log.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.log10.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.log2.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 -declare <2 x half> @llvm.fabs.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) #0 -declare <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) #0 -declare <2 x half> @llvm.copysign.f16(<2 x 
half> %a, <2 x half> %b) #0 -declare <2 x half> @llvm.floor.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.ceil.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.trunc.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.rint.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.nearbyint.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.round.f16(<2 x half> %a) #0 -declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 - -; CHECK-LABEL: test_sqrt( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sqrt_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sqrt(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a) - ret <2 x half> %r -} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_powi( -;define <2 x half> @test_powi(<2 x half> %a, <2 x i32> %b) #0 { -; %r = call <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) -; ret <2 x half> %r -;} - -; CHECK-LABEL: test_sin( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sin_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_sin(<2 x half> %a) #0 #1 { - %r = call <2 x half> @llvm.sin.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_cos( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_cos_param_0]; -; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; -; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_cos(<2 x half> %a) #0 #1 { - %r = call <2 x half> @llvm.cos.f16(<2 x half> %a) - ret <2 x half> %r -} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_pow( -;define <2 x half> @test_pow(<2 x half> %a, <2 x half> %b) #0 { -; %r = call <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) -; ret <2 x half> %r -;} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_exp( -;define <2 x half> @test_exp(<2 x half> %a) #0 { -; %r = call <2 x half> @llvm.exp.f16(<2 x half> %a) -; ret <2 x half> %r -;} - -;;; Can't do this yet: requires libcall. -; XCHECK-LABEL: test_exp2( -;define <2 x half> @test_exp2(<2 x half> %a) #0 { -; %r = call <2 x half> @llvm.exp2.f16(<2 x half> %a) -; ret <2 x half> %r -;} - -;;; Can't do this yet: requires libcall. 
-; XCHECK-LABEL: test_log(
-;define <2 x half> @test_log(<2 x half> %a) #0 {
-; %r = call <2 x half> @llvm.log.f16(<2 x half> %a)
-; ret <2 x half> %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log10(
-;define <2 x half> @test_log10(<2 x half> %a) #0 {
-; %r = call <2 x half> @llvm.log10.f16(<2 x half> %a)
-; ret <2 x half> %r
-;}
-
-;;; Can't do this yet: requires libcall.
-; XCHECK-LABEL: test_log2(
-;define <2 x half> @test_log2(<2 x half> %a) #0 {
-; %r = call <2 x half> @llvm.log2.f16(<2 x half> %a)
-; ret <2 x half> %r
-;}
-
-; CHECK-LABEL: test_fma(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fma_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fma_param_1];
-; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fma_param_2];
-;
-; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
-;
-; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
-; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
-; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret
-define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
-  %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_fabs(
-; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fabs_param_0];
-; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]];
-; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_fabs(<2 x half> %a) #0 {
-  %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
-  ret <2 x half> %r
-}
-
-; CHECK-LABEL: test_minnum(
-; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_minnum_param_0];
-; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_minnum_param_1];
-; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
-; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
-; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
-; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
-; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]];
-; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]];
-; CHECK-DAG: min.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
-; CHECK-DAG: min.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
-; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
-; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half>
@test_minnum(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) - ret <2 x half> %r -} - -; CHECK-LABEL: test_maxnum( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_maxnum_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_maxnum_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; -; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]]; -; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]]; -; CHECK-DAG: max.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]]; -; CHECK-DAG: max.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; -; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) - ret <2 x half> %r -} - -; CHECK-LABEL: test_copysign( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; -; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; -; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; -; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768; -; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; -; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; -; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; -; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; -; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) - ret <2 x half> %r -} - -; CHECK-LABEL: test_copysign_f32( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f32_param_0]; -; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; -; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; -; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648; -; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648; -; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16; -; CHECK-DAG: shr.u32 [[BY1:%r[0-9]+]], [[BX1]], 16; -; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]]; -; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]]; -; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; -; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; -; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; -; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; -; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; 
CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { - %tb = fptrunc <2 x float> %b to <2 x half> - %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) - ret <2 x half> %r -} - -; CHECK-LABEL: test_copysign_f64( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f64_param_0]; -; CHECK-DAG: ld.param.v2.f64 {[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; -; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; -; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808; -; CHECK-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808; -; CHECK-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48; -; CHECK-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48; -; CHECK-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]]; -; CHECK-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]]; -; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; -; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; -; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; -; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; -; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { - %tb = fptrunc <2 x double> %b to <2 x half> - %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) - ret <2 x half> %r -} - -; CHECK-LABEL: test_copysign_extended( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_extended_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_extended_param_1]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; -; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; -; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; -; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; -; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; -; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; -; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; -; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768; -; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; -; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; -; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; -; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; -; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: mov.b32 {[[RX0:%h[0-9]+]], [[RX1:%h[0-9]+]]}, [[R]] -; CHECK-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[RX0]]; -; CHECK-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[RX1]]; -; CHECK: st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]}; -; CHECK: ret; -define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { - %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) - %xr = fpext <2 x half> %r to <2 x float> - ret <2 x float> %xr -} - -; CHECK-LABEL: test_floor( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_floor_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rmi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rmi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], 
{[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_floor(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.floor.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_ceil( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_ceil_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rpi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rpi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_ceil(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_trunc( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_trunc_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rzi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rzi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_trunc(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_rint( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_rint_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_rint(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.rint.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_nearbyint( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_nearbyint_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_nearbyint(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_round( -; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_round_param_0]; -; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; -; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; -; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define <2 x half> @test_round(<2 x half> %a) #0 { - %r = call <2 x half> @llvm.round.f16(<2 x half> %a) - ret <2 x half> %r -} - -; CHECK-LABEL: test_fmuladd( -; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmuladd_param_0]; -; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmuladd_param_1]; -; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fmuladd_param_2]; -; -; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]]; -; -; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] -; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] -; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] -; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] -; CHECK-NOF16-DAG: cvt.f32.f16 
[[FB1:%f[0-9]+]], [[B1]]
-; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
-; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
-; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
-; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
-; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
-;
-; CHECK: st.param.b32 [func_retval0+0], [[R]];
-; CHECK: ret;
-define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
- %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
- ret <2 x half> %r
-}
-
-attributes #0 = { nounwind }
-attributes #1 = { "unsafe-fp-math" = "true" }
+; ## Full FP16 support enabled by default.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-F16 %s
+; ## FP16 support explicitly disabled.
+; RUN: llc < %s -mtriple=nvptx64-nvidia-cuda -mcpu=sm_53 -asm-verbose=false \
+; RUN: -O0 -disable-post-ra -disable-fp-elim --nvptx-no-f16-math \
+; RUN: -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+; ## FP16 is not supported by hardware.
+; RUN: llc < %s -O0 -mtriple=nvptx64-nvidia-cuda -mcpu=sm_52 -asm-verbose=false \
+; RUN: -disable-post-ra -disable-fp-elim -verify-machineinstrs \
+; RUN: | FileCheck -check-prefixes CHECK,CHECK-NOF16 %s
+
+target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128"
+
+; CHECK-LABEL: test_ret_const(
+; CHECK: mov.u32 [[T:%r[0-9]+]], 1073757184;
+; -- 1073757184 is 0x40003C00: half 1.0 (0x3C00) in lane 0 and half 2.0
+; (0x4000) in lane 1, packed into a single b32 register.
+; CHECK: mov.b32 [[R:%hh[0-9]+]], [[T]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_ret_const() #0 {
+ ret <2 x half> <half 1.0, half 2.0>
+}
+
+; CHECK-LABEL: test_extract_0(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_0_param_0];
+; CHECK: mov.b32 {[[R:%h[0-9]+]], %tmp_hi}, [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_0(<2 x half> %a) #0 {
+ %e = extractelement <2 x half> %a, i32 0
+ ret half %e
+}
+
+; CHECK-LABEL: test_extract_1(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_1_param_0];
+; CHECK: mov.b32 {%tmp_lo, [[R:%h[0-9]+]]}, [[A]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_1(<2 x half> %a) #0 {
+ %e = extractelement <2 x half> %a, i32 1
+ ret half %e
+}
+
+; CHECK-LABEL: test_extract_i(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_extract_i_param_0];
+; CHECK-DAG: ld.param.u64 [[IDX:%rd[0-9]+]], [test_extract_i_param_1];
+; CHECK-DAG: setp.eq.s64 [[PRED:%p[0-9]+]], [[IDX]], 0;
+; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[A]];
+; CHECK: selp.b16 [[R:%h[0-9]+]], [[E0]], [[E1]], [[PRED]];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK: ret;
+define half @test_extract_i(<2 x half> %a, i64 %idx) #0 {
+ %e = extractelement <2 x half> %a, i64 %idx
+ ret half %e
+}
+
+; CHECK-LABEL: test_fadd(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_param_1];
+;
+; CHECK-F16-NEXT: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], 
[[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fadd <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; Check that we can lower fadd with immediate arguments.
+; CHECK-LABEL: test_fadd_imm_0(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fadd_imm_0_param_0];
+;
+; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16: mov.b32 [[IHH:%hh[0-9]+]], [[I]];
+; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[IHH]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd_imm_0(<2 x half> %a) #0 {
+ %r = fadd <2 x half> <half 1.0, half 2.0>, %a
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fadd_imm_1(
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fadd_imm_1_param_0];
+;
+; CHECK-F16: mov.u32 [[I:%r[0-9]+]], 1073757184;
+; CHECK-F16: mov.b32 [[IHH:%hh[0-9]+]], [[I]];
+; CHECK-F16: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[IHH]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], 0f3F800000;
+; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], 0f40000000;
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fadd_imm_1(<2 x half> %a) #0 {
+ %r = fadd <2 x half> %a, <half 1.0, half 2.0>
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fsub(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fsub_param_0];
+;
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fsub_param_1];
+; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fsub(<2 
x half> %a, <2 x half> %b) #0 {
+ %r = fsub <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fneg(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fneg_param_0];
+;
+; CHECK-F16: mov.u32 [[I0:%r[0-9]+]], 0;
+; CHECK-F16: mov.b32 [[IHH0:%hh[0-9]+]], [[I0]];
+; CHECK-F16-NEXT: sub.rn.f16x2 [[R:%hh[0-9]+]], [[IHH0]], [[A]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: mov.f32 [[Z:%f[0-9]+]], 0f00000000;
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR0:%f[0-9]+]], [[Z]], [[FA0]];
+; CHECK-NOF16-DAG: sub.rn.f32 [[FR1:%f[0-9]+]], [[Z]], [[FA1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fneg(<2 x half> %a) #0 {
+ %r = fsub <2 x half> <half 0.0, half 0.0>, %a
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fmul(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmul_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmul_param_1];
+; CHECK-F16-NEXT: mul.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: mul.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-NOF16-DAG: mul.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+;
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fmul(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fmul <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fdiv(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fdiv_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fdiv_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
+; CHECK-DAG: div.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG: div.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]];
+; CHECK-NEXT: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_fdiv(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fdiv <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_frem(
+; -- Load the two f16x2 inputs.
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_frem_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_frem_param_1];
+; -- Split into elements
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; -- promote to f32.
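+; -- (frem has no direct PTX equivalent; each lane is expanded as
+; a - floor(a/b)*b, i.e. the div.rn/cvt.rmi/mul/sub sequence checked below.)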
+; CHECK-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]];
+; -- frem(a[0],b[0]).
+; CHECK-DAG: div.rn.f32 [[FD0:%f[0-9]+]], [[FA0]], [[FB0]];
+; CHECK-DAG: cvt.rmi.f32.f32 [[DI0:%f[0-9]+]], [[FD0]];
+; CHECK-DAG: mul.f32 [[RI0:%f[0-9]+]], [[DI0]], [[FB0]];
+; CHECK-DAG: sub.f32 [[RF0:%f[0-9]+]], [[FA0]], [[RI0]];
+; -- frem(a[1],b[1]).
+; CHECK-DAG: div.rn.f32 [[FD1:%f[0-9]+]], [[FA1]], [[FB1]];
+; CHECK-DAG: cvt.rmi.f32.f32 [[DI1:%f[0-9]+]], [[FD1]];
+; CHECK-DAG: mul.f32 [[RI1:%f[0-9]+]], [[DI1]], [[FB1]];
+; CHECK-DAG: sub.f32 [[RF1:%f[0-9]+]], [[FA1]], [[RI1]];
+; -- convert back to f16.
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; -- merge into f16x2 and return it.
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define <2 x half> @test_frem(<2 x half> %a, <2 x half> %b) #0 {
+ %r = frem <2 x half> %a, %b
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: .func test_ldst_v2f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v2f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v2f16_param_1];
+; CHECK-DAG: ld.b32 [[E:%hh[0-9]+]], [%[[A]]]
+; CHECK: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[E]];
+; CHECK-DAG: st.v2.b16 [%[[B]]], {[[E0]], [[E1]]};
+; CHECK: ret;
+define void @test_ldst_v2f16(<2 x half>* %a, <2 x half>* %b) {
+ %t1 = load <2 x half>, <2 x half>* %a
+ store <2 x half> %t1, <2 x half>* %b, align 16
+ ret void
+}
+
+; CHECK-LABEL: .func test_ldst_v3f16(
+; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v3f16_param_0];
+; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v3f16_param_1];
+; -- v3 is inconvenient to capture, as it's lowered as ld.b64 plus a fair
+; number of bit-shifting instructions that may change at LLVM's whim.
+; So we only verify that we issue the correct number of stores at the
+; correct offsets, but not the values we store.
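+; -- (A <3 x half> occupies 6 bytes, so the store side is one st.u32 for
+; lanes 0-1 plus one st.b16 at offset +4 for lane 2.)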
+; CHECK-DAG: ld.u64 +; CHECK-DAG: st.u32 [%[[B]]], +; CHECK-DAG: st.b16 [%[[B]]+4], +; CHECK: ret; +define void @test_ldst_v3f16(<3 x half>* %a, <3 x half>* %b) { + %t1 = load <3 x half>, <3 x half>* %a + store <3 x half> %t1, <3 x half>* %b, align 16 + ret void +} + +; CHECK-LABEL: .func test_ldst_v4f16( +; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v4f16_param_0]; +; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v4f16_param_1]; +; CHECK-DAG: ld.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [%[[A]]]; +; CHECK-DAG: st.v4.b16 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: ret; +define void @test_ldst_v4f16(<4 x half>* %a, <4 x half>* %b) { + %t1 = load <4 x half>, <4 x half>* %a + store <4 x half> %t1, <4 x half>* %b, align 16 + ret void +} + +; CHECK-LABEL: .func test_ldst_v8f16( +; CHECK-DAG: ld.param.u64 %[[A:rd[0-9]+]], [test_ldst_v8f16_param_0]; +; CHECK-DAG: ld.param.u64 %[[B:rd[0-9]+]], [test_ldst_v8f16_param_1]; +; CHECK-DAG: ld.v4.b32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [%[[A]]]; +; CHECK-DAG: st.v4.b32 [%[[B]]], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: ret; +define void @test_ldst_v8f16(<8 x half>* %a, <8 x half>* %b) { + %t1 = load <8 x half>, <8 x half>* %a + store <8 x half> %t1, <8 x half>* %b, align 16 + ret void +} + +declare <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) #0 + +; CHECK-LABEL: test_call( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_param_1]; +; CHECK: { +; CHECK-DAG: .param .align 4 .b8 param0[4]; +; CHECK-DAG: .param .align 4 .b8 param1[4]; +; CHECK-DAG: st.param.b32 [param0+0], [[A]]; +; CHECK-DAG: st.param.b32 [param1+0], [[B]]; +; CHECK-DAG: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_call(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @test_callee(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_call_flipped( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_call_flipped_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_call_flipped_param_1]; +; CHECK: { +; CHECK-DAG: .param .align 4 .b8 param0[4]; +; CHECK-DAG: .param .align 4 .b8 param1[4]; +; CHECK-DAG: st.param.b32 [param0+0], [[B]]; +; CHECK-DAG: st.param.b32 [param1+0], [[A]]; +; CHECK-DAG: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; +; CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_call_flipped(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_tailcall_flipped( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_tailcall_flipped_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_tailcall_flipped_param_1]; +; CHECK: { +; CHECK-DAG: .param .align 4 .b8 param0[4]; +; CHECK-DAG: .param .align 4 .b8 param1[4]; +; CHECK-DAG: st.param.b32 [param0+0], [[B]]; +; CHECK-DAG: st.param.b32 [param1+0], [[A]]; +; CHECK-DAG: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_callee, +; CHECK: ); +; CHECK-NEXT: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; +; 
CHECK-NEXT: } +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_tailcall_flipped(<2 x half> %a, <2 x half> %b) #0 { + %r = tail call <2 x half> @test_callee(<2 x half> %b, <2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_select( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_param_1]; +; CHECK-DAG: ld.param.u8 [[C:%rs[0-9]+]], [test_select_param_2] +; CHECK-DAG: setp.eq.b16 [[PRED:%p[0-9]+]], %rs{{.*}}, 1; +; CHECK-NEXT: selp.b32 [[R:%hh[0-9]+]], [[A]], [[B]], [[PRED]]; +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_select(<2 x half> %a, <2 x half> %b, i1 zeroext %c) #0 { + %r = select i1 %c, <2 x half> %a, <2 x half> %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_select_cc( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_param_2]; +; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_param_3]; +; +; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] +; +; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; +; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] +; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_select_cc(<2 x half> %a, <2 x half> %b, <2 x half> %c, <2 x half> %d) #0 { + %cc = fcmp une <2 x half> %c, %d + %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_select_cc_f32_f16( +; CHECK-DAG: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_0]; +; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_select_cc_f32_f16_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_select_cc_f32_f16_param_2]; +; CHECK-DAG: ld.param.b32 [[D:%hh[0-9]+]], [test_select_cc_f32_f16_param_3]; +; +; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[C]], [[D]] +; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-NOF16-DAG: mov.b32 {[[D0:%h[0-9]+]], [[D1:%h[0-9]+]]}, [[D]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF0:%f[0-9]+]], [[D0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF0:%f[0-9]+]], [[C0]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[DF1:%f[0-9]+]], [[D1]]; +; CHECK-NOF16-DAG: cvt.f32.f16 [[CF1:%f[0-9]+]], [[C1]]; +; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[CF0]], [[DF0]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[CF1]], [[DF1]] +; +; CHECK-DAG: selp.f32 [[R0:%f[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.f32 [[R1:%f[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x 
float> @test_select_cc_f32_f16(<2 x float> %a, <2 x float> %b, + <2 x half> %c, <2 x half> %d) #0 { + %cc = fcmp une <2 x half> %c, %d + %r = select <2 x i1> %cc, <2 x float> %a, <2 x float> %b + ret <2 x float> %r +} + +; CHECK-LABEL: test_select_cc_f16_f32( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_select_cc_f16_f32_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_select_cc_f16_f32_param_1]; +; CHECK-DAG: ld.param.v2.f32 {[[C0:%f[0-9]+]], [[C1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_2]; +; CHECK-DAG: ld.param.v2.f32 {[[D0:%f[0-9]+]], [[D1:%f[0-9]+]]}, [test_select_cc_f16_f32_param_3]; +; CHECK-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[C0]], [[D0]] +; CHECK-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[C1]], [[D1]] +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: selp.b16 [[R0:%h[0-9]+]], [[A0]], [[B0]], [[P0]]; +; CHECK-DAG: selp.b16 [[R1:%h[0-9]+]], [[A1]], [[B1]], [[P1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK-NEXT: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define <2 x half> @test_select_cc_f16_f32(<2 x half> %a, <2 x half> %b, + <2 x float> %c, <2 x float> %d) #0 { + %cc = fcmp une <2 x float> %c, %d + %r = select <2 x i1> %cc, <2 x half> %a, <2 x half> %b + ret <2 x half> %r +} + +; CHECK-LABEL: test_fcmp_une( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_une_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_une_param_1]; +; CHECK-F16: setp.neu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.neu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_une(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp une <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ueq( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ueq_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ueq_param_1]; +; CHECK-F16: setp.equ.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.equ.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.equ.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ueq(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ueq <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ugt( +; CHECK-DAG: ld.param.b32 
[[A:%hh[0-9]+]], [test_fcmp_ugt_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ugt_param_1]; +; CHECK-F16: setp.gtu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.gtu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.gtu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ugt(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ugt <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_uge( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uge_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uge_param_1]; +; CHECK-F16: setp.geu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.geu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.geu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_uge(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp uge <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ult( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ult_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ult_param_1]; +; CHECK-F16: setp.ltu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.ltu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.ltu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ult(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ult <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ule( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ule_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ule_param_1]; +; CHECK-F16: setp.leu.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: 
cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.leu.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.leu.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ule(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ule <2 x half> %a, %b + ret <2 x i1> %r +} + + +; CHECK-LABEL: test_fcmp_uno( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_uno_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_uno_param_1]; +; CHECK-F16: setp.nan.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.nan.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.nan.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_uno(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp uno <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_one( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_one_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_one_param_1]; +; CHECK-F16: setp.ne.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.ne.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.ne.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_one(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp one <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_oeq( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oeq_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oeq_param_1]; +; CHECK-F16: setp.eq.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.eq.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.eq.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; 
CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_oeq(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp oeq <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ogt(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ogt_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ogt_param_1];
+; CHECK-F16: setp.gt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.gt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.gt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_ogt(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp ogt <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_oge(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_oge_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_oge_param_1];
+; CHECK-F16: setp.ge.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.ge.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.ge.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_oge(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp oge <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_olt(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_olt_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_olt_param_1];
+; CHECK-F16: setp.lt.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: setp.lt.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]]
+; CHECK-NOF16-DAG: setp.lt.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]]
+; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]];
+; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]];
+; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-NEXT: ret;
+define <2 x i1> @test_fcmp_olt(<2 x half> %a, <2 x half> %b) #0 {
+ %r = fcmp olt <2 x half> %a, %b
+ ret <2 x i1> %r
+}
+
+; CHECK-LABEL: test_fcmp_ole(
+; 
CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ole_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ole_param_1]; +; CHECK-F16: setp.le.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.le.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.le.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ole(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ole <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fcmp_ord( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fcmp_ord_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fcmp_ord_param_1]; +; CHECK-F16: setp.num.f16x2 [[P0:%p[0-9]+]]|[[P1:%p[0-9]+]], [[A]], [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: setp.num.f32 [[P0:%p[0-9]+]], [[FA0]], [[FB0]] +; CHECK-NOF16-DAG: setp.num.f32 [[P1:%p[0-9]+]], [[FA1]], [[FB1]] +; CHECK-DAG: selp.u16 [[R0:%rs[0-9]+]], -1, 0, [[P0]]; +; CHECK-DAG: selp.u16 [[R1:%rs[0-9]+]], -1, 0, [[P1]]; +; CHECK-NEXT: st.param.v2.b8 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK-NEXT: ret; +define <2 x i1> @test_fcmp_ord(<2 x half> %a, <2 x half> %b) #0 { + %r = fcmp ord <2 x half> %a, %b + ret <2 x i1> %r +} + +; CHECK-LABEL: test_fptosi_i32( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i32_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.s32.f16 [[R0:%r[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.s32.f16 [[R1:%r[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i32> @test_fptosi_i32(<2 x half> %a) #0 { + %r = fptosi <2 x half> %a to <2 x i32> + ret <2 x i32> %r +} + +; CHECK-LABEL: test_fptosi_i64( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptosi_i64_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.s64.f16 [[R0:%rd[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.s64.f16 [[R1:%rd[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i64> @test_fptosi_i64(<2 x half> %a) #0 { + %r = fptosi <2 x half> %a to <2 x i64> + ret <2 x i64> %r +} + +; CHECK-LABEL: test_fptoui_2xi32( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi32_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.u32.f16 [[R0:%r[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.u32.f16 [[R1:%r[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i32> @test_fptoui_2xi32(<2 x half> %a) #0 { + %r = fptoui <2 x half> %a to <2 x i32> + ret <2 x i32> %r +} + +; CHECK-LABEL: test_fptoui_2xi64( +; CHECK: 
ld.param.b32 [[A:%hh[0-9]+]], [test_fptoui_2xi64_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.rzi.u64.f16 [[R0:%rd[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rzi.u64.f16 [[R1:%rd[0-9]+]], [[A1]]; +; CHECK: st.param.v2.b64 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i64> @test_fptoui_2xi64(<2 x half> %a) #0 { + %r = fptoui <2 x half> %a to <2 x i64> + ret <2 x i64> %r +} + +; CHECK-LABEL: test_uitofp_2xi32( +; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_param_0]; +; CHECK-DAG: cvt.rn.f16.u32 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.u32 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_uitofp_2xi32(<2 x i32> %a) #0 { + %r = uitofp <2 x i32> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_uitofp_2xi64( +; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_uitofp_2xi64_param_0]; +; CHECK-DAG: cvt.rn.f32.u64 [[F0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f32.u64 [[F1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_uitofp_2xi64(<2 x i64> %a) #0 { + %r = uitofp <2 x i64> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_sitofp_2xi32( +; CHECK: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_param_0]; +; CHECK-DAG: cvt.rn.f16.s32 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.s32 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sitofp_2xi32(<2 x i32> %a) #0 { + %r = sitofp <2 x i32> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_sitofp_2xi64( +; CHECK: ld.param.v2.u64 {[[A0:%rd[0-9]+]], [[A1:%rd[0-9]+]]}, [test_sitofp_2xi64_param_0]; +; CHECK-DAG: cvt.rn.f32.s64 [[F0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f32.s64 [[F1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[F0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[F1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sitofp_2xi64(<2 x i64> %a) #0 { + %r = sitofp <2 x i64> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_uitofp_2xi32_fadd( +; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_uitofp_2xi32_fadd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_uitofp_2xi32_fadd_param_1]; +; CHECK-DAG: cvt.rn.f16.u32 [[C0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.u32 [[C1:%h[0-9]+]], [[A1]]; + +; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} +; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; 
CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_uitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { + %c = uitofp <2 x i32> %a to <2 x half> + %r = fadd <2 x half> %b, %c + ret <2 x half> %r +} + +; CHECK-LABEL: test_sitofp_2xi32_fadd( +; CHECK-DAG: ld.param.v2.u32 {[[A0:%r[0-9]+]], [[A1:%r[0-9]+]]}, [test_sitofp_2xi32_fadd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_sitofp_2xi32_fadd_param_1]; +; CHECK-DAG: cvt.rn.f16.s32 [[C0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.s32 [[C1:%h[0-9]+]], [[A1]]; +; +; CHECK-F16-DAG: mov.b32 [[C:%hh[0-9]+]], {[[C0]], [[C1]]} +; CHECK-F16-DAG: add.rn.f16x2 [[R:%hh[0-9]+]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] +; CHECK-NOF16-DAG: add.rn.f32 [[FR0:%f[0-9]+]], [[FB0]], [[FC0]]; +; CHECK-NOF16-DAG: add.rn.f32 [[FR1:%f[0-9]+]], [[FB1]], [[FC1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sitofp_2xi32_fadd(<2 x i32> %a, <2 x half> %b) #0 { + %c = sitofp <2 x i32> %a to <2 x half> + %r = fadd <2 x half> %b, %c + ret <2 x half> %r +} + +; CHECK-LABEL: test_fptrunc_2xfloat( +; CHECK: ld.param.v2.f32 {[[A0:%f[0-9]+]], [[A1:%f[0-9]+]]}, [test_fptrunc_2xfloat_param_0]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_fptrunc_2xfloat(<2 x float> %a) #0 { + %r = fptrunc <2 x float> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_fptrunc_2xdouble( +; CHECK: ld.param.v2.f64 {[[A0:%fd[0-9]+]], [[A1:%fd[0-9]+]]}, [test_fptrunc_2xdouble_param_0]; +; CHECK-DAG: cvt.rn.f16.f64 [[R0:%h[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.rn.f16.f64 [[R1:%h[0-9]+]], [[A1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_fptrunc_2xdouble(<2 x double> %a) #0 { + %r = fptrunc <2 x double> %a to <2 x half> + ret <2 x half> %r +} + +; CHECK-LABEL: test_fpext_2xfloat( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xfloat_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[R0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[R1:%f[0-9]+]], [[A1]]; +; CHECK-NEXT: st.param.v2.f32 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK: ret; +define <2 x float> @test_fpext_2xfloat(<2 x half> %a) #0 { + %r = fpext <2 x half> %a to <2 x float> + ret <2 x float> %r +} + +; CHECK-LABEL: test_fpext_2xdouble( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fpext_2xdouble_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f64.f16 [[R0:%fd[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f64.f16 [[R1:%fd[0-9]+]], [[A1]]; +; CHECK-NEXT: st.param.v2.f64 [func_retval0+0], {[[R0]], [[R1]]}; +; CHECK: ret; +define <2 x double> @test_fpext_2xdouble(<2 x half> %a) #0 { + %r = fpext <2 x half> %a to <2 x double> + ret <2 x double> %r +} + + +; 
CHECK-LABEL: test_bitcast_2xhalf_to_2xi16( +; CHECK: ld.param.u32 [[A:%r[0-9]+]], [test_bitcast_2xhalf_to_2xi16_param_0]; +; CHECK-DAG: cvt.u16.u32 [[R0:%rs[0-9]+]], [[A]] +; CHECK-DAG: shr.u32 [[AH:%r[0-9]+]], [[A]], 16 +; CHECK-DAG: cvt.u16.u32 [[R1:%rs[0-9]+]], [[AH]] +; CHECK: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]} +; CHECK: ret; +define <2 x i16> @test_bitcast_2xhalf_to_2xi16(<2 x half> %a) #0 { + %r = bitcast <2 x half> %a to <2 x i16> + ret <2 x i16> %r +} + +; CHECK-LABEL: test_bitcast_2xi16_to_2xhalf( +; CHECK: ld.param.v2.u16 {[[RS0:%rs[0-9]+]], [[RS1:%rs[0-9]+]]}, [test_bitcast_2xi16_to_2xhalf_param_0]; +; CHECK-DAG: cvt.u32.u16 [[R0:%r[0-9]+]], [[RS0]]; +; CHECK-DAG: cvt.u32.u16 [[R1:%r[0-9]+]], [[RS1]]; +; CHECK-DAG: shl.b32 [[R1H:%r[0-9]+]], [[R1]], 16; +; CHECK-DAG: or.b32 [[R1H0L:%r[0-9]+]], [[R0]], [[R1H]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], [[R1H0L]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_bitcast_2xi16_to_2xhalf(<2 x i16> %a) #0 { + %r = bitcast <2 x i16> %a to <2 x half> + ret <2 x half> %r +} + + +declare <2 x half> @llvm.sqrt.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) #0 +declare <2 x half> @llvm.sin.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.cos.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.exp.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.exp2.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.log.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.log10.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.log2.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 +declare <2 x half> @llvm.fabs.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) #0 +declare <2 x half> @llvm.floor.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.ceil.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.trunc.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.rint.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.nearbyint.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.round.f16(<2 x half> %a) #0 +declare <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 + +; CHECK-LABEL: test_sqrt( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sqrt_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: sqrt.rn.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: sqrt.rn.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sqrt(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.sqrt.f16(<2 x half> %a) + ret <2 x half> %r +} + +;;; Can't do this yet: requires libcall. 
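+;;; (Presumably this is because the NVPTX backend has no runtime library to
+;;; lower such calls into. The XCHECK prefix is never passed to FileCheck,
+;;; so the commented-out bodies below stay inert until a lowering exists.)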
+; XCHECK-LABEL: test_powi( +;define <2 x half> @test_powi(<2 x half> %a, <2 x i32> %b) #0 { +; %r = call <2 x half> @llvm.powi.f16(<2 x half> %a, <2 x i32> %b) +; ret <2 x half> %r +;} + +; CHECK-LABEL: test_sin( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_sin_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: sin.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: sin.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_sin(<2 x half> %a) #0 #1 { + %r = call <2 x half> @llvm.sin.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_cos( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_cos_param_0]; +; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cos.approx.f32 [[RF0:%f[0-9]+]], [[AF0]]; +; CHECK-DAG: cos.approx.f32 [[RF1:%f[0-9]+]], [[AF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_cos(<2 x half> %a) #0 #1 { + %r = call <2 x half> @llvm.cos.f16(<2 x half> %a) + ret <2 x half> %r +} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_pow( +;define <2 x half> @test_pow(<2 x half> %a, <2 x half> %b) #0 { +; %r = call <2 x half> @llvm.pow.f16(<2 x half> %a, <2 x half> %b) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp( +;define <2 x half> @test_exp(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.exp.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_exp2( +;define <2 x half> @test_exp2(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.exp2.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log( +;define <2 x half> @test_log(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.log.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. +; XCHECK-LABEL: test_log10( +;define <2 x half> @test_log10(<2 x half> %a) #0 { +; %r = call <2 x half> @llvm.log10.f16(<2 x half> %a) +; ret <2 x half> %r +;} + +;;; Can't do this yet: requires libcall. 
+; XCHECK-LABEL: test_log2(
+;define <2 x half> @test_log2(<2 x half> %a) #0 {
+; %r = call <2 x half> @llvm.log2.f16(<2 x half> %a)
+; ret <2 x half> %r
+;}
+
+; CHECK-LABEL: test_fma(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fma_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fma_param_1];
+; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fma_param_2];
+;
+; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]];
+;
+; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]]
+; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]]
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]];
+; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]];
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]]
+; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]]
+; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fma(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 {
+ %r = call <2 x half> @llvm.fma.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_fabs(
+; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_fabs_param_0];
+; CHECK: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: abs.f32 [[RF0:%f[0-9]+]], [[AF0]];
+; CHECK-DAG: abs.f32 [[RF1:%f[0-9]+]], [[AF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_fabs(<2 x half> %a) #0 {
+ %r = call <2 x half> @llvm.fabs.f16(<2 x half> %a)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_minnum(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_minnum_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_minnum_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]]
+; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]];
+; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]];
+; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]];
+; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]];
+; CHECK-DAG: min.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]];
+; CHECK-DAG: min.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]];
+; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]];
+; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]}
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK: ret;
+define <2 x half> @test_minnum(<2 x half> %a, <2 x half> %b) #0 {
+ %r = call <2 x half> @llvm.minnum.f16(<2 x half> %a, <2 x half> %b)
+ ret <2 x half> %r
+}
+
+; CHECK-LABEL: test_maxnum(
+; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_maxnum_param_0];
+; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_maxnum_param_1];
+; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]
+; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], 
[[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: cvt.f32.f16 [[AF0:%f[0-9]+]], [[A0]]; +; CHECK-DAG: cvt.f32.f16 [[AF1:%f[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.f32.f16 [[BF0:%f[0-9]+]], [[B0]]; +; CHECK-DAG: cvt.f32.f16 [[BF1:%f[0-9]+]], [[B1]]; +; CHECK-DAG: max.f32 [[RF0:%f[0-9]+]], [[AF0]], [[BF0]]; +; CHECK-DAG: max.f32 [[RF1:%f[0-9]+]], [[AF1]], [[BF1]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[RF0]]; +; CHECK-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[RF1]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_maxnum(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @llvm.maxnum.f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; +; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_copysign(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign_f32( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f32_param_0]; +; CHECK-DAG: ld.param.v2.f32 {[[B0:%f[0-9]+]], [[B1:%f[0-9]+]]}, [test_copysign_f32_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b32 [[BI0:%r[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b32 [[BI1:%r[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b32 [[BX0:%r[0-9]+]], [[BI0]], -2147483648; +; CHECK-DAG: and.b32 [[BX1:%r[0-9]+]], [[BI1]], -2147483648; +; CHECK-DAG: shr.u32 [[BY0:%r[0-9]+]], [[BX0]], 16; +; CHECK-DAG: shr.u32 [[BY1:%r[0-9]+]], [[BX1]], 16; +; CHECK-DAG: cvt.u16.u32 [[BZ0:%rs[0-9]+]], [[BY0]]; +; CHECK-DAG: cvt.u16.u32 [[BZ1:%rs[0-9]+]], [[BY1]]; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_copysign_f32(<2 x half> %a, <2 x float> %b) #0 { + %tb = fptrunc <2 x float> %b to <2 x half> + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign_f64( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_f64_param_0]; +; CHECK-DAG: ld.param.v2.f64 
{[[B0:%fd[0-9]+]], [[B1:%fd[0-9]+]]}, [test_copysign_f64_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b64 [[BI0:%rd[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b64 [[BI1:%rd[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AI0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AI1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b64 [[BX0:%rd[0-9]+]], [[BI0]], -9223372036854775808; +; CHECK-DAG: and.b64 [[BX1:%rd[0-9]+]], [[BI1]], -9223372036854775808; +; CHECK-DAG: shr.u64 [[BY0:%rd[0-9]+]], [[BX0]], 48; +; CHECK-DAG: shr.u64 [[BY1:%rd[0-9]+]], [[BX1]], 48; +; CHECK-DAG: cvt.u16.u64 [[BZ0:%rs[0-9]+]], [[BY0]]; +; CHECK-DAG: cvt.u16.u64 [[BZ1:%rs[0-9]+]], [[BY1]]; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AI0]], [[BZ0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AI1]], [[BZ1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_copysign_f64(<2 x half> %a, <2 x double> %b) #0 { + %tb = fptrunc <2 x double> %b to <2 x half> + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %tb) + ret <2 x half> %r +} + +; CHECK-LABEL: test_copysign_extended( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_copysign_extended_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_copysign_extended_param_1]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-DAG: mov.b16 [[AS0:%rs[0-9]+]], [[A0]]; +; CHECK-DAG: mov.b16 [[AS1:%rs[0-9]+]], [[A1]]; +; CHECK-DAG: mov.b16 [[BS0:%rs[0-9]+]], [[B0]]; +; CHECK-DAG: mov.b16 [[BS1:%rs[0-9]+]], [[B1]]; +; CHECK-DAG: and.b16 [[AX0:%rs[0-9]+]], [[AS0]], 32767; +; CHECK-DAG: and.b16 [[AX1:%rs[0-9]+]], [[AS1]], 32767; +; CHECK-DAG: and.b16 [[BX0:%rs[0-9]+]], [[BS0]], -32768; +; CHECK-DAG: and.b16 [[BX1:%rs[0-9]+]], [[BS1]], -32768; +; CHECK-DAG: or.b16 [[RS0:%rs[0-9]+]], [[AX0]], [[BX0]]; +; CHECK-DAG: or.b16 [[RS1:%rs[0-9]+]], [[AX1]], [[BX1]]; +; CHECK-DAG: mov.b16 [[R0:%h[0-9]+]], [[RS0]]; +; CHECK-DAG: mov.b16 [[R1:%h[0-9]+]], [[RS1]]; +; CHECK-DAG: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: mov.b32 {[[RX0:%h[0-9]+]], [[RX1:%h[0-9]+]]}, [[R]] +; CHECK-DAG: cvt.f32.f16 [[XR0:%f[0-9]+]], [[RX0]]; +; CHECK-DAG: cvt.f32.f16 [[XR1:%f[0-9]+]], [[RX1]]; +; CHECK: st.param.v2.f32 [func_retval0+0], {[[XR0]], [[XR1]]}; +; CHECK: ret; +define <2 x float> @test_copysign_extended(<2 x half> %a, <2 x half> %b) #0 { + %r = call <2 x half> @llvm.copysign.f16(<2 x half> %a, <2 x half> %b) + %xr = fpext <2 x half> %r to <2 x float> + ret <2 x float> %xr +} + +; CHECK-LABEL: test_floor( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_floor_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rmi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rmi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_floor(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.floor.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_ceil( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_ceil_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rpi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; 
+; CHECK-DAG: cvt.rpi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_ceil(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.ceil.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_trunc( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_trunc_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rzi.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rzi.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_trunc(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.trunc.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_rint( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_rint_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_rint(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.rint.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_nearbyint( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_nearbyint_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_nearbyint(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.nearbyint.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_round( +; CHECK: ld.param.b32 [[A:%hh[0-9]+]], [test_round_param_0]; +; CHECK-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R1:%h[0-9]+]], [[A1]]; +; CHECK-DAG: cvt.rni.f16.f16 [[R0:%h[0-9]+]], [[A0]]; +; CHECK: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_round(<2 x half> %a) #0 { + %r = call <2 x half> @llvm.round.f16(<2 x half> %a) + ret <2 x half> %r +} + +; CHECK-LABEL: test_fmuladd( +; CHECK-DAG: ld.param.b32 [[A:%hh[0-9]+]], [test_fmuladd_param_0]; +; CHECK-DAG: ld.param.b32 [[B:%hh[0-9]+]], [test_fmuladd_param_1]; +; CHECK-DAG: ld.param.b32 [[C:%hh[0-9]+]], [test_fmuladd_param_2]; +; +; CHECK-F16: fma.rn.f16x2 [[R:%hh[0-9]+]], [[A]], [[B]], [[C]]; +; +; CHECK-NOF16-DAG: mov.b32 {[[A0:%h[0-9]+]], [[A1:%h[0-9]+]]}, [[A]] +; CHECK-NOF16-DAG: mov.b32 {[[B0:%h[0-9]+]], [[B1:%h[0-9]+]]}, [[B]] +; CHECK-NOF16-DAG: mov.b32 {[[C0:%h[0-9]+]], [[C1:%h[0-9]+]]}, [[C]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA0:%f[0-9]+]], [[A0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB0:%f[0-9]+]], [[B0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC0:%f[0-9]+]], [[C0]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FA1:%f[0-9]+]], [[A1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FB1:%f[0-9]+]], [[B1]] +; CHECK-NOF16-DAG: cvt.f32.f16 [[FC1:%f[0-9]+]], [[C1]] +; CHECK-NOF16-DAG: fma.rn.f32 [[FR0:%f[0-9]+]], [[FA0]], [[FB0]], [[FC0]]; +; CHECK-NOF16-DAG: fma.rn.f32 [[FR1:%f[0-9]+]], [[FA1]], [[FB1]], [[FC1]]; +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R0:%h[0-9]+]], [[FR0]] +; CHECK-NOF16-DAG: cvt.rn.f16.f32 [[R1:%h[0-9]+]], [[FR1]] +; CHECK-NOF16: mov.b32 [[R:%hh[0-9]+]], {[[R0]], [[R1]]} +; +; CHECK: 
st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define <2 x half> @test_fmuladd(<2 x half> %a, <2 x half> %b, <2 x half> %c) #0 { + %r = call <2 x half> @llvm.fmuladd.f16(<2 x half> %a, <2 x half> %b, <2 x half> %c) + ret <2 x half> %r +} + +attributes #0 = { nounwind } +attributes #1 = { "unsafe-fp-math" = "true" } diff --git a/test/CodeGen/NVPTX/fma.ll b/test/CodeGen/NVPTX/fma.ll index 6785a01827e2..351f9b20dc0c 100644 --- a/test/CodeGen/NVPTX/fma.ll +++ b/test/CodeGen/NVPTX/fma.ll @@ -1,42 +1,42 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast | FileCheck %s - -declare float @dummy_f32(float, float) #0 -declare double @dummy_f64(double, double) #0 - -define ptx_device float @t1_f32(float %x, float %y, float %z) { -; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}; -; CHECK: ret; - %a = fmul float %x, %y - %b = fadd float %a, %z - ret float %b -} - -define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) { -; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}; -; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}; -; CHECK: ret; - %a = fmul float %x, %y - %b = fadd float %a, %z - %c = fadd float %a, %w - %d = call float @dummy_f32(float %b, float %c) - ret float %d -} - -define ptx_device double @t1_f64(double %x, double %y, double %z) { -; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}; -; CHECK: ret; - %a = fmul double %x, %y - %b = fadd double %a, %z - ret double %b -} - -define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) { -; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}; -; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}; -; CHECK: ret; - %a = fmul double %x, %y - %b = fadd double %a, %z - %c = fadd double %a, %w - %d = call double @dummy_f64(double %b, double %c) - ret double %d -} +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -fp-contract=fast -verify-machineinstrs | FileCheck %s + +declare float @dummy_f32(float, float) #0 +declare double @dummy_f64(double, double) #0 + +define ptx_device float @t1_f32(float %x, float %y, float %z) { +; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}; +; CHECK: ret; + %a = fmul float %x, %y + %b = fadd float %a, %z + ret float %b +} + +define ptx_device float @t2_f32(float %x, float %y, float %z, float %w) { +; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}; +; CHECK: fma.rn.f32 %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}, %f{{[0-9]+}}; +; CHECK: ret; + %a = fmul float %x, %y + %b = fadd float %a, %z + %c = fadd float %a, %w + %d = call float @dummy_f32(float %b, float %c) + ret float %d +} + +define ptx_device double @t1_f64(double %x, double %y, double %z) { +; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}; +; CHECK: ret; + %a = fmul double %x, %y + %b = fadd double %a, %z + ret double %b +} + +define ptx_device double @t2_f64(double %x, double %y, double %z, double %w) { +; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}; +; CHECK: fma.rn.f64 %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}, %fd{{[0-9]+}}; +; CHECK: ret; + %a = fmul double %x, %y + %b = fadd double %a, %z + %c = fadd double %a, %w + %d = call double @dummy_f64(double %b, double %c) + ret double %d +} diff --git a/test/CodeGen/NVPTX/i8-param.ll b/test/CodeGen/NVPTX/i8-param.ll index 6a1e3a0e1a0d..c41da0eebd1f 100644 --- a/test/CodeGen/NVPTX/i8-param.ll +++ 
b/test/CodeGen/NVPTX/i8-param.ll @@ -1,23 +1,23 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - -target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" - -; CHECK: .visible .func (.param .b32 func_retval0) callee -define i8 @callee(i8 %a) { -; CHECK: ld.param.u8 - %ret = add i8 %a, 42 -; CHECK: st.param.b32 - ret i8 %ret -} - -; CHECK: .visible .func caller -define void @caller(i8* %a) { -; CHECK: ld.u8 - %val = load i8, i8* %a - %ret = tail call i8 @callee(i8 %val) -; CHECK: ld.param.b32 - store i8 %ret, i8* %a - ret void -} - - +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s + +target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64" + +; CHECK: .visible .func (.param .b32 func_retval0) callee +define i8 @callee(i8 %a) { +; CHECK: ld.param.u8 + %ret = add i8 %a, 42 +; CHECK: st.param.b32 + ret i8 %ret +} + +; CHECK: .visible .func caller +define void @caller(i8* %a) { +; CHECK: ld.u8 + %val = load i8, i8* %a + %ret = tail call i8 @callee(i8 %val) +; CHECK: ld.param.b32 + store i8 %ret, i8* %a + ret void +} + + diff --git a/test/CodeGen/NVPTX/param-load-store.ll b/test/CodeGen/NVPTX/param-load-store.ll index 8a67567acc96..83991a2930a8 100644 --- a/test/CodeGen/NVPTX/param-load-store.ll +++ b/test/CodeGen/NVPTX/param-load-store.ll @@ -1,939 +1,939 @@ -; Verifies correctness of load/store of parameters and return values. -; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s - -%s_i1 = type { i1 } -%s_i8 = type { i8 } -%s_i16 = type { i16 } -%s_f16 = type { half } -%s_i32 = type { i32 } -%s_f32 = type { float } -%s_i64 = type { i64 } -%s_f64 = type { double } - -; More complicated types. i64 is used to increase natural alignment -; requirement for the type. -%s_i32x4 = type { i32, i32, i32, i32, i64} -%s_i32f32 = type { i32, float, i32, float, i64} -%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64} -%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}> -%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]} -; All scalar parameters must be at least 32 bits in size. -; i1 is loaded/stored as i8. - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i1( -; CHECK-NEXT: .param .b32 test_i1_param_0 -; CHECK: ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0]; -; CHECK: and.b32 [[A:%r[0-9]+]], [[A8]], 1; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[A]] -; CHECK: .param .b32 retval0; -; CHECK: call.uni -; CHECK-NEXT: test_i1, -; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; -; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK: ret; -define i1 @test_i1(i1 %a) { - %r = tail call i1 @test_i1(i1 %a); - ret i1 %r; -} - -; Signed i1 is a somewhat special case. We only care about one bit and -; then us neg.s32 to convert it to 32-bit -1 if it's set. 
-; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i1s( -; CHECK-NEXT: .param .b32 test_i1s_param_0 -; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0]; -; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; -; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1; -; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[A]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni -; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; -; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1; -; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define signext i1 @test_i1s(i1 signext %a) { - %r = tail call signext i1 @test_i1s(i1 signext %a); - ret i1 %r; -} - -; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment. -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v3i1( -; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4] -; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2]; -; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0] -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; -; CHECK-DAG: st.param.b8 [param0+2], [[E2]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i1, -; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; -; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]} -; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; -; CHECK-NEXT: ret; -define <3 x i1> @test_v3i1(<3 x i1> %a) { - %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a); - ret <3 x i1> %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v4i1( -; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4] -; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0] -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK: test_v4i1, -; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}; -; CHECK-NEXT: ret; -define <4 x i1> @test_v4i1(<4 x i1> %a) { - %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a); - ret <4 x i1> %r; -} - -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v5i1( -; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8] -; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4]; -; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0] -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i1, -; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; -; CHECK-NEXT: ret; -define <5 x i1> @test_v5i1(<5 x i1> %a) { - %r = tail 
call <5 x i1> @test_v5i1(<5 x i1> %a); - ret <5 x i1> %r; -} - -; Unsigned i8 is loaded directly into 32-bit register. -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i8( -; CHECK-NEXT: .param .b32 test_i8_param_0 -; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0]; -; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; -; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[A]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK: test_i8, -; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0]; -; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i8 @test_i8(i8 %a) { - %r = tail call i8 @test_i8(i8 %a); - ret i8 %r; -} - -; signed i8 is loaded into 16-bit register which is then sign-extended to i32. -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i8s( -; CHECK-NEXT: .param .b32 test_i8s_param_0 -; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0]; -; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[A]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK: test_i8s, -; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0]; -; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ? -; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]]; -; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define signext i8 @test_i8s(i8 signext %a) { - %r = tail call signext i8 @test_i8s(i8 signext %a); - ret i8 %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v3i8( -; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4] -; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2]; -; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0]; -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b8 [param0+2], [[E2]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i8, -; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; -; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; -; CHECK-NEXT: ret; -define <3 x i8> @test_v3i8(<3 x i8> %a) { - %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a); - ret <3 x i8> %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v4i8( -; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4] -; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0] -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i8, -; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-NEXT: ret; -define <4 x i8> @test_v4i8(<4 x i8> %a) { - %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a); - ret <4 x i8> %r; -} - -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v5i8( -; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8] -; CHECK-DAG: ld.param.u8 
[[E4:%rs[0-9]+]], [test_v5i8_param_0+4]; -; CHECK-DAG ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0] -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK-DAG: st.param.b8 [param0+4], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i8, -; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]]; -; CHECK-NEXT: ret; -define <5 x i8> @test_v5i8(<5 x i8> %a) { - %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a); - ret <5 x i8> %r; -} - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i16( -; CHECK-NEXT: .param .b32 test_i16_param_0 -; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0]; -; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[E32]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i16, -; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0]; -; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i16 @test_i16(i16 %a) { - %r = tail call i16 @test_i16(i16 %a); - ret i16 %r; -} - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i16s( -; CHECK-NEXT: .param .b32 test_i16s_param_0 -; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0]; -; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[E32]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i16s, -; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0]; -; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define signext i16 @test_i16s(i16 signext %a) { - %r = tail call signext i16 @test_i16s(i16 signext %a); - ret i16 %r; -} - -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v3i16( -; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8] -; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4]; -; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b16 [param0+4], [[E2]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i16, -; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]]; -; CHECK-NEXT: ret; -define <3 x i16> @test_v3i16(<3 x i16> %a) { - %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a); - ret <3 x i16> %r; -} - -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v4i16( -; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8] -; CHECK: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0] -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: .param .align 8 .b8 
retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i16, -; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-NEXT: ret; -define <4 x i16> @test_v4i16(<4 x i16> %a) { - %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a); - ret <4 x i16> %r; -} - -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v5i16( -; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16] -; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8]; -; CHECK-DAG ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0] -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i16, -; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]]; -; CHECK-NEXT: ret; -define <5 x i16> @test_v5i16(<5 x i16> %a) { - %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a); - ret <5 x i16> %r; -} - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_f16( -; CHECK-NEXT: .param .b32 test_f16_param_0 -; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_f16_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b16 [param0+0], [[E]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_f16, -; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK: st.param.b16 [func_retval0+0], [[R]] -; CHECK-NEXT: ret; -define half @test_f16(half %a) { - %r = tail call half @test_f16(half %a); - ret half %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_v2f16( -; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4] -; CHECK: ld.param.b32 [[E:%hh[0-9]+]], [test_v2f16_param_0]; -; CHECK: .param .align 4 .b8 param0[4]; -; CHECK: st.param.b32 [param0+0], [[E]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v2f16, -; CHECK: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], [[R]] -; CHECK-NEXT: ret; -define <2 x half> @test_v2f16(<2 x half> %a) { - %r = tail call <2 x half> @test_v2f16(<2 x half> %a); - ret <2 x half> %r; -} - -; CHECK:.func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v3f16( -; CHECK: .param .align 8 .b8 test_v3f16_param_0[8] -; CHECK-DAG: ld.param.b32 [[HH01:%hh[0-9]+]], [test_v3f16_param_0]; -; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]]; -; CHECK-DAG: ld.param.b16 [[E2:%h[0-9]+]], [test_v3f16_param_0+4]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK-DAG: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]}; -; CHECK-DAG: st.param.b16 [param0+4], [[E2]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK: test_v3f16, -; CHECK-DAG: ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b16 [[R2:%h[0-9]+]], [retval0+4]; -; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]}; -; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]]; -; CHECK: ret; -define <3 x half> @test_v3f16(<3 x half> %a) { 
- %r = tail call <3 x half> @test_v3f16(<3 x half> %a); - ret <3 x half> %r; -} - -; CHECK:.func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_v4f16( -; CHECK: .param .align 8 .b8 test_v4f16_param_0[8] -; CHECK: ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0]; -; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]]; -; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]}; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK: test_v4f16, -; CHECK: ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]}; -; CHECK: ret; -define <4 x half> @test_v4f16(<4 x half> %a) { - %r = tail call <4 x half> @test_v4f16(<4 x half> %a); - ret <4 x half> %r; -} - -; CHECK:.func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v5f16( -; CHECK: .param .align 16 .b8 test_v5f16_param_0[16] -; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0]; -; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]]; -; CHECK-DAG: ld.param.b16 [[E4:%h[0-9]+]], [test_v5f16_param_0+8]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b16 [param0+0], -; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK: test_v5f16, -; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b16 [[R4:%h[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]}; -; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]]; -; CHECK: ret; -define <5 x half> @test_v5f16(<5 x half> %a) { - %r = tail call <5 x half> @test_v5f16(<5 x half> %a); - ret <5 x half> %r; -} - -; CHECK:.func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v8f16( -; CHECK: .param .align 16 .b8 test_v8f16_param_0[16] -; CHECK: ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0]; -; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]]; -; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]]; -; CHECK-DAG: mov.b32 [[HH45:%hh[0-9]+]], [[R45]]; -; CHECK-DAG: mov.b32 [[HH67:%hh[0-9]+]], [[R67]]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]}; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK: test_v8f16, -; CHECK: ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]}; -; CHECK: ret; -define <8 x half> @test_v8f16(<8 x half> %a) { - %r = tail call <8 x half> @test_v8f16(<8 x half> %a); - ret <8 x half> %r; -} - -; CHECK:.func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v9f16( -; CHECK: .param .align 32 .b8 test_v9f16_param_0[32] -; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0]; -; CHECK-DAG: ld.param.v4.b16 {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8]; -; CHECK-DAG: ld.param.b16 [[E8:%h[0-9]+]], [test_v9f16_param_0+16]; -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK-DAG: st.param.v4.b16 [param0+0], -; CHECK-DAG: 
st.param.v4.b16 [param0+8], -; CHECK-DAG: st.param.b16 [param0+16], [[E8]]; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK: test_v9f16, -; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8]; -; CHECK-DAG: ld.param.b16 [[R8:%h[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]}; -; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]}; -; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]]; -; CHECK: ret; -define <9 x half> @test_v9f16(<9 x half> %a) { - %r = tail call <9 x half> @test_v9f16(<9 x half> %a); - ret <9 x half> %r; -} - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_i32( -; CHECK-NEXT: .param .b32 test_i32_param_0 -; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.b32 [param0+0], [[E]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i32, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i32 @test_i32(i32 %a) { - %r = tail call i32 @test_i32(i32 %a); - ret i32 %r; -} - -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v3i32( -; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16] -; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8]; -; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b32 [param0+8], [[E2]]; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i32, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; -; CHECK-NEXT: ret; -define <3 x i32> @test_v3i32(<3 x i32> %a) { - %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a); - ret <3 x i32> %r; -} - -; CHECK: .func (.param .align 16 .b8 func_retval0[16]) -; CHECK-LABEL: test_v4i32( -; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16] -; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0] -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: .param .align 16 .b8 retval0[16]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i32, -; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0]; -; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHCK-NEXT: ret; -define <4 x i32> @test_v4i32(<4 x i32> %a) { - %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a); - ret <4 x i32> %r; -} - -; CHECK: .func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v5i32( -; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32] -; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16]; -; CHECK-DAG ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0] -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], 
[[E2]], [[E3]]}; -; CHECK-DAG: st.param.b32 [param0+16], [[E4]]; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v5i32, -; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0]; -; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]} -; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]]; -; CHECK-NEXT: ret; -define <5 x i32> @test_v5i32(<5 x i32> %a) { - %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a); - ret <5 x i32> %r; -} - -; CHECK: .func (.param .b32 func_retval0) -; CHECK-LABEL: test_f32( -; CHECK-NEXT: .param .b32 test_f32_param_0 -; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_f32_param_0]; -; CHECK: .param .b32 param0; -; CHECK: st.param.f32 [param0+0], [[E]]; -; CHECK: .param .b32 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_f32, -; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; -; CHECK: st.param.f32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define float @test_f32(float %a) { - %r = tail call float @test_f32(float %a); - ret float %r; -} - -; CHECK: .func (.param .b64 func_retval0) -; CHECK-LABEL: test_i64( -; CHECK-NEXT: .param .b64 test_i64_param_0 -; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0]; -; CHECK: .param .b64 param0; -; CHECK: st.param.b64 [param0+0], [[E]]; -; CHECK: .param .b64 retval0; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_i64, -; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define i64 @test_i64(i64 %a) { - %r = tail call i64 @test_i64(i64 %a); - ret i64 %r; -} - -; CHECK: .func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v3i64( -; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32] -; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16]; -; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0]; -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b64 [param0+16], [[E2]]; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v3i64, -; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; -; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]]; -; CHECK-NEXT: ret; -define <3 x i64> @test_v3i64(<3 x i64> %a) { - %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a); - ret <3 x i64> %r; -} - -; For i64 vector loads are limited by PTX to 2 elements. 
-; CHECK: .func (.param .align 32 .b8 func_retval0[32]) -; CHECK-LABEL: test_v4i64( -; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32] -; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16]; -; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0]; -; CHECK: .param .align 32 .b8 param0[32]; -; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]}; -; CHECK: .param .align 32 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_v4i64, -; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16]; -; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]}; -; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-NEXT: ret; -define <4 x i64> @test_v4i64(<4 x i64> %a) { - %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a); - ret <4 x i64> %r; -} - -; Aggregates, on the other hand, do not get extended. - -; CHECK: .func (.param .align 1 .b8 func_retval0[1]) -; CHECK-LABEL: test_s_i1( -; CHECK-NEXT: .align 1 .b8 test_s_i1_param_0[1] -; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0]; -; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0+0], [[A]] -; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i1, -; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; -; CHECK: st.param.b8 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i1 @test_s_i1(%s_i1 %a) { - %r = tail call %s_i1 @test_s_i1(%s_i1 %a); - ret %s_i1 %r; -} - -; CHECK: .func (.param .align 1 .b8 func_retval0[1]) -; CHECK-LABEL: test_s_i8( -; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1] -; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0]; -; CHECK: .param .align 1 .b8 param0[1]; -; CHECK: st.param.b8 [param0+0], [[A]] -; CHECK: .param .align 1 .b8 retval0[1]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i8, -; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0]; -; CHECK: st.param.b8 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i8 @test_s_i8(%s_i8 %a) { - %r = tail call %s_i8 @test_s_i8(%s_i8 %a); - ret %s_i8 %r; -} - -; CHECK: .func (.param .align 2 .b8 func_retval0[2]) -; CHECK-LABEL: test_s_i16( -; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2] -; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0]; -; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0+0], [[A]] -; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni -; CHECK-NEXT: test_s_i16, -; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i16 @test_s_i16(%s_i16 %a) { - %r = tail call %s_i16 @test_s_i16(%s_i16 %a); - ret %s_i16 %r; -} - -; CHECK: .func (.param .align 2 .b8 func_retval0[2]) -; CHECK-LABEL: test_s_f16( -; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2] -; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0]; -; CHECK: .param .align 2 .b8 param0[2]; -; CHECK: st.param.b16 [param0+0], [[A]] -; CHECK: .param .align 2 .b8 retval0[2]; -; CHECK: call.uni -; CHECK-NEXT: test_s_f16, -; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0]; -; CHECK: st.param.b16 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_f16 @test_s_f16(%s_f16 %a) { - %r = tail call %s_f16 @test_s_f16(%s_f16 %a); - ret %s_f16 %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_s_i32( -; CHECK-NEXT: .param 
.align 4 .b8 test_s_i32_param_0[4] -; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0]; -; CHECK: .param .align 4 .b8 param0[4] -; CHECK: st.param.b32 [param0+0], [[E]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32, -; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; -; CHECK: st.param.b32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i32 @test_s_i32(%s_i32 %a) { - %r = tail call %s_i32 @test_s_i32(%s_i32 %a); - ret %s_i32 %r; -} - -; CHECK: .func (.param .align 4 .b8 func_retval0[4]) -; CHECK-LABEL: test_s_f32( -; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4] -; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_f32_param_0]; -; CHECK: .param .align 4 .b8 param0[4] -; CHECK: st.param.f32 [param0+0], [[E]]; -; CHECK: .param .align 4 .b8 retval0[4]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_f32, -; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0]; -; CHECK: st.param.f32 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_f32 @test_s_f32(%s_f32 %a) { - %r = tail call %s_f32 @test_s_f32(%s_f32 %a); - ret %s_f32 %r; -} - -; CHECK: .func (.param .align 8 .b8 func_retval0[8]) -; CHECK-LABEL: test_s_i64( -; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8] -; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0]; -; CHECK: .param .align 8 .b8 param0[8]; -; CHECK: st.param.b64 [param0+0], [[E]]; -; CHECK: .param .align 8 .b8 retval0[8]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i64, -; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0]; -; CHECK: st.param.b64 [func_retval0+0], [[R]]; -; CHECK-NEXT: ret; -define %s_i64 @test_s_i64(%s_i64 %a) { - %r = tail call %s_i64 @test_s_i64(%s_i64 %a); - ret %s_i64 %r; -} - -; Fields that have different types, but identical sizes are not vectorized. -; CHECK: .func (.param .align 8 .b8 func_retval0[24]) -; CHECK-LABEL: test_s_i32f32( -; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24] -; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16]; -; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12]; -; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8]; -; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4]; -; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0]; -; CHECK: .param .align 8 .b8 param0[24]; -; CHECK-DAG: st.param.b32 [param0+0], [[E0]]; -; CHECK-DAG: st.param.f32 [param0+4], [[E1]]; -; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; -; CHECK-DAG: st.param.f32 [param0+12], [[E3]]; -; CHECK-DAG: st.param.b64 [param0+16], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[24]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32f32, -; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0]; -; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4]; -; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12]; -; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]]; -; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]]; -; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]]; -; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]]; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]]; -; CHECK: ret; -define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) { - %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a); - ret %s_i32f32 %r; -} - -; We do vectorize consecutive fields with matching types. 
-; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24]) -; CHECK-LABEL: test_s_i32x4( -; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24] -; CHECK-DAG: ld.param.u64 [[RD1:%rd[0-9]+]], [test_s_i32x4_param_0+16]; -; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8]; -; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0]; -; CHECK: .param .align 8 .b8 param0[24]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]}; -; CHECK: st.param.b64 [param0+16], [[E4]]; -; CHECK: .param .align 8 .b8 retval0[24]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i32x4, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8]; -; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16]; -; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]}; -; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]]; -; CHECK: ret; - -define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) { - %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a); - ret %s_i32x4 %r; -} - -; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32]) -; CHECK-LABEL: test_s_i1i32x4( -; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32] -; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24]; -; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16]; -; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12]; -; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8]; -; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0]; -; CHECK: .param .align 8 .b8 param0[32]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b8 [param0+8], [[E2]]; -; CHECK: st.param.b32 [param0+12], [[E3]]; -; CHECK: st.param.b32 [param0+16], [[E4]]; -; CHECK: st.param.b64 [param0+24], [[E5]]; -; CHECK: .param .align 8 .b8 retval0[32]; -; CHECK: call.uni (retval0), -; CHECK: test_s_i1i32x4, -; CHECK: ( -; CHECK: param0 -; CHECK: ); -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8]; -; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12]; -; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16]; -; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK: st.param.b8 [func_retval0+8], [[RE2]]; -; CHECK: st.param.b32 [func_retval0+12], [[RE3]]; -; CHECK: st.param.b32 [func_retval0+16], [[RE4]]; -; CHECK: st.param.b64 [func_retval0+24], [[RE5]]; -; CHECK: ret; - -define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) { - %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a); - ret %s_i8i32x4 %r; -} - -; -- All loads/stores from parameters aligned by one must be done one -; -- byte at a time. 
-; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25]) -; CHECK-LABEL: test_s_i1i32x4p( -; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25] -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+12]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1]; -; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0]; -; --- TODO -; --- Unaligned parameter store/ return value load is broken in both nvcc -; --- and llvm and needs to be fixed. -; CHECK: .param .align 1 .b8 param0[25]; -; CHECK-DAG: st.param.b32 [param0+0], -; CHECK-DAG: st.param.b32 [param0+4], -; CHECK-DAG: st.param.b8 [param0+8], -; CHECK-DAG: st.param.b32 [param0+9], -; CHECK-DAG: st.param.b32 [param0+13], -; CHECK-DAG: st.param.b64 [param0+17], -; CHECK: .param .align 1 .b8 retval0[25]; -; CHECK: call.uni (retval0), -; CHECK-NEXT: test_s_i1i32x4p, -; CHECK-DAG: ld.param.b32 %r41, [retval0+0]; -; CHECK-DAG: ld.param.b32 %r42, [retval0+4]; -; CHECK-DAG: ld.param.b8 %rs2, [retval0+8]; -; CHECK-DAG: ld.param.b32 %r43, [retval0+9]; -; CHECK-DAG: ld.param.b32 %r44, [retval0+13]; -; CHECK-DAG: ld.param.b64 %rd23, [retval0+17]; -; CHECK-DAG: st.param.b32 [func_retval0+0], -; CHECK-DAG: st.param.b32 [func_retval0+4], -; CHECK-DAG: st.param.b8 [func_retval0+8], -; CHECK-DAG: st.param.b32 [func_retval0+9], -; CHECK-DAG: st.param.b32 [func_retval0+13], -; CHECK-DAG: st.param.b64 [func_retval0+17], - -define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) { - %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a); - ret %s_i8i32x4p %r; -} - -; Check that we can vectorize loads that span multiple aggregate fields. 
-; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80]) -; CHECK-LABEL: test_s_crossfield( -; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80] -; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64]; -; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48]; -; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32]; -; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16]; -; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; -; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; -; CHECK: .param .align 16 .b8 param0[80]; -; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK: st.param.b32 [param0+8], [[E2]]; -; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]}; -; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]}; -; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]}; -; CHECK: st.param.b32 [param0+64], [[E15]]; -; CHECK: .param .align 16 .b8 retval0[80]; -; CHECK: call.uni (retval0), -; CHECK: test_s_crossfield, -; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; -; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; -; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16]; -; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32]; -; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48]; -; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64]; -; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; -; CHECK: st.param.b32 [func_retval0+8], [[RE2]]; -; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]}; -; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]}; -; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]}; -; CHECK: st.param.b32 [func_retval0+64], [[RE15]]; -; CHECK: ret; - -define %s_crossfield @test_s_crossfield(%s_crossfield %a) { - %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a); - ret %s_crossfield %r; -} +; Verifies correctness of load/store of parameters and return values. +; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 -verify-machineinstrs | FileCheck %s + +%s_i1 = type { i1 } +%s_i8 = type { i8 } +%s_i16 = type { i16 } +%s_f16 = type { half } +%s_i32 = type { i32 } +%s_f32 = type { float } +%s_i64 = type { i64 } +%s_f64 = type { double } + +; More complicated types. i64 is used to increase natural alignment +; requirement for the type. +%s_i32x4 = type { i32, i32, i32, i32, i64} +%s_i32f32 = type { i32, float, i32, float, i64} +%s_i8i32x4 = type { i32, i32, i8, i32, i32, i64} +%s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64}> +%s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}]} +; All scalar parameters must be at least 32 bits in size. +; i1 is loaded/stored as i8. 
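+; For illustration only, a minimal caller (a sketch; the wrapper name
+; @i1_wrapper is made up and is not one of the checked functions below)
+; hits the same widening rule:
+;   define i1 @i1_wrapper(i1 %v) {
+;     %r = call i1 @test_i1(i1 %v)
+;     ret i1 %r
+;   }
+; The i1 travels as a .b32 param and is masked with "and.b32 ..., 1" both
+; when stored before the call and when reloaded from retval0, which is
+; exactly what the test_i1 checks below verify.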
+ +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i1( +; CHECK-NEXT: .param .b32 test_i1_param_0 +; CHECK: ld.param.u8 [[A8:%r[0-9]+]], [test_i1_param_0]; +; CHECK: and.b32 [[A:%r[0-9]+]], [[A8]], 1; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]] +; CHECK: .param .b32 retval0; +; CHECK: call.uni +; CHECK-NEXT: test_i1, +; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R:%r[0-9]+]], [[R8]], 1; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK: ret; +define i1 @test_i1(i1 %a) { + %r = tail call i1 @test_i1(i1 %a); + ret i1 %r; +} + +; Signed i1 is a somewhat special case. We only care about one bit and +; then use neg.s32 to convert it to 32-bit -1 if it's set. +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i1s( +; CHECK-NEXT: .param .b32 test_i1s_param_0 +; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i1s_param_0]; +; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]]; +; CHECK: and.b32 [[A1:%r[0-9]+]], [[A32]], 1; +; CHECK: neg.s32 [[A:%r[0-9]+]], [[A1]]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[A]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni +; CHECK: ld.param.b32 [[R8:%r[0-9]+]], [retval0+0]; +; CHECK: and.b32 [[R1:%r[0-9]+]], [[R8]], 1; +; CHECK: neg.s32 [[R:%r[0-9]+]], [[R1]]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define signext i1 @test_i1s(i1 signext %a) { + %r = tail call signext i1 @test_i1s(i1 signext %a); + ret i1 %r; +} + +; Make sure that i1 loads are vectorized as i8 loads, respecting each element alignment. +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v3i1( +; CHECK-NEXT: .param .align 4 .b8 test_v3i1_param_0[4] +; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i1_param_0+2]; +; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i1_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK-DAG: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]}; +; CHECK-DAG: st.param.b8 [param0+2], [[E2]]; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_v3i1, +; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2]; +; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]} +; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]]; +; CHECK-NEXT: ret; +define <3 x i1> @test_v3i1(<3 x i1> %a) { + %r = tail call <3 x i1> @test_v3i1(<3 x i1> %a); + ret <3 x i1> %r; +} + +; CHECK: .func (.param .align 4 .b8 func_retval0[4]) +; CHECK-LABEL: test_v4i1( +; CHECK-NEXT: .param .align 4 .b8 test_v4i1_param_0[4] +; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i1_param_0] +; CHECK: .param .align 4 .b8 param0[4]; +; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; +; CHECK: .param .align 4 .b8 retval0[4]; +; CHECK: call.uni (retval0), +; CHECK: test_v4i1, +; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}; +; CHECK-NEXT: ret; +define <4 x i1> @test_v4i1(<4 x i1> %a) { + %r = tail call <4 x i1> @test_v4i1(<4 x i1> %a); + ret <4 x i1> %r; +} + +; CHECK: .func (.param .align 8 .b8 func_retval0[8]) +; CHECK-LABEL: test_v5i1( +; CHECK-NEXT: .param .align 8 .b8 test_v5i1_param_0[8] +; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i1_param_0+4]; +; CHECK-DAG: ld.param.v4.u8 
{[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i1_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i1,
+; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i1> @test_v5i1(<5 x i1> %a) {
+ %r = tail call <5 x i1> @test_v5i1(<5 x i1> %a);
+ ret <5 x i1> %r;
+}
+
+; Unsigned i8 is loaded directly into a 32-bit register.
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i8(
+; CHECK-NEXT: .param .b32 test_i8_param_0
+; CHECK: ld.param.u8 [[A8:%rs[0-9]+]], [test_i8_param_0];
+; CHECK: cvt.u32.u16 [[A32:%r[0-9]+]], [[A8]];
+; CHECK: and.b32 [[A:%r[0-9]+]], [[A32]], 255;
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[A]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK: test_i8,
+; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R:%r[0-9]+]], [[R32]], 255;
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i8 @test_i8(i8 %a) {
+ %r = tail call i8 @test_i8(i8 %a);
+ ret i8 %r;
+}
+
+; Signed i8 is loaded into a 16-bit register, which is then sign-extended to i32.
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i8s(
+; CHECK-NEXT: .param .b32 test_i8s_param_0
+; CHECK: ld.param.s8 [[A8:%rs[0-9]+]], [test_i8s_param_0];
+; CHECK: cvt.s32.s16 [[A:%r[0-9]+]], [[A8]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[A]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK: test_i8s,
+; CHECK: ld.param.b32 [[R32:%r[0-9]+]], [retval0+0];
+; -- This is suspicious (though correct) -- why not cvt.u8.u32, cvt.s8.s32 ?
+; CHECK: cvt.u16.u32 [[R16:%rs[0-9]+]], [[R32]];
+; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[R16]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i8 @test_i8s(i8 signext %a) {
+ %r = tail call signext i8 @test_i8s(i8 signext %a);
+ ret i8 %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v3i8(
+; CHECK-NEXT: .param .align 4 .b8 test_v3i8_param_0[4]
+; CHECK-DAG: ld.param.u8 [[E2:%rs[0-9]+]], [test_v3i8_param_0+2];
+; CHECK-DAG: ld.param.v2.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i8_param_0];
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.v2.b8 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b8 [param0+2], [[E2]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i8,
+; CHECK-DAG: ld.param.v2.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+2];
+; CHECK-DAG: st.param.v2.b8 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b8 [func_retval0+2], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i8> @test_v3i8(<3 x i8> %a) {
+ %r = tail call <3 x i8> @test_v3i8(<3 x i8> %a);
+ ret <3 x i8> %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v4i8(
+; CHECK-NEXT: .param .align 4 .b8 test_v4i8_param_0[4]
+; CHECK: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i8_param_0]
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i8,
+; CHECK: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i8> @test_v4i8(<4 x i8> %a) {
+ %r = tail call <4 x i8> @test_v4i8(<4 x i8> %a);
+ ret <4 x i8> %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v5i8(
+; CHECK-NEXT: .param .align 8 .b8 test_v5i8_param_0[8]
+; CHECK-DAG: ld.param.u8 [[E4:%rs[0-9]+]], [test_v5i8_param_0+4];
+; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i8_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v4.b8 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b8 [param0+4], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i8,
+; CHECK-DAG: ld.param.v4.b8 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b8 [[RE4:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v4.b8 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b8 [func_retval0+4], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i8> @test_v5i8(<5 x i8> %a) {
+ %r = tail call <5 x i8> @test_v5i8(<5 x i8> %a);
+ ret <5 x i8> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16(
+; CHECK-NEXT: .param .b32 test_i16_param_0
+; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16_param_0];
+; CHECK: cvt.u32.u16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i16,
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: and.b32 [[R:%r[0-9]+]], [[RE32]], 65535;
+; CHECK: 
st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i16 @test_i16(i16 %a) {
+ %r = tail call i16 @test_i16(i16 %a);
+ ret i16 %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_i16s(
+; CHECK-NEXT: .param .b32 test_i16s_param_0
+; CHECK: ld.param.u16 [[E16:%rs[0-9]+]], [test_i16s_param_0];
+; CHECK: cvt.s32.s16 [[E32:%r[0-9]+]], [[E16]];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b32 [param0+0], [[E32]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i16s,
+; CHECK: ld.param.b32 [[RE32:%r[0-9]+]], [retval0+0];
+; CHECK: cvt.s32.s16 [[R:%r[0-9]+]], [[RE32]];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define signext i16 @test_i16s(i16 signext %a) {
+ %r = tail call signext i16 @test_i16s(i16 signext %a);
+ ret i16 %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v3i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v3i16_param_0[8]
+; CHECK-DAG: ld.param.u16 [[E2:%rs[0-9]+]], [test_v3i16_param_0+4];
+; CHECK-DAG: ld.param.v2.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]]}, [test_v3i16_param_0];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b16 [param0+4], [[E2]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i16,
+; CHECK: ld.param.v2.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b16 [[RE2:%rs[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b16 [func_retval0+4], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i16> @test_v3i16(<3 x i16> %a) {
+ %r = tail call <3 x i16> @test_v3i16(<3 x i16> %a);
+ ret <3 x i16> %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v4i16(
+; CHECK-NEXT: .param .align 8 .b8 test_v4i16_param_0[8]
+; CHECK: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v4i16_param_0]
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i16,
+; CHECK: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i16> @test_v4i16(<4 x i16> %a) {
+ %r = tail call <4 x i16> @test_v4i16(<4 x i16> %a);
+ ret <4 x i16> %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v5i16(
+; CHECK-NEXT: .param .align 16 .b8 test_v5i16_param_0[16]
+; CHECK-DAG: ld.param.u16 [[E4:%rs[0-9]+]], [test_v5i16_param_0+8];
+; CHECK-DAG: ld.param.v4.u16 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [test_v5i16_param_0]
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b16 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i16,
+; CHECK-DAG: ld.param.v4.b16 {[[RE0:%rs[0-9]+]], [[RE1:%rs[0-9]+]], [[RE2:%rs[0-9]+]], [[RE3:%rs[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b16 [[RE4:%rs[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b16 [func_retval0+8], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i16> 
@test_v5i16(<5 x i16> %a) {
+ %r = tail call <5 x i16> @test_v5i16(<5 x i16> %a);
+ ret <5 x i16> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_f16(
+; CHECK-NEXT: .param .b32 test_f16_param_0
+; CHECK: ld.param.b16 [[E:%h[0-9]+]], [test_f16_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.b16 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_f16,
+; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]]
+; CHECK-NEXT: ret;
+define half @test_f16(half %a) {
+ %r = tail call half @test_f16(half %a);
+ ret half %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_v2f16(
+; CHECK-NEXT: .param .align 4 .b8 test_v2f16_param_0[4]
+; CHECK: ld.param.b32 [[E:%hh[0-9]+]], [test_v2f16_param_0];
+; CHECK: .param .align 4 .b8 param0[4];
+; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v2f16,
+; CHECK: ld.param.b32 [[R:%hh[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[R]]
+; CHECK-NEXT: ret;
+define <2 x half> @test_v2f16(<2 x half> %a) {
+ %r = tail call <2 x half> @test_v2f16(<2 x half> %a);
+ ret <2 x half> %r;
+}
+
+; CHECK:.func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v3f16(
+; CHECK: .param .align 8 .b8 test_v3f16_param_0[8]
+; CHECK-DAG: ld.param.b32 [[HH01:%hh[0-9]+]], [test_v3f16_param_0];
+; CHECK-DAG: mov.b32 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]]}, [[HH01]];
+; CHECK-DAG: ld.param.b16 [[E2:%h[0-9]+]], [test_v3f16_param_0+4];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK-DAG: st.param.v2.b16 [param0+0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b16 [param0+4], [[E2]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK: test_v3f16,
+; CHECK-DAG: ld.param.v2.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b16 [[R2:%h[0-9]+]], [retval0+4];
+; CHECK-DAG: st.param.v2.b16 [func_retval0+0], {[[R0]], [[R1]]};
+; CHECK-DAG: st.param.b16 [func_retval0+4], [[R2]];
+; CHECK: ret;
+define <3 x half> @test_v3f16(<3 x half> %a) {
+ %r = tail call <3 x half> @test_v3f16(<3 x half> %a);
+ ret <3 x half> %r;
+}
+
+; CHECK:.func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_v4f16(
+; CHECK: .param .align 8 .b8 test_v4f16_param_0[8]
+; CHECK: ld.param.v2.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]]}, [test_v4f16_param_0];
+; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]];
+; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.v2.b32 [param0+0], {[[HH01]], [[HH23]]};
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK: test_v4f16,
+; CHECK: ld.param.v2.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[RH01]], [[RH23]]};
+; CHECK: ret;
+define <4 x half> @test_v4f16(<4 x half> %a) {
+ %r = tail call <4 x half> @test_v4f16(<4 x half> %a);
+ ret <4 x half> %r;
+}
+
+; CHECK:.func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v5f16(
+; CHECK: .param .align 16 .b8 test_v5f16_param_0[16]
+; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v5f16_param_0];
+; CHECK-DAG: ld.param.b16 [[E4:%h[0-9]+]], [test_v5f16_param_0+8];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b16 [param0+0], 
+; CHECK-DAG: st.param.b16 [param0+8], [[E4]]; +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK: call.uni (retval0), +; CHECK: test_v5f16, +; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.b16 [[R4:%h[0-9]+]], [retval0+8]; +; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]}; +; CHECK-DAG: st.param.b16 [func_retval0+8], [[R4]]; +; CHECK: ret; +define <5 x half> @test_v5f16(<5 x half> %a) { + %r = tail call <5 x half> @test_v5f16(<5 x half> %a); + ret <5 x half> %r; +} + +; CHECK:.func (.param .align 16 .b8 func_retval0[16]) +; CHECK-LABEL: test_v8f16( +; CHECK: .param .align 16 .b8 test_v8f16_param_0[16] +; CHECK: ld.param.v4.u32 {[[R01:%r[0-9]+]], [[R23:%r[0-9]+]], [[R45:%r[0-9]+]], [[R67:%r[0-9]+]]}, [test_v8f16_param_0]; +; CHECK-DAG: mov.b32 [[HH01:%hh[0-9]+]], [[R01]]; +; CHECK-DAG: mov.b32 [[HH23:%hh[0-9]+]], [[R23]]; +; CHECK-DAG: mov.b32 [[HH45:%hh[0-9]+]], [[R45]]; +; CHECK-DAG: mov.b32 [[HH67:%hh[0-9]+]], [[R67]]; +; CHECK: .param .align 16 .b8 param0[16]; +; CHECK: st.param.v4.b32 [param0+0], {[[HH01]], [[HH23]], [[HH45]], [[HH67]]}; +; CHECK: .param .align 16 .b8 retval0[16]; +; CHECK: call.uni (retval0), +; CHECK: test_v8f16, +; CHECK: ld.param.v4.b32 {[[RH01:%hh[0-9]+]], [[RH23:%hh[0-9]+]], [[RH45:%hh[0-9]+]], [[RH67:%hh[0-9]+]]}, [retval0+0]; +; CHECK: st.param.v4.b32 [func_retval0+0], {[[RH01]], [[RH23]], [[RH45]], [[RH67]]}; +; CHECK: ret; +define <8 x half> @test_v8f16(<8 x half> %a) { + %r = tail call <8 x half> @test_v8f16(<8 x half> %a); + ret <8 x half> %r; +} + +; CHECK:.func (.param .align 32 .b8 func_retval0[32]) +; CHECK-LABEL: test_v9f16( +; CHECK: .param .align 32 .b8 test_v9f16_param_0[32] +; CHECK-DAG: ld.param.v4.b16 {[[E0:%h[0-9]+]], [[E1:%h[0-9]+]], [[E2:%h[0-9]+]], [[E3:%h[0-9]+]]}, [test_v9f16_param_0]; +; CHECK-DAG: ld.param.v4.b16 {[[E4:%h[0-9]+]], [[E5:%h[0-9]+]], [[E6:%h[0-9]+]], [[E7:%h[0-9]+]]}, [test_v9f16_param_0+8]; +; CHECK-DAG: ld.param.b16 [[E8:%h[0-9]+]], [test_v9f16_param_0+16]; +; CHECK: .param .align 32 .b8 param0[32]; +; CHECK-DAG: st.param.v4.b16 [param0+0], +; CHECK-DAG: st.param.v4.b16 [param0+8], +; CHECK-DAG: st.param.b16 [param0+16], [[E8]]; +; CHECK: .param .align 32 .b8 retval0[32]; +; CHECK: call.uni (retval0), +; CHECK: test_v9f16, +; CHECK-DAG: ld.param.v4.b16 {[[R0:%h[0-9]+]], [[R1:%h[0-9]+]], [[R2:%h[0-9]+]], [[R3:%h[0-9]+]]}, [retval0+0]; +; CHECK-DAG: ld.param.v4.b16 {[[R4:%h[0-9]+]], [[R5:%h[0-9]+]], [[R6:%h[0-9]+]], [[R7:%h[0-9]+]]}, [retval0+8]; +; CHECK-DAG: ld.param.b16 [[R8:%h[0-9]+]], [retval0+16]; +; CHECK-DAG: st.param.v4.b16 [func_retval0+0], {[[R0]], [[R1]], [[R2]], [[R3]]}; +; CHECK-DAG: st.param.v4.b16 [func_retval0+8], {[[R4]], [[R5]], [[R6]], [[R7]]}; +; CHECK-DAG: st.param.b16 [func_retval0+16], [[R8]]; +; CHECK: ret; +define <9 x half> @test_v9f16(<9 x half> %a) { + %r = tail call <9 x half> @test_v9f16(<9 x half> %a); + ret <9 x half> %r; +} + +; CHECK: .func (.param .b32 func_retval0) +; CHECK-LABEL: test_i32( +; CHECK-NEXT: .param .b32 test_i32_param_0 +; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_i32_param_0]; +; CHECK: .param .b32 param0; +; CHECK: st.param.b32 [param0+0], [[E]]; +; CHECK: .param .b32 retval0; +; CHECK: call.uni (retval0), +; CHECK-NEXT: test_i32, +; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0]; +; CHECK: st.param.b32 [func_retval0+0], [[R]]; +; CHECK-NEXT: ret; +define i32 @test_i32(i32 %a) { + %r = tail call i32 @test_i32(i32 %a); + ret i32 %r; +} + +; CHECK: .func 
(.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v3i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v3i32_param_0[16]
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_v3i32_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_v3i32_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b32 [param0+8], [[E2]];
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i32,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i32> @test_v3i32(<3 x i32> %a) {
+ %r = tail call <3 x i32> @test_v3i32(<3 x i32> %a);
+ ret <3 x i32> %r;
+}
+
+; CHECK: .func (.param .align 16 .b8 func_retval0[16])
+; CHECK-LABEL: test_v4i32(
+; CHECK-NEXT: .param .align 16 .b8 test_v4i32_param_0[16]
+; CHECK: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v4i32_param_0]
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: .param .align 16 .b8 retval0[16];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i32,
+; CHECK: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-NEXT: ret;
+define <4 x i32> @test_v4i32(<4 x i32> %a) {
+ %r = tail call <4 x i32> @test_v4i32(<4 x i32> %a);
+ ret <4 x i32> %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v5i32(
+; CHECK-NEXT: .param .align 32 .b8 test_v5i32_param_0[32]
+; CHECK-DAG: ld.param.u32 [[E4:%r[0-9]+]], [test_v5i32_param_0+16];
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_v5i32_param_0]
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK-DAG: st.param.b32 [param0+16], [[E4]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v5i32,
+; CHECK-DAG: ld.param.v4.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]], [[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+0];
+; CHECK-DAG: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v4.b32 [func_retval0+0], {[[RE0]], [[RE1]], [[RE2]], [[RE3]]}
+; CHECK-DAG: st.param.b32 [func_retval0+16], [[RE4]];
+; CHECK-NEXT: ret;
+define <5 x i32> @test_v5i32(<5 x i32> %a) {
+ %r = tail call <5 x i32> @test_v5i32(<5 x i32> %a);
+ ret <5 x i32> %r;
+}
+
+; CHECK: .func (.param .b32 func_retval0)
+; CHECK-LABEL: test_f32(
+; CHECK-NEXT: .param .b32 test_f32_param_0
+; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_f32_param_0];
+; CHECK: .param .b32 param0;
+; CHECK: st.param.f32 [param0+0], [[E]];
+; CHECK: .param .b32 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_f32,
+; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define float @test_f32(float %a) {
+ %r = tail call float @test_f32(float %a);
+ ret float %r;
+}
+
+; CHECK: .func (.param .b64 func_retval0)
+; CHECK-LABEL: test_i64(
+; CHECK-NEXT: .param .b64 test_i64_param_0
+; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_i64_param_0];
+; CHECK: .param .b64 param0; 
+; CHECK: st.param.b64 [param0+0], [[E]];
+; CHECK: .param .b64 retval0;
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_i64,
+; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define i64 @test_i64(i64 %a) {
+ %r = tail call i64 @test_i64(i64 %a);
+ ret i64 %r;
+}
+
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v3i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v3i64_param_0[32]
+; CHECK-DAG: ld.param.u64 [[E2:%rd[0-9]+]], [test_v3i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v3i64_param_0];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b64 [param0+16], [[E2]];
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v3i64,
+; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b64 [[RE2:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE2]];
+; CHECK-NEXT: ret;
+define <3 x i64> @test_v3i64(<3 x i64> %a) {
+ %r = tail call <3 x i64> @test_v3i64(<3 x i64> %a);
+ ret <3 x i64> %r;
+}
+
+; For i64, vector loads are limited by PTX to 2 elements.
+; CHECK: .func (.param .align 32 .b8 func_retval0[32])
+; CHECK-LABEL: test_v4i64(
+; CHECK-NEXT: .param .align 32 .b8 test_v4i64_param_0[32]
+; CHECK-DAG: ld.param.v2.u64 {[[E2:%rd[0-9]+]], [[E3:%rd[0-9]+]]}, [test_v4i64_param_0+16];
+; CHECK-DAG: ld.param.v2.u64 {[[E0:%rd[0-9]+]], [[E1:%rd[0-9]+]]}, [test_v4i64_param_0];
+; CHECK: .param .align 32 .b8 param0[32];
+; CHECK: st.param.v2.b64 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b64 [param0+16], {[[E2]], [[E3]]};
+; CHECK: .param .align 32 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_v4i64,
+; CHECK: ld.param.v2.b64 {[[RE0:%rd[0-9]+]], [[RE1:%rd[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.b64 {[[RE2:%rd[0-9]+]], [[RE3:%rd[0-9]+]]}, [retval0+16];
+; CHECK-DAG: st.param.v2.b64 [func_retval0+16], {[[RE2]], [[RE3]]};
+; CHECK-DAG: st.param.v2.b64 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-NEXT: ret;
+define <4 x i64> @test_v4i64(<4 x i64> %a) {
+ %r = tail call <4 x i64> @test_v4i64(<4 x i64> %a);
+ ret <4 x i64> %r;
+}
+
+; Aggregates, on the other hand, do not get extended.
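As a compact sketch of the scalar/aggregate contrast (the names here are hypothetical and the expected .param shapes are inferred from the checks above and below, so treat this as an assumption, not part of the imported test):

%byte_box = type { i8 }

; A bare i8 argument is widened to a 32-bit slot (.param .b32)...
declare i8 @scalar_byte(i8)
; ...while the same byte wrapped in a struct keeps a 1-byte parameter
; (.param .align 1 .b8 [1]), matching test_s_i8 below.
declare %byte_box @boxed_byte(%byte_box)

define void @contrast(i8 %b, %byte_box %w) {
  %x = call i8 @scalar_byte(i8 %b)
  %y = call %byte_box @boxed_byte(%byte_box %w)
  ret void
}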
+
+; CHECK: .func (.param .align 1 .b8 func_retval0[1])
+; CHECK-LABEL: test_s_i1(
+; CHECK-NEXT: .param .align 1 .b8 test_s_i1_param_0[1]
+; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i1_param_0];
+; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: st.param.b8 [param0+0], [[A]]
+; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i1,
+; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b8 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i1 @test_s_i1(%s_i1 %a) {
+ %r = tail call %s_i1 @test_s_i1(%s_i1 %a);
+ ret %s_i1 %r;
+}
+
+; CHECK: .func (.param .align 1 .b8 func_retval0[1])
+; CHECK-LABEL: test_s_i8(
+; CHECK-NEXT: .param .align 1 .b8 test_s_i8_param_0[1]
+; CHECK: ld.param.u8 [[A:%rs[0-9]+]], [test_s_i8_param_0];
+; CHECK: .param .align 1 .b8 param0[1];
+; CHECK: st.param.b8 [param0+0], [[A]]
+; CHECK: .param .align 1 .b8 retval0[1];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i8,
+; CHECK: ld.param.b8 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b8 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i8 @test_s_i8(%s_i8 %a) {
+ %r = tail call %s_i8 @test_s_i8(%s_i8 %a);
+ ret %s_i8 %r;
+}
+
+; CHECK: .func (.param .align 2 .b8 func_retval0[2])
+; CHECK-LABEL: test_s_i16(
+; CHECK-NEXT: .param .align 2 .b8 test_s_i16_param_0[2]
+; CHECK: ld.param.u16 [[A:%rs[0-9]+]], [test_s_i16_param_0];
+; CHECK: .param .align 2 .b8 param0[2];
+; CHECK: st.param.b16 [param0+0], [[A]]
+; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_i16,
+; CHECK: ld.param.b16 [[R:%rs[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i16 @test_s_i16(%s_i16 %a) {
+ %r = tail call %s_i16 @test_s_i16(%s_i16 %a);
+ ret %s_i16 %r;
+}
+
+; CHECK: .func (.param .align 2 .b8 func_retval0[2])
+; CHECK-LABEL: test_s_f16(
+; CHECK-NEXT: .param .align 2 .b8 test_s_f16_param_0[2]
+; CHECK: ld.param.b16 [[A:%h[0-9]+]], [test_s_f16_param_0];
+; CHECK: .param .align 2 .b8 param0[2];
+; CHECK: st.param.b16 [param0+0], [[A]]
+; CHECK: .param .align 2 .b8 retval0[2];
+; CHECK: call.uni
+; CHECK-NEXT: test_s_f16,
+; CHECK: ld.param.b16 [[R:%h[0-9]+]], [retval0+0];
+; CHECK: st.param.b16 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_f16 @test_s_f16(%s_f16 %a) {
+ %r = tail call %s_f16 @test_s_f16(%s_f16 %a);
+ ret %s_f16 %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_s_i32(
+; CHECK-NEXT: .param .align 4 .b8 test_s_i32_param_0[4]
+; CHECK: ld.param.u32 [[E:%r[0-9]+]], [test_s_i32_param_0];
+; CHECK: .param .align 4 .b8 param0[4]
+; CHECK: st.param.b32 [param0+0], [[E]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32,
+; CHECK: ld.param.b32 [[R:%r[0-9]+]], [retval0+0];
+; CHECK: st.param.b32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i32 @test_s_i32(%s_i32 %a) {
+ %r = tail call %s_i32 @test_s_i32(%s_i32 %a);
+ ret %s_i32 %r;
+}
+
+; CHECK: .func (.param .align 4 .b8 func_retval0[4])
+; CHECK-LABEL: test_s_f32(
+; CHECK-NEXT: .param .align 4 .b8 test_s_f32_param_0[4]
+; CHECK: ld.param.f32 [[E:%f[0-9]+]], [test_s_f32_param_0];
+; CHECK: .param .align 4 .b8 param0[4]
+; CHECK: st.param.f32 [param0+0], [[E]];
+; CHECK: .param .align 4 .b8 retval0[4];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_f32,
+; CHECK: ld.param.f32 [[R:%f[0-9]+]], [retval0+0];
+; CHECK: st.param.f32 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_f32 @test_s_f32(%s_f32 %a) {
+ %r = tail call %s_f32 
@test_s_f32(%s_f32 %a);
+ ret %s_f32 %r;
+}
+
+; CHECK: .func (.param .align 8 .b8 func_retval0[8])
+; CHECK-LABEL: test_s_i64(
+; CHECK-NEXT: .param .align 8 .b8 test_s_i64_param_0[8]
+; CHECK: ld.param.u64 [[E:%rd[0-9]+]], [test_s_i64_param_0];
+; CHECK: .param .align 8 .b8 param0[8];
+; CHECK: st.param.b64 [param0+0], [[E]];
+; CHECK: .param .align 8 .b8 retval0[8];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i64,
+; CHECK: ld.param.b64 [[R:%rd[0-9]+]], [retval0+0];
+; CHECK: st.param.b64 [func_retval0+0], [[R]];
+; CHECK-NEXT: ret;
+define %s_i64 @test_s_i64(%s_i64 %a) {
+ %r = tail call %s_i64 @test_s_i64(%s_i64 %a);
+ ret %s_i64 %r;
+}
+
+; Fields that have different types but identical sizes are not vectorized.
+; CHECK: .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32f32(
+; CHECK: .param .align 8 .b8 test_s_i32f32_param_0[24]
+; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32f32_param_0+16];
+; CHECK-DAG: ld.param.f32 [[E3:%f[0-9]+]], [test_s_i32f32_param_0+12];
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [test_s_i32f32_param_0+8];
+; CHECK-DAG: ld.param.f32 [[E1:%f[0-9]+]], [test_s_i32f32_param_0+4];
+; CHECK-DAG: ld.param.u32 [[E0:%r[0-9]+]], [test_s_i32f32_param_0];
+; CHECK: .param .align 8 .b8 param0[24];
+; CHECK-DAG: st.param.b32 [param0+0], [[E0]];
+; CHECK-DAG: st.param.f32 [param0+4], [[E1]];
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
+; CHECK-DAG: st.param.f32 [param0+12], [[E3]];
+; CHECK-DAG: st.param.b64 [param0+16], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32f32,
+; CHECK-DAG: ld.param.b32 [[RE0:%r[0-9]+]], [retval0+0];
+; CHECK-DAG: ld.param.f32 [[RE1:%f[0-9]+]], [retval0+4];
+; CHECK-DAG: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8];
+; CHECK-DAG: ld.param.f32 [[RE3:%f[0-9]+]], [retval0+12];
+; CHECK-DAG: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.b32 [func_retval0+0], [[RE0]];
+; CHECK-DAG: st.param.f32 [func_retval0+4], [[RE1]];
+; CHECK-DAG: st.param.b32 [func_retval0+8], [[RE2]];
+; CHECK-DAG: st.param.f32 [func_retval0+12], [[RE3]];
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
+; CHECK: ret;
+define %s_i32f32 @test_s_i32f32(%s_i32f32 %a) {
+ %r = tail call %s_i32f32 @test_s_i32f32(%s_i32f32 %a);
+ ret %s_i32f32 %r;
+}
+
+; We do vectorize consecutive fields with matching types.
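To restate the two rules with a case the test does not cover (a hypothetical type; the expected fused access is an inference from the neighboring checks, not something the imported test verifies):

; Unlike %s_i32f32 above, the two leading fields here have matching
; types, so by the rule above a fused ld.param.v2.f32 would be expected
; for them, as test_s_i32x4 below shows for i32 pairs.
%s_f32x2 = type { float, float, i64 }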
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[24])
+; CHECK-LABEL: test_s_i32x4(
+; CHECK: .param .align 8 .b8 test_s_i32x4_param_0[24]
+; CHECK-DAG: ld.param.u64 [[E4:%rd[0-9]+]], [test_s_i32x4_param_0+16];
+; CHECK-DAG: ld.param.v2.u32 {[[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [test_s_i32x4_param_0+8];
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[24];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.v2.b32 [param0+8], {[[E2]], [[E3]]};
+; CHECK: st.param.b64 [param0+16], [[E4]];
+; CHECK: .param .align 8 .b8 retval0[24];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i32x4,
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.v2.b32 {[[RE2:%r[0-9]+]], [[RE3:%r[0-9]+]]}, [retval0+8];
+; CHECK: ld.param.b64 [[RE4:%rd[0-9]+]], [retval0+16];
+; CHECK-DAG: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK-DAG: st.param.v2.b32 [func_retval0+8], {[[RE2]], [[RE3]]};
+; CHECK-DAG: st.param.b64 [func_retval0+16], [[RE4]];
+; CHECK: ret;
+
+define %s_i32x4 @test_s_i32x4(%s_i32x4 %a) {
+ %r = tail call %s_i32x4 @test_s_i32x4(%s_i32x4 %a);
+ ret %s_i32x4 %r;
+}
+
+; CHECK:.visible .func (.param .align 8 .b8 func_retval0[32])
+; CHECK-LABEL: test_s_i1i32x4(
+; CHECK: .param .align 8 .b8 test_s_i1i32x4_param_0[32]
+; CHECK: ld.param.u64 [[E5:%rd[0-9]+]], [test_s_i1i32x4_param_0+24];
+; CHECK: ld.param.u32 [[E4:%r[0-9]+]], [test_s_i1i32x4_param_0+16];
+; CHECK: ld.param.u32 [[E3:%r[0-9]+]], [test_s_i1i32x4_param_0+12];
+; CHECK: ld.param.u8 [[E2:%rs[0-9]+]], [test_s_i1i32x4_param_0+8];
+; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_i1i32x4_param_0];
+; CHECK: .param .align 8 .b8 param0[32];
+; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK: st.param.b8 [param0+8], [[E2]];
+; CHECK: st.param.b32 [param0+12], [[E3]];
+; CHECK: st.param.b32 [param0+16], [[E4]];
+; CHECK: st.param.b64 [param0+24], [[E5]];
+; CHECK: .param .align 8 .b8 retval0[32];
+; CHECK: call.uni (retval0),
+; CHECK: test_s_i1i32x4,
+; CHECK: (
+; CHECK: param0
+; CHECK: );
+; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0];
+; CHECK: ld.param.b8 [[RE2:%rs[0-9]+]], [retval0+8];
+; CHECK: ld.param.b32 [[RE3:%r[0-9]+]], [retval0+12];
+; CHECK: ld.param.b32 [[RE4:%r[0-9]+]], [retval0+16];
+; CHECK: ld.param.b64 [[RE5:%rd[0-9]+]], [retval0+24];
+; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]};
+; CHECK: st.param.b8 [func_retval0+8], [[RE2]];
+; CHECK: st.param.b32 [func_retval0+12], [[RE3]];
+; CHECK: st.param.b32 [func_retval0+16], [[RE4]];
+; CHECK: st.param.b64 [func_retval0+24], [[RE5]];
+; CHECK: ret;
+
+define %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a) {
+ %r = tail call %s_i8i32x4 @test_s_i1i32x4(%s_i8i32x4 %a);
+ ret %s_i8i32x4 %r;
+}
+
+; -- All loads/stores from parameters aligned by one must be done one
+; -- byte at a time.
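For reference, the offsets behind the byte-at-a-time behavior checked below (a sketch derived from the type definitions at the top of this file; the sizes match the [25] and [32] .param arrays in the checks):

; %s_i8i32x4p = type <{ i32, i32, i8, i32, i32, i64 }>   (packed)
;   field offsets 0, 4, 8, 9, 13, 17 -> size 25; the i32 at +9 is 1-byte aligned
; %s_i8i32x4  = type  { i32, i32, i8, i32, i32, i64 }    (natural)
;   field offsets 0, 4, 8, 12, 16, 24 -> size 32; every field naturally aligned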
+; CHECK:.visible .func (.param .align 1 .b8 func_retval0[25])
+; CHECK-LABEL: test_s_i1i32x4p(
+; CHECK-DAG: .param .align 1 .b8 test_s_i1i32x4p_param_0[25]
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+24];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+23];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+22];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+21];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+20];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+19];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+18];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+17];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+16];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+15];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+14];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+13];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+12];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+11];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+10];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+9];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+8];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+7];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+6];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+5];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+4];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+3];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+2];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0+1];
+; CHECK-DAG: ld.param.u8 %r{{.*}}, [test_s_i1i32x4p_param_0];
+; --- TODO
+; --- Unaligned parameter store / return value load is broken in both nvcc
+; --- and llvm and needs to be fixed.
+; CHECK: .param .align 1 .b8 param0[25];
+; CHECK-DAG: st.param.b32 [param0+0],
+; CHECK-DAG: st.param.b32 [param0+4],
+; CHECK-DAG: st.param.b8 [param0+8],
+; CHECK-DAG: st.param.b32 [param0+9],
+; CHECK-DAG: st.param.b32 [param0+13],
+; CHECK-DAG: st.param.b64 [param0+17],
+; CHECK: .param .align 1 .b8 retval0[25];
+; CHECK: call.uni (retval0),
+; CHECK-NEXT: test_s_i1i32x4p,
+; CHECK-DAG: ld.param.b32 %r41, [retval0+0];
+; CHECK-DAG: ld.param.b32 %r42, [retval0+4];
+; CHECK-DAG: ld.param.b8 %rs2, [retval0+8];
+; CHECK-DAG: ld.param.b32 %r43, [retval0+9];
+; CHECK-DAG: ld.param.b32 %r44, [retval0+13];
+; CHECK-DAG: ld.param.b64 %rd23, [retval0+17];
+; CHECK-DAG: st.param.b32 [func_retval0+0],
+; CHECK-DAG: st.param.b32 [func_retval0+4],
+; CHECK-DAG: st.param.b8 [func_retval0+8],
+; CHECK-DAG: st.param.b32 [func_retval0+9],
+; CHECK-DAG: st.param.b32 [func_retval0+13],
+; CHECK-DAG: st.param.b64 [func_retval0+17],
+
+define %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a) {
+ %r = tail call %s_i8i32x4p @test_s_i1i32x4p(%s_i8i32x4p %a);
+ ret %s_i8i32x4p %r;
+}
+
+; Check that we can vectorize loads that span multiple aggregate fields.
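For orientation before the cross-field checks (a layout sketch derived from the %s_crossfield definition at the top of the file, not part of the imported test):

; %s_crossfield = type { i32, [2 x i32], <4 x i32>, [3 x {i32, i32, i32}] }
;   i32                   -> offset 0
;   [2 x i32]             -> offsets 4..11  (padding at 12..15)
;   <4 x i32>             -> offsets 16..31 (align 16)
;   [3 x {i32, i32, i32}] -> offsets 32..67 (tail padding to size 80)
; The v4.u32 accesses at +32 and +48 expected below each straddle two of
; the inner {i32, i32, i32} elements, which is the cross-field case.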
+; CHECK:.visible .func (.param .align 16 .b8 func_retval0[80]) +; CHECK-LABEL: test_s_crossfield( +; CHECK: .param .align 16 .b8 test_s_crossfield_param_0[80] +; CHECK: ld.param.u32 [[E15:%r[0-9]+]], [test_s_crossfield_param_0+64]; +; CHECK: ld.param.v4.u32 {[[E11:%r[0-9]+]], [[E12:%r[0-9]+]], [[E13:%r[0-9]+]], [[E14:%r[0-9]+]]}, [test_s_crossfield_param_0+48]; +; CHECK: ld.param.v4.u32 {[[E7:%r[0-9]+]], [[E8:%r[0-9]+]], [[E9:%r[0-9]+]], [[E10:%r[0-9]+]]}, [test_s_crossfield_param_0+32]; +; CHECK: ld.param.v4.u32 {[[E3:%r[0-9]+]], [[E4:%r[0-9]+]], [[E5:%r[0-9]+]], [[E6:%r[0-9]+]]}, [test_s_crossfield_param_0+16]; +; CHECK: ld.param.u32 [[E2:%r[0-9]+]], [test_s_crossfield_param_0+8]; +; CHECK: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [test_s_crossfield_param_0]; +; CHECK: .param .align 16 .b8 param0[80]; +; CHECK: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; +; CHECK: st.param.b32 [param0+8], [[E2]]; +; CHECK: st.param.v4.b32 [param0+16], {[[E3]], [[E4]], [[E5]], [[E6]]}; +; CHECK: st.param.v4.b32 [param0+32], {[[E7]], [[E8]], [[E9]], [[E10]]}; +; CHECK: st.param.v4.b32 [param0+48], {[[E11]], [[E12]], [[E13]], [[E14]]}; +; CHECK: st.param.b32 [param0+64], [[E15]]; +; CHECK: .param .align 16 .b8 retval0[80]; +; CHECK: call.uni (retval0), +; CHECK: test_s_crossfield, +; CHECK: ld.param.v2.b32 {[[RE0:%r[0-9]+]], [[RE1:%r[0-9]+]]}, [retval0+0]; +; CHECK: ld.param.b32 [[RE2:%r[0-9]+]], [retval0+8]; +; CHECK: ld.param.v4.b32 {[[RE3:%r[0-9]+]], [[RE4:%r[0-9]+]], [[RE5:%r[0-9]+]], [[RE6:%r[0-9]+]]}, [retval0+16]; +; CHECK: ld.param.v4.b32 {[[RE7:%r[0-9]+]], [[RE8:%r[0-9]+]], [[RE9:%r[0-9]+]], [[RE10:%r[0-9]+]]}, [retval0+32]; +; CHECK: ld.param.v4.b32 {[[RE11:%r[0-9]+]], [[RE12:%r[0-9]+]], [[RE13:%r[0-9]+]], [[RE14:%r[0-9]+]]}, [retval0+48]; +; CHECK: ld.param.b32 [[RE15:%r[0-9]+]], [retval0+64]; +; CHECK: st.param.v2.b32 [func_retval0+0], {[[RE0]], [[RE1]]}; +; CHECK: st.param.b32 [func_retval0+8], [[RE2]]; +; CHECK: st.param.v4.b32 [func_retval0+16], {[[RE3]], [[RE4]], [[RE5]], [[RE6]]}; +; CHECK: st.param.v4.b32 [func_retval0+32], {[[RE7]], [[RE8]], [[RE9]], [[RE10]]}; +; CHECK: st.param.v4.b32 [func_retval0+48], {[[RE11]], [[RE12]], [[RE13]], [[RE14]]}; +; CHECK: st.param.b32 [func_retval0+64], [[RE15]]; +; CHECK: ret; + +define %s_crossfield @test_s_crossfield(%s_crossfield %a) { + %r = tail call %s_crossfield @test_s_crossfield(%s_crossfield %a); + ret %s_crossfield %r; +} diff --git a/test/CodeGen/NVPTX/sched1.ll b/test/CodeGen/NVPTX/sched1.ll index fb01eb262adc..ecdf55ecdbeb 100644 --- a/test/CodeGen/NVPTX/sched1.ll +++ b/test/CodeGen/NVPTX/sched1.ll @@ -6,10 +6,10 @@ define void @foo(i32* %a) { ; CHECK: .func foo ; CHECK: ld.u32 ; CHECK-NEXT: ld.u32 -; CHECK-NEXT: ld.u32 -; CHECK-NEXT: ld.u32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.u32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.u32 ; CHECK-NEXT: add.s32 %ptr0 = getelementptr i32, i32* %a, i32 0 %val0 = load i32, i32* %ptr0 diff --git a/test/CodeGen/NVPTX/sched2.ll b/test/CodeGen/NVPTX/sched2.ll index 91ed77878f81..347f77c5682c 100644 --- a/test/CodeGen/NVPTX/sched2.ll +++ b/test/CodeGen/NVPTX/sched2.ll @@ -4,12 +4,12 @@ define void @foo(<2 x i32>* %a) { ; CHECK: .func foo ; CHECK: ld.v2.u32 ; CHECK-NEXT: ld.v2.u32 -; CHECK-NEXT: ld.v2.u32 -; CHECK-NEXT: ld.v2.u32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.v2.u32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 +; CHECK-NEXT: ld.v2.u32 ; CHECK-NEXT: add.s32 ; CHECK-NEXT: add.s32 %ptr0 = getelementptr <2 x i32>, <2 x i32>* %a, i32 0 diff --git 
a/test/CodeGen/NVPTX/simple-call.ll b/test/CodeGen/NVPTX/simple-call.ll index da6568685fe6..8ff0b5da5bcc 100644 --- a/test/CodeGen/NVPTX/simple-call.ll +++ b/test/CodeGen/NVPTX/simple-call.ll @@ -1,26 +1,26 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s -; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s - - - -; CHECK: .func ({{.*}}) device_func -define float @device_func(float %a) noinline { - %ret = fmul float %a, %a - ret float %ret -} - -; CHECK: .entry kernel_func -define void @kernel_func(float* %a) { - %val = load float, float* %a -; CHECK: call.uni (retval0), -; CHECK: device_func, - %mul = call float @device_func(float %val) - store float %mul, float* %a - ret void -} - - - -!nvvm.annotations = !{!1} - -!1 = !{void (float*)* @kernel_func, !"kernel", i32 1} +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 -verify-machineinstrs | FileCheck %s + + + +; CHECK: .func ({{.*}}) device_func +define float @device_func(float %a) noinline { + %ret = fmul float %a, %a + ret float %ret +} + +; CHECK: .entry kernel_func +define void @kernel_func(float* %a) { + %val = load float, float* %a +; CHECK: call.uni (retval0), +; CHECK: device_func, + %mul = call float @device_func(float %val) + store float %mul, float* %a + ret void +} + + + +!nvvm.annotations = !{!1} + +!1 = !{void (float*)* @kernel_func, !"kernel", i32 1} diff --git a/test/CodeGen/NVPTX/vec8.ll b/test/CodeGen/NVPTX/vec8.ll index a86ba1e29d5c..93b39c1125f8 100644 --- a/test/CodeGen/NVPTX/vec8.ll +++ b/test/CodeGen/NVPTX/vec8.ll @@ -7,7 +7,7 @@ define void @foo(<8 x i8> %a, i8* %b) { ; CHECK-DAG: ld.param.v4.u8 {[[E0:%rs[0-9]+]], [[E1:%rs[0-9]+]], [[E2:%rs[0-9]+]], [[E3:%rs[0-9]+]]}, [foo_param_0] ; CHECK-DAG: ld.param.v4.u8 {[[E4:%rs[0-9]+]], [[E5:%rs[0-9]+]], [[E6:%rs[0-9]+]], [[E7:%rs[0-9]+]]}, [foo_param_0+4] ; CHECK-DAG: ld.param.u32 %[[B:r[0-9+]]], [foo_param_1] -; CHECK: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]]; +; CHECK-DAG: add.s16 [[T:%rs[0-9+]]], [[E1]], [[E6]]; ; CHECK: st.u8 [%[[B]]], [[T]]; %t0 = extractelement <8 x i8> %a, i32 1 %t1 = extractelement <8 x i8> %a, i32 6 diff --git a/test/CodeGen/NVPTX/vector-call.ll b/test/CodeGen/NVPTX/vector-call.ll index bf7b931a5758..d1ec8d25a107 100644 --- a/test/CodeGen/NVPTX/vector-call.ll +++ b/test/CodeGen/NVPTX/vector-call.ll @@ -1,30 +1,30 @@ -; RUN: llc < %s -march=nvptx -mcpu=sm_20 | FileCheck %s - -target triple = "nvptx-unknown-cuda" - -declare void @bar(<4 x i32>) - -; CHECK-LABEL: .func foo( -; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]}; -; CHECK: call.uni -; CHECK: ret; -define void @foo(<4 x i32> %a) { - tail call void @bar(<4 x i32> %a) - ret void -} - -; CHECK-LABEL: .func foo3( -; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0]; -; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8]; -; CHECK: .param .align 16 .b8 param0[16]; -; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]}; -; CHECK-DAG: st.param.b32 [param0+8], [[E2]]; -; CHECK: call.uni -; CHECK: ret; -declare void @bar3(<3 x i32>) -define void @foo3(<3 x i32> %a) { - tail call void @bar3(<3 x i32> %a) - ret void -} +; RUN: llc < %s -march=nvptx -mcpu=sm_20 -verify-machineinstrs | FileCheck %s + +target triple = "nvptx-unknown-cuda" + +declare void @bar(<4 x i32>) + +; CHECK-LABEL: .func foo( 
+; CHECK-DAG: ld.param.v4.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]], [[E2:%r[0-9]+]], [[E3:%r[0-9]+]]}, [foo_param_0];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v4.b32 [param0+0], {[[E0]], [[E1]], [[E2]], [[E3]]};
+; CHECK: call.uni
+; CHECK: ret;
+define void @foo(<4 x i32> %a) {
+ tail call void @bar(<4 x i32> %a)
+ ret void
+}
+
+; CHECK-LABEL: .func foo3(
+; CHECK-DAG: ld.param.v2.u32 {[[E0:%r[0-9]+]], [[E1:%r[0-9]+]]}, [foo3_param_0];
+; CHECK-DAG: ld.param.u32 [[E2:%r[0-9]+]], [foo3_param_0+8];
+; CHECK: .param .align 16 .b8 param0[16];
+; CHECK-DAG: st.param.v2.b32 [param0+0], {[[E0]], [[E1]]};
+; CHECK-DAG: st.param.b32 [param0+8], [[E2]];
+; CHECK: call.uni
+; CHECK: ret;
+declare void @bar3(<3 x i32>)
+define void @foo3(<3 x i32> %a) {
+ tail call void @bar3(<3 x i32> %a)
+ ret void
+}
diff --git a/test/CodeGen/NVPTX/zeroext-32bit.ll b/test/CodeGen/NVPTX/zeroext-32bit.ll
index c2f0ec4b1447..bcfd987b4a66 100644
--- a/test/CodeGen/NVPTX/zeroext-32bit.ll
+++ b/test/CodeGen/NVPTX/zeroext-32bit.ll
@@ -1,26 +1,26 @@
-; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 | FileCheck %s
-
-; The zeroext attribute below should be silently ignored because
-; we can pass a 32-bit integer across a function call without
-; needing to extend it.
-
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
-target triple = "nvptx64-unknown-cuda"
-
-; CHECK-LABEL: .visible .func zeroext_test
-; CHECK-NOT: cvt.u32.u16
-define void @zeroext_test() {
- tail call void @call1(i32 zeroext 0)
- ret void
-}
-
-declare void @call1(i32 zeroext)
-
-; CHECK-LABEL: .visible .func signext_test
-; CHECK-NOT: cvt.s32.s16
-define void @signext_test() {
- tail call void @call2(i32 zeroext 0)
- ret void
-}
-
-declare void @call2(i32 zeroext)
+; RUN: llc < %s -march=nvptx64 -mcpu=sm_30 -verify-machineinstrs | FileCheck %s
+
+; The zeroext attribute below should be silently ignored because
+; we can pass a 32-bit integer across a function call without
+; needing to extend it.
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v16:16:16-v32:32:32-v64:64:64-v128:128:128-n16:32:64"
+target triple = "nvptx64-unknown-cuda"
+
+; CHECK-LABEL: .visible .func zeroext_test
+; CHECK-NOT: cvt.u32.u16
+define void @zeroext_test() {
+ tail call void @call1(i32 zeroext 0)
+ ret void
+}
+
+declare void @call1(i32 zeroext)
+
+; CHECK-LABEL: .visible .func signext_test
+; CHECK-NOT: cvt.s32.s16
+define void @signext_test() {
+ tail call void @call2(i32 zeroext 0)
+ ret void
+}
+
+declare void @call2(i32 zeroext)
diff --git a/test/CodeGen/PowerPC/mtvsrdd.ll b/test/CodeGen/PowerPC/mtvsrdd.ll
new file mode 100644
index 000000000000..1d6a3553b2a1
--- /dev/null
+++ b/test/CodeGen/PowerPC/mtvsrdd.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mcpu=pwr9 -ppc-vsr-nums-as-vr -mtriple=powerpc64le-unknown-unknown \
+; RUN: < %s | FileCheck %s
+
+; This test case checks that r0 is used as constant 0 in the mtvsrdd instruction.
+
+define <2 x i64> @const0(i64 %a) {
+ %vecinit = insertelement <2 x i64> undef, i64 %a, i32 0
+ %vecinit1 = insertelement <2 x i64> %vecinit, i64 0, i32 1
+ ret <2 x i64> %vecinit1
+; CHECK-LABEL: const0
+; CHECK: mtvsrdd v2, 0, r3
+}
+
+define <2 x i64> @noconst0(i64* %a, i64* %b) {
+ %1 = load i64, i64* %a, align 8
+ %2 = load i64, i64* %b, align 8
+ %vecinit = insertelement <2 x i64> undef, i64 %2, i32 0
+ %vecinit1 = insertelement <2 x i64> %vecinit, i64 %1, i32 1
+ ret <2 x i64> %vecinit1
+; CHECK-LABEL: noconst0
+; CHECK: mtvsrdd v2, {{r[0-9]+}}, {{r[0-9]+}}
+}
diff --git a/test/CodeGen/PowerPC/setcc-logic.ll b/test/CodeGen/PowerPC/setcc-logic.ll
index 2ed08e2ae380..a5a86f101a94 100644
--- a/test/CodeGen/PowerPC/setcc-logic.ll
+++ b/test/CodeGen/PowerPC/setcc-logic.ll
@@ -6,7 +6,7 @@ define zeroext i1 @all_bits_clear(i32 %P, i32 %Q) {
 ; CHECK: # BB#0:
 ; CHECK-NEXT: or 3, 3, 4
 ; CHECK-NEXT: cntlzw 3, 3
-; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31
+; CHECK-NEXT: srwi 3, 3, 5
 ; CHECK-NEXT: blr
 %a = icmp eq i32 %P, 0
 %b = icmp eq i32 %Q, 0
@@ -30,11 +30,11 @@ define zeroext i1 @all_sign_bits_clear(i32 %P, i32 %Q) {
 define zeroext i1 @all_bits_set(i32 %P, i32 %Q) {
 ; CHECK-LABEL: all_bits_set:
 ; CHECK: # BB#0:
+; CHECK-NEXT: li 5, -1
 ; CHECK-NEXT: and 3, 3, 4
-; CHECK-NEXT: li 5, 0
-; CHECK-NEXT: li 12, 1
-; CHECK-NEXT: cmpwi 0, 3, -1
-; CHECK-NEXT: isel 3, 12, 5, 2
+; CHECK-NEXT: xor 3, 3, 5
+; CHECK-NEXT: cntlzw 3, 3
+; CHECK-NEXT: srwi 3, 3, 5
 ; CHECK-NEXT: blr
 %a = icmp eq i32 %P, -1
 %b = icmp eq i32 %Q, -1
@@ -437,7 +437,7 @@ define zeroext i1 @and_eq(i16 zeroext %a, i16 zeroext %b, i16 zeroext %c, i16 z
 ; CHECK-NEXT: xor 3, 3, 4
 ; CHECK-NEXT: or 3, 3, 5
 ; CHECK-NEXT: cntlzw 3, 3
-; CHECK-NEXT: rlwinm 3, 3, 27, 31, 31
+; CHECK-NEXT: srwi 3, 3, 5
 ; CHECK-NEXT: blr
 %cmp1 = icmp eq i16 %a, %b
 %cmp2 = icmp eq i16 %c, %d
diff --git a/test/CodeGen/PowerPC/stackmap-frame-setup.ll b/test/CodeGen/PowerPC/stackmap-frame-setup.ll
index b5f1d4cfe4bc..b677b8be2966 100644
--- a/test/CodeGen/PowerPC/stackmap-frame-setup.ll
+++ b/test/CodeGen/PowerPC/stackmap-frame-setup.ll
@@ -7,11 +7,11 @@ entry:
 store i64 11, i64* %metadata
 store i64 12, i64* %metadata
 store i64 13, i64* %metadata
-; ISEL: ADJCALLSTACKDOWN 0, implicit-def
+; ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def
 ; ISEL-NEXT: STACKMAP
 ; ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def
 call void (i64, i32, ...) @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
-; FAST-ISEL: ADJCALLSTACKDOWN 0, implicit-def
+; FAST-ISEL: ADJCALLSTACKDOWN 0, 0, implicit-def
 ; FAST-ISEL-NEXT: STACKMAP
 ; FAST-ISEL-NEXT: ADJCALLSTACKUP 0, 0, implicit-def
 ret void
diff --git a/test/CodeGen/PowerPC/tail-dup-layout.ll b/test/CodeGen/PowerPC/tail-dup-layout.ll
index c9b5bf8c9eeb..9665901e874f 100644
--- a/test/CodeGen/PowerPC/tail-dup-layout.ll
+++ b/test/CodeGen/PowerPC/tail-dup-layout.ll
@@ -1,4 +1,5 @@
-; RUN: llc -O2 < %s | FileCheck %s
+; RUN: llc -O2 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O2 %s
+; RUN: llc -O3 -o - %s | FileCheck --check-prefix=CHECK --check-prefix=CHECK-O3 %s
 target datalayout = "e-m:e-i64:64-n32:64"
 target triple = "powerpc64le-grtev4-linux-gnu"
 
@@ -99,11 +100,9 @@ exit:
 ; test1
 ; test2
 ; test3
-; test4
 ; optional1
 ; optional2
 ; optional3
-; optional4
 ; exit
 ; even for 50/50 branches.
 ; Tail duplication puts test n+1 at the end of optional n
@@ -162,6 +161,98 @@ exit:
 ret void
 }
 
+; Intended layout:
+; The chain-of-triangles-based duplication produces this layout when 3
+; instructions are allowed for tail duplication. 
+; test1 +; test2 +; test3 +; optional1 +; optional2 +; optional3 +; exit +; +; Otherwise it produces the layout: +; test1 +; optional1 +; test2 +; optional2 +; test3 +; optional3 +; exit + +;CHECK-LABEL: straight_test_3_instr_test: +; test1 may have been merged with entry +;CHECK: mr [[TAGREG:[0-9]+]], 3 +;CHECK: clrlwi {{[0-9]+}}, [[TAGREG]], 30 +;CHECK-NEXT: cmplwi {{[0-9]+}}, 2 + +;CHECK-O3-NEXT: bne 0, .[[OPT1LABEL:[_0-9A-Za-z]+]] +;CHECK-O3-NEXT: # %test2 +;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8 +;CHECK-O3-NEXT: bne 0, .[[OPT2LABEL:[_0-9A-Za-z]+]] +;CHECK-O3-NEXT: .[[TEST3LABEL:[_0-9A-Za-z]+]]: # %test3 +;CHECK-O3-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32 +;CHECK-O3-NEXT: bne 0, .[[OPT3LABEL:[_0-9A-Za-z]+]] +;CHECK-O3-NEXT: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit +;CHECK-O3: blr +;CHECK-O3-NEXT: .[[OPT1LABEL]]: +;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 8 +;CHECK-O3-NEXT: beq 0, .[[TEST3LABEL]] +;CHECK-O3-NEXT: .[[OPT2LABEL]]: +;CHECK-O3: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27 +;CHECK-O3-NEXT: cmplwi {{[0-9]+}}, 32 +;CHECK-O3-NEXT: beq 0, .[[EXITLABEL]] +;CHECK-O3-NEXT: .[[OPT3LABEL]]: +;CHECK-O3: b .[[EXITLABEL]] + +;CHECK-O2-NEXT: beq 0, .[[TEST2LABEL:[_0-9A-Za-z]+]] +;CHECK-O2-NEXT: # %optional1 +;CHECK-O2: .[[TEST2LABEL]]: # %test2 +;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 28, 29 +;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 8 +;CHECK-O2-NEXT: beq 0, .[[TEST3LABEL:[_0-9A-Za-z]+]] +;CHECK-O2-NEXT: # %optional2 +;CHECK-O2: .[[TEST3LABEL]]: # %test3 +;CHECK-O2-NEXT: rlwinm {{[0-9]+}}, [[TAGREG]], 0, 26, 27 +;CHECK-O2-NEXT: cmplwi {{[0-9]+}}, 32 +;CHECK-O2-NEXT: beq 0, .[[EXITLABEL:[_0-9A-Za-z]+]] +;CHECK-O2-NEXT: # %optional3 +;CHECK-O2: .[[EXITLABEL:[_0-9A-Za-z]+]]: # %exit +;CHECK-O2: blr + + +define void @straight_test_3_instr_test(i32 %tag) { +entry: + br label %test1 +test1: + %tagbit1 = and i32 %tag, 3 + %tagbit1eq0 = icmp eq i32 %tagbit1, 2 + br i1 %tagbit1eq0, label %test2, label %optional1, !prof !2 +optional1: + call void @a() + br label %test2 +test2: + %tagbit2 = and i32 %tag, 12 + %tagbit2eq0 = icmp eq i32 %tagbit2, 8 + br i1 %tagbit2eq0, label %test3, label %optional2, !prof !2 +optional2: + call void @b() + br label %test3 +test3: + %tagbit3 = and i32 %tag, 48 + %tagbit3eq0 = icmp eq i32 %tagbit3, 32 + br i1 %tagbit3eq0, label %exit, label %optional3, !prof !1 +optional3: + call void @c() + br label %exit +exit: + ret void +} + ; Intended layout: ; The chain-based outlining produces the layout ; entry diff --git a/test/CodeGen/PowerPC/testComparesieqsc.ll b/test/CodeGen/PowerPC/testComparesieqsc.ll new file mode 100644 index 000000000000..71ad5ed34969 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesieqsc.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesieqsc.c' + +@glob = common local_unnamed_addr global i8 0, align 1 + +; Function Attrs: norecurse nounwind readnone +define 
signext i32 @test_ieqsc(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_ieqsc: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsc_sext(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_ieqsc_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsc_z(i8 signext %a) { +; CHECK-LABEL: test_ieqsc_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsc_sext_z(i8 signext %a) { +; CHECK-LABEL: test_ieqsc_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsc_store(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_ieqsc_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsc_sext_store(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_ieqsc_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsc_z_store(i8 signext %a) { +; CHECK-LABEL: test_ieqsc_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsc_sext_z_store(i8 signext %a) { +; CHECK-LABEL: test_ieqsc_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesieqsi.ll b/test/CodeGen/PowerPC/testComparesieqsi.ll new file mode 100644 index 000000000000..16882dbd0045 --- 
/dev/null +++ b/test/CodeGen/PowerPC/testComparesieqsi.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesieqsi.c' + +@glob = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsi(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_ieqsi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsi_sext(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_ieqsi_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsi_z(i32 signext %a) { +; CHECK-LABEL: test_ieqsi_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqsi_sext_z(i32 signext %a) { +; CHECK-LABEL: test_ieqsi_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsi_store(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_ieqsi_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsi_sext_store(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_ieqsi_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsi_z_store(i32 signext %a) { +; CHECK-LABEL: test_ieqsi_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r4) 
+; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqsi_sext_z_store(i32 signext %a) { +; CHECK-LABEL: test_ieqsi_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesieqss.ll b/test/CodeGen/PowerPC/testComparesieqss.ll new file mode 100644 index 000000000000..110c5a62804e --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesieqss.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesieqss.c' + +@glob = common local_unnamed_addr global i16 0, align 2 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqss(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_ieqss: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqss_sext(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_ieqss_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqss_z(i16 signext %a) { +; CHECK-LABEL: test_ieqss_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_ieqss_sext_z(i16 signext %a) { +; CHECK-LABEL: test_ieqss_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_ieqss_store(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_ieqss_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void 
@test_ieqss_sext_store(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_ieqss_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqss_z_store(i16 signext %a) { +; CHECK-LABEL: test_ieqss_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_ieqss_sext_z_store(i16 signext %a) { +; CHECK-LABEL: test_ieqss_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesiequc.ll b/test/CodeGen/PowerPC/testComparesiequc.ll new file mode 100644 index 000000000000..e2c975f2c191 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesiequc.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesiequc.c' + +@glob = common local_unnamed_addr global i8 0, align 1 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequc(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_iequc: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequc_sext(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_iequc_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequc_z(i8 zeroext %a) { +; CHECK-LABEL: test_iequc_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequc_sext_z(i8 zeroext %a) { +; 
CHECK-LABEL: test_iequc_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_iequc_store(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_iequc_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequc_sext_store(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_iequc_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequc_z_store(i8 zeroext %a) { +; CHECK-LABEL: test_iequc_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequc_sext_z_store(i8 zeroext %a) { +; CHECK-LABEL: test_iequc_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesiequi.ll b/test/CodeGen/PowerPC/testComparesiequi.ll new file mode 100644 index 000000000000..789b176a7700 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesiequi.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesiequi.c' + +@glob = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequi(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_iequi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone 
+define signext i32 @test_iequi_sext(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_iequi_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequi_z(i32 zeroext %a) { +; CHECK-LABEL: test_iequi_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + ret i32 %conv +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequi_sext_z(i32 zeroext %a) { +; CHECK-LABEL: test_iequi_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_iequi_store(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_iequi_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequi_sext_store(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_iequi_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequi_z_store(i32 zeroext %a) { +; CHECK-LABEL: test_iequi_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequi_sext_z_store(i32 zeroext %a) { +; CHECK-LABEL: test_iequi_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesiequs.ll b/test/CodeGen/PowerPC/testComparesiequs.ll new file mode 100644 index 000000000000..b72943893e98 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesiequs.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw 
--implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testComparesiequs.c' + +@glob = common local_unnamed_addr global i16 0, align 2 + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequs(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_iequs: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv2 = zext i1 %cmp to i32 + ret i32 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequs_sext(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_iequs_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequs_z(i16 zeroext %a) { +; CHECK-LABEL: test_iequs_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv1 = zext i1 %cmp to i32 + ret i32 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define signext i32 @test_iequs_sext_z(i16 zeroext %a) { +; CHECK-LABEL: test_iequs_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %sub = sext i1 %cmp to i32 + ret i32 %sub +} + +; Function Attrs: norecurse nounwind +define void @test_iequs_store(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_iequs_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequs_sext_store(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_iequs_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequs_z_store(i16 zeroext %a) { +; CHECK-LABEL: test_iequs_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_iequs_sext_z_store(i16 zeroext %a) { +; CHECK-LABEL: test_iequs_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: 
addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} diff --git a/test/CodeGen/PowerPC/testCompareslleqsc.ll b/test/CodeGen/PowerPC/testCompareslleqsc.ll new file mode 100644 index 000000000000..56af12827931 --- /dev/null +++ b/test/CodeGen/PowerPC/testCompareslleqsc.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; ModuleID = 'ComparisonTestCases/testCompareslleqsc.c' + +@glob = common local_unnamed_addr global i8 0, align 1 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsc(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_lleqsc: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsc_sext(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_lleqsc_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsc_z(i8 signext %a) { +; CHECK-LABEL: test_lleqsc_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsc_sext_z(i8 signext %a) { +; CHECK-LABEL: test_lleqsc_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsc_store(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_lleqsc_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsc_sext_store(i8 signext %a, i8 signext %b) { +; CHECK-LABEL: test_lleqsc_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb 
r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsc_z_store(i8 signext %a) { +; CHECK-LABEL: test_lleqsc_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsc_sext_z_store(i8 signext %a) { +; CHECK-LABEL: test_lleqsc_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} diff --git a/test/CodeGen/PowerPC/testCompareslleqsi.ll b/test/CodeGen/PowerPC/testCompareslleqsi.ll new file mode 100644 index 000000000000..90cf2c85888e --- /dev/null +++ b/test/CodeGen/PowerPC/testCompareslleqsi.ll @@ -0,0 +1,138 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsi(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_lleqsi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsi_sext(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_lleqsi_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsi_z(i32 signext %a) { +; CHECK-LABEL: test_lleqsi_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqsi_sext_z(i32 signext %a) { +; CHECK-LABEL: test_lleqsi_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsi_store(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_lleqsi_store: +; CHECK: # BB#0: # %entry +; 
CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsi_sext_store(i32 signext %a, i32 signext %b) { +; CHECK-LABEL: test_lleqsi_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsi_z_store(i32 signext %a) { +; CHECK-LABEL: test_lleqsi_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqsi_sext_z_store(i32 signext %a) { +; CHECK-LABEL: test_lleqsi_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} diff --git a/test/CodeGen/PowerPC/testCompareslleqss.ll b/test/CodeGen/PowerPC/testCompareslleqss.ll new file mode 100644 index 000000000000..df60a6ccc00e --- /dev/null +++ b/test/CodeGen/PowerPC/testCompareslleqss.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i16 0, align 2 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqss(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_lleqss: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqss_sext(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_lleqss_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqss_z(i16 signext %a) {
+; CHECK-LABEL: test_lleqss_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_lleqss_sext_z(i16 signext %a) { +; CHECK-LABEL: test_lleqss_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind +define void @test_lleqss_store(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_lleqss_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqss_sext_store(i16 signext %a, i16 signext %b) { +; CHECK-LABEL: test_lleqss_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqss_z_store(i16 signext %a) { +; CHECK-LABEL: test_lleqss_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_lleqss_sext_z_store(i16 signext %a) { +; CHECK-LABEL: test_lleqss_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesllequc.ll b/test/CodeGen/PowerPC/testComparesllequc.ll new file mode 100644 index 000000000000..248825761295 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesllequc.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i8 0, align 1 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequc(i8 zeroext %a, i8 zeroext %b) { +; 
CHECK-LABEL: test_llequc: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequc_sext(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_llequc_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequc_z(i8 zeroext %a) { +; CHECK-LABEL: test_llequc_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequc_sext_z(i8 zeroext %a) { +; CHECK-LABEL: test_llequc_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind +define void @test_llequc_store(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_llequc_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = zext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequc_sext_store(i8 zeroext %a, i8 zeroext %b) { +; CHECK-LABEL: test_llequc_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, %b + %conv3 = sext i1 %cmp to i8 + store i8 %conv3, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequc_z_store(i8 zeroext %a) { +; CHECK-LABEL: test_llequc_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = zext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequc_sext_z_store(i8 zeroext %a) { +; CHECK-LABEL: test_llequc_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stb r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i8 %a, 0 + %conv2 = sext i1 %cmp to i8 + store i8 %conv2, i8* @glob, align 1 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesllequi.ll b/test/CodeGen/PowerPC/testComparesllequi.ll new file mode 100644 index 000000000000..2342d80d94ef --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesllequi.ll @@ 
-0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i32 0, align 4 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequi(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_llequi: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequi_sext(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_llequi_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequi_z(i32 zeroext %a) { +; CHECK-LABEL: test_llequi_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv1 = zext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequi_sext_z(i32 zeroext %a) { +; CHECK-LABEL: test_llequi_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv1 = sext i1 %cmp to i64 + ret i64 %conv1 +} + +; Function Attrs: norecurse nounwind +define void @test_llequi_store(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_llequi_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequi_sext_store(i32 zeroext %a, i32 zeroext %b) { +; CHECK-LABEL: test_llequi_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, %b + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequi_z_store(i32 zeroext %a) { +; CHECK-LABEL: test_llequi_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %conv = zext i1 %cmp to i32 + store i32 %conv, i32* @glob, align 4 
+ ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequi_sext_z_store(i32 zeroext %a) { +; CHECK-LABEL: test_llequi_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: stw r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i32 %a, 0 + %sub = sext i1 %cmp to i32 + store i32 %sub, i32* @glob, align 4 + ret void +} diff --git a/test/CodeGen/PowerPC/testComparesllequs.ll b/test/CodeGen/PowerPC/testComparesllequs.ll new file mode 100644 index 000000000000..e79a974c06f5 --- /dev/null +++ b/test/CodeGen/PowerPC/testComparesllequs.ll @@ -0,0 +1,137 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -verify-machineinstrs -mtriple=powerpc64-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl +; RUN: llc -verify-machineinstrs -mtriple=powerpc64le-unknown-linux-gnu -O2 \ +; RUN: -ppc-asm-full-reg-names -mcpu=pwr8 < %s | FileCheck %s \ +; RUN: --implicit-check-not cmpw --implicit-check-not cmpd --implicit-check-not cmpl + +@glob = common local_unnamed_addr global i16 0, align 2 + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequs(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_llequs: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequs_sext(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_llequs_sext: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i64 + ret i64 %conv3 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequs_z(i16 zeroext %a) { +; CHECK-LABEL: test_llequs_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind readnone +define i64 @test_llequs_sext_z(i16 zeroext %a) { +; CHECK-LABEL: test_llequs_sext_z: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i64 + ret i64 %conv2 +} + +; Function Attrs: norecurse nounwind +define void @test_llequs_store(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_llequs_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: ld r12, .LC0@toc@l(r5) +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r12) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = zext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequs_sext_store(i16 zeroext %a, i16 zeroext %b) { +; CHECK-LABEL: test_llequs_sext_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: xor r3, r3, r4 +; CHECK-NEXT: addis r5, r2, .LC0@toc@ha +; 
CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r5) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, %b + %conv3 = sext i1 %cmp to i16 + store i16 %conv3, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequs_z_store(i16 zeroext %a) { +; CHECK-LABEL: test_llequs_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: srwi r3, r3, 5 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = zext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} + +; Function Attrs: norecurse nounwind +define void @test_llequs_sext_z_store(i16 zeroext %a) { +; CHECK-LABEL: test_llequs_sext_z_store: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: addis r4, r2, .LC0@toc@ha +; CHECK-NEXT: cntlzw r3, r3 +; CHECK-NEXT: ld r4, .LC0@toc@l(r4) +; CHECK-NEXT: rldicr r3, r3, 58, 0 +; CHECK-NEXT: sradi r3, r3, 63 +; CHECK-NEXT: sth r3, 0(r4) +; CHECK-NEXT: blr +entry: + %cmp = icmp eq i16 %a, 0 + %conv2 = sext i1 %cmp to i16 + store i16 %conv2, i16* @glob, align 2 + ret void +} diff --git a/test/CodeGen/SPARC/LeonItinerariesUT.ll b/test/CodeGen/SPARC/LeonItinerariesUT.ll index 87e0c4621c08..d586fe183a92 100644 --- a/test/CodeGen/SPARC/LeonItinerariesUT.ll +++ b/test/CodeGen/SPARC/LeonItinerariesUT.ll @@ -28,9 +28,9 @@ ; LEON3_4_ITIN-LABEL: f32_ops: ; LEON3_4_ITIN: ld ; LEON3_4_ITIN-NEXT: ld -; LEON3_4_ITIN-NEXT: ld ; LEON3_4_ITIN-NEXT: fadds ; LEON3_4_ITIN-NEXT: ld +; LEON3_4_ITIN-NEXT: ld ; LEON3_4_ITIN-NEXT: fsubs ; LEON3_4_ITIN-NEXT: fmuls ; LEON3_4_ITIN-NEXT: retl @@ -47,4 +47,4 @@ entry: %6 = fmul float %5, %3 %7 = fdiv float %6, %4 ret float %7 -} \ No newline at end of file +} diff --git a/test/CodeGen/SPARC/inlineasm-v9.ll b/test/CodeGen/SPARC/inlineasm-v9.ll new file mode 100644 index 000000000000..9c5424c46229 --- /dev/null +++ b/test/CodeGen/SPARC/inlineasm-v9.ll @@ -0,0 +1,30 @@ +; RUN: llc -march=sparcv9 <%s | FileCheck %s + +;; Ensures that inline-asm accepts and uses 'f' and 'e' register constraints. +; CHECK-LABEL: faddd: +; CHECK: faddd %f0, %f2, %f0 +define double @faddd(double, double) local_unnamed_addr #2 { +entry: + %2 = tail call double asm sideeffect "faddd $1, $2, $0;", "=f,f,e"(double %0, double %1) #7 + ret double %2 +} + +; CHECK-LABEL: faddq: +; CHECK: faddq %f0, %f4, %f0 +define fp128 @faddq(fp128, fp128) local_unnamed_addr #2 { +entry: + %2 = tail call fp128 asm sideeffect "faddq $1, $2, $0;", "=f,f,e"(fp128 %0, fp128 %1) #7 + ret fp128 %2 +} + +;; Ensure that 'e' can indeed go in the high area, and 'f' cannot. +; CHECK-LABEL: faddd_high: +; CHECK: fmovd %f2, %f32 +; CHECK: fmovd %f0, %f2 +; CHECK: faddd %f2, %f32, %f2 +define double @faddd_high(double, double) local_unnamed_addr #2 { +entry: + %2 = tail call double asm sideeffect "faddd $1, $2, $0;", "=f,f,e,~{d0},~{q1},~{q2},~{q3},~{q4},~{q5},~{q6},~{q7}"(double %0, double %1) #7 + ret double %2 +} + diff --git a/test/CodeGen/SPARC/inlineasm.ll b/test/CodeGen/SPARC/inlineasm.ll index af631f0d29f5..35a62706c1ab 100644 --- a/test/CodeGen/SPARC/inlineasm.ll +++ b/test/CodeGen/SPARC/inlineasm.ll @@ -94,3 +94,21 @@ entry: %0 = call i64 asm sideeffect "xor $1, %g0, $0", "=r,0,~{i1}"(i64 5); ret i64 %0 } + + +;; Ensures that inline-asm accepts and uses 'f' and 'e' register constraints. 
+; CHECK-LABEL: fadds: +; CHECK: fadds %f0, %f1, %f0 +define float @fadds(float, float) local_unnamed_addr #2 { +entry: + %2 = tail call float asm sideeffect "fadds $1, $2, $0;", "=f,f,e"(float %0, float %1) #7 + ret float %2 +} + +; CHECK-LABEL: faddd: +; CHECK: faddd %f0, %f2, %f0 +define double @faddd(double, double) local_unnamed_addr #2 { +entry: + %2 = tail call double asm sideeffect "faddd $1, $2, $0;", "=f,f,e"(double %0, double %1) #7 + ret double %2 +} diff --git a/test/CodeGen/SystemZ/list-ilp-crash.ll b/test/CodeGen/SystemZ/list-ilp-crash.ll new file mode 100644 index 000000000000..c67ed318b93f --- /dev/null +++ b/test/CodeGen/SystemZ/list-ilp-crash.ll @@ -0,0 +1,23 @@ +; RUN: llc < %s -mtriple=s390x-linux-gnu -mcpu=z13 -pre-RA-sched=list-ilp | FileCheck %s +; +; Check that list-ilp scheduler does not crash due to SystemZ's current use +; of MVT::Untyped. + +define void @pr32723(i8) { +; CHECK: .text +BB: + br label %CF245 + +CF245: ; preds = %CF245, %BB + %Shuff57 = shufflevector <4 x i8> zeroinitializer, <4 x i8> zeroinitializer, <4 x i32> + %Cmp84 = icmp uge i8 %0, undef + br i1 %Cmp84, label %CF245, label %CF260 + +CF260: ; preds = %CF245 + %B156 = sdiv <4 x i8> %Shuff57, %Shuff57 + br label %CF255 + +CF255: ; preds = %CF255, %CF260 + %I186 = insertelement <4 x i8> %B156, i8 %0, i32 2 + br label %CF255 +} diff --git a/test/CodeGen/SystemZ/lower-copy-undef-src.mir b/test/CodeGen/SystemZ/lower-copy-undef-src.mir new file mode 100644 index 000000000000..322460d79d68 --- /dev/null +++ b/test/CodeGen/SystemZ/lower-copy-undef-src.mir @@ -0,0 +1,14 @@ +# RUN: llc -mtriple=s390x-linux-gnu -mcpu=z13 -run-pass=postrapseudos -o - %s | FileCheck %s +# +# Test that a COPY with an undef source operand gets handled like an identity +# copy rather than lowered into a target instruction with the undef flag +# dropped. +--- +# CHECK-LABEL: name: undef_copy +# CHECK: %r13d = KILL undef %r0d, implicit killed %r12q, implicit-def %r12q +name: undef_copy +tracksRegLiveness: true +body: | + bb.0: + liveins: %r12q + %r13d = COPY undef %r0d, implicit killed %r12q, implicit-def %r12q diff --git a/test/CodeGen/Thumb2/v8_IT_5.ll b/test/CodeGen/Thumb2/v8_IT_5.ll index d8d60413cb0e..5e7a40299ed7 100644 --- a/test/CodeGen/Thumb2/v8_IT_5.ll +++ b/test/CodeGen/Thumb2/v8_IT_5.ll @@ -9,7 +9,7 @@ ; CHECK-NEXT: b ; CHECK: [[JUMPTARGET]]:{{.*}}%if.else173 ; CHECK-NEXT: mov.w -; CHECK-NEXT: bx lr +; CHECK-NEXT: pop ; CHECK-NEXT: %if.else145 ; CHECK-NEXT: mov.w diff --git a/test/CodeGen/X86/2007-01-08-InstrSched.ll b/test/CodeGen/X86/2007-01-08-InstrSched.ll index 4ec703921e29..24aa5b98d0bb 100644 --- a/test/CodeGen/X86/2007-01-08-InstrSched.ll +++ b/test/CodeGen/X86/2007-01-08-InstrSched.ll @@ -11,12 +11,12 @@ define float @foo(float %x) nounwind { %tmp14 = fadd float %tmp12, %tmp7 ret float %tmp14 -; CHECK: mulss -; CHECK: mulss ; CHECK: mulss ; CHECK: mulss ; CHECK: addss +; CHECK: mulss ; CHECK: addss +; CHECK: mulss ; CHECK: addss ; CHECK: ret } diff --git a/test/CodeGen/X86/2010-01-18-DbgValue.ll b/test/CodeGen/X86/2010-01-18-DbgValue.ll index 8b11fd86ef17..ae60d57bbf49 100644 --- a/test/CodeGen/X86/2010-01-18-DbgValue.ll +++ b/test/CodeGen/X86/2010-01-18-DbgValue.ll @@ -1,14 +1,19 @@ -; RUN: llc -march=x86 -O0 < %s | FileCheck %s -; Currently, dbg.declare generates a DEBUG_VALUE comment. Eventually it will -; generate DWARF and this test will need to be modified or removed. 
+; RUN: llc -march=x86 -O0 < %s -filetype=obj | llvm-dwarfdump - | FileCheck %s +; CHECK-LABEL: .debug_info contents: + +; CHECK-LABEL: DW_TAG_subprogram +; CHECK: DW_AT_name [DW_FORM_strp] ( {{.*}}"foo") +; CHECK: DW_TAG_formal_parameter +; CHECK-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 {{..}} ) +; DW_OP_fbreg ?? +; CHECK-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"my_r0") %struct.Pt = type { double, double } %struct.Rect = type { %struct.Pt, %struct.Pt } define double @foo(%struct.Rect* byval %my_r0) nounwind ssp !dbg !1 { entry: -;CHECK: DEBUG_VALUE %retval = alloca double ; <double*> [#uses=2] %0 = alloca double ; <double*> [#uses=2] %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0] diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll deleted file mode 100644 index 495ff0304b1b..000000000000 --- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll +++ /dev/null @@ -1,51 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \ -; RUN: -verify-machineinstrs | FileCheck %s -; -; Test LiveInterval update handling of DBG_VALUE. -; rdar://12777252. -; -; CHECK: %entry -; CHECK: DEBUG_VALUE: subdivp:hg -; CHECK: j - -%struct.node.0.27 = type { i16, double, [3 x double], i32, i32 } -%struct.hgstruct.2.29 = type { %struct.bnode.1.28*, [3 x double], double, [3 x double] } -%struct.bnode.1.28 = type { i16, double, [3 x double], i32, i32, [3 x double], [3 x double], [3 x double], double, %struct.bnode.1.28*, %struct.bnode.1.28* } - -declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone - -define signext i16 @subdivp(%struct.node.0.27* nocapture %p, double %dsq, double %tolsq, %struct.hgstruct.2.29* nocapture byval align 8 %hg) nounwind uwtable readonly ssp !dbg !14 { -entry: - call void @llvm.dbg.declare(metadata %struct.hgstruct.2.29* %hg, metadata !4, metadata !DIExpression()), !dbg !DILocation(scope: !14) - %type = getelementptr inbounds %struct.node.0.27, %struct.node.0.27* %p, i64 0, i32 0 - %0 = load i16, i16* %type, align 2 - %cmp = icmp eq i16 %0, 1 - br i1 %cmp, label %return, label %for.cond.preheader - -for.cond.preheader: ; preds = %entry - %arrayidx6.1 = getelementptr inbounds %struct.hgstruct.2.29, %struct.hgstruct.2.29* %hg, i64 0, i32 1, i64 1 - %cmp22 = fcmp olt double 0.000000e+00, %dsq - %conv24 = zext i1 %cmp22 to i16 - br label %return - -return: ; preds = %for.cond.preheader, %entry - %retval.0 = phi i16 [ %conv24, %for.cond.preheader ], [ 0, %entry ] - ret i16 %retval.0 -} - -declare void @llvm.dbg.value(metadata, i64, metadata, metadata) nounwind readnone - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!12} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: FullDebug, file: !11, enums: !2, retainedTypes: !2, globals: !2) -!2 = !{} -!4 = !DILocalVariable(name: "hg", line: 725, arg: 4, scope: !14, file: !5, type: !6) -!5 = !DIFile(filename: "MultiSource/Benchmarks/Olden/bh/newbh.c", directory: "MultiSource/Benchmarks/Olden/bh") -!6 = !DIDerivedType(tag: DW_TAG_typedef, name: "hgstruct", line: 492, file: !11, baseType: !7) -!7 = !DICompositeType(tag: DW_TAG_structure_type, line: 487, size: 512, align: 64, file: !11) -!11 = !DIFile(filename: "MultiSource/Benchmarks/Olden/bh/newbh.c", directory: "MultiSource/Benchmarks/Olden/bh") -!12 = !{i32 1, !"Debug Info Version", i32 3} -!14 = distinct !DISubprogram(name: "subdivp", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped,
isOptimized: true, unit: !0, scopeLine: 1, file: !11, scope: !5, type: !15) -!15 = !DISubroutineType(types: !16) -!16 = !{null} diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll deleted file mode 100644 index fbe6000d7ace..000000000000 --- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll +++ /dev/null @@ -1,142 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \ -; RUN: -verify-machineinstrs | FileCheck %s -; -; Test MachineScheduler handling of DBG_VALUE. -; rdar://12776937. -; -; CHECK: %if.else581 -; CHECK: DEBUG_VALUE: num1 -; CHECK: call - -%union.rec = type {} - -@.str15 = external hidden unnamed_addr constant [6 x i8], align 1 - -declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone - -define i32 @AttachGalley(%union.rec** nocapture %suspend_pt) nounwind uwtable ssp !dbg !21 { -entry: - %num14075 = alloca [20 x i8], align 16 - br label %if.end33 - -if.end33: ; preds = %entry - %cmp1733 = icmp eq i32 undef, 0 - br label %if.else581 - -if.else581: ; preds = %if.end33 - %cmp586 = icmp eq i8 undef, -123 - br i1 %cmp586, label %if.then588, label %if.else594 - -if.then588: ; preds = %if.else581 - br label %for.cond1710.preheader - -if.else594: ; preds = %if.else581 - unreachable - -for.cond1710.preheader: ; preds = %if.then588 - br label %for.cond1710 - -for.cond1710: ; preds = %for.cond1710, %for.cond1710.preheader - br i1 undef, label %for.cond1710, label %if.then3344 - -if.then3344: - br label %if.then4073 - -if.then4073: ; preds = %if.then3344 - call void @llvm.dbg.declare(metadata [20 x i8]* %num14075, metadata !4, metadata !DIExpression()), !dbg !DILocation(scope: !5) - %arraydecay4078 = getelementptr inbounds [20 x i8], [20 x i8]* %num14075, i64 0, i64 0 - %0 = load i32, i32* undef, align 4 - %add4093 = add nsw i32 %0, 0 - %conv4094 = sitofp i32 %add4093 to float - %div4095 = fdiv float %conv4094, 5.670000e+02 - %conv4096 = fpext float %div4095 to double - %call4097 = call i32 (i8*, i32, i64, i8*, ...) @__sprintf_chk(i8* %arraydecay4078, i32 0, i64 20, i8* getelementptr inbounds ([6 x i8], [6 x i8]* @.str15, i64 0, i64 0), double %conv4096) nounwind - br i1 %cmp1733, label %if.then4107, label %if.else4114 - -if.then4107: ; preds = %if.then4073 - unreachable - -if.else4114: ; preds = %if.then4073 - unreachable -} - -declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...) 
- -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!35} - -!0 = distinct !DICompileUnit(language: DW_LANG_C99, producer: "clang version 3.3 (trunk 168918) (llvm/trunk 168920)", isOptimized: true, emissionKind: FullDebug, file: !19, enums: !2, retainedTypes: !2, globals: !2) -!1 = !{!2} -!2 = !{} -!4 = !DILocalVariable(name: "num1", line: 815, scope: !5, file: !14, type: !15) -!5 = distinct !DILexicalBlock(line: 815, column: 0, file: !14, scope: !6) -!6 = distinct !DILexicalBlock(line: 812, column: 0, file: !14, scope: !7) -!7 = distinct !DILexicalBlock(line: 807, column: 0, file: !14, scope: !8) -!8 = distinct !DILexicalBlock(line: 440, column: 0, file: !14, scope: !9) -!9 = distinct !DILexicalBlock(line: 435, column: 0, file: !14, scope: !10) -!10 = distinct !DILexicalBlock(line: 434, column: 0, file: !14, scope: !11) -!11 = distinct !DILexicalBlock(line: 250, column: 0, file: !14, scope: !12) -!12 = distinct !DILexicalBlock(line: 249, column: 0, file: !14, scope: !13) -!13 = distinct !DILexicalBlock(line: 221, column: 0, file: !14, scope: !21) -!14 = !DIFile(filename: "MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", directory: "MultiSource/Benchmarks/MiBench/consumer-typeset") -!15 = !DICompositeType(tag: DW_TAG_array_type, size: 160, align: 8, baseType: !16, elements: !17) -!16 = !DIBasicType(tag: DW_TAG_base_type, name: "char", size: 8, align: 8, encoding: DW_ATE_signed_char) -!17 = !{!18} -!18 = !DISubrange(count: 20) -!19 = !DIFile(filename: "MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c", directory: "MultiSource/Benchmarks/MiBench/consumer-typeset") - -!21 = distinct !DISubprogram(name: "AttachGalley", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !19, scope: !14, type: !22) -!22 = !DISubroutineType(types: !23) -!23 = !{null} - -; Test DebugValue uses visited by RegisterPressureTracker findUseBetween(). -; -; CHECK: @main -; CHECK: DEBUG_VALUE: main:X -; CHECK: call - -%"class.__gnu_cxx::hash_map" = type { %"class.__gnu_cxx::hashtable" } -%"class.__gnu_cxx::hashtable" = type { i64, i64, i64, i64, i64, i64 } - -define void @main() uwtable ssp personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) !dbg !37 { -entry: - %X = alloca %"class.__gnu_cxx::hash_map", align 8 - br i1 undef, label %cond.true, label %cond.end - -cond.true: ; preds = %entry - unreachable - -cond.end: ; preds = %entry - call void @llvm.dbg.declare(metadata %"class.__gnu_cxx::hash_map"* %X, metadata !31, metadata !DIExpression()), !dbg !DILocation(scope: !37) - %_M_num_elements.i.i.i.i = getelementptr inbounds %"class.__gnu_cxx::hash_map", %"class.__gnu_cxx::hash_map"* %X, i64 0, i32 0, i32 5 - invoke void @_Znwm() - to label %exit.i unwind label %lpad2.i.i.i.i - -exit.i: ; preds = %cond.end - unreachable - -lpad2.i.i.i.i: ; preds = %cond.end - %0 = landingpad { i8*, i32 } - cleanup - br i1 undef, label %lpad.body.i.i, label %if.then.i.i.i.i.i.i.i.i - -if.then.i.i.i.i.i.i.i.i: ; preds = %lpad2.i.i.i.i - unreachable - -lpad.body.i.i: ; preds = %lpad2.i.i.i.i - resume { i8*, i32 } %0 -} - -declare i32 @__gxx_personality_v0(...) 
- -declare void @_Znwm() - -!llvm.dbg.cu = !{!30} - -!30 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 169129) (llvm/trunk 169135)", isOptimized: true, emissionKind: FullDebug, file: !34, enums: !2, retainedTypes: !2) -!31 = !DILocalVariable(name: "X", line: 29, scope: !37, type: !32) -!32 = !DIDerivedType(tag: DW_TAG_typedef, name: "HM", line: 28, file: !34, baseType: null) -!33 = !DIFile(filename: "SingleSource/Benchmarks/Shootout-C++/hash.cpp", directory: "SingleSource/Benchmarks/Shootout-C++") -!34 = !DIFile(filename: "SingleSource/Benchmarks/Shootout-C++/hash.cpp", directory: "SingleSource/Benchmarks/Shootout-C++") -!35 = !{i32 1, !"Debug Info Version", i32 3} -!37 = distinct !DISubprogram(name: "main", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !30, scopeLine: 1, file: !19, scope: !14, type: !22) diff --git a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll b/test/CodeGen/X86/2012-11-30-regpres-dbg.ll deleted file mode 100644 index a717202d3574..000000000000 --- a/test/CodeGen/X86/2012-11-30-regpres-dbg.ll +++ /dev/null @@ -1,47 +0,0 @@ -; RUN: llc < %s -mtriple=x86_64-apple-macosx -enable-misched \ -; RUN: -verify-machineinstrs | FileCheck %s -; -; Test RegisterPressure handling of DBG_VALUE. -; -; CHECK: %entry -; CHECK: DEBUG_VALUE: test:callback -; CHECK: ret - -%struct.btCompoundLeafCallback = type { i32, i32 } - -declare void @llvm.dbg.declare(metadata, metadata, metadata) nounwind readnone - -define void @test() unnamed_addr uwtable ssp align 2 !dbg !2 { -entry: - %callback = alloca %struct.btCompoundLeafCallback, align 8 - br i1 undef, label %if.end, label %if.then - -if.then: ; preds = %entry - unreachable - -if.end: ; preds = %entry - call void @llvm.dbg.declare(metadata %struct.btCompoundLeafCallback* %callback, metadata !3, metadata !DIExpression()), !dbg !DILocation(scope: !2) - %m = getelementptr inbounds %struct.btCompoundLeafCallback, %struct.btCompoundLeafCallback* %callback, i64 0, i32 1 - store i32 0, i32* undef, align 8 - %cmp12447 = icmp sgt i32 undef, 0 - br i1 %cmp12447, label %for.body.lr.ph, label %invoke.cont44 - -for.body.lr.ph: ; preds = %if.end - unreachable - -invoke.cont44: ; preds = %if.end - ret void -} - -!llvm.dbg.cu = !{!0} -!llvm.module.flags = !{!8} - -!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, producer: "clang version 3.3 (trunk 168984) (llvm/trunk 168983)", isOptimized: true, emissionKind: FullDebug, file: !6) -!2 = distinct !DISubprogram(name: "test", isLocal: false, isDefinition: true, virtualIndex: 6, flags: DIFlagPrototyped, isOptimized: true, unit: !0, scopeLine: 1, file: !6, scope: !5, type: !7) -!3 = !DILocalVariable(name: "callback", line: 214, scope: !2, type: !4) -!4 = !DICompositeType(tag: DW_TAG_structure_type, name: "btCompoundLeafCallback", line: 90, size: 64, align: 64, file: !6) -!5 = !DIFile(filename: "MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", directory: "MultiSource/Benchmarks/Bullet") -!6 = !DIFile(filename: "MultiSource/Benchmarks/Bullet/btCompoundCollisionAlgorithm.cpp", directory: "MultiSource/Benchmarks/Bullet") -!7 = !DISubroutineType(types: !9) -!8 = !{i32 1, !"Debug Info Version", i32 3} -!9 = !{null} diff --git a/test/CodeGen/X86/GlobalISel/add-scalar.ll b/test/CodeGen/X86/GlobalISel/add-scalar.ll new file mode 100644 index 000000000000..553bc2789ff0 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/add-scalar.ll @@ -0,0 +1,44 @@ +; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 + +define i64 @test_add_i64(i64 %arg1, i64 %arg2) { +; ALL-LABEL: test_add_i64: +; ALL: # BB#0: +; ALL-NEXT: leaq (%rsi,%rdi), %rax +; ALL-NEXT: retq + %ret = add i64 %arg1, %arg2 + ret i64 %ret +} + +define i32 @test_add_i32(i32 %arg1, i32 %arg2) { +; ALL-LABEL: test_add_i32: +; ALL: # BB#0: +; ALL-NEXT: # kill: %EDI %EDI %RDI +; ALL-NEXT: # kill: %ESI %ESI %RSI +; ALL-NEXT: leal (%rsi,%rdi), %eax +; ALL-NEXT: retq + %ret = add i32 %arg1, %arg2 + ret i32 %ret +} + +define i16 @test_add_i16(i16 %arg1, i16 %arg2) { +; ALL-LABEL: test_add_i16: +; ALL: # BB#0: +; ALL-NEXT: # kill: %DI %DI %RDI +; ALL-NEXT: # kill: %SI %SI %RSI +; ALL-NEXT: leal (%rsi,%rdi), %eax +; ALL-NEXT: # kill: %AX %AX %EAX +; ALL-NEXT: retq + %ret = add i16 %arg1, %arg2 + ret i16 %ret +} + +define i8 @test_add_i8(i8 %arg1, i8 %arg2) { +; ALL-LABEL: test_add_i8: +; ALL: # BB#0: +; ALL-NEXT: addb %dil, %sil +; ALL-NEXT: movl %esi, %eax +; ALL-NEXT: retq + %ret = add i8 %arg1, %arg2 + ret i8 %ret +} diff --git a/test/CodeGen/X86/GlobalISel/binop.ll b/test/CodeGen/X86/GlobalISel/binop.ll index bf4c42cb4292..1aae1db8ab07 100644 --- a/test/CodeGen/X86/GlobalISel/binop.ll +++ b/test/CodeGen/X86/GlobalISel/binop.ll @@ -4,48 +4,6 @@ ; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512F ; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=AVX512VL -define i64 @test_add_i64(i64 %arg1, i64 %arg2) { -; ALL-LABEL: test_add_i64: -; ALL: # BB#0: -; ALL-NEXT: leaq (%rsi,%rdi), %rax -; ALL-NEXT: retq - %ret = add i64 %arg1, %arg2 - ret i64 %ret -} - -define i32 @test_add_i32(i32 %arg1, i32 %arg2) { -; ALL-LABEL: test_add_i32: -; ALL: # BB#0: -; ALL-NEXT: # kill: %EDI %EDI %RDI -; ALL-NEXT: # kill: %ESI %ESI %RSI -; ALL-NEXT: leal (%rsi,%rdi), %eax -; ALL-NEXT: retq - %ret = add i32 %arg1, %arg2 - ret i32 %ret -} - -define i16 @test_add_i16(i16 %arg1, i16 %arg2) { -; ALL-LABEL: test_add_i16: -; ALL: # BB#0: -; ALL-NEXT: # kill: %DI %DI %RDI -; ALL-NEXT: # kill: %SI %SI %RSI -; ALL-NEXT: leal (%rsi,%rdi), %eax -; ALL-NEXT: # kill: %AX %AX %EAX -; ALL-NEXT: retq - %ret = add i16 %arg1, %arg2 - ret i16 %ret -} - -define i8 @test_add_i8(i8 %arg1, i8 %arg2) { -; ALL-LABEL: test_add_i8: -; ALL: # BB#0: -; ALL-NEXT: addb %dil, %sil -; ALL-NEXT: movl %esi, %eax -; ALL-NEXT: retq - %ret = add i8 %arg1, %arg2 - ret i8 %ret -} - define i64 @test_sub_i64(i64 %arg1, i64 %arg2) { ; ALL-LABEL: test_sub_i64: ; ALL: # BB#0: diff --git a/test/CodeGen/X86/GlobalISel/br.ll b/test/CodeGen/X86/GlobalISel/br.ll new file mode 100644 index 000000000000..faa6a0350337 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/br.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -O0 -mtriple=x86_64-linux-gnu -global-isel %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64 + +define void @uncondbr() { +; CHECK-LABEL: uncondbr: +; CHECK: # BB#1: # %entry +; CHECK-NEXT: jmp .LBB0_3 +; CHECK-NEXT: .LBB0_2: # %end +; CHECK-NEXT: retq +; CHECK-NEXT: .LBB0_3: # %bb2 +; CHECK-NEXT: jmp .LBB0_2 +entry: + br label %bb2 +end: + ret void +bb2: + br label %end +} + diff --git a/test/CodeGen/X86/GlobalISel/cmp.ll b/test/CodeGen/X86/GlobalISel/cmp.ll new file mode 
100644 index 000000000000..03692bb6b1de --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/cmp.ll @@ -0,0 +1,159 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL + +define i32 @test_icmp_eq_i8(i8 %a, i8 %b) { +; ALL-LABEL: test_icmp_eq_i8: +; ALL: # BB#0: +; ALL-NEXT: cmpb %sil, %dil +; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp eq i8 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_eq_i16(i16 %a, i16 %b) { +; ALL-LABEL: test_icmp_eq_i16: +; ALL: # BB#0: +; ALL-NEXT: cmpw %si, %di +; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp eq i16 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_eq_i64(i64 %a, i64 %b) { +; ALL-LABEL: test_icmp_eq_i64: +; ALL: # BB#0: +; ALL-NEXT: cmpq %rsi, %rdi +; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp eq i64 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_eq_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_eq_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: sete %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp eq i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_ne_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_ne_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setne %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp ne i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_ugt_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_ugt_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: seta %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp ugt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_uge_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_uge_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setae %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp uge i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_ult_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_ult_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setb %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp ult i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_ule_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_ule_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setbe %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp ule i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_sgt_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_sgt_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setg %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp sgt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_sge_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_sge_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setge %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp sge i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_slt_i32(i32 %a, i32 %b) { +; ALL-LABEL: test_icmp_slt_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setl %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp slt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + +define i32 @test_icmp_sle_i32(i32 %a, i32 %b) { +; ALL-LABEL: 
test_icmp_sle_i32: +; ALL: # BB#0: +; ALL-NEXT: cmpl %esi, %edi +; ALL-NEXT: setle %al +; ALL-NEXT: andl $1, %eax +; ALL-NEXT: retq + %r = icmp sle i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res +} + diff --git a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll index c4d3566008b1..64cd0e70a4fd 100644 --- a/test/CodeGen/X86/GlobalISel/ext-x86-64.ll +++ b/test/CodeGen/X86/GlobalISel/ext-x86-64.ll @@ -1,7 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X64 -; TODO merge with ext.ll after i64 sext suported on 32bit platform +; TODO merge with ext.ll after i64 sext supported on 32bit platform + +define i64 @test_zext_i1(i8 %a) { +; X64-LABEL: test_zext_i1: +; X64: # BB#0: +; X64-NEXT: # kill: %DIL %DIL %RDI +; X64-NEXT: andq $1, %rdi +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: retq + %val = trunc i8 %a to i1 + %r = zext i1 %val to i64 + ret i64 %r +} define i64 @test_sext_i8(i8 %val) { ; X64-LABEL: test_sext_i8: diff --git a/test/CodeGen/X86/GlobalISel/ext.ll b/test/CodeGen/X86/GlobalISel/ext.ll index 3c032686130e..4d4e3b05ca28 100644 --- a/test/CodeGen/X86/GlobalISel/ext.ll +++ b/test/CodeGen/X86/GlobalISel/ext.ll @@ -2,6 +2,24 @@ ; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X64 ; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=X32 +define i32 @test_zext_i1(i32 %a) { +; X64-LABEL: test_zext_i1: +; X64: # BB#0: +; X64-NEXT: andl $1, %edi +; X64-NEXT: movl %edi, %eax +; X64-NEXT: retq +; +; X32-LABEL: test_zext_i1: +; X32: # BB#0: +; X32-NEXT: leal 4(%esp), %eax +; X32-NEXT: movl (%eax), %eax +; X32-NEXT: andl $1, %eax +; X32-NEXT: retl + %val = trunc i32 %a to i1 + %r = zext i1 %val to i32 + ret i32 %r +} + define i32 @test_zext_i8(i8 %val) { ; X64-LABEL: test_zext_i8: ; X64: # BB#0: diff --git a/test/CodeGen/X86/GlobalISel/legalize-cmp.mir b/test/CodeGen/X86/GlobalISel/legalize-cmp.mir new file mode 100644 index 000000000000..68ccbbba0a73 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/legalize-cmp.mir @@ -0,0 +1,179 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s + +--- | + define i32 @test_cmp_i8(i8 %a, i8 %b) { + %r = icmp ult i8 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_cmp_i16(i16 %a, i16 %b) { + %r = icmp ult i16 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_cmp_i32(i32 %a, i32 %b) { + %r = icmp ult i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_cmp_i64(i64 %a, i64 %b) { + %r = icmp ult i64 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_cmp_p0(i32* %a, i32* %b) { + %r = icmp ult i32* %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + +...
+--- +name: test_cmp_i8 +# CHECK-LABEL: name: test_cmp_i8 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(s8) = COPY %edi +# CHECK-NEXT: %1(s8) = COPY %esi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s8), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s8) = COPY %edi + %1(s8) = COPY %esi + %2(s1) = G_ICMP intpred(ult), %0(s8), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_cmp_i16 +# CHECK-LABEL: name: test_cmp_i16 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(s16) = COPY %edi +# CHECK-NEXT: %1(s16) = COPY %esi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s16), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s16) = COPY %edi + %1(s16) = COPY %esi + %2(s1) = G_ICMP intpred(ult), %0(s16), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_cmp_i32 +# CHECK-LABEL: name: test_cmp_i32 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(s32) = COPY %edi +# CHECK-NEXT: %1(s32) = COPY %esi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s32), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ult), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_cmp_i64 +# CHECK-LABEL: name: test_cmp_i64 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(s64) = COPY %rdi +# CHECK-NEXT: %1(s64) = COPY %rsi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(s64), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(s64) = COPY %rdi + %1(s64) = COPY %rsi + %2(s1) = G_ICMP intpred(ult), %0(s64), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_cmp_p0 +# CHECK-LABEL: name: test_cmp_p0 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } + - { id: 3, class: _ } +# CHECK: %0(p0) = COPY %rdi +# CHECK-NEXT: %1(p0) = COPY %rsi +# CHECK-NEXT: %2(s1) = G_ICMP intpred(ult), %0(p0), %1 +# CHECK-NEXT: %3(s32) = G_ZEXT %2(s1) +# CHECK-NEXT: %eax = COPY %3(s32) +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(p0) = COPY %rdi + %1(p0) = COPY %rsi + %2(s1) = G_ICMP intpred(ult), %0(p0), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir index 25af600f2299..6f051f1b6ea5 100644 --- a/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir +++ b/test/CodeGen/X86/GlobalISel/legalize-ext-x86-64.mir @@ -1,6 +1,12 @@ # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --- | + define i64 @test_sext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = sext i1 %val to i64 + ret i64 %r + } + define i64 @test_sext_i8(i8 %val) { %r = sext i8 %val to i64 ret i64 %r @@ -16,6 +22,12 @@ ret i64 %r } + define i64 @test_zext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = zext i1 %val to i64 + ret i64 %r + } + define i64 @test_zext_i8(i8 %val) { %r = zext i8 %val to i64 ret i64 %r @@ -31,6 +43,32 @@ ret i64 %r } +... +--- +name: test_sext_i1 +# CHECK-LABEL: name: test_sext_i1 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# CHECK: %0(s8) = COPY %edi +# CHECK-NEXT: %1(s1) = G_TRUNC %0(s8) +# CHECK-NEXT: %2(s64) = G_SEXT %1(s1) +# CHECK-NEXT: %rax = COPY %2(s64) +# CHECK-NEXT: RET 0, implicit %rax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s64) = G_SEXT %1(s1) + %rax = COPY %2(s64) + RET 0, implicit %rax + ... --- name: test_sext_i8 @@ -100,6 +138,32 @@ body: | %rax = COPY %1(s64) RET 0, implicit %rax +... +--- +name: test_zext_i1 +# CHECK-LABEL: name: test_zext_i1 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# CHECK: %0(s8) = COPY %edi +# CHECK-NEXT: %1(s1) = G_TRUNC %0(s8) +# CHECK-NEXT: %2(s64) = G_ZEXT %1(s1) +# CHECK-NEXT: %rax = COPY %2(s64) +# CHECK-NEXT: RET 0, implicit %rax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s64) = G_ZEXT %1(s1) + %rax = COPY %2(s64) + RET 0, implicit %rax + ... --- name: test_zext_i8 diff --git a/test/CodeGen/X86/GlobalISel/legalize-ext.mir b/test/CodeGen/X86/GlobalISel/legalize-ext.mir index 46457e0fff59..c9add0dc4e95 100644 --- a/test/CodeGen/X86/GlobalISel/legalize-ext.mir +++ b/test/CodeGen/X86/GlobalISel/legalize-ext.mir @@ -1,6 +1,12 @@ # RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X32 # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=legalizer %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --- | + define i32 @test_zext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = zext i1 %val to i32 + ret i32 %r + } + define i32 @test_zext_i8(i8 %val) { %r = zext i8 %val to i32 ret i32 %r @@ -11,6 +17,12 @@ ret i32 %r } + define i32 @test_sext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = sext i1 %val to i32 + ret i32 %r + } + define i32 @test_sext_i8(i8 %val) { %r = sext i8 %val to i32 ret i32 %r @@ -21,6 +33,32 @@ ret i32 %r } +... +--- +name: test_zext_i1 +# ALL-LABEL: name: test_zext_i1 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# ALL: %0(s8) = COPY %edi +# ALL-NEXT: %1(s1) = G_TRUNC %0(s8) +# ALL-NEXT: %2(s32) = G_ZEXT %1(s1) +# ALL-NEXT: %eax = COPY %2(s32) +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s32) = G_ZEXT %1(s1) + %eax = COPY %2(s32) + RET 0, implicit %eax + ... 
--- name: test_zext_i8 @@ -67,6 +105,32 @@ body: | %eax = COPY %1(s32) RET 0, implicit %eax +... +--- +name: test_sext_i1 +# ALL-LABEL: name: test_sext_i1 +alignment: 4 +legalized: false +regBankSelected: false +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +# ALL: %0(s8) = COPY %edi +# ALL-NEXT: %1(s1) = G_TRUNC %0(s8) +# ALL-NEXT: %2(s32) = G_SEXT %1(s1) +# ALL-NEXT: %eax = COPY %2(s32) +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s32) = G_SEXT %1(s1) + %eax = COPY %2(s32) + RET 0, implicit %eax + ... --- name: test_sext_i8 diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll new file mode 100644 index 000000000000..49a7fd79f8b2 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/memop-scalar-x32.ll @@ -0,0 +1,101 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_FAST +; RUN: llc -mtriple=i386-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_GREEDY + +; TODO merge with x86-64 tests (many operations not supported yet) + +define i8 @test_load_i8(i8 * %p1) { +; ALL-LABEL: test_load_i8: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movb (%eax), %al +; ALL-NEXT: retl + %r = load i8, i8* %p1 + ret i8 %r +} + +define i16 @test_load_i16(i16 * %p1) { +; ALL-LABEL: test_load_i16: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movzwl (%eax), %eax +; ALL-NEXT: retl + %r = load i16, i16* %p1 + ret i16 %r +} + +define i32 @test_load_i32(i32 * %p1) { +; ALL-LABEL: test_load_i32: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: retl + %r = load i32, i32* %p1 + ret i32 %r +} + +define i8 * @test_store_i8(i8 %val, i8 * %p1) { +; ALL-LABEL: test_store_i8: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movb (%eax), %cl +; ALL-NEXT: leal 8(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movb %cl, (%eax) +; ALL-NEXT: retl + store i8 %val, i8* %p1 + ret i8 * %p1; +} + +define i16 * @test_store_i16(i16 %val, i16 * %p1) { +; ALL-LABEL: test_store_i16: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movzwl (%eax), %ecx +; ALL-NEXT: leal 8(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movw %cx, (%eax) +; ALL-NEXT: retl + store i16 %val, i16* %p1 + ret i16 * %p1; +} + +define i32 * @test_store_i32(i32 %val, i32 * %p1) { +; ALL-LABEL: test_store_i32: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movl (%eax), %ecx +; ALL-NEXT: leal 8(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movl %ecx, (%eax) +; ALL-NEXT: retl + store i32 %val, i32* %p1 + ret i32 * %p1; +} + +define i32* @test_load_ptr(i32** %ptr1) { +; ALL-LABEL: test_load_ptr: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: retl + %p = load i32*, i32** %ptr1 + ret i32* %p +} + +define void @test_store_ptr(i32** %ptr1, i32* %a) { +; ALL-LABEL: test_store_ptr: +; ALL: # BB#0: +; ALL-NEXT: leal 4(%esp), %eax +; ALL-NEXT: movl (%eax), %eax +; ALL-NEXT: leal 8(%esp), %ecx +; ALL-NEXT: movl (%ecx), %ecx +; ALL-NEXT: movl %ecx, (%eax) +; ALL-NEXT: retl
+ store i32* %a, i32** %ptr1 + ret void +} diff --git a/test/CodeGen/X86/GlobalISel/memop-scalar.ll b/test/CodeGen/X86/GlobalISel/memop-scalar.ll new file mode 100644 index 000000000000..3e45a9c9a49d --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/memop-scalar.ll @@ -0,0 +1,146 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_FAST +; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE_GREEDY + +define i8 @test_load_i8(i8 * %p1) { +; ALL-LABEL: test_load_i8: +; ALL: # BB#0: +; ALL-NEXT: movb (%rdi), %al +; ALL-NEXT: retq + %r = load i8, i8* %p1 + ret i8 %r +} + +define i16 @test_load_i16(i16 * %p1) { +; ALL-LABEL: test_load_i16: +; ALL: # BB#0: +; ALL-NEXT: movzwl (%rdi), %eax +; ALL-NEXT: retq + %r = load i16, i16* %p1 + ret i16 %r +} + +define i32 @test_load_i32(i32 * %p1) { +; ALL-LABEL: test_load_i32: +; ALL: # BB#0: +; ALL-NEXT: movl (%rdi), %eax +; ALL-NEXT: retq + %r = load i32, i32* %p1 + ret i32 %r +} + +define i64 @test_load_i64(i64 * %p1) { +; ALL-LABEL: test_load_i64: +; ALL: # BB#0: +; ALL-NEXT: movq (%rdi), %rax +; ALL-NEXT: retq + %r = load i64, i64* %p1 + ret i64 %r +} + +define float @test_load_float(float * %p1) { +; SSE-LABEL: test_load_float: +; SSE: # BB#0: +; SSE-NEXT: movl (%rdi), %eax +; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_load_float: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: movl (%rdi), %eax +; ALL_AVX-NEXT: vmovd %eax, %xmm0 +; ALL_AVX-NEXT: retq + %r = load float, float* %p1 + ret float %r +} + +define double @test_load_double(double * %p1) { +; SSE-LABEL: test_load_double: +; SSE: # BB#0: +; SSE-NEXT: movq (%rdi), %rax +; SSE-NEXT: movq %rax, %xmm0 +; SSE-NEXT: retq +; +; ALL_AVX-LABEL: test_load_double: +; ALL_AVX: # BB#0: +; ALL_AVX-NEXT: movq (%rdi), %rax +; ALL_AVX-NEXT: vmovq %rax, %xmm0 +; ALL_AVX-NEXT: retq + %r = load double, double* %p1 + ret double %r +} + +define i32 * @test_store_i32(i32 %val, i32 * %p1) { +; ALL-LABEL: test_store_i32: +; ALL: # BB#0: +; ALL-NEXT: movl %edi, (%rsi) +; ALL-NEXT: movq %rsi, %rax +; ALL-NEXT: retq + store i32 %val, i32* %p1 + ret i32 * %p1; +} + +define i64 * @test_store_i64(i64 %val, i64 * %p1) { +; ALL-LABEL: test_store_i64: +; ALL: # BB#0: +; ALL-NEXT: movq %rdi, (%rsi) +; ALL-NEXT: movq %rsi, %rax +; ALL-NEXT: retq + store i64 %val, i64* %p1 + ret i64 * %p1; +} + +define float * @test_store_float(float %val, float * %p1) { +; +; SSE_FAST-LABEL: test_store_float: +; SSE_FAST: # BB#0: +; SSE_FAST-NEXT: movd %xmm0, %eax +; SSE_FAST-NEXT: movl %eax, (%rdi) +; SSE_FAST-NEXT: movq %rdi, %rax +; SSE_FAST-NEXT: retq +; +; SSE_GREEDY-LABEL: test_store_float: +; SSE_GREEDY: # BB#0: +; SSE_GREEDY-NEXT: movss %xmm0, (%rdi) +; SSE_GREEDY-NEXT: movq %rdi, %rax +; SSE_GREEDY-NEXT: retq + store float %val, float* %p1 + ret float * %p1; +} + +define double * @test_store_double(double %val, double * %p1) { +; +; SSE_FAST-LABEL: test_store_double: +; SSE_FAST: # BB#0: +; SSE_FAST-NEXT: movq %xmm0, %rax +; SSE_FAST-NEXT: movq %rax, (%rdi) +; SSE_FAST-NEXT: movq %rdi, %rax +; SSE_FAST-NEXT: retq +; +; SSE_GREEDY-LABEL: test_store_double: +; SSE_GREEDY: # BB#0: +; SSE_GREEDY-NEXT: movsd %xmm0, (%rdi) +; SSE_GREEDY-NEXT: movq %rdi, %rax +; SSE_GREEDY-NEXT: retq +; + store double %val, double* %p1 + ret double * %p1; +} + +define i32* @test_load_ptr(i32** %ptr1) { +; ALL-LABEL: 
test_load_ptr: +; ALL: # BB#0: +; ALL-NEXT: movq (%rdi), %rax +; ALL-NEXT: retq + %p = load i32*, i32** %ptr1 + ret i32* %p +} + +define void @test_store_ptr(i32** %ptr1, i32* %a) { +; ALL-LABEL: test_store_ptr: +; ALL: # BB#0: +; ALL-NEXT: movq %rsi, (%rdi) +; ALL-NEXT: retq + store i32* %a, i32** %ptr1 + ret void +} diff --git a/test/CodeGen/X86/GlobalISel/memop-vec.ll b/test/CodeGen/X86/GlobalISel/memop-vec.ll new file mode 100644 index 000000000000..e218fded4d5f --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/memop-vec.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skx -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SKX + +define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) { +; ALL-LABEL: test_load_v4i32_noalign: +; ALL: # BB#0: +; ALL-NEXT: vmovups (%rdi), %xmm0 +; ALL-NEXT: retq + %r = load <4 x i32>, <4 x i32>* %p1, align 1 + ret <4 x i32> %r +} + +define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) { +; ALL-LABEL: test_load_v4i32_align: +; ALL: # BB#0: +; ALL-NEXT: vmovaps (%rdi), %xmm0 +; ALL-NEXT: retq + %r = load <4 x i32>, <4 x i32>* %p1, align 16 + ret <4 x i32> %r +} + +define void @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { +; ALL-LABEL: test_store_v4i32_noalign: +; ALL: # BB#0: +; ALL-NEXT: vmovups %xmm0, (%rdi) +; ALL-NEXT: retq + store <4 x i32> %val, <4 x i32>* %p1, align 1 + ret void +} + +define void @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { +; ALL-LABEL: test_store_v4i32_align: +; ALL: # BB#0: +; ALL-NEXT: vmovaps %xmm0, (%rdi) +; ALL-NEXT: retq + store <4 x i32> %val, <4 x i32>* %p1, align 16 + ret void +} diff --git a/test/CodeGen/X86/GlobalISel/memop-x32.ll b/test/CodeGen/X86/GlobalISel/memop-x32.ll deleted file mode 100644 index 49a7fd79f8b2..000000000000 --- a/test/CodeGen/X86/GlobalISel/memop-x32.ll +++ /dev/null @@ -1,101 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=i386-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_FAST -; RUN: llc -mtriple=i386-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_GREEDY - -;TODO merge with x86-64 tests (many operations not suppored yet) - -define i8 @test_load_i8(i8 * %p1) { -; ALL-LABEL: test_load_i8: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movb (%eax), %al -; ALL-NEXT: retl - %r = load i8, i8* %p1 - ret i8 %r -} - -define i16 @test_load_i16(i16 * %p1) { -; ALL-LABEL: test_load_i16: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movzwl (%eax), %eax -; ALL-NEXT: retl - %r = load i16, i16* %p1 - ret i16 %r -} - -define i32 @test_load_i32(i32 * %p1) { -; ALL-LABEL: test_load_i32: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: retl - %r = load i32, i32* %p1 - ret i32 %r -} - -define i8 * @test_store_i8(i8 %val, i8 * %p1) { -; ALL-LABEL: test_store_i8: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movb (%eax), %cl -; ALL-NEXT: leal 8(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movb %cl, (%eax) -; ALL-NEXT: retl - store i8 %val, i8* %p1 
- ret i8 * %p1; -} - -define i16 * @test_store_i16(i16 %val, i16 * %p1) { -; ALL-LABEL: test_store_i16: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movzwl (%eax), %ecx -; ALL-NEXT: leal 8(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movw %cx, (%eax) -; ALL-NEXT: retl - store i16 %val, i16* %p1 - ret i16 * %p1; -} - -define i32 * @test_store_i32(i32 %val, i32 * %p1) { -; ALL-LABEL: test_store_i32: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movl (%eax), %ecx -; ALL-NEXT: leal 8(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movl %ecx, (%eax) -; ALL-NEXT: retl - store i32 %val, i32* %p1 - ret i32 * %p1; -} - -define i32* @test_load_ptr(i32** %ptr1) { -; ALL-LABEL: test_load_ptr: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: retl - %p = load i32*, i32** %ptr1 - ret i32* %p -} - -define void @test_store_ptr(i32** %ptr1, i32* %a) { -; ALL-LABEL: test_store_ptr: -; ALL: # BB#0: -; ALL-NEXT: leal 4(%esp), %eax -; ALL-NEXT: movl (%eax), %eax -; ALL-NEXT: leal 8(%esp), %ecx -; ALL-NEXT: movl (%ecx), %ecx -; ALL-NEXT: movl %ecx, (%eax) -; ALL-NEXT: retl - store i32* %a, i32** %ptr1 - ret void -} diff --git a/test/CodeGen/X86/GlobalISel/memop.ll b/test/CodeGen/X86/GlobalISel/memop.ll deleted file mode 100644 index a7407c0e6b75..000000000000 --- a/test/CodeGen/X86/GlobalISel/memop.ll +++ /dev/null @@ -1,206 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=x86_64-linux-gnu -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE_GREEDY -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX_GREEDY -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX512F_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512F_GREEDY -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_FAST --check-prefix=AVX512VL_FAST -; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -regbankselect-greedy -global-isel < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=ALL_AVX --check-prefix=ALL_AVX_GREEDY --check-prefix=AVX512VL_GREEDY - - -define i8 @test_load_i8(i8 * %p1) { -; ALL-LABEL: test_load_i8: -; ALL: # BB#0: -; ALL-NEXT: movb (%rdi), %al -; ALL-NEXT: retq - %r = load i8, i8* %p1 - ret i8 %r -} - -define i16 @test_load_i16(i16 * %p1) { -; ALL-LABEL: test_load_i16: -; ALL: # BB#0: -; ALL-NEXT: movzwl (%rdi), %eax -; ALL-NEXT: retq - %r = load i16, i16* %p1 - ret i16 %r -} - -define i32 @test_load_i32(i32 * %p1) { -; ALL-LABEL: test_load_i32: -; ALL: # BB#0: -; ALL-NEXT: movl (%rdi), %eax -; 
ALL-NEXT: retq - %r = load i32, i32* %p1 - ret i32 %r -} - -define i64 @test_load_i64(i64 * %p1) { -; ALL-LABEL: test_load_i64: -; ALL: # BB#0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: retq - %r = load i64, i64* %p1 - ret i64 %r -} - -define float @test_load_float(float * %p1) { -; SSE-LABEL: test_load_float: -; SSE: # BB#0: -; SSE-NEXT: movl (%rdi), %eax -; SSE-NEXT: movd %eax, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_float: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: movl (%rdi), %eax -; ALL_AVX-NEXT: vmovd %eax, %xmm0 -; ALL_AVX-NEXT: retq - %r = load float, float* %p1 - ret float %r -} - -define double @test_load_double(double * %p1) { -; SSE-LABEL: test_load_double: -; SSE: # BB#0: -; SSE-NEXT: movq (%rdi), %rax -; SSE-NEXT: movq %rax, %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_double: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: movq (%rdi), %rax -; ALL_AVX-NEXT: vmovq %rax, %xmm0 -; ALL_AVX-NEXT: retq - %r = load double, double* %p1 - ret double %r -} - -define <4 x i32> @test_load_v4i32_noalign(<4 x i32> * %p1) { -; SSE-LABEL: test_load_v4i32_noalign: -; SSE: # BB#0: -; SSE-NEXT: movups (%rdi), %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_v4i32_noalign: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vmovups (%rdi), %xmm0 -; ALL_AVX-NEXT: retq - %r = load <4 x i32>, <4 x i32>* %p1, align 1 - ret <4 x i32> %r -} - -define <4 x i32> @test_load_v4i32_align(<4 x i32> * %p1) { -; SSE-LABEL: test_load_v4i32_align: -; SSE: # BB#0: -; SSE-NEXT: movaps (%rdi), %xmm0 -; SSE-NEXT: retq -; -; ALL_AVX-LABEL: test_load_v4i32_align: -; ALL_AVX: # BB#0: -; ALL_AVX-NEXT: vmovaps (%rdi), %xmm0 -; ALL_AVX-NEXT: retq - %r = load <4 x i32>, <4 x i32>* %p1, align 16 - ret <4 x i32> %r -} - -define i32 * @test_store_i32(i32 %val, i32 * %p1) { -; ALL-LABEL: test_store_i32: -; ALL: # BB#0: -; ALL-NEXT: movl %edi, (%rsi) -; ALL-NEXT: movq %rsi, %rax -; ALL-NEXT: retq - store i32 %val, i32* %p1 - ret i32 * %p1; -} - -define i64 * @test_store_i64(i64 %val, i64 * %p1) { -; ALL-LABEL: test_store_i64: -; ALL: # BB#0: -; ALL-NEXT: movq %rdi, (%rsi) -; ALL-NEXT: movq %rsi, %rax -; ALL-NEXT: retq - store i64 %val, i64* %p1 - ret i64 * %p1; -} - -define float * @test_store_float(float %val, float * %p1) { -; -; SSE_FAST-LABEL: test_store_float: -; SSE_FAST: # BB#0: -; SSE_FAST-NEXT: movd %xmm0, %eax -; SSE_FAST-NEXT: movl %eax, (%rdi) -; SSE_FAST-NEXT: movq %rdi, %rax -; SSE_FAST-NEXT: retq -; -; SSE_GREEDY-LABEL: test_store_float: -; SSE_GREEDY: # BB#0: -; SSE_GREEDY-NEXT: movss %xmm0, (%rdi) -; SSE_GREEDY-NEXT: movq %rdi, %rax -; SSE_GREEDY-NEXT: retq -; -; ALL_AVX_FAST-LABEL: test_store_float: -; ALL_AVX_FAST: # BB#0: -; ALL_AVX_FAST-NEXT: vmovd %xmm0, %eax -; ALL_AVX_FAST-NEXT: movl %eax, (%rdi) -; ALL_AVX_FAST-NEXT: movq %rdi, %rax -; ALL_AVX_FAST-NEXT: retq -; -; ALL_AVX_GREEDY-LABEL: test_store_float: -; ALL_AVX_GREEDY: # BB#0: -; ALL_AVX_GREEDY-NEXT: vmovss %xmm0, (%rdi) -; ALL_AVX_GREEDY-NEXT: movq %rdi, %rax -; ALL_AVX_GREEDY-NEXT: retq - store float %val, float* %p1 - ret float * %p1; -} - -define double * @test_store_double(double %val, double * %p1) { -; -; SSE_FAST-LABEL: test_store_double: -; SSE_FAST: # BB#0: -; SSE_FAST-NEXT: movq %xmm0, %rax -; SSE_FAST-NEXT: movq %rax, (%rdi) -; SSE_FAST-NEXT: movq %rdi, %rax -; SSE_FAST-NEXT: retq -; -; SSE_GREEDY-LABEL: test_store_double: -; SSE_GREEDY: # BB#0: -; SSE_GREEDY-NEXT: movsd %xmm0, (%rdi) -; SSE_GREEDY-NEXT: movq %rdi, %rax -; SSE_GREEDY-NEXT: retq -; -; ALL_AVX_FAST-LABEL: test_store_double: -; ALL_AVX_FAST: # BB#0: -; 
ALL_AVX_FAST-NEXT: vmovq %xmm0, %rax -; ALL_AVX_FAST-NEXT: movq %rax, (%rdi) -; ALL_AVX_FAST-NEXT: movq %rdi, %rax -; ALL_AVX_FAST-NEXT: retq -; -; ALL_AVX_GREEDY-LABEL: test_store_double: -; ALL_AVX_GREEDY: # BB#0: -; ALL_AVX_GREEDY-NEXT: vmovsd %xmm0, (%rdi) -; ALL_AVX_GREEDY-NEXT: movq %rdi, %rax -; ALL_AVX_GREEDY-NEXT: retq - store double %val, double* %p1 - ret double * %p1; -} - -define i32* @test_load_ptr(i32** %ptr1) { -; ALL-LABEL: test_load_ptr: -; ALL: # BB#0: -; ALL-NEXT: movq (%rdi), %rax -; ALL-NEXT: retq - %p = load i32*, i32** %ptr1 - ret i32* %p -} - -define void @test_store_ptr(i32** %ptr1, i32* %a) { -; ALL-LABEL: test_store_ptr: -; ALL: # BB#0: -; ALL-NEXT: movq %rsi, (%rdi) -; ALL-NEXT: retq - store i32* %a, i32** %ptr1 - ret void -} diff --git a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir index 3a65a9003773..1ea922ee475a 100644 --- a/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir +++ b/test/CodeGen/X86/GlobalISel/regbankselect-X86_64.mir @@ -2,11 +2,6 @@ # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -regbankselect-greedy -run-pass=regbankselect %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=GREEDY --- | - ; ModuleID = 'tmp.ll' - source_filename = "tmp.ll" - target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" - target triple = "x86_64--linux-gnu" - define i8 @test_add_i8(i8 %arg1, i8 %arg2) { %ret = add i8 %arg1, %arg2 ret i8 %ret @@ -120,6 +115,26 @@ ret void } + define i1 @test_icmp_eq_i8(i8 %a, i8 %b) { + %r = icmp eq i8 %a, %b + ret i1 %r + } + + define i1 @test_icmp_eq_i16(i16 %a, i16 %b) { + %r = icmp eq i16 %a, %b + ret i1 %r + } + + define i1 @test_icmp_eq_i32(i32 %a, i32 %b) { + %r = icmp eq i32 %a, %b + ret i1 %r + } + + define i1 @test_icmp_eq_i64(i64 %a, i64 %b) { + %r = icmp eq i64 %a, %b + ret i1 %r + } + ... --- name: test_add_i8 @@ -735,3 +750,103 @@ body: | RET 0 ... +--- +name: test_icmp_eq_i8 +# CHECK-LABEL: name: test_icmp_eq_i8 +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s8) = COPY %edi + %1(s8) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s8), %1 + %al = COPY %2(s1) + RET 0, implicit %al + +... +--- +name: test_icmp_eq_i16 +# CHECK-LABEL: name: test_icmp_eq_i16 +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s16) = COPY %edi + %1(s16) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s16), %1 + %al = COPY %2(s1) + RET 0, implicit %al + +... +--- +name: test_icmp_eq_i32 +# CHECK-LABEL: name: test_icmp_eq_i32 +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s32), %1 + %al = COPY %2(s1) + RET 0, implicit %al + +... 
+--- +name: test_icmp_eq_i64 +# CHECK-LABEL: name: test_icmp_eq_i64 +alignment: 4 +legalized: true +regBankSelected: false +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gpr } +# CHECK-NEXT: - { id: 1, class: gpr } +# CHECK-NEXT: - { id: 2, class: gpr } +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(s64) = COPY %rdi + %1(s64) = COPY %rsi + %2(s1) = G_ICMP intpred(eq), %0(s64), %1 + %al = COPY %2(s1) + RET 0, implicit %al + +... diff --git a/test/CodeGen/X86/GlobalISel/select-br.mir b/test/CodeGen/X86/GlobalISel/select-br.mir new file mode 100644 index 000000000000..6d8cd2b1367d --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-br.mir @@ -0,0 +1,39 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X64 +# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=X32 + +--- | + define void @uncondbr() { + entry: + br label %bb2 + + end: ; preds = %bb2 + ret void + + bb2: ; preds = %entry + br label %end + } + +... +--- +name: uncondbr +# CHECK-LABEL: name: uncondbr +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: JMP_1 %bb.2.bb2 +# CHECK: JMP_1 %bb.1.end +body: | + bb.1.entry: + successors: %bb.3.bb2(0x80000000) + + G_BR %bb.3.bb2 + + bb.2.end: + RET 0 + + bb.3.bb2: + successors: %bb.2.end(0x80000000) + + G_BR %bb.2.end + +... diff --git a/test/CodeGen/X86/GlobalISel/select-cmp.mir b/test/CodeGen/X86/GlobalISel/select-cmp.mir new file mode 100644 index 000000000000..1d3da6cb88b9 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-cmp.mir @@ -0,0 +1,563 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=CHECK + +--- | + define i32 @test_icmp_eq_i8(i8 %a, i8 %b) { + %r = icmp eq i8 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_eq_i16(i16 %a, i16 %b) { + %r = icmp eq i16 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_eq_i64(i64 %a, i64 %b) { + %r = icmp eq i64 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_eq_i32(i32 %a, i32 %b) { + %r = icmp eq i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_ne_i32(i32 %a, i32 %b) { + %r = icmp ne i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_ugt_i32(i32 %a, i32 %b) { + %r = icmp ugt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_uge_i32(i32 %a, i32 %b) { + %r = icmp uge i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_ult_i32(i32 %a, i32 %b) { + %r = icmp ult i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_ule_i32(i32 %a, i32 %b) { + %r = icmp ule i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_sgt_i32(i32 %a, i32 %b) { + %r = icmp sgt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_sge_i32(i32 %a, i32 %b) { + %r = icmp sge i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_slt_i32(i32 %a, i32 %b) { + %r = icmp slt i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + + define i32 @test_icmp_sle_i32(i32 %a, i32 %b) { + %r = icmp sle i32 %a, %b + %res = zext i1 %r to i32 + ret i32 %res + } + +... 
+--- +name: test_icmp_eq_i8 +# CHECK-LABEL: name: test_icmp_eq_i8 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr8 } +# CHECK-NEXT: - { id: 1, class: gr8 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %dil +# CHECK-NEXT: %1 = COPY %sil +# CHECK-NEXT: CMP8rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s8) = COPY %edi + %1(s8) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s8), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_eq_i16 +# CHECK-LABEL: name: test_icmp_eq_i16 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr16 } +# CHECK-NEXT: - { id: 1, class: gr16 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %di +# CHECK-NEXT: %1 = COPY %si +# CHECK-NEXT: CMP16rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s16) = COPY %edi + %1(s16) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s16), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_eq_i64 +# CHECK-LABEL: name: test_icmp_eq_i64 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr64 } +# CHECK-NEXT: - { id: 1, class: gr64 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %rdi +# CHECK-NEXT: %1 = COPY %rsi +# CHECK-NEXT: CMP64rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(s64) = COPY %rdi + %1(s64) = COPY %rsi + %2(s1) = G_ICMP intpred(eq), %0(s64), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
+--- +name: test_icmp_eq_i32 +# CHECK-LABEL: name: test_icmp_eq_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(eq), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_ne_i32 +# CHECK-LABEL: name: test_icmp_ne_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETNEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ne), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_ugt_i32 +# CHECK-LABEL: name: test_icmp_ugt_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETAr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ugt), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
+--- +name: test_icmp_uge_i32 +# CHECK-LABEL: name: test_icmp_uge_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETAEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(uge), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_ult_i32 +# CHECK-LABEL: name: test_icmp_ult_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETBr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ult), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_ule_i32 +# CHECK-LABEL: name: test_icmp_ule_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETBEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(ule), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
+--- +name: test_icmp_sgt_i32 +# CHECK-LABEL: name: test_icmp_sgt_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETGr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(sgt), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_sge_i32 +# CHECK-LABEL: name: test_icmp_sge_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETGEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(sge), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... +--- +name: test_icmp_slt_i32 +# CHECK-LABEL: name: test_icmp_slt_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETLr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(slt), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... 
+--- +name: test_icmp_sle_i32 +# CHECK-LABEL: name: test_icmp_sle_i32 +alignment: 4 +legalized: true +regBankSelected: true +# CHECK: registers: +# CHECK-NEXT: - { id: 0, class: gr32 } +# CHECK-NEXT: - { id: 1, class: gr32 } +# CHECK-NEXT: - { id: 2, class: gr8 } +# CHECK-NEXT: - { id: 3, class: gr32 } +# CHECK-NEXT: - { id: 4, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +# CHECK: %0 = COPY %edi +# CHECK-NEXT: %1 = COPY %esi +# CHECK-NEXT: CMP32rr %0, %1, implicit-def %eflags +# CHECK-NEXT: %2 = SETLEr implicit %eflags +# CHECK-NEXT: %4 = SUBREG_TO_REG 0, %2, 1 +# CHECK-NEXT: %3 = AND32ri8 %4, 1, implicit-def %eflags +# CHECK-NEXT: %eax = COPY %3 +# CHECK-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi, %esi + + %0(s32) = COPY %edi + %1(s32) = COPY %esi + %2(s1) = G_ICMP intpred(sle), %0(s32), %1 + %3(s32) = G_ZEXT %2(s1) + %eax = COPY %3(s32) + RET 0, implicit %eax + +... diff --git a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir index 85b3f61a9e44..0844701487bc 100644 --- a/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir +++ b/test/CodeGen/X86/GlobalISel/select-ext-x86-64.mir @@ -1,6 +1,12 @@ # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --- | + define i64 @test_zext_i1(i8 %a) { + %val = trunc i8 %a to i1 + %r = zext i1 %val to i64 + ret i64 %r + } + define i64 @test_sext_i8(i8 %val) { %r = sext i8 %val to i64 ret i64 %r @@ -11,6 +17,38 @@ ret i64 %r } +... +--- +name: test_zext_i1 +# ALL-LABEL: name: test_zext_i1 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr8 } +# ALL-NEXT: - { id: 1, class: gr8 } +# ALL-NEXT: - { id: 2, class: gr64 } +# ALL-NEXT: - { id: 3, class: gr64 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +# ALL: %0 = COPY %dil +# ALL-NEXT: %1 = COPY %0 +# ALL-NEXT: %3 = SUBREG_TO_REG 0, %1, 1 +# ALL-NEXT: %2 = AND64ri8 %3, 1, implicit-def %eflags +# ALL-NEXT: %rax = COPY %2 +# ALL-NEXT: RET 0, implicit %rax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s8) = COPY %edi + %1(s1) = G_TRUNC %0(s8) + %2(s64) = G_ZEXT %1(s1) + %rax = COPY %2(s64) + RET 0, implicit %rax + ... --- name: test_sext_i8 diff --git a/test/CodeGen/X86/GlobalISel/select-ext.mir b/test/CodeGen/X86/GlobalISel/select-ext.mir index 63aeae89bd1a..831d6efb75f1 100644 --- a/test/CodeGen/X86/GlobalISel/select-ext.mir +++ b/test/CodeGen/X86/GlobalISel/select-ext.mir @@ -2,6 +2,11 @@ # RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=X64 --- | + define i32 @test_zext_i1(i1 %a) { + %r = zext i1 %a to i32 + ret i32 %r + } + define i32 @test_zext_i8(i8 %val) { %r = zext i8 %val to i32 ret i32 %r @@ -22,6 +27,34 @@ ret i32 %r } +... 
+--- +name: test_zext_i1 +# ALL-LABEL: name: test_zext_i1 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr8 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %dil +# ALL-NEXT: %2 = SUBREG_TO_REG 0, %0, 1 +# ALL-NEXT: %1 = AND32ri8 %2, 1, implicit-def %eflags +# ALL-NEXT: %eax = COPY %1 +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + liveins: %edi + + %0(s1) = COPY %edi + %1(s32) = G_ZEXT %0(s1) + %eax = COPY %1(s32) + RET 0, implicit %eax + ... --- name: test_zext_i8 diff --git a/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir b/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir new file mode 100644 index 000000000000..8e6a2771db6e --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir @@ -0,0 +1,310 @@ +# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL + +--- | + define i8 @test_load_i8(i8* %p1) { + %r = load i8, i8* %p1 + ret i8 %r + } + + define i16 @test_load_i16(i16* %p1) { + %r = load i16, i16* %p1 + ret i16 %r + } + + define i32 @test_load_i32(i32* %p1) { + %r = load i32, i32* %p1 + ret i32 %r + } + + define i8* @test_store_i8(i8 %val, i8* %p1) { + store i8 %val, i8* %p1 + ret i8* %p1 + } + + define i16* @test_store_i16(i16 %val, i16* %p1) { + store i16 %val, i16* %p1 + ret i16* %p1 + } + + define i32* @test_store_i32(i32 %val, i32* %p1) { + store i32 %val, i32* %p1 + ret i32* %p1 + } + + define i32* @test_load_ptr(i32** %ptr1) { + %p = load i32*, i32** %ptr1 + ret i32* %p + } + + define void @test_store_ptr(i32** %ptr1, i32* %a) { + store i32* %a, i32** %ptr1 + ret void + } + +... +--- +name: test_load_i8 +# ALL-LABEL: name: test_load_i8 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr8 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +fixedStack: + - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) +# ALL-NEXT: %2 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.p1) +# ALL-NEXT: %al = COPY %2 +# ALL-NEXT: RET 0, implicit %al +body: | + bb.1 (%ir-block.0): + %1(p0) = G_FRAME_INDEX %fixed-stack.0 + %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + %2(s8) = G_LOAD %0(p0) :: (load 1 from %ir.p1) + %al = COPY %2(s8) + RET 0, implicit %al + +... 
+--- +name: test_load_i16 +# ALL-LABEL: name: test_load_i16 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr16 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +fixedStack: + - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) +# ALL-NEXT: %2 = MOV16rm %0, 1, _, 0, _ :: (load 2 from %ir.p1) +# ALL-NEXT: %ax = COPY %2 +# ALL-NEXT: RET 0, implicit %ax +body: | + bb.1 (%ir-block.0): + %1(p0) = G_FRAME_INDEX %fixed-stack.0 + %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + %2(s16) = G_LOAD %0(p0) :: (load 2 from %ir.p1) + %ax = COPY %2(s16) + RET 0, implicit %ax + +... +--- +name: test_load_i32 +# ALL-LABEL: name: test_load_i32 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +fixedStack: + - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) +# ALL-NEXT: %2 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# ALL-NEXT: %eax = COPY %2 +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + %1(p0) = G_FRAME_INDEX %fixed-stack.0 + %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + %2(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) + %eax = COPY %2(s32) + RET 0, implicit %eax + +... +--- +name: test_store_i8 +# ALL-LABEL: name: test_store_i8 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr8 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +# ALL-NEXT: - { id: 3, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +fixedStack: + - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } + - { id: 1, offset: 0, size: 1, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV8rm %2, 1, _, 0, _ :: (invariant load 1 from %fixed-stack.0, align 0) +# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ +# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) +# ALL-NEXT: MOV8mr %1, 1, _, 0, _, %0 :: (store 1 into %ir.p1) +# ALL-NEXT: %eax = COPY %1 +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + %2(p0) = G_FRAME_INDEX %fixed-stack.1 + %0(s8) = G_LOAD %2(p0) :: (invariant load 1 from %fixed-stack.1, align 0) + %3(p0) = G_FRAME_INDEX %fixed-stack.0 + %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + G_STORE %0(s8), %1(p0) :: (store 1 into %ir.p1) + %eax = COPY %1(p0) + RET 0, implicit %eax + +... 
+--- +name: test_store_i16 +# ALL-LABEL: name: test_store_i16 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr16 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +# ALL-NEXT: - { id: 3, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +fixedStack: + - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } + - { id: 1, offset: 0, size: 2, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV16rm %2, 1, _, 0, _ :: (invariant load 2 from %fixed-stack.0, align 0) +# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ +# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) +# ALL-NEXT: MOV16mr %1, 1, _, 0, _, %0 :: (store 2 into %ir.p1) +# ALL-NEXT: %eax = COPY %1 +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + %2(p0) = G_FRAME_INDEX %fixed-stack.1 + %0(s16) = G_LOAD %2(p0) :: (invariant load 2 from %fixed-stack.1, align 0) + %3(p0) = G_FRAME_INDEX %fixed-stack.0 + %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + G_STORE %0(s16), %1(p0) :: (store 2 into %ir.p1) + %eax = COPY %1(p0) + RET 0, implicit %eax + +... +--- +name: test_store_i32 +# ALL-LABEL: name: test_store_i32 +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +# ALL-NEXT: - { id: 3, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +fixedStack: + - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } + - { id: 1, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV32rm %2, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) +# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ +# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) +# ALL-NEXT: MOV32mr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# ALL-NEXT: %eax = COPY %1 +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + %2(p0) = G_FRAME_INDEX %fixed-stack.1 + %0(s32) = G_LOAD %2(p0) :: (invariant load 4 from %fixed-stack.1, align 0) + %3(p0) = G_FRAME_INDEX %fixed-stack.0 + %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) + %eax = COPY %1(p0) + RET 0, implicit %eax + +... 
+--- +name: test_load_ptr +# ALL-LABEL: name: test_load_ptr +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +fixedStack: + - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) +# ALL-NEXT: %2 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.ptr1) +# ALL-NEXT: %eax = COPY %2 +# ALL-NEXT: RET 0, implicit %eax +body: | + bb.1 (%ir-block.0): + %1(p0) = G_FRAME_INDEX %fixed-stack.0 + %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + %2(p0) = G_LOAD %0(p0) :: (load 4 from %ir.ptr1) + %eax = COPY %2(p0) + RET 0, implicit %eax + +... +--- +name: test_store_ptr +# ALL-LABEL: name: test_store_ptr +alignment: 4 +legalized: true +regBankSelected: true +# ALL: registers: +# ALL-NEXT: - { id: 0, class: gr32 } +# ALL-NEXT: - { id: 1, class: gr32 } +# ALL-NEXT: - { id: 2, class: gr32 } +# ALL-NEXT: - { id: 3, class: gr32 } +registers: + - { id: 0, class: gpr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } + - { id: 3, class: gpr } +fixedStack: + - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } + - { id: 1, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } +# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ +# ALL-NEXT: %0 = MOV32rm %2, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) +# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ +# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) +# ALL-NEXT: MOV32mr %0, 1, _, 0, _, %1 :: (store 4 into %ir.ptr1) +# ALL-NEXT: RET 0 +body: | + bb.1 (%ir-block.0): + %2(p0) = G_FRAME_INDEX %fixed-stack.1 + %0(p0) = G_LOAD %2(p0) :: (invariant load 4 from %fixed-stack.1, align 0) + %3(p0) = G_FRAME_INDEX %fixed-stack.0 + %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) + G_STORE %1(p0), %0(p0) :: (store 4 into %ir.ptr1) + RET 0 + +... 
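
The new file above, select-memop-scalar-x32.mir, pins down how the i386 GlobalISel instruction selector handles scalar loads and stores when every argument arrives on the stack, which is why each test body starts with G_FRAME_INDEX plus an invariant fixed-stack load before the memory operation under test. To reproduce its RUN line outside the test harness, a minimal sketch, assuming llc and FileCheck are on PATH and the current directory is the LLVM source root:

  $ llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select \
        -verify-machineinstrs test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir -o - \
    | FileCheck test/CodeGen/X86/GlobalISel/select-memop-scalar-x32.mir --check-prefix=ALL

This is just the file's own RUN line with the %s substitutions spelled out; FileCheck reads the llc output on stdin and the CHECK patterns from the test file itself.
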
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir b/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir new file mode 100644 index 000000000000..b57c9b0cca98 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop-scalar.mir @@ -0,0 +1,500 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=SSE +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL + +--- | + define i8 @test_load_i8(i8* %p1) { + %r = load i8, i8* %p1 + ret i8 %r + } + + define i16 @test_load_i16(i16* %p1) { + %r = load i16, i16* %p1 + ret i16 %r + } + + define i32 @test_load_i32(i32* %p1) { + %r = load i32, i32* %p1 + ret i32 %r + } + + define i64 @test_load_i64(i64* %p1) { + %r = load i64, i64* %p1 + ret i64 %r + } + + define float @test_load_float(float* %p1) { + %r = load float, float* %p1 + ret float %r + } + + define float @test_load_float_vecreg(float* %p1) { + %r = load float, float* %p1 + ret float %r + } + + define double @test_load_double(double* %p1) { + %r = load double, double* %p1 + ret double %r + } + + define double @test_load_double_vecreg(double* %p1) { + %r = load double, double* %p1 + ret double %r + } + + define i32* @test_store_i32(i32 %val, i32* %p1) { + store i32 %val, i32* %p1 + ret i32* %p1 + } + + define i64* @test_store_i64(i64 %val, i64* %p1) { + store i64 %val, i64* %p1 + ret i64* %p1 + } + + define float* @test_store_float(float %val, float* %p1) { + store float %val, float* %p1 + ret float* %p1 + } + + define float* @test_store_float_vec(float %val, float* %p1) { + store float %val, float* %p1 + ret float* %p1 + } + + define double* @test_store_double(double %val, double* %p1) { + store double %val, double* %p1 + ret double* %p1 + } + + define double* @test_store_double_vec(double %val, double* %p1) { + store double %val, double* %p1 + ret double* %p1 + } + + define i32* @test_load_ptr(i32** %ptr1) { + %p = load i32*, i32** %ptr1 + ret i32* %p + } + + define void @test_store_ptr(i32** %ptr1, i32* %a) { + store i32* %a, i32** %ptr1 + ret void + } +... +--- +# ALL-LABEL: name: test_load_i8 +name: test_load_i8 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr8 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.p1) +# ALL: %al = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s8) = G_LOAD %0(p0) :: (load 1 from %ir.p1) + %al = COPY %1(s8) + RET 0, implicit %al + +... 
+--- +# ALL-LABEL: name: test_load_i16 +name: test_load_i16 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr16 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV16rm %0, 1, _, 0, _ :: (load 2 from %ir.p1) +# ALL: %ax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s16) = G_LOAD %0(p0) :: (load 2 from %ir.p1) + %ax = COPY %1(s16) + RET 0, implicit %ax + +... +--- +# ALL-LABEL: name: test_load_i32 +name: test_load_i32 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr32 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# ALL: %eax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) + %eax = COPY %1(s32) + RET 0, implicit %eax + +... +--- +# ALL-LABEL: name: test_load_i64 +name: test_load_i64 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) + %rax = COPY %1(s64) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_load_float +name: test_load_float +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr32 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) + %xmm0 = COPY %1(s32) + RET 0, implicit %xmm0 + +... +--- +# ALL-LABEL: name: test_load_float_vecreg +name: test_load_float_vecreg +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: fr32 } +# AVX512ALL: - { id: 1, class: fr32x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# AVX: %1 = VMOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# AVX512ALL: %1 = VMOVSSZrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) + %xmm0 = COPY %1(s32) + RET 0, implicit %xmm0 + +... +--- +# ALL-LABEL: name: test_load_double +name: test_load_double +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) + %xmm0 = COPY %1(s64) + RET 0, implicit %xmm0 + +... 
+--- +# ALL-LABEL: name: test_load_double_vecreg +name: test_load_double_vecreg +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: fr64 } +# AVX512ALL: - { id: 1, class: fr64x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# AVX: %1 = VMOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# AVX512ALL: %1 = VMOVSDZrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) + %xmm0 = COPY %1(s64) + RET 0, implicit %xmm0 + +... +--- +# ALL-LABEL: name: test_store_i32 +name: test_store_i32 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr32 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %edi +# ALL: %1 = COPY %rsi +# ALL: MOV32mr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %edi, %rsi + + %0(s32) = COPY %edi + %1(p0) = COPY %rsi + G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_i64 +name: test_store_i64 +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %rdi +# ALL: %1 = COPY %rsi +# ALL: MOV64mr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(s64) = COPY %rdi + %1(p0) = COPY %rsi + G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_float +name: test_store_float +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: fr32x } +# ALL: - { id: 1, class: gr64 } +# ALL: - { id: 2, class: gr32 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# ALL: %2 = COPY %0 +# ALL: MOV32mr %1, 1, _, 0, _, %2 :: (store 4 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(s32) = COPY %xmm0 + %1(p0) = COPY %rdi + %2(s32) = COPY %0(s32) + G_STORE %2(s32), %1(p0) :: (store 4 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_float_vec +name: test_store_float_vec +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: fr32 } +# AVX512ALL: - { id: 0, class: fr32x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# AVX: VMOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# AVX512ALL: VMOVSSZmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(s32) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... 
+--- +# ALL-LABEL: name: test_store_double +name: test_store_double +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: fr64x } +# ALL: - { id: 1, class: gr64 } +# ALL: - { id: 2, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } + - { id: 2, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# ALL: %2 = COPY %0 +# ALL: MOV64mr %1, 1, _, 0, _, %2 :: (store 8 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(s64) = COPY %xmm0 + %1(p0) = COPY %rdi + %2(s64) = COPY %0(s64) + G_STORE %2(s64), %1(p0) :: (store 8 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_double_vec +name: test_store_double_vec +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: fr64 } +# AVX512ALL: - { id: 0, class: fr64x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) +# AVX: VMOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) +# AVX512ALL: VMOVSDZmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(s64) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_load_ptr +name: test_load_ptr +alignment: 4 +legalized: true +regBankSelected: true +selected: false +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.ptr1) +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(p0) = G_LOAD %0(p0) :: (load 8 from %ir.ptr1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_ptr +name: test_store_ptr +alignment: 4 +legalized: true +regBankSelected: true +selected: false +registers: +# ALL: - { id: 0, class: gr64 } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: gpr } + - { id: 1, class: gpr } +# ALL: MOV64mr %0, 1, _, 0, _, %1 :: (store 8 into %ir.ptr1) +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %rsi + + %0(p0) = COPY %rdi + %1(p0) = COPY %rsi + G_STORE %1(p0), %0(p0) :: (store 8 into %ir.ptr1) + RET 0 + +... 
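
The x86-64 counterpart above, select-memop-scalar.mir, repeats the same scalar load/store matrix under four feature configurations; its RUN lines differ only in the -mattr flags and in which FileCheck prefixes (ALL, SSE, AVX, AVX512F, AVX512VL, and the shared NO_AVX512F/NO_AVX512VL/AVX512ALL groups) are active. Rather than invoking llc four times by hand, all RUN lines can be driven through lit at once; a sketch, assuming an LLVM build directory named build/ (the name is arbitrary) with llvm-lit in it:

  $ build/bin/llvm-lit -v test/CodeGen/X86/GlobalISel/select-memop-scalar.mir
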
diff --git a/test/CodeGen/X86/GlobalISel/select-memop-v128.mir b/test/CodeGen/X86/GlobalISel/select-memop-v128.mir new file mode 100644 index 000000000000..ce3f6b91dcf6 --- /dev/null +++ b/test/CodeGen/X86/GlobalISel/select-memop-v128.mir @@ -0,0 +1,143 @@ +# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=SSE +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F +# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL + +--- | + define <4 x i32> @test_load_v4i32_noalign(<4 x i32>* %p1) { + %r = load <4 x i32>, <4 x i32>* %p1, align 1 + ret <4 x i32> %r + } + + define <4 x i32> @test_load_v4i32_align(<4 x i32>* %p1) { + %r = load <4 x i32>, <4 x i32>* %p1, align 16 + ret <4 x i32> %r + } + + define <4 x i32>* @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { + store <4 x i32> %val, <4 x i32>* %p1, align 16 + ret <4 x i32>* %p1 + } + + define <4 x i32>* @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { + store <4 x i32> %val, <4 x i32>* %p1, align 1 + ret <4 x i32>* %p1 + } + +... +--- +# ALL-LABEL: name: test_load_v4i32_noalign +name: test_load_v4i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: vr128 } +# AVX512ALL: - { id: 1, class: vr128x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# AVX: %1 = VMOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# AVX512F: %1 = VMOVUPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# AVX512VL: %1 = VMOVUPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1, align 1) + %xmm0 = COPY %1(<4 x s32>) + RET 0, implicit %xmm0 + +... +--- +# ALL-LABEL: name: test_load_v4i32_align +name: test_load_v4i32_align +alignment: 4 +legalized: true +regBankSelected: true +registers: +# ALL: - { id: 0, class: gr64 } +# NO_AVX512F: - { id: 1, class: vr128 } +# AVX512ALL: - { id: 1, class: vr128x } + - { id: 0, class: gpr } + - { id: 1, class: vecr } +# ALL: %0 = COPY %rdi +# SSE: %1 = MOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# AVX: %1 = VMOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# AVX512F: %1 = VMOVAPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# AVX512VL: %1 = VMOVAPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1) +# ALL: %xmm0 = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi + + %0(p0) = COPY %rdi + %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1) + %xmm0 = COPY %1(<4 x s32>) + RET 0, implicit %xmm0 + +... 
+--- +# ALL-LABEL: name: test_store_v4i32_align +name: test_store_v4i32_align +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: vr128 } +# AVX512ALL: - { id: 0, class: vr128x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# AVX: VMOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# AVX512F: VMOVAPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# AVX512VL: VMOVAPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(<4 x s32>) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 16) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... +--- +# ALL-LABEL: name: test_store_v4i32_noalign +name: test_store_v4i32_noalign +alignment: 4 +legalized: true +regBankSelected: true +registers: +# NO_AVX512F: - { id: 0, class: vr128 } +# AVX512ALL: - { id: 0, class: vr128x } +# ALL: - { id: 1, class: gr64 } + - { id: 0, class: vecr } + - { id: 1, class: gpr } +# ALL: %0 = COPY %xmm0 +# ALL: %1 = COPY %rdi +# SSE: MOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# AVX: VMOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# AVX512F: VMOVUPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# AVX512VL: VMOVUPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) +# ALL: %rax = COPY %1 +body: | + bb.1 (%ir-block.0): + liveins: %rdi, %xmm0 + + %0(<4 x s32>) = COPY %xmm0 + %1(p0) = COPY %rdi + G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 1) + %rax = COPY %1(p0) + RET 0, implicit %rax + +... diff --git a/test/CodeGen/X86/GlobalISel/select-memop-x32.mir b/test/CodeGen/X86/GlobalISel/select-memop-x32.mir deleted file mode 100644 index 8e6a2771db6e..000000000000 --- a/test/CodeGen/X86/GlobalISel/select-memop-x32.mir +++ /dev/null @@ -1,310 +0,0 @@ -# RUN: llc -mtriple=i386-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL - ---- | - define i8 @test_load_i8(i8* %p1) { - %r = load i8, i8* %p1 - ret i8 %r - } - - define i16 @test_load_i16(i16* %p1) { - %r = load i16, i16* %p1 - ret i16 %r - } - - define i32 @test_load_i32(i32* %p1) { - %r = load i32, i32* %p1 - ret i32 %r - } - - define i8* @test_store_i8(i8 %val, i8* %p1) { - store i8 %val, i8* %p1 - ret i8* %p1 - } - - define i16* @test_store_i16(i16 %val, i16* %p1) { - store i16 %val, i16* %p1 - ret i16* %p1 - } - - define i32* @test_store_i32(i32 %val, i32* %p1) { - store i32 %val, i32* %p1 - ret i32* %p1 - } - - define i32* @test_load_ptr(i32** %ptr1) { - %p = load i32*, i32** %ptr1 - ret i32* %p - } - - define void @test_store_ptr(i32** %ptr1, i32* %a) { - store i32* %a, i32** %ptr1 - ret void - } - -... 
---- -name: test_load_i8 -# ALL-LABEL: name: test_load_i8 -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr8 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -fixedStack: - - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) -# ALL-NEXT: %2 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.p1) -# ALL-NEXT: %al = COPY %2 -# ALL-NEXT: RET 0, implicit %al -body: | - bb.1 (%ir-block.0): - %1(p0) = G_FRAME_INDEX %fixed-stack.0 - %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - %2(s8) = G_LOAD %0(p0) :: (load 1 from %ir.p1) - %al = COPY %2(s8) - RET 0, implicit %al - -... ---- -name: test_load_i16 -# ALL-LABEL: name: test_load_i16 -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr16 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -fixedStack: - - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) -# ALL-NEXT: %2 = MOV16rm %0, 1, _, 0, _ :: (load 2 from %ir.p1) -# ALL-NEXT: %ax = COPY %2 -# ALL-NEXT: RET 0, implicit %ax -body: | - bb.1 (%ir-block.0): - %1(p0) = G_FRAME_INDEX %fixed-stack.0 - %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - %2(s16) = G_LOAD %0(p0) :: (load 2 from %ir.p1) - %ax = COPY %2(s16) - RET 0, implicit %ax - -... ---- -name: test_load_i32 -# ALL-LABEL: name: test_load_i32 -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -fixedStack: - - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) -# ALL-NEXT: %2 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# ALL-NEXT: %eax = COPY %2 -# ALL-NEXT: RET 0, implicit %eax -body: | - bb.1 (%ir-block.0): - %1(p0) = G_FRAME_INDEX %fixed-stack.0 - %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - %2(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) - %eax = COPY %2(s32) - RET 0, implicit %eax - -... 
---- -name: test_store_i8 -# ALL-LABEL: name: test_store_i8 -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr8 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -# ALL-NEXT: - { id: 3, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } -fixedStack: - - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } - - { id: 1, offset: 0, size: 1, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV8rm %2, 1, _, 0, _ :: (invariant load 1 from %fixed-stack.0, align 0) -# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ -# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) -# ALL-NEXT: MOV8mr %1, 1, _, 0, _, %0 :: (store 1 into %ir.p1) -# ALL-NEXT: %eax = COPY %1 -# ALL-NEXT: RET 0, implicit %eax -body: | - bb.1 (%ir-block.0): - %2(p0) = G_FRAME_INDEX %fixed-stack.1 - %0(s8) = G_LOAD %2(p0) :: (invariant load 1 from %fixed-stack.1, align 0) - %3(p0) = G_FRAME_INDEX %fixed-stack.0 - %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - G_STORE %0(s8), %1(p0) :: (store 1 into %ir.p1) - %eax = COPY %1(p0) - RET 0, implicit %eax - -... ---- -name: test_store_i16 -# ALL-LABEL: name: test_store_i16 -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr16 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -# ALL-NEXT: - { id: 3, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } -fixedStack: - - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } - - { id: 1, offset: 0, size: 2, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV16rm %2, 1, _, 0, _ :: (invariant load 2 from %fixed-stack.0, align 0) -# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ -# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) -# ALL-NEXT: MOV16mr %1, 1, _, 0, _, %0 :: (store 2 into %ir.p1) -# ALL-NEXT: %eax = COPY %1 -# ALL-NEXT: RET 0, implicit %eax -body: | - bb.1 (%ir-block.0): - %2(p0) = G_FRAME_INDEX %fixed-stack.1 - %0(s16) = G_LOAD %2(p0) :: (invariant load 2 from %fixed-stack.1, align 0) - %3(p0) = G_FRAME_INDEX %fixed-stack.0 - %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - G_STORE %0(s16), %1(p0) :: (store 2 into %ir.p1) - %eax = COPY %1(p0) - RET 0, implicit %eax - -... 
---- -name: test_store_i32 -# ALL-LABEL: name: test_store_i32 -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -# ALL-NEXT: - { id: 3, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } -fixedStack: - - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } - - { id: 1, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV32rm %2, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) -# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ -# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) -# ALL-NEXT: MOV32mr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# ALL-NEXT: %eax = COPY %1 -# ALL-NEXT: RET 0, implicit %eax -body: | - bb.1 (%ir-block.0): - %2(p0) = G_FRAME_INDEX %fixed-stack.1 - %0(s32) = G_LOAD %2(p0) :: (invariant load 4 from %fixed-stack.1, align 0) - %3(p0) = G_FRAME_INDEX %fixed-stack.0 - %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) - %eax = COPY %1(p0) - RET 0, implicit %eax - -... ---- -name: test_load_ptr -# ALL-LABEL: name: test_load_ptr -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -fixedStack: - - { id: 0, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %1 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV32rm %1, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) -# ALL-NEXT: %2 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.ptr1) -# ALL-NEXT: %eax = COPY %2 -# ALL-NEXT: RET 0, implicit %eax -body: | - bb.1 (%ir-block.0): - %1(p0) = G_FRAME_INDEX %fixed-stack.0 - %0(p0) = G_LOAD %1(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - %2(p0) = G_LOAD %0(p0) :: (load 4 from %ir.ptr1) - %eax = COPY %2(p0) - RET 0, implicit %eax - -... 
---- -name: test_store_ptr -# ALL-LABEL: name: test_store_ptr -alignment: 4 -legalized: true -regBankSelected: true -# ALL: registers: -# ALL-NEXT: - { id: 0, class: gr32 } -# ALL-NEXT: - { id: 1, class: gr32 } -# ALL-NEXT: - { id: 2, class: gr32 } -# ALL-NEXT: - { id: 3, class: gr32 } -registers: - - { id: 0, class: gpr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } - - { id: 3, class: gpr } -fixedStack: - - { id: 0, offset: 4, size: 4, alignment: 4, isImmutable: true, isAliased: false } - - { id: 1, offset: 0, size: 4, alignment: 16, isImmutable: true, isAliased: false } -# ALL: %2 = LEA32r %fixed-stack.0, 1, _, 0, _ -# ALL-NEXT: %0 = MOV32rm %2, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.0, align 0) -# ALL-NEXT: %3 = LEA32r %fixed-stack.1, 1, _, 0, _ -# ALL-NEXT: %1 = MOV32rm %3, 1, _, 0, _ :: (invariant load 4 from %fixed-stack.1, align 0) -# ALL-NEXT: MOV32mr %0, 1, _, 0, _, %1 :: (store 4 into %ir.ptr1) -# ALL-NEXT: RET 0 -body: | - bb.1 (%ir-block.0): - %2(p0) = G_FRAME_INDEX %fixed-stack.1 - %0(p0) = G_LOAD %2(p0) :: (invariant load 4 from %fixed-stack.1, align 0) - %3(p0) = G_FRAME_INDEX %fixed-stack.0 - %1(p0) = G_LOAD %3(p0) :: (invariant load 4 from %fixed-stack.0, align 0) - G_STORE %1(p0), %0(p0) :: (store 4 into %ir.ptr1) - RET 0 - -... diff --git a/test/CodeGen/X86/GlobalISel/select-memop.mir b/test/CodeGen/X86/GlobalISel/select-memop.mir deleted file mode 100644 index 817dc3cc9764..000000000000 --- a/test/CodeGen/X86/GlobalISel/select-memop.mir +++ /dev/null @@ -1,637 +0,0 @@ -# RUN: llc -mtriple=x86_64-linux-gnu -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=SSE -# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=NO_AVX512F --check-prefix=AVX -# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NO_AVX512VL --check-prefix=AVX512ALL --check-prefix=AVX512F -# RUN: llc -mtriple=x86_64-linux-gnu -mattr=+avx512f -mattr=+avx512vl -global-isel -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=AVX512ALL --check-prefix=AVX512VL - ---- | - define i8 @test_load_i8(i8* %p1) { - %r = load i8, i8* %p1 - ret i8 %r - } - - define i16 @test_load_i16(i16* %p1) { - %r = load i16, i16* %p1 - ret i16 %r - } - - define i32 @test_load_i32(i32* %p1) { - %r = load i32, i32* %p1 - ret i32 %r - } - - define i64 @test_load_i64(i64* %p1) { - %r = load i64, i64* %p1 - ret i64 %r - } - - define float @test_load_float(float* %p1) { - %r = load float, float* %p1 - ret float %r - } - - define float @test_load_float_vecreg(float* %p1) { - %r = load float, float* %p1 - ret float %r - } - - - define double @test_load_double(double* %p1) { - %r = load double, double* %p1 - ret double %r - } - - define double @test_load_double_vecreg(double* %p1) { - %r = load double, double* %p1 - ret double %r - } - - define <4 x i32> @test_load_v4i32_noalign(<4 x i32>* %p1) { - %r = load <4 x i32>, <4 x i32>* %p1, align 1 - ret <4 x i32> %r - } - - define <4 x i32> @test_load_v4i32_align(<4 x i32>* %p1) { - %r = load <4 x i32>, <4 x i32>* %p1, align 16 - ret <4 x i32> %r - } - - define i32* @test_store_i32(i32 %val, i32* %p1) { - store i32 %val, i32* %p1 - ret i32* 
%p1 - } - - define i64* @test_store_i64(i64 %val, i64* %p1) { - store i64 %val, i64* %p1 - ret i64* %p1 - } - - define float* @test_store_float(float %val, float* %p1) { - store float %val, float* %p1 - ret float* %p1 - } - - define float* @test_store_float_vec(float %val, float* %p1) { - store float %val, float* %p1 - ret float* %p1 - } - - define double* @test_store_double(double %val, double* %p1) { - store double %val, double* %p1 - ret double* %p1 - } - - define double* @test_store_double_vec(double %val, double* %p1) { - store double %val, double* %p1 - ret double* %p1 - } - - define <4 x i32>* @test_store_v4i32_align(<4 x i32> %val, <4 x i32>* %p1) { - store <4 x i32> %val, <4 x i32>* %p1, align 16 - ret <4 x i32>* %p1 - } - - define <4 x i32>* @test_store_v4i32_noalign(<4 x i32> %val, <4 x i32>* %p1) { - store <4 x i32> %val, <4 x i32>* %p1, align 1 - ret <4 x i32>* %p1 - } - - define i32* @test_load_ptr(i32** %ptr1) { - %p = load i32*, i32** %ptr1 - ret i32* %p - } - - define void @test_store_ptr(i32** %ptr1, i32* %a) { - store i32* %a, i32** %ptr1 - ret void - } -... ---- -# ALL-LABEL: name: test_load_i8 -name: test_load_i8 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr8 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV8rm %0, 1, _, 0, _ :: (load 1 from %ir.p1) -# ALL: %al = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s8) = G_LOAD %0(p0) :: (load 1 from %ir.p1) - %al = COPY %1(s8) - RET 0, implicit %al - -... ---- -# ALL-LABEL: name: test_load_i16 -name: test_load_i16 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr16 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV16rm %0, 1, _, 0, _ :: (load 2 from %ir.p1) -# ALL: %ax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s16) = G_LOAD %0(p0) :: (load 2 from %ir.p1) - %ax = COPY %1(s16) - RET 0, implicit %ax - -... ---- -# ALL-LABEL: name: test_load_i32 -name: test_load_i32 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr32 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# ALL: %eax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) - %eax = COPY %1(s32) - RET 0, implicit %eax - -... ---- -# ALL-LABEL: name: test_load_i64 -name: test_load_i64 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) - %rax = COPY %1(s64) - RET 0, implicit %rax - -... 
---- -# ALL-LABEL: name: test_load_float -name: test_load_float -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr32 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV32rm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) - %xmm0 = COPY %1(s32) - RET 0, implicit %xmm0 - -... ---- -# ALL-LABEL: name: test_load_float_vecreg -name: test_load_float_vecreg -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: fr32 } -# AVX512ALL: - { id: 1, class: fr32x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# AVX: %1 = VMOVSSrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# AVX512ALL: %1 = VMOVSSZrm %0, 1, _, 0, _ :: (load 4 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s32) = G_LOAD %0(p0) :: (load 4 from %ir.p1) - %xmm0 = COPY %1(s32) - RET 0, implicit %xmm0 - -... ---- -# ALL-LABEL: name: test_load_double -name: test_load_double -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) - %xmm0 = COPY %1(s64) - RET 0, implicit %xmm0 - -... ---- -# ALL-LABEL: name: test_load_double_vecreg -name: test_load_double_vecreg -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: fr64 } -# AVX512ALL: - { id: 1, class: fr64x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# AVX: %1 = VMOVSDrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# AVX512ALL: %1 = VMOVSDZrm %0, 1, _, 0, _ :: (load 8 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(s64) = G_LOAD %0(p0) :: (load 8 from %ir.p1) - %xmm0 = COPY %1(s64) - RET 0, implicit %xmm0 - -... ---- -# ALL-LABEL: name: test_load_v4i32_noalign -name: test_load_v4i32_noalign -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: vr128 } -# AVX512ALL: - { id: 1, class: vr128x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# AVX: %1 = VMOVUPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# AVX512F: %1 = VMOVUPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# AVX512VL: %1 = VMOVUPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1, align 1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1, align 1) - %xmm0 = COPY %1(<4 x s32>) - RET 0, implicit %xmm0 - -... 
---- -# ALL-LABEL: name: test_load_v4i32_align -name: test_load_v4i32_align -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# NO_AVX512F: - { id: 1, class: vr128 } -# AVX512ALL: - { id: 1, class: vr128x } - - { id: 0, class: gpr } - - { id: 1, class: vecr } -# ALL: %0 = COPY %rdi -# SSE: %1 = MOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# AVX: %1 = VMOVAPSrm %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# AVX512F: %1 = VMOVAPSZ128rm_NOVLX %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# AVX512VL: %1 = VMOVAPSZ128rm %0, 1, _, 0, _ :: (load 16 from %ir.p1) -# ALL: %xmm0 = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi - - %0(p0) = COPY %rdi - %1(<4 x s32>) = G_LOAD %0(p0) :: (load 16 from %ir.p1) - %xmm0 = COPY %1(<4 x s32>) - RET 0, implicit %xmm0 - -... ---- -# ALL-LABEL: name: test_store_i32 -name: test_store_i32 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr32 } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %edi -# ALL: %1 = COPY %rsi -# ALL: MOV32mr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %edi, %rsi - - %0(s32) = COPY %edi - %1(p0) = COPY %rsi - G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_i64 -name: test_store_i64 -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: gr64 } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: gpr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %rdi -# ALL: %1 = COPY %rsi -# ALL: MOV64mr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %rsi - - %0(s64) = COPY %rdi - %1(p0) = COPY %rsi - G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_float -name: test_store_float -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: fr32x } -# ALL: - { id: 1, class: gr64 } -# ALL: - { id: 2, class: gr32 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# ALL: %2 = COPY %0 -# ALL: MOV32mr %1, 1, _, 0, _, %2 :: (store 4 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(s32) = COPY %xmm0 - %1(p0) = COPY %rdi - %2(s32) = COPY %0(s32) - G_STORE %2(s32), %1(p0) :: (store 4 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_float_vec -name: test_store_float_vec -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: fr32 } -# AVX512ALL: - { id: 0, class: fr32x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# AVX: VMOVSSmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# AVX512ALL: VMOVSSZmr %1, 1, _, 0, _, %0 :: (store 4 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(s32) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(s32), %1(p0) :: (store 4 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... 
---- -# ALL-LABEL: name: test_store_double -name: test_store_double -alignment: 4 -legalized: true -regBankSelected: true -registers: -# ALL: - { id: 0, class: fr64x } -# ALL: - { id: 1, class: gr64 } -# ALL: - { id: 2, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } - - { id: 2, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# ALL: %2 = COPY %0 -# ALL: MOV64mr %1, 1, _, 0, _, %2 :: (store 8 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(s64) = COPY %xmm0 - %1(p0) = COPY %rdi - %2(s64) = COPY %0(s64) - G_STORE %2(s64), %1(p0) :: (store 8 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_double_vec -name: test_store_double_vec -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: fr64 } -# AVX512ALL: - { id: 0, class: fr64x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) -# AVX: VMOVSDmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) -# AVX512ALL: VMOVSDZmr %1, 1, _, 0, _, %0 :: (store 8 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(s64) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(s64), %1(p0) :: (store 8 into %ir.p1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_v4i32_align -name: test_store_v4i32_align -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: vr128 } -# AVX512ALL: - { id: 0, class: vr128x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# AVX: VMOVAPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# AVX512F: VMOVAPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# AVX512VL: VMOVAPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(<4 x s32>) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 16) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... ---- -# ALL-LABEL: name: test_store_v4i32_noalign -name: test_store_v4i32_noalign -alignment: 4 -legalized: true -regBankSelected: true -registers: -# NO_AVX512F: - { id: 0, class: vr128 } -# AVX512ALL: - { id: 0, class: vr128x } -# ALL: - { id: 1, class: gr64 } - - { id: 0, class: vecr } - - { id: 1, class: gpr } -# ALL: %0 = COPY %xmm0 -# ALL: %1 = COPY %rdi -# SSE: MOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# AVX: VMOVUPSmr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# AVX512F: VMOVUPSZ128mr_NOVLX %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# AVX512VL: VMOVUPSZ128mr %1, 1, _, 0, _, %0 :: (store 16 into %ir.p1, align 1) -# ALL: %rax = COPY %1 -body: | - bb.1 (%ir-block.0): - liveins: %rdi, %xmm0 - - %0(<4 x s32>) = COPY %xmm0 - %1(p0) = COPY %rdi - G_STORE %0(<4 x s32>), %1(p0) :: (store 16 into %ir.p1, align 1) - %rax = COPY %1(p0) - RET 0, implicit %rax - -... 
----
-# ALL-LABEL: name: test_load_ptr
-name: test_load_ptr
-alignment: 4
-legalized: true
-regBankSelected: true
-selected: false
-registers:
-# ALL: - { id: 0, class: gr64 }
-# ALL: - { id: 1, class: gr64 }
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-# ALL: %1 = MOV64rm %0, 1, _, 0, _ :: (load 8 from %ir.ptr1)
-body: |
-  bb.1 (%ir-block.0):
-    liveins: %rdi
-
-    %0(p0) = COPY %rdi
-    %1(p0) = G_LOAD %0(p0) :: (load 8 from %ir.ptr1)
-    %rax = COPY %1(p0)
-    RET 0, implicit %rax
-
-...
----
-# ALL-LABEL: name: test_store_ptr
-name: test_store_ptr
-alignment: 4
-legalized: true
-regBankSelected: true
-selected: false
-registers:
-# ALL: - { id: 0, class: gr64 }
-# ALL: - { id: 1, class: gr64 }
-  - { id: 0, class: gpr }
-  - { id: 1, class: gpr }
-# ALL: MOV64mr %0, 1, _, 0, _, %1 :: (store 8 into %ir.ptr1)
-body: |
-  bb.1 (%ir-block.0):
-    liveins: %rdi, %rsi
-
-    %0(p0) = COPY %rdi
-    %1(p0) = COPY %rsi
-    G_STORE %1(p0), %0(p0) :: (store 8 into %ir.ptr1)
-    RET 0
-
-...
diff --git a/test/CodeGen/X86/O0-pipeline.ll b/test/CodeGen/X86/O0-pipeline.ll
new file mode 100644
index 000000000000..262cb96ca6d8
--- /dev/null
+++ b/test/CodeGen/X86/O0-pipeline.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mtriple=x86_64-- -O0 -debug-pass=Structure < %s -o /dev/null 2>&1 | FileCheck %s
+
+; REQUIRES: asserts
+
+; CHECK-LABEL: Pass Arguments:
+; CHECK-NEXT: Target Library Information
+; CHECK-NEXT: Target Transform Information
+; CHECK-NEXT: Target Pass Configuration
+; CHECK-NEXT: Type-Based Alias Analysis
+; CHECK-NEXT: Scoped NoAlias Alias Analysis
+; CHECK-NEXT: Assumption Cache Tracker
+; CHECK-NEXT: Create Garbage Collector Module Metadata
+; CHECK-NEXT: Machine Module Information
+; CHECK-NEXT: Machine Branch Probability Analysis
+; CHECK-NEXT: ModulePass Manager
+; CHECK-NEXT: Pre-ISel Intrinsic Lowering
+; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Expand Atomic instructions
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Basic Alias Analysis (stateless AA impl)
+; CHECK-NEXT: Module Verifier
+; CHECK-NEXT: Lower Garbage Collection Instructions
+; CHECK-NEXT: Shadow Stack GC Lowering
+; CHECK-NEXT: Remove unreachable blocks from the CFG
+; CHECK-NEXT: Inserts calls to mcount-like functions
+; CHECK-NEXT: Scalarize Masked Memory Intrinsics
+; CHECK-NEXT: Expand reduction intrinsics
+; CHECK-NEXT: Rewrite Symbols
+; CHECK-NEXT: FunctionPass Manager
+; CHECK-NEXT: Dominator Tree Construction
+; CHECK-NEXT: Exception handling preparation
+; CHECK-NEXT: Safe Stack instrumentation pass
+; CHECK-NEXT: Insert stack protectors
+; CHECK-NEXT: Module Verifier
+; CHECK-NEXT: X86 DAG->DAG Instruction Selection
+; CHECK-NEXT: X86 PIC Global Base Reg Initialization
+; CHECK-NEXT: Expand ISel Pseudo-instructions
+; CHECK-NEXT: Local Stack Slot Allocation
+; CHECK-NEXT: X86 WinAlloca Expander
+; CHECK-NEXT: Eliminate PHI nodes for register allocation
+; CHECK-NEXT: Two-Address instruction pass
+; CHECK-NEXT: Fast Register Allocator
+; CHECK-NEXT: Bundle Machine CFG Edges
+; CHECK-NEXT: X86 FP Stackifier
+; CHECK-NEXT: Prologue/Epilogue Insertion & Frame Finalization
+; CHECK-NEXT: Post-RA pseudo instruction expansion pass
+; CHECK-NEXT: X86 pseudo instruction expansion pass
+; CHECK-NEXT: Analyze Machine Code For Garbage Collection
+; CHECK-NEXT: X86 vzeroupper inserter
+; CHECK-NEXT: Contiguously Lay Out Funclets
+; CHECK-NEXT: StackMap Liveness Analysis
+; CHECK-NEXT: Live DEBUG_VALUE analysis
+; CHECK-NEXT: Insert fentry calls
+; CHECK-NEXT: MachineDominator Tree Construction
+; CHECK-NEXT: Machine Natural
Loop Construction +; CHECK-NEXT: Insert XRay ops +; CHECK-NEXT: Implement the 'patchable-function' attribute +; CHECK-NEXT: Lazy Machine Block Frequency Analysis +; CHECK-NEXT: Machine Optimization Remark Emitter +; CHECK-NEXT: MachineDominator Tree Construction +; CHECK-NEXT: Machine Natural Loop Construction +; CHECK-NEXT: X86 Assembly Printer +; CHECK-NEXT: Free MachineFunction + +define void @f() { + ret void +} diff --git a/test/CodeGen/X86/all-ones-vector.ll b/test/CodeGen/X86/all-ones-vector.ll index 35f488ea448c..d0160a5b84df 100644 --- a/test/CodeGen/X86/all-ones-vector.ll +++ b/test/CodeGen/X86/all-ones-vector.ll @@ -157,8 +157,8 @@ define <32 x i8> @allones_v32i8() nounwind { ; ; X32-AVX1-LABEL: allones_v32i8: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v32i8: @@ -174,8 +174,8 @@ define <32 x i8> @allones_v32i8() nounwind { ; ; X64-AVX1-LABEL: allones_v32i8: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v32i8: @@ -194,8 +194,8 @@ define <16 x i16> @allones_v16i16() nounwind { ; ; X32-AVX1-LABEL: allones_v16i16: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v16i16: @@ -211,8 +211,8 @@ define <16 x i16> @allones_v16i16() nounwind { ; ; X64-AVX1-LABEL: allones_v16i16: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v16i16: @@ -231,8 +231,8 @@ define <8 x i32> @allones_v8i32() nounwind { ; ; X32-AVX1-LABEL: allones_v8i32: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v8i32: @@ -248,8 +248,8 @@ define <8 x i32> @allones_v8i32() nounwind { ; ; X64-AVX1-LABEL: allones_v8i32: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v8i32: @@ -268,8 +268,8 @@ define <4 x i64> @allones_v4i64() nounwind { ; ; X32-AVX1-LABEL: allones_v4i64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v4i64: @@ -285,8 +285,8 @@ define <4 x i64> @allones_v4i64() nounwind { ; ; X64-AVX1-LABEL: allones_v4i64: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps 
%ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v4i64: @@ -305,8 +305,8 @@ define <4 x double> @allones_v4f64() nounwind { ; ; X32-AVX1-LABEL: allones_v4f64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v4f64: @@ -322,8 +322,8 @@ define <4 x double> @allones_v4f64() nounwind { ; ; X64-AVX1-LABEL: allones_v4f64: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v4f64: @@ -342,8 +342,8 @@ define <4 x double> @allones_v4f64_optsize() nounwind optsize { ; ; X32-AVX1-LABEL: allones_v4f64_optsize: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v4f64_optsize: @@ -359,8 +359,8 @@ define <4 x double> @allones_v4f64_optsize() nounwind optsize { ; ; X64-AVX1-LABEL: allones_v4f64_optsize: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v4f64_optsize: @@ -379,8 +379,8 @@ define <8 x float> @allones_v8f32() nounwind { ; ; X32-AVX1-LABEL: allones_v8f32: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v8f32: @@ -396,8 +396,8 @@ define <8 x float> @allones_v8f32() nounwind { ; ; X64-AVX1-LABEL: allones_v8f32: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v8f32: @@ -416,8 +416,8 @@ define <8 x float> @allones_v8f32_optsize() nounwind optsize { ; ; X32-AVX1-LABEL: allones_v8f32_optsize: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: retl ; ; X32-AVX256-LABEL: allones_v8f32_optsize: @@ -433,8 +433,8 @@ define <8 x float> @allones_v8f32_optsize() nounwind optsize { ; ; X64-AVX1-LABEL: allones_v8f32_optsize: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: retq ; ; X64-AVX256-LABEL: allones_v8f32_optsize: @@ -455,8 +455,8 @@ define <64 x i8> @allones_v64i8() nounwind { ; ; X32-AVX1-LABEL: allones_v64i8: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: 
vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -487,8 +487,8 @@ define <64 x i8> @allones_v64i8() nounwind { ; ; X64-AVX1-LABEL: allones_v64i8: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -522,8 +522,8 @@ define <32 x i16> @allones_v32i16() nounwind { ; ; X32-AVX1-LABEL: allones_v32i16: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -554,8 +554,8 @@ define <32 x i16> @allones_v32i16() nounwind { ; ; X64-AVX1-LABEL: allones_v32i16: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -589,8 +589,8 @@ define <16 x i32> @allones_v16i32() nounwind { ; ; X32-AVX1-LABEL: allones_v16i32: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -615,8 +615,8 @@ define <16 x i32> @allones_v16i32() nounwind { ; ; X64-AVX1-LABEL: allones_v16i32: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -644,8 +644,8 @@ define <8 x i64> @allones_v8i64() nounwind { ; ; X32-AVX1-LABEL: allones_v8i64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -670,8 +670,8 @@ define <8 x i64> @allones_v8i64() nounwind { ; ; X64-AVX1-LABEL: allones_v8i64: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -699,8 +699,8 @@ define <8 x double> @allones_v8f64() nounwind { ; ; X32-AVX1-LABEL: allones_v8f64: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -725,8 +725,8 @@ define <8 x double> @allones_v8f64() nounwind { ; ; X64-AVX1-LABEL: allones_v8f64: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; @@ -754,8 +754,8 @@ define <16 x float> @allones_v16f32() nounwind { ; ; X32-AVX1-LABEL: 
allones_v16f32: ; X32-AVX1: # BB#0: -; X32-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X32-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X32-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X32-AVX1-NEXT: retl ; @@ -780,8 +780,8 @@ define <16 x float> @allones_v16f32() nounwind { ; ; X64-AVX1-LABEL: allones_v16f32: ; X64-AVX1: # BB#0: -; X64-AVX1-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; X64-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; X64-AVX1-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; X64-AVX1-NEXT: vmovaps %ymm0, %ymm1 ; X64-AVX1-NEXT: retq ; diff --git a/test/CodeGen/X86/avg.ll b/test/CodeGen/X86/avg.ll index 2aaf14001758..aa28ef5175ed 100644 --- a/test/CodeGen/X86/avg.ll +++ b/test/CodeGen/X86/avg.ll @@ -135,88 +135,87 @@ define void @avg_v16i8(<16 x i8>* %a, <16 x i8>* %b) { define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm3 +; SSE2-NEXT: movdqa 16(%rdi), %xmm8 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd 
{{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm6, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm7, 
%xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm10, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm7 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm3 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 +; SSE2-NEXT: packuswb %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -259,198 +258,183 @@ define void @avg_v32i8(<32 x i8>* %a, <32 x i8>* %b) { define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; SSE2-LABEL: avg_v64i8: ; SSE2: # BB#0: -; SSE2-NEXT: subq $152, %rsp -; SSE2-NEXT: .Lcfi0: -; SSE2-NEXT: .cfi_def_cfa_offset 160 -; SSE2-NEXT: movdqa (%rdi), %xmm1 -; SSE2-NEXT: movdqa 16(%rdi), %xmm4 -; SSE2-NEXT: movdqa 32(%rdi), %xmm5 -; SSE2-NEXT: movdqa 48(%rdi), %xmm6 +; SSE2-NEXT: movdqa (%rdi), %xmm6 +; SSE2-NEXT: movdqa 16(%rdi), %xmm2 +; SSE2-NEXT: movdqa 32(%rdi), %xmm1 +; SSE2-NEXT: movdqa 48(%rdi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 
16-byte Spill +; SSE2-NEXT: movdqa (%rsi), %xmm5 +; SSE2-NEXT: movdqa 16(%rsi), %xmm13 +; SSE2-NEXT: movdqa 32(%rsi), %xmm11 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3],xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm2, %xmm15 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm0[8],xmm15[9],xmm0[9],xmm15[10],xmm0[10],xmm15[11],xmm0[11],xmm15[12],xmm0[12],xmm15[13],xmm0[13],xmm15[14],xmm0[14],xmm15[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm15, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm0[0],xmm15[1],xmm0[1],xmm15[2],xmm0[2],xmm15[3],xmm0[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm2, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm5, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm0[8],xmm10[9],xmm0[9],xmm10[10],xmm0[10],xmm10[11],xmm0[11],xmm10[12],xmm0[12],xmm10[13],xmm0[13],xmm10[14],xmm0[14],xmm10[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm10, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm10 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3],xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm5 ; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm0[8],xmm8[9],xmm0[9],xmm8[10],xmm0[10],xmm8[11],xmm0[11],xmm8[12],xmm0[12],xmm8[13],xmm0[13],xmm8[14],xmm0[14],xmm8[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm8, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3],xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm6, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsi), %xmm14 -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm0[8],xmm7[9],xmm0[9],xmm7[10],xmm0[10],xmm7[11],xmm0[11],xmm7[12],xmm0[12],xmm7[13],xmm0[13],xmm7[14],xmm0[14],xmm7[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm13, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm4, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: paddd %xmm14, %xmm12 +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3],xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm14, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] -; SSE2-NEXT: movdqa 16(%rsi), %xmm12 -; SSE2-NEXT: movdqa %xmm12, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: paddd %xmm15, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3],xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm13, %xmm15 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: paddd %xmm8, %xmm15 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm0[0],xmm13[1],xmm0[1],xmm13[2],xmm0[2],xmm13[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm13 +; SSE2-NEXT: movdqa %xmm11, %xmm6 ; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm0[8],xmm6[9],xmm0[9],xmm6[10],xmm0[10],xmm6[11],xmm0[11],xmm6[12],xmm0[12],xmm6[13],xmm0[13],xmm6[14],xmm0[14],xmm6[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm6, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm6, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm0[4],xmm9[5],xmm0[5],xmm9[6],xmm0[6],xmm9[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm9 +; SSE2-NEXT: movdqa %xmm1, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm0[0],xmm6[1],xmm0[1],xmm6[2],xmm0[2],xmm6[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3],xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm12, %xmm10 -; 
SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm0[4],xmm10[5],xmm0[5],xmm10[6],xmm0[6],xmm10[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm0[0],xmm12[1],xmm0[1],xmm12[2],xmm0[2],xmm12[3],xmm0[3] -; SSE2-NEXT: movdqa 32(%rsi), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm5 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm0[8],xmm5[9],xmm0[9],xmm5[10],xmm0[10],xmm5[11],xmm0[11],xmm5[12],xmm0[12],xmm5[13],xmm0[13],xmm5[14],xmm0[14],xmm5[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm5, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm0[0],xmm5[1],xmm0[1],xmm5[2],xmm0[2],xmm5[3],xmm0[3] +; SSE2-NEXT: paddd %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3],xmm11[4],xmm0[4],xmm11[5],xmm0[5],xmm11[6],xmm0[6],xmm11[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm11, %xmm14 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm0[4],xmm14[5],xmm0[5],xmm14[6],xmm0[6],xmm14[7],xmm0[7] +; SSE2-NEXT: paddd %xmm2, %xmm14 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm5, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm0[8],xmm2[9],xmm0[9],xmm2[10],xmm0[10],xmm2[11],xmm0[11],xmm2[12],xmm0[12],xmm2[13],xmm0[13],xmm2[14],xmm0[14],xmm2[15],xmm0[15] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: paddd %xmm1, %xmm11 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] +; SSE2-NEXT: movdqa 48(%rsi), %xmm7 +; SSE2-NEXT: movdqa %xmm7, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm0[8],xmm3[9],xmm0[9],xmm3[10],xmm0[10],xmm3[11],xmm0[11],xmm3[12],xmm0[12],xmm3[13],xmm0[13],xmm3[14],xmm0[14],xmm3[15],xmm0[15] +; SSE2-NEXT: movdqa %xmm3, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm0[4],xmm8[5],xmm0[5],xmm8[6],xmm0[6],xmm8[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm3 +; SSE2-NEXT: movdqa %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE2-NEXT: movdqa %xmm2, %xmm1 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3],xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm1, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; SSE2-NEXT: movdqa 48(%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, %xmm4 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm0[8],xmm4[9],xmm0[9],xmm4[10],xmm0[10],xmm4[11],xmm0[11],xmm4[12],xmm0[12],xmm4[13],xmm0[13],xmm4[14],xmm0[14],xmm4[15],xmm0[15] -; SSE2-NEXT: movdqa %xmm4, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] 
-; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm1, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm0[4],xmm3[5],xmm0[5],xmm3[6],xmm0[6],xmm3[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm8, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Folded Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload -; SSE2-NEXT: paddd (%rsp), %xmm11 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm12 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm10 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm6 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm14 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload -; SSE2-NEXT: paddd {{[0-9]+}}(%rsp), %xmm15 # 16-byte Folded Reload +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm0[0],xmm7[1],xmm0[1],xmm7[2],xmm0[2],xmm7[3],xmm0[3] +; SSE2-NEXT: paddd %xmm2, %xmm7 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: paddd %xmm0, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm15 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: paddd %xmm0, %xmm12 +; SSE2-NEXT: paddd %xmm0, %xmm14 ; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: paddd %xmm0, %xmm4 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: psrld $1, %xmm15 +; SSE2-NEXT: paddd %xmm0, %xmm5 +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: psrld $1, %xmm10 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = 
[255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm10 +; SSE2-NEXT: packuswb %xmm1, %xmm10 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm2 +; SSE2-NEXT: packuswb %xmm1, %xmm2 +; SSE2-NEXT: packuswb %xmm10, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm1 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm12 +; SSE2-NEXT: pand %xmm0, %xmm4 +; SSE2-NEXT: packuswb %xmm12, %xmm4 +; SSE2-NEXT: psrld $1, %xmm13 +; SSE2-NEXT: psrld $1, %xmm15 ; SSE2-NEXT: pand %xmm0, %xmm15 -; SSE2-NEXT: pand %xmm0, %xmm7 -; SSE2-NEXT: packuswb %xmm15, %xmm7 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm13 +; SSE2-NEXT: packuswb %xmm15, %xmm13 +; SSE2-NEXT: packuswb %xmm4, %xmm13 +; SSE2-NEXT: psrld $1, %xmm6 ; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: pand %xmm0, %xmm9 -; SSE2-NEXT: pand %xmm0, %xmm14 -; SSE2-NEXT: packuswb %xmm9, %xmm14 -; SSE2-NEXT: packuswb %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm0, %xmm13 ; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: packuswb %xmm13, %xmm6 -; SSE2-NEXT: psrld $1, %xmm12 -; SSE2-NEXT: psrld $1, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm10 -; SSE2-NEXT: pand %xmm0, %xmm12 -; SSE2-NEXT: packuswb %xmm10, %xmm12 -; SSE2-NEXT: packuswb %xmm6, %xmm12 -; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: packuswb %xmm9, %xmm6 ; SSE2-NEXT: psrld $1, %xmm11 +; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: pand %xmm0, %xmm14 ; SSE2-NEXT: pand %xmm0, %xmm11 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: packuswb %xmm11, %xmm5 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm6 -; SSE2-NEXT: pand %xmm0, %xmm2 -; SSE2-NEXT: packuswb %xmm6, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm5 -; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm4 -; SSE2-NEXT: packuswb %xmm5, %xmm4 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: packuswb %xmm14, %xmm11 +; SSE2-NEXT: packuswb %xmm6, %xmm11 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm8 +; SSE2-NEXT: pand %xmm0, %xmm3 +; SSE2-NEXT: packuswb %xmm8, %xmm3 +; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pand %xmm0, %xmm5 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: packuswb %xmm5, %xmm1 -; SSE2-NEXT: packuswb %xmm4, %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm7 +; SSE2-NEXT: packuswb %xmm5, %xmm7 +; SSE2-NEXT: packuswb %xmm3, %xmm7 +; SSE2-NEXT: movdqu %xmm7, (%rax) +; SSE2-NEXT: movdqu %xmm11, (%rax) +; SSE2-NEXT: movdqu %xmm13, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm2, (%rax) -; SSE2-NEXT: movdqu %xmm12, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) -; SSE2-NEXT: addq $152, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v64i8: @@ -464,21 +448,21 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm7 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm12 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpaddd %ymm15, %ymm7, %ymm7 -; AVX2-NEXT: vpaddd %ymm14, %ymm6, %ymm6 -; AVX2-NEXT: vpaddd %ymm13, %ymm5, %ymm5 -; AVX2-NEXT: vpaddd %ymm12, %ymm4, %ymm4 -; AVX2-NEXT: vpaddd %ymm11, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm10, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm9, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm3, %ymm3 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm4, %ymm4 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm5, %ymm5 +; AVX2-NEXT: vpmovzxbd 
{{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm6, %ymm6 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpaddd %ymm8, %ymm7, %ymm7 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm8 ; AVX2-NEXT: vpaddd %ymm8, %ymm0, %ymm9 ; AVX2-NEXT: vpaddd %ymm8, %ymm1, %ymm10 @@ -540,13 +524,13 @@ define void @avg_v64i8(<64 x i8>* %a, <64 x i8>* %b) { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm3 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm5 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpaddd %zmm7, %zmm3, %zmm3 -; AVX512F-NEXT: vpaddd %zmm6, %zmm2, %zmm2 -; AVX512F-NEXT: vpaddd %zmm5, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm2, %zmm2 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm4 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpaddd %zmm4, %zmm3, %zmm3 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm4 ; AVX512F-NEXT: vpaddd %zmm4, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm4, %zmm1, %zmm1 @@ -673,27 +657,27 @@ define void @avg_v8i16(<8 x i16>* %a, <8 x i16>* %b) { define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -755,80 +739,79 @@ define void @avg_v16i16(<16 x i16>* %a, <16 x i16>* %b) { define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa 32(%rdi), %xmm10 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 +; SSE2-NEXT: movdqa (%rsi), %xmm9 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = 
xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -837,7 +820,7 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) +; SSE2-NEXT: movdqu %xmm9, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16: @@ -847,13 +830,13 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -884,9 +867,9 @@ define void @avg_v32i16(<32 x i16>* %a, <32 x i16>* %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 @@ -1047,88 +1030,87 @@ define void @avg_v16i8_2(<16 x i8>* %a, <16 x i8>* %b) { define void @avg_v32i8_2(<32 x i8>* %a, <32 x i8>* %b) { ; SSE2-LABEL: avg_v32i8_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm8 -; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm3 +; SSE2-NEXT: movdqa 16(%rdi), %xmm8 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm4[8],xmm10[9],xmm4[9],xmm10[10],xmm4[10],xmm10[11],xmm4[11],xmm10[12],xmm4[12],xmm10[13],xmm4[13],xmm10[14],xmm4[14],xmm10[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm4[0],xmm10[1],xmm4[1],xmm10[2],xmm4[2],xmm10[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm8, %xmm12 +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm5, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3],xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm3, %xmm12 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: movdqa %xmm8, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm7, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3],xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm4[0],xmm8[1],xmm4[1],xmm8[2],xmm4[2],xmm8[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm15 = xmm15[8],xmm4[8],xmm15[9],xmm4[9],xmm15[10],xmm4[10],xmm15[11],xmm4[11],xmm15[12],xmm4[12],xmm15[13],xmm4[13],xmm15[14],xmm4[14],xmm15[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm15, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm4[0],xmm15[1],xmm4[1],xmm15[2],xmm4[2],xmm15[3],xmm4[3] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3],xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm11, %xmm9 +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm2, %xmm9 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm4[0],xmm11[1],xmm4[1],xmm11[2],xmm4[2],xmm11[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm6, %xmm9 +; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm5, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm4[8],xmm2[9],xmm4[9],xmm2[10],xmm4[10],xmm2[11],xmm4[11],xmm2[12],xmm4[12],xmm2[13],xmm4[13],xmm2[14],xmm4[14],xmm2[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] +; SSE2-NEXT: paddd %xmm3, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] +; SSE2-NEXT: paddd %xmm11, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] +; SSE2-NEXT: paddd %xmm7, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] +; SSE2-NEXT: movdqa %xmm1, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm4[4],xmm7[5],xmm4[5],xmm7[6],xmm4[6],xmm7[7],xmm4[7] +; SSE2-NEXT: paddd %xmm10, %xmm7 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: paddd %xmm11, %xmm1 -; SSE2-NEXT: paddd %xmm9, %xmm13 -; SSE2-NEXT: paddd %xmm15, %xmm2 -; SSE2-NEXT: paddd %xmm14, %xmm5 -; SSE2-NEXT: paddd %xmm8, %xmm0 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm3 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] -; SSE2-NEXT: paddd %xmm4, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm3 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm13 +; SSE2-NEXT: paddd %xmm4, %xmm5 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: paddd %xmm4, %xmm7 ; SSE2-NEXT: paddd %xmm4, %xmm1 -; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm1 ; SSE2-NEXT: psrld $1, %xmm7 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] -; SSE2-NEXT: pand %xmm4, %xmm7 -; SSE2-NEXT: pand %xmm4, %xmm3 -; SSE2-NEXT: packuswb %xmm7, %xmm3 -; SSE2-NEXT: psrld $1, %xmm0 +; SSE2-NEXT: psrld $1, %xmm3 ; SSE2-NEXT: psrld $1, %xmm6 -; SSE2-NEXT: pand %xmm4, %xmm6 -; 
SSE2-NEXT: pand %xmm4, %xmm0 -; SSE2-NEXT: packuswb %xmm6, %xmm0 -; SSE2-NEXT: packuswb %xmm3, %xmm0 -; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm0 ; SSE2-NEXT: psrld $1, %xmm5 -; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm9 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,0,0,0,255,0,0,0,255,0,0,0,255,0,0,0] +; SSE2-NEXT: pand %xmm4, %xmm9 ; SSE2-NEXT: pand %xmm4, %xmm2 -; SSE2-NEXT: packuswb %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm13 -; SSE2-NEXT: pand %xmm4, %xmm13 +; SSE2-NEXT: packuswb %xmm9, %xmm2 +; SSE2-NEXT: pand %xmm4, %xmm5 +; SSE2-NEXT: pand %xmm4, %xmm0 +; SSE2-NEXT: packuswb %xmm5, %xmm0 +; SSE2-NEXT: packuswb %xmm2, %xmm0 +; SSE2-NEXT: pand %xmm4, %xmm6 +; SSE2-NEXT: pand %xmm4, %xmm3 +; SSE2-NEXT: packuswb %xmm6, %xmm3 +; SSE2-NEXT: pand %xmm4, %xmm7 ; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: packuswb %xmm13, %xmm1 -; SSE2-NEXT: packuswb %xmm2, %xmm1 +; SSE2-NEXT: packuswb %xmm7, %xmm1 +; SSE2-NEXT: packuswb %xmm3, %xmm1 ; SSE2-NEXT: movdqu %xmm1, (%rax) ; SSE2-NEXT: movdqu %xmm0, (%rax) ; SSE2-NEXT: retq @@ -1512,27 +1494,27 @@ define void @avg_v8i16_2(<8 x i16>* %a, <8 x i16>* %b) { define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { ; SSE2-LABEL: avg_v16i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm4 -; SSE2-NEXT: movdqa 16(%rdi), %xmm5 +; SSE2-NEXT: movdqa (%rdi), %xmm2 +; SSE2-NEXT: movdqa 16(%rdi), %xmm4 ; SSE2-NEXT: movdqa (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: movdqa %xmm4, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm6[4],xmm8[5],xmm6[5],xmm8[6],xmm6[6],xmm8[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm5, %xmm7 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3] +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm2, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1],xmm2[2],xmm5[2],xmm2[3],xmm5[3] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm5[4],xmm7[5],xmm5[5],xmm7[6],xmm5[6],xmm7[7],xmm5[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: movdqa %xmm0, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm6[4],xmm3[5],xmm6[5],xmm3[6],xmm6[6],xmm3[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7] +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm5[0],xmm0[1],xmm5[1],xmm0[2],xmm5[2],xmm0[3],xmm5[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm6[4],xmm2[5],xmm6[5],xmm2[6],xmm6[6],xmm2[7],xmm6[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm6[0],xmm1[1],xmm6[1],xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm5[4],xmm2[5],xmm5[5],xmm2[6],xmm5[6],xmm2[7],xmm5[7] ; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm4, %xmm0 -; SSE2-NEXT: paddd %xmm8, %xmm3 +; 
SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm5[0],xmm1[1],xmm5[1],xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; SSE2-NEXT: paddd %xmm4, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm4, %xmm3 ; SSE2-NEXT: paddd %xmm4, %xmm0 @@ -1594,80 +1576,79 @@ define void @avg_v16i16_2(<16 x i16>* %a, <16 x i16>* %b) { define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-LABEL: avg_v32i16_2: ; SSE2: # BB#0: -; SSE2-NEXT: movdqa (%rdi), %xmm10 -; SSE2-NEXT: movdqa 16(%rdi), %xmm9 -; SSE2-NEXT: movdqa 32(%rdi), %xmm11 +; SSE2-NEXT: movdqa (%rdi), %xmm4 +; SSE2-NEXT: movdqa 16(%rdi), %xmm11 +; SSE2-NEXT: movdqa 32(%rdi), %xmm10 ; SSE2-NEXT: movdqa 48(%rdi), %xmm8 -; SSE2-NEXT: movdqa (%rsi), %xmm14 +; SSE2-NEXT: movdqa (%rsi), %xmm9 ; SSE2-NEXT: movdqa 16(%rsi), %xmm1 ; SSE2-NEXT: movdqa 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa 48(%rsi), %xmm3 ; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm10, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] -; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm9, %xmm12 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm11, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm0[4],xmm15[5],xmm0[5],xmm15[6],xmm0[6],xmm15[7],xmm0[7] +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm0[0],xmm4[1],xmm0[1],xmm4[2],xmm0[2],xmm4[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm11, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm0[0],xmm11[1],xmm0[1],xmm11[2],xmm0[2],xmm11[3],xmm0[3] +; SSE2-NEXT: movdqa %xmm10, %xmm12 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm0[4],xmm12[5],xmm0[5],xmm12[6],xmm0[6],xmm12[7],xmm0[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm0[0],xmm10[1],xmm0[1],xmm10[2],xmm0[2],xmm10[3],xmm0[3] ; SSE2-NEXT: movdqa %xmm8, %xmm13 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm0[4],xmm13[5],xmm0[5],xmm13[6],xmm0[6],xmm13[7],xmm0[7] ; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm0[0],xmm8[1],xmm0[1],xmm8[2],xmm0[2],xmm8[3],xmm0[3] -; SSE2-NEXT: movdqa %xmm14, %xmm7 +; SSE2-NEXT: movdqa %xmm9, %xmm7 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm0[4],xmm7[5],xmm0[5],xmm7[6],xmm0[6],xmm7[7],xmm0[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm14 = xmm14[0],xmm0[0],xmm14[1],xmm0[1],xmm14[2],xmm0[2],xmm14[3],xmm0[3] +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm0[0],xmm9[1],xmm0[1],xmm9[2],xmm0[2],xmm9[3],xmm0[3] +; SSE2-NEXT: paddd %xmm4, %xmm9 ; SSE2-NEXT: movdqa %xmm1, %xmm6 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm0[4],xmm6[5],xmm0[5],xmm6[6],xmm0[6],xmm6[7],xmm0[7] +; SSE2-NEXT: paddd %xmm5, %xmm6 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSE2-NEXT: paddd %xmm11, %xmm1 ; SSE2-NEXT: movdqa %xmm2, %xmm5 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; SSE2-NEXT: paddd %xmm12, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 
= xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; SSE2-NEXT: paddd %xmm10, %xmm2 ; SSE2-NEXT: movdqa %xmm3, %xmm4 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm0[4],xmm4[5],xmm0[5],xmm4[6],xmm0[6],xmm4[7],xmm0[7] +; SSE2-NEXT: paddd %xmm13, %xmm4 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] ; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm13, %xmm4 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm15, %xmm5 -; SSE2-NEXT: paddd %xmm9, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm14 -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Folded Reload ; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [1,1,1,1] ; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm14 +; SSE2-NEXT: paddd %xmm0, %xmm9 ; SSE2-NEXT: paddd %xmm0, %xmm6 ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: paddd %xmm0, %xmm5 ; SSE2-NEXT: paddd %xmm0, %xmm2 ; SSE2-NEXT: paddd %xmm0, %xmm4 ; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: psrld $1, %xmm14 +; SSE2-NEXT: psrld $1, %xmm3 +; SSE2-NEXT: psrld $1, %xmm4 +; SSE2-NEXT: psrld $1, %xmm2 +; SSE2-NEXT: psrld $1, %xmm5 +; SSE2-NEXT: psrld $1, %xmm1 +; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: psrld $1, %xmm9 ; SSE2-NEXT: psrld $1, %xmm7 ; SSE2-NEXT: pslld $16, %xmm7 ; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: pslld $16, %xmm14 -; SSE2-NEXT: psrad $16, %xmm14 -; SSE2-NEXT: packssdw %xmm7, %xmm14 -; SSE2-NEXT: psrld $1, %xmm1 -; SSE2-NEXT: psrld $1, %xmm6 +; SSE2-NEXT: pslld $16, %xmm9 +; SSE2-NEXT: psrad $16, %xmm9 +; SSE2-NEXT: packssdw %xmm7, %xmm9 ; SSE2-NEXT: pslld $16, %xmm6 ; SSE2-NEXT: psrad $16, %xmm6 ; SSE2-NEXT: pslld $16, %xmm1 ; SSE2-NEXT: psrad $16, %xmm1 ; SSE2-NEXT: packssdw %xmm6, %xmm1 -; SSE2-NEXT: psrld $1, %xmm2 -; SSE2-NEXT: psrld $1, %xmm5 ; SSE2-NEXT: pslld $16, %xmm5 ; SSE2-NEXT: psrad $16, %xmm5 ; SSE2-NEXT: pslld $16, %xmm2 ; SSE2-NEXT: psrad $16, %xmm2 ; SSE2-NEXT: packssdw %xmm5, %xmm2 -; SSE2-NEXT: psrld $1, %xmm3 -; SSE2-NEXT: psrld $1, %xmm4 ; SSE2-NEXT: pslld $16, %xmm4 ; SSE2-NEXT: psrad $16, %xmm4 ; SSE2-NEXT: pslld $16, %xmm3 @@ -1676,7 +1657,7 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; SSE2-NEXT: movdqu %xmm3, (%rax) ; SSE2-NEXT: movdqu %xmm2, (%rax) ; SSE2-NEXT: movdqu %xmm1, (%rax) -; SSE2-NEXT: movdqu %xmm14, (%rax) +; SSE2-NEXT: movdqu %xmm9, (%rax) ; SSE2-NEXT: retq ; ; AVX2-LABEL: avg_v32i16_2: @@ -1686,13 +1667,13 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero ; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm5 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = 
mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpmovzxwd {{.*#+}} ymm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero +; AVX2-NEXT: vpaddd %ymm4, %ymm3, %ymm3 ; AVX2-NEXT: vpbroadcastd {{.*}}(%rip), %ymm4 ; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm4, %ymm1, %ymm1 @@ -1723,9 +1704,9 @@ define void @avg_v32i16_2(<32 x i16>* %a, <32 x i16>* %b) { ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm0 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero ; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero -; AVX512F-NEXT: vpaddd %zmm3, %zmm1, %zmm1 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 +; AVX512F-NEXT: vpmovzxwd {{.*#+}} zmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero,mem[4],zero,mem[5],zero,mem[6],zero,mem[7],zero,mem[8],zero,mem[9],zero,mem[10],zero,mem[11],zero,mem[12],zero,mem[13],zero,mem[14],zero,mem[15],zero +; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 ; AVX512F-NEXT: vpbroadcastd {{.*}}(%rip), %zmm2 ; AVX512F-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; AVX512F-NEXT: vpaddd %zmm2, %zmm1, %zmm1 diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll index e6cc95fcdb23..6869d088e7cd 100644 --- a/test/CodeGen/X86/avx-basic.ll +++ b/test/CodeGen/X86/avx-basic.ll @@ -34,8 +34,8 @@ define void @zero256() nounwind ssp { define void @ones([0 x float]* nocapture %RET, [0 x float]* nocapture %aFOO) nounwind { ; CHECK-LABEL: ones: ; CHECK: ## BB#0: ## %allocas -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq @@ -51,8 +51,8 @@ float>* %ptr2vec615, align 32 define void @ones2([0 x i32]* nocapture %RET, [0 x i32]* nocapture %aFOO) nounwind { ; CHECK-LABEL: ones2: ; CHECK: ## BB#0: ## %allocas -; CHECK-NEXT: vpcmpeqd %xmm0, %xmm0, %xmm0 -; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; CHECK-NEXT: vxorps %ymm0, %ymm0, %ymm0 +; CHECK-NEXT: vcmptrueps %ymm0, %ymm0, %ymm0 ; CHECK-NEXT: vmovaps %ymm0, (%rdi) ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx-cvt-3.ll b/test/CodeGen/X86/avx-cvt-3.ll index 066719b3bfe8..231334ddcb85 100644 --- a/test/CodeGen/X86/avx-cvt-3.ll +++ b/test/CodeGen/X86/avx-cvt-3.ll @@ -48,16 +48,16 @@ define <8 x float> @sitofp_shuffle_zero_v8i32(<8 x i32> %a0) { define <8 x float> 
@sitofp_insert_allbits_v8i32(<8 x i32> %a0) { ; X86-LABEL: sitofp_insert_allbits_v8i32: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7] ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_insert_allbits_v8i32: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7] ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X64-NEXT: retq @@ -72,16 +72,16 @@ define <8 x float> @sitofp_insert_allbits_v8i32(<8 x i32> %a0) { define <8 x float> @sitofp_shuffle_allbits_v8i32(<8 x i32> %a0) { ; X86-LABEL: sitofp_shuffle_allbits_v8i32: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: sitofp_shuffle_allbits_v8i32: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4],ymm0[5],ymm1[6],ymm0[7] ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X64-NEXT: retq @@ -95,8 +95,7 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) { ; X86: # BB#0: ; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; X86-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X86-NEXT: movl $2, %eax @@ -111,8 +110,7 @@ define <8 x float> @sitofp_insert_constants_v8i32(<8 x i32> %a0) { ; X64: # BB#0: ; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5,6,7] -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1],ymm1[2],ymm0[3,4,5,6,7] ; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 ; X64-NEXT: movl $2, %eax diff --git a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index 1d925ff8e9bd..3cadbe2a8db3 100644 --- a/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -99,16 +99,16 @@ define <8 x float> @test_mm256_and_ps(<8 x float> %a0, <8 x float> %a1) nounwind define <4 x double> @test_mm256_andnot_pd(<4 x double> %a0, <4 x double> %a1) nounwind { ; X32-LABEL: test_mm256_andnot_pd: ; X32: # BB#0: -; X32-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X32-NEXT: vinsertf128 $1, %xmm2, %ymm2, %ymm2 +; X32-NEXT: vxorps %ymm2, %ymm2, %ymm2 +; X32-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 ; X32-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; X32-NEXT: vandps %ymm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_andnot_pd: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 -; X64-NEXT: vinsertf128 
$1, %xmm2, %ymm2, %ymm2 +; X64-NEXT: vxorps %ymm2, %ymm2, %ymm2 +; X64-NEXT: vcmptrueps %ymm2, %ymm2, %ymm2 ; X64-NEXT: vxorps %ymm2, %ymm0, %ymm0 ; X64-NEXT: vandps %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq @@ -2244,11 +2244,11 @@ define <4 x double> @test_mm256_set_pd(double %a0, double %a1, double %a2, doubl ; X32: # BB#0: ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero -; X32-NEXT: vunpcklpd {{.*#+}} xmm2 = xmm2[0],xmm3[0] ; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; X32-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_pd: @@ -2269,19 +2269,19 @@ define <8 x float> @test_mm256_set_ps(float %a0, float %a1, float %a2, float %a3 ; X32: # BB#0: ; X32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero -; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] -; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm6[0],xmm4[3] -; X32-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1,2],xmm7[0] ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3] -; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[0],xmm0[3] -; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] -; X32-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[2,3] +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0],xmm1[3] +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; X32-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X32-NEXT: retl ; ; X64-LABEL: test_mm256_set_ps: @@ -2881,10 +2881,10 @@ define <4 x double> @test_mm256_setr_pd(double %a0, double %a1, double %a2, doub ; X32: # BB#0: ; X32-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero -; X32-NEXT: vmovsd {{.*#+}} xmm3 = mem[0],zero ; X32-NEXT: vunpcklpd {{.*#+}} xmm0 = xmm1[0],xmm0[0] -; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm3[0],xmm2[0] +; X32-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero +; X32-NEXT: vmovsd {{.*#+}} xmm2 = mem[0],zero +; X32-NEXT: vunpcklpd {{.*#+}} xmm1 = xmm2[0],xmm1[0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; @@ -2908,16 +2908,16 @@ define <8 x float> @test_mm256_setr_ps(float %a0, float %a1, float %a2, float %a ; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; 
X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm5 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm6 = mem[0],zero,zero,zero -; X32-NEXT: vmovss {{.*#+}} xmm7 = mem[0],zero,zero,zero ; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[2,3] ; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3] ; X32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm7[0],xmm6[0],xmm7[2,3] -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm5[0],xmm1[3] -; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: vmovss {{.*#+}} xmm4 = mem[0],zero,zero,zero +; X32-NEXT: vinsertps {{.*#+}} xmm3 = xmm4[0],xmm3[0],xmm4[2,3] +; X32-NEXT: vinsertps {{.*#+}} xmm2 = xmm3[0,1],xmm2[0],xmm3[3] +; X32-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[0] ; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; X32-NEXT: retl ; diff --git a/test/CodeGen/X86/avx-schedule.ll b/test/CodeGen/X86/avx-schedule.ll index 052cacfea4dc..bb05481e313d 100644 --- a/test/CodeGen/X86/avx-schedule.ll +++ b/test/CodeGen/X86/avx-schedule.ll @@ -2837,4 +2837,54 @@ define <8 x float> @test_xorps(<8 x float> %a0, <8 x float> %a1, <8 x float> *%a ret <8 x float> %8 } +define void @test_zeroall() { +; SANDY-LABEL: test_zeroall: +; SANDY: # BB#0: +; SANDY-NEXT: vzeroall # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_zeroall: +; HASWELL: # BB#0: +; HASWELL-NEXT: vzeroall # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_zeroall: +; BTVER2: # BB#0: +; BTVER2-NEXT: vzeroall # sched: [?:0.000000e+00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_zeroall: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vzeroall # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + call void @llvm.x86.avx.vzeroall() + ret void +} +declare void @llvm.x86.avx.vzeroall() nounwind + +define void @test_zeroupper() { +; SANDY-LABEL: test_zeroupper: +; SANDY: # BB#0: +; SANDY-NEXT: vzeroupper # sched: [?:0.000000e+00] +; SANDY-NEXT: retq # sched: [5:1.00] +; +; HASWELL-LABEL: test_zeroupper: +; HASWELL: # BB#0: +; HASWELL-NEXT: vzeroupper # sched: [1:0.00] +; HASWELL-NEXT: retq # sched: [1:1.00] +; +; BTVER2-LABEL: test_zeroupper: +; BTVER2: # BB#0: +; BTVER2-NEXT: vzeroupper # sched: [?:0.000000e+00] +; BTVER2-NEXT: retq # sched: [4:1.00] +; +; ZNVER1-LABEL: test_zeroupper: +; ZNVER1: # BB#0: +; ZNVER1-NEXT: vzeroupper # sched: [?:0.000000e+00] +; ZNVER1-NEXT: retq # sched: [4:1.00] + call void @llvm.x86.avx.vzeroupper() + ret void +} +declare void @llvm.x86.avx.vzeroupper() nounwind + !0 = !{i32 1} diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll index 341dd867e4ff..647b7a8f4dfc 100644 --- a/test/CodeGen/X86/avx.ll +++ b/test/CodeGen/X86/avx.ll @@ -113,11 +113,11 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; CHECK-NOT: mov ; CHECK: insertps $48 ; CHECK: insertps $48 +; CHECK: vaddps ; CHECK: insertps $48 ; CHECK: insertps $48 ; CHECK: vaddps ; CHECK: vaddps -; CHECK: vaddps ; CHECK-NEXT: ret %1 = getelementptr inbounds float, float* %fb, i64 %index %2 = load float, float* %1, align 4 diff --git a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll index 
63b0281a7339..e29cf09718ad 100644 --- a/test/CodeGen/X86/avx512-cmp-kor-sequence.ll +++ b/test/CodeGen/X86/avx512-cmp-kor-sequence.ll @@ -13,10 +13,10 @@ define zeroext i16 @cmp_kor_seq_16(<16 x float> %a, <16 x float> %b, <16 x float ; CHECK: # BB#0: # %entry ; CHECK-NEXT: vcmpgeps %zmm4, %zmm0, %k0 ; CHECK-NEXT: vcmpgeps %zmm4, %zmm1, %k1 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k2 -; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k3 ; CHECK-NEXT: korw %k1, %k0, %k0 -; CHECK-NEXT: korw %k3, %k2, %k1 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm2, %k1 +; CHECK-NEXT: vcmpgeps %zmm4, %zmm3, %k2 +; CHECK-NEXT: korw %k2, %k1, %k1 ; CHECK-NEXT: korw %k1, %k0, %k0 ; CHECK-NEXT: kmovw %k0, %eax ; CHECK-NEXT: # kill: %AX %AX %EAX diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll index 4890afec2164..c03623a2f035 100644 --- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll +++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll @@ -852,16 +852,16 @@ define <16 x float> @gather_mask_test(<16 x i32> %ind, <16 x float> %src, i8* %b ; CHECK-NEXT: kxorw %k0, %k0, %k1 ; CHECK-NEXT: vmovaps %zmm1, %zmm3 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: movw $1, %ax ; CHECK-NEXT: kmovd %eax, %k1 -; CHECK-NEXT: vmovaps %zmm1, %zmm4 -; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm4 {%k1} +; CHECK-NEXT: vmovaps %zmm1, %zmm3 +; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm3 {%k1} ; CHECK-NEXT: movw $220, %ax ; CHECK-NEXT: kmovd %eax, %k1 ; CHECK-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm0 -; CHECK-NEXT: vaddps %zmm4, %zmm1, %zmm1 -; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm3, %zmm1, %zmm0 +; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 -1, i32 4) %res1 = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 0, i32 4) diff --git a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll index 0e7a8d25c56f..56962ca2671d 100644 --- a/test/CodeGen/X86/avx512-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512-intrinsics-upgrade.ll @@ -9,8 +9,8 @@ define <16 x float> @test_x86_vbroadcast_ss_ps_512(<4 x float> %a0, <16 x float> ; CHECK-NEXT: vbroadcastss %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastss %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vbroadcastss %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -30,8 +30,8 @@ define <8 x double> @test_x86_vbroadcast_sd_pd_512(<2 x double> %a0, <8 x double ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vbroadcastsd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vbroadcastsd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq @@ -51,8 +51,8 @@ define <16 x i32>@test_int_x86_avx512_pbroadcastd_512(<4 x i32> %x0, <16 x i32> ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastd %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpbroadcastd %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: 
retq %res = call <16 x i32> @llvm.x86.avx512.pbroadcastd.512(<4 x i32> %x0, <16 x i32> %x1, i16 -1) @@ -71,8 +71,8 @@ define <8 x i64>@test_int_x86_avx512_pbroadcastq_512(<2 x i64> %x0, <8 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm2 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpbroadcastq %xmm0, %zmm1 {%k1} -; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpbroadcastq %xmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.pbroadcastq.512(<2 x i64> %x0, <8 x i64> %x1,i8 -1) @@ -91,8 +91,8 @@ define <16 x float>@test_int_x86_avx512_mask_movsldup_512(<16 x float> %x0, <16 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovsldup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] -; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovsldup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6,8,8,10,10,12,12,14,14] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movsldup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -111,8 +111,8 @@ define <16 x float>@test_int_x86_avx512_mask_movshdup_512(<16 x float> %x0, <16 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm2 = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovshdup {{.*#+}} zmm1 {%k1} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] -; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovshdup {{.*#+}} zmm0 {%k1} {z} = zmm0[1,1,3,3,5,5,7,7,9,9,11,11,13,13,15,15] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.movshdup.512(<16 x float> %x0, <16 x float> %x1, i16 %x2) @@ -131,8 +131,8 @@ define <8 x double>@test_int_x86_avx512_mask_movddup_512(<8 x double> %x0, <8 x ; CHECK-NEXT: vmovddup {{.*#+}} zmm2 = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vmovddup {{.*#+}} zmm1 {%k1} = zmm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vaddpd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vmovddup {{.*#+}} zmm0 {%k1} {z} = zmm0[0,0,2,2,4,4,6,6] ; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.movddup.512(<8 x double> %x0, <8 x double> %x1, i8 %x2) @@ -671,9 +671,9 @@ define <8 x i64>@test_int_x86_avx512_mask_punpcklqd_q_512(<8 x i64> %x0, <8 x i6 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm3 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm2 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vpunpcklqdq {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[2],zmm1[2],zmm0[4],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 %x3) %res1 = call <8 x i64> @llvm.x86.avx512.mask.punpcklqd.q.512(<8 x i64> %x0, <8 x i64> %x1, <8 x i64> %x2, i8 -1) @@ -1616,9 +1616,9 @@ define <8 x 
double>@test_int_x86_avx512_mask_shuf_pd_512(<8 x double> %x0, <8 x ; CHECK-NEXT: vshufpd {{.*#+}} zmm3 = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vshufpd {{.*#+}} zmm2 {%k1} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vshufpd {{.*#+}} zmm0 {%k1} {z} = zmm0[0],zmm1[1],zmm0[3],zmm1[2],zmm0[5],zmm1[4],zmm0[6],zmm1[6] -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0 +; CHECK-NEXT: vaddpd %zmm0, %zmm2, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.shuf.pd.512(<8 x double> %x0, <8 x double> %x1, i32 22, <8 x double> %x3, i8 -1) @@ -2031,8 +2031,8 @@ define <8 x i64>@test_int_x86_avx512_mask_psrl_qi_512(<8 x i64> %x0, i32 %x1, <8 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpsrlq $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.psrl.qi.512(<8 x i64> %x0, i32 4, <8 x i64> %x2, i8 %x3) @@ -2051,8 +2051,8 @@ define <16 x i32>@test_int_x86_avx512_mask_psrl_di_512(<16 x i32> %x0, i32 %x1, ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm2 ; CHECK-NEXT: kmovw %esi, %k1 ; CHECK-NEXT: vpsrld $4, %zmm0, %zmm1 {%k1} -; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1 +; CHECK-NEXT: vpsrld $4, %zmm0, %zmm0 {%k1} {z} ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.psrl.di.512(<16 x i32> %x0, i32 4, <16 x i32> %x2, i16 %x3) @@ -2651,8 +2651,8 @@ define <16 x float>@test_int_x86_avx512_mask_vpermilvar_ps_512_constant_pool(<16 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vpermilps {{.*#+}} zmm2 {%k1} = zmm0[2,3,0,1,7,6,5,4,9,8,11,10,12,13,14,15] ; CHECK-NEXT: vpermilps {{.*#+}} zmm1 {%k1} {z} = zmm0[1,0,3,2,4,5,6,7,9,8,11,10,12,13,14,15] -; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; CHECK-NEXT: vaddps %zmm1, %zmm2, %zmm1 +; CHECK-NEXT: vpermilps {{.*#+}} zmm0 = zmm0[1,0,3,2,4,5,6,7,10,11,8,9,14,15,13,12] ; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.vpermilvar.ps.512(<16 x float> %x0, <16 x i32> , <16 x float> %x2, i16 %x3) @@ -2989,9 +2989,9 @@ define <16 x float>@test_int_x86_avx512_mask_insertf32x4_512(<16 x float> %x0, < ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddps %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddps %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddps %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 %x4) %res1 = call <16 x float> @llvm.x86.avx512.mask.insertf32x4.512(<16 x float> %x0, <4 x float> %x1, i32 1, <16 x float> %x3, i16 -1) @@ -3010,9 +3010,9 @@ define <16 x i32>@test_int_x86_avx512_mask_inserti32x4_512(<16 x i32> %x0, <4 x ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; 
CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 %x4) %res1 = call <16 x i32> @llvm.x86.avx512.mask.inserti32x4.512(<16 x i32> %x0, <4 x i32> %x1, i32 1, <16 x i32> %x3, i16 -1) @@ -3030,9 +3030,9 @@ define <8 x double>@test_int_x86_avx512_mask_insertf64x4_512(<8 x double> %x0, < ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinsertf64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vaddpd %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vaddpd %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vaddpd %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 %x4) %res1 = call <8 x double> @llvm.x86.avx512.mask.insertf64x4.512(<8 x double> %x0, <4 x double> %x1, i32 1, <8 x double> %x3, i8 -1) @@ -3050,9 +3050,9 @@ define <8 x i64>@test_int_x86_avx512_mask_inserti64x4_512(<8 x i64> %x0, <4 x i6 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm3 ; CHECK-NEXT: kmovw %edi, %k1 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm2 {%k1} +; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm2 ; CHECK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 {%k1} {z} -; CHECK-NEXT: vpaddq %zmm3, %zmm2, %zmm1 -; CHECK-NEXT: vpaddq %zmm1, %zmm0, %zmm0 +; CHECK-NEXT: vpaddq %zmm2, %zmm0, %zmm0 ; CHECK-NEXT: retq %res = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 %x4) %res1 = call <8 x i64> @llvm.x86.avx512.mask.inserti64x4.512(<8 x i64> %x0, <4 x i64> %x1, i32 1, <8 x i64> %x3, i8 -1) diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll index cc5e9e038e0b..f800d01064ba 100644 --- a/test/CodeGen/X86/avx512-intrinsics.ll +++ b/test/CodeGen/X86/avx512-intrinsics.ll @@ -274,11 +274,11 @@ define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> % ; CHECK-NEXT: vmovaps %xmm2, %xmm3 ; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4) %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1) @@ -301,11 +301,11 @@ define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x doubl ; CHECK-NEXT: vmovapd %xmm2, %xmm3 ; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1} ; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1} -; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z} +; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2 +; CHECK-NEXT: 
vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z} ; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0 -; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1 -; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0 -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0 ; CHECK-NEXT: retq %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4) %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1) @@ -477,11 +477,11 @@ declare i64 @llvm.x86.avx512.cvttss2usi64(<4 x float>, i32) nounwind readnone define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %rcx -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2usi %xmm0, %rax +; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double> %a0, i32 4) @@ -496,11 +496,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2usi64(<2 x double>, i32) nounwind readnone define i64 @test_x86_avx512_cvtsd2si64(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2si %xmm0, %rcx -; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtsd2si %xmm0, %rax +; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double> %a0, i32 4) @@ -515,11 +515,11 @@ declare i64 @llvm.x86.avx512.vcvtsd2si64(<2 x double>, i32) nounwind readnone define i64 @test_x86_avx512_cvtss2usi64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2usi64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2usi %xmm0, %rcx -; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2usi %xmm0, %rax +; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float> %a0, i32 4) @@ -534,11 +534,11 @@ declare i64 @llvm.x86.avx512.vcvtss2usi64(<4 x float>, i32) nounwind readnone define i64 @test_x86_avx512_cvtss2si64(<4 x float> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtss2si64: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtss2si %xmm0, %rcx -; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rax -; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rdx +; CHECK-NEXT: vcvtss2si %xmm0, %rax +; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %rcx +; CHECK-NEXT: addq %rax, %rcx +; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %rax ; CHECK-NEXT: addq %rcx, %rax -; CHECK-NEXT: addq %rdx, %rax ; CHECK-NEXT: retq %res = call i64 @llvm.x86.avx512.vcvtss2si64(<4 x float> %a0, i32 4) @@ -553,11 +553,11 @@ declare i64 @llvm.x86.avx512.vcvtss2si64(<4 x float>, i32) nounwind readnone define i32 @test_x86_avx512_cvtsd2usi32(<2 x double> %a0) { ; CHECK-LABEL: test_x86_avx512_cvtsd2usi32: ; CHECK: ## BB#0: -; CHECK-NEXT: vcvtsd2usi %xmm0, %ecx -; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %eax 
-; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtsd2usi %xmm0, %eax
+; CHECK-NEXT: vcvtsd2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtsd2usi {rd-sae}, %xmm0, %eax
 ; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: retq
 %res = call i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double> %a0, i32 4)
@@ -572,11 +572,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2usi32(<2 x double>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtsd2si32(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtsd2si32:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtsd2si %xmm0, %ecx
-; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtsd2si %xmm0, %eax
+; CHECK-NEXT: vcvtsd2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtsd2si {rd-sae}, %xmm0, %eax
 ; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: retq
 %res = call i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double> %a0, i32 4)
@@ -591,11 +591,11 @@ declare i32 @llvm.x86.avx512.vcvtsd2si32(<2 x double>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtss2usi32(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2usi32:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2usi %xmm0, %ecx
-; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtss2usi %xmm0, %eax
+; CHECK-NEXT: vcvtss2usi {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtss2usi {rd-sae}, %xmm0, %eax
 ; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: retq
 %res = call i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float> %a0, i32 4)
@@ -610,11 +610,11 @@ declare i32 @llvm.x86.avx512.vcvtss2usi32(<4 x float>, i32) nounwind readnone
 define i32 @test_x86_avx512_cvtss2si32(<4 x float> %a0) {
 ; CHECK-LABEL: test_x86_avx512_cvtss2si32:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: vcvtss2si %xmm0, %ecx
-; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %eax
-; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %edx
+; CHECK-NEXT: vcvtss2si %xmm0, %eax
+; CHECK-NEXT: vcvtss2si {rz-sae}, %xmm0, %ecx
+; CHECK-NEXT: addl %eax, %ecx
+; CHECK-NEXT: vcvtss2si {rd-sae}, %xmm0, %eax
 ; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: addl %edx, %eax
 ; CHECK-NEXT: retq
 %res = call i32 @llvm.x86.avx512.vcvtss2si32(<4 x float> %a0, i32 4)
@@ -683,8 +683,9 @@ define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0, <16 x i16> %src, i16
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm1 {%k1}
 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, %ymm2 {%k1} {z}
+; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1
 ; CHECK-NEXT: vcvtps2ph $2, %zmm0, (%rsi)
-; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm0
+; CHECK-NEXT: vmovdqa %ymm1, %ymm0
 ; CHECK-NEXT: retq
 %res1 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
 %res2 = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 %mask)
@@ -3656,11 +3657,11 @@ define <4 x float> @test_getexp_ss(<4 x float> %a0, <4 x float> %a1, <4 x float>
 ; CHECK-NEXT: vmovaps %xmm2, %xmm3
 ; CHECK-NEXT: vgetexpss %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
+; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
 ; CHECK-NEXT: vgetexpss {sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1
-; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
 ; CHECK-NEXT: retq
 %res0 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask.getexp.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 8)
@@ -3684,10 +3685,10 @@ define <2 x double> @test_getexp_sd(<2 x double> %a0, <2 x double> %a1, <2 x dou
 ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vgetexpsd %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm2
 ; CHECK-NEXT: vgetexpsd {sae}, %xmm1, %xmm0, %xmm0 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1
 ; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm2, %xmm0
 ; CHECK-NEXT: retq
 %res0 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask.getexp.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 8)
@@ -3903,11 +3904,11 @@ define <2 x double>@test_int_x86_avx512_mask_getmant_sd(<2 x double> %x0, <2 x d
 ; CHECK-NEXT: vmovapd %xmm2, %xmm3
 ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4 {%k1} {z}
-; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm5
+; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3
+; CHECK-NEXT: vgetmantsd $11, %xmm1, %xmm0, %xmm4
 ; CHECK-NEXT: vgetmantsd $11, {sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> %x2, i8 %x3, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask.getmant.sd(<2 x double> %x0, <2 x double> %x1, i32 11, <2 x double> zeroinitializer, i8 %x3, i32 4)
@@ -3928,11 +3929,11 @@ define <4 x float>@test_int_x86_avx512_mask_getmant_ss(<4 x float> %x0, <4 x flo
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm2 {%k1}
 ; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3 {%k1} {z}
-; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm4
+; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm2
+; CHECK-NEXT: vgetmantss $11, %xmm1, %xmm0, %xmm3
 ; CHECK-NEXT: vgetmantss $11, {sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm3, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm2, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> %x2, i8 %x3, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask.getmant.ss(<4 x float> %x0, <4 x float> %x1, i32 11, <4 x float> zeroinitializer, i8 %x3, i32 4)
@@ -4434,8 +4435,8 @@ define <16 x i32>@test_int_x86_avx512_mask_prol_d_512(<16 x i32> %x0, i32 %x1, <
 ; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vprold $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT: vprold $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprold $3, %zmm0, %zmm0
 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vprold $3, %zmm0, %zmm0
 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x i32> @llvm.x86.avx512.mask.prol.d.512(<16 x i32> %x0, i32 3, <16 x i32> %x2, i16 %x3)
@@ -4454,8 +4455,8 @@ define <8 x i64>@test_int_x86_avx512_mask_prol_q_512(<8 x i64> %x0, i32 %x1, <8
 ; CHECK-NEXT: kmovw %esi, %k1
 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm1 {%k1}
 ; CHECK-NEXT: vprolq $3, %zmm0, %zmm2 {%k1} {z}
-; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
 ; CHECK-NEXT: vpaddq %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vprolq $3, %zmm0, %zmm0
 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x i64> @llvm.x86.avx512.mask.prol.q.512(<8 x i64> %x0, i32 3, <8 x i64> %x2, i8 %x3)
@@ -4556,9 +4557,9 @@ define <8 x double>@test_int_x86_avx512_mask_fixupimm_pd_512(<8 x double> %x0, <
 ; CHECK-NEXT: vfixupimmpd $4, %zmm2, %zmm1, %zmm3 {%k1}
 ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
 ; CHECK-NEXT: vfixupimmpd $5, %zmm2, %zmm1, %zmm4 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm3
 ; CHECK-NEXT: vfixupimmpd $3, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm4, %zmm3, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 4, i8 %x4, i32 4)
 %res1 = call <8 x double> @llvm.x86.avx512.mask.fixupimm.pd.512(<8 x double> zeroinitializer, <8 x double> %x1, <8 x i64> %x2, i32 5, i8 %x4, i32 4)
@@ -4579,9 +4580,9 @@ define <8 x double>@test_int_x86_avx512_maskz_fixupimm_pd_512(<8 x double> %x0,
 ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
 ; CHECK-NEXT: vmovapd %zmm0, %zmm5
 ; CHECK-NEXT: vfixupimmpd $5, %zmm4, %zmm1, %zmm5 {%k1} {z}
+; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm3
 ; CHECK-NEXT: vfixupimmpd $2, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddpd %zmm5, %zmm3, %zmm1
-; CHECK-NEXT: vaddpd %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddpd %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: retq
 %res = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> %x2, i32 3, i8 %x4, i32 4)
 %res1 = call <8 x double> @llvm.x86.avx512.maskz.fixupimm.pd.512(<8 x double> %x0, <8 x double> %x1, <8 x i64> zeroinitializer, i32 5, i8 %x4, i32 4)
@@ -4603,9 +4604,9 @@ define <4 x float>@test_int_x86_avx512_mask_fixupimm_ss(<4 x float> %x0, <4 x fl
 ; CHECK-NEXT: vxorps %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vmovaps %xmm0, %xmm5
 ; CHECK-NEXT: vfixupimmss $5, %xmm4, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm3
 ; CHECK-NEXT: vfixupimmss $5, {sae}, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm3, %xmm1
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> %x2, i32 5, i8 %x4, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask.fixupimm.ss(<4 x float> %x0, <4 x float> %x1, <4 x i32> zeroinitializer, i32 5, i8 %x4, i32 4)
@@ -4650,9 +4651,9 @@ define <16 x float>@test_int_x86_avx512_mask_fixupimm_ps_512(<16 x float> %x0, <
 ; CHECK-NEXT: vpxord %zmm4, %zmm4, %zmm4
 ; CHECK-NEXT: vmovaps %zmm0, %zmm5
 ; CHECK-NEXT: vfixupimmps $5, %zmm4, %zmm1, %zmm5 {%k1}
+; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm3
 ; CHECK-NEXT: vfixupimmps $5, {sae}, %zmm2, %zmm1, %zmm0
-; CHECK-NEXT: vaddps %zmm5, %zmm3, %zmm1
-; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
+; CHECK-NEXT: vaddps %zmm0, %zmm3, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> %x2, i32 5, i16 %x4, i32 4)
 %res1 = call <16 x float> @llvm.x86.avx512.mask.fixupimm.ps.512(<16 x float> %x0, <16 x float> %x1, <16 x i32> zeroinitializer, i32 5, i16 %x4, i32 4)
@@ -4721,9 +4722,9 @@ define <2 x double>@test_int_x86_avx512_maskz_fixupimm_sd(<2 x double> %x0, <2 x
 ; CHECK-NEXT: vxorpd %xmm4, %xmm4, %xmm4
 ; CHECK-NEXT: vmovapd %xmm0, %xmm5
 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm4, %xmm1, %xmm5 {%k1} {z}
+; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm3
 ; CHECK-NEXT: vfixupimmsd $5, {sae}, %xmm2, %xmm1, %xmm0 {%k1} {z}
-; CHECK-NEXT: vaddpd %xmm5, %xmm3, %xmm1
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> %x2, i32 5, i8 %x4, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.maskz.fixupimm.sd(<2 x double> %x0, <2 x double> %x1, <2 x i64> zeroinitializer, i32 5, i8 %x4, i32 8)
@@ -4821,12 +4822,12 @@ define <2 x double>@test_int_x86_avx512_mask_vfmadd_sd(<2 x double> %x0, <2 x do
 ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vmovapd %xmm0, %xmm4
 ; CHECK-NEXT: vfmadd213sd %xmm2, %xmm1, %xmm4
-; CHECK-NEXT: vmovapd %xmm0, %xmm5
-; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovapd %xmm0, %xmm4
+; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT: vfmadd213sd {rz-sae}, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm1
-; CHECK-NEXT: vaddpd %xmm5, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -4849,12 +4850,12 @@ define <4 x float>@test_int_x86_avx512_mask_vfmadd_ss(<4 x float> %x0, <4 x floa
 ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm3 {%k1}
 ; CHECK-NEXT: vmovaps %xmm0, %xmm4
 ; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm4
-; CHECK-NEXT: vmovaps %xmm0, %xmm5
-; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovaps %xmm0, %xmm4
+; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm4 {%k1}
 ; CHECK-NEXT: vfmadd213ss {rz-sae}, %xmm2, %xmm1, %xmm0
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm1
-; CHECK-NEXT: vaddps %xmm5, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -4909,12 +4910,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmadd_sd(<2 x double> %x0, <2 x d
 ; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vmovapd %xmm2, %xmm4
 ; CHECK-NEXT: vfmadd231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovapd %xmm2, %xmm5
-; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT: vfmadd231sd {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmadd.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -4937,12 +4938,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmadd_ss(<4 x float> %x0, <4 x flo
 ; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vmovaps %xmm2, %xmm4
 ; CHECK-NEXT: vfmadd231ss %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT: vfmadd231ss {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmadd.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -5069,12 +5070,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfmsub_sd(<2 x double> %x0, <2 x d
 ; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vmovapd %xmm2, %xmm4
 ; CHECK-NEXT: vfmsub231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovapd %xmm2, %xmm5
-; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT: vfmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -5097,12 +5098,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfmsub_ss(<4 x float> %x0, <4 x flo
 ; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vmovaps %xmm2, %xmm4
 ; CHECK-NEXT: vfmsub231ss %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT: vfmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
@@ -5125,12 +5126,12 @@ define <2 x double>@test_int_x86_avx512_mask3_vfnmsub_sd(<2 x double> %x0, <2 x
 ; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vmovapd %xmm2, %xmm4
 ; CHECK-NEXT: vfnmsub231sd %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovapd %xmm2, %xmm5
-; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT: vfnmsub231sd {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddpd %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 -1, i32 4)
 %res1 = call <2 x double> @llvm.x86.avx512.mask3.vfnmsub.sd(<2 x double> %x0, <2 x double> %x1, <2 x double> %x2, i8 %x3, i32 4)
@@ -5153,12 +5154,12 @@ define <4 x float>@test_int_x86_avx512_mask3_vfnmsub_ss(<4 x float> %x0, <4 x fl
 ; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm3 {%k1}
 ; CHECK-NEXT: vmovaps %xmm2, %xmm4
 ; CHECK-NEXT: vfnmsub231ss %xmm1, %xmm0, %xmm4
-; CHECK-NEXT: vmovaps %xmm2, %xmm5
-; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm5 {%k1}
+; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm4 {%k1}
 ; CHECK-NEXT: vfnmsub231ss {rz-sae}, %xmm1, %xmm0, %xmm2
-; CHECK-NEXT: vaddps %xmm3, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm5, %xmm2, %xmm1
-; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm4, %xmm2, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 -1, i32 4)
 %res1 = call <4 x float> @llvm.x86.avx512.mask3.vfnmsub.ss(<4 x float> %x0, <4 x float> %x1, <4 x float> %x2, i8 %x3, i32 4)
diff --git a/test/CodeGen/X86/avx512-mask-spills.ll b/test/CodeGen/X86/avx512-mask-spills.ll
index 4ef88ac495c3..96aefdb10584 100644
--- a/test/CodeGen/X86/avx512-mask-spills.ll
+++ b/test/CodeGen/X86/avx512-mask-spills.ll
@@ -9,13 +9,11 @@ define <4 x i1> @test_4i1(<4 x i32> %a, <4 x i32> %b) {
 ; CHECK-NEXT: Lcfi0:
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleud %xmm1, %xmm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k0
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; CHECK-NEXT: callq _f
 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: vpmovm2d %k0, %xmm0
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
@@ -34,14 +32,12 @@ define <8 x i1> @test_8i1(<8 x i32> %a, <8 x i32> %b) {
 ; CHECK-NEXT: Lcfi1:
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleud %ymm1, %ymm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k0
+; CHECK-NEXT: vpcmpgtd %ymm1, %ymm0, %k1
+; CHECK-NEXT: korb %k1, %k0, %k0
 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq _f
 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korb %k1, %k0, %k0
 ; CHECK-NEXT: vpmovm2w %k0, %xmm0
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
@@ -60,14 +56,12 @@ define <16 x i1> @test_16i1(<16 x i32> %a, <16 x i32> %b) {
 ; CHECK-NEXT: Lcfi2:
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleud %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
-; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpgtd %zmm1, %zmm0, %k1
+; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: kmovw %k0, {{[0-9]+}}(%rsp) ## 2-byte Spill
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq _f
 ; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k0 ## 2-byte Reload
-; CHECK-NEXT: kmovw {{[0-9]+}}(%rsp), %k1 ## 2-byte Reload
-; CHECK-NEXT: korw %k1, %k0, %k0
 ; CHECK-NEXT: vpmovm2b %k0, %xmm0
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
@@ -85,14 +79,12 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
 ; CHECK-NEXT: Lcfi3:
 ; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0
+; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k1
+; CHECK-NEXT: kord %k1, %k0, %k0
 ; CHECK-NEXT: kmovd %k0, {{[0-9]+}}(%rsp) ## 4-byte Spill
-; CHECK-NEXT: vpcmpgtw %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovd %k0, (%rsp) ## 4-byte Spill
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq _f
 ; CHECK-NEXT: kmovd {{[0-9]+}}(%rsp), %k0 ## 4-byte Reload
-; CHECK-NEXT: kmovd (%rsp), %k1 ## 4-byte Reload
-; CHECK-NEXT: kord %k1, %k0, %k0
 ; CHECK-NEXT: vpmovm2b %k0, %ymm0
 ; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
@@ -106,20 +98,18 @@ define <32 x i1> @test_32i1(<32 x i16> %a, <32 x i16> %b) {
 define <64 x i1> @test_64i1(<64 x i8> %a, <64 x i8> %b) {
 ; CHECK-LABEL: test_64i1:
 ; CHECK: ## BB#0:
-; CHECK-NEXT: subq $24, %rsp
+; CHECK-NEXT: pushq %rax
 ; CHECK-NEXT: Lcfi4:
-; CHECK-NEXT: .cfi_def_cfa_offset 32
+; CHECK-NEXT: .cfi_def_cfa_offset 16
 ; CHECK-NEXT: vpcmpnleub %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
-; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k0
-; CHECK-NEXT: kmovq %k0, {{[0-9]+}}(%rsp) ## 8-byte Spill
+; CHECK-NEXT: vpcmpgtb %zmm1, %zmm0, %k1
+; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: kmovq %k0, (%rsp) ## 8-byte Spill
 ; CHECK-NEXT: vzeroupper
 ; CHECK-NEXT: callq _f
-; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k0 ## 8-byte Reload
-; CHECK-NEXT: kmovq {{[0-9]+}}(%rsp), %k1 ## 8-byte Reload
-; CHECK-NEXT: korq %k1, %k0, %k0
+; CHECK-NEXT: kmovq (%rsp), %k0 ## 8-byte Reload
 ; CHECK-NEXT: vpmovm2b %k0, %zmm0
-; CHECK-NEXT: addq $24, %rsp
+; CHECK-NEXT: popq %rax
 ; CHECK-NEXT: retq
 %cmp_res = icmp ugt <64 x i8> %a, %b
diff --git a/test/CodeGen/X86/avx512-scalar_mask.ll b/test/CodeGen/X86/avx512-scalar_mask.ll
new file mode 100644
index 000000000000..47c6813fa8dc
--- /dev/null
+++ b/test/CodeGen/X86/avx512-scalar_mask.ll
@@ -0,0 +1,107 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+declare <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
+
+define <4 x float>@test_var_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) {
+; CHECK-LABEL: test_var_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_var_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2, i8 %mask) {
+; CHECK-LABEL: test_var_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 %mask, i32 4)
+ ret < 4 x float> %res
+}
+
+; FIXME: we should just return %xmm0 here.
+define <4 x float>@test_const0_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const0_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4)
+ ret < 4 x float> %res
+}
+
+; FIXME: we should zero the lower element of xmm0 and return it.
+define <4 x float>@test_const0_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const0_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 0, i32 4)
+ ret < 4 x float> %res
+}
+
+; FIXME: we should just return %xmm0 here.
+define <4 x float>@test_const2_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const2_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4)
+ ret < 4 x float> %res
+}
+
+; FIXME: we should zero the lower element of xmm0 and return it.
+define <4 x float>@test_const2_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const2_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: kxorw %k0, %k0, %k1
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0 {%k1} {z}
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 2, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_allone_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_allone_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_allone_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_allone_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 -1, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_3_mask(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_3_mask:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4)
+ ret < 4 x float> %res
+}
+
+define <4 x float>@test_const_3_maskz(<4 x float> %v0, <4 x float> %v1, <4 x float> %v2) {
+; CHECK-LABEL: test_const_3_maskz:
+; CHECK: ## BB#0:
+; CHECK-NEXT: vfmadd213ss %xmm2, %xmm1, %xmm0
+; CHECK-NEXT: retq
+ %res = call <4 x float> @llvm.x86.avx512.maskz.vfmadd.ss(<4 x float> %v0,<4 x float> %v1, <4 x float> %v2, i8 3, i32 4)
+ ret < 4 x float> %res
+}
diff --git a/test/CodeGen/X86/avx512-vselect.ll b/test/CodeGen/X86/avx512-vselect.ll
new file mode 100644
index 000000000000..1940864824ff
--- /dev/null
+++ b/test/CodeGen/X86/avx512-vselect.ll
@@ -0,0 +1,61 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mcpu=skx | FileCheck %s --check-prefixes=CHECK,CHECK-SKX
+; RUN: llc < %s -mcpu=knl | FileCheck %s --check-prefixes=CHECK,CHECK-KNL
+
+target triple = "x86_64-unknown-unknown"
+
+define <8 x i64> @test1(<8 x i64> %m, <8 x i64> %a, <8 x i64> %b) {
+; CHECK-LABEL: test1:
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: vpsllq $63, %zmm0, %zmm0
+; CHECK-NEXT: vptestmq %zmm0, %zmm0, %k1
+; CHECK-NEXT: vpblendmq %zmm1, %zmm2, %zmm0 {%k1}
+; CHECK-NEXT: retq
+entry:
+ %m.trunc = trunc <8 x i64> %m to <8 x i1>
+ %ret = select <8 x i1> %m.trunc, <8 x i64> %a, <8 x i64> %b
+ ret <8 x i64> %ret
+}
+
+; This is a very contrived test case to trick the legalizer into splitting the
+; v16i1 masks in the select during type legalization, and in so doing extend them
+; into two v8i64 types. This lets us ensure that the lowering code can handle
+; both formulations of vselect. All of this trickery is because we can't
+; directly form an SDAG input to the lowering.
+define <16 x double> @test2(<16 x float> %x, <16 x float> %y, <16 x double> %a, <16 x double> %b) {
+; CHECK-SKX-LABEL: test2:
+; CHECK-SKX: # BB#0: # %entry
+; CHECK-SKX-NEXT: vxorps %zmm6, %zmm6, %zmm6
+; CHECK-SKX-NEXT: vcmpltps %zmm0, %zmm6, %k0
+; CHECK-SKX-NEXT: vcmpltps %zmm6, %zmm1, %k1
+; CHECK-SKX-NEXT: korw %k1, %k0, %k0
+; CHECK-SKX-NEXT: kshiftrw $8, %k0, %k1
+; CHECK-SKX-NEXT: vpmovm2q %k1, %zmm1
+; CHECK-SKX-NEXT: vpmovm2q %k0, %zmm0
+; CHECK-SKX-NEXT: vptestmq %zmm0, %zmm0, %k1
+; CHECK-SKX-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
+; CHECK-SKX-NEXT: vptestmq %zmm1, %zmm1, %k1
+; CHECK-SKX-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
+; CHECK-SKX-NEXT: retq
+;
+; CHECK-KNL-LABEL: test2:
+; CHECK-KNL: # BB#0: # %entry
+; CHECK-KNL-NEXT: vpxord %zmm6, %zmm6, %zmm6
+; CHECK-KNL-NEXT: vcmpltps %zmm0, %zmm6, %k0
+; CHECK-KNL-NEXT: vcmpltps %zmm6, %zmm1, %k1
+; CHECK-KNL-NEXT: korw %k1, %k0, %k1
+; CHECK-KNL-NEXT: kshiftrw $8, %k1, %k2
+; CHECK-KNL-NEXT: vpternlogq $255, %zmm1, %zmm1, %zmm1 {%k2} {z}
+; CHECK-KNL-NEXT: vpternlogq $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; CHECK-KNL-NEXT: vptestmq %zmm0, %zmm0, %k1
+; CHECK-KNL-NEXT: vblendmpd %zmm2, %zmm4, %zmm0 {%k1}
+; CHECK-KNL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; CHECK-KNL-NEXT: vblendmpd %zmm3, %zmm5, %zmm1 {%k1}
+; CHECK-KNL-NEXT: retq
+entry:
+ %gt.m = fcmp ogt <16 x float> %x, zeroinitializer
+ %lt.m = fcmp olt <16 x float> %y, zeroinitializer
+ %m.or = or <16 x i1> %gt.m, %lt.m
+ %ret = select <16 x i1> %m.or, <16 x double> %a, <16 x double> %b
+ ret <16 x double> %ret
+}
diff --git a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 9b4e73a18fc2..faa055dfbbf3 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -796,9 +796,9 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_w_512(<32 x i16> %x0, <8 x i16>
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
 ; AVX512BW-NEXT: kmovd %edi, %k1
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_psrl_w_512:
@@ -806,9 +806,9 @@
 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm3
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512F-32-NEXT: vpsrlw %xmm1, %zmm0, %zmm0 {%k1} {z}
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT: retl
 %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 %x3)
 %res1 = call <32 x i16> @llvm.x86.avx512.mask.psrl.w.512(<32 x i16> %x0, <8 x i16> %x1, <32 x i16> %x2, i32 -1)
@@ -826,8 +826,8 @@ define <32 x i16>@test_int_x86_avx512_mask_psrl_wi_512(<32 x i16> %x0, i32 %x1,
 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm2
 ; AVX512BW-NEXT: kmovd %esi, %k1
 ; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512BW-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
 ; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512BW-NEXT: retq
 ;
@@ -836,8 +836,8 @@
 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm2
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm1 {%k1}
-; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT: vpaddw %zmm2, %zmm1, %zmm1
+; AVX512F-32-NEXT: vpsrlw $3, %zmm0, %zmm0 {%k1} {z}
 ; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
 ; AVX512F-32-NEXT: retl
 %res = call <32 x i16> @llvm.x86.avx512.mask.psrl.wi.512(<32 x i16> %x0, i32 3, <32 x i16> %x2, i32 %x3)
diff --git a/test/CodeGen/X86/avx512bw-intrinsics.ll b/test/CodeGen/X86/avx512bw-intrinsics.ll
index 3337f42eb142..13b850ccc3b6 100644
--- a/test/CodeGen/X86/avx512bw-intrinsics.ll
+++ b/test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -2159,9 +2159,9 @@ define <32 x i16>@test_int_x86_avx512_mask_dbpsadbw_512(<64 x i8> %x0, <64 x i8>
 ; AVX512BW-NEXT: kmovd %edi, %k1
 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_dbpsadbw_512:
@@ -2169,9 +2169,9 @@
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm2 {%k1}
 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512F-32-NEXT: vdbpsadbw $2, %zmm1, %zmm0, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT: retl
 %res = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> %x3, i32 %x4)
 %res1 = call <32 x i16> @llvm.x86.avx512.mask.dbpsadbw.512(<64 x i8> %x0, <64 x i8> %x1, i32 2, <32 x i16> zeroinitializer, i32 %x4)
@@ -2411,9 +2411,9 @@ define <32 x i16>@test_int_x86_avx512_mask_permvar_hi_512(<32 x i16> %x0, <32 x
 ; AVX512BW-NEXT: kmovd %edi, %k1
 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512BW-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512BW-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512BW-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512BW-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512BW-NEXT: retq
 ;
 ; AVX512F-32-LABEL: test_int_x86_avx512_mask_permvar_hi_512:
@@ -2421,9 +2421,9 @@
 ; AVX512F-32-NEXT: kmovd {{[0-9]+}}(%esp), %k1
 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm2 {%k1}
 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm3 {%k1} {z}
+; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm2
 ; AVX512F-32-NEXT: vpermw %zmm0, %zmm1, %zmm0
-; AVX512F-32-NEXT: vpaddw %zmm3, %zmm2, %zmm1
-; AVX512F-32-NEXT: vpaddw %zmm0, %zmm1, %zmm0
+; AVX512F-32-NEXT: vpaddw %zmm0, %zmm2, %zmm0
 ; AVX512F-32-NEXT: retl
 %res = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3)
 %res1 = call <32 x i16> @llvm.x86.avx512.mask.permvar.hi.512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> zeroinitializer, i32 %x3)
diff --git a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
index 7df07b0413ed..571f345d4616 100644
--- a/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <32 x i8>@test_int_x86_avx512_pbroadcastb_256(<16 x i8> %x0, <32 x i8> %x
 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x78,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastb %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfc,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <32 x i8> @llvm.x86.avx512.pbroadcastb.256(<16 x i8> %x0, <32 x i8> %x1, i32 -1)
@@ -29,8 +29,8 @@ define <16 x i8>@test_int_x86_avx512_pbroadcastb_128(<16 x i8> %x0, <16 x i8> %x
 ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x78,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastb %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfc,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <16 x i8> @llvm.x86.avx512.pbroadcastb.128(<16 x i8> %x0, <16 x i8> %x1, i16 -1)
@@ -49,8 +49,8 @@ define <16 x i16>@test_int_x86_avx512_pbroadcastw_256(<8 x i16> %x0, <16 x i16>
 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x79,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastw %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfd,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <16 x i16> @llvm.x86.avx512.pbroadcastw.256(<8 x i16> %x0, <16 x i16> %x1, i16 -1)
@@ -69,8 +69,8 @@ define <8 x i16>@test_int_x86_avx512_pbroadcastw_128(<8 x i16> %x0, <8 x i16> %x
 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x79,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastw %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i16> @llvm.x86.avx512.pbroadcastw.128(<8 x i16> %x0, <8 x i16> %x1, i8 -1)
@@ -89,8 +89,8 @@ define <64 x i8>@test_int_x86_avx512_pbroadcastb_512(<16 x i8> %x0, <64 x i8> %x
 ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x78,0xd0]
 ; CHECK-NEXT: kmovq %rdi, %k1 ## encoding: [0xc4,0xe1,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastb %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x78,0xc8]
-; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfc,0xc9]
+; CHECK-NEXT: vpbroadcastb %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x78,0xc0]
 ; CHECK-NEXT: vpaddb %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfc,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <64 x i8> @llvm.x86.avx512.pbroadcastb.512(<16 x i8> %x0, <64 x i8> %x1, i64 -1)
@@ -109,8 +109,8 @@ define <32 x i16>@test_int_x86_avx512_pbroadcastw_512(<8 x i16> %x0, <32 x i16>
 ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm2 ## encoding: [0x62,0xf2,0x7d,0x48,0x79,0xd0]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpbroadcastw %xmm0, %zmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x49,0x79,0xc8]
-; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %zmm1, %zmm2, %zmm1 ## encoding: [0x62,0xf1,0x6d,0x48,0xfd,0xc9]
+; CHECK-NEXT: vpbroadcastw %xmm0, %zmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xc9,0x79,0xc0]
 ; CHECK-NEXT: vpaddw %zmm1, %zmm0, %zmm0 ## encoding: [0x62,0xf1,0x7d,0x48,0xfd,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <32 x i16> @llvm.x86.avx512.pbroadcastw.512(<8 x i16> %x0, <32 x i16> %x1, i32 -1)
@@ -1476,9 +1476,9 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_w_128(<8 x i16> %x0, <8 x i16> %x
 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd1,0xd9]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd1,0xd1]
+; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xd3]
 ; CHECK-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfd,0xcb]
-; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
+; CHECK-NEXT: vpaddw %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc2]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3)
 %res1 = call <8 x i16> @llvm.x86.avx512.mask.psrl.w.128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 -1)
@@ -1496,9 +1496,9 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_w_256(<16 x i16> %x0, <8 x i16>
 ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd1,0xd9]
 ; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
 ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd1,0xd1]
+; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xd3]
 ; CHECK-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd1,0xc1]
-; CHECK-NEXT: vpaddw %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xcb]
-; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
+; CHECK-NEXT: vpaddw %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfd,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 %x3)
 %res1 = call <16 x i16> @llvm.x86.avx512.mask.psrl.w.256(<16 x i16> %x0, <8 x i16> %x1, <16 x i16> %x2, i16 -1)
@@ -1596,8 +1596,8 @@ define <8 x i16>@test_int_x86_avx512_mask_psrl_wi_128(<8 x i16> %x0, i32 %x1, <8
 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x71,0xd0,0x03]
 ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x71,0xd0,0x03]
-; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
 ; CHECK-NEXT: vpaddw %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfd,0xca]
+; CHECK-NEXT: vpsrlw $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x71,0xd0,0x03]
 ; CHECK-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfd,0xc1]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x i16> @llvm.x86.avx512.mask.psrl.wi.128(<8 x i16> %x0, i32 3, <8 x i16> %x2, i8 %x3)
@@ -1616,8 +1616,8 @@ define <16 x i16>@test_int_x86_avx512_mask_psrl_wi_256(<16 x i16> %x0, i32 %x1,
 ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x71,0xd0,0x03]
 ; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
 ; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x71,0xd0,0x03]
-; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
 ; CHECK-NEXT: vpaddw %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xca]
+; CHECK-NEXT: vpsrlw $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x71,0xd0,0x03]
 ; CHECK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfd,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <16 x i16> @llvm.x86.avx512.mask.psrl.wi.256(<16 x i16> %x0, i32 3, <16 x i16> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
index 8f528394f5bd..f8f47c87100a 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics-upgrade.ll
@@ -9,8 +9,8 @@ define <4 x i32>@test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32>
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm2
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %res = call <4 x i32> @llvm.x86.avx512.mask.lzcnt.d.128(<4 x i32> %x0, <4 x i32> %x1, i8 %x2)
diff --git a/test/CodeGen/X86/avx512cdvl-intrinsics.ll b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
index 37aea45e6107..96254f7c95b0 100644
--- a/test/CodeGen/X86/avx512cdvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512cdvl-intrinsics.ll
@@ -7,8 +7,8 @@ define <4 x i32> @test_int_x86_avx512_mask_vplzcnt_d_128(<4 x i32> %x0, <4 x i32
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm2
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vplzcntd %xmm0, %xmm1 {%k1}
-; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vplzcntd %xmm0, %xmm0 {%k1} {z}
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0
 ; CHECK-NEXT: retq
 %1 = call <4 x i32> @llvm.ctlz.v4i32(<4 x i32> %x0, i1 false)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
index c5478dad4224..1377733739fe 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics-upgrade.ll
@@ -40,8 +40,8 @@ define <8 x float>@test_int_x86_avx512_mask_vextractf32x8(<16 x float> %x0, <8 x
 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm2
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm1 {%k1}
-; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vextractf32x8 $1, %zmm0, %ymm0 {%k1} {z}
 ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0
 ; CHECK-NEXT: retq
 %res = call <8 x float> @llvm.x86.avx512.mask.vextractf32x8.512(<16 x float> %x0,i32 1, <8 x float> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512dq-intrinsics.ll b/test/CodeGen/X86/avx512dq-intrinsics.ll
index 000390404b54..97ac0fde10ec 100644
--- a/test/CodeGen/X86/avx512dq-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dq-intrinsics.ll
@@ -414,8 +414,8 @@ define <16 x float>@test_int_x86_avx512_mask_broadcastf32x2_512(<4 x float> %x0,
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vaddps %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vbroadcastf32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vaddps %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x float> @llvm.x86.avx512.mask.broadcastf32x2.512(<4 x float> %x0, <16 x float> %x2, i16 %x3)
@@ -434,8 +434,8 @@ define <16 x i32>@test_int_x86_avx512_mask_broadcasti32x2_512(<4 x i32> %x0, <16
 ; CHECK-NEXT: kmovw %edi, %k1
 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm1 {%k1} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vpaddd %zmm2, %zmm1, %zmm1
+; CHECK-NEXT: vbroadcasti32x2 {{.*#+}} zmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vpaddd %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
 %res = call <16 x i32> @llvm.x86.avx512.mask.broadcasti32x2.512(<4 x i32> %x0, <16 x i32> %x2, i16 %x3)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
index 52a84deebf51..595b3e0ebb86 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics-upgrade.ll
@@ -1568,8 +1568,8 @@ define <2 x double>@test_int_x86_avx512_mask_vextractf64x2_256(<4 x double> %x0,
 ; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x19,0xc2,0x01]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm1 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x19,0xc1,0x01]
-; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
 ; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca]
+; CHECK-NEXT: vextractf64x2 $1, %ymm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x19,0xc0,0x01]
 ; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <2 x double> @llvm.x86.avx512.mask.vextractf64x2.256(<4 x double> %x0,i32 1, <2 x double> %x2, i8 %x3)
@@ -1588,9 +1588,9 @@ define <4 x double>@test_int_x86_avx512_mask_insertf64x2_256(<4 x double> %x0, <
 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x18,0xd1,0x01]
+; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xd3]
 ; CHECK-NEXT: vinsertf64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x18,0xc1,0x01]
-; CHECK-NEXT: vaddpd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xcb]
-; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1]
+; CHECK-NEXT: vaddpd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc2]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 %x4)
 %res1 = call <4 x double> @llvm.x86.avx512.mask.insertf64x2.256(<4 x double> %x0, <2 x double> %x1, i32 1, <4 x double> %x3, i8 -1)
@@ -1608,9 +1608,9 @@ define <4 x i64>@test_int_x86_avx512_mask_inserti64x2_256(<4 x i64> %x0, <2 x i6
 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01]
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0xfd,0x29,0x38,0xd1,0x01]
+; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3]
 ; CHECK-NEXT: vinserti64x2 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0xfd,0xa9,0x38,0xc1,0x01]
-; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb]
-; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0]
+; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 %x4)
 %res1 = call <4 x i64> @llvm.x86.avx512.mask.inserti64x2.256(<4 x i64> %x0, <2 x i64> %x1, i32 1, <4 x i64> %x3, i8 -1)
diff --git a/test/CodeGen/X86/avx512dqvl-intrinsics.ll b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
index ad9ea93c2031..1bfdfd0e634d 100644
--- a/test/CodeGen/X86/avx512dqvl-intrinsics.ll
+++ b/test/CodeGen/X86/avx512dqvl-intrinsics.ll
@@ -635,8 +635,8 @@ define <8 x float>@test_int_x86_avx512_mask_broadcastf32x2_256(<4 x float> %x0,
 ; CHECK-NEXT: ## ymm1 {%k1} = xmm0[0,1,0,1,0,1,0,1]
 ; CHECK-NEXT: vbroadcastf32x2 %xmm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x19,0xd0]
 ; CHECK-NEXT: ## ymm2 {%k1} {z} = xmm0[0,1,0,1,0,1,0,1]
-; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
 ; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca]
+; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xc0]
 ; CHECK-NEXT: vaddps %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <8 x float> @llvm.x86.avx512.mask.broadcastf32x2.256(<4 x float> %x0, <8 x float> %x2, i8 %x3)
@@ -680,8 +680,8 @@ define <4 x i32>@test_int_x86_avx512_mask_broadcasti32x2_128(<4 x i32> %x0, <4 x
 ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x59,0xc8]
 ; CHECK-NEXT: vbroadcasti32x2 %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x59,0xd0]
-; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
 ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca]
+; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xc0]
 ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0]
 ; CHECK-NEXT: retq ## encoding: [0xc3]
 %res = call <4 x i32> @llvm.x86.avx512.mask.broadcasti32x2.128(<4 x i32> %x0, <4 x i32> %x2, i8 %x3)
diff --git a/test/CodeGen/X86/avx512er-intrinsics.ll b/test/CodeGen/X86/avx512er-intrinsics.ll
index ca130bd2b676..b8531e25bfa1 100644
--- a/test/CodeGen/X86/avx512er-intrinsics.ll
+++ b/test/CodeGen/X86/avx512er-intrinsics.ll
@@ -118,78 +118,78 @@ define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
 }
 declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0) {
+define <4 x float> @test_rsqrt28_ss_maskz(<4 x float> %a0, i8 %mask) {
 ; CHECK-LABEL: test_rsqrt28_ss_maskz:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vrsqrt28ss {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0x7d,0x99,0xcd,0xc0]
 ; CHECK-NEXT: retq # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 7, i32 8) ;
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 %mask, i32 8) ;
 ret <4 x float> %res
 }
-define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0) {
+define <4 x float> @test_rsqrt28_ss_mask(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask) {
 ; CHECK-LABEL: test_rsqrt28_ss_mask:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vrsqrt28ss {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0x7d,0x19,0xcd,0xd1]
 ; CHECK-NEXT: vmovaps %xmm2, %xmm0 # encoding: [0xc5,0xf8,0x28,0xc2]
 ; CHECK-NEXT: retq # encoding: [0xc3]
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 7, i32 8) ;
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %b0, <4 x float> %c0, i8 %mask, i32 8) ;
 ret <4 x float> %res
 }
-define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0) {
+define <2 x double> @test_rsqrt28_sd_maskz(<2 x double> %a0, i8 %mask) {
 ; CHECK-LABEL: test_rsqrt28_sd_maskz:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vrsqrt28sd {sae}, %xmm0, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x99,0xcd,0xc0]
 ; CHECK-NEXT: retq # encoding: [0xc3]
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 7, i32 8) ;
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %a0, <2 x double> zeroinitializer, i8 %mask, i32 8) ;
 ret <2 x double> %res
 }
-define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0) {
+define <2 x double> @test_rsqrt28_sd_mask(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask) {
 ; CHECK-LABEL: test_rsqrt28_sd_mask:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT: kmovw %edi, %k1 # encoding: [0xc5,0xf8,0x92,0xcf]
 ; CHECK-NEXT: vrsqrt28sd {sae}, %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf2,0xfd,0x19,0xcd,0xd1]
 ; CHECK-NEXT: vmovapd %xmm2, %xmm0 # encoding: [0xc5,0xf9,0x28,0xc2]
 ; CHECK-NEXT: retq # encoding: [0xc3]
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 7, i32 8) ;
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %b0, <2 x double> %c0, i8 %mask, i32 8) ;
 ret <2 x double> %res
 }
 declare <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr ) {
+define <2 x double> @test_rsqrt28_sd_maskz_mem(<2 x double> %a0, double* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_rsqrt28_sd_maskz_mem:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %esi # encoding: [0x83,0xe6,0x01]
+; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vrsqrt28sd (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x07]
 ; CHECK-NEXT: retq # encoding: [0xc3]
 %mem = load double , double * %ptr, align 8
 %mem_v = insertelement <2 x double> undef, double %mem, i32 0
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ;
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ;
 ret <2 x double> %res
 }
-define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr ) {
+define <2 x double> @test_rsqrt28_sd_maskz_mem_offset(<2 x double> %a0, double* %ptr, i8 %mask) {
 ; CHECK-LABEL: test_rsqrt28_sd_maskz_mem_offset:
 ; CHECK: # BB#0:
-; CHECK-NEXT: kxnorw %k0, %k0, %k0 # encoding: [0xc5,0xfc,0x46,0xc0]
-; CHECK-NEXT: kshiftrw $15, %k0, %k1 # encoding: [0xc4,0xe3,0xf9,0x30,0xc8,0x0f]
+; CHECK-NEXT: andl $1, %esi # encoding: [0x83,0xe6,0x01]
+; CHECK-NEXT: kmovw %esi, %k1 # encoding: [0xc5,0xf8,0x92,0xce]
 ; CHECK-NEXT: vrsqrt28sd 144(%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf2,0xfd,0x89,0xcd,0x47,0x12]
 ; CHECK-NEXT: retq # encoding: [0xc3]
 %ptr1 = getelementptr double, double* %ptr, i32 18
 %mem = load double , double * %ptr1, align 8
 %mem_v = insertelement <2 x double> undef, double %mem, i32 0
- %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 7, i32 4) ;
+ %res = call <2 x double> @llvm.x86.avx512.rsqrt28.sd(<2 x double> %a0, <2 x double> %mem_v, <2 x double> zeroinitializer, i8 %mask, i32 4) ;
 ret <2 x double> %res
 }
diff --git a/test/CodeGen/X86/avx512ifma-intrinsics.ll b/test/CodeGen/X86/avx512ifma-intrinsics.ll
index 30ecc0d2e49e..9659dc6d455a 100644
--- a/test/CodeGen/X86/avx512ifma-intrinsics.ll
+++ b/test/CodeGen/X86/avx512ifma-intrinsics.ll
@@ -13,8 +13,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_512(<8 x i64> %x0, <8 x i
 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1}
 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
@@ -41,8 +41,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_512(<8 x i64> %x0, <8 x
 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm4 {%k1} {z}
 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
 ; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm0 {%k1} {z}
-; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52huq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1
 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0
 ; CHECK-NEXT: retq
@@ -69,8 +69,8 @@ define <8 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_512(<8 x i64> %x0, <8 x i
 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1}
 ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2
 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1}
-; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z}
 ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0
+; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq @@ -97,8 +97,8 @@ define <8 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_512(<8 x i64> %x0, <8 x ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm4 {%k1} {z} ; CHECK-NEXT: vpxord %zmm2, %zmm2, %zmm2 ; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm0, %zmm4, %zmm0 +; CHECK-NEXT: vpmadd52luq %zmm2, %zmm1, %zmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %zmm2, %zmm3, %zmm1 ; CHECK-NEXT: vpaddq %zmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll index 3ca686cef3bf..b2fe6eba88ab 100644 --- a/test/CodeGen/X86/avx512ifmavl-intrinsics.ll +++ b/test/CodeGen/X86/avx512ifmavl-intrinsics.ll @@ -14,8 +14,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_128(<2 x i64> %x0, <2 x i ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -42,8 +42,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52h_uq_256(<4 x i64> %x0, <4 x i ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_128(<2 x i64> %x0, <2 x ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52huq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -98,8 +98,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52h_uq_256(<4 x i64> %x0, <4 x ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm4 {%k1} {z} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52huq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -126,8 +126,8 @@ define <2 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_128(<2 x i64> %x0, <2 x i ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} -; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -154,8 +154,8 @@ define <4 x i64>@test_int_x86_avx512_mask_vpmadd52l_uq_256(<4 x i64> %x0, <4 x i ; CHECK-NEXT: vpmadd52luq %ymm2, 
%ymm1, %ymm4 {%k1} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} -; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq @@ -182,8 +182,8 @@ define <2 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_128(<2 x i64> %x0, <2 x ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm4 {%k1} {z} ; CHECK-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm0, %xmm4, %xmm0 +; CHECK-NEXT: vpmadd52luq %xmm2, %xmm1, %xmm2 {%k1} {z} ; CHECK-NEXT: vpaddq %xmm2, %xmm3, %xmm1 ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: retq @@ -210,8 +210,8 @@ define <4 x i64>@test_int_x86_avx512_maskz_vpmadd52l_uq_256(<4 x i64> %x0, <4 x ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm4 {%k1} {z} ; CHECK-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm0 {%k1} {z} -; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm0, %ymm4, %ymm0 +; CHECK-NEXT: vpmadd52luq %ymm2, %ymm1, %ymm2 {%k1} {z} ; CHECK-NEXT: vpaddq %ymm2, %ymm3, %ymm1 ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; CHECK-NEXT: retq diff --git a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll index 4d906a4fd29a..c2d8df6476b3 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll @@ -30,8 +30,8 @@ define <4 x i32>@test_int_x86_avx512_pbroadcastd_128(<4 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x58,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastd %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x58,0xc8] -; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; CHECK-NEXT: vpaddd %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc9] +; CHECK-NEXT: vpbroadcastd %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x58,0xc0] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.pbroadcastd.128(<4 x i32> %x0, <4 x i32> %x1, i8 -1) @@ -50,8 +50,8 @@ define <4 x i64>@test_int_x86_avx512_pbroadcastq_256(<2 x i64> %x0, <4 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x59,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastq %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x59,0xc8] -; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; CHECK-NEXT: vpaddq %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc9] +; CHECK-NEXT: vpbroadcastq %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x59,0xc0] ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.pbroadcastq.256(<2 x i64> %x0, <4 x i64> %x1,i8 -1) @@ -70,8 +70,8 @@ define <2 x i64>@test_int_x86_avx512_pbroadcastq_128(<2 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm2 ## EVEX TO 
VEX Compression encoding: [0xc4,0xe2,0x79,0x59,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpbroadcastq %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x09,0x59,0xc8] -; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] ; CHECK-NEXT: vpaddq %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc9] +; CHECK-NEXT: vpbroadcastq %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0x89,0x59,0xc0] ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.pbroadcastq.128(<2 x i64> %x0, <2 x i64> %x1,i8 -1) @@ -90,8 +90,8 @@ define <4 x double> @test_x86_vbroadcast_sd_pd_256(<2 x double> %a0, <4 x double ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x19,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastsd %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0xfd,0x29,0x19,0xc8] -; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] ; CHECK-NEXT: vaddpd %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x58,0xc9] +; CHECK-NEXT: vbroadcastsd %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0xfd,0xa9,0x19,0xc0] ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.broadcast.sd.pd.256(<2 x double> %a0, <4 x double> zeroinitializer, i8 -1) @@ -110,8 +110,8 @@ define <8 x float> @test_x86_vbroadcast_ss_ps_256(<4 x float> %a0, <8 x float> % ; CHECK-NEXT: vbroadcastss %xmm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x18,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %ymm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x29,0x18,0xc8] -; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] ; CHECK-NEXT: vaddps %ymm1, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xc9] +; CHECK-NEXT: vbroadcastss %xmm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0xa9,0x18,0xc0] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.256(<4 x float> %a0, <8 x float> zeroinitializer, i8 -1) @@ -130,8 +130,8 @@ define <4 x float> @test_x86_vbroadcast_ss_ps_128(<4 x float> %a0, <4 x float> % ; CHECK-NEXT: vbroadcastss %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xd0] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vbroadcastss %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf2,0x7d,0x09,0x18,0xc8] -; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] ; CHECK-NEXT: vaddps %xmm1, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe8,0x58,0xc9] +; CHECK-NEXT: vbroadcastss %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf2,0x7d,0x89,0x18,0xc0] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.broadcast.ss.ps.128(<4 x float> %a0, <4 x float> zeroinitializer, i8 -1) @@ -152,9 +152,9 @@ define <4 x float>@test_int_x86_avx512_mask_movsldup_128(<4 x float> 
%x0, <4 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovsldup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x12,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0,2,2] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vmovsldup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x12,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0,2,2] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.movsldup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) @@ -175,9 +175,9 @@ define <8 x float>@test_int_x86_avx512_mask_movsldup_256(<8 x float> %x0, <8 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovsldup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x12,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2,4,4,6,6] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vmovsldup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x12,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2,4,4,6,6] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.movsldup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) @@ -198,9 +198,9 @@ define <4 x float>@test_int_x86_avx512_mask_movshdup_128(<4 x float> %x0, <4 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovshdup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x09,0x16,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[1,1,3,3] +; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vmovshdup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0x89,0x16,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1,1,3,3] -; CHECK-NEXT: vaddps %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x58,0xca] ; CHECK-NEXT: vaddps %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x float> @llvm.x86.avx512.mask.movshdup.128(<4 x float> %x0, <4 x float> %x1, i8 %x2) @@ -221,9 +221,9 @@ define <8 x float>@test_int_x86_avx512_mask_movshdup_256(<8 x float> %x0, <8 x f ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovshdup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7e,0x29,0x16,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[1,1,3,3,5,5,7,7] +; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vmovshdup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7e,0xa9,0x16,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[1,1,3,3,5,5,7,7] -; CHECK-NEXT: vaddps %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf4,0x58,0xca] ; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.movshdup.256(<8 x float> %x0, <8 x float> %x1, i8 %x2) @@ -243,9 +243,9 @@ define <2 x double>@test_int_x86_avx512_mask_movddup_128(<2 x double> 
%x0, <2 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovddup %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x09,0x12,0xc8] ; CHECK-NEXT: ## xmm1 {%k1} = xmm0[0,0] +; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] ; CHECK-NEXT: vmovddup %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0x89,0x12,0xc0] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[0,0] -; CHECK-NEXT: vaddpd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xca] ; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.movddup.128(<2 x double> %x0, <2 x double> %x1, i8 %x2) @@ -266,9 +266,9 @@ define <4 x double>@test_int_x86_avx512_mask_movddup_256(<4 x double> %x0, <4 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vmovddup %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xff,0x29,0x12,0xc8] ; CHECK-NEXT: ## ymm1 {%k1} = ymm0[0,0,2,2] +; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] ; CHECK-NEXT: vmovddup %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xff,0xa9,0x12,0xc0] ; CHECK-NEXT: ## ymm0 {%k1} {z} = ymm0[0,0,2,2] -; CHECK-NEXT: vaddpd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xca] ; CHECK-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x58,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.movddup.256(<4 x double> %x0, <4 x double> %x1, i8 %x2) @@ -3209,10 +3209,10 @@ define <2 x double>@test_int_x86_avx512_mask_shuf_pd_128(<2 x double> %x0, <2 x ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xc6,0xd1,0x01] ; CHECK-NEXT: ## xmm2 {%k1} = xmm0[1],xmm1[0] +; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xd3] ; CHECK-NEXT: vshufpd $1, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xc6,0xc1,0x01] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm0[1],xmm1[0] -; CHECK-NEXT: vaddpd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x58,0xcb] -; CHECK-NEXT: vaddpd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc1] +; CHECK-NEXT: vaddpd %xmm2, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.mask.shuf.pd.128(<2 x double> %x0, <2 x double> %x1, i32 1, <2 x double> %x3, i8 -1) @@ -3540,9 +3540,9 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_q_128(<2 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd3,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x09,0xd3,0xd1] +; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xd3] ; CHECK-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0xd3,0xc1] -; CHECK-NEXT: vpaddq %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xcb] -; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX 
Compression encoding: [0xc5,0xf1,0xd4,0xc0] +; CHECK-NEXT: vpaddq %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 %x3) %res1 = call <2 x i64> @llvm.x86.avx512.mask.psrl.q.128(<2 x i64> %x0, <2 x i64> %x1, <2 x i64> %x2, i8 -1) @@ -3560,9 +3560,9 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_q_256(<4 x i64> %x0, <2 x i64> %x ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd3,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0xfd,0x29,0xd3,0xd1] +; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xd3] ; CHECK-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0xd3,0xc1] -; CHECK-NEXT: vpaddq %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xcb] -; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] +; CHECK-NEXT: vpaddq %ymm0, %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 %x3) %res1 = call <4 x i64> @llvm.x86.avx512.mask.psrl.q.256(<4 x i64> %x0, <2 x i64> %x1, <4 x i64> %x2, i8 -1) @@ -3580,9 +3580,9 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_d_128(<4 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd2,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xd2,0xd1] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3] ; CHECK-NEXT: vpsrld %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xd2,0xc1] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 %x3) %res1 = call <4 x i32> @llvm.x86.avx512.mask.psrl.d.128(<4 x i32> %x0, <4 x i32> %x1, <4 x i32> %x2, i8 -1) @@ -3600,9 +3600,9 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_d_256(<8 x i32> %x0, <4 x i32> %x ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd2,0xd9] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xd2,0xd1] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3] ; CHECK-NEXT: vpsrld %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xd2,0xc1] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> 
@llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 %x3) %res1 = call <8 x i32> @llvm.x86.avx512.mask.psrl.d.256(<8 x i32> %x0, <4 x i32> %x1, <8 x i32> %x2, i8 -1) @@ -3720,8 +3720,8 @@ define <2 x i64>@test_int_x86_avx512_mask_psrl_qi_128(<2 x i64> %x0, i32 %x1, <2 ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x73,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x73,0xd0,0x03] -; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] +; CHECK-NEXT: vpsrlq $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0x89,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.psrl.qi.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -3740,8 +3740,8 @@ define <4 x i64>@test_int_x86_avx512_mask_psrl_qi_256(<4 x i64> %x0, i32 %x1, <4 ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x73,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x73,0xd0,0x03] -; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] +; CHECK-NEXT: vpsrlq $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x73,0xd0,0x03] ; CHECK-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xd4,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.psrl.qi.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -3760,8 +3760,8 @@ define <4 x i32>@test_int_x86_avx512_mask_psrl_di_128(<4 x i32> %x0, i32 %x1, <4 ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0x72,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrld $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xd0,0x03] -; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vpsrld $3, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.psrl.di.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -3780,8 +3780,8 @@ define <8 x i32>@test_int_x86_avx512_mask_psrl_di_256(<8 x i32> %x0, i32 %x1, <8 ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0x72,0xd0,0x03] ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vpsrld $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xd0,0x03] -; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] +; CHECK-NEXT: vpsrld $3, %ymm0, %ymm0 {%k1} {z} ## encoding: 
[0x62,0xf1,0x7d,0xa9,0x72,0xd0,0x03] ; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.psrl.di.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4642,10 +4642,10 @@ define <4 x i32>@test_int_x86_avx512_mask_valign_d_128(<4 x i32> %x0, <4 x i32> ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x09,0x03,0xd1,0x02] ; CHECK-NEXT: ## xmm2 {%k1} = xmm1[2,3],xmm0[0,1] +; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm2 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xd3] ; CHECK-NEXT: valignd $2, %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0x89,0x03,0xc1,0x02] ; CHECK-NEXT: ## xmm0 {%k1} {z} = xmm1[2,3],xmm0[0,1] -; CHECK-NEXT: vpaddd %xmm3, %xmm2, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xcb] -; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] +; CHECK-NEXT: vpaddd %xmm0, %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe9,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 %x4) %res1 = call <4 x i32> @llvm.x86.avx512.mask.valign.d.128(<4 x i32> %x0, <4 x i32> %x1, i32 2, <4 x i32> %x3, i8 -1) @@ -4817,9 +4817,9 @@ define <8 x float>@test_int_x86_avx512_mask_insertf32x4_256(<8 x float> %x0, <4 ; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x18,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x18,0xd1,0x01] +; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xd3] ; CHECK-NEXT: vinsertf32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x18,0xc1,0x01] -; CHECK-NEXT: vaddps %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xec,0x58,0xcb] -; CHECK-NEXT: vaddps %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc1] +; CHECK-NEXT: vaddps %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfc,0x58,0xc2] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 %x4) %res1 = call <8 x float> @llvm.x86.avx512.mask.insertf32x4.256(<8 x float> %x0, <4 x float> %x1, i32 1, <8 x float> %x3, i8 -1) @@ -4837,9 +4837,9 @@ define <8 x i32>@test_int_x86_avx512_mask_inserti32x4_256(<8 x i32> %x0, <4 x i3 ; CHECK-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm3 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x7d,0x38,0xd9,0x01] ; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf] ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf3,0x7d,0x29,0x38,0xd1,0x01] +; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xd3] ; CHECK-NEXT: vinserti32x4 $1, %xmm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf3,0x7d,0xa9,0x38,0xc1,0x01] -; CHECK-NEXT: vpaddd %ymm3, %ymm2, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xed,0xfe,0xcb] -; CHECK-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc1] +; CHECK-NEXT: vpaddd %ymm2, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xfe,0xc2] ; 
CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.inserti32x4.256(<8 x i32> %x0, <4 x i32> %x1, i32 1, <8 x i32> %x3, i8 %x4) diff --git a/test/CodeGen/X86/avx512vl-intrinsics.ll b/test/CodeGen/X86/avx512vl-intrinsics.ll index 1f324d679564..684b0468cf51 100644 --- a/test/CodeGen/X86/avx512vl-intrinsics.ll +++ b/test/CodeGen/X86/avx512vl-intrinsics.ll @@ -4368,8 +4368,8 @@ define <4 x i32>@test_int_x86_avx512_mask_prol_d_128(<4 x i32> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprold $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc8,0x03] -; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vprold $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.prol.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -4388,8 +4388,8 @@ define <8 x i32>@test_int_x86_avx512_mask_prol_d_256(<8 x i32> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprold $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc8,0x03] ; CHECK-NEXT: vprold $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc8,0x03] -; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] +; CHECK-NEXT: vprold $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.prol.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4408,8 +4408,8 @@ define <2 x i64>@test_int_x86_avx512_mask_prol_q_128(<2 x i64> %x0, i32 %x1, <2 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprolq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc8,0x03] -; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] +; CHECK-NEXT: vprolq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.prol.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -4428,8 +4428,8 @@ define <4 x i64>@test_int_x86_avx512_mask_prol_q_256(<4 x i64> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprolq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc8,0x03] ; CHECK-NEXT: vprolq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc8,0x03] -; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf5,0xd4,0xca] +; CHECK-NEXT: vprolq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc8,0x03] ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.prol.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -4528,8 +4528,8 @@ define <4 x i32>@test_int_x86_avx512_mask_pror_d_128(<4 x i32> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprord $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x09,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0x89,0x72,0xc0,0x03] -; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xca] +; CHECK-NEXT: vprord $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0x7d,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i32> @llvm.x86.avx512.mask.pror.d.128(<4 x i32> %x0, i32 3, <4 x i32> %x2, i8 %x3) @@ -4548,8 +4548,8 @@ define <8 x i32>@test_int_x86_avx512_mask_pror_d_256(<8 x i32> %x0, i32 %x1, <8 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprord $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x75,0x29,0x72,0xc0,0x03] ; CHECK-NEXT: vprord $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0x6d,0xa9,0x72,0xc0,0x03] -; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xca] +; CHECK-NEXT: vprord $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0x7d,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xfe,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <8 x i32> @llvm.x86.avx512.mask.pror.d.256(<8 x i32> %x0, i32 3, <8 x i32> %x2, i8 %x3) @@ -4568,8 +4568,8 @@ define <2 x i64>@test_int_x86_avx512_mask_pror_q_128(<2 x i64> %x0, i32 %x1, <2 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprorq $3, %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x09,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %xmm0, %xmm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0x89,0x72,0xc0,0x03] -; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %xmm2, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xca] +; CHECK-NEXT: vprorq $3, %xmm0, %xmm0 ## encoding: [0x62,0xf1,0xfd,0x08,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x i64> @llvm.x86.avx512.mask.pror.q.128(<2 x i64> %x0, i32 3, <2 x i64> %x2, i8 %x3) @@ -4588,8 +4588,8 @@ define <4 x i64>@test_int_x86_avx512_mask_pror_q_256(<4 x i64> %x0, i32 %x1, <4 ; CHECK-NEXT: kmovw %esi, %k1 ## encoding: [0xc5,0xf8,0x92,0xce] ; CHECK-NEXT: vprorq $3, %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0xf5,0x29,0x72,0xc0,0x03] ; CHECK-NEXT: vprorq $3, %ymm0, %ymm2 {%k1} {z} ## encoding: [0x62,0xf1,0xed,0xa9,0x72,0xc0,0x03] -; CHECK-NEXT: vprorq $3, %ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %ymm2, %ymm1, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xca] +; CHECK-NEXT: vprorq $3, 
%ymm0, %ymm0 ## encoding: [0x62,0xf1,0xfd,0x28,0x72,0xc0,0x03] ; CHECK-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0xd4,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x i64> @llvm.x86.avx512.mask.pror.q.256(<4 x i64> %x0, i32 3, <4 x i64> %x2, i8 %x3) @@ -4690,9 +4690,9 @@ define <2 x double>@test_int_x86_avx512_mask_fixupimm_pd_128(<2 x double> %x0, < ; CHECK-NEXT: vfixupimmpd $5, %xmm2, %xmm1, %xmm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x09,0x54,0xda,0x05] ; CHECK-NEXT: vpxor %xmm4, %xmm4, %xmm4 ## EVEX TO VEX Compression encoding: [0xc5,0xd9,0xef,0xe4] ; CHECK-NEXT: vfixupimmpd $4, %xmm2, %xmm1, %xmm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0x89,0x54,0xe2,0x04] +; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xdc] ; CHECK-NEXT: vfixupimmpd $3, %xmm2, %xmm1, %xmm0 ## encoding: [0x62,0xf3,0xf5,0x08,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %xmm4, %xmm3, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xcc] -; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf1,0x58,0xc0] +; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe1,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> %x0, <2 x double> %x1,<2 x i64> %x2, i32 5, i8 %x4) %res1 = call <2 x double> @llvm.x86.avx512.mask.fixupimm.pd.128(<2 x double> zeroinitializer, <2 x double> %x1, <2 x i64> %x2, i32 4, i8 %x4) @@ -4732,9 +4732,9 @@ define <4 x double>@test_int_x86_avx512_mask_fixupimm_pd_256(<4 x double> %x0, < ; CHECK-NEXT: vfixupimmpd $4, %ymm2, %ymm1, %ymm3 {%k1} ## encoding: [0x62,0xf3,0xf5,0x29,0x54,0xda,0x04] ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4] ; CHECK-NEXT: vfixupimmpd $5, %ymm2, %ymm1, %ymm4 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xe2,0x05] +; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdc] ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %ymm4, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcc] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf5,0x58,0xc0] +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 4, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.mask.fixupimm.pd.256(<4 x double> zeroinitializer, <4 x double> %x1, <4 x i64> %x2 , i32 5, i8 %x4) @@ -4755,9 +4755,9 @@ define <4 x double>@test_int_x86_avx512_maskz_fixupimm_pd_256(<4 x double> %x0, ; CHECK-NEXT: vpxor %ymm4, %ymm4, %ymm4 ## EVEX TO VEX Compression encoding: [0xc5,0xdd,0xef,0xe4] ; CHECK-NEXT: vmovapd %ymm0, %ymm5 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x28,0xe8] ; CHECK-NEXT: vfixupimmpd $4, %ymm4, %ymm1, %ymm5 {%k1} {z} ## encoding: [0x62,0xf3,0xf5,0xa9,0x54,0xec,0x04] +; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm3 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xdd] ; CHECK-NEXT: vfixupimmpd $3, %ymm2, %ymm1, %ymm0 ## encoding: [0x62,0xf3,0xf5,0x28,0x54,0xc2,0x03] -; CHECK-NEXT: vaddpd %ymm5, %ymm3, %ymm1 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xcd] -; CHECK-NEXT: vaddpd %ymm0, %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: 
[0xc5,0xf5,0x58,0xc0] +; CHECK-NEXT: vaddpd %ymm0, %ymm3, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xe5,0x58,0xc0] ; CHECK-NEXT: retq ## encoding: [0xc3] %res = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> %x2, i32 5, i8 %x4) %res1 = call <4 x double> @llvm.x86.avx512.maskz.fixupimm.pd.256(<4 x double> %x0, <4 x double> %x1, <4 x i64> zeroinitializer, i32 4, i8 %x4) diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll index afeba4ef2d99..94e2ee7a0aa9 100644 --- a/test/CodeGen/X86/bmi.ll +++ b/test/CodeGen/X86/bmi.ll @@ -454,6 +454,30 @@ entry: ret i32 %and } +define i32 @bzhi32d(i32 %a, i32 %b) { +; CHECK-LABEL: bzhi32d: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhil %esi, %edi, %eax +; CHECK-NEXT: retq +entry: + %sub = sub i32 32, %b + %shr = lshr i32 -1, %sub + %and = and i32 %shr, %a + ret i32 %and +} + +define i32 @bzhi32e(i32 %a, i32 %b) { +; CHECK-LABEL: bzhi32e: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhil %esi, %edi, %eax +; CHECK-NEXT: retq +entry: + %sub = sub i32 32, %b + %shl = shl i32 %a, %sub + %shr = lshr i32 %shl, %sub + ret i32 %shr +} + define i64 @bzhi64b(i64 %x, i8 zeroext %index) { ; CHECK-LABEL: bzhi64b: ; CHECK: # BB#0: # %entry @@ -468,6 +492,58 @@ entry: ret i64 %and } +define i64 @bzhi64c(i64 %a, i64 %b) { +; CHECK-LABEL: bzhi64c: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %sub = sub i64 64, %b + %shr = lshr i64 -1, %sub + %and = and i64 %shr, %a + ret i64 %and +} + +define i64 @bzhi64d(i64 %a, i32 %b) { +; CHECK-LABEL: bzhi64d: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: # kill: %ESI %ESI %RSI +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %sub = sub i32 64, %b + %sh_prom = zext i32 %sub to i64 + %shr = lshr i64 -1, %sh_prom + %and = and i64 %shr, %a + ret i64 %and +} + +define i64 @bzhi64e(i64 %a, i64 %b) { +; CHECK-LABEL: bzhi64e: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %sub = sub i64 64, %b + %shl = shl i64 %a, %sub + %shr = lshr i64 %shl, %sub + ret i64 %shr +} + +define i64 @bzhi64f(i64 %a, i32 %b) { +; CHECK-LABEL: bzhi64f: +; CHECK: # BB#0: # %entry +; CHECK-NEXT: # kill: %ESI %ESI %RSI +; CHECK-NEXT: bzhiq %rsi, %rdi, %rax +; CHECK-NEXT: retq +entry: + %sub = sub i32 64, %b + %sh_prom = zext i32 %sub to i64 + %shl = shl i64 %a, %sh_prom + %shr = lshr i64 %shl, %sh_prom + ret i64 %shr +} + define i64 @bzhi64_constant_mask(i64 %x) { ; CHECK-LABEL: bzhi64_constant_mask: ; CHECK: # BB#0: # %entry diff --git a/test/CodeGen/X86/bswap_tree2.ll b/test/CodeGen/X86/bswap_tree2.ll index a9c74df9d0d9..1340b7662a7a 100644 --- a/test/CodeGen/X86/bswap_tree2.ll +++ b/test/CodeGen/X86/bswap_tree2.ll @@ -9,31 +9,32 @@ define i32 @test1(i32 %x) nounwind { ; CHECK-LABEL: test1: ; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %ecx -; CHECK-NEXT: movl %ecx, %edx -; CHECK-NEXT: andl $16711680, %edx # imm = 0xFF0000 -; CHECK-NEXT: movl %ecx, %eax -; CHECK-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK-NEXT: shll $8, %edx -; CHECK-NEXT: shrl $8, %eax -; CHECK-NEXT: bswapl %ecx -; CHECK-NEXT: shrl $16, %ecx +; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax +; CHECK-NEXT: movl %eax, %ecx +; CHECK-NEXT: andl $16711680, %ecx # imm = 0xFF0000 +; CHECK-NEXT: movl %eax, %edx +; CHECK-NEXT: orl $-16777216, %edx # imm = 0xFF000000 +; CHECK-NEXT: shll $8, %ecx +; CHECK-NEXT: shrl $8, %edx +; CHECK-NEXT: orl %ecx, %edx +; CHECK-NEXT: bswapl %eax +; CHECK-NEXT: shrl $16, %eax ; 
CHECK-NEXT: orl %edx, %eax -; CHECK-NEXT: orl %ecx, %eax ; CHECK-NEXT: retl ; ; CHECK64-LABEL: test1: ; CHECK64: # BB#0: -; CHECK64-NEXT: movl %edi, %ecx -; CHECK64-NEXT: andl $16711680, %ecx # imm = 0xFF0000 ; CHECK64-NEXT: movl %edi, %eax -; CHECK64-NEXT: orl $-16777216, %eax # imm = 0xFF000000 -; CHECK64-NEXT: shll $8, %ecx -; CHECK64-NEXT: shrl $8, %eax +; CHECK64-NEXT: andl $16711680, %eax # imm = 0xFF0000 +; CHECK64-NEXT: movl %edi, %ecx +; CHECK64-NEXT: orl $-16777216, %ecx # imm = 0xFF000000 +; CHECK64-NEXT: shll $8, %eax +; CHECK64-NEXT: shrl $8, %ecx +; CHECK64-NEXT: orl %eax, %ecx ; CHECK64-NEXT: bswapl %edi ; CHECK64-NEXT: shrl $16, %edi -; CHECK64-NEXT: orl %ecx, %eax -; CHECK64-NEXT: orl %edi, %eax +; CHECK64-NEXT: orl %ecx, %edi +; CHECK64-NEXT: movl %edi, %eax ; CHECK64-NEXT: retq %byte0 = and i32 %x, 255 ; 0x000000ff %byte1 = and i32 %x, 65280 ; 0x0000ff00 diff --git a/test/CodeGen/X86/cast-vsel.ll b/test/CodeGen/X86/cast-vsel.ll index 1e44aec99fc5..83ab2fac2f16 100644 --- a/test/CodeGen/X86/cast-vsel.ll +++ b/test/CodeGen/X86/cast-vsel.ll @@ -200,32 +200,29 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d) ; SSE41: # BB#0: ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 ; SSE41-NEXT: movdqa {{.*#+}} xmm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] -; SSE41-NEXT: pshufb %xmm1, %xmm5 -; SSE41-NEXT: pshufb %xmm1, %xmm4 -; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] ; SSE41-NEXT: pshufb %xmm1, %xmm3 ; SSE41-NEXT: pshufb %xmm1, %xmm2 ; SSE41-NEXT: punpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; SSE41-NEXT: pand %xmm0, %xmm2 -; SSE41-NEXT: pandn %xmm4, %xmm0 -; SSE41-NEXT: por %xmm2, %xmm0 +; SSE41-NEXT: pshufb %xmm1, %xmm5 +; SSE41-NEXT: pshufb %xmm1, %xmm4 +; SSE41-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm5[0] +; SSE41-NEXT: pblendvb %xmm0, %xmm2, %xmm4 +; SSE41-NEXT: movdqa %xmm4, %xmm0 ; SSE41-NEXT: retq ; ; AVX1-LABEL: trunc: ; AVX1: # BB#0: ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1 ; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15] ; AVX1-NEXT: vpshufb %xmm4, %xmm1, %xmm1 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0] -; AVX1-NEXT: vpandn %xmm1, %xmm0, %xmm1 -; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 ; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 -; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm2[0],xmm3[0] -; AVX1-NEXT: vpand %xmm0, %xmm2, %xmm0 -; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0] +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm2, %xmm2 +; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm3 +; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0] +; AVX1-NEXT: vpblendvb %xmm0, %xmm1, %xmm2, %xmm0 ; AVX1-NEXT: vzeroupper ; AVX1-NEXT: retq ; @@ -233,13 +230,11 @@ define <8 x i16> @trunc(<8 x i16> %a, <8 x i16> %b, <8 x i32> %c, <8 x i32> %d) ; AVX2: # BB#0: ; AVX2-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15,16,17,20,21,24,25,28,29,24,25,28,29,28,29,30,31] -; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm3 -; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,2,2,3] -; AVX2-NEXT: vpandn %xmm3, %xmm0, %xmm3 -; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: vpshufb %ymm1, %ymm2, %ymm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,2,2,3] +; AVX2-NEXT: vpshufb %ymm1, %ymm3, %ymm1 ; AVX2-NEXT: vpermq {{.*#+}} ymm1 = 
ymm1[0,2,2,3] -; AVX2-NEXT: vpand %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0 +; AVX2-NEXT: vpblendvb %xmm0, %xmm2, %xmm1, %xmm0 ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq %cmp = icmp eq <8 x i16> %a, %b diff --git a/test/CodeGen/X86/combine-abs.ll b/test/CodeGen/X86/combine-abs.ll index 887abe99f6ed..37beb438d737 100644 --- a/test/CodeGen/X86/combine-abs.ll +++ b/test/CodeGen/X86/combine-abs.ll @@ -50,12 +50,11 @@ define <32 x i8> @combine_v32i8_abs_abs(<32 x i8> %a) { define <4 x i64> @combine_v4i64_abs_abs(<4 x i64> %a) { ; AVX2-LABEL: combine_v4i64_abs_abs: ; AVX2: # BB#0: -; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1 -; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7] +; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2 +; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq diff --git a/test/CodeGen/X86/combine-shl.ll b/test/CodeGen/X86/combine-shl.ll index 3ad38f2717d9..3dbff2680c22 100644 --- a/test/CodeGen/X86/combine-shl.ll +++ b/test/CodeGen/X86/combine-shl.ll @@ -11,8 +11,7 @@ define <4 x i32> @combine_vec_shl_zero(<4 x i32> %x) { ; ; AVX-LABEL: combine_vec_shl_zero: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsllvd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = shl <4 x i32> zeroinitializer, %x ret <4 x i32> %1 diff --git a/test/CodeGen/X86/combine-srl.ll b/test/CodeGen/X86/combine-srl.ll index 706e89051a3d..21564cdd7353 100644 --- a/test/CodeGen/X86/combine-srl.ll +++ b/test/CodeGen/X86/combine-srl.ll @@ -6,30 +6,12 @@ define <4 x i32> @combine_vec_lshr_zero(<4 x i32> %x) { ; SSE-LABEL: combine_vec_lshr_zero: ; SSE: # BB#0: -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrldq {{.*#+}} xmm2 = xmm2[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: psrld %xmm2, %xmm3 -; SSE-NEXT: movdqa %xmm0, %xmm2 -; SSE-NEXT: psrlq $32, %xmm2 -; SSE-NEXT: pxor %xmm4, %xmm4 -; SSE-NEXT: psrld %xmm2, %xmm4 -; SSE-NEXT: pblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm3[4,5,6,7] -; SSE-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3] -; SSE-NEXT: pxor %xmm3, %xmm3 -; SSE-NEXT: psrld %xmm0, %xmm3 -; SSE-NEXT: psrld %xmm2, %xmm1 -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm3[4,5,6,7] -; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm4[2,3],xmm1[4,5],xmm4[6,7] -; SSE-NEXT: movdqa %xmm1, %xmm0 +; SSE-NEXT: xorps %xmm0, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: combine_vec_lshr_zero: ; AVX: # BB#0: -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpsrlvd %xmm0, %xmm1, %xmm0 +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 ; AVX-NEXT: retq %1 = lshr <4 x i32> zeroinitializer, %x ret <4 x i32> %1 diff --git a/test/CodeGen/X86/constructor.ll b/test/CodeGen/X86/constructor.ll index 7b82125dc372..2f3c343afac0 100644 --- a/test/CodeGen/X86/constructor.ll +++ b/test/CodeGen/X86/constructor.ll @@ -3,6 +3,8 @@ ; RUN: llc -mtriple x86_64-pc-linux < %s | FileCheck --check-prefix=INIT-ARRAY %s ; RUN: llc -mtriple x86_64-unknown-freebsd < %s | FileCheck --check-prefix=INIT-ARRAY %s ; RUN: llc -mtriple 
x86_64-unknown-nacl < %s | FileCheck --check-prefix=NACL %s +; RUN: llc -mtriple i586-intel-elfiamcu -use-ctors < %s | FileCheck %s --check-prefix=MCU-CTORS +; RUN: llc -mtriple i586-intel-elfiamcu < %s | FileCheck %s --check-prefix=MCU-INIT-ARRAY @llvm.global_ctors = appending global [2 x { i32, void ()*, i8* }] [{ i32, void ()*, i8* } { i32 65535, void ()* @f, i8* null}, { i32, void ()*, i8* } { i32 15, void ()* @g, i8* @v }] @v = weak_odr global i8 0 @@ -37,3 +39,6 @@ entry: ; NACL-NEXT: .section .init_array,"aw",@init_array ; NACL-NEXT: .p2align 2 ; NACL-NEXT: .long f + +; MCU-CTORS: .section .ctors,"aw",@progbits +; MCU-INIT-ARRAY: .section .init_array,"aw",@init_array diff --git a/test/CodeGen/X86/dbg-baseptr.ll b/test/CodeGen/X86/dbg-baseptr.ll index fb0da1b50d11..893ca93a9944 100644 --- a/test/CodeGen/X86/dbg-baseptr.ll +++ b/test/CodeGen/X86/dbg-baseptr.ll @@ -1,4 +1,5 @@ ; RUN: llc -o - %s | FileCheck %s +; RUN: llc -filetype=obj -o - %s | llvm-dwarfdump - | FileCheck %s --check-prefix=DWARF ; This test checks that parameters on the stack pointer are correctly ; referenced by debug info. target triple = "x86_64--" @@ -7,24 +8,54 @@ target triple = "x86_64--" @ptr = external global i32* %struct.s = type { i32, i32, i32, i32, i32 } +; Simple case: no FP, use offset from RSP. + ; CHECK-LABEL: f0: -; CHECK: DEBUG_VALUE: f:input <- [%RSP+8] +; CHECK-NOT: pushq +; CHECK: movl $42, %eax +; CHECK: retq define i32 @f0(%struct.s* byval align 8 %input) !dbg !8 { call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !4, metadata !17), !dbg !18 - ret i32 42 + ret i32 42, !dbg !18 } +; DWARF-LABEL: .debug_info contents: + +; DWARF-LABEL: DW_TAG_subprogram +; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 57 ) +; 0x57 -> RSP +; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f0") +; DWARF: DW_TAG_formal_parameter +; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 08 ) +; DW_OP_fbreg (0x91) 0x08 +; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input") + + +; Dynamic alloca forces the use of RBP as the base pointer + ; CHECK-LABEL: f1: -; CHECK: DEBUG_VALUE: f:input <- [%RBP+16] +; CHECK: pushq %rbp +; CHECK: movl $42, %eax +; CHECK: popq %rbp +; CHECK: retq define i32 @f1(%struct.s* byval align 8 %input) !dbg !19 { %val = load i64, i64* @glob ; this alloca should force FP usage. %stackspace = alloca i32, i64 %val, align 1 store i32* %stackspace, i32** @ptr call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !20, metadata !17), !dbg !21 - ret i32 42 + ret i32 42, !dbg !21 } +; DWARF-LABEL: DW_TAG_subprogram +; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 56 ) +; 0x56 -> RBP +; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f1") +; DWARF: DW_TAG_formal_parameter +; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 10 ) +; DW_OP_fbreg (0x91) 0x10 +; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input") + ; CHECK-LABEL: f2: ; Just check that we are indeed aligning the stack and setting up a base pointer ; in RBX. @@ -34,17 +65,24 @@ define i32 @f1(%struct.s* byval align 8 %input) !dbg !19 { ; CHECK: andq $-64, %rsp ; CHECK: subq $64, %rsp ; CHECK: movq %rsp, %rbx -; The parameter should still be referenced through RBP though. 
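The DWARF comments in the checks above decode the location expressions by hand: 0x91 is DW_OP_fbreg, whose operand is a SLEB128-encoded offset from the frame base (RSP for f0, RBP once the dynamic alloca forces a frame pointer). A minimal C sketch of that decoding, assuming nothing beyond the standard SLEB128 encoding (the helper name is mine):

#include <stdint.h>
#include <stdio.h>

/* Decode a SLEB128 value, as used for DW_OP_fbreg's operand. */
static int64_t read_sleb128(const uint8_t *p, unsigned *len) {
    int64_t result = 0;
    unsigned shift = 0, i = 0;
    uint8_t byte;
    do {
        byte = p[i++];
        result |= (int64_t)(byte & 0x7f) << shift;
        shift += 7;
    } while (byte & 0x80);
    if (shift < 64 && (byte & 0x40))
        result |= -((int64_t)1 << shift); /* sign-extend the last byte */
    *len = i;
    return result;
}

int main(void) {
    const uint8_t f0_loc[] = {0x91, 0x08}; /* DW_OP_fbreg +8  -> [%RSP+8]  */
    const uint8_t f1_loc[] = {0x91, 0x10}; /* DW_OP_fbreg +16 -> [%RBP+16] */
    unsigned n;
    printf("fbreg %+lld\n", (long long)read_sleb128(f0_loc + 1, &n));
    printf("fbreg %+lld\n", (long long)read_sleb128(f1_loc + 1, &n));
    return 0;
}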
-; CHECK-NOT: DEBUG_VALUE: f:input <- [%RBX -; CHECK: DEBUG_VALUE: f:input <- [%RBP+16] define i32 @f2(%struct.s* byval align 8 %input) !dbg !22 { %val = load i64, i64* @glob %stackspace = alloca i32, i64 %val, align 64 store i32* %stackspace, i32** @ptr call void @llvm.dbg.declare(metadata %struct.s* %input, metadata !23, metadata !17), !dbg !24 - ret i32 42 + ret i32 42, !dbg !24 } +; "input" should still be referred to through RBP. +; DWARF-LABEL: DW_TAG_subprogram +; DWARF: DW_AT_frame_base [DW_FORM_exprloc] (<0x1> 56 ) +; 0x56 -> RBP +; DWARF: DW_AT_name [DW_FORM_strp] ( {{.*}}"f2") +; DWARF: DW_TAG_formal_parameter +; DWARF-NEXT: DW_AT_location [DW_FORM_exprloc] (<0x2> 91 10 ) +; DW_OP_fbreg (0x91) 0x10 +; DWARF-NEXT: DW_AT_name [DW_FORM_strp] ( {{.*}}"input") + declare void @llvm.dbg.declare(metadata, metadata, metadata) !llvm.dbg.cu = !{!2} @@ -52,7 +90,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) !0 = !{i32 2, !"Dwarf Version", i32 4} !1 = !{i32 2, !"Debug Info Version", i32 3} -!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3) +!2 = distinct !DICompileUnit(language: DW_LANG_C99, file: !3, emissionKind: FullDebug) !3 = !DIFile(filename: "dbg-baseptr.ll", directory: "/") !4 = !DILocalVariable(name: "input", arg: 1, scope: !8, file: !3, line: 5, type: !9) !5 = !{} @@ -60,7 +98,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) !6 = !DISubroutineType(types: !7) !7 = !{!10, !9} -!8 = distinct !DISubprogram(name: "f", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) +!8 = distinct !DISubprogram(name: "f0", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, unit: !2, variables: !5) !9 = distinct !DICompositeType(tag: DW_TAG_structure_type, name: "s", elements: !11) !10 = !DIBasicType(name: "unsigned int", size: 32, encoding: DW_ATE_unsigned) @@ -74,9 +112,9 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) !17 = !DIExpression() !18 = !DILocation(line: 5, scope: !8) -!19 = distinct !DISubprogram(name: "f", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) +!19 = distinct !DISubprogram(name: "f1", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) !20 = !DILocalVariable(name: "input", arg: 1, scope: !19, file: !3, line: 5, type: !9) !21 = !DILocation(line: 5, scope: !19) -!22 = distinct !DISubprogram(name: "f", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) +!22 = distinct !DISubprogram(name: "f2", file: !3, line: 5, type: !6, isLocal: false, isDefinition: true, flags: DIFlagPrototyped, unit: !2, variables: !5) !23 = !DILocalVariable(name: "input", arg: 1, scope: !22, file: !3, line: 5, type: !9) !24 = !DILocation(line: 5, scope: !22) diff --git a/test/CodeGen/X86/elf-associated.ll b/test/CodeGen/X86/elf-associated.ll index 361cf66cce72..7d58c3437025 100644 --- a/test/CodeGen/X86/elf-associated.ll +++ b/test/CodeGen/X86/elf-associated.ll @@ -37,3 +37,8 @@ @l = global i32 1, section "ccc", !associated !5 !5 = !{i32* null} ; CHECK-DAG: .section ccc,"aw",@progbits + +; Null metadata. 
+@m = global i32 1, section "ddd", !associated !6 +!6 = distinct !{null} +; CHECK-DAG: .section ddd,"aw",@progbits diff --git a/test/CodeGen/X86/fold-tied-op.ll b/test/CodeGen/X86/fold-tied-op.ll index d68236e9d250..eb06eb75a4d7 100644 --- a/test/CodeGen/X86/fold-tied-op.ll +++ b/test/CodeGen/X86/fold-tied-op.ll @@ -6,9 +6,10 @@ target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" target triple = "i386--netbsd" ; CHECK-LABEL: fn1 -; CHECK: addl {{.*#+}} 4-byte Folded Reload -; CHECK: imull {{.*#+}} 4-byte Folded Reload -; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: orl {{.*#+}} 4-byte Folded Reload +; CHECK: addl {{.*#+}} 4-byte Folded Reload +; CHECK: xorl {{.*#+}} 4-byte Folded Reload +; CHECK: xorl {{.*#+}} 4-byte Folded Reload ; CHECK: retl %struct.XXH_state64_t = type { i32, i32, i64, i64, i64 } diff --git a/test/CodeGen/X86/fp128-i128.ll b/test/CodeGen/X86/fp128-i128.ll index 98082ec611d4..6c6bc8bdc1d1 100644 --- a/test/CodeGen/X86/fp128-i128.ll +++ b/test/CodeGen/X86/fp128-i128.ll @@ -50,8 +50,8 @@ define void @TestUnionLD1(fp128 %s, i64 %n) #0 { ; CHECK-NEXT: andq %rdi, %rcx ; CHECK-NEXT: movabsq $-281474976710656, %rdx # imm = 0xFFFF000000000000 ; CHECK-NEXT: andq -{{[0-9]+}}(%rsp), %rdx -; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: orq %rcx, %rdx +; CHECK-NEXT: movq %rax, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movq %rdx, -{{[0-9]+}}(%rsp) ; CHECK-NEXT: movaps -{{[0-9]+}}(%rsp), %xmm0 ; CHECK-NEXT: jmp foo # TAILCALL diff --git a/test/CodeGen/X86/haddsub-2.ll b/test/CodeGen/X86/haddsub-2.ll index 4596b83f7bc2..b5507523a75a 100644 --- a/test/CodeGen/X86/haddsub-2.ll +++ b/test/CodeGen/X86/haddsub-2.ll @@ -933,14 +933,14 @@ define <4 x float> @not_a_hsub_2(<4 x float> %A, <4 x float> %B) { ; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] ; AVX-NEXT: vsubss %xmm3, %xmm0, %xmm0 -; AVX-NEXT: vpermilps {{.*#+}} xmm3 = xmm1[3,1,2,3] -; AVX-NEXT: vpermilpd {{.*#+}} xmm4 = xmm1[1,0] -; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[2,3] -; AVX-NEXT: vmovshdup {{.*#+}} xmm2 = xmm1[1,1,3,3] -; AVX-NEXT: vsubss %xmm2, %xmm1, %xmm1 +; AVX-NEXT: vpermilps {{.*#+}} xmm2 = xmm1[3,1,2,3] +; AVX-NEXT: vpermilpd {{.*#+}} xmm3 = xmm1[1,0] +; AVX-NEXT: vsubss %xmm3, %xmm2, %xmm2 +; AVX-NEXT: vmovshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] +; AVX-NEXT: vsubss %xmm3, %xmm1, %xmm1 ; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm3[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0] ; AVX-NEXT: retq %vecext = extractelement <4 x float> %A, i32 2 %vecext1 = extractelement <4 x float> %A, i32 3 diff --git a/test/CodeGen/X86/known-signbits-vector.ll b/test/CodeGen/X86/known-signbits-vector.ll index cea9ac26edbc..ec620b8ce877 100644 --- a/test/CodeGen/X86/known-signbits-vector.ll +++ b/test/CodeGen/X86/known-signbits-vector.ll @@ -137,3 +137,64 @@ define float @signbits_ashr_insert_ashr_extract_sitofp(i64 %a0, i64 %a1) nounwin %6 = sitofp i64 %5 to float ret float %6 } + +define <4 x double> @signbits_sext_shuffle_sitofp(<4 x i32> %a0, <4 x i64> %a1) nounwind { +; X32-LABEL: signbits_sext_shuffle_sitofp: +; X32: # BB#0: +; X32-NEXT: vpmovsxdq %xmm0, %xmm1 +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X32-NEXT: vpmovsxdq %xmm0, %xmm0 +; X32-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; X32-NEXT: vextractf128 $1, 
%ymm0, %xmm1 +; X32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X32-NEXT: vcvtdq2pd %xmm0, %ymm0 +; X32-NEXT: retl +; +; X64-LABEL: signbits_sext_shuffle_sitofp: +; X64: # BB#0: +; X64-NEXT: vpmovsxdq %xmm0, %xmm1 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; X64-NEXT: vpmovsxdq %xmm0, %xmm0 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-NEXT: vpermilpd {{.*#+}} ymm0 = ymm0[1,0,3,2] +; X64-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3,0,1] +; X64-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2] +; X64-NEXT: vcvtdq2pd %xmm0, %ymm0 +; X64-NEXT: retq + %1 = sext <4 x i32> %a0 to <4 x i64> + %2 = shufflevector <4 x i64> %1, <4 x i64>%a1, <4 x i32> + %3 = sitofp <4 x i64> %2 to <4 x double> + ret <4 x double> %3 +} + +define <2 x double> @signbits_ashr_concat_ashr_extract_sitofp(<2 x i64> %a0, <4 x i64> %a1) nounwind { +; X32-LABEL: signbits_ashr_concat_ashr_extract_sitofp: +; X32: # BB#0: +; X32-NEXT: vpsrad $16, %xmm0, %xmm1 +; X32-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X32-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; X32-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X32-NEXT: vcvtdq2pd %xmm0, %xmm0 +; X32-NEXT: retl +; +; X64-LABEL: signbits_ashr_concat_ashr_extract_sitofp: +; X64: # BB#0: +; X64-NEXT: vpsrad $16, %xmm0, %xmm1 +; X64-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X64-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7] +; X64-NEXT: vpsrlq $16, %xmm0, %xmm0 +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; X64-NEXT: vcvtdq2pd %xmm0, %xmm0 +; X64-NEXT: retq + %1 = ashr <2 x i64> %a0, + %2 = shufflevector <2 x i64> %1, <2 x i64> undef, <4 x i32> + %3 = shufflevector <4 x i64> %a1, <4 x i64> %2, <4 x i32> + %4 = ashr <4 x i64> %3, + %5 = shufflevector <4 x i64> %4, <4 x i64> undef, <2 x i32> + %6 = sitofp <2 x i64> %5 to <2 x double> + ret <2 x double> %6 +} diff --git a/test/CodeGen/X86/leaFixup32.mir b/test/CodeGen/X86/leaFixup32.mir new file mode 100644 index 000000000000..70aac21c7ff2 --- /dev/null +++ b/test/CodeGen/X86/leaFixup32.mir @@ -0,0 +1,508 @@ +# RUN: llc -run-pass x86-fixup-LEAs -mcpu=corei7-avx -o - %s | FileCheck %s +--- | + ; ModuleID = 'test/CodeGen/X86/fixup-lea.ll' + source_filename = "test/CodeGen/X86/fixup-lea.ll" + target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" + target triple = "i386" + ;generated using: llc -stop-after x86-pad-short-functions fixup-lea.ll > leaFinxup32.mir + + ;test2add_32: 3 operands LEA32r that can be replaced with 2 add instructions + ; where ADD32ri8 is chosen + define i32 @test2add_32() { + ret i32 0 + } + + ;test2add_ebp_32: 3 operands LEA32r that can be replaced with 2 add instructions + ; where the base is rbp/r13/ebp register + define i32 @test2add_ebp_32() { + ret i32 0 + } + + ;test1add_ebp_32: 2 operands LEA32r where base register is ebp and can be replaced + ; with an add instruction + define i32 @test1add_ebp_32() { + ret i32 0 + } + + ;testleaadd_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions + define i32 @testleaadd_32() { + ret i32 0 + } + + ;testleaadd_ebp_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions + ; where the base is ebp register + define i32 @testleaadd_ebp_32() { + ret i32 0 + } + + ;test1lea_ebp_32: 2 operands LEA32r wher base register is rbp/r13/ebp and can be replaced + ; with a lea instruction + define i32 @test1lea_ebp_32() { + ret i32 0 + } + + ;test2addi32_32: 3 operands LEA32r that can be 
replaced with 2 add instructions where ADD32ri32 + ; is chosen + define i32 @test2addi32_32() { + ret i32 0 + } + + ;test1mov1add_ebp_32: 2 operands LEA32r that can be replaced with 1 add 1 mov instructions + ; where the base is rbp/r13/ebp register + define i32 @test1mov1add_ebp_32() { + ret i32 0 + } + + ;testleaadd_ebp_index_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is offset + define i32 @testleaadd_ebp_index_32() { + ret i32 0 + } + + ;testleaadd_ebp_index2_32: 3 operands LEA32r that can be replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is scale + define i32 @testleaadd_ebp_index2_32() { + ret i32 0 + } + + ;test_skip_opt_32: 3 operands LEA32r that can not be replaced with 2 instructions + define i32 @test_skip_opt_32() { + ret i32 0 + } + + ;test_skip_eflags_32: LEA32r that cannot be replaced since its not safe to clobber eflags + define i32 @test_skip_eflags_32() { + ret i32 0 + } + +... +--- +name: test2add_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %eax = ADD32rr %eax, killed %ebp + ; CHECK: %eax = ADD32ri8 %eax, -5 + + %eax = LEA32r killed %eax, 1, killed %ebp, -5, _ + RETQ %eax + +... +--- +name: test2add_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %ebp = ADD32rr %ebp, killed %eax + ; CHECK: %ebp = ADD32ri8 %ebp, -5 + + %ebp = LEA32r killed %ebp, 1, killed %eax, -5, _ + RETQ %ebp + +... +--- +name: test1add_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %ebp = ADD32rr %ebp, killed %eax + + %ebp = LEA32r killed %ebp, 1, killed %eax, 0, _ + RETQ %ebp + +... 
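+# A sketch (illustrative only, AT&T syntax assumed; not part of the checked
+# output) of the rewrite exercised here: a three-operand LEA whose
+# destination equals its base can be split into two adds, provided EFLAGS
+# is dead at that point, since ADD clobbers the flags:
+#
+#   leal -5(%eax,%ebp), %eax   # eax = eax + ebp - 5
+#   -->
+#   addl %ebp, %eax            # eax += ebp   (ADD32rr)
+#   addl $-5, %eax             # eax += -5    (ADD32ri8)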
+--- +name: testleaadd_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } + - { reg: '%ebx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %esi + ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0 + ; CHECK: %ebx = ADD32ri8 %ebx, -5 + + %ebx = LEA32r killed %eax, 1, killed %ebp, -5, _ + RETQ %ebx + +... +--- +name: testleaadd_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } + - { reg: '%ebx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _ + ; CHECK: %ebx = ADD32ri8 %ebx, -5 + + %ebx = LEA32r killed %ebp, 1, killed %eax, -5, _ + RETQ %ebx + +... +--- +name: test1lea_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } + - { reg: '%ebx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %ebx = LEA32r killed %eax, 1, killed %ebp, 0, _ + + %ebx = LEA32r killed %ebp, 1, killed %eax, 0, _ + RETQ %ebx + +... +--- +name: test2addi32_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp + ; CHECK: %eax = ADD32rr %eax, killed %ebp + ; CHECK: %eax = ADD32ri %eax, 129 + + %eax = LEA32r killed %eax, 1, killed %ebp, 129, _ + RETQ %eax + +... 
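+# The add-immediate opcode tracks displacement width: displacements in
+# [-128, 127] use the sign-extended 8-bit form (ADD32ri8, as with -5 above),
+# while 129 needs the full 32-bit immediate (ADD32ri). Illustrative sketch,
+# AT&T syntax:
+#
+#   leal 129(%eax,%ebp), %eax  -->  addl %ebp, %eax
+#                                   addl $129, %eax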
+--- +name: test1mov1add_ebp_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%eax' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = MOV32rr killed %ebp + ; CHECK: %ebx = ADD32rr %ebx, killed %ebp + + %ebx = LEA32r killed %ebp, 1, killed %ebp, 0, _ + RETQ %ebx + +... +--- +name: testleaadd_ebp_index_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%ebx' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = LEA32r _, 1, killed %ebp, 5, _ + ; CHECK: %ebx = ADD32rr %ebx, killed %ebp + + %ebx = LEA32r killed %ebp, 1, killed %ebp, 5, _ + RETQ %ebx + +... +--- +name: testleaadd_ebp_index2_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%ebx' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = LEA32r _, 4, killed %ebp, 5, _ + ; CHECK: %ebx = ADD32rr %ebx, killed %ebp + + %ebx = LEA32r killed %ebp, 4, killed %ebp, 5, _ + RETQ %ebx + +... +--- +name: test_skip_opt_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%ebx' } + - { reg: '%ebp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _ + + %ebp = LEA32r killed %ebp, 4, killed %ebp, 0, _ + RETQ %ebp + +... 
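+# The next test covers the other bail-out: when EFLAGS is live across the
+# LEA, replacing it with ADD would clobber the flags, so the LEA must stay.
+# (test_skip_opt_32 above bails for a different reason: with dst == base ==
+# index and scale > 1 there is no two-instruction split that avoids a
+# scratch register.) Illustrative sketch, AT&T syntax:
+#
+#   cmpl %ebx, %eax              # sets EFLAGS
+#   leal 5(%eax,%eax,4), %ebx    # kept as LEA; addl would clobber EFLAGS
+#   je   .LBB0_1                 # still consumes the cmp result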
+--- +name: test_skip_eflags_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%ebp' } + - { reg: '%eax' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = LEA32r killed %eax, 4, killed %eax, 5, _ + ; CHECK: %ebp = LEA32r killed %ebx, 4, killed %ebx, 0, _ + ; CHECK: %ebp = ADD32ri8 %ebp, 5 + + CMP32rr %eax, killed %ebx, implicit-def %eflags + %ebx = LEA32r killed %eax, 4, killed %eax, 5, _ + JE_1 %bb.1, implicit %eflags + RETQ %ebx + bb.1: + liveins: %eax, %ebp, %ebx + %ebp = LEA32r killed %ebx, 4, killed %ebx, 5, _ + RETQ %ebp + +... + + + diff --git a/test/CodeGen/X86/leaFixup64.mir b/test/CodeGen/X86/leaFixup64.mir new file mode 100644 index 000000000000..9b0058750598 --- /dev/null +++ b/test/CodeGen/X86/leaFixup64.mir @@ -0,0 +1,1041 @@ +# RUN: llc -run-pass x86-fixup-LEAs -mcpu=corei7-avx -o - %s | FileCheck %s +--- | + ; ModuleID = 'lea-2.ll' + source_filename = "lea-2.ll" + target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" + ;generated using: llc -stop-after x86-pad-short-functions lea-2.ll > leaFinxup64.mir + + ;testleaadd_64_32_1: 3 operands LEA64_32r cannot be replaced with 2 add instructions + ; but can be replaced with 1 lea + 1 add + define i32 @testleaadd_64_32_1() { + ret i32 0 + } + + ;testleaadd_rbp_64_32_1: 3 operands LEA64_32r cannot be replaced with 2 add instructions + ; where the base is rbp/r13/ebp register but it can be replaced with 1 lea + 1 add + define i32 @testleaadd_rbp_64_32_1() { + ret i32 0 + } + + ;test1lea_rbp_64_32_1: 2 operands LEA64_32r where base register is rbp/r13/ebp and can not + ; be replaced with an add instruction but can be replaced with 1 lea instruction + define i32 @test1lea_rbp_64_32_1() { + ret i32 0 + } + + ;test2add_64: 3 operands LEA64r that can be replaced with 2 add instructions + define i32 @test2add_64() { + ret i32 0 + } + + ;test2add_rbp_64: 3 operands LEA64r that can be replaced with 2 add instructions + ; where the base is rbp/r13/ebp register + define i32 @test2add_rbp_64() { + ret i32 0 + } + + ;test1add_rbp_64: 2 operands LEA64r where base register is rbp/r13/ebp and can be replaced + ; with an add instruction + define i32 @test1add_rbp_64() { + ret i32 0 + } + + ;testleaadd_64_32: 3 operands LEA64_32r that can be replaced with 1 lea 1 add instructions + define i32 @testleaadd_64_32() { + ret i32 0 + } + + ;testleaadd_rbp_64_32: 3 operands LEA64_32r that can be replaced with 1 lea 1 add instructions + ; where the base is rbp/r13/ebp register + define i32 @testleaadd_rbp_64_32() { + ret i32 0 + } + + ;test1lea_rbp_64_32: 2 operands LEA64_32r where base register is rbp/r13/ebp and can be replaced + ; with a lea instruction + define i32 @test1lea_rbp_64_32() { + ret i32 0 + } + + ;testleaadd_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions + define i32 @testleaadd_64() { + ret i32 0 + } + + ;testleaadd_rbp_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions + ; where the base is rbp/r13/ebp register + define i32 @testleaadd_rbp_64() { + ret i32 0 + } + + ;test1lea_rbp_64: 2 operands LEA64r wher base register is rbp/r13/ebp and can be 
replaced + ; with a lea instruction + define i32 @test1lea_rbp_64() { + ret i32 0 + } + + ;test8: dst = base & scale!=1, can't optimize + define i32 @test8() { + ret i32 0 + } + + ;testleaaddi32_64_32: 3 operands LEA64_32r that can be replaced with 1 lea + 1 add instructions where + ; ADD64ri32 is chosen + define i32 @testleaaddi32_64_32() { + ret i32 0 + } + + ;test1mov1add_rbp_64_32: 2 operands LEA64_32r cannot be replaced with 1 add 1 mov instructions + ; where the base is rbp/r13/ebp register + define i32 @test1mov1add_rbp_64_32() { + ret i32 0 + } + + ;testleaadd_rbp_index_64_32: 3 operands LEA64_32r that cannot replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is offset + define i32 @testleaadd_rbp_index_64_32() { + ret i32 0 + } + + ;testleaadd_rbp_index2_64_32: 3 operands LEA64_32r that cannot replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is scale + define i32 @testleaadd_rbp_index2_64_32() { + ret i32 0 + } + + ;test2addi32_64: 3 operands LEA64r that can be replaced with 2 add instructions where ADD64ri32 + ; is chosen + define i32 @test2addi32_64() { + ret i32 0 + } + + ;test1mov1add_rbp_64: 2 operands LEA64r that can be replaced with 1 add 1 mov instructions + ; where the base is rbp/r13/ebp register + define i32 @test1mov1add_rbp_64() { + ret i32 0 + } + + ;testleaadd_rbp_index_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is offset + define i32 @testleaadd_rbp_index_64() { + ret i32 0 + } + + ;testleaadd_rbp_index2_64: 3 operands LEA64r that can be replaced with 1 lea 1 add instructions + ; where the base and the index are ebp register and there is scale + define i32 @testleaadd_rbp_index2_64() { + ret i32 0 + } + + ;test_skip_opt_64: 3 operands LEA64r that can not be replaced with 2 instructions + define i32 @test_skip_opt_64() { + ret i32 0 + } + + ;test_skip_eflags_64: LEA64r that cannot be replaced since its not safe to clobber eflags + define i32 @test_skip_eflags_64() { + ret i32 0 + } + + ;test_skip_opt_64_32: 3 operands LEA64_32r that can not be replaced with 2 instructions + define i32 @test_skip_opt_64_32() { + ret i32 0 + } + + ;test_skip_eflags_64_32: LEA64_32r that cannot be replaced since its not safe to clobber eflags + define i32 @test_skip_eflags_64_32() { + ret i32 0 + } + + +... +--- +name: testleaadd_64_32_1 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0 + ; CHECK: %eax = ADD32ri8 %eax, -5 + + %eax = LEA64_32r killed %rax, 1, killed %rbp, -5, _ + RETQ %eax + +... 
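+# LEA64_32r takes 64-bit base/index registers but writes a 32-bit result,
+# so its sources cannot feed 32-bit ADDs directly; the pass instead keeps
+# one displacement-free LEA and folds the offset into an add afterwards.
+# Illustrative sketch, AT&T syntax:
+#
+#   leal -5(%rax,%rbp), %eax
+#   -->
+#   leal (%rax,%rbp), %eax     # 32-bit result of the 64-bit address math
+#   addl $-5, %eax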
+--- +name: testleaadd_rbp_64_32_1 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0 + ; CHECK: %ebp = ADD32ri8 %ebp, -5 + + %ebp = LEA64_32r killed %rbp, 1, killed %rax, -5, _ + RETQ %ebp + +... +--- +name: test1lea_rbp_64_32_1 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebp = LEA64_32r killed %rax, 1, killed %rbp, 0 + + %ebp = LEA64_32r killed %rbp, 1, killed %rax, 0, _ + RETQ %ebp + +... +--- +name: test2add_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rax = ADD64rr %rax, killed %rbp + ; CHECK: %rax = ADD64ri8 %rax, -5 + + %rax = LEA64r killed %rax, 1, killed %rbp, -5, _ + RETQ %eax + +... +--- +name: test2add_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbp = ADD64rr %rbp, killed %rax + ; CHECK: %rbp = ADD64ri8 %rbp, -5 + + %rbp = LEA64r killed %rbp, 1, killed %rax, -5, _ + RETQ %ebp + +... 
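+# rbp/r13 (ebp in 32-bit mode) are called out separately most likely for an
+# encoding reason: SIB addressing with one of them as the *base* cannot be
+# encoded without at least a disp8, so the displacement-free LEA emitted by
+# the fixup puts such a register in the index slot instead. Sketch, AT&T
+# syntax:
+#
+#   leal -5(%rbp,%rax), %ebp  -->  leal (%rax,%rbp), %ebp   # rbp as index
+#                                  addl $-5, %ebp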
+--- +name: test1add_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbp = ADD64rr %rbp, killed %rax + + %rbp = LEA64r killed %rbp, 1, killed %rax, 0, _ + RETQ %ebp + +... +--- +name: testleaadd_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _ + ; CHECK: %ebx = ADD32ri8 %ebx, -5 + + %ebx = LEA64_32r killed %rax, 1, killed %rbp, -5, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _ + ; CHECK: %ebx = ADD32ri8 %ebx, -5 + + %ebx = LEA64_32r killed %rbp, 1, killed %rax, -5, _ + RETQ %ebx + +... +--- +name: test1lea_rbp_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %ebx = LEA64_32r killed %rax, 1, killed %rbp, 0, _ + + %ebx = LEA64_32r killed %rbp, 1, killed %rax, 0, _ + RETQ %ebx + +... 
+--- +name: testleaadd_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _ + ; CHECK: %rbx = ADD64ri8 %rbx, -5 + + %rbx = LEA64r killed %rax, 1, killed %rbp, -5, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _ + ; CHECK: %rbx = ADD64ri8 %rbx, -5 + + %rbx = LEA64r killed %rbp, 1, killed %rax, -5, _ + RETQ %ebx + +... +--- +name: test1lea_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } + - { reg: '%rbx' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rbx = LEA64r killed %rax, 1, killed %rbp, 0, _ + + %rbx = LEA64r killed %rbp, 1, killed %rax, 0, _ + RETQ %ebx + +... +--- +name: test8 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rdi' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rdi, %rbp + ; CHECK: %r12 = LEA64r _, 2, killed %r13, 5, _ + ; CHECK: %r12 = ADD64rr %r12, killed %rbp + %rbp = KILL %rbp, implicit-def %rbp + %r13 = KILL %rdi, implicit-def %r13 + %r12 = LEA64r killed %rbp, 2, killed %r13, 5, _ + RETQ %r12 + +... 
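+# In test8 the scale is 2, so no plain ADD can form 2*r13; the scaled index
+# and displacement stay in a base-less LEA and the base register is added
+# afterwards. Illustrative sketch, AT&T syntax:
+#
+#   leaq 5(%rbp,%r13,2), %r12
+#   -->
+#   leaq 5(,%r13,2), %r12      # scaled index + displacement, no base
+#   addq %rbp, %r12            # then add the base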
+--- +name: testleaaddi32_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %eax = LEA64_32r killed %rax, 1, killed %rbp, 0 + ; CHECK: %eax = ADD32ri %eax, 129 + + %eax = LEA64_32r killed %rax, 1, killed %rbp, 129, _ + RETQ %eax + +... +--- +name: test1mov1add_rbp_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _ + + %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 0, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_index_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _ + + %ebx = LEA64_32r killed %rbp, 1, killed %rbp, 5, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_index2_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %eax, %ebp, %ebx + ; CHECK: %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _ + + %ebx = LEA64_32r killed %rbp, 4, killed %rbp, 5, _ + RETQ %ebx + +... 
+--- +name: test2addi32_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp + ; CHECK: %rax = ADD64rr %rax, killed %rbp + ; CHECK: %rax = ADD64ri32 %rax, 129 + + %rax = LEA64r killed %rax, 1, killed %rbp, 129, _ + RETQ %eax + +... +--- +name: test1mov1add_rbp_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rax' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbx = MOV64rr killed %rbp + ; CHECK: %rbx = ADD64rr %rbx, killed %rbp + + %rbx = LEA64r killed %rbp, 1, killed %rbp, 0, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_index_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbx = LEA64r _, 1, killed %rbp, 5, _ + ; CHECK: %rbx = ADD64rr %rbx, killed %rbp + + %rbx = LEA64r killed %rbp, 1, killed %rbp, 5, _ + RETQ %ebx + +... +--- +name: testleaadd_rbp_index2_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbx = LEA64r _, 4, killed %rbp, 5, _ + ; CHECK: %rbx = ADD64rr %rbx, killed %rbp + + %rbx = LEA64r killed %rbp, 4, killed %rbp, 5, _ + RETQ %ebx + +... 
+--- +name: test_skip_opt_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _ + + %rbp = LEA64r killed %rbp, 4, killed %rbp, 0, _ + RETQ %ebp + +... +--- +name: test_skip_eflags_64 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbp' } + - { reg: '%rax' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %rbx = LEA64r killed %rax, 4, killed %rax, 5, _ + ; CHECK: %rbp = LEA64r killed %rbx, 4, killed %rbx, 0, _ + ; CHECK: %rbp = ADD64ri8 %rbp, 5 + + CMP64rr %rax, killed %rbx, implicit-def %eflags + %rbx = LEA64r killed %rax, 4, killed %rax, 5, _ + JE_1 %bb.1, implicit %eflags + RETQ %ebx + bb.1: + liveins: %rax, %rbp, %rbx + %rbp = LEA64r killed %rbx, 4, killed %rbx, 5, _ + RETQ %ebp + +... +--- +name: test_skip_opt_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbx' } + - { reg: '%rbp' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _ + + %ebp = LEA64_32r killed %rbp, 4, killed %rbp, 0, _ + RETQ %ebp + +... +--- +name: test_skip_eflags_64_32 +alignment: 4 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +tracksRegLiveness: true +liveins: + - { reg: '%rbp' } + - { reg: '%rax' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 0 + offsetAdjustment: 0 + maxAlignment: 0 + adjustsStack: false + hasCalls: false + maxCallFrameSize: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false +body: | + bb.0 (%ir-block.0): + liveins: %rax, %rbp, %rbx + ; CHECK: %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _ + ; CHECK: %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 0, _ + ; CHECK: %ebp = ADD32ri8 %ebp, 5 + + CMP64rr %rax, killed %rbx, implicit-def %eflags + %ebx = LEA64_32r killed %rax, 4, killed %rax, 5, _ + JE_1 %bb.1, implicit %eflags + RETQ %ebx + bb.1: + liveins: %rax, %rbp, %rbx + %ebp = LEA64_32r killed %rbx, 4, killed %rbx, 5, _ + RETQ %ebp + +... 
+ + + diff --git a/test/CodeGen/X86/lrshrink.ll b/test/CodeGen/X86/lrshrink.ll new file mode 100644 index 000000000000..a9cf086dbd90 --- /dev/null +++ b/test/CodeGen/X86/lrshrink.ll @@ -0,0 +1,57 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + +; Checks if "%7 = add nuw nsw i64 %4, %2" is moved before the last call +; to minimize live-range. + +define i64 @test(i1 %a, i64 %r1, i64 %r2, i64 %s1, i64 %s2, i64 %t1, i64 %t2) { +entry: + br i1 %a, label %then, label %else + +then: + br label %else + +else: + %0 = phi i64 [ 4, %entry ], [ 10, %then ] + %r = phi i64 [ %r1, %entry ], [ %r2, %then ] + %s = phi i64 [ %s1, %entry ], [ %s2, %then ] + %t = phi i64 [ %t1, %entry ], [ %t2, %then ] +; CHECK-LABEL: test: +; CHECK: add +; CHECK: add +; CHECK: call +; CHECK: add +; CHECK: call +; CHECK: add +; CHECK: call +; CHECK: add + %1 = tail call i32 @_Z3foov() + %2 = zext i32 %1 to i64 + %3 = tail call i32 @_Z3foov() + %4 = zext i32 %3 to i64 + %5 = tail call i32 @_Z3foov() + %6 = zext i32 %5 to i64 + %7 = add nuw nsw i64 %0, %r + tail call void @llvm.dbg.value(metadata i64 %7, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 + %8 = add nuw nsw i64 %2, %7 + %9 = add nuw nsw i64 %4, %8 + %10 = add nuw nsw i64 %6, %9 + %11 = add nuw nsw i64 %s, %t + tail call void @llvm.dbg.value(metadata i64 %11, i64 0, metadata !5, metadata !DIExpression()), !dbg !6 + %12 = add nuw nsw i64 %10, %11 + ret i64 %12 +} + +declare i32 @_Z3foov() +declare void @llvm.dbg.value(metadata, i64, metadata, metadata) + +!llvm.dbg.cu = !{!0} +!llvm.module.flags = !{!1, !2} + +!0 = distinct !DICompileUnit(language: DW_LANG_C_plus_plus, file: !3, emissionKind: FullDebug) +!1 = !{i32 2, !"Dwarf Version", i32 4} +!2 = !{i32 2, !"Debug Info Version", i32 3} +!3 = !DIFile(filename: "a.c", directory: "./") +!4 = distinct !DISubprogram(name: "test", scope: !3, unit: !0) +!5 = !DILocalVariable(name: "x", scope: !4) +!6 = !DILocation(line: 4, scope: !4) diff --git a/test/CodeGen/X86/madd.ll b/test/CodeGen/X86/madd.ll index d332b2f3169f..af86df510016 100644 --- a/test/CodeGen/X86/madd.ll +++ b/test/CodeGen/X86/madd.ll @@ -129,9 +129,9 @@ define i32 @test_unsigned_short(i16* nocapture readonly, i16* nocapture readonly ; SSE2-NEXT: pmullw %xmm2, %xmm3 ; SSE2-NEXT: movdqa %xmm3, %xmm2 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] +; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm4[4],xmm3[5],xmm4[5],xmm3[6],xmm4[6],xmm3[7],xmm4[7] ; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm0 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-8, %rax @@ -246,23 +246,23 @@ define i32 @_Z9test_charPcS_i(i8* nocapture readonly, i8* nocapture readonly, i3 ; SSE2-NEXT: pmullw %xmm4, %xmm5 ; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] ; SSE2-NEXT: psrad $16, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm5 -; SSE2-NEXT: movq {{.*#+}} xmm6 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm6 -; SSE2-NEXT: movq {{.*#+}} xmm7 = mem[0],zero -; SSE2-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSE2-NEXT: psraw $8, %xmm7 -; SSE2-NEXT: pmullw %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = 
xmm6[0],xmm7[0],xmm6[1],xmm7[1],xmm6[2],xmm7[2],xmm6[3],xmm7[3] -; SSE2-NEXT: psrad $16, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4,4,5,5,6,6,7,7] -; SSE2-NEXT: psrad $16, %xmm7 -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: paddd %xmm6, %xmm3 -; SSE2-NEXT: paddd %xmm5, %xmm1 ; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: movq {{.*#+}} xmm4 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm4 +; SSE2-NEXT: movq {{.*#+}} xmm5 = mem[0],zero +; SSE2-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: psraw $8, %xmm5 +; SSE2-NEXT: pmullw %xmm4, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7] +; SSE2-NEXT: psrad $16, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm2 ; SSE2-NEXT: addq $16, %rsi ; SSE2-NEXT: addq $16, %rdi ; SSE2-NEXT: addq $-16, %rax diff --git a/test/CodeGen/X86/masked_gather_scatter.ll b/test/CodeGen/X86/masked_gather_scatter.ll index 29a662fb217e..c5de8dd96cbc 100644 --- a/test/CodeGen/X86/masked_gather_scatter.ll +++ b/test/CodeGen/X86/masked_gather_scatter.ll @@ -3,7 +3,7 @@ ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32 ; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX ; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32 -; RUN: opt -mtriple=x86_64-apple-darwin -codegenprepare -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR +; RUN: opt -mtriple=x86_64-apple-darwin -scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR ; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/test/CodeGen/X86/merge-consecutive-loads-128.ll b/test/CodeGen/X86/merge-consecutive-loads-128.ll index 71417694b0d4..2f7714e63886 100644 --- a/test/CodeGen/X86/merge-consecutive-loads-128.ll +++ b/test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -270,9 +270,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s ; SSE2: # BB#0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_012u: @@ -292,9 +292,9 @@ define <4 x float> @merge_4f32_f32_012u(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_012u: @@ -321,9 +321,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s ; SSE2: # BB#0: ; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; SSE2-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; SSE2-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; SSE2-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; SSE2-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: merge_4f32_f32_019u: @@ -343,9 +343,9 @@ define <4 x float> @merge_4f32_f32_019u(float* %ptr) nounwind uwtable noinline s ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-SSE1-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero ; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] -; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-SSE1-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X32-SSE1-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-SSE1-NEXT: retl ; ; X32-SSE41-LABEL: merge_4f32_f32_019u: diff --git a/test/CodeGen/X86/misched-matrix.ll b/test/CodeGen/X86/misched-matrix.ll index e62a1d04dad6..94bbe75702cb 100644 --- a/test/CodeGen/X86/misched-matrix.ll +++ b/test/CodeGen/X86/misched-matrix.ll @@ -17,9 +17,9 @@ ; ; TOPDOWN-LABEL: %for.body ; TOPDOWN: movl %{{.*}}, ( -; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN-NOT: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 4( -; TOPDOWN: imull {{[0-9]*}}( +; TOPDOWN-NOT: imull {{[0-9]*}}( ; TOPDOWN: movl %{{.*}}, 8( ; TOPDOWN: movl %{{.*}}, 12( ; TOPDOWN-LABEL: %for.end diff --git a/test/CodeGen/X86/not-and-simplify.ll b/test/CodeGen/X86/not-and-simplify.ll index dfce6c681500..83b2be83d552 100644 --- a/test/CodeGen/X86/not-and-simplify.ll +++ b/test/CodeGen/X86/not-and-simplify.ll @@ -1,3 +1,4 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefix=ALL --check-prefix=NO_BMI ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefix=ALL --check-prefix=BMI @@ -11,13 +12,24 @@ define i32 @shrink_xor_constant1(i32 %x) { ; ALL-NEXT: xorl $1, %edi ; ALL-NEXT: movl %edi, %eax ; ALL-NEXT: retq -; %sh = lshr i32 %x, 31 %not = xor i32 %sh, -1 %and = and i32 %not, 1 ret i32 %and } +define <4 x i32> @shrink_xor_constant1_splat(<4 x i32> %x) { +; ALL-LABEL: shrink_xor_constant1_splat: +; ALL: # BB#0: +; ALL-NEXT: psrld $31, %xmm0 +; ALL-NEXT: pandn {{.*}}(%rip), %xmm0 +; ALL-NEXT: retq + %sh = lshr <4 x i32> %x, + %not = xor <4 x i32> %sh, + %and = and <4 x i32> %not, + ret <4 x i32> %and +} + ; Clear low bits via shift, set them with xor (not), then mask them off. 
define i8 @shrink_xor_constant2(i8 %x) { @@ -27,10 +39,22 @@ define i8 @shrink_xor_constant2(i8 %x) { ; ALL-NEXT: xorb $-32, %dil ; ALL-NEXT: movl %edi, %eax ; ALL-NEXT: retq -; %sh = shl i8 %x, 5 %not = xor i8 %sh, -1 %and = and i8 %not, 224 ; 0xE0 ret i8 %and } +define <16 x i8> @shrink_xor_constant2_splat(<16 x i8> %x) { +; ALL-LABEL: shrink_xor_constant2_splat: +; ALL: # BB#0: +; ALL-NEXT: psllw $5, %xmm0 +; ALL-NEXT: pand {{.*}}(%rip), %xmm0 +; ALL-NEXT: pandn {{.*}}(%rip), %xmm0 +; ALL-NEXT: retq + %sh = shl <16 x i8> %x, + %not = xor <16 x i8> %sh, + %and = and <16 x i8> %not, + ret <16 x i8> %and +} + diff --git a/test/CodeGen/X86/oddshuffles.ll b/test/CodeGen/X86/oddshuffles.ll index d26cf02dd942..0bda41a30c69 100644 --- a/test/CodeGen/X86/oddshuffles.ll +++ b/test/CodeGen/X86/oddshuffles.ll @@ -746,9 +746,9 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; SSE2-LABEL: interleave_24i8_in: ; SSE2: # BB#0: ; SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE2-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] +; SSE2-NEXT: punpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm0[0] +; SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE2-NEXT: pxor %xmm2, %xmm2 ; SSE2-NEXT: movdqa %xmm1, %xmm3 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7] @@ -791,17 +791,17 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; SSE42: # BB#0: ; SSE42-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE42-NEXT: movq {{.*#+}} xmm2 = mem[0],zero ; SSE42-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; SSE42-NEXT: movdqa %xmm0, %xmm1 -; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,8],zero,xmm1[1,9],zero,xmm1[2,10],zero,xmm1[3,11],zero,xmm1[4,12],zero,xmm1[5] -; SSE42-NEXT: movdqa %xmm2, %xmm3 +; SSE42-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8],zero,xmm2[1,9],zero,xmm2[2,10],zero,xmm2[3,11],zero,xmm2[4,12],zero,xmm2[5] +; SSE42-NEXT: movdqa %xmm1, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm3 = zero,zero,xmm3[0],zero,zero,xmm3[1],zero,zero,xmm3[2],zero,zero,xmm3[3],zero,zero,xmm3[4],zero -; SSE42-NEXT: por %xmm1, %xmm3 +; SSE42-NEXT: por %xmm2, %xmm3 ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; SSE42-NEXT: pshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] -; SSE42-NEXT: por %xmm0, %xmm2 -; SSE42-NEXT: movq %xmm2, 16(%rdi) +; SSE42-NEXT: pshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] +; SSE42-NEXT: por %xmm0, %xmm1 +; SSE42-NEXT: movq %xmm1, 16(%rdi) ; SSE42-NEXT: movdqu %xmm3, (%rdi) ; SSE42-NEXT: retq ; @@ -809,16 +809,16 @@ define void @interleave_24i8_in(<24 x i8>* %p, <8 x i8>* %q1, <8 x i8>* %q2, <8 ; AVX: # BB#0: ; AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero -; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0] -; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] -; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero -; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1 +; 
AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5] +; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm1[0],zero,zero,xmm1[1],zero,zero,xmm1[2],zero,zero,xmm1[3],zero,zero,xmm1[4],zero +; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u] -; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u] -; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0 +; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[5],zero,zero,xmm1[6],zero,zero,xmm1[7,u,u,u,u,u,u,u,u] +; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0 ; AVX-NEXT: vmovq %xmm0, 16(%rdi) -; AVX-NEXT: vmovdqu %xmm1, (%rdi) +; AVX-NEXT: vmovdqu %xmm2, (%rdi) ; AVX-NEXT: retq %s1 = load <8 x i8>, <8 x i8>* %q1, align 4 %s2 = load <8 x i8>, <8 x i8>* %q2, align 4 diff --git a/test/CodeGen/X86/packss.ll b/test/CodeGen/X86/packss.ll index 5cd649bb3902..24db6ba9ca2f 100644 --- a/test/CodeGen/X86/packss.ll +++ b/test/CodeGen/X86/packss.ll @@ -26,18 +26,17 @@ define <4 x i32> @trunc_ashr_v4i64(<4 x i64> %a) nounwind { ; X64-AVX1-LABEL: trunc_ashr_v4i64: ; X64-AVX1: # BB#0: ; X64-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 -; X64-AVX1-NEXT: vpsrad $31, %xmm1, %xmm1 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; X64-AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 -; X64-AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; X64-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X64-AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; X64-AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 ; X64-AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; X64-AVX1-NEXT: vzeroupper ; X64-AVX1-NEXT: retq ; ; X64-AVX2-LABEL: trunc_ashr_v4i64: ; X64-AVX2: # BB#0: -; X64-AVX2-NEXT: vpsrad $31, %ymm0, %ymm0 -; X64-AVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] +; X64-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; X64-AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm0 ; X64-AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 ; X64-AVX2-NEXT: vpacksswb %xmm1, %xmm0, %xmm0 ; X64-AVX2-NEXT: vzeroupper diff --git a/test/CodeGen/X86/pmul.ll b/test/CodeGen/X86/pmul.ll index 88cb7a6d5825..50a661fcca11 100644 --- a/test/CodeGen/X86/pmul.ll +++ b/test/CodeGen/X86/pmul.ll @@ -1152,9 +1152,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; SSE2-NEXT: punpckhdq {{.*#+}} xmm0 = xmm0[2],xmm3[2],xmm0[3],xmm3[3] ; SSE2-NEXT: movdqa %xmm1, %xmm4 ; SSE2-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1] +; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm3[2],xmm1[3],xmm3[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm1 -; SSE2-NEXT: pmuludq %xmm4, %xmm2 ; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,3],xmm1[1,3] ; SSE2-NEXT: movaps %xmm2, %xmm0 ; SSE2-NEXT: retq @@ -1166,9 +1166,9 @@ define <4 x i32> @mul_v4i64_zero_upper(<4 x i32> %val1, <4 x i32> %val2) { ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm3 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pmuludq %xmm3, %xmm0 -; SSE41-NEXT: pmuludq %xmm2, %xmm4 ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: retq ; @@ -1312,17 +1312,17 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; SSE2-NEXT: movdqa %xmm1, %xmm5 ; SSE2-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1] ; 
SSE2-NEXT: punpckhdq {{.*#+}} xmm1 = xmm1[2],xmm6[2],xmm1[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm2, %xmm8 -; SSE2-NEXT: punpckldq {{.*#+}} xmm8 = xmm8[0],xmm6[0],xmm8[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] -; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: movdqa %xmm2, %xmm7 ; SSE2-NEXT: punpckldq {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1] -; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] -; SSE2-NEXT: pmuludq %xmm1, %xmm3 -; SSE2-NEXT: pmuludq %xmm7, %xmm5 +; SSE2-NEXT: pmuludq %xmm7, %xmm4 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm2 = xmm2[2],xmm6[2],xmm2[3],xmm6[3] ; SSE2-NEXT: pmuludq %xmm0, %xmm2 -; SSE2-NEXT: pmuludq %xmm8, %xmm4 ; SSE2-NEXT: shufps {{.*#+}} xmm4 = xmm4[1,3],xmm2[1,3] +; SSE2-NEXT: movdqa %xmm3, %xmm0 +; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1] +; SSE2-NEXT: pmuludq %xmm0, %xmm5 +; SSE2-NEXT: punpckhdq {{.*#+}} xmm3 = xmm3[2],xmm6[2],xmm3[3],xmm6[3] +; SSE2-NEXT: pmuludq %xmm1, %xmm3 ; SSE2-NEXT: shufps {{.*#+}} xmm5 = xmm5[1,3],xmm3[1,3] ; SSE2-NEXT: movaps %xmm4, %xmm0 ; SSE2-NEXT: movaps %xmm5, %xmm1 @@ -1331,22 +1331,22 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; SSE41-LABEL: mul_v8i64_zero_upper: ; SSE41: # BB#0: # %entry ; SSE41-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm8 = xmm4[0],zero,xmm4[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm4[0],zero,xmm4[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm5 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm6 = xmm0[0],zero,xmm0[1],zero ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm7 = xmm1[0],zero,xmm1[1],zero ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] -; SSE41-NEXT: pmovzxdq {{.*#+}} xmm4 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm0[0],zero,xmm0[1],zero +; SSE41-NEXT: pmuludq %xmm4, %xmm1 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm2[0],zero,xmm2[1],zero +; SSE41-NEXT: pmuludq %xmm5, %xmm0 +; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3] ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm3[2,3,0,1] ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero +; SSE41-NEXT: pmuludq %xmm6, %xmm2 ; SSE41-NEXT: pmovzxdq {{.*#+}} xmm1 = xmm3[0],zero,xmm3[1],zero ; SSE41-NEXT: pmuludq %xmm7, %xmm1 -; SSE41-NEXT: pmuludq %xmm6, %xmm2 -; SSE41-NEXT: pmuludq %xmm5, %xmm0 -; SSE41-NEXT: pmuludq %xmm8, %xmm4 -; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm4[1,3] ; SSE41-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm2[1,3] ; SSE41-NEXT: retq ; @@ -1356,11 +1356,11 @@ define <8 x i32> @mul_v8i64_zero_upper(<8 x i32> %val1, <8 x i32> %val2) { ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm3 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm1 ; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm1 -; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm1[1,3],ymm0[1,3],ymm1[5,7],ymm0[5,7] +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[1,3],ymm0[1,3],ymm2[5,7],ymm0[5,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] ; AVX2-NEXT: retq ; @@ -1467,22 +1467,22 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; SSE41-LABEL: mul_v8i64_sext: ; SSE41: # BB#0: ; 
SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[3,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm8 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm4 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[2,3,0,1] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm5 ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,1,2,3] -; SSE41-NEXT: pmovsxwq %xmm3, %xmm7 -; SSE41-NEXT: pmovsxwq %xmm0, %xmm5 +; SSE41-NEXT: pmovsxwq %xmm3, %xmm6 +; SSE41-NEXT: pmovsxwq %xmm0, %xmm7 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm3 +; SSE41-NEXT: pmuldq %xmm4, %xmm3 ; SSE41-NEXT: pmovsxdq %xmm2, %xmm2 +; SSE41-NEXT: pmuldq %xmm5, %xmm2 ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE41-NEXT: pmovsxdq %xmm0, %xmm4 +; SSE41-NEXT: pmuldq %xmm6, %xmm4 ; SSE41-NEXT: pmovsxdq %xmm1, %xmm0 -; SSE41-NEXT: pmuldq %xmm5, %xmm0 -; SSE41-NEXT: pmuldq %xmm7, %xmm4 -; SSE41-NEXT: pmuldq %xmm6, %xmm2 -; SSE41-NEXT: pmuldq %xmm8, %xmm3 +; SSE41-NEXT: pmuldq %xmm7, %xmm0 ; SSE41-NEXT: movdqa %xmm4, %xmm1 ; SSE41-NEXT: retq ; @@ -1493,9 +1493,10 @@ define <8 x i64> @mul_v8i64_sext(<8 x i16> %val1, <8 x i32> %val2) { ; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm1, %xmm3 ; AVX2-NEXT: vpmovsxdq %xmm3, %ymm3 +; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpmovsxdq %xmm1, %ymm1 ; AVX2-NEXT: vpmuldq %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vpmuldq %ymm3, %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa %ymm2, %ymm1 ; AVX2-NEXT: retq ; ; AVX512-LABEL: mul_v8i64_sext: diff --git a/test/CodeGen/X86/pr28129.ll b/test/CodeGen/X86/pr28129.ll index a155f71f79c3..15bffffa207f 100644 --- a/test/CodeGen/X86/pr28129.ll +++ b/test/CodeGen/X86/pr28129.ll @@ -5,15 +5,15 @@ define <4 x double> @cmp4f64_domain(<4 x double> %a) { ; X86-LABEL: cmp4f64_domain: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: cmp4f64_domain: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmp = fcmp oeq <4 x double> zeroinitializer, zeroinitializer @@ -26,15 +26,15 @@ define <4 x double> @cmp4f64_domain(<4 x double> %a) { define <4 x double> @cmp4f64_domain_optsize(<4 x double> %a) optsize { ; X86-LABEL: cmp4f64_domain_optsize: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: cmp4f64_domain_optsize: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vaddpd %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmp = fcmp oeq <4 x double> zeroinitializer, zeroinitializer @@ -47,15 +47,15 @@ define <4 x double> @cmp4f64_domain_optsize(<4 x double> %a) optsize { define <8 x float> @cmp8f32_domain(<8 x float> %a) { ; X86-LABEL: cmp8f32_domain: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; 
X64-LABEL: cmp8f32_domain: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmp = fcmp oeq <8 x float> zeroinitializer, zeroinitializer @@ -68,15 +68,15 @@ define <8 x float> @cmp8f32_domain(<8 x float> %a) { define <8 x float> @cmp8f32_domain_optsize(<8 x float> %a) optsize { ; X86-LABEL: cmp8f32_domain_optsize: ; X86: # BB#0: -; X86-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X86-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X86-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X86-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: cmp8f32_domain_optsize: ; X64: # BB#0: -; X64-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; X64-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; X64-NEXT: vaddps %ymm1, %ymm0, %ymm0 ; X64-NEXT: retq %cmp = fcmp oeq <8 x float> zeroinitializer, zeroinitializer diff --git a/test/CodeGen/X86/pr29112.ll b/test/CodeGen/X86/pr29112.ll index 8c970b3d4771..94904018872b 100644 --- a/test/CodeGen/X86/pr29112.ll +++ b/test/CodeGen/X86/pr29112.ll @@ -38,7 +38,8 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm8[0],xmm0[0],xmm8[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm2[1],xmm1[3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm14 = xmm1[0,1,2],xmm3[1] -; CHECK-NEXT: vinsertps {{.*#+}} xmm10 = xmm10[0,1,2],xmm3[1] +; CHECK-NEXT: vinsertps {{.*#+}} xmm1 = xmm10[0,1,2],xmm3[1] +; CHECK-NEXT: vaddps %xmm14, %xmm1, %xmm10 ; CHECK-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,0] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm8[0],xmm0[0],xmm8[2,3] ; CHECK-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm2[1],xmm0[3] @@ -52,10 +53,9 @@ define <4 x float> @bar(<4 x float>* %a1p, <4 x float>* %a2p, <4 x float> %a3, < ; CHECK-NEXT: vmovaps %xmm15, %xmm1 ; CHECK-NEXT: vmovaps %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm9 -; CHECK-NEXT: vaddps %xmm14, %xmm10, %xmm0 ; CHECK-NEXT: vaddps %xmm1, %xmm1, %xmm8 -; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm3 -; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm11, %xmm3, %xmm0 +; CHECK-NEXT: vaddps %xmm10, %xmm0, %xmm0 ; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0 ; CHECK-NEXT: vmovaps %xmm8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: vmovaps %xmm9, (%rsp) diff --git a/test/CodeGen/X86/pr30562.ll b/test/CodeGen/X86/pr30562.ll index dda736a1a183..a8e648074194 100644 --- a/test/CodeGen/X86/pr30562.ll +++ b/test/CodeGen/X86/pr30562.ll @@ -1,5 +1,6 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s + define i32 @foo(i64* nocapture %perm, i32 %n) { entry: br label %body diff --git a/test/CodeGen/X86/pr31088.ll b/test/CodeGen/X86/pr31088.ll index 0dd8eb0ece85..d7a546c7396d 100644 --- a/test/CodeGen/X86/pr31088.ll +++ b/test/CodeGen/X86/pr31088.ll @@ -150,12 +150,12 @@ define <2 x half> @ir_fadd_v2f16(<2 x half> %arg0, <2 x half> %arg1) nounwind { ; F16C-NEXT: vcvtph2ps %xmm3, %xmm3 ; F16C-NEXT: vcvtps2ph $4, %xmm1, %xmm1 ; F16C-NEXT: vcvtph2ps %xmm1, %xmm1 +; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; F16C-NEXT: vcvtps2ph $4, %xmm2, %xmm2 ; F16C-NEXT: vcvtph2ps %xmm2, %xmm2 ; F16C-NEXT: vcvtps2ph $4, %xmm0, %xmm0 ; F16C-NEXT: vcvtph2ps %xmm0, %xmm0 ; F16C-NEXT: vaddss %xmm2, %xmm0, %xmm0 -; F16C-NEXT: vaddss %xmm3, %xmm1, %xmm1 ; F16C-NEXT: 
retq %retval = fadd <2 x half> %arg0, %arg1 ret <2 x half> %retval diff --git a/test/CodeGen/X86/pr32284.ll b/test/CodeGen/X86/pr32284.ll index e05fc926b080..143e3af82eb7 100644 --- a/test/CodeGen/X86/pr32284.ll +++ b/test/CodeGen/X86/pr32284.ll @@ -30,25 +30,24 @@ define void @foo() { ; X86-O0-NEXT: subl $12, %esp ; X86-O0-NEXT: .Lcfi0: ; X86-O0-NEXT: .cfi_def_cfa_offset 16 -; X86-O0-NEXT: movzbl c, %eax -; X86-O0-NEXT: testl %eax, %eax -; X86-O0-NEXT: setne %cl -; X86-O0-NEXT: movl %eax, %edx -; X86-O0-NEXT: movb %dl, %ch -; X86-O0-NEXT: testb %ch, %ch +; X86-O0-NEXT: movb c, %al +; X86-O0-NEXT: testb %al, %al ; X86-O0-NEXT: setne {{[0-9]+}}(%esp) -; X86-O0-NEXT: movzbl %cl, %edx -; X86-O0-NEXT: subl %eax, %edx -; X86-O0-NEXT: setle %cl -; X86-O0-NEXT: # implicit-def: %EAX -; X86-O0-NEXT: movb %cl, %al -; X86-O0-NEXT: andl $1, %eax -; X86-O0-NEXT: kmovd %eax, %k0 -; X86-O0-NEXT: kmovd %k0, %eax +; X86-O0-NEXT: movzbl c, %ecx +; X86-O0-NEXT: testl %ecx, %ecx +; X86-O0-NEXT: setne %al +; X86-O0-NEXT: movzbl %al, %edx +; X86-O0-NEXT: subl %ecx, %edx +; X86-O0-NEXT: setle %al +; X86-O0-NEXT: # implicit-def: %ECX ; X86-O0-NEXT: movb %al, %cl -; X86-O0-NEXT: andb $1, %cl -; X86-O0-NEXT: movzbl %cl, %eax -; X86-O0-NEXT: movl %eax, {{[0-9]+}}(%esp) +; X86-O0-NEXT: andl $1, %ecx +; X86-O0-NEXT: kmovd %ecx, %k0 +; X86-O0-NEXT: kmovd %k0, %ecx +; X86-O0-NEXT: movb %cl, %al +; X86-O0-NEXT: andb $1, %al +; X86-O0-NEXT: movzbl %al, %ecx +; X86-O0-NEXT: movl %ecx, {{[0-9]+}}(%esp) ; X86-O0-NEXT: movl %edx, (%esp) # 4-byte Spill ; X86-O0-NEXT: addl $12, %esp ; X86-O0-NEXT: retl @@ -69,27 +68,25 @@ define void @foo() { ; ; X64-O0-LABEL: foo: ; X64-O0: # BB#0: # %entry -; X64-O0-NEXT: movzbl {{.*}}(%rip), %eax -; X64-O0-NEXT: movl %eax, %ecx -; X64-O0-NEXT: movb %cl, %dl -; X64-O0-NEXT: movl %ecx, %eax -; X64-O0-NEXT: testq %rcx, %rcx -; X64-O0-NEXT: setne %sil -; X64-O0-NEXT: testb %dl, %dl +; X64-O0-NEXT: movb {{.*}}(%rip), %al +; X64-O0-NEXT: testb %al, %al ; X64-O0-NEXT: setne -{{[0-9]+}}(%rsp) -; X64-O0-NEXT: movzbl %sil, %edi -; X64-O0-NEXT: subl %eax, %edi -; X64-O0-NEXT: setle %dl -; X64-O0-NEXT: # implicit-def: %EAX -; X64-O0-NEXT: movb %dl, %al -; X64-O0-NEXT: andl $1, %eax -; X64-O0-NEXT: kmovd %eax, %k0 -; X64-O0-NEXT: kmovd %k0, %eax -; X64-O0-NEXT: movb %al, %dl -; X64-O0-NEXT: andb $1, %dl -; X64-O0-NEXT: movzbl %dl, %eax -; X64-O0-NEXT: movl %eax, -{{[0-9]+}}(%rsp) -; X64-O0-NEXT: movl %edi, -{{[0-9]+}}(%rsp) # 4-byte Spill +; X64-O0-NEXT: movzbl {{.*}}(%rip), %ecx +; X64-O0-NEXT: testl %ecx, %ecx +; X64-O0-NEXT: setne %al +; X64-O0-NEXT: movzbl %al, %edx +; X64-O0-NEXT: subl %ecx, %edx +; X64-O0-NEXT: setle %al +; X64-O0-NEXT: # implicit-def: %ECX +; X64-O0-NEXT: movb %al, %cl +; X64-O0-NEXT: andl $1, %ecx +; X64-O0-NEXT: kmovd %ecx, %k0 +; X64-O0-NEXT: kmovd %k0, %ecx +; X64-O0-NEXT: movb %cl, %al +; X64-O0-NEXT: andb $1, %al +; X64-O0-NEXT: movzbl %al, %ecx +; X64-O0-NEXT: movl %ecx, -{{[0-9]+}}(%rsp) +; X64-O0-NEXT: movl %edx, -{{[0-9]+}}(%rsp) # 4-byte Spill ; X64-O0-NEXT: retq entry: %a = alloca i8, align 1 diff --git a/test/CodeGen/X86/pr32907.ll b/test/CodeGen/X86/pr32907.ll index bc03fbe06843..8057b31c961c 100644 --- a/test/CodeGen/X86/pr32907.ll +++ b/test/CodeGen/X86/pr32907.ll @@ -5,41 +5,44 @@ ; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx512f | FileCheck %s --check-prefix=AVX --check-prefix=AVX512 define <2 x i64> @PR32907(<2 x i64> %astype.i, <2 x i64> %astype6.i) { -; SSE-LABEL: PR32907: -; SSE: # BB#0: # %entry -; SSE-NEXT: psubq %xmm1, %xmm0 -; SSE-NEXT: movdqa 
%xmm0, %xmm1 -; SSE-NEXT: psrad $31, %xmm1 -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] -; SSE-NEXT: pxor %xmm1, %xmm1 -; SSE-NEXT: psubq %xmm0, %xmm1 -; SSE-NEXT: pand %xmm2, %xmm1 -; SSE-NEXT: pandn %xmm0, %xmm2 -; SSE-NEXT: por %xmm2, %xmm1 -; SSE-NEXT: movdqa %xmm1, %xmm0 -; SSE-NEXT: retq +; SSE2-LABEL: PR32907: +; SSE2: # BB#0: # %entry +; SSE2-NEXT: psubq %xmm1, %xmm0 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,1,3,3] +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: psubq %xmm0, %xmm1 +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm0 +; SSE2-NEXT: retq +; +; SSE42-LABEL: PR32907: +; SSE42: # BB#0: # %entry +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pcmpgtq %xmm0, %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm0 +; SSE42-NEXT: psubq %xmm1, %xmm0 +; SSE42-NEXT: retq ; ; AVX2-LABEL: PR32907: ; AVX2: # BB#0: # %entry ; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 -; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsubq %xmm0, %xmm2, %xmm2 -; AVX2-NEXT: vpandn %xmm0, %xmm1, %xmm0 -; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1 +; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; AVX2-NEXT: retq ; ; AVX512-LABEL: PR32907: ; AVX512: # BB#0: # %entry ; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vpsraq $63, %zmm0, %zmm1 -; AVX512-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512-NEXT: vpsubq %xmm0, %xmm2, %xmm2 -; AVX512-NEXT: vpandn %xmm0, %xmm1, %xmm0 -; AVX512-NEXT: vpand %xmm2, %xmm1, %xmm1 -; AVX512-NEXT: vpor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpxor %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vpsubq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq entry: diff --git a/test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll b/test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll new file mode 100644 index 000000000000..9a5da33223ba --- /dev/null +++ b/test/CodeGen/X86/replace_unsupported_masked_mem_intrin.ll @@ -0,0 +1,37 @@ +; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mattr=+sse,+sse2 < %s -o /dev/null +; pr33001 - Check that llc doesn't crash when running with O0 option. 
+ +target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define <4 x i32> @test_masked_load(<4 x i32>* %base, <4 x i1> %mask) { + %res = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %base, i32 4, <4 x i1> %mask, <4 x i32> zeroinitializer) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32, <4 x i1>, <4 x i32>) + + +define void @test_masked_store(<4 x i32>* %base, <4 x i32> %value, <4 x i1> %mask) { + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %base, i32 4, <4 x i1> %mask) + ret void +} + +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) + + +define <4 x i32> @llvm_masked_gather(<4 x i32*> %ptrs, <4 x i1> %mask) { + %res = call <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 4, <4 x i1> %mask, <4 x i32> undef) + ret <4 x i32> %res +} + +declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) + + +define void @llvm_masked_scatter(<4 x i32*> %ptrs, <4 x i32> %value, <4 x i1> %mask) { + call void @llvm.masked.scatter.v4i32(<4 x i32> %value, <4 x i32*> %ptrs, i32 4, <4 x i1> %mask) + ret void +} + +declare void @llvm.masked.scatter.v4i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) + diff --git a/test/CodeGen/X86/rotate.ll b/test/CodeGen/X86/rotate.ll index 5d5150ad62d6..4be3a4c2391b 100644 --- a/test/CodeGen/X86/rotate.ll +++ b/test/CodeGen/X86/rotate.ll @@ -33,8 +33,8 @@ define i64 @rotl64(i64 %A, i8 %Amt) nounwind { ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB0_4: -; 32-NEXT: orl %esi, %eax ; 32-NEXT: orl %ebx, %edx +; 32-NEXT: orl %esi, %eax ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -86,8 +86,8 @@ define i64 @rotr64(i64 %A, i8 %Amt) nounwind { ; 32-NEXT: movl %ebx, %esi ; 32-NEXT: xorl %ebx, %ebx ; 32-NEXT: .LBB1_4: -; 32-NEXT: orl %ebx, %eax ; 32-NEXT: orl %esi, %edx +; 32-NEXT: orl %ebx, %eax ; 32-NEXT: popl %esi ; 32-NEXT: popl %edi ; 32-NEXT: popl %ebx @@ -546,7 +546,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; 32-LABEL: rotr1_64_mem: ; 32: # BB#0: ; 32-NEXT: pushl %esi -; 32-NEXT: movl 8(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: movl (%eax), %ecx ; 32-NEXT: movl 4(%eax), %edx ; 32-NEXT: movl %edx, %esi @@ -555,11 +555,13 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { ; 32-NEXT: movl %ecx, 4(%eax) ; 32-NEXT: movl %esi, (%eax) ; 32-NEXT: popl %esi - +; 32-NEXT: retl +; ; 64-LABEL: rotr1_64_mem: ; 64: # BB#0: ; 64-NEXT: rorq (%rdi) ; 64-NEXT: retq + %A = load i64, i64 *%Aptr %B = shl i64 %A, 63 %C = lshr i64 %A, 1 @@ -571,7 +573,7 @@ define void @rotr1_64_mem(i64* %Aptr) nounwind { define void @rotr1_32_mem(i32* %Aptr) nounwind { ; 32-LABEL: rotr1_32_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorl (%eax) ; 32-NEXT: retl ; @@ -590,7 +592,7 @@ define void @rotr1_32_mem(i32* %Aptr) nounwind { define void @rotr1_16_mem(i16* %Aptr) nounwind { ; 32-LABEL: rotr1_16_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorw (%eax) ; 32-NEXT: retl ; @@ -609,7 +611,7 @@ define void @rotr1_16_mem(i16* %Aptr) nounwind { define void @rotr1_8_mem(i8* %Aptr) nounwind { ; 32-LABEL: rotr1_8_mem: ; 32: # BB#0: -; 32-NEXT: movl 4(%esp), %eax +; 32-NEXT: movl {{[0-9]+}}(%esp), %eax ; 32-NEXT: rorb (%eax) ; 32-NEXT: retl ; diff --git a/test/CodeGen/X86/sad.ll b/test/CodeGen/X86/sad.ll index b8a8b8afd14f..6a565a5c76f0 100644 
--- a/test/CodeGen/X86/sad.ll +++ b/test/CodeGen/X86/sad.ll @@ -149,127 +149,131 @@ middle.block: define i32 @sad_32i8() nounwind { ; SSE2-LABEL: sad_32i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: pxor %xmm11, %xmm11 -; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm12, %xmm12 -; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm6, %xmm6 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: pxor %xmm3, %xmm3 ; SSE2-NEXT: pxor %xmm14, %xmm14 +; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm0, %xmm0 ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB1_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa a+1040(%rax), %xmm6 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa a+1040(%rax), %xmm8 ; SSE2-NEXT: movdqa a+1024(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm8 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = xmm8[0],xmm11[0],xmm8[1],xmm11[1],xmm8[2],xmm11[2],xmm8[3],xmm11[3],xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm11[8],xmm3[9],xmm11[9],xmm3[10],xmm11[10],xmm3[11],xmm11[11],xmm3[12],xmm11[12],xmm3[13],xmm11[13],xmm3[14],xmm11[14],xmm3[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm11[4],xmm3[5],xmm11[5],xmm3[6],xmm11[6],xmm3[7],xmm11[7] -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm11[0],xmm1[1],xmm11[1],xmm1[2],xmm11[2],xmm1[3],xmm11[3],xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm11[0],xmm0[1],xmm11[1],xmm0[2],xmm11[2],xmm0[3],xmm11[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm11[4],xmm1[5],xmm11[5],xmm1[6],xmm11[6],xmm1[7],xmm11[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm11[8],xmm6[9],xmm11[9],xmm6[10],xmm11[10],xmm6[11],xmm11[11],xmm6[12],xmm11[12],xmm6[13],xmm11[13],xmm6[14],xmm11[14],xmm6[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm6, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm11[0],xmm7[1],xmm11[1],xmm7[2],xmm11[2],xmm7[3],xmm11[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm11[4],xmm6[5],xmm11[5],xmm6[6],xmm11[6],xmm6[7],xmm11[7] +; SSE2-NEXT: movdqa %xmm3, %xmm4 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm12[0],xmm4[1],xmm12[1],xmm4[2],xmm12[2],xmm4[3],xmm12[3],xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm4, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm12[0],xmm7[1],xmm12[1],xmm7[2],xmm12[2],xmm7[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm12[4],xmm4[5],xmm12[5],xmm4[6],xmm12[6],xmm4[7],xmm12[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm12[8],xmm3[9],xmm12[9],xmm3[10],xmm12[10],xmm3[11],xmm12[11],xmm3[12],xmm12[12],xmm3[13],xmm12[13],xmm3[14],xmm12[14],xmm3[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm3, %xmm1 
+; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm12[0],xmm1[1],xmm12[1],xmm1[2],xmm12[2],xmm1[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm12[4],xmm3[5],xmm12[5],xmm3[6],xmm12[6],xmm3[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm12[0],xmm0[1],xmm12[1],xmm0[2],xmm12[2],xmm0[3],xmm12[3],xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm12[0],xmm5[1],xmm12[1],xmm5[2],xmm12[2],xmm5[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm12[4],xmm0[5],xmm12[5],xmm0[6],xmm12[6],xmm0[7],xmm12[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm8 = xmm8[8],xmm12[8],xmm8[9],xmm12[9],xmm8[10],xmm12[10],xmm8[11],xmm12[11],xmm8[12],xmm12[12],xmm8[13],xmm12[13],xmm8[14],xmm12[14],xmm8[15],xmm12[15] +; SSE2-NEXT: movdqa b+1024(%rax), %xmm11 +; SSE2-NEXT: movdqa %xmm11, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm12[0],xmm10[1],xmm12[1],xmm10[2],xmm12[2],xmm10[3],xmm12[3],xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: movdqa %xmm10, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: psubd %xmm2, %xmm7 ; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm11[8],xmm9[9],xmm11[9],xmm9[10],xmm11[10],xmm9[11],xmm11[11],xmm9[12],xmm11[12],xmm9[13],xmm11[13],xmm9[14],xmm11[14],xmm9[15],xmm11[15] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm11[4],xmm9[5],xmm11[5],xmm9[6],xmm11[6],xmm9[7],xmm11[7] -; SSE2-NEXT: psubd %xmm9, %xmm6 -; SSE2-NEXT: movdqa b+1024(%rax), %xmm4 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE2-NEXT: psubd %xmm10, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm12[4],xmm10[5],xmm12[5],xmm10[6],xmm12[6],xmm10[7],xmm12[7] +; SSE2-NEXT: psubd %xmm10, %xmm4 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm12[8],xmm11[9],xmm12[9],xmm11[10],xmm12[10],xmm11[11],xmm12[11],xmm11[12],xmm12[12],xmm11[13],xmm12[13],xmm11[14],xmm12[14],xmm11[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm11, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] ; SSE2-NEXT: psubd %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm4, %xmm2 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm11[8],xmm4[9],xmm11[9],xmm4[10],xmm11[10],xmm4[11],xmm11[11],xmm4[12],xmm11[12],xmm4[13],xmm11[13],xmm4[14],xmm11[14],xmm4[15],xmm11[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm4, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm11[4],xmm4[5],xmm11[5],xmm4[6],xmm11[6],xmm4[7],xmm11[7] -; SSE2-NEXT: psubd %xmm4, %xmm3 -; SSE2-NEXT: movdqa %xmm8, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm11[4],xmm8[5],xmm11[5],xmm8[6],xmm11[6],xmm8[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm11[0],xmm5[1],xmm11[1],xmm5[2],xmm11[2],xmm5[3],xmm11[3] 
-; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm11[0],xmm2[1],xmm11[1],xmm2[2],xmm11[2],xmm2[3],xmm11[3],xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm11[0],xmm9[1],xmm11[1],xmm9[2],xmm11[2],xmm9[3],xmm11[3] -; SSE2-NEXT: psubd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm2, %xmm4 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm11[4],xmm2[5],xmm11[5],xmm2[6],xmm11[6],xmm2[7],xmm11[7] -; SSE2-NEXT: psubd %xmm2, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm11[0],xmm10[1],xmm11[1],xmm10[2],xmm11[2],xmm10[3],xmm11[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm11[0],xmm4[1],xmm11[1],xmm4[2],xmm11[2],xmm4[3],xmm11[3] -; SSE2-NEXT: psubd %xmm4, %xmm10 -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm10 -; SSE2-NEXT: pxor %xmm2, %xmm10 -; SSE2-NEXT: movdqa %xmm8, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm8 -; SSE2-NEXT: pxor %xmm2, %xmm8 -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm5 -; SSE2-NEXT: pxor %xmm2, %xmm5 -; SSE2-NEXT: movdqa %xmm3, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm3 -; SSE2-NEXT: pxor %xmm2, %xmm3 -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm0 -; SSE2-NEXT: pxor %xmm2, %xmm0 -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm1 -; SSE2-NEXT: movdqa %xmm7, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm7 -; SSE2-NEXT: pxor %xmm2, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm12[4],xmm11[5],xmm12[5],xmm11[6],xmm12[6],xmm11[7],xmm12[7] +; SSE2-NEXT: psubd %xmm11, %xmm3 +; SSE2-NEXT: movdqa %xmm6, %xmm10 +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3],xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] ; SSE2-NEXT: movdqa %xmm6, %xmm2 -; SSE2-NEXT: psrad $31, %xmm2 -; SSE2-NEXT: paddd %xmm2, %xmm6 -; SSE2-NEXT: pxor %xmm2, %xmm6 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm14 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: psubd %xmm2, %xmm5 +; SSE2-NEXT: movdqa %xmm8, %xmm2 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm12[0],xmm2[1],xmm12[1],xmm2[2],xmm12[2],xmm2[3],xmm12[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm12[4],xmm6[5],xmm12[5],xmm6[6],xmm12[6],xmm6[7],xmm12[7] +; SSE2-NEXT: psubd %xmm6, %xmm0 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm12[8],xmm9[9],xmm12[9],xmm9[10],xmm12[10],xmm9[11],xmm12[11],xmm9[12],xmm12[12],xmm9[13],xmm12[13],xmm9[14],xmm12[14],xmm9[15],xmm12[15] +; SSE2-NEXT: movdqa %xmm9, %xmm6 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm12[0],xmm6[1],xmm12[1],xmm6[2],xmm12[2],xmm6[3],xmm12[3] +; SSE2-NEXT: psubd %xmm6, %xmm2 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm12[4],xmm8[5],xmm12[5],xmm8[6],xmm12[6],xmm8[7],xmm12[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm12[4],xmm9[5],xmm12[5],xmm9[6],xmm12[6],xmm9[7],xmm12[7] +; SSE2-NEXT: psubd %xmm9, %xmm8 +; SSE2-NEXT: movdqa %xmm7, %xmm6 +; SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm7 +; SSE2-NEXT: pxor %xmm6, %xmm7 ; SSE2-NEXT: paddd %xmm7, %xmm13 -; SSE2-NEXT: paddd %xmm1, %xmm15 +; SSE2-NEXT: movdqa %xmm4, %xmm6 +; 
SSE2-NEXT: psrad $31, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm4 +; SSE2-NEXT: pxor %xmm6, %xmm4 +; SSE2-NEXT: movdqa %xmm10, %xmm6 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm1, %xmm4 +; SSE2-NEXT: psrad $31, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: pxor %xmm4, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: pxor %xmm1, %xmm3 ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload ; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm2 -; SSE2-NEXT: paddd %xmm8, %xmm3 -; SSE2-NEXT: paddd %xmm10, %xmm0 +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: paddd %xmm5, %xmm14 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm15 +; SSE2-NEXT: movdqa %xmm2, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm2 +; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: paddd %xmm2, %xmm1 +; SSE2-NEXT: movdqa %xmm8, %xmm0 +; SSE2-NEXT: psrad $31, %xmm0 +; SSE2-NEXT: paddd %xmm0, %xmm8 +; SSE2-NEXT: pxor %xmm0, %xmm8 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm8, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB1_1 ; SSE2-NEXT: # BB#2: # %middle.block -; SSE2-NEXT: paddd %xmm15, %xmm3 -; SSE2-NEXT: paddd %xmm14, %xmm1 -; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm13, %xmm2 -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: paddd %xmm2, %xmm1 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] -; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: paddd %xmm15, %xmm6 +; SSE2-NEXT: paddd %xmm0, %xmm3 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: paddd %xmm14, %xmm13 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm3, %xmm4 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm4[2,3,0,1] +; SSE2-NEXT: paddd %xmm4, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax @@ -398,288 +402,284 @@ middle.block: define i32 @sad_avx64i8() nounwind { ; SSE2-LABEL: sad_avx64i8: ; SSE2: # BB#0: # %entry -; SSE2-NEXT: subq $184, %rsp -; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: subq $200, %rsp +; SSE2-NEXT: pxor %xmm14, %xmm14 ; SSE2-NEXT: movq $-1024, %rax # imm = 0xFC00 -; SSE2-NEXT: pxor %xmm12, %xmm12 +; SSE2-NEXT: pxor %xmm15, %xmm15 +; SSE2-NEXT: pxor %xmm10, %xmm10 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: pxor %xmm13, %xmm13 +; SSE2-NEXT: pxor %xmm1, %xmm1 ; SSE2-NEXT: pxor %xmm8, %xmm8 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm14, %xmm14 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm6, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: pxor %xmm11, %xmm11 ; SSE2-NEXT: pxor %xmm0, 
%xmm0 ; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm11, %xmm11 +; SSE2-NEXT: pxor %xmm4, %xmm4 +; SSE2-NEXT: movdqa %xmm4, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: pxor %xmm7, %xmm7 -; SSE2-NEXT: pxor %xmm13, %xmm13 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pxor %xmm5, %xmm5 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: pxor %xmm7, %xmm7 +; SSE2-NEXT: movdqa %xmm7, -{{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: .p2align 4, 0x90 ; SSE2-NEXT: .LBB2_1: # %vector.body ; SSE2-NEXT: # =>This Inner Loop Header: Depth=1 -; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm13, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm7, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm3, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm8, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm11, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm5, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa %xmm0, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm14, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm6, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm8, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm12, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa a+1040(%rax), %xmm6 -; SSE2-NEXT: movdqa a+1024(%rax), %xmm4 -; SSE2-NEXT: movdqa a+1056(%rax), %xmm11 -; SSE2-NEXT: movdqa a+1072(%rax), %xmm3 -; SSE2-NEXT: movdqa %xmm3, %xmm5 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3],xmm3[4],xmm15[4],xmm3[5],xmm15[5],xmm3[6],xmm15[6],xmm3[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm11, %xmm1 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm15[8],xmm1[9],xmm15[9],xmm1[10],xmm15[10],xmm1[11],xmm15[11],xmm1[12],xmm15[12],xmm1[13],xmm15[13],xmm1[14],xmm15[14],xmm1[15],xmm15[15] -; SSE2-NEXT: punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3],xmm11[4],xmm15[4],xmm11[5],xmm15[5],xmm11[6],xmm15[6],xmm11[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm11, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm15[0],xmm11[1],xmm15[1],xmm11[2],xmm15[2],xmm11[3],xmm15[3] -; SSE2-NEXT: movdqa %xmm4, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm15[0],xmm12[1],xmm15[1],xmm12[2],xmm15[2],xmm12[3],xmm15[3],xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm15[4],xmm12[5],xmm15[5],xmm12[6],xmm15[6],xmm12[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = 
xmm4[8],xmm15[8],xmm4[9],xmm15[9],xmm4[10],xmm15[10],xmm4[11],xmm15[11],xmm4[12],xmm15[12],xmm4[13],xmm15[13],xmm4[14],xmm15[14],xmm4[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm4, %xmm0 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm15[4],xmm4[5],xmm15[5],xmm4[6],xmm15[6],xmm4[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm6, %xmm14 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm14 = xmm14[0],xmm15[0],xmm14[1],xmm15[1],xmm14[2],xmm15[2],xmm14[3],xmm15[3],xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm14, %xmm7 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm15[0],xmm7[1],xmm15[1],xmm7[2],xmm15[2],xmm7[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm15[4],xmm14[5],xmm15[5],xmm14[6],xmm15[6],xmm14[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm15[8],xmm6[9],xmm15[9],xmm6[10],xmm15[10],xmm6[11],xmm15[11],xmm6[12],xmm15[12],xmm6[13],xmm15[13],xmm6[14],xmm15[14],xmm6[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm6, %xmm8 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm15[0],xmm8[1],xmm15[1],xmm8[2],xmm15[2],xmm8[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm15[4],xmm6[5],xmm15[5],xmm6[6],xmm15[6],xmm6[7],xmm15[7] -; SSE2-NEXT: movdqa b+1040(%rax), %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm13 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm9 = xmm9[8],xmm15[8],xmm9[9],xmm15[9],xmm9[10],xmm15[10],xmm9[11],xmm15[11],xmm9[12],xmm15[12],xmm9[13],xmm15[13],xmm9[14],xmm15[14],xmm9[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm9, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm6 -; SSE2-NEXT: movdqa b+1024(%rax), %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm15[0],xmm13[1],xmm15[1],xmm13[2],xmm15[2],xmm13[3],xmm15[3],xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; SSE2-NEXT: psubd %xmm10, %xmm8 -; SSE2-NEXT: movdqa %xmm13, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: psubd %xmm13, %xmm14 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm15[8],xmm2[9],xmm15[9],xmm2[10],xmm15[10],xmm2[11],xmm15[11],xmm2[12],xmm15[12],xmm2[13],xmm15[13],xmm2[14],xmm15[14],xmm2[15],xmm15[15] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: psubd %xmm9, %xmm7 -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm4 -; SSE2-NEXT: movdqa b+1056(%rax), %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3],xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: psubd %xmm10, %xmm12 -; SSE2-NEXT: movdqa %xmm2, %xmm10 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = 
xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm15[0],xmm9[1],xmm15[1],xmm9[2],xmm15[2],xmm9[3],xmm15[3] -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE2-NEXT: psubd %xmm2, %xmm11 -; SSE2-NEXT: movdqa %xmm1, %xmm13 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm15[0],xmm1[1],xmm15[1],xmm1[2],xmm15[2],xmm1[3],xmm15[3] -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm15[4],xmm0[5],xmm15[5],xmm0[6],xmm15[6],xmm0[7],xmm15[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm10 = xmm10[8],xmm15[8],xmm10[9],xmm15[9],xmm10[10],xmm15[10],xmm10[11],xmm15[11],xmm10[12],xmm15[12],xmm10[13],xmm15[13],xmm10[14],xmm15[14],xmm10[15],xmm15[15] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm0 -; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm10, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm10 = xmm10[0],xmm15[0],xmm10[1],xmm15[1],xmm10[2],xmm15[2],xmm10[3],xmm15[3] -; SSE2-NEXT: psubd %xmm10, %xmm1 -; SSE2-NEXT: movdqa %xmm3, %xmm10 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm15[0],xmm3[1],xmm15[1],xmm3[2],xmm15[2],xmm3[3],xmm15[3] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm15[4],xmm13[5],xmm15[5],xmm13[6],xmm15[6],xmm13[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm13 -; SSE2-NEXT: movdqa b+1072(%rax), %xmm2 -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3],xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: movdqa %xmm2, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm15[0],xmm2[1],xmm15[1],xmm2[2],xmm15[2],xmm2[3],xmm15[3] -; SSE2-NEXT: psubd %xmm2, %xmm3 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm15[4],xmm10[5],xmm15[5],xmm10[6],xmm15[6],xmm10[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: psubd %xmm9, %xmm10 -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm15[8],xmm5[9],xmm15[9],xmm5[10],xmm15[10],xmm5[11],xmm15[11],xmm5[12],xmm15[12],xmm5[13],xmm15[13],xmm5[14],xmm15[14],xmm5[15],xmm15[15] -; SSE2-NEXT: movdqa %xmm5, %xmm9 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm15[0],xmm5[1],xmm15[1],xmm5[2],xmm15[2],xmm5[3],xmm15[3] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm15[8],xmm0[9],xmm15[9],xmm0[10],xmm15[10],xmm0[11],xmm15[11],xmm0[12],xmm15[12],xmm0[13],xmm15[13],xmm0[14],xmm15[14],xmm0[15],xmm15[15] +; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm10, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm1, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm15, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movaps a+1040(%rax), %xmm0 +; SSE2-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa a+1024(%rax), %xmm12 +; SSE2-NEXT: movdqa a+1056(%rax), %xmm15 +; SSE2-NEXT: movdqa a+1072(%rax), %xmm4 +; 
SSE2-NEXT: movdqa %xmm4, %xmm6 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm6 = xmm6[8],xmm14[8],xmm6[9],xmm14[9],xmm6[10],xmm14[10],xmm6[11],xmm14[11],xmm6[12],xmm14[12],xmm6[13],xmm14[13],xmm6[14],xmm14[14],xmm6[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm14[4],xmm1[5],xmm14[5],xmm1[6],xmm14[6],xmm1[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm14[0],xmm6[1],xmm14[1],xmm6[2],xmm14[2],xmm6[3],xmm14[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3],xmm4[4],xmm14[4],xmm4[5],xmm14[5],xmm4[6],xmm14[6],xmm4[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm4, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm14[4],xmm5[5],xmm14[5],xmm5[6],xmm14[6],xmm5[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm14[0],xmm4[1],xmm14[1],xmm4[2],xmm14[2],xmm4[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm15, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm11 = xmm11[8],xmm14[8],xmm11[9],xmm14[9],xmm11[10],xmm14[10],xmm11[11],xmm14[11],xmm11[12],xmm14[12],xmm11[13],xmm14[13],xmm11[14],xmm14[14],xmm11[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm11, %xmm8 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm14[4],xmm8[5],xmm14[5],xmm8[6],xmm14[6],xmm8[7],xmm14[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm14[0],xmm11[1],xmm14[1],xmm11[2],xmm14[2],xmm11[3],xmm14[3] +; SSE2-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm15, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] ; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm15[0],xmm0[1],xmm15[1],xmm0[2],xmm15[2],xmm0[3],xmm15[3] -; SSE2-NEXT: psubd %xmm0, %xmm5 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm15[4],xmm9[5],xmm15[5],xmm9[6],xmm15[6],xmm9[7],xmm15[7] -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm15[4],xmm2[5],xmm15[5],xmm2[6],xmm15[6],xmm2[7],xmm15[7] -; SSE2-NEXT: psubd %xmm2, %xmm9 -; SSE2-NEXT: movdqa %xmm9, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm9 -; SSE2-NEXT: pxor %xmm0, %xmm9 -; SSE2-NEXT: movdqa %xmm5, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm5 -; SSE2-NEXT: pxor %xmm0, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm12, %xmm10 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm10 = xmm10[0],xmm14[0],xmm10[1],xmm14[1],xmm10[2],xmm14[2],xmm10[3],xmm14[3],xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] ; SSE2-NEXT: movdqa %xmm10, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm10 -; SSE2-NEXT: pxor %xmm0, %xmm10 -; SSE2-NEXT: movdqa %xmm3, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm3 -; SSE2-NEXT: pxor %xmm0, %xmm3 -; SSE2-NEXT: movdqa %xmm13, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm13 -; SSE2-NEXT: pxor %xmm0, %xmm13 -; SSE2-NEXT: movdqa %xmm1, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm1 -; SSE2-NEXT: pxor %xmm0, %xmm1 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm11, %xmm0 -; 
SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm11 -; SSE2-NEXT: pxor %xmm0, %xmm11 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm14[4],xmm10[5],xmm14[5],xmm10[6],xmm14[6],xmm10[7],xmm14[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm14[8],xmm12[9],xmm14[9],xmm12[10],xmm14[10],xmm12[11],xmm14[11],xmm12[12],xmm14[12],xmm12[13],xmm14[13],xmm12[14],xmm14[14],xmm12[15],xmm14[15] ; SSE2-NEXT: movdqa %xmm12, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm12 -; SSE2-NEXT: pxor %xmm0, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: movdqa %xmm0, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm12 = xmm12[4],xmm14[4],xmm12[5],xmm14[5],xmm12[6],xmm14[6],xmm12[7],xmm14[7] +; SSE2-NEXT: movdqa b+1072(%rax), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: psubd %xmm0, %xmm1 +; SSE2-NEXT: movdqa b+1056(%rax), %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: psubd %xmm7, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm5 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm4 +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm14[8],xmm3[9],xmm14[9],xmm3[10],xmm14[10],xmm3[11],xmm14[11],xmm3[12],xmm14[12],xmm3[13],xmm14[13],xmm3[14],xmm14[14],xmm3[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm8 +; SSE2-NEXT: movdqa b+1024(%rax), %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm11 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: psubd %xmm3, %xmm2 +; SSE2-NEXT: movdqa %xmm2, (%rsp) # 16-byte Spill +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: psubd %xmm0, %xmm15 +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = 
xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3],xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm0, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm9 +; SSE2-NEXT: movdqa %xmm9, {{[0-9]+}}(%rsp) # 16-byte Spill ; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: movdqa %xmm2, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm2 -; SSE2-NEXT: pxor %xmm0, %xmm2 +; SSE2-NEXT: movdqa %xmm2, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm14[0],xmm9[1],xmm14[1],xmm9[2],xmm14[2],xmm9[3],xmm14[3],xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4],xmm14[4],xmm0[5],xmm14[5],xmm0[6],xmm14[6],xmm0[7],xmm14[7] +; SSE2-NEXT: psubd %xmm0, %xmm10 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm14[8],xmm7[9],xmm14[9],xmm7[10],xmm14[10],xmm7[11],xmm14[11],xmm7[12],xmm14[12],xmm7[13],xmm14[13],xmm7[14],xmm14[14],xmm7[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm7, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: psubd %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm13, {{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm9, %xmm0 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm14[0],xmm0[1],xmm14[1],xmm0[2],xmm14[2],xmm0[3],xmm14[3] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm7 = xmm7[4],xmm14[4],xmm7[5],xmm14[5],xmm7[6],xmm14[6],xmm7[7],xmm14[7] +; SSE2-NEXT: psubd %xmm7, %xmm12 +; SSE2-NEXT: movdqa b+1040(%rax), %xmm13 +; SSE2-NEXT: movdqa %xmm13, %xmm3 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3],xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: movdqa %xmm3, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: psubd %xmm7, %xmm0 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm14[4],xmm9[5],xmm14[5],xmm9[6],xmm14[6],xmm9[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm3 = xmm3[4],xmm14[4],xmm3[5],xmm14[5],xmm3[6],xmm14[6],xmm3[7],xmm14[7] +; SSE2-NEXT: psubd %xmm3, %xmm9 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm2 = xmm2[8],xmm14[8],xmm2[9],xmm14[9],xmm2[10],xmm14[10],xmm2[11],xmm14[11],xmm2[12],xmm14[12],xmm2[13],xmm14[13],xmm2[14],xmm14[14],xmm2[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm2, %xmm7 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm14[0],xmm7[1],xmm14[1],xmm7[2],xmm14[2],xmm7[3],xmm14[3] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm13 = xmm13[8],xmm14[8],xmm13[9],xmm14[9],xmm13[10],xmm14[10],xmm13[11],xmm14[11],xmm13[12],xmm14[12],xmm13[13],xmm14[13],xmm13[14],xmm14[14],xmm13[15],xmm14[15] +; SSE2-NEXT: movdqa %xmm13, %xmm3 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm14[0],xmm3[1],xmm14[1],xmm3[2],xmm14[2],xmm3[3],xmm14[3] +; SSE2-NEXT: psubd %xmm3, %xmm7 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm14[4],xmm2[5],xmm14[5],xmm2[6],xmm14[6],xmm2[7],xmm14[7] +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm14[4],xmm13[5],xmm14[5],xmm13[6],xmm14[6],xmm13[7],xmm14[7] +; SSE2-NEXT: psubd %xmm13, %xmm2 ; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm4, %xmm0 +; SSE2-NEXT: movdqa %xmm1, %xmm3 +; SSE2-NEXT: psrad $31, %xmm3 +; SSE2-NEXT: paddd %xmm3, %xmm1 +; SSE2-NEXT: pxor %xmm3, %xmm1 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; 
SSE2-NEXT: paddd %xmm1, %xmm3 +; SSE2-NEXT: movdqa %xmm3, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm6, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm5, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm5 +; SSE2-NEXT: pxor %xmm1, %xmm5 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm5, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm4, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm8, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm11, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm11 +; SSE2-NEXT: pxor %xmm1, %xmm11 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm11, %xmm1 +; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload +; SSE2-NEXT: movdqa (%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm11 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm15, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm15 +; SSE2-NEXT: pxor %xmm1, %xmm15 +; SSE2-NEXT: paddd %xmm15, %xmm2 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm4, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm4, %xmm6 +; SSE2-NEXT: movdqa %xmm6, %xmm15 +; SSE2-NEXT: movdqa %xmm10, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm10 +; SSE2-NEXT: pxor %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: movdqa %xmm1, %xmm10 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm6, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm6 +; SSE2-NEXT: pxor %xmm1, %xmm6 +; SSE2-NEXT: paddd %xmm6, %xmm3 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 +; SSE2-NEXT: paddd %xmm12, %xmm5 +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm0 +; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm13 +; SSE2-NEXT: movdqa %xmm9, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd 
%xmm0, %xmm4 -; SSE2-NEXT: pxor %xmm0, %xmm4 +; SSE2-NEXT: paddd %xmm0, %xmm9 +; SSE2-NEXT: pxor %xmm0, %xmm9 +; SSE2-NEXT: paddd %xmm9, %xmm1 ; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 ; SSE2-NEXT: paddd %xmm0, %xmm7 ; SSE2-NEXT: pxor %xmm0, %xmm7 -; SSE2-NEXT: movdqa %xmm14, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm14 -; SSE2-NEXT: pxor %xmm0, %xmm14 -; SSE2-NEXT: movdqa %xmm8, %xmm0 -; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: pxor %xmm0, %xmm8 -; SSE2-NEXT: movdqa %xmm6, %xmm0 +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm0 +; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp) # 16-byte Spill +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload +; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: movdqa %xmm7, %xmm0 ; SSE2-NEXT: psrad $31, %xmm0 -; SSE2-NEXT: paddd %xmm0, %xmm6 -; SSE2-NEXT: pxor %xmm0, %xmm6 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm6, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm6 # 16-byte Reload -; SSE2-NEXT: paddd %xmm8, %xmm6 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm8 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm14, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm2 -; SSE2-NEXT: movdqa %xmm2, {{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm14 # 16-byte Reload -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd %xmm4, %xmm2 -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm4 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm4 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm12, %xmm8 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm2 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm0, %xmm12 -; SSE2-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd %xmm11, %xmm0 -; SSE2-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa (%rsp), %xmm11 # 16-byte Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload +; SSE2-NEXT: paddd %xmm0, %xmm7 +; SSE2-NEXT: pxor %xmm0, %xmm7 ; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload -; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload -; SSE2-NEXT: paddd %xmm1, %xmm2 -; SSE2-NEXT: paddd %xmm13, %xmm7 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm3, %xmm1 -; SSE2-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Reload -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1 # 16-byte Reload -; SSE2-NEXT: paddd %xmm10, %xmm1 -; SSE2-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload -; SSE2-NEXT: paddd %xmm9, %xmm5 -; SSE2-NEXT: movdqa %xmm5, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Reload +; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: addq $4, %rax ; SSE2-NEXT: jne .LBB2_1 ; SSE2-NEXT: # BB#2: # %middle.block -; SSE2-NEXT: paddd %xmm2, %xmm4 -; SSE2-NEXT: paddd %xmm3, 
%xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm2 -; SSE2-NEXT: paddd %xmm11, %xmm2 -; SSE2-NEXT: paddd %xmm13, %xmm14 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Reload -; SSE2-NEXT: paddd %xmm7, %xmm3 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm7 # 16-byte Reload -; SSE2-NEXT: paddd %xmm5, %xmm7 -; SSE2-NEXT: paddd %xmm0, %xmm8 -; SSE2-NEXT: movdqa -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm3 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm8 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm3, %xmm8 +; SSE2-NEXT: paddd %xmm2, %xmm15 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm13 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm8, %xmm13 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm5 # 16-byte Folded Reload +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm0 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm5, %xmm0 +; SSE2-NEXT: paddd %xmm11, %xmm10 +; SSE2-NEXT: paddd -{{[0-9]+}}(%rsp), %xmm1 # 16-byte Folded Reload +; SSE2-NEXT: paddd %xmm0, %xmm1 +; SSE2-NEXT: paddd %xmm10, %xmm1 +; SSE2-NEXT: paddd %xmm13, %xmm1 +; SSE2-NEXT: paddd %xmm15, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[2,3,0,1] ; SSE2-NEXT: paddd %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm7 -; SSE2-NEXT: paddd %xmm4, %xmm6 -; SSE2-NEXT: paddd %xmm14, %xmm6 -; SSE2-NEXT: paddd %xmm0, %xmm7 -; SSE2-NEXT: paddd %xmm8, %xmm7 -; SSE2-NEXT: paddd %xmm6, %xmm7 -; SSE2-NEXT: paddd %xmm2, %xmm7 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm7[2,3,0,1] -; SSE2-NEXT: paddd %xmm7, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: movd %xmm1, %eax -; SSE2-NEXT: addq $184, %rsp +; SSE2-NEXT: addq $200, %rsp ; SSE2-NEXT: retq ; ; AVX2-LABEL: sad_avx64i8: @@ -688,8 +688,8 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: movq $-1024, %rax # imm = 0xFC00 ; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 ; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3 ; AVX2-NEXT: vpxor %ymm6, %ymm6, %ymm6 ; AVX2-NEXT: vpxor %ymm5, %ymm5, %ymm5 ; AVX2-NEXT: vpxor %ymm7, %ymm7, %ymm7 @@ -697,7 +697,6 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: .LBB2_1: # %vector.body ; AVX2-NEXT: # =>This Inner Loop Header: Depth=1 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm11 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero @@ -705,48 +704,49 @@ define i32 @sad_avx64i8() nounwind { ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm13 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm14 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm8, %ymm15, %ymm8 +; AVX2-NEXT: vmovdqu %ymm15, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 +; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm8 +; AVX2-NEXT: vmovdqu %ymm8, -{{[0-9]+}}(%rsp) # 32-byte Spill ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 +; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 +; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero ; AVX2-NEXT: vpsubd %ymm15, %ymm11, %ymm11 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm10, %ymm10 +; AVX2-NEXT: vpsubd %ymm15, %ymm12, %ymm12 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm9 -; AVX2-NEXT: vmovdqu %ymm9, -{{[0-9]+}}(%rsp) # 32-byte Spill +; AVX2-NEXT: vpsubd %ymm15, %ymm13, %ymm13 +; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero +; AVX2-NEXT: vpsubd %ymm15, %ymm14, %ymm14 ; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm15 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero -; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm9 # 32-byte Reload -; AVX2-NEXT: vpsubd %ymm15, %ymm9, %ymm15 -; AVX2-NEXT: vpabsd %ymm8, %ymm8 +; AVX2-NEXT: vmovdqu -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Reload +; AVX2-NEXT: vpsubd %ymm15, %ymm8, %ymm15 +; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload +; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpabsd %ymm9, %ymm8 +; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpabsd %ymm10, 
%ymm8 +; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 +; AVX2-NEXT: vpabsd %ymm11, %ymm8 ; AVX2-NEXT: vpaddd %ymm3, %ymm8, %ymm3 -; AVX2-NEXT: vpabsd %ymm14, %ymm8 -; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 -; AVX2-NEXT: vpabsd %ymm13, %ymm8 -; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 ; AVX2-NEXT: vpabsd %ymm12, %ymm8 ; AVX2-NEXT: vpaddd %ymm0, %ymm8, %ymm0 -; AVX2-NEXT: vpabsd %ymm11, %ymm8 -; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 -; AVX2-NEXT: vpabsd %ymm10, %ymm8 -; AVX2-NEXT: vpaddd %ymm6, %ymm8, %ymm6 -; AVX2-NEXT: vpabsd -{{[0-9]+}}(%rsp), %ymm8 # 32-byte Folded Reload -; AVX2-NEXT: vpaddd %ymm5, %ymm8, %ymm5 +; AVX2-NEXT: vpabsd %ymm13, %ymm8 +; AVX2-NEXT: vpaddd %ymm2, %ymm8, %ymm2 +; AVX2-NEXT: vpabsd %ymm14, %ymm8 +; AVX2-NEXT: vpaddd %ymm1, %ymm8, %ymm1 ; AVX2-NEXT: vpabsd %ymm15, %ymm8 -; AVX2-NEXT: vpaddd %ymm7, %ymm8, %ymm7 +; AVX2-NEXT: vpaddd %ymm4, %ymm8, %ymm4 ; AVX2-NEXT: addq $4, %rax ; AVX2-NEXT: jne .LBB2_1 ; AVX2-NEXT: # BB#2: # %middle.block ; AVX2-NEXT: vpaddd %ymm6, %ymm2, %ymm2 -; AVX2-NEXT: vpaddd %ymm7, %ymm3, %ymm3 -; AVX2-NEXT: vpaddd %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm7, %ymm4, %ymm4 +; AVX2-NEXT: vpaddd %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vpaddd %ymm5, %ymm1, %ymm1 -; AVX2-NEXT: vpaddd %ymm3, %ymm2, %ymm2 ; AVX2-NEXT: vpaddd %ymm2, %ymm1, %ymm1 ; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 @@ -773,21 +773,21 @@ define i32 @sad_avx64i8() nounwind { ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm6 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm7 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero ; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm9 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm10 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm11 = 
mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero -; AVX512F-NEXT: vpsubd %zmm11, %zmm7, %zmm7 -; AVX512F-NEXT: vpsubd %zmm10, %zmm6, %zmm6 -; AVX512F-NEXT: vpsubd %zmm9, %zmm5, %zmm5 ; AVX512F-NEXT: vpsubd %zmm8, %zmm4, %zmm4 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm5, %zmm5 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm6, %zmm6 +; AVX512F-NEXT: vpmovzxbd {{.*#+}} zmm8 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero,mem[8],zero,zero,zero,mem[9],zero,zero,zero,mem[10],zero,zero,zero,mem[11],zero,zero,zero,mem[12],zero,zero,zero,mem[13],zero,zero,zero,mem[14],zero,zero,zero,mem[15],zero,zero,zero +; AVX512F-NEXT: vpsubd %zmm8, %zmm7, %zmm7 ; AVX512F-NEXT: vpabsd %zmm4, %zmm4 -; AVX512F-NEXT: vpabsd %zmm5, %zmm5 -; AVX512F-NEXT: vpabsd %zmm6, %zmm6 -; AVX512F-NEXT: vpabsd %zmm7, %zmm7 -; AVX512F-NEXT: vpaddd %zmm3, %zmm7, %zmm3 -; AVX512F-NEXT: vpaddd %zmm2, %zmm6, %zmm2 -; AVX512F-NEXT: vpaddd %zmm1, %zmm5, %zmm1 ; AVX512F-NEXT: vpaddd %zmm0, %zmm4, %zmm0 +; AVX512F-NEXT: vpabsd %zmm5, %zmm4 +; AVX512F-NEXT: vpaddd %zmm1, %zmm4, %zmm1 +; AVX512F-NEXT: vpabsd %zmm6, %zmm4 +; AVX512F-NEXT: vpaddd %zmm2, %zmm4, %zmm2 +; AVX512F-NEXT: vpabsd %zmm7, %zmm4 +; AVX512F-NEXT: vpaddd %zmm3, %zmm4, %zmm3 ; AVX512F-NEXT: addq $4, %rax ; AVX512F-NEXT: jne .LBB2_1 ; AVX512F-NEXT: # BB#2: # %middle.block @@ -1154,59 +1154,54 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-LABEL: sad_nonloop_32i8: ; SSE2: # BB#0: ; SSE2-NEXT: movdqu (%rdi), %xmm0 -; SSE2-NEXT: movdqu 16(%rdi), %xmm3 -; SSE2-NEXT: pxor %xmm4, %xmm4 -; SSE2-NEXT: movdqa %xmm3, %xmm12 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3],xmm12[4],xmm4[4],xmm12[5],xmm4[5],xmm12[6],xmm4[6],xmm12[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm12, %xmm9 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm9 = xmm9[4],xmm4[4],xmm9[5],xmm4[5],xmm9[6],xmm4[6],xmm9[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm0, %xmm13 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3],xmm13[4],xmm4[4],xmm13[5],xmm4[5],xmm13[6],xmm4[6],xmm13[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm13, %xmm10 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm4[4],xmm10[5],xmm4[5],xmm10[6],xmm4[6],xmm10[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = 
xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm3, %xmm11 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm4[4],xmm11[5],xmm4[5],xmm11[6],xmm4[6],xmm11[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm0, %xmm6 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm6 = xmm6[4],xmm4[4],xmm6[5],xmm4[5],xmm6[6],xmm4[6],xmm6[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm4[0],xmm12[1],xmm4[1],xmm12[2],xmm4[2],xmm12[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm13 = xmm13[0],xmm4[0],xmm13[1],xmm4[1],xmm13[2],xmm4[2],xmm13[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3] -; SSE2-NEXT: movdqu (%rdx), %xmm5 -; SSE2-NEXT: movdqu 16(%rdx), %xmm7 -; SSE2-NEXT: movdqa %xmm7, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm1, %xmm2 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm2 = xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, -{{[0-9]+}}(%rsp) # 16-byte Spill -; SSE2-NEXT: movdqa %xmm5, %xmm2 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7] -; SSE2-NEXT: movdqa %xmm2, %xmm14 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm14 = xmm14[4],xmm4[4],xmm14[5],xmm4[5],xmm14[6],xmm4[6],xmm14[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm4[8],xmm7[9],xmm4[9],xmm7[10],xmm4[10],xmm7[11],xmm4[11],xmm7[12],xmm4[12],xmm7[13],xmm4[13],xmm7[14],xmm4[14],xmm7[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm7, %xmm15 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm15 = xmm15[4],xmm4[4],xmm15[5],xmm4[5],xmm15[6],xmm4[6],xmm15[7],xmm4[7] -; SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm4[8],xmm5[9],xmm4[9],xmm5[10],xmm4[10],xmm5[11],xmm4[11],xmm5[12],xmm4[12],xmm5[13],xmm4[13],xmm5[14],xmm4[14],xmm5[15],xmm4[15] -; SSE2-NEXT: movdqa %xmm5, %xmm8 -; SSE2-NEXT: punpckhwd {{.*#+}} xmm8 = xmm8[4],xmm4[4],xmm8[5],xmm4[5],xmm8[6],xmm4[6],xmm8[7],xmm4[7] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm4[0],xmm7[1],xmm4[1],xmm7[2],xmm4[2],xmm7[3],xmm4[3] -; SSE2-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3] -; SSE2-NEXT: psubd %xmm5, %xmm0 -; SSE2-NEXT: psubd %xmm7, %xmm3 -; SSE2-NEXT: psubd %xmm2, %xmm13 -; SSE2-NEXT: psubd %xmm1, %xmm12 -; SSE2-NEXT: psubd %xmm8, %xmm6 -; SSE2-NEXT: psubd %xmm15, %xmm11 -; SSE2-NEXT: psubd %xmm14, %xmm10 -; SSE2-NEXT: psubd -{{[0-9]+}}(%rsp), %xmm9 # 16-byte Folded Reload -; SSE2-NEXT: movdqa %xmm9, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm9 -; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqu 16(%rdi), %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: movdqa %xmm12, %xmm8 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm8 = 
xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3],xmm8[4],xmm1[4],xmm8[5],xmm1[5],xmm8[6],xmm1[6],xmm8[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm8, %xmm10 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm10 = xmm10[4],xmm1[4],xmm10[5],xmm1[5],xmm10[6],xmm1[6],xmm10[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm0, %xmm9 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3],xmm9[4],xmm1[4],xmm9[5],xmm1[5],xmm9[6],xmm1[6],xmm9[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm9, %xmm11 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm11 = xmm11[4],xmm1[4],xmm11[5],xmm1[5],xmm11[6],xmm1[6],xmm11[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm12 = xmm12[8],xmm1[8],xmm12[9],xmm1[9],xmm12[10],xmm1[10],xmm12[11],xmm1[11],xmm12[12],xmm1[12],xmm12[13],xmm1[13],xmm12[14],xmm1[14],xmm12[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm12, %xmm13 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm13 = xmm13[4],xmm1[4],xmm13[5],xmm1[5],xmm13[6],xmm1[6],xmm13[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm0, %xmm4 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm4 = xmm4[4],xmm1[4],xmm4[5],xmm1[5],xmm4[6],xmm1[6],xmm4[7],xmm1[7] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm8 = xmm8[0],xmm1[0],xmm8[1],xmm1[1],xmm8[2],xmm1[2],xmm8[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm9 = xmm9[0],xmm1[0],xmm9[1],xmm1[1],xmm9[2],xmm1[2],xmm9[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm12 = xmm12[0],xmm1[0],xmm12[1],xmm1[1],xmm12[2],xmm1[2],xmm12[3],xmm1[3] +; SSE2-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; SSE2-NEXT: movdqu (%rdx), %xmm7 +; SSE2-NEXT: movdqu 16(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, %xmm6 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3],xmm6[4],xmm1[4],xmm6[5],xmm1[5],xmm6[6],xmm1[6],xmm6[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm6, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm10 +; SSE2-NEXT: movdqa %xmm7, %xmm2 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; SSE2-NEXT: movdqa %xmm2, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm11 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm3 = xmm3[8],xmm1[8],xmm3[9],xmm1[9],xmm3[10],xmm1[10],xmm3[11],xmm1[11],xmm3[12],xmm1[12],xmm3[13],xmm1[13],xmm3[14],xmm1[14],xmm3[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm3, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm13 +; SSE2-NEXT: punpckhbw {{.*#+}} xmm7 = xmm7[8],xmm1[8],xmm7[9],xmm1[9],xmm7[10],xmm1[10],xmm7[11],xmm1[11],xmm7[12],xmm1[12],xmm7[13],xmm1[13],xmm7[14],xmm1[14],xmm7[15],xmm1[15] +; SSE2-NEXT: movdqa %xmm7, %xmm5 +; SSE2-NEXT: punpckhwd {{.*#+}} xmm5 = xmm5[4],xmm1[4],xmm5[5],xmm1[5],xmm5[6],xmm1[6],xmm5[7],xmm1[7] +; SSE2-NEXT: psubd %xmm5, %xmm4 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1],xmm6[2],xmm1[2],xmm6[3],xmm1[3] +; SSE2-NEXT: psubd %xmm6, %xmm8 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; SSE2-NEXT: psubd %xmm2, %xmm9 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm3 = 
xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; SSE2-NEXT: psubd %xmm3, %xmm12 +; SSE2-NEXT: punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm1[0],xmm7[1],xmm1[1],xmm7[2],xmm1[2],xmm7[3],xmm1[3] +; SSE2-NEXT: psubd %xmm7, %xmm0 ; SSE2-NEXT: movdqa %xmm10, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm10 @@ -1215,33 +1210,37 @@ define i32 @sad_nonloop_32i8(<32 x i8>* nocapture readonly %p, i64, <32 x i8>* n ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm11 ; SSE2-NEXT: pxor %xmm1, %xmm11 -; SSE2-NEXT: movdqa %xmm6, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm6 -; SSE2-NEXT: pxor %xmm1, %xmm6 -; SSE2-NEXT: movdqa %xmm12, %xmm1 -; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm12 -; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm13, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm13 ; SSE2-NEXT: pxor %xmm1, %xmm13 -; SSE2-NEXT: movdqa %xmm3, %xmm1 +; SSE2-NEXT: movdqa %xmm4, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 -; SSE2-NEXT: paddd %xmm1, %xmm3 -; SSE2-NEXT: pxor %xmm1, %xmm3 +; SSE2-NEXT: paddd %xmm1, %xmm4 +; SSE2-NEXT: pxor %xmm1, %xmm4 +; SSE2-NEXT: paddd %xmm13, %xmm4 +; SSE2-NEXT: paddd %xmm10, %xmm4 +; SSE2-NEXT: paddd %xmm11, %xmm4 +; SSE2-NEXT: movdqa %xmm8, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm8 +; SSE2-NEXT: pxor %xmm1, %xmm8 +; SSE2-NEXT: movdqa %xmm9, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm9 +; SSE2-NEXT: pxor %xmm1, %xmm9 +; SSE2-NEXT: movdqa %xmm12, %xmm1 +; SSE2-NEXT: psrad $31, %xmm1 +; SSE2-NEXT: paddd %xmm1, %xmm12 +; SSE2-NEXT: pxor %xmm1, %xmm12 ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: paddd %xmm1, %xmm0 ; SSE2-NEXT: pxor %xmm1, %xmm0 -; SSE2-NEXT: paddd %xmm3, %xmm0 -; SSE2-NEXT: paddd %xmm11, %xmm6 -; SSE2-NEXT: paddd %xmm9, %xmm6 -; SSE2-NEXT: paddd %xmm10, %xmm6 ; SSE2-NEXT: paddd %xmm12, %xmm0 -; SSE2-NEXT: paddd %xmm6, %xmm0 -; SSE2-NEXT: paddd %xmm13, %xmm0 +; SSE2-NEXT: paddd %xmm8, %xmm0 +; SSE2-NEXT: paddd %xmm4, %xmm0 +; SSE2-NEXT: paddd %xmm9, %xmm0 ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,0,1] ; SSE2-NEXT: paddd %xmm0, %xmm1 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,2,3] diff --git a/test/CodeGen/X86/select.ll b/test/CodeGen/X86/select.ll index ce42d0d643e8..1afef86a5f11 100644 --- a/test/CodeGen/X86/select.ll +++ b/test/CodeGen/X86/select.ll @@ -299,20 +299,21 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; GENERIC-NEXT: testb %dil, %dil ; GENERIC-NEXT: jne LBB7_4 ; GENERIC-NEXT: ## BB#5: +; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] ; GENERIC-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; GENERIC-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; GENERIC-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero ; GENERIC-NEXT: jmp LBB7_6 ; GENERIC-NEXT: LBB7_4: -; GENERIC-NEXT: movd %r9d, %xmm2 -; GENERIC-NEXT: movd %ecx, %xmm3 -; GENERIC-NEXT: movd %r8d, %xmm4 +; GENERIC-NEXT: movd %r9d, %xmm1 +; GENERIC-NEXT: movd %ecx, %xmm2 +; GENERIC-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; GENERIC-NEXT: movd %r8d, %xmm3 ; GENERIC-NEXT: movd %edx, %xmm1 ; GENERIC-NEXT: LBB7_6: -; GENERIC-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = 
xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; GENERIC-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm1 ; GENERIC-NEXT: psubd {{.*}}(%rip), %xmm0 ; GENERIC-NEXT: movq %xmm0, 16(%rsi) @@ -339,16 +340,19 @@ define void @test8(i1 %c, <6 x i32>* %dst.addr, <6 x i32> %src1,<6 x i32> %src2) ; ATOM-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm4 = mem[0],zero,zero,zero ; ATOM-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; ATOM-NEXT: jmp LBB7_6 ; ATOM-NEXT: LBB7_4: -; ATOM-NEXT: movd %r9d, %xmm2 -; ATOM-NEXT: movd %ecx, %xmm3 -; ATOM-NEXT: movd %r8d, %xmm4 +; ATOM-NEXT: movd %r9d, %xmm1 +; ATOM-NEXT: movd %ecx, %xmm2 +; ATOM-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; ATOM-NEXT: movd %r8d, %xmm3 ; ATOM-NEXT: movd %edx, %xmm1 -; ATOM-NEXT: LBB7_6: -; ATOM-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1] -; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm4[0],xmm1[1],xmm4[1] ; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] +; ATOM-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; ATOM-NEXT: LBB7_6: ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm0 ; ATOM-NEXT: psubd {{.*}}(%rip), %xmm1 ; ATOM-NEXT: movq %xmm0, 16(%rsi) diff --git a/test/CodeGen/X86/setcc-wide-types.ll b/test/CodeGen/X86/setcc-wide-types.ll index 2996edaec3e0..332bf2887fb0 100644 --- a/test/CodeGen/X86/setcc-wide-types.ll +++ b/test/CodeGen/X86/setcc-wide-types.ll @@ -58,25 +58,25 @@ define i32 @ne_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: ne_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r9 -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %rcx +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: movq %xmm3, %rdx -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rcx, %rsi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorq %rdx, %rax +; SSE2-NEXT: movq %xmm3, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdi +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: setne %al ; SSE2-NEXT: retq ; @@ -100,25 +100,25 @@ define i32 @eq_i256(<4 x i64> %x, <4 x i64> %y) { ; SSE2-LABEL: eq_i256: ; SSE2: # BB#0: ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r8 +; SSE2-NEXT: movq %xmm4, %rax ; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,0,1] -; SSE2-NEXT: movq %xmm4, %r9 -; SSE2-NEXT: movq %xmm0, %r10 -; SSE2-NEXT: movq %xmm1, %rsi +; SSE2-NEXT: movq %xmm4, %rcx +; SSE2-NEXT: movq %xmm0, %rdx +; SSE2-NEXT: movq %xmm1, %r8 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1] ; SSE2-NEXT: movq %xmm0, %rdi +; SSE2-NEXT: xorq %rax, %rdi ; SSE2-NEXT: pshufd {{.*#+}} 
xmm0 = xmm3[2,3,0,1] -; SSE2-NEXT: movq %xmm0, %rax -; SSE2-NEXT: movq %xmm2, %rcx -; SSE2-NEXT: movq %xmm3, %rdx -; SSE2-NEXT: xorq %rsi, %rdx -; SSE2-NEXT: xorq %r10, %rcx -; SSE2-NEXT: orq %rdx, %rcx -; SSE2-NEXT: xorq %r9, %rax -; SSE2-NEXT: xorq %r8, %rdi -; SSE2-NEXT: orq %rax, %rdi +; SSE2-NEXT: movq %xmm0, %rsi +; SSE2-NEXT: xorq %rcx, %rsi +; SSE2-NEXT: orq %rdi, %rsi +; SSE2-NEXT: movq %xmm2, %rax +; SSE2-NEXT: xorq %rdx, %rax +; SSE2-NEXT: movq %xmm3, %rcx +; SSE2-NEXT: xorq %r8, %rcx +; SSE2-NEXT: orq %rax, %rcx ; SSE2-NEXT: xorl %eax, %eax -; SSE2-NEXT: orq %rcx, %rdi +; SSE2-NEXT: orq %rsi, %rcx ; SSE2-NEXT: sete %al ; SSE2-NEXT: retq ; diff --git a/test/CodeGen/X86/shrink_vmul_sse.ll b/test/CodeGen/X86/shrink_vmul_sse.ll index c869dff9e642..6701c247e6fc 100644 --- a/test/CodeGen/X86/shrink_vmul_sse.ll +++ b/test/CodeGen/X86/shrink_vmul_sse.ll @@ -20,9 +20,9 @@ define void @mul_2xi8(i8* nocapture readonly %a, i8* nocapture readonly %b, i64 ; CHECK-NEXT: movzbl 1(%edx,%ecx), %edi ; CHECK-NEXT: movzbl (%edx,%ecx), %edx ; CHECK-NEXT: movzbl 1(%eax,%ecx), %ebx +; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movzbl (%eax,%ecx), %eax ; CHECK-NEXT: imull %edx, %eax -; CHECK-NEXT: imull %edi, %ebx ; CHECK-NEXT: movl %ebx, 4(%esi,%ecx,4) ; CHECK-NEXT: movl %eax, (%esi,%ecx,4) ; CHECK-NEXT: popl %esi diff --git a/test/CodeGen/X86/shuffle-of-splat-multiuses.ll b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll index d46082f20a45..cbd5c69b1772 100644 --- a/test/CodeGen/X86/shuffle-of-splat-multiuses.ll +++ b/test/CodeGen/X86/shuffle-of-splat-multiuses.ll @@ -5,9 +5,8 @@ define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind { ; AVX2-LABEL: foo2: ; AVX2: # BB#0: -; AVX2-NEXT: vpermilpd {{.*#+}} xmm1 = xmm0[1,1] -; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm1[1,0] -; AVX2-NEXT: vmovapd %xmm1, (%rdi) +; AVX2-NEXT: vpermilpd {{.*#+}} xmm0 = xmm0[1,1] +; AVX2-NEXT: vmovapd %xmm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <2 x double> %v, <2 x double> undef, <2 x i32> %res1 = shufflevector<2 x double> %res, <2 x double> undef, <2 x i32> @@ -18,9 +17,8 @@ define <2 x double> @foo2(<2 x double> %v, <2 x double> *%p) nounwind { define <4 x double> @foo4(<4 x double> %v, <4 x double> *%p) nounwind { ; AVX2-LABEL: foo4: ; AVX2: # BB#0: -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm1[2,0,2,3] -; AVX2-NEXT: vmovapd %ymm1, (%rdi) +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovapd %ymm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <4 x double> %v, <4 x double> undef, <4 x i32> %res1 = shufflevector<4 x double> %res, <4 x double> undef, <4 x i32> @@ -32,10 +30,8 @@ define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind { ; AVX2-LABEL: foo8: ; AVX2: # BB#0: ; AVX2-NEXT: vmovshdup {{.*#+}} ymm0 = ymm0[1,1,3,3,5,5,7,7] -; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm0[2,2,2,2] -; AVX2-NEXT: vmovaps {{.*#+}} ymm0 = <2,0,u,u,5,1,3,7> -; AVX2-NEXT: vpermps %ymm1, %ymm0, %ymm0 -; AVX2-NEXT: vmovapd %ymm1, (%rdi) +; AVX2-NEXT: vpermpd {{.*#+}} ymm0 = ymm0[2,2,2,2] +; AVX2-NEXT: vmovapd %ymm0, (%rdi) ; AVX2-NEXT: retq %res = shufflevector <8 x float> %v, <8 x float> undef, <8 x i32> %res1 = shufflevector<8 x float> %res, <8 x float> undef, <8 x i32> @@ -46,7 +42,7 @@ define <8 x float> @foo8(<8 x float> %v, <8 x float> *%p) nounwind { define <4 x i32> @undef_splatmask(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 
= xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -66,7 +62,7 @@ define <4 x i32> @undef_splatmask2(<4 x i32> %v) nounwind { define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind { ; AVX2-LABEL: undef_splatmask3: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -76,9 +72,10 @@ define <4 x i32> @undef_splatmask3(<4 x i32> %v) nounwind { define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind { ; AVX2-LABEL: undef_splatmask4: ; AVX2: # BB#0: -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,2,3,3] +; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> @@ -89,9 +86,10 @@ define <4 x i32> @undef_splatmask4(<4 x i32> %v, <4 x i32>* %p) nounwind { define <4 x i32> @undef_splatmask5(<4 x i32> %v, <4 x i32>* %p) nounwind { ; AVX2-LABEL: undef_splatmask5: ; AVX2: # BB#0: -; AVX2-NEXT: vpbroadcastq %xmm0, %xmm1 -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm1[0,2,2,3] -; AVX2-NEXT: vmovdqa %xmm1, (%rdi) +; AVX2-NEXT: vpbroadcastd %xmm0, %xmm1 +; AVX2-NEXT: vpbroadcastq %xmm0, %xmm0 +; AVX2-NEXT: vmovdqa %xmm0, (%rdi) +; AVX2-NEXT: vmovdqa %xmm1, %xmm0 ; AVX2-NEXT: retq %res = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> %res1 = shufflevector <4 x i32> %res, <4 x i32> undef, <4 x i32> diff --git a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll index 0b03dffe99b5..d99cfaf535de 100644 --- a/test/CodeGen/X86/sse-intrinsics-fast-isel.ll +++ b/test/CodeGen/X86/sse-intrinsics-fast-isel.ll @@ -1537,9 +1537,9 @@ define <4 x float> @test_mm_set_ps(float %a0, float %a1, float %a2, float %a3) n ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-NEXT: retl ; @@ -1673,13 +1673,13 @@ define void @test_mm_setcsr(i32 %a0) nounwind { define <4 x float> @test_mm_setr_ps(float %a0, float %a1, float %a2, float %a3) nounwind { ; X32-LABEL: test_mm_setr_ps: ; X32: # BB#0: +; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero -; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; X32-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] ; X32-NEXT: unpcklps 
{{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] -; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1] ; X32-NEXT: retl ; ; X64-LABEL: test_mm_setr_ps: diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll index dfc1aefd31a6..68ab3f9f3205 100644 --- a/test/CodeGen/X86/sse1.ll +++ b/test/CodeGen/X86/sse1.ll @@ -66,7 +66,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X32-NEXT: jne .LBB1_8 ; X32-NEXT: .LBB1_7: ; X32-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X32-NEXT: jmp .LBB1_9 +; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) +; X32-NEXT: je .LBB1_10 +; X32-NEXT: jmp .LBB1_11 ; X32-NEXT: .LBB1_1: ; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) @@ -77,11 +80,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X32-NEXT: je .LBB1_7 ; X32-NEXT: .LBB1_8: # %entry ; X32-NEXT: xorps %xmm3, %xmm3 -; X32-NEXT: .LBB1_9: # %entry -; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X32-NEXT: cmpl $0, {{[0-9]+}}(%esp) ; X32-NEXT: jne .LBB1_11 -; X32-NEXT: # BB#10: +; X32-NEXT: .LBB1_10: ; X32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X32-NEXT: .LBB1_11: # %entry ; X32-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] @@ -103,7 +105,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X64-NEXT: jne .LBB1_8 ; X64-NEXT: .LBB1_7: ; X64-NEXT: movss {{.*#+}} xmm3 = mem[0],zero,zero,zero -; X64-NEXT: jmp .LBB1_9 +; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: testl %esi, %esi +; X64-NEXT: je .LBB1_10 +; X64-NEXT: jmp .LBB1_11 ; X64-NEXT: .LBB1_1: ; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero ; X64-NEXT: testl %edx, %edx @@ -114,11 +119,10 @@ define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) { ; X64-NEXT: je .LBB1_7 ; X64-NEXT: .LBB1_8: # %entry ; X64-NEXT: xorps %xmm3, %xmm3 -; X64-NEXT: .LBB1_9: # %entry -; X64-NEXT: testl %esi, %esi ; X64-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; X64-NEXT: testl %esi, %esi ; X64-NEXT: jne .LBB1_11 -; X64-NEXT: # BB#10: +; X64-NEXT: .LBB1_10: ; X64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: .LBB1_11: # %entry ; X64-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1] diff --git a/test/CodeGen/X86/sse3-avx-addsub-2.ll b/test/CodeGen/X86/sse3-avx-addsub-2.ll index 4d895ea264c5..aed5e0d1c32e 100644 --- a/test/CodeGen/X86/sse3-avx-addsub-2.ll +++ b/test/CodeGen/X86/sse3-avx-addsub-2.ll @@ -412,14 +412,14 @@ define <4 x float> @test16(<4 x float> %A, <4 x float> %B) { ; SSE-NEXT: movaps %xmm1, %xmm4 ; SSE-NEXT: movhlps {{.*#+}} xmm4 = xmm4[1,1] ; SSE-NEXT: subss %xmm4, %xmm3 -; SSE-NEXT: movshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] -; SSE-NEXT: addss %xmm0, %xmm4 +; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] +; SSE-NEXT: movshdup {{.*#+}} xmm3 = xmm0[1,1,3,3] +; SSE-NEXT: addss %xmm0, %xmm3 ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; SSE-NEXT: addss %xmm0, %xmm1 -; SSE-NEXT: unpcklps {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1] +; SSE-NEXT: unpcklps {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] ; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1] -; SSE-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1] ; SSE-NEXT: movaps %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -431,12 +431,12 @@ define <4 x float> 
@test16(<4 x float> %A, <4 x float> %B) { ; AVX-NEXT: vsubss %xmm4, %xmm3, %xmm3 ; AVX-NEXT: vmovshdup {{.*#+}} xmm4 = xmm0[1,1,3,3] ; AVX-NEXT: vaddss %xmm0, %xmm4, %xmm4 +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[2,3] +; AVX-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1],xmm3[0],xmm2[3] ; AVX-NEXT: vpermilps {{.*#+}} xmm0 = xmm0[3,1,2,3] ; AVX-NEXT: vpermilps {{.*#+}} xmm1 = xmm1[3,1,2,3] ; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0 -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0],xmm4[0],xmm2[2,3] -; AVX-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] -; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0] +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1,2],xmm0[0] ; AVX-NEXT: retq %1 = extractelement <4 x float> %A, i32 0 %2 = extractelement <4 x float> %B, i32 0 diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll index 503b9416c8d3..4a0dc9c1eb17 100644 --- a/test/CodeGen/X86/sse41.ll +++ b/test/CodeGen/X86/sse41.ll @@ -273,8 +273,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { ; X32: ## BB#0: ## %entry ; X32-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X32-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X32-NEXT: addss %xmm1, %xmm0 ; X32-NEXT: addss %xmm2, %xmm3 +; X32-NEXT: addss %xmm1, %xmm0 ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X32-NEXT: retl ; @@ -282,8 +282,8 @@ define <2 x float> @buildvector(<2 x float> %A, <2 x float> %B) nounwind { ; X64: ## BB#0: ## %entry ; X64-NEXT: movshdup {{.*#+}} xmm2 = xmm0[1,1,3,3] ; X64-NEXT: movshdup {{.*#+}} xmm3 = xmm1[1,1,3,3] -; X64-NEXT: addss %xmm1, %xmm0 ; X64-NEXT: addss %xmm2, %xmm3 +; X64-NEXT: addss %xmm1, %xmm0 ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[2,3] ; X64-NEXT: retq entry: @@ -896,9 +896,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; X32-NEXT: addps %xmm1, %xmm0 ; X32-NEXT: addps %xmm2, %xmm3 ; X32-NEXT: addps %xmm3, %xmm0 ; X32-NEXT: retl @@ -908,9 +908,9 @@ define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x fl ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] +; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0] -; X64-NEXT: addps %xmm1, %xmm0 ; X64-NEXT: addps %xmm2, %xmm3 ; X64-NEXT: addps %xmm3, %xmm0 ; X64-NEXT: retq diff --git a/test/CodeGen/X86/stackmap-frame-setup.ll b/test/CodeGen/X86/stackmap-frame-setup.ll index b83a8d61f6a2..df5ed5431b8a 100644 --- a/test/CodeGen/X86/stackmap-frame-setup.ll +++ b/test/CodeGen/X86/stackmap-frame-setup.ll @@ -7,11 +7,11 @@ entry: store i64 11, i64* %metadata store i64 12, i64* %metadata store i64 13, i64* %metadata -; ISEL: ADJCALLSTACKDOWN64 0, 0, implicit-def +; ISEL: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def ; ISEL-NEXT: STACKMAP ; ISEL-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def call void (i64, i32, ...) 
@llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata) -; FAST-ISEL: ADJCALLSTACKDOWN64 0, 0, implicit-def +; FAST-ISEL: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def ; FAST-ISEL-NEXT: STACKMAP ; FAST-ISEL-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def ret void diff --git a/test/CodeGen/X86/vec_int_to_fp.ll b/test/CodeGen/X86/vec_int_to_fp.ll index a42b3c96c3ae..1eef67764ab9 100644 --- a/test/CodeGen/X86/vec_int_to_fp.ll +++ b/test/CodeGen/X86/vec_int_to_fp.ll @@ -4344,7 +4344,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_4 ; AVX1-NEXT: # BB#5: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4 ; AVX1-NEXT: jmp .LBB80_6 ; AVX1-NEXT: .LBB80_4: ; AVX1-NEXT: movq %rax, %rcx @@ -4352,22 +4352,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm4 ; AVX1-NEXT: .LBB80_6: ; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2 ; AVX1-NEXT: vmovq %xmm2, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_7 ; AVX1-NEXT: # BB#8: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; AVX1-NEXT: jmp .LBB80_9 ; AVX1-NEXT: .LBB80_7: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 -; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX1-NEXT: .LBB80_9: ; AVX1-NEXT: vpextrq $1, %xmm2, %rax ; AVX1-NEXT: testq %rax, %rax @@ -4397,29 +4397,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX1-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX1-NEXT: .LBB80_15: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] ; AVX1-NEXT: vmovq %xmm0, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_16 ; AVX1-NEXT: # BB#17: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 ; AVX1-NEXT: jmp .LBB80_18 ; AVX1-NEXT: .LBB80_16: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 -; AVX1-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; AVX1-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX1-NEXT: .LBB80_18: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX1-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 -; AVX1-NEXT: vmovq %xmm4, %rax +; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; AVX1-NEXT: vmovq %xmm3, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_19 ; AVX1-NEXT: # BB#20: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 ; AVX1-NEXT: jmp .LBB80_21 ; AVX1-NEXT: .LBB80_19: ; AVX1-NEXT: movq %rax, %rcx @@ -4427,25 +4427,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax ; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm5 +; AVX1-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX1-NEXT: .LBB80_21: 
+; AVX1-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3] ; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] -; AVX1-NEXT: vpextrq $1, %xmm4, %rax +; AVX1-NEXT: vpextrq $1, %xmm3, %rax ; AVX1-NEXT: testq %rax, %rax ; AVX1-NEXT: js .LBB80_22 ; AVX1-NEXT: # BB#23: -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 ; AVX1-NEXT: jmp .LBB80_24 ; AVX1-NEXT: .LBB80_22: ; AVX1-NEXT: movq %rax, %rcx ; AVX1-NEXT: shrq %rcx ; AVX1-NEXT: andl $1, %eax ; AVX1-NEXT: orq %rcx, %rax -; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 -; AVX1-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 +; AVX1-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX1-NEXT: .LBB80_24: -; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -4471,7 +4471,7 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_4 ; AVX2-NEXT: # BB#5: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm4 ; AVX2-NEXT: jmp .LBB80_6 ; AVX2-NEXT: .LBB80_4: ; AVX2-NEXT: movq %rax, %rcx @@ -4479,22 +4479,22 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm3, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm4 ; AVX2-NEXT: .LBB80_6: ; AVX2-NEXT: vextracti128 $1, %ymm2, %xmm2 ; AVX2-NEXT: vmovq %xmm2, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_7 ; AVX2-NEXT: # BB#8: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 ; AVX2-NEXT: jmp .LBB80_9 ; AVX2-NEXT: .LBB80_7: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm4, %xmm4 -; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm3 +; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 ; AVX2-NEXT: .LBB80_9: ; AVX2-NEXT: vpextrq $1, %xmm2, %rax ; AVX2-NEXT: testq %rax, %rax @@ -4524,29 +4524,29 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm5, %xmm5 ; AVX2-NEXT: vaddss %xmm5, %xmm5, %xmm5 ; AVX2-NEXT: .LBB80_15: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0],xmm1[0],xmm3[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0],xmm1[0],xmm4[2,3] ; AVX2-NEXT: vmovq %xmm0, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_16 ; AVX2-NEXT: # BB#17: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 ; AVX2-NEXT: jmp .LBB80_18 ; AVX2-NEXT: .LBB80_16: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm3 -; AVX2-NEXT: vaddss %xmm3, %xmm3, %xmm3 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm4 +; AVX2-NEXT: vaddss %xmm4, %xmm4, %xmm4 ; AVX2-NEXT: .LBB80_18: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm4[0],xmm1[3] -; AVX2-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[2,3] -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm4 -; AVX2-NEXT: vmovq %xmm4, %rax +; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0],xmm5[0],xmm4[2,3] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1],xmm3[0],xmm1[3] +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm3 +; AVX2-NEXT: vmovq %xmm3, %rax ; AVX2-NEXT: testq 
%rax, %rax ; AVX2-NEXT: js .LBB80_19 ; AVX2-NEXT: # BB#20: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm5 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 ; AVX2-NEXT: jmp .LBB80_21 ; AVX2-NEXT: .LBB80_19: ; AVX2-NEXT: movq %rax, %rcx @@ -4554,25 +4554,25 @@ define <8 x float> @uitofp_load_8i64_to_8f32(<8 x i64> *%a) { ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax ; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm0 -; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm5 +; AVX2-NEXT: vaddss %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: .LBB80_21: +; AVX2-NEXT: vinsertps {{.*#+}} xmm4 = xmm4[0,1],xmm0[0],xmm4[3] ; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm2[0] -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm3[0,1],xmm5[0],xmm3[3] -; AVX2-NEXT: vpextrq $1, %xmm4, %rax +; AVX2-NEXT: vpextrq $1, %xmm3, %rax ; AVX2-NEXT: testq %rax, %rax ; AVX2-NEXT: js .LBB80_22 ; AVX2-NEXT: # BB#23: -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 ; AVX2-NEXT: jmp .LBB80_24 ; AVX2-NEXT: .LBB80_22: ; AVX2-NEXT: movq %rax, %rcx ; AVX2-NEXT: shrq %rcx ; AVX2-NEXT: andl $1, %eax ; AVX2-NEXT: orq %rcx, %rax -; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm2 -; AVX2-NEXT: vaddss %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vcvtsi2ssq %rax, %xmm6, %xmm1 +; AVX2-NEXT: vaddss %xmm1, %xmm1, %xmm1 ; AVX2-NEXT: .LBB80_24: -; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm2[0] +; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm4[0,1,2],xmm1[0] ; AVX2-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; diff --git a/test/CodeGen/X86/vec_set-2.ll b/test/CodeGen/X86/vec_set-2.ll index 443264cdffd4..51c8b2111107 100644 --- a/test/CodeGen/X86/vec_set-2.ll +++ b/test/CodeGen/X86/vec_set-2.ll @@ -1,11 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64 define <4 x float> @test1(float %a) nounwind { -; CHECK-LABEL: test1: -; CHECK: # BB#0: -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: retl +; X86-LABEL: test1: +; X86: # BB#0: +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: retl +; +; X64-LABEL: test1: +; X64: # BB#0: +; X64-NEXT: xorps %xmm1, %xmm1 +; X64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; X64-NEXT: movaps %xmm1, %xmm0 +; X64-NEXT: retq %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 0 %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 1 %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 2 @@ -14,10 +22,15 @@ define <4 x float> @test1(float %a) nounwind { } define <2 x i64> @test(i32 %a) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: retq %tmp = insertelement <4 x i32> zeroinitializer, i32 %a, i32 0 %tmp6 = insertelement <4 x i32> %tmp, i32 0, i32 1 %tmp8 = insertelement <4 x i32> %tmp6, i32 0, i32 2 diff --git a/test/CodeGen/X86/vec_set-3.ll b/test/CodeGen/X86/vec_set-3.ll index ee4a08599968..b34f30924a8d 100644 --- a/test/CodeGen/X86/vec_set-3.ll +++ b/test/CodeGen/X86/vec_set-3.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by 
utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X64 define <4 x float> @test(float %a) { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,mem[0],zero,zero -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: insertps {{.*#+}} xmm0 = zero,mem[0],zero,zero +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; X64-NEXT: retq %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 %tmp5 = insertelement <4 x float> %tmp, float 0.000000e+00, i32 2 %tmp6 = insertelement <4 x float> %tmp5, float 0.000000e+00, i32 3 @@ -13,11 +19,17 @@ define <4 x float> @test(float %a) { } define <2 x i64> @test2(i32 %a) { -; CHECK-LABEL: test2: -; CHECK: # BB#0: -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] -; CHECK-NEXT: retl +; X86-LABEL: test2: +; X86: # BB#0: +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] +; X86-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: +; X64-NEXT: movd %edi, %xmm0 +; X64-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,0,1] +; X64-NEXT: retq %tmp7 = insertelement <4 x i32> zeroinitializer, i32 %a, i32 2 %tmp9 = insertelement <4 x i32> %tmp7, i32 0, i32 3 %tmp10 = bitcast <4 x i32> %tmp9 to <2 x i64> @@ -25,10 +37,15 @@ define <2 x i64> @test2(i32 %a) { } define <4 x float> @test3(<4 x float> %A) { -; CHECK-LABEL: test3: -; CHECK: # BB#0: -; CHECK-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero -; CHECK-NEXT: retl +; X86-LABEL: test3: +; X86: # BB#0: +; X86-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; X86-NEXT: retl +; +; X64-LABEL: test3: +; X64: # BB#0: +; X64-NEXT: insertps {{.*#+}} xmm0 = zero,xmm0[0],zero,zero +; X64-NEXT: retq %tmp0 = extractelement <4 x float> %A, i32 0 %tmp1 = insertelement <4 x float> , float %tmp0, i32 1 %tmp2 = insertelement <4 x float> %tmp1, float 0.000000e+00, i32 2 diff --git a/test/CodeGen/X86/vec_set-4.ll b/test/CodeGen/X86/vec_set-4.ll index 8f35529d61b4..09142e16aa6e 100644 --- a/test/CodeGen/X86/vec_set-4.ll +++ b/test/CodeGen/X86/vec_set-4.ll @@ -1,12 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 define <2 x i64> @test(i16 %a) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: pxor %xmm0, %xmm0 +; X86-NEXT: pinsrw $3, {{[0-9]+}}(%esp), %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: pinsrw $3, %edi, %xmm0 +; X64-NEXT: retq %tmp10 = insertelement <8 x i16> zeroinitializer, i16 %a, i32 3 %tmp12 = insertelement <8 x i16> %tmp10, i16 0, i32 4 %tmp14 = insertelement <8 x i16> %tmp12, i16 0, i32 5 @@ -17,12 +24,19 @@ define <2 x i64> @test(i16 %a) nounwind { } define <2 x i64> @test2(i8 %a) nounwind { -; CHECK-LABEL: test2: -; CHECK: # BB#0: -; CHECK-NEXT: movzbl 
{{[0-9]+}}(%esp), %eax -; CHECK-NEXT: pxor %xmm0, %xmm0 -; CHECK-NEXT: pinsrw $5, %eax, %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test2: +; X86: # BB#0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax +; X86-NEXT: pxor %xmm0, %xmm0 +; X86-NEXT: pinsrw $5, %eax, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: +; X64-NEXT: movzbl %dil, %eax +; X64-NEXT: pxor %xmm0, %xmm0 +; X64-NEXT: pinsrw $5, %eax, %xmm0 +; X64-NEXT: retq %tmp24 = insertelement <16 x i8> zeroinitializer, i8 %a, i32 10 %tmp26 = insertelement <16 x i8> %tmp24, i8 0, i32 11 %tmp28 = insertelement <16 x i8> %tmp26, i8 0, i32 12 diff --git a/test/CodeGen/X86/vec_set-6.ll b/test/CodeGen/X86/vec_set-6.ll index 4429834b8ef0..3c9aca3a02da 100644 --- a/test/CodeGen/X86/vec_set-6.ll +++ b/test/CodeGen/X86/vec_set-6.ll @@ -1,13 +1,22 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,+sse4.1 | FileCheck %s --check-prefix=X64 define <4 x float> @test(float %a, float %b, float %c) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; CHECK-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1] -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; X86-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1] +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; X64-NEXT: xorps %xmm2, %xmm2 +; X64-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3] +; X64-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[0,1] +; X64-NEXT: retq %tmp = insertelement <4 x float> zeroinitializer, float %a, i32 1 %tmp8 = insertelement <4 x float> %tmp, float %b, i32 2 %tmp10 = insertelement <4 x float> %tmp8, float %c, i32 3 diff --git a/test/CodeGen/X86/vec_set-7.ll b/test/CodeGen/X86/vec_set-7.ll index e8fe6debb140..757a0d44cd43 100644 --- a/test/CodeGen/X86/vec_set-7.ll +++ b/test/CodeGen/X86/vec_set-7.ll @@ -1,12 +1,18 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 define <2 x i64> @test(<2 x i64>* %p) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: retq %tmp = bitcast <2 x i64>* %p to double* %tmp.upgrd.1 = load double, double* %tmp %tmp.upgrd.2 = insertelement <2 x double> undef, double %tmp.upgrd.1, i32 0 diff --git a/test/CodeGen/X86/vec_set-8.ll b/test/CodeGen/X86/vec_set-8.ll index 7a4326c01bb7..a9dceb90855a 100644 --- a/test/CodeGen/X86/vec_set-8.ll +++ b/test/CodeGen/X86/vec_set-8.ll @@ -1,11 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s 
-mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse4.2 | FileCheck %s --check-prefix=X64 define <2 x i64> @test(i64 %i) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movq %rdi, %xmm0 -; CHECK-NEXT: retq +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: movq %rdi, %xmm0 +; X64-NEXT: retq %tmp10 = insertelement <2 x i64> undef, i64 %i, i32 0 %tmp11 = insertelement <2 x i64> %tmp10, i64 0, i32 1 ret <2 x i64> %tmp11 diff --git a/test/CodeGen/X86/vec_set-A.ll b/test/CodeGen/X86/vec_set-A.ll index cae39a3d775b..259ace98d362 100644 --- a/test/CodeGen/X86/vec_set-A.ll +++ b/test/CodeGen/X86/vec_set-A.ll @@ -1,12 +1,19 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 define <2 x i64> @test1() nounwind { -; CHECK-LABEL: test1: -; CHECK: # BB#0: -; CHECK-NEXT: movl $1, %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test1: +; X86: # BB#0: +; X86-NEXT: movl $1, %eax +; X86-NEXT: movd %eax, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test1: +; X64: # BB#0: +; X64-NEXT: movl $1, %eax +; X64-NEXT: movq %rax, %xmm0 +; X64-NEXT: retq ret <2 x i64> < i64 1, i64 0 > } diff --git a/test/CodeGen/X86/vec_set-B.ll b/test/CodeGen/X86/vec_set-B.ll index 0580a3376656..ecd9b57cfd0c 100644 --- a/test/CodeGen/X86/vec_set-B.ll +++ b/test/CodeGen/X86/vec_set-B.ll @@ -1,5 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2 | FileCheck %s --check-prefix=X64 ; These should both generate something like this: ;_test3: @@ -9,26 +10,37 @@ ; ret define <2 x i64> @test3(i64 %arg) nounwind { -; CHECK-LABEL: test3: -; CHECK: # BB#0: -; CHECK-NEXT: movl $1234567, %eax # imm = 0x12D687 -; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test3: +; X86: # BB#0: +; X86-NEXT: movl $1234567, %eax # imm = 0x12D687 +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd %eax, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test3: +; X64: # BB#0: +; X64-NEXT: andl $1234567, %edi # imm = 0x12D687 +; X64-NEXT: movq %rdi, %xmm0 +; X64-NEXT: retq %A = and i64 %arg, 1234567 %B = insertelement <2 x i64> zeroinitializer, i64 %A, i32 0 ret <2 x i64> %B } define <2 x i64> @test2(i64 %arg) nounwind { -; CHECK-LABEL: test2: -; CHECK: # BB#0: -; CHECK-NEXT: movl $1234567, %eax # imm = 0x12D687 -; CHECK-NEXT: andl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movd %eax, %xmm0 -; CHECK-NEXT: retl +; X86-LABEL: test2: +; X86: # BB#0: +; X86-NEXT: movl $1234567, %eax # imm = 0x12D687 +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd %eax, %xmm0 +; X86-NEXT: retl +; +; X64-LABEL: test2: +; X64: # BB#0: +; X64-NEXT: andl $1234567, %edi # imm = 0x12D687 +; X64-NEXT: movq %rdi, %xmm0 +; X64-NEXT: retq %A = and i64 %arg, 1234567 %B = insertelement <2 x i64> undef, i64 %A, i32 0 ret <2 x i64> %B } - diff --git 
a/test/CodeGen/X86/vec_set-C.ll b/test/CodeGen/X86/vec_set-C.ll index 994bc2b3056e..865e2fb83f17 100644 --- a/test/CodeGen/X86/vec_set-C.ll +++ b/test/CodeGen/X86/vec_set-C.ll @@ -1,12 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2,-avx | FileCheck %s --check-prefix=X32 +; RUN: llc < %s -mtriple=i386-linux-gnu -mattr=+sse2,-avx | FileCheck %s --check-prefix=X86 ; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+sse2,-avx | FileCheck %s --check-prefix=X64 define <2 x i64> @t1(i64 %x) nounwind { -; X32-LABEL: t1: -; X32: # BB#0: -; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; X32-NEXT: retl +; X86-LABEL: t1: +; X86: # BB#0: +; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; X86-NEXT: retl ; ; X64-LABEL: t1: ; X64: # BB#0: diff --git a/test/CodeGen/X86/vec_set.ll b/test/CodeGen/X86/vec_set.ll index 49bd3beef75a..6439a6dcb00b 100644 --- a/test/CodeGen/X86/vec_set.ll +++ b/test/CodeGen/X86/vec_set.ll @@ -1,27 +1,48 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s +; RUN: llc < %s -mtriple=i386-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X86 +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+sse2,-sse4.1 | FileCheck %s --check-prefix=X64 define void @test(<8 x i16>* %b, i16 %a0, i16 %a1, i16 %a2, i16 %a3, i16 %a4, i16 %a5, i16 %a6, i16 %a7) nounwind { -; CHECK-LABEL: test: -; CHECK: # BB#0: -; CHECK-NEXT: movl {{[0-9]+}}(%esp), %eax -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] -; CHECK-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] -; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; CHECK-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero -; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] -; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] -; CHECK-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] -; CHECK-NEXT: movdqa %xmm3, (%eax) -; CHECK-NEXT: retl +; X86-LABEL: test: +; X86: # BB#0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3] +; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X86-NEXT: 
punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X86-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X86-NEXT: movd {{.*#+}} xmm3 = mem[0],zero,zero,zero +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X86-NEXT: movdqa %xmm3, (%eax) +; X86-NEXT: retl +; +; X64-LABEL: test: +; X64: # BB#0: +; X64-NEXT: movd %r8d, %xmm0 +; X64-NEXT: movd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] +; X64-NEXT: movd %edx, %xmm1 +; X64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3] +; X64-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; X64-NEXT: movd %ecx, %xmm0 +; X64-NEXT: movd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; X64-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] +; X64-NEXT: movd %r9d, %xmm2 +; X64-NEXT: movd %esi, %xmm3 +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3] +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm0[0],xmm3[1],xmm0[1],xmm3[2],xmm0[2],xmm3[3],xmm0[3] +; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3] +; X64-NEXT: movdqa %xmm3, (%rdi) +; X64-NEXT: retq %tmp = insertelement <8 x i16> zeroinitializer, i16 %a0, i32 0 %tmp2 = insertelement <8 x i16> %tmp, i16 %a1, i32 1 %tmp4 = insertelement <8 x i16> %tmp2, i16 %a2, i32 2 diff --git a/test/CodeGen/X86/vector-bitreverse.ll b/test/CodeGen/X86/vector-bitreverse.ll index 226c0adbaf3c..2fb821555dba 100644 --- a/test/CodeGen/X86/vector-bitreverse.ll +++ b/test/CodeGen/X86/vector-bitreverse.ll @@ -2372,10 +2372,10 @@ define <8 x i64> @test_bitreverse_v8i64(<8 x i64> %a) nounwind { ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpsrlq $24, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 -; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm3 -; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm3, %zmm3 ; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 -; AVX512F-NEXT: vporq %zmm1, %zmm3, %zmm1 +; AVX512F-NEXT: vpsrlq $8, %zmm0, %zmm2 +; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 +; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm1 ; AVX512F-NEXT: vpsllq $8, %zmm0, %zmm2 ; AVX512F-NEXT: vpandq {{.*}}(%rip){1to8}, %zmm2, %zmm2 ; AVX512F-NEXT: vpsllq $24, %zmm0, %zmm3 diff --git a/test/CodeGen/X86/vector-blend.ll b/test/CodeGen/X86/vector-blend.ll index a05a981daa1f..f0a5fe1dbfff 100644 --- a/test/CodeGen/X86/vector-blend.ll +++ b/test/CodeGen/X86/vector-blend.ll @@ -848,10 +848,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) { ; SSE2-NEXT: psrad $31, %xmm1 ; SSE2-NEXT: pand %xmm1, %xmm3 ; SSE2-NEXT: pandn %xmm5, %xmm1 +; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm2 ; SSE2-NEXT: pandn %xmm4, %xmm0 ; SSE2-NEXT: por %xmm2, %xmm0 -; SSE2-NEXT: por %xmm3, %xmm1 ; SSE2-NEXT: retq ; ; SSSE3-LABEL: blend_logic_v8i32: @@ -860,10 +860,10 @@ define <8 x i32> @blend_logic_v8i32(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) { ; SSSE3-NEXT: psrad $31, %xmm1 ; SSSE3-NEXT: pand %xmm1, %xmm3 ; SSSE3-NEXT: 
pandn %xmm5, %xmm1 +; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: pand %xmm0, %xmm2 ; SSSE3-NEXT: pandn %xmm4, %xmm0 ; SSSE3-NEXT: por %xmm2, %xmm0 -; SSSE3-NEXT: por %xmm3, %xmm1 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: blend_logic_v8i32: diff --git a/test/CodeGen/X86/vector-lzcnt-128.ll b/test/CodeGen/X86/vector-lzcnt-128.ll index f1f795bf3cb0..e3261d15538f 100644 --- a/test/CodeGen/X86/vector-lzcnt-128.ll +++ b/test/CodeGen/X86/vector-lzcnt-128.ll @@ -1,15 +1,17 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE3 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSSE3 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE41 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X64 --check-prefix=SSE --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=X64 --check-prefix=AVX512VLBWDQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512VLCD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512CD ; ; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt. 
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=ALL --check-prefix=X32-SSE --check-prefix=X32-SSE41 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=X32-SSE define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; SSE2-LABEL: testv2i64: @@ -194,16 +196,46 @@ define <2 x i64> @testv2i64(<2 x i64> %in) nounwind { ; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv2i64: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv2i64: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv2i64: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv2i64: @@ -429,16 +461,46 @@ define <2 x i64> @testv2i64u(<2 x i64> %in) nounwind { ; AVX-NEXT: vpaddq %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv2i64u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, 
%xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddd %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddq %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv2i64u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntq %xmm0, %xmm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv2i64u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv2i64u: @@ -651,16 +713,41 @@ define <4 x i32> @testv4i32(<4 x i32> %in) nounwind { ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv4i32: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv4i32: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv4i32: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv4i32: @@ -867,16 +954,41 @@ define <4 x i32> @testv4i32u(<4 x i32> %in) nounwind { ; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv4i32u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm2 +; 
AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpand %xmm2, %xmm1, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm2, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddd %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv4i32u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntd %xmm0, %xmm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv4i32u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %XMM0 %XMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %XMM0 %XMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-SSE-LABEL: testv4i32u: @@ -1054,8 +1166,28 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv8i16: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv8i16: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 @@ -1063,7 +1195,7 @@ define <8 x i16> @testv8i16(<8 x i16> %in) nounwind { ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv8i16: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 @@ -1238,8 +1370,28 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX-NEXT: vpaddw %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv8i16u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm4 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm4, %xmm1 +; AVX512VLBWDQ-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm1, %xmm5 +; AVX512VLBWDQ-NEXT: vpand %xmm5, %xmm2, %xmm2 +; AVX512VLBWDQ-NEXT: vpshufb %xmm1, %xmm3, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddb %xmm1, %xmm2, %xmm1 +; 
AVX512VLBWDQ-NEXT: vpcmpeqb %xmm4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpaddw %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv8i16u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: vpmovdw %ymm0, %xmm0 @@ -1247,7 +1399,7 @@ define <8 x i16> @testv8i16u(<8 x i16> %in) nounwind { ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv8i16u: -; AVX512CD: ## BB#0: +; AVX512CD: # BB#0: ; AVX512CD-NEXT: vpmovzxwd {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512CD-NEXT: vpmovdw %zmm0, %ymm0 @@ -1399,8 +1551,23 @@ define <16 x i8> @testv16i8(<16 x i8> %in) nounwind { ; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv16i8: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv16i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 @@ -1546,8 +1713,23 @@ define <16 x i8> @testv16i8u(<16 x i8> %in) nounwind { ; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX-NEXT: retq ; +; AVX512VLBWDQ-LABEL: testv16i8u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %xmm2, %xmm3, %xmm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm0, %xmm0 +; AVX512VLBWDQ-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm1 +; AVX512VLBWDQ-NEXT: vpand %xmm1, %xmm2, %xmm1 +; AVX512VLBWDQ-NEXT: vpshufb %xmm0, %xmm3, %xmm0 +; AVX512VLBWDQ-NEXT: vpaddb %xmm0, %xmm1, %xmm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv16i8u: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm0 = 
xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero,xmm0[8],zero,zero,zero,xmm0[9],zero,zero,zero,xmm0[10],zero,zero,zero,xmm0[11],zero,zero,zero,xmm0[12],zero,zero,zero,xmm0[13],zero,zero,zero,xmm0[14],zero,zero,zero,xmm0[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 @@ -1582,17 +1764,17 @@ define <2 x i64> @foldv2i64() nounwind { ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: foldv2i64: -; AVX: # BB#0: -; AVX-NEXT: movl $55, %eax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: retq +; NOBW-LABEL: foldv2i64: +; NOBW: # BB#0: +; NOBW-NEXT: movl $55, %eax +; NOBW-NEXT: vmovq %rax, %xmm0 +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv2i64: -; AVX512: ## BB#0: -; AVX512-NEXT: movl $55, %eax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv2i64: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: movl $55, %eax +; AVX512VLBWDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64: ; X32-SSE: # BB#0: @@ -1610,17 +1792,17 @@ define <2 x i64> @foldv2i64u() nounwind { ; SSE-NEXT: movq %rax, %xmm0 ; SSE-NEXT: retq ; -; AVX-LABEL: foldv2i64u: -; AVX: # BB#0: -; AVX-NEXT: movl $55, %eax -; AVX-NEXT: vmovq %rax, %xmm0 -; AVX-NEXT: retq +; NOBW-LABEL: foldv2i64u: +; NOBW: # BB#0: +; NOBW-NEXT: movl $55, %eax +; NOBW-NEXT: vmovq %rax, %xmm0 +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv2i64u: -; AVX512: ## BB#0: -; AVX512-NEXT: movl $55, %eax -; AVX512-NEXT: vmovq %rax, %xmm0 -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv2i64u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: movl $55, %eax +; AVX512VLBWDQ-NEXT: vmovq %rax, %xmm0 +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv2i64u: ; X32-SSE: # BB#0: @@ -1637,15 +1819,15 @@ define <4 x i32> @foldv4i32() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv4i32: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX-NEXT: retq +; NOBW-LABEL: foldv4i32: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv4i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv4i32: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32: ; X32-SSE: # BB#0: @@ -1661,15 +1843,15 @@ define <4 x i32> @foldv4i32u() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [23,0,32,24] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv4i32u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX-NEXT: retq +; NOBW-LABEL: foldv4i32u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv4i32u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv4i32u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovaps {{.*#+}} xmm0 = [23,0,32,24] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv4i32u: ; X32-SSE: # BB#0: @@ -1685,15 +1867,15 @@ define <8 x i16> @foldv8i16() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv8i16: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] -; AVX-NEXT: retq +; NOBW-LABEL: foldv8i16: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = 
[7,0,16,8,16,13,11,9] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv8i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv8i16: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16: ; X32-SSE: # BB#0: @@ -1709,15 +1891,15 @@ define <8 x i16> @foldv8i16u() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv8i16u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] -; AVX-NEXT: retq +; NOBW-LABEL: foldv8i16u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv8i16u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv8i16u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [7,0,16,8,16,13,11,9] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv8i16u: ; X32-SSE: # BB#0: @@ -1733,15 +1915,15 @@ define <16 x i8> @foldv16i8() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv16i8: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; AVX-NEXT: retq +; NOBW-LABEL: foldv16i8: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv16i8: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv16i8: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8: ; X32-SSE: # BB#0: @@ -1757,15 +1939,15 @@ define <16 x i8> @foldv16i8u() nounwind { ; SSE-NEXT: movaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] ; SSE-NEXT: retq ; -; AVX-LABEL: foldv16i8u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; AVX-NEXT: retq +; NOBW-LABEL: foldv16i8u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; NOBW-NEXT: retq ; -; AVX512-LABEL: foldv16i8u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] -; AVX512-NEXT: retq +; AVX512VLBWDQ-LABEL: foldv16i8u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} xmm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2] +; AVX512VLBWDQ-NEXT: retq ; ; X32-SSE-LABEL: foldv16i8u: ; X32-SSE: # BB#0: diff --git a/test/CodeGen/X86/vector-lzcnt-256.ll b/test/CodeGen/X86/vector-lzcnt-256.ll index 53cb4d8e445b..185e1f4865ea 100644 --- a/test/CodeGen/X86/vector-lzcnt-256.ll +++ b/test/CodeGen/X86/vector-lzcnt-256.ll @@ -1,11 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512VLCD -; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 
--check-prefix=AVX512CD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512VL +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+avx512dq | FileCheck %s --check-prefix=X64 --check-prefix=AVX512VLBWDQ +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd -mattr=+avx512vl | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512VLCD +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512cd | FileCheck %s --check-prefix=X64 --check-prefix=NOBW --check-prefix=AVX512 --check-prefix=AVX512CD ; ; Just one 32-bit run to make sure we do reasonable things for i64 lzcnt. -; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=X32-AVX define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX1-LABEL: testv4i64: @@ -93,16 +95,76 @@ define <4 x i64> @testv4i64(<4 x i64> %in) nounwind { ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv4i64: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv4i64: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand 
%ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv4i64: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntq %ymm0, %ymm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv4i64: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-AVX-LABEL: testv4i64: @@ -225,16 +287,76 @@ define <4 x i64> @testv4i64u(<4 x i64> %in) nounwind { ; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv4i64u: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv4i64u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; 
AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddd %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqd %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlq $32, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddq %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv4i64u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntq %ymm0, %ymm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv4i64u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntq %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-AVX-LABEL: testv4i64u: @@ -342,16 +464,66 @@ define <8 x i32> @testv8i32(<8 x i32> %in) nounwind { ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv8i32: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv8i32: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; 
AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv8i32: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv8i32: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-AVX-LABEL: testv8i32: @@ -454,16 +626,66 @@ define <8 x i32> @testv8i32u(<8 x i32> %in) nounwind { ; AVX2-NEXT: vpaddd %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv8i32u: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VL-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv8i32u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpand %ymm2, %ymm1, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm2, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqw %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrld $16, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddd %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512VLCD-LABEL: testv8i32u: -; AVX512VLCD: ## BB#0: +; AVX512VLCD: # BB#0: ; AVX512VLCD-NEXT: vplzcntd %ymm0, %ymm0 ; 
AVX512VLCD-NEXT: retq ; ; AVX512CD-LABEL: testv8i32u: -; AVX512CD: ## BB#0: -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD: # BB#0: +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: vplzcntd %zmm0, %zmm0 -; AVX512CD-NEXT: ## kill: %YMM0 %YMM0 %ZMM0 +; AVX512CD-NEXT: # kill: %YMM0 %YMM0 %ZMM0 ; AVX512CD-NEXT: retq ; ; X32-AVX-LABEL: testv8i32u: @@ -551,8 +773,48 @@ define <16 x i16> @testv16i16(<16 x i16> %in) nounwind { ; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv16i16: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv16i16: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv16i16: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 @@ -638,8 +900,48 @@ define <16 x i16> @testv16i16u(<16 x i16> %in) nounwind { ; AVX2-NEXT: vpaddw %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv16i16u: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VL-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VL-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VL-NEXT: 
vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VL-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VL-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv16i16u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm4 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm4, %ymm1 +; AVX512VLBWDQ-NEXT: vpxor %ymm4, %ymm4, %ymm4 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm5 +; AVX512VLBWDQ-NEXT: vpand %ymm5, %ymm2, %ymm2 +; AVX512VLBWDQ-NEXT: vpshufb %ymm1, %ymm3, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddb %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: vpsrlw $8, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpaddw %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv16i16u: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vplzcntd %zmm0, %zmm0 ; AVX512-NEXT: vpmovdw %zmm0, %ymm0 @@ -710,8 +1012,38 @@ define <32 x i8> @testv32i8(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv32i8: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv32i8: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv32i8: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; 
AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 @@ -784,8 +1116,38 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { ; AVX2-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX2-NEXT: retq ; +; AVX512VL-LABEL: testv32i8u: +; AVX512VL: # BB#0: +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VL-NEXT: vmovdqa {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VL-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VL-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX512VL-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VL-NEXT: retq +; +; AVX512VLBWDQ-LABEL: testv32i8u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqa {{.*#+}} ymm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm2 +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm3 = [4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0,4,3,2,2,1,1,1,1,0,0,0,0,0,0,0,0] +; AVX512VLBWDQ-NEXT: vpshufb %ymm2, %ymm3, %ymm2 +; AVX512VLBWDQ-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512VLBWDQ-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; AVX512VLBWDQ-NEXT: vpcmpeqb %ymm1, %ymm0, %ymm1 +; AVX512VLBWDQ-NEXT: vpand %ymm1, %ymm2, %ymm1 +; AVX512VLBWDQ-NEXT: vpshufb %ymm0, %ymm3, %ymm0 +; AVX512VLBWDQ-NEXT: vpaddb %ymm0, %ymm1, %ymm0 +; AVX512VLBWDQ-NEXT: retq +; ; AVX512-LABEL: testv32i8u: -; AVX512: ## BB#0: +; AVX512: # BB#0: ; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 ; AVX512-NEXT: vpmovzxbd {{.*#+}} zmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero,xmm1[2],zero,zero,zero,xmm1[3],zero,zero,zero,xmm1[4],zero,zero,zero,xmm1[5],zero,zero,zero,xmm1[6],zero,zero,zero,xmm1[7],zero,zero,zero,xmm1[8],zero,zero,zero,xmm1[9],zero,zero,zero,xmm1[10],zero,zero,zero,xmm1[11],zero,zero,zero,xmm1[12],zero,zero,zero,xmm1[13],zero,zero,zero,xmm1[14],zero,zero,zero,xmm1[15],zero,zero,zero ; AVX512-NEXT: vplzcntd %zmm1, %zmm1 @@ -818,15 +1180,10 @@ define <32 x i8> @testv32i8u(<32 x i8> %in) nounwind { } define <4 x i64> @foldv4i64() nounwind { -; AVX-LABEL: foldv4i64: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv4i64: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX512-NEXT: retq +; X64-LABEL: foldv4i64: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64: ; X32-AVX: # BB#0: @@ -837,15 +1194,10 @@ define <4 x i64> @foldv4i64() nounwind { } define <4 x i64> @foldv4i64u() nounwind { -; AVX-LABEL: foldv4i64u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv4i64u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] -; 
AVX512-NEXT: retq +; X64-LABEL: foldv4i64u: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [55,0,64,56] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv4i64u: ; X32-AVX: # BB#0: @@ -856,15 +1208,10 @@ define <4 x i64> @foldv4i64u() nounwind { } define <8 x i32> @foldv8i32() nounwind { -; AVX-LABEL: foldv8i32: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv8i32: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX512-NEXT: retq +; X64-LABEL: foldv8i32: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv8i32: ; X32-AVX: # BB#0: @@ -875,15 +1222,10 @@ define <8 x i32> @foldv8i32() nounwind { } define <8 x i32> @foldv8i32u() nounwind { -; AVX-LABEL: foldv8i32u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv8i32u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] -; AVX512-NEXT: retq +; X64-LABEL: foldv8i32u: +; X64: # BB#0: +; X64-NEXT: vmovaps {{.*#+}} ymm0 = [23,0,32,24,0,29,27,25] +; X64-NEXT: retq ; ; X32-AVX-LABEL: foldv8i32u: ; X32-AVX: # BB#0: @@ -894,15 +1236,15 @@ define <8 x i32> @foldv8i32u() nounwind { } define <16 x i16> @foldv16i16() nounwind { -; AVX-LABEL: foldv16i16: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv16i16: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX512-NEXT: retq +; NOBW-LABEL: foldv16i16: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; NOBW-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv16i16: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX512VLBWDQ-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16: ; X32-AVX: # BB#0: @@ -913,15 +1255,15 @@ define <16 x i16> @foldv16i16() nounwind { } define <16 x i16> @foldv16i16u() nounwind { -; AVX-LABEL: foldv16i16u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv16i16u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] -; AVX512-NEXT: retq +; NOBW-LABEL: foldv16i16u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; NOBW-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv16i16u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [7,0,16,8,16,13,11,9,0,8,15,14,13,12,11,10] +; AVX512VLBWDQ-NEXT: retq ; ; X32-AVX-LABEL: foldv16i16u: ; X32-AVX: # BB#0: @@ -932,15 +1274,15 @@ define <16 x i16> @foldv16i16u() nounwind { } define <32 x i8> @foldv32i8() nounwind { -; AVX-LABEL: foldv32i8: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv32i8: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX512-NEXT: retq +; NOBW-LABEL: foldv32i8: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; NOBW-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv32i8: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = 
[8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX512VLBWDQ-NEXT: retq ; ; X32-AVX-LABEL: foldv32i8: ; X32-AVX: # BB#0: @@ -951,15 +1293,15 @@ define <32 x i8> @foldv32i8() nounwind { } define <32 x i8> @foldv32i8u() nounwind { -; AVX-LABEL: foldv32i8u: -; AVX: # BB#0: -; AVX-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX-NEXT: retq -; -; AVX512-LABEL: foldv32i8u: -; AVX512: ## BB#0: -; AVX512-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] -; AVX512-NEXT: retq +; NOBW-LABEL: foldv32i8u: +; NOBW: # BB#0: +; NOBW-NEXT: vmovaps {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; NOBW-NEXT: retq +; +; AVX512VLBWDQ-LABEL: foldv32i8u: +; AVX512VLBWDQ: # BB#0: +; AVX512VLBWDQ-NEXT: vmovdqu {{.*#+}} ymm0 = [8,0,8,0,8,5,3,1,0,0,7,6,5,4,3,2,1,0,8,8,0,0,0,0,0,0,0,0,6,5,5,1] +; AVX512VLBWDQ-NEXT: retq ; ; X32-AVX-LABEL: foldv32i8u: ; X32-AVX: # BB#0: diff --git a/test/CodeGen/X86/vector-narrow-binop.ll b/test/CodeGen/X86/vector-narrow-binop.ll new file mode 100644 index 000000000000..f737ea2b7fba --- /dev/null +++ b/test/CodeGen/X86/vector-narrow-binop.ll @@ -0,0 +1,111 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=ALL --check-prefix=SSE --check-prefix=SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX1 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512F +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512dq | FileCheck %s --check-prefix=ALL --check-prefix=AVX --check-prefix=AVX512 --check-prefix=AVX512DQ + +; AVX1 has support for 256-bit bitwise logic because the FP variants were included. +; If using those ops requires extra insert/extract though, it's probably not worth it. 
+ +define <8 x i32> @PR32790(<8 x i32> %a, <8 x i32> %b, <8 x i32> %c, <8 x i32> %d) { +; SSE-LABEL: PR32790: +; SSE: # BB#0: +; SSE-NEXT: paddd %xmm2, %xmm0 +; SSE-NEXT: paddd %xmm3, %xmm1 +; SSE-NEXT: pand %xmm5, %xmm1 +; SSE-NEXT: pand %xmm4, %xmm0 +; SSE-NEXT: psubd %xmm6, %xmm0 +; SSE-NEXT: psubd %xmm7, %xmm1 +; SSE-NEXT: retq +; +; AVX1-LABEL: PR32790: +; AVX1: # BB#0: +; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5 +; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4 +; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0 +; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2 +; AVX1-NEXT: vpsubd %xmm2, %xmm1, %xmm1 +; AVX1-NEXT: vpsubd %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: retq +; +; AVX2-LABEL: PR32790: +; AVX2: # BB#0: +; AVX2-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpsubd %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: retq +; +; AVX512-LABEL: PR32790: +; AVX512: # BB#0: +; AVX512-NEXT: vpaddd %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vpand %ymm2, %ymm0, %ymm0 +; AVX512-NEXT: vpsubd %ymm3, %ymm0, %ymm0 +; AVX512-NEXT: retq + %add = add <8 x i32> %a, %b + %and = and <8 x i32> %add, %c + %sub = sub <8 x i32> %and, %d + ret <8 x i32> %sub +} + +; In a more extreme case, even the later AVX targets should avoid extract/insert just +; because 256-bit ops are supported. + +define <4 x i32> @do_not_use_256bit_op(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c, <4 x i32> %d) { +; SSE-LABEL: do_not_use_256bit_op: +; SSE: # BB#0: +; SSE-NEXT: pand %xmm3, %xmm1 +; SSE-NEXT: pand %xmm2, %xmm0 +; SSE-NEXT: psubd %xmm1, %xmm0 +; SSE-NEXT: retq +; +; AVX1-LABEL: do_not_use_256bit_op: +; AVX1: # BB#0: +; AVX1-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; AVX1-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1 +; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX1-NEXT: vzeroupper +; AVX1-NEXT: retq +; +; AVX2-LABEL: do_not_use_256bit_op: +; AVX2: # BB#0: +; AVX2-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; AVX2-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 +; AVX2-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq +; +; AVX512-LABEL: do_not_use_256bit_op: +; AVX512: # BB#0: +; AVX512-NEXT: # kill: %XMM2 %XMM2 %YMM2 +; AVX512-NEXT: # kill: %XMM0 %XMM0 %YMM0 +; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX512-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm1 +; AVX512-NEXT: vpand %ymm1, %ymm0, %ymm0 +; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm1 +; AVX512-NEXT: vpsubd %xmm1, %xmm0, %xmm0 +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + %concat1 = shufflevector <4 x i32> %a, <4 x i32> %b, <8 x i32> + %concat2 = shufflevector <4 x i32> %c, <4 x i32> %d, <8 x i32> + %and = and <8 x i32> %concat1, %concat2 + %extract1 = shufflevector <8 x i32> %and, <8 x i32> undef, <4 x i32> + %extract2 = shufflevector <8 x i32> %and, <8 x i32> undef, <4 x i32> + %sub = sub <4 x i32> %extract1, %extract2 + ret <4 x i32> %sub +} + diff --git a/test/CodeGen/X86/vector-pcmp.ll b/test/CodeGen/X86/vector-pcmp.ll index f05588a2920c..99a05c3d49c0 100644 --- a/test/CodeGen/X86/vector-pcmp.ll +++ 
b/test/CodeGen/X86/vector-pcmp.ll @@ -148,8 +148,8 @@ define <32 x i8> @test_pcmpgtb_256(<32 x i8> %x) { ; AVX1-NEXT: vpcmpgtb %xmm1, %xmm2, %xmm1 ; AVX1-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -177,8 +177,8 @@ define <16 x i16> @test_pcmpgtw_256(<16 x i16> %x) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsraw $15, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -206,8 +206,8 @@ define <8 x i32> @test_pcmpgtd_256(<8 x i32> %x) { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -242,14 +242,13 @@ define <4 x i64> @test_pcmpgtq_256(<4 x i64> %x) { ; ; AVX1-LABEL: test_pcmpgtq_256: ; AVX1: # BB#0: -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3] -; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 -; AVX1-NEXT: vpsrad $31, %xmm0, %xmm0 -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 +; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm1 +; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vxorps %ymm1, %ymm1, %ymm1 +; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 ; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/test/CodeGen/X86/vector-shift-ashr-256.ll b/test/CodeGen/X86/vector-shift-ashr-256.ll index af3ddcf8048e..09e143ddcd4d 100644 --- a/test/CodeGen/X86/vector-shift-ashr-256.ll +++ b/test/CodeGen/X86/vector-shift-ashr-256.ll @@ -7,6 +7,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL +; +; 32-bit runs to make sure we do reasonable things for i64 shifts. 
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 ; ; Variable Shifts @@ -81,6 +85,41 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsravq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 +; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsubq %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <4 x i64> %a, %b ret <4 x i64> %shift } @@ -147,6 +186,41 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsravd %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-AVX1-NEXT: vpsrad %xmm4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpsrad %xmm5, %xmm2, %xmm5 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; X32-AVX1-NEXT: vpsrad %xmm6, %xmm2, %xmm6 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; X32-AVX1-NEXT: vpsrad %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-AVX1-NEXT: vpsrad %xmm3, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; X32-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; 
X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; X32-AVX1-NEXT: vpsrad %xmm4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <8 x i32> %a, %b ret <8 x i32> %shift } @@ -253,6 +327,55 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsravw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpsraw $8, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; X32-AVX1-NEXT: vpsraw $4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsraw $2, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsraw $1, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsraw $8, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X32-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X32-AVX2-NEXT: vpsravd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -436,6 +559,89 @@ define <32 x i8> 
@var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpsllw $5, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8],xmm4[8],xmm0[9],xmm4[9],xmm0[10],xmm4[10],xmm0[11],xmm4[11],xmm0[12],xmm4[12],xmm0[13],xmm4[13],xmm0[14],xmm4[14],xmm0[15],xmm4[15] +; X32-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm6, %xmm5, %xmm3 +; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3],xmm0[4],xmm2[4],xmm0[5],xmm2[5],xmm0[6],xmm2[6],xmm0[7],xmm2[7] +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm0[0],xmm4[0],xmm0[1],xmm4[1],xmm0[2],xmm4[2],xmm0[3],xmm4[3],xmm0[4],xmm4[4],xmm0[5],xmm4[5],xmm0[6],xmm4[6],xmm0[7],xmm4[7] +; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpackuswb %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm5, %xmm4, %xmm3 +; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpackuswb %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: 
var_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; X32-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; X32-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <32 x i8> %a, %b ret <32 x i8> %shift } @@ -499,6 +705,33 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsraq %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [0,2147483648,0,2147483648] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpxor %xmm2, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsubq %xmm2, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = ashr <4 x i64> %a, %splat ret <4 x i64> %shift @@ -546,6 +779,21 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = 
xmm1[0],zero,xmm1[1],zero ; AVX512VL-NEXT: vpsrad %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpsrad %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrad %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX2-NEXT: vpsrad %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer %shift = ashr <8 x i32> %a, %splat ret <8 x i32> %shift @@ -593,6 +841,21 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vpsraw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX1-NEXT: vpsraw %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsraw %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX2-NEXT: vpsraw %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = ashr <16 x i16> %a, %splat ret <16 x i16> %shift @@ -776,6 +1039,84 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsravw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm2 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; 
X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; X32-AVX1-NEXT: vpsraw $4, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $2, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $1, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; X32-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; X32-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = ashr <32 x i8> %a, %splat ret <32 x i8> %shift @@ -843,6 +1184,43 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsravq {{.*}}(%rip), %ymm0, %ymm0 ; 
AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,2147483648,0,2147483648] +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm6 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm4[0,1,2,3],xmm6[4,5,6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6 +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm6, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm6, %xmm5 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm5[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsubq %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpsrlq %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubq %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,2147483648,0,2147483648,0,2147483648,0,2147483648] +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm2, %ymm3 +; X32-AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsubq %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <4 x i64> %a, ret <4 x i64> %shift } @@ -893,6 +1271,29 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsravd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; X32-AVX1-NEXT: vpsrad $6, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrad $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrad $9, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrad $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsravd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <8 x i32> %a, ret <8 x i32> %shift } @@ -980,6 +1381,40 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsraw $8, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsraw $4, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsraw $2, %xmm1, %xmm2 +; X32-AVX1-NEXT: 
vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsraw $1, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; X32-AVX2-NEXT: vpsravd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; X32-AVX2-NEXT: vpsravd %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <16 x i16> %a, ret <16 x i16> %shift } @@ -1149,6 +1584,81 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512BWVL-NEXT: vpsravw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm1 = [8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm4 = xmm0[8],xmm3[8],xmm0[9],xmm3[9],xmm0[10],xmm3[10],xmm0[11],xmm3[11],xmm0[12],xmm3[12],xmm0[13],xmm3[13],xmm0[14],xmm3[14],xmm0[15],xmm3[15] +; X32-AVX1-NEXT: vpsraw $4, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpunpckhbw {{.*#+}} xmm5 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] +; X32-AVX1-NEXT: vpsraw $4, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $2, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $2, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpsraw $1, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpsraw $1, %xmm5, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm6, %xmm5, %xmm2 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm1 = 
xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3],xmm0[4],xmm3[4],xmm0[5],xmm3[5],xmm0[6],xmm3[6],xmm0[7],xmm3[7] +; X32-AVX1-NEXT: vpsraw $4, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; X32-AVX1-NEXT: vpsraw $4, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $2, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsraw $2, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $1, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsraw $1, %xmm0, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpackuswb %xmm1, %xmm3, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm2 = ymm0[8],ymm1[8],ymm0[9],ymm1[9],ymm0[10],ymm1[10],ymm0[11],ymm1[11],ymm0[12],ymm1[12],ymm0[13],ymm1[13],ymm0[14],ymm1[14],ymm0[15],ymm1[15],ymm0[24],ymm1[24],ymm0[25],ymm1[25],ymm0[26],ymm1[26],ymm0[27],ymm1[27],ymm0[28],ymm1[28],ymm0[29],ymm1[29],ymm0[30],ymm1[30],ymm0[31],ymm1[31] +; X32-AVX2-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] +; X32-AVX2-NEXT: vpsraw $4, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $2, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpsraw $1, %ymm3, %ymm4 +; X32-AVX2-NEXT: vpaddw %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm3, %ymm2 +; X32-AVX2-NEXT: vpsrlw $8, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm1 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] +; X32-AVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] +; X32-AVX2-NEXT: vpsraw $4, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $2, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsraw $1, %ymm0, %ymm3 +; X32-AVX2-NEXT: vpaddw %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $8, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackuswb %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <32 x i8> %a, ret <32 x i8> %shift } @@ -1206,6 +1716,25 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x 
i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsraq $7, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrad $7, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpsrlq $7, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsrad $7, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrad $7, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7] +; X32-AVX2-NEXT: retl %shift = ashr <4 x i64> %a, ret <4 x i64> %shift } @@ -1246,6 +1775,19 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrad $5, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrad $5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrad $5, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <8 x i32> %a, ret <8 x i32> %shift } @@ -1286,6 +1828,19 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsraw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsraw $3, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsraw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsraw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <16 x i16> %a, ret <16 x i16> %shift } @@ -1352,6 +1907,31 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512VL-NEXT: vpxor %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: vpsubb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X32-AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsubb %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpxor %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsubb %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16,16] +; X32-AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: 
vpsubb %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = ashr <32 x i8> %a, ret <32 x i8> %shift } diff --git a/test/CodeGen/X86/vector-shift-lshr-256.ll b/test/CodeGen/X86/vector-shift-lshr-256.ll index 60575250d713..46be36b76e98 100644 --- a/test/CodeGen/X86/vector-shift-lshr-256.ll +++ b/test/CodeGen/X86/vector-shift-lshr-256.ll @@ -7,6 +7,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL +; +; 32-bit runs to make sure we do reasonable things for i64 shifts. +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 ; ; Variable Shifts @@ -59,6 +63,26 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlvq %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <4 x i64> %a, %b ret <4 x i64> %shift } @@ -125,6 +149,41 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 +; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm4 = xmm3[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-AVX1-NEXT: vpsrld %xmm4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpsrlq $32, %xmm3, %xmm5 +; X32-AVX1-NEXT: vpsrld %xmm5, %xmm2, %xmm5 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm5[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm6 = xmm3[2],xmm5[2],xmm3[3],xmm5[3] +; X32-AVX1-NEXT: vpsrld %xmm6, %xmm2, %xmm6 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm3[0],zero,xmm3[1],zero +; X32-AVX1-NEXT: vpsrld %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm6[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm4[2,3],xmm2[4,5],xmm4[6,7] +; X32-AVX1-NEXT: vpsrldq {{.*#+}} xmm3 = xmm1[12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; X32-AVX1-NEXT: vpsrld %xmm3, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq $32, %xmm1, %xmm4 +; 
X32-AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm3 = xmm4[0,1,2,3],xmm3[4,5,6,7] +; X32-AVX1-NEXT: vpunpckhdq {{.*#+}} xmm4 = xmm1[2],xmm5[2],xmm1[3],xmm5[3] +; X32-AVX1-NEXT: vpsrld %xmm4, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm4[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3],xmm0[4,5],xmm3[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <8 x i32> %a, %b ret <8 x i32> %shift } @@ -231,6 +290,55 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsrlvw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpsrlw $8, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsrlw $8, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X32-AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X32-AVX2-NEXT: vpsrlvd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; 
X32-AVX2-NEXT: retl %shift = lshr <16 x i16> %a, %b ret <16 x i16> %shift } @@ -357,6 +465,56 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; X32-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm7 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X32-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm7, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <32 x i8> %a, %b ret <32 x i8> %shift } @@ -401,6 +559,23 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX2-NEXT: vpsrlq %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = lshr <4 x i64> %a, %splat ret <4 x i64> %shift @@ -448,6 
+623,21 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512VL-NEXT: vpsrld %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpsrld %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrld %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX2-NEXT: vpsrld %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer %shift = lshr <8 x i32> %a, %splat ret <8 x i32> %shift @@ -495,6 +685,21 @@ define <16 x i16> @splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX2-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = lshr <16 x i16> %a, %splat ret <16 x i16> %shift @@ -625,6 +830,55 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsrlvw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsrlw $2, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; X32-AVX1-NEXT: vpsrlw $1, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpand %xmm3, %xmm4, %xmm3 +; X32-AVX1-NEXT: 
vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = lshr <32 x i8> %a, %splat ret <32 x i8> %shift @@ -677,6 +931,27 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsrlq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlvq {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <4 x i64> %a, ret <4 x i64> %shift } @@ -727,6 +1002,29 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrld $7, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1,2,3],xmm1[4,5,6,7] +; X32-AVX1-NEXT: vpsrld $6, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrld $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2,3],xmm2[4,5],xmm1[6,7] +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrld $7, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpsrld $9, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrld $8, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlvd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <8 x i32> %a, ret <8 x i32> %shift } @@ -814,6 +1112,40 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v16i16: +; X32-AVX1: # BB#0: 
+; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrlw $8, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrlw $2, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsrlw $1, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm2[1],xmm1[2],xmm2[3],xmm1[4],xmm2[5],xmm1[6],xmm2[7] +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7] +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2],xmm2[3],xmm0[4],xmm2[5],xmm0[6],xmm2[7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm2[4],ymm1[4],ymm2[5],ymm1[5],ymm2[6],ymm1[6],ymm2[7],ymm1[7],ymm2[12],ymm1[12],ymm2[13],ymm1[13],ymm2[14],ymm1[14],ymm2[15],ymm1[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm1[4],ymm0[4],ymm1[5],ymm0[5],ymm1[6],ymm0[6],ymm1[7],ymm0[7],ymm1[12],ymm0[12],ymm1[13],ymm0[13],ymm1[14],ymm0[14],ymm1[15],ymm0[15] +; X32-AVX2-NEXT: vpsrlvd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm2 = ymm2[0],ymm1[0],ymm2[1],ymm1[1],ymm2[2],ymm1[2],ymm2[3],ymm1[3],ymm2[8],ymm1[8],ymm2[9],ymm1[9],ymm2[10],ymm1[10],ymm2[11],ymm1[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm1[0],ymm0[0],ymm1[1],ymm0[1],ymm1[2],ymm0[2],ymm1[3],ymm0[3],ymm1[8],ymm0[8],ymm1[9],ymm0[9],ymm1[10],ymm0[10],ymm1[11],ymm0[11] +; X32-AVX2-NEXT: vpsrlvd %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <16 x i16> %a, ret <16 x i16> %shift } @@ -930,6 +1262,52 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512BWVL-NEXT: vpsrlvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrlw $4, %xmm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $4, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsrlw $2, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [63,63,63,63,63,63,63,63,63,63,63,63,63,63,63,63] +; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $2, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpsrlw $1, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [127,127,127,127,127,127,127,127,127,127,127,127,127,127,127,127] +; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm3 +; 
X32-AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsrlw $1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlw $4, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $2, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrlw $1, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <32 x i8> %a, ret <32 x i8> %shift } @@ -974,6 +1352,19 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlq $7, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrlq $7, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlq $7, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <4 x i64> %a, ret <4 x i64> %shift } @@ -1014,6 +1405,19 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrld $5, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrld $5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrld $5, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <8 x i32> %a, ret <8 x i32> %shift } @@ -1054,6 +1458,19 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <16 x i16> %a, ret <16 x i16> %shift } @@ -1103,6 +1520,23 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512VL-NEXT: vpsrlw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsrlw $3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [31,31,31,31,31,31,31,31,31,31,31,31,31,31,31,31] +; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; 
X32-AVX1-NEXT: vpsrlw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsrlw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = lshr <32 x i8> %a, ret <32 x i8> %shift } diff --git a/test/CodeGen/X86/vector-shift-shl-256.ll b/test/CodeGen/X86/vector-shift-shl-256.ll index 7f534050b6a7..4a134f440a78 100644 --- a/test/CodeGen/X86/vector-shift-shl-256.ll +++ b/test/CodeGen/X86/vector-shift-shl-256.ll @@ -7,6 +7,10 @@ ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512dq,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512DQVL ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=knl -mattr=+avx512bw,+avx512vl | FileCheck %s --check-prefix=ALL --check-prefix=AVX512VL --check-prefix=AVX512BWVL +; +; 32-bit runs to make sure we do reasonable things for i64 shifts. +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX1 +; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefix=ALL --check-prefix=X32-AVX --check-prefix=X32-AVX2 ; ; Variable Shifts @@ -56,6 +60,26 @@ define <4 x i64> @var_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllvq %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <4 x i64> %a, %b ret <4 x i64> %shift } @@ -105,6 +129,27 @@ define <8 x i32> @var_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpslld $23, %xmm2, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [1065353216,1065353216,1065353216,1065353216] +; X32-AVX1-NEXT: vpaddd %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vcvttps2dq %xmm2, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpmulld %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpslld $23, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vcvttps2dq %xmm1, %xmm1 +; X32-AVX1-NEXT: vpmulld %xmm0, %xmm1, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <8 x i32> %a, %b ret 
<8 x i32> %shift } @@ -205,6 +250,55 @@ define <16 x i16> @var_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsllvw %ymm1, %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpor %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddw %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4 +; X32-AVX1-NEXT: vpsllw $8, %xmm4, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm2, %xmm5, %xmm4, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $2, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $1, %xmm2, %xmm4 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm4, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $12, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpor %xmm3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm1, %xmm1, %xmm3 +; X32-AVX1-NEXT: vpsllw $8, %xmm0, %xmm4 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm4, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $1, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpaddw %xmm3, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm3, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm3 = ymm1[4],ymm2[4],ymm1[5],ymm2[5],ymm1[6],ymm2[6],ymm1[7],ymm2[7],ymm1[12],ymm2[12],ymm1[13],ymm2[13],ymm1[14],ymm2[14],ymm1[15],ymm2[15] +; X32-AVX2-NEXT: vpunpckhwd {{.*#+}} ymm4 = ymm2[4],ymm0[4],ymm2[5],ymm0[5],ymm2[6],ymm0[6],ymm2[7],ymm0[7],ymm2[12],ymm0[12],ymm2[13],ymm0[13],ymm2[14],ymm0[14],ymm2[15],ymm0[15] +; X32-AVX2-NEXT: vpsllvd %ymm3, %ymm4, %ymm3 +; X32-AVX2-NEXT: vpsrld $16, %ymm3, %ymm3 +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm1 = ymm1[0],ymm2[0],ymm1[1],ymm2[1],ymm1[2],ymm2[2],ymm1[3],ymm2[3],ymm1[8],ymm2[8],ymm1[9],ymm2[9],ymm1[10],ymm2[10],ymm1[11],ymm2[11] +; X32-AVX2-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm2[0],ymm0[0],ymm2[1],ymm0[1],ymm2[2],ymm0[2],ymm2[3],ymm0[3],ymm2[8],ymm0[8],ymm2[9],ymm0[9],ymm2[10],ymm0[10],ymm2[11],ymm0[11] +; X32-AVX2-NEXT: vpsllvd %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsrld $16, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpackusdw %ymm3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <16 x i16> %a, %b ret <16 x i16> %shift } @@ -319,6 +413,52 @@ define <32 x i8> @var_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: var_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 +; X32-AVX1-NEXT: vpsllw $5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $2, 
%xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm6 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm5, %xmm5, %xmm5 +; X32-AVX1-NEXT: vpblendvb %xmm5, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm6, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: var_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <32 x i8> %a, %b ret <32 x i8> %shift } @@ -363,6 +503,23 @@ define <4 x i64> @splatvar_shift_v4i64(<4 x i64> %a, <4 x i64> %b) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllq %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX1-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpextrd $1, %xmm1, %eax +; X32-AVX2-NEXT: vpinsrd $1, %eax, %xmm1, %xmm1 +; X32-AVX2-NEXT: vpsllq %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <4 x i64> %b, <4 x i64> undef, <4 x i32> zeroinitializer %shift = shl <4 x i64> %a, %splat ret <4 x i64> %shift @@ -410,6 +567,21 @@ define <8 x i32> @splatvar_shift_v8i32(<8 x i32> %a, <8 x i32> %b) nounwind { ; AVX512VL-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero ; AVX512VL-NEXT: vpslld %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX1-NEXT: vpslld %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpslld %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero +; X32-AVX2-NEXT: vpslld %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <8 x i32> %b, <8 x i32> undef, <8 x i32> zeroinitializer %shift = shl <8 x i32> %a, %splat ret <8 x i32> %shift @@ -457,6 +629,21 @@ define <16 x i16> 
@splatvar_shift_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; AVX512VL-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero ; AVX512VL-NEXT: vpsllw %xmm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX1-NEXT: vpsllw %xmm1, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmovzxwq {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero +; X32-AVX2-NEXT: vpsllw %xmm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <16 x i16> %b, <16 x i16> undef, <16 x i32> zeroinitializer %shift = shl <16 x i16> %a, %splat ret <16 x i16> %shift @@ -577,6 +764,51 @@ define <32 x i8> @splatvar_shift_v32i8(<32 x i8> %a, <32 x i8> %b) nounwind { ; AVX512BWVL-NEXT: vpsllvw %zmm1, %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: splatvar_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpshufb %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpsllw $5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X32-AVX1-NEXT: vpand %xmm5, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddb %xmm2, %xmm2, %xmm3 +; X32-AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm7 +; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpand %xmm4, %xmm3, %xmm3 +; X32-AVX1-NEXT: vpblendvb %xmm1, %xmm3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpand %xmm5, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm1 +; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatvar_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpbroadcastb %xmm1, %ymm1 +; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpsllw $5, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm2 +; X32-AVX2-NEXT: vpaddb %ymm1, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %splat = shufflevector <32 x i8> %b, <32 x i8> undef, <32 x i32> zeroinitializer %shift = shl <32 x i8> %a, %splat ret <32 x i8> %shift @@ -626,6 +858,27 @@ define <4 x i64> @constant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllvq {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: 
constant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vmovdqa {{.*#+}} ymm1 = [1,0,7,0,31,0,62,0] +; X32-AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3 +; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm4 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[2,3,0,1] +; X32-AVX1-NEXT: vpsllq %xmm2, %xmm3, %xmm2 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm4[0,1,2,3],xmm2[4,5,6,7] +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm3 +; X32-AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,0,1] +; X32-AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4,5,6,7] +; X32-AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllvq {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <4 x i64> %a, ret <4 x i64> %shift } @@ -666,6 +919,19 @@ define <8 x i32> @constant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllvd {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpmulld {{\.LCPI.*}}, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllvd {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <8 x i32> %a, ret <8 x i32> %shift } @@ -719,6 +985,19 @@ define <16 x i16> @constant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512BWVL: # BB#0: ; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %ymm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpmullw {{\.LCPI.*}}, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpmullw {{\.LCPI.*}}, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpmullw {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <16 x i16> %a, ret <16 x i16> %shift } @@ -827,6 +1106,48 @@ define <32 x i8> @constant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512BWVL-NEXT: vpsllvw {{.*}}(%rip), %zmm0, %zmm0 ; AVX512BWVL-NEXT: vpmovwb %zmm0, %ymm0 ; AVX512BWVL-NEXT: retq +; +; X32-AVX1-LABEL: constant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsllw $4, %xmm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [240,240,240,240,240,240,240,240,240,240,240,240,240,240,240,240] +; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsllw $2, %xmm1, %xmm2 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm5 = [252,252,252,252,252,252,252,252,252,252,252,252,252,252,252,252] +; X32-AVX1-NEXT: vpand %xmm5, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpaddb %xmm4, %xmm4, %xmm6 +; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpaddb %xmm1, %xmm1, %xmm2 +; X32-AVX1-NEXT: vpaddb %xmm6, %xmm6, %xmm7 +; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsllw $4, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpand %xmm3, %xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm4, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $2, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpand %xmm5, 
%xmm2, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm6, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpaddb %xmm0, %xmm0, %xmm2 +; X32-AVX1-NEXT: vpblendvb %xmm7, %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: constant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $4, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [8192,24640,41088,57536,49376,32928,16480,32,8192,24640,41088,57536,49376,32928,16480,32] +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpsllw $2, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm1, %ymm1 +; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpaddb %ymm0, %ymm0, %ymm1 +; X32-AVX2-NEXT: vpaddb %ymm2, %ymm2, %ymm2 +; X32-AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <32 x i8> %a, ret <32 x i8> %shift } @@ -871,6 +1192,19 @@ define <4 x i64> @splatconstant_shift_v4i64(<4 x i64> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllq $7, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v4i64: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsllq $7, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsllq $7, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v4i64: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllq $7, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <4 x i64> %a, ret <4 x i64> %shift } @@ -911,6 +1245,19 @@ define <8 x i32> @splatconstant_shift_v8i32(<8 x i32> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpslld $5, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v8i32: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpslld $5, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpslld $5, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v8i32: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpslld $5, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <8 x i32> %a, ret <8 x i32> %shift } @@ -951,6 +1298,19 @@ define <16 x i16> @splatconstant_shift_v16i16(<16 x i16> %a) nounwind { ; AVX512VL: # BB#0: ; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v16i16: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm1 +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 +; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v16i16: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <16 x i16> %a, ret <16 x i16> %shift } @@ -999,6 +1359,23 @@ define <32 x i8> @splatconstant_shift_v32i8(<32 x i8> %a) nounwind { ; AVX512VL-NEXT: vpsllw $3, %ymm0, %ymm0 ; AVX512VL-NEXT: vpand {{.*}}(%rip), %ymm0, %ymm0 ; AVX512VL-NEXT: retq +; +; X32-AVX1-LABEL: splatconstant_shift_v32i8: +; X32-AVX1: # BB#0: +; X32-AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 +; X32-AVX1-NEXT: vpsllw $3, %xmm1, %xmm1 +; X32-AVX1-NEXT: vmovdqa {{.*#+}} xmm2 = [248,248,248,248,248,248,248,248,248,248,248,248,248,248,248,248] +; X32-AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1 +; X32-AVX1-NEXT: vpsllw $3, %xmm0, %xmm0 +; X32-AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0 +; X32-AVX1-NEXT: vinsertf128 
$1, %xmm1, %ymm0, %ymm0 +; X32-AVX1-NEXT: retl +; +; X32-AVX2-LABEL: splatconstant_shift_v32i8: +; X32-AVX2: # BB#0: +; X32-AVX2-NEXT: vpsllw $3, %ymm0, %ymm0 +; X32-AVX2-NEXT: vpand {{\.LCPI.*}}, %ymm0, %ymm0 +; X32-AVX2-NEXT: retl %shift = shl <32 x i8> %a, ret <32 x i8> %shift } diff --git a/test/CodeGen/X86/vector-shuffle-512-v32.ll b/test/CodeGen/X86/vector-shuffle-512-v32.ll index 26cd7301fe60..7a5c992bb829 100644 --- a/test/CodeGen/X86/vector-shuffle-512-v32.ll +++ b/test/CodeGen/X86/vector-shuffle-512-v32.ll @@ -1,129 +1,235 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; NOTE: Assertions have been autogenerated by update_llc_test_checks.py -; RUN: llc < %s -mcpu=x86-64 -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512 --check-prefix=AVX512BW +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck --check-prefixes=ALL,KNL %s +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck --check-prefixes=ALL,SKX %s target triple = "x86_64-unknown-unknown" -define <32 x i16> @shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: -; ALL: # BB#0: -; ALL-NEXT: vpbroadcastw %xmm0, %zmm0 -; ALL-NEXT: retq +define <32 x i16> @shuffle_v32i16(<32 x i16> %a) { +; KNL-LABEL: shuffle_v32i16: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16: +; SKX: ## BB#0: +; SKX-NEXT: vpbroadcastw %xmm0, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> zeroinitializer ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: -; ALL: # BB#0: -; ALL-NEXT: vextracti32x4 $1, %zmm0, %xmm0 -; ALL-NEXT: vpbroadcastw %xmm0, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: +; KNL: ## BB#0: +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08: +; SKX: ## BB#0: +; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm0 +; SKX-NEXT: vpbroadcastw %xmm0, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: -; ALL: # BB#0: -; ALL-NEXT: vmovdqu16 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31> -; ALL-NEXT: vpermw %zmm0, %zmm1, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: +; KNL: ## BB#0: +; KNL-NEXT: vperm2i128 {{.*#+}} ymm2 = ymm0[2,3,0,1] +; KNL-NEXT: vpshufb {{.*#+}} ymm3 = 
ymm2[0,1,10,11,8,9,8,9,14,15,2,3,4,5,2,3,16,17,26,27,24,25,24,25,30,31,18,19,20,21,18,19] +; KNL-NEXT: vpshufb {{.*#+}} ymm4 = ymm0[4,5,10,11,4,5,6,7,14,15,2,3,4,5,2,3,20,21,26,27,20,21,22,23,30,31,18,19,20,21,18,19] +; KNL-NEXT: vmovdqa {{.*#+}} ymm0 = <0,0,0,0,u,u,u,u,0,0,u,u,255,255,0,0,255,255,255,255,u,u,255,255,255,255,u,u,0,0,255,255> +; KNL-NEXT: vpblendvb %ymm0, %ymm3, %ymm4, %ymm0 +; KNL-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[0,1,10,11,8,9,8,9,14,15,6,7,4,5,14,15,16,17,26,27,24,25,24,25,30,31,22,23,20,21,30,31] +; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,0,0,255,255,0,0,0,0,u,u,0,0,0,0,u,u,255,255,u,u> +; KNL-NEXT: vpblendvb %ymm3, %ymm4, %ymm2, %ymm2 +; KNL-NEXT: vmovdqa {{.*#+}} ymm3 = <255,255,255,255,u,u,u,u,255,255,u,u,255,255,255,255,255,255,255,255,u,u,255,255,255,255,u,u,255,255,0,0> +; KNL-NEXT: vpblendvb %ymm3, %ymm2, %ymm1, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_01_02_05_u_u_07_u_0a_01_00_05_u_04_07_u_0a_1f: +; SKX: ## BB#0: +; SKX-NEXT: vmovdqu16 {{.*#+}} zmm1 = <2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,1,2,5,u,u,7,u,10,1,0,5,u,4,7,u,10,31> +; SKX-NEXT: vpermw %zmm0, %zmm1, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> undef, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: -; ALL: # BB#0: -; ALL-NEXT: vmovdqu16 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56] -; ALL-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: +; KNL: ## BB#0: +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; KNL-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] +; KNL-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[8,9,12,13,12,13,10,11,0,1,4,5,4,5,0,1] +; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[0,1,0,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm1[0,3,2,2,4,5,6,7] +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm1 +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm5 +; KNL-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm5[4],xmm0[4],xmm5[5],xmm0[5],xmm5[6],xmm0[6],xmm5[7],xmm0[7] +; KNL-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12,13,10,11,8,9,14,15,4,5,2,3,2,3,6,7] +; KNL-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[6,7,2,3,4,5,6,7,2,3,2,3,0,1,14,15] +; KNL-NEXT: vinserti128 $1, %xmm5, %ymm0, %ymm0 +; KNL-NEXT: vpblendw {{.*#+}} ymm1 = ymm0[0],ymm1[1],ymm0[2],ymm1[3],ymm0[4],ymm1[5],ymm0[6],ymm1[7],ymm0[8],ymm1[9],ymm0[10],ymm1[11],ymm0[12],ymm1[13],ymm0[14],ymm1[15] +; KNL-NEXT: vextracti128 $1, %ymm3, %xmm3 +; KNL-NEXT: vpbroadcastw %xmm3, %ymm3 +; KNL-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,0] +; KNL-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1 +; KNL-NEXT: vpshufhw {{.*#+}} xmm2 = xmm2[0,1,2,3,4,5,6,4] +; KNL-NEXT: vinserti128 $1, %xmm2, %ymm4, %ymm2 +; KNL-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0],ymm2[1],ymm0[2],ymm2[3],ymm0[4],ymm2[5],ymm0[6],ymm2[7],ymm0[8],ymm2[9],ymm0[10],ymm2[11],ymm0[12],ymm2[13],ymm0[14],ymm2[15] +; KNL-NEXT: retq +; +; SKX-LABEL: 
shuffle_v32i16_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_18_0f_1f_0e_16_0d_1d_04_1e_0b_1b_0a_1a_09_19_08_38: +; SKX: ## BB#0: +; SKX-NEXT: vmovdqu16 {{.*#+}} zmm2 = [15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,24,15,31,14,22,13,29,4,28,11,27,10,26,9,25,8,56] +; SKX-NEXT: vpermt2w %zmm1, %zmm2, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u: -; ALL: # BB#0: -; ALL-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u: +; KNL: ## BB#0: +; KNL-NEXT: vpunpcklwd {{.*#+}} ymm0 = ymm0[0],ymm2[0],ymm0[1],ymm2[1],ymm0[2],ymm2[2],ymm0[3],ymm2[3],ymm0[8],ymm2[8],ymm0[9],ymm2[9],ymm0[10],ymm2[10],ymm0[11],ymm2[11] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v16i32_0_32_1_33_2_34_3_35_8_40_9_41_u_u_u_u: +; SKX: ## BB#0: +; SKX-NEXT: vpunpcklwd {{.*#+}} zmm0 = zmm0[0],zmm1[0],zmm0[1],zmm1[1],zmm0[2],zmm1[2],zmm0[3],zmm1[3],zmm0[8],zmm1[8],zmm0[9],zmm1[9],zmm0[10],zmm1[10],zmm0[11],zmm1[11],zmm0[16],zmm1[16],zmm0[17],zmm1[17],zmm0[18],zmm1[18],zmm0[19],zmm1[19],zmm0[24],zmm1[24],zmm0[25],zmm1[25],zmm0[26],zmm1[26],zmm0[27],zmm1[27] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u: -; ALL: # BB#0: -; ALL-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u: +; KNL: ## BB#0: +; KNL-NEXT: vpunpckhwd {{.*#+}} ymm0 = ymm0[4],ymm2[4],ymm0[5],ymm2[5],ymm0[6],ymm2[6],ymm0[7],ymm2[7],ymm0[12],ymm2[12],ymm0[13],ymm2[13],ymm0[14],ymm2[14],ymm0[15],ymm2[15] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v16i32_4_36_5_37_6_38_7_39_12_44_13_45_u_u_u_u: +; SKX: ## BB#0: +; SKX-NEXT: vpunpckhwd {{.*#+}} zmm0 = zmm0[4],zmm1[4],zmm0[5],zmm1[5],zmm0[6],zmm1[6],zmm0[7],zmm1[7],zmm0[12],zmm1[12],zmm0[13],zmm1[13],zmm0[14],zmm1[14],zmm0[15],zmm1[15],zmm0[20],zmm1[20],zmm0[21],zmm1[21],zmm0[22],zmm1[22],zmm0[23],zmm1[23],zmm0[28],zmm1[28],zmm0[29],zmm1[29],zmm0[30],zmm1[30],zmm0[31],zmm1[31] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> %b, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z: -; ALL: # BB#0: -; ALL-NEXT: vpsrld $16, %zmm0, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z: +; KNL: ## BB#0: +; KNL-NEXT: vpsrld $16, %ymm0, %ymm0 +; KNL-NEXT: vpsrld $16, %ymm1, %ymm1 +; KNL-NEXT: retq +; 
+; SKX-LABEL: shuffle_v32i16_1_z_3_z_5_z_7_z_9_z_11_z_13_z_15_z_17_z_19_z_21_z_23_z_25_z_27_z_29_z_31_z: +; SKX: ## BB#0: +; SKX-NEXT: vpsrld $16, %zmm0, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30: -; ALL: # BB#0: -; ALL-NEXT: vpslld $16, %zmm0, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30: +; KNL: ## BB#0: +; KNL-NEXT: vpslld $16, %ymm0, %ymm0 +; KNL-NEXT: vpslld $16, %ymm1, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_z_0_z_2_z_4_z_6_z_8_z_10_z_12_z_14_z_16_z_18_z_20_z_22_z_24_z_26_z_28_z_30: +; SKX: ## BB#0: +; SKX-NEXT: vpslld $16, %zmm0, %zmm0 +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31: -; ALL: # BB#0: -; ALL-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31: +; KNL: ## BB#0: +; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_1_1_0_0_4_5_6_7_9_9_8_8_12_13_14_15_17_17_16_16_20_21_22_23_25_25_24_24_28_29_30_31: +; SKX: ## BB#0: +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28: -; ALL: # BB#0: -; ALL-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28: +; KNL: ## BB#0: +; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_0_1_2_3_5_5_4_4_8_9_10_11_13_13_12_12_16_17_18_19_21_21_20_20_24_25_26_27_29_29_28_28: +; SKX: ## BB#0: +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28(<32 x i16> %a, <32 x i16> %b) { -; ALL-LABEL: 
shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: -; ALL: # BB#0: -; ALL-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] -; ALL-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: +; KNL: ## BB#0: +; KNL-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: vpshuflw {{.*#+}} ymm1 = ymm1[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15] +; KNL-NEXT: vpshufhw {{.*#+}} ymm1 = ymm1[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12] +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_1_1_0_0_5_5_4_4_9_9_11_11_13_13_12_12_17_17_19_19_21_21_20_20_25_25_27_27_29_29_28_28: +; SKX: ## BB#0: +; SKX-NEXT: vpshuflw {{.*#+}} zmm0 = zmm0[1,1,0,0,4,5,6,7,9,9,8,8,12,13,14,15,17,17,16,16,20,21,22,23,25,25,24,24,28,29,30,31] +; SKX-NEXT: vpshufhw {{.*#+}} zmm0 = zmm0[0,1,2,3,5,5,4,4,8,9,10,11,13,13,12,12,16,17,18,19,21,21,20,20,24,25,26,27,29,29,28,28] +; SKX-NEXT: retq %c = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %c } define <32 x i16> @shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: -; ALL: # BB#0: -; ALL-NEXT: movl $1, %eax -; ALL-NEXT: kmovd %eax, %k1 -; ALL-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: +; KNL: ## BB#0: +; KNL-NEXT: movl $65535, %eax ## imm = 0xFFFF +; KNL-NEXT: vmovd %eax, %xmm1 +; KNL-NEXT: vpand %ymm1, %ymm0, %ymm0 +; KNL-NEXT: vpxor %ymm1, %ymm1, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_0zzzzzzzzzzzzzzzzzzzzzzzzzzzzzzz: +; SKX: ## BB#0: +; SKX-NEXT: movl $1, %eax +; SKX-NEXT: kmovd %eax, %k1 +; SKX-NEXT: vmovdqu16 %zmm0, %zmm0 {%k1} {z} +; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> %a, <32 x i16> zeroinitializer, <32 x i32> ret <32 x i16> %shuffle } define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) { -; ALL-LABEL: insert_dup_mem_v32i16_i32: -; ALL: # BB#0: -; ALL-NEXT: movl (%rdi), %eax -; ALL-NEXT: vpbroadcastw %ax, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: insert_dup_mem_v32i16_i32: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastw (%rdi), %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_dup_mem_v32i16_i32: +; SKX: ## BB#0: +; SKX-NEXT: movl (%rdi), %eax +; SKX-NEXT: vpbroadcastw %ax, %zmm0 +; SKX-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> @@ -132,11 +238,19 @@ define <32 x i16> @insert_dup_mem_v32i16_i32(i32* %ptr) { } define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) { -; ALL-LABEL: insert_dup_mem_v32i16_sext_i16: -; ALL: # BB#0: -; ALL-NEXT: movswl (%rdi), %eax -; ALL-NEXT: vpbroadcastw %ax, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: insert_dup_mem_v32i16_sext_i16: +; KNL: ## BB#0: +; KNL-NEXT: movswl (%rdi), %eax +; KNL-NEXT: vmovd %eax, %xmm0 +; KNL-NEXT: vpbroadcastw %xmm0, %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_dup_mem_v32i16_sext_i16: +; SKX: ## BB#0: +; SKX-NEXT: movswl (%rdi), %eax +; SKX-NEXT: vpbroadcastw %ax, %zmm0 +; SKX-NEXT: retq %tmp = load i16, i16* 
%ptr, align 2 %tmp1 = sext i16 %tmp to i32 %tmp2 = insertelement <4 x i32> zeroinitializer, i32 %tmp1, i32 0 @@ -146,11 +260,17 @@ define <32 x i16> @insert_dup_mem_v32i16_sext_i16(i16* %ptr) { } define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 { -; ALL-LABEL: insert_dup_elt1_mem_v32i16_i32: -; ALL: # BB#0: -; ALL-NEXT: movzwl 2(%rdi), %eax -; ALL-NEXT: vpbroadcastw %ax, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: insert_dup_elt1_mem_v32i16_i32: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_dup_elt1_mem_v32i16_i32: +; SKX: ## BB#0: +; SKX-NEXT: movzwl 2(%rdi), %eax +; SKX-NEXT: vpbroadcastw %ax, %zmm0 +; SKX-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 0 %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> @@ -159,11 +279,17 @@ define <32 x i16> @insert_dup_elt1_mem_v32i16_i32(i32* %ptr) #0 { } define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 { -; ALL-LABEL: insert_dup_elt3_mem_v32i16_i32: -; ALL: # BB#0: -; ALL-NEXT: movzwl 2(%rdi), %eax -; ALL-NEXT: vpbroadcastw %ax, %zmm0 -; ALL-NEXT: retq +; KNL-LABEL: insert_dup_elt3_mem_v32i16_i32: +; KNL: ## BB#0: +; KNL-NEXT: vpbroadcastw 2(%rdi), %ymm0 +; KNL-NEXT: vmovdqa %ymm0, %ymm1 +; KNL-NEXT: retq +; +; SKX-LABEL: insert_dup_elt3_mem_v32i16_i32: +; SKX: ## BB#0: +; SKX-NEXT: movzwl 2(%rdi), %eax +; SKX-NEXT: vpbroadcastw %ax, %zmm0 +; SKX-NEXT: retq %tmp = load i32, i32* %ptr, align 4 %tmp1 = insertelement <4 x i32> zeroinitializer, i32 %tmp, i32 1 %tmp2 = bitcast <4 x i32> %tmp1 to <8 x i16> @@ -172,19 +298,79 @@ define <32 x i16> @insert_dup_elt3_mem_v32i16_i32(i32* %ptr) #0 { } define <32 x i16> @shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: -; ALL: # BB#0: -; ALL-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxwq {{.*#+}} ymm2 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; KNL-NEXT: vpmovzxwq {{.*#+}} ymm1 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero +; KNL-NEXT: vmovdqa %ymm2, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_32_zz_zz_zz_33_zz_zz_zz_34_zz_zz_zz_35_zz_zz_zz_36_zz_zz_zz_37_zz_zz_zz_38_zz_zz_zz_39_zz_zz_zz: +; SKX: ## BB#0: +; SKX-NEXT: vpmovzxwq {{.*#+}} zmm0 = xmm0[0],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[2],zero,zero,zero,xmm0[3],zero,zero,zero,xmm0[4],zero,zero,zero,xmm0[5],zero,zero,zero,xmm0[6],zero,zero,zero,xmm0[7],zero,zero,zero +; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> ret <32 x i16> %shuffle } define <32 x i16> @shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz(<32 x i16> %a) { -; ALL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: -; ALL: # BB#0: -; 
ALL-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero -; ALL-NEXT: retq +; KNL-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: +; KNL: ## BB#0: +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm0 +; KNL-NEXT: vpmovzxwd {{.*#+}} ymm1 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero +; KNL-NEXT: vmovdqa %ymm2, %ymm0 +; KNL-NEXT: retq +; +; SKX-LABEL: shuffle_v32i16_32_zz_33_zz_34_zz_35_zz_36_zz_37_zz_38_zz_39_zz_40_zz_41_zz_42_zz_43_zz_44_zz_45_zz_46_zz_47_zz: +; SKX: ## BB#0: +; SKX-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero +; SKX-NEXT: retq %shuffle = shufflevector <32 x i16> zeroinitializer, <32 x i16> %a, <32 x i32> ret <32 x i16> %shuffle } + +define <8 x i16> @pr32967(<32 x i16> %v) { +; KNL-LABEL: pr32967: +; KNL: ## BB#0: +; KNL-NEXT: vextracti128 $1, %ymm1, %xmm2 +; KNL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,1,1,3,4,5,6,7] +; KNL-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,1,1,3,4,5,6,7] +; KNL-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1] +; KNL-NEXT: vextracti128 $1, %ymm0, %xmm2 +; KNL-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[1,3,2,3,4,5,6,7] +; KNL-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3] +; KNL-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,3,2,3,4,5,6,7] +; KNL-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1] +; KNL-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; KNL-NEXT: retq +; +; SKX-LABEL: pr32967: +; SKX: ## BB#0: +; SKX-NEXT: vpextrw $5, %xmm0, %eax +; SKX-NEXT: vpextrw $1, %xmm0, %ecx +; SKX-NEXT: vmovd %ecx, %xmm1 +; SKX-NEXT: vpinsrw $1, %eax, %xmm1, %xmm1 +; SKX-NEXT: vextracti32x4 $1, %zmm0, %xmm2 +; SKX-NEXT: vpextrw $1, %xmm2, %eax +; SKX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; SKX-NEXT: vpextrw $5, %xmm2, %eax +; SKX-NEXT: vpinsrw $3, %eax, %xmm1, %xmm1 +; SKX-NEXT: vextracti32x4 $2, %zmm0, %xmm2 +; SKX-NEXT: vpextrw $1, %xmm2, %eax +; SKX-NEXT: vpinsrw $4, %eax, %xmm1, %xmm1 +; SKX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4],xmm2[5],xmm1[6,7] +; SKX-NEXT: vextracti32x4 $3, %zmm0, %xmm0 +; SKX-NEXT: vpextrw $1, %xmm0, %eax +; SKX-NEXT: vpinsrw $6, %eax, %xmm1, %xmm1 +; SKX-NEXT: vpextrw $5, %xmm0, %eax +; SKX-NEXT: vpinsrw $7, %eax, %xmm1, %xmm0 +; SKX-NEXT: vzeroupper +; SKX-NEXT: retq + %shuffle = shufflevector <32 x i16> %v, <32 x i16> undef, <8 x i32> + ret <8 x i16> %shuffle +} diff --git a/test/CodeGen/X86/vector-sqrt.ll b/test/CodeGen/X86/vector-sqrt.ll index c5ac4466b5fa..8081e9482d67 100644 --- a/test/CodeGen/X86/vector-sqrt.ll +++ b/test/CodeGen/X86/vector-sqrt.ll @@ -29,11 +29,11 @@ define <4 x float> @sqrtf4(float* nocapture readonly %v) local_unnamed_addr #0 { ; CHECK: # BB#0: # %entry ; CHECK-NEXT: vsqrtss (%rdi), %xmm0, %xmm0 ; CHECK-NEXT: vsqrtss 4(%rdi), %xmm1, %xmm1 -; CHECK-NEXT: vsqrtss 8(%rdi), %xmm2, %xmm2 -; 
diff --git a/test/CodeGen/X86/viabs.ll b/test/CodeGen/X86/viabs.ll
index 34a9df1782a4..f5ec8e540b0b 100644
--- a/test/CodeGen/X86/viabs.ll
+++ b/test/CodeGen/X86/viabs.ll
@@ -405,16 +405,16 @@ define <2 x i64> @test_abs_ge_v2i64(<2 x i64> %a) nounwind {
 ;
 ; AVX1-LABEL: test_abs_ge_v2i64:
 ; AVX1: # BB#0:
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
 ; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_abs_ge_v2i64:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpsrad $31, %xmm0, %xmm1
-; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm1
 ; AVX2-NEXT: vpaddq %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: vpxor %xmm1, %xmm0, %xmm0
 ; AVX2-NEXT: retq
@@ -447,21 +447,20 @@ define <4 x i64> @test_abs_gt_v4i64(<4 x i64> %a) nounwind {
 ; AVX1-LABEL: test_abs_gt_v4i64:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm4
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm4, %ymm0, %ymm0
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_abs_gt_v4i64:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpxor %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
 ; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm0
 ; AVX2-NEXT: retq
@@ -504,35 +503,31 @@ define <8 x i64> @test_abs_le_v8i64(<8 x i64> %a) nounwind {
 ; AVX1-LABEL: test_abs_le_v8i64:
 ; AVX1: # BB#0:
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm6, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm5
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_abs_le_v8i64:
 ; AVX2: # BB#0:
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
@@ -581,37 +576,33 @@ define <8 x i64> @test_abs_le_v8i64_fold(<8 x i64>* %a.ptr) nounwind {
 ; AVX1-NEXT: vmovdqu (%rdi), %ymm0
 ; AVX1-NEXT: vmovdqu 32(%rdi), %ymm1
 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm0, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm6
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm5, %xmm0, %xmm0
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vxorps %ymm6, %ymm0, %ymm0
 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm3
-; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm4
-; AVX1-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,3,3]
-; AVX1-NEXT: vpaddq %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm5
+; AVX1-NEXT: vpaddq %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm4, %ymm2
-; AVX1-NEXT: vxorps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm5, %ymm1, %ymm1
 ; AVX1-NEXT: retq
 ;
 ; AVX2-LABEL: test_abs_le_v8i64_fold:
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vmovdqu (%rdi), %ymm0
 ; AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; AVX2-NEXT: vpsrad $31, %ymm0, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
-; AVX2-NEXT: vpaddq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[1,1,3,3,5,5,7,7]
+; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
+; AVX2-NEXT: vpaddq %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
 ; AVX2-NEXT: vpaddq %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT: retq
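Every viabs.ll change above encodes the same branchless identity: with the per-lane mask m = (x < 0) ? -1 : 0, we have |x| = (x + m) ^ m. The new sequences materialize m with vpxor plus vpcmpgtq against zero instead of splatting the sign bit with vpsrad and vpshufd; both compute the same mask. A minimal IR sketch of the identity itself (the function name is illustrative):

define <4 x i64> @abs_via_signmask(<4 x i64> %x) {
  %isneg = icmp slt <4 x i64> %x, zeroinitializer
  %m = sext <4 x i1> %isneg to <4 x i64>  ; all-ones lanes where %x is negative
  %biased = add <4 x i64> %x, %m          ; x - 1 in the negative lanes
  %abs = xor <4 x i64> %biased, %m        ; complementing flips x - 1 to -x
  ret <4 x i64> %abs
}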
diff --git a/test/CodeGen/X86/vselect-pcmp.ll b/test/CodeGen/X86/vselect-pcmp.ll
index d33fda4f49c2..7807991b455d 100644
--- a/test/CodeGen/X86/vselect-pcmp.ll
+++ b/test/CodeGen/X86/vselect-pcmp.ll
@@ -35,9 +35,7 @@ define <8 x i16> @signbit_sel_v8i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %mask) {
 ; AVX: # BB#0:
 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3
 ; AVX-NEXT: vpcmpgtw %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpandn %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpblendvb %xmm2, %xmm0, %xmm1, %xmm0
 ; AVX-NEXT: retq
   %tr = icmp slt <8 x i16> %mask, zeroinitializer
   %z = select <8 x i1> %tr, <8 x i16> %x, <8 x i16> %y
@@ -162,18 +160,14 @@ define <16 x i16> @signbit_sel_v16i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %mask) {
 ; AVX2: # BB#0:
 ; AVX2-NEXT: vpxor %ymm3, %ymm3, %ymm3
 ; AVX2-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
-; AVX2-NEXT: vpandn %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX2-NEXT: retq
 ;
 ; AVX512-LABEL: signbit_sel_v16i16:
 ; AVX512: # BB#0:
 ; AVX512-NEXT: vpxor %ymm3, %ymm3, %ymm3
 ; AVX512-NEXT: vpcmpgtw %ymm2, %ymm3, %ymm2
-; AVX512-NEXT: vpandn %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vpand %ymm2, %ymm0, %ymm0
-; AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
 ; AVX512-NEXT: retq
   %tr = icmp slt <16 x i16> %mask, zeroinitializer
   %z = select <16 x i1> %tr, <16 x i16> %x, <16 x i16> %y
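The vselect-pcmp.ll changes above collapse the three-instruction masked merge into a single vpblendvb, which selects by the sign bit of each mask byte. What the old vpandn/vpand/vpor sequence computed was the classic (x & m) | (y & ~m) merge; a sketch of that form, assuming %m holds all-ones or all-zeros per lane as vpcmpgtw produces (the function name is illustrative):

define <8 x i16> @merge_by_mask(<8 x i16> %x, <8 x i16> %y, <8 x i16> %m) {
  %takex = and <8 x i16> %m, %x
  ; ~m selects the lanes of %y that the mask rejects
  %notm = xor <8 x i16> %m, <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1>
  %takey = and <8 x i16> %notm, %y
  %merged = or <8 x i16> %takex, %takey
  ret <8 x i16> %merged
}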
diff --git a/test/CodeGen/X86/x86-interleaved-access.ll b/test/CodeGen/X86/x86-interleaved-access.ll
index 6fbec91e77a3..450e255313b3 100644
--- a/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/test/CodeGen/X86/x86-interleaved-access.ll
@@ -11,13 +11,13 @@ define <4 x double> @load_factorf64_4(<16 x double>* %ptr) {
 ; AVX-NEXT: vmovupd 96(%rdi), %ymm3
 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm4
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
 ; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
+; AVX-NEXT: vaddpd %ymm2, %ymm4, %ymm2
 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vhaddpd %ymm5, %ymm4, %ymm1
-; AVX-NEXT: vaddpd %ymm2, %ymm1, %ymm1
-; AVX-NEXT: vaddpd %ymm0, %ymm1, %ymm0
+; AVX-NEXT: vaddpd %ymm0, %ymm2, %ymm0
 ; AVX-NEXT: retq
   %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
   %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32>
@@ -39,11 +39,11 @@ define <4 x double> @load_factorf64_2(<16 x double>* %ptr) {
 ; AVX-NEXT: vmovupd 96(%rdi), %ymm3
 ; AVX-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm4
 ; AVX-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm5
+; AVX-NEXT: vunpcklpd {{.*#+}} ymm4 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm0 = ymm0[2,3],ymm2[2,3]
 ; AVX-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3],ymm3[2,3]
-; AVX-NEXT: vunpcklpd {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX-NEXT: vunpckhpd {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX-NEXT: vmulpd %ymm0, %ymm2, %ymm0
+; AVX-NEXT: vmulpd %ymm0, %ymm4, %ymm0
 ; AVX-NEXT: retq
   %wide.vec = load <16 x double>, <16 x double>* %ptr, align 16
   %strided.v0 = shufflevector <16 x double> %wide.vec, <16 x double> undef, <4 x i32>
@@ -124,9 +124,9 @@ define <4 x i64> @load_factori64_4(<16 x i64>* %ptr) {
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm2 = ymm4[0],ymm5[0],ymm4[2],ymm5[2]
 ; AVX2-NEXT: vpunpcklqdq {{.*#+}} ymm3 = ymm0[0],ymm1[0],ymm0[2],ymm1[2]
 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm4 = ymm4[1],ymm5[1],ymm4[3],ymm5[3]
+; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm3
 ; AVX2-NEXT: vpunpckhqdq {{.*#+}} ymm0 = ymm0[1],ymm1[1],ymm0[3],ymm1[3]
-; AVX2-NEXT: vpaddq %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpaddq %ymm0, %ymm3, %ymm0
 ; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT: retq
   %wide.vec = load <16 x i64>, <16 x i64>* %ptr, align 16
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
index 7e370c25e31b..3052a0f615eb 100644
--- a/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers-preserve.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py for function "bar"
 ; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
-;; In functions with 'no_caller_saved_registers' attribute, all registers should
+;; In functions with 'no_caller_saved_registers' attribute, all registers should
 ;; be preserved except for registers used for passing/returning arguments.
 ;; In the following function registers %RDI, %RSI and %XMM0 are used to store
 ;; arguments %a0, %a1 and %b0 accordingly. The value is returned in %RAX.
@@ -28,20 +28,20 @@ define x86_64_sysvcc i32 @bar(i32 %a0, i32 %a1, float %b0) #0 {
   ret i32 4
 }
 
-;; Because "bar" has 'no_caller_saved_registers' attribute, function "foo"
-;; doesn't need to preserve registers except for the arguments passed
+;; Because "bar" has 'no_caller_saved_registers' attribute, function "foo"
+;; doesn't need to preserve registers except for the arguments passed
 ;; to "bar" (%ESI, %EDI and %XMM0).
 define x86_64_sysvcc float @foo(i32 %a0, i32 %a1, float %b0) {
-; CHECK-LABEL: foo
-; CHECK: movaps %xmm0, %xmm1
-; CHECK-NEXT: movl %esi, %ecx
-; CHECK-NEXT: movl %edi, %edx
-; CHECK-NEXT: callq bar
-; CHECK-NEXT: addl %edx, %eax
-; CHECK-NEXT: addl %ecx, %eax
-; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: cvtsi2ssl %eax, %xmm0
-; CHECK-NEXT: addss %xmm0, %xmm1
+; CHECK-LABEL: foo
+; CHECK: movaps %xmm0, %xmm1
+; CHECK-NEXT: movl %esi, %ecx
+; CHECK-NEXT: movl %edi, %edx
+; CHECK-NEXT: callq bar
+; CHECK-NEXT: addl %edx, %eax
+; CHECK-NEXT: addl %ecx, %eax
+; CHECK-NEXT: xorps %xmm0, %xmm0
+; CHECK-NEXT: cvtsi2ssl %eax, %xmm0
+; CHECK-NEXT: addss %xmm0, %xmm1
 ; CHECK: retq
   %call = call i32 @bar(i32 %a0, i32 %a1, float %b0) #0
   %c0 = add i32 %a0, %call
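This test and the x86-no_caller_saved_registers.ll diff that follows document the same contract: a function carrying the attribute must itself preserve every register it touches except those used to pass and return its arguments, so callers such as foo may keep values live in normally call-clobbered registers across the call. At the IR level it is the string attribute shown in the tests' attribute groups; a minimal sketch (the function is hypothetical):

define i32 @increment(i32 %n) #0 {
  ; any register this function clobbers beyond %edi/%eax must be saved/restored
  %r = add i32 %n, 1
  ret i32 %r
}

attributes #0 = { "no_caller_saved_registers" }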
diff --git a/test/CodeGen/X86/x86-no_caller_saved_registers.ll b/test/CodeGen/X86/x86-no_caller_saved_registers.ll
index 9c62e3ee6ba7..4e5403d1847f 100644
--- a/test/CodeGen/X86/x86-no_caller_saved_registers.ll
+++ b/test/CodeGen/X86/x86-no_caller_saved_registers.ll
@@ -1,31 +1,31 @@
-; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
-; RUN: llc -mtriple=x86_64-unknown-unknown -O0 < %s | FileCheck %s
-; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s
-; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 -O0 < %s | FileCheck %s
-
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-;; In functions with 'no_caller_saved_registers' attribute, all registers should
-;; be preserved except for registers used for passing/returning arguments.
-;; The test checks that function "bar" preserves xmm0 register.
-;; It also checks that caller function "foo" does not store registers for callee
-;; "bar". For example, there is no store/load/access to xmm registers.
-;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
-
-define i32 @bar(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) #0 {
-; CHECK-LABEL: bar
-; CHECK: mov{{.*}} %xmm0
-; CHECK: mov{{.*}} {{.*}}, %xmm0
-; CHECK: ret
- call void asm sideeffect "", "~{xmm0}"()
- ret i32 1
-}
-
-define x86_intrcc void @foo(i8* nocapture readnone %c) {
-; CHECK-LABEL: foo
-; CHECK-NOT: xmm
-entry:
- tail call i32 @bar(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8) #0
- ret void
-}
-
-attributes #0 = { "no_caller_saved_registers" }
+; RUN: llc -mtriple=x86_64-unknown-unknown < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-unknown-unknown -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 < %s | FileCheck %s
+; RUN: llc -mtriple=i686-unknown-unknown -mattr=+sse2 -O0 < %s | FileCheck %s
+
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+;; In functions with 'no_caller_saved_registers' attribute, all registers should
+;; be preserved except for registers used for passing/returning arguments.
+;; The test checks that function "bar" preserves xmm0 register.
+;; It also checks that caller function "foo" does not store registers for callee
+;; "bar". For example, there is no store/load/access to xmm registers.
+;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;
+
+define i32 @bar(i32 %a0, i32 %a1, i32 %a2, i32 %a3, i32 %a4, i32 %a5, i32 %a6, i32 %a7, i32 %a8) #0 {
+; CHECK-LABEL: bar
+; CHECK: mov{{.*}} %xmm0
+; CHECK: mov{{.*}} {{.*}}, %xmm0
+; CHECK: ret
+ call void asm sideeffect "", "~{xmm0}"()
+ ret i32 1
+}
+
+define x86_intrcc void @foo(i8* nocapture readnone %c) {
+; CHECK-LABEL: foo
+; CHECK-NOT: xmm
+entry:
+ tail call i32 @bar(i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8) #0
+ ret void
+}
+
+attributes #0 = { "no_caller_saved_registers" }
diff --git a/test/CodeGen/X86/x86-shrink-wrapping.ll b/test/CodeGen/X86/x86-shrink-wrapping.ll
index 5b6e773fe5d4..519f0d0924e3 100644
--- a/test/CodeGen/X86/x86-shrink-wrapping.ll
+++ b/test/CodeGen/X86/x86-shrink-wrapping.ll
@@ -270,8 +270,6 @@ if.end: ; preds = %if.else, %for.end
   ret i32 %sum.1
 }
 
-declare void @somethingElse(...)
-
 ; Check with a more complex case that we do not have restore within the loop and
 ; save outside.
 ; CHECK-LABEL: loopInfoRestoreOutsideLoop:
@@ -982,3 +980,54 @@ for.inc:
 }
 
 attributes #4 = { "no-frame-pointer-elim"="true" }
+
+@x = external global i32, align 4
+@y = external global i32, align 4
+
+; The post-dominator tree does not include the branch containing the infinite
+; loop, which can result in a misplacement of the restore block, if we're
+; looking for the nearest common post-dominator of an "unreachable" block.
+
+; CHECK-LABEL: infiniteLoopNoSuccessor:
+; CHECK: ## BB#0:
+; Make sure the prologue happens in the entry block.
+; CHECK-NEXT: pushq %rbp
+; ...
+; Make sure we don't shrink-wrap.
+; CHECK: ## BB#1
+; CHECK-NOT: pushq %rbp
+; ...
+; Make sure the epilogue happens in the exit block.
+; CHECK: ## BB#5
+; CHECK: popq %rbp
+; CHECK-NEXT: retq
+define void @infiniteLoopNoSuccessor() #5 {
+  %1 = load i32, i32* @x, align 4
+  %2 = icmp ne i32 %1, 0
+  br i1 %2, label %3, label %4
+
;