diff options
Diffstat (limited to 'lib/xray')
-rw-r--r-- | lib/xray/CMakeLists.txt | 71 | ||||
-rw-r--r-- | lib/xray/tests/CMakeLists.txt | 46 | ||||
-rw-r--r-- | lib/xray/tests/unit/buffer_queue_test.cc | 4 | ||||
-rw-r--r-- | lib/xray/tests/unit/fdr_logging_test.cc | 107 | ||||
-rw-r--r-- | lib/xray/weak_symbols.txt | 4 | ||||
-rw-r--r-- | lib/xray/xray_buffer_queue.cc | 79 | ||||
-rw-r--r-- | lib/xray/xray_buffer_queue.h | 97 | ||||
-rw-r--r-- | lib/xray/xray_fdr_log_records.h | 3 | ||||
-rw-r--r-- | lib/xray/xray_fdr_logging.cc | 206 | ||||
-rw-r--r-- | lib/xray/xray_fdr_logging.h | 1 | ||||
-rw-r--r-- | lib/xray/xray_fdr_logging_impl.h | 605 | ||||
-rw-r--r-- | lib/xray/xray_flags.h | 1 | ||||
-rw-r--r-- | lib/xray/xray_flags.inc | 28 | ||||
-rw-r--r-- | lib/xray/xray_init.cc | 35 | ||||
-rw-r--r-- | lib/xray/xray_inmemory_log.cc | 453 | ||||
-rw-r--r-- | lib/xray/xray_inmemory_log.h | 44 | ||||
-rw-r--r-- | lib/xray/xray_interface.cc | 314 | ||||
-rw-r--r-- | lib/xray/xray_interface_internal.h | 6 | ||||
-rw-r--r-- | lib/xray/xray_log_interface.cc | 63 | ||||
-rw-r--r-- | lib/xray/xray_trampoline_x86_64.S | 147 | ||||
-rw-r--r-- | lib/xray/xray_utils.cc | 3 | ||||
-rw-r--r-- | lib/xray/xray_x86_64.cc | 40 |
22 files changed, 1562 insertions, 795 deletions
diff --git a/lib/xray/CMakeLists.txt b/lib/xray/CMakeLists.txt index 72caa9fac732a..5547600b943ae 100644 --- a/lib/xray/CMakeLists.txt +++ b/lib/xray/CMakeLists.txt @@ -13,47 +13,39 @@ set(XRAY_SOURCES set(x86_64_SOURCES xray_x86_64.cc - xray_trampoline_x86_64.S - ${XRAY_SOURCES}) + xray_trampoline_x86_64.S) set(arm_SOURCES xray_arm.cc - xray_trampoline_arm.S - ${XRAY_SOURCES}) + xray_trampoline_arm.S) set(armhf_SOURCES - ${arm_SOURCES}) + ${arm_SOURCES}) set(aarch64_SOURCES xray_AArch64.cc - xray_trampoline_AArch64.S - ${XRAY_SOURCES}) + xray_trampoline_AArch64.S) set(mips_SOURCES xray_mips.cc - xray_trampoline_mips.S - ${XRAY_SOURCES}) + xray_trampoline_mips.S) set(mipsel_SOURCES xray_mips.cc - xray_trampoline_mips.S - ${XRAY_SOURCES}) + xray_trampoline_mips.S) set(mips64_SOURCES xray_mips64.cc - xray_trampoline_mips64.S - ${XRAY_SOURCES}) + xray_trampoline_mips64.S) set(mips64el_SOURCES xray_mips64.cc - xray_trampoline_mips64.S - ${XRAY_SOURCES}) + xray_trampoline_mips64.S) set(powerpc64le_SOURCES - xray_powerpc64.cc - xray_trampoline_powerpc64.cc - xray_trampoline_powerpc64_asm.S - ${XRAY_SOURCES}) + xray_powerpc64.cc + xray_trampoline_powerpc64.cc + xray_trampoline_powerpc64_asm.S) include_directories(..) include_directories(../../include) @@ -62,20 +54,50 @@ set(XRAY_CFLAGS ${SANITIZER_COMMON_CFLAGS}) set(XRAY_COMMON_DEFINITIONS XRAY_HAS_EXCEPTIONS=1) append_list_if( COMPILER_RT_HAS_XRAY_COMPILER_FLAG XRAY_SUPPORTED=1 XRAY_COMMON_DEFINITIONS) - -add_compiler_rt_object_libraries(RTXray - ARCHS ${XRAY_SUPPORTED_ARCH} - SOURCES ${XRAY_SOURCES} CFLAGS ${XRAY_CFLAGS} - DEFS ${XRAY_COMMON_DEFINITIONS}) +append_list_if( + COMPILER_RT_BUILD_XRAY_NO_PREINIT XRAY_NO_PREINIT XRAY_COMMON_DEFINITIONS) add_compiler_rt_component(xray) set(XRAY_COMMON_RUNTIME_OBJECT_LIBS + RTXray RTSanitizerCommon RTSanitizerCommonLibc) +if (APPLE) + set(XRAY_LINK_LIBS ${SANITIZER_COMMON_LINK_LIBS}) + add_asm_sources(XRAY_ASM_SOURCES xray_trampoline_x86_64.S) + + add_weak_symbols("sanitizer_common" WEAK_SYMBOL_LINK_FLAGS) + add_weak_symbols("xray" WEAK_SYMBOL_LINK_FLAGS) + + add_compiler_rt_object_libraries(RTXray + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_SUPPORTED_ARCH} + SOURCES ${x86_64_SOURCES} + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS}) + + # We only support running on osx for now. + add_compiler_rt_runtime(clang_rt.xray + STATIC + OS ${XRAY_SUPPORTED_OS} + ARCHS ${XRAY_SUPPORTED_ARCH} + OBJECT_LIBS RTXray + RTSanitizerCommon + RTSanitizerCommonLibc + CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS} + LINK_FLAGS ${SANITIZER_COMMON_LINK_FLAGS} ${WEAK_SYMBOL_LINK_FLAGS} + LINK_LIBS ${XRAY_LINK_LIBS} + PARENT_TARGET xray) +else() foreach(arch ${XRAY_SUPPORTED_ARCH}) if(CAN_TARGET_${arch}) + add_compiler_rt_object_libraries(RTXray + ARCHS ${arch} + SOURCES ${XRAY_SOURCES} CFLAGS ${XRAY_CFLAGS} + DEFS ${XRAY_COMMON_DEFINITIONS}) add_compiler_rt_runtime(clang_rt.xray STATIC ARCHS ${arch} @@ -86,6 +108,7 @@ foreach(arch ${XRAY_SUPPORTED_ARCH}) PARENT_TARGET xray) endif() endforeach() +endif() if(COMPILER_RT_INCLUDE_TESTS) add_subdirectory(tests) diff --git a/lib/xray/tests/CMakeLists.txt b/lib/xray/tests/CMakeLists.txt index a1eb4a030ccc5..e54e63f27890c 100644 --- a/lib/xray/tests/CMakeLists.txt +++ b/lib/xray/tests/CMakeLists.txt @@ -11,47 +11,23 @@ set(XRAY_UNITTEST_CFLAGS -I${COMPILER_RT_SOURCE_DIR}/lib/xray -I${COMPILER_RT_SOURCE_DIR}/lib) -macro(xray_compile obj_list source arch) - get_filename_component(basename ${source} NAME) - set(output_obj "${basename}.${arch}.o") - get_target_flags_for_arch(${arch} TARGET_CFLAGS) - if(NOT COMPILER_RT_STANDALONE_BUILD) - list(APPEND COMPILE_DEPS gtest_main xray) - endif() - clang_compile(${output_obj} ${source} - CFLAGS ${XRAY_UNITTEST_CFLAGS} ${TARGET_CFLAGS} - DEPS ${COMPILE_DEPS}) - list(APPEND ${obj_list} ${output_obj}) -endmacro() - +set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH}) macro(add_xray_unittest testname) - set(XRAY_TEST_ARCH ${XRAY_SUPPORTED_ARCH}) - if (APPLE) - darwin_filter_host_archs(XRAY_SUPPORTED_ARCH) - endif() - if(UNIX) + cmake_parse_arguments(TEST "" "" "SOURCES;HEADERS" ${ARGN}) + if(UNIX AND NOT APPLE) foreach(arch ${XRAY_TEST_ARCH}) - cmake_parse_arguments(TEST "" "" "SOURCES;HEADERS" ${ARGN}) set(TEST_OBJECTS) - foreach(SOURCE ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE}) - xray_compile(TEST_OBJECTS ${SOURCE} ${arch} ${TEST_HEADERS}) - endforeach() - get_target_flags_for_arch(${arch} TARGET_LINK_FLAGS) - set(TEST_DEPS ${TEST_OBJECTS}) - if(NOT COMPILER_RT_STANDALONE_BUILD) - list(APPEND TEST_DEPS gtest_main xray) - endif() - if(NOT APPLE) - add_compiler_rt_test(XRayUnitTests ${testname}-${arch} - OBJECTS ${TEST_OBJECTS} - DEPS ${TEST_DEPS} - LINK_FLAGS ${TARGET_LINK_FLAGS} + generate_compiler_rt_tests(TEST_OBJECTS + XRayUnitTests "${testname}-${arch}-Test" "${arch}" + SOURCES ${TEST_SOURCES} ${COMPILER_RT_GTEST_SOURCE} + DEPS gtest xray llvm-xray + CFLAGS ${XRAY_UNITTEST_CFLAGS} + LINK_FLAGS -fxray-instrument + ${TARGET_LINK_FLAGS} -lstdc++ -lm ${CMAKE_THREAD_LIBS_INIT} -lpthread - -L${COMPILER_RT_LIBRARY_OUTPUT_DIR} -lclang_rt.xray-${arch} -ldl -lrt) - endif() - # FIXME: Figure out how to run even just the unit tests on APPLE. + set_target_properties(XRayUnitTests PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}) endforeach() endif() endmacro() diff --git a/lib/xray/tests/unit/buffer_queue_test.cc b/lib/xray/tests/unit/buffer_queue_test.cc index ac89a8dbc50e7..1ec7469ce1874 100644 --- a/lib/xray/tests/unit/buffer_queue_test.cc +++ b/lib/xray/tests/unit/buffer_queue_test.cc @@ -68,9 +68,9 @@ TEST(BufferQueueTest, ErrorsWhenFinalising) { ASSERT_NE(nullptr, Buf.Buffer); ASSERT_EQ(Buffers.finalize(), BufferQueue::ErrorCode::Ok); BufferQueue::Buffer OtherBuf; - ASSERT_EQ(BufferQueue::ErrorCode::AlreadyFinalized, + ASSERT_EQ(BufferQueue::ErrorCode::QueueFinalizing, Buffers.getBuffer(OtherBuf)); - ASSERT_EQ(BufferQueue::ErrorCode::AlreadyFinalized, + ASSERT_EQ(BufferQueue::ErrorCode::QueueFinalizing, Buffers.finalize()); ASSERT_EQ(Buffers.releaseBuffer(Buf), BufferQueue::ErrorCode::Ok); } diff --git a/lib/xray/tests/unit/fdr_logging_test.cc b/lib/xray/tests/unit/fdr_logging_test.cc index 0d5e99a743344..1009d56a43b33 100644 --- a/lib/xray/tests/unit/fdr_logging_test.cc +++ b/lib/xray/tests/unit/fdr_logging_test.cc @@ -17,8 +17,10 @@ #include <iostream> #include <sys/mman.h> #include <sys/stat.h> +#include <sys/syscall.h> #include <sys/types.h> #include <system_error> +#include <thread> #include <unistd.h> #include "xray/xray_records.h" @@ -34,14 +36,23 @@ struct ScopedFileCloserAndDeleter { : Fd(Fd), Filename(Filename) {} ~ScopedFileCloserAndDeleter() { + if (Map) + munmap(Map, Size); if (Fd) { close(Fd); unlink(Filename); } } + void registerMap(void *M, size_t S) { + Map = M; + Size = S; + } + int Fd; const char *Filename; + void *Map = nullptr; + size_t Size = 0; }; TEST(FDRLoggingTest, Simple) { @@ -51,13 +62,12 @@ TEST(FDRLoggingTest, Simple) { Options.Fd = mkstemp(TmpFilename); ASSERT_NE(Options.Fd, -1); ASSERT_EQ(fdrLoggingInit(kBufferSize, kBufferMax, &Options, - sizeof(FDRLoggingOptions)), + sizeof(FDRLoggingOptions)), XRayLogInitStatus::XRAY_LOG_INITIALIZED); fdrLoggingHandleArg0(1, XRayEntryType::ENTRY); fdrLoggingHandleArg0(1, XRayEntryType::EXIT); ASSERT_EQ(fdrLoggingFinalize(), XRayLogInitStatus::XRAY_LOG_FINALIZED); ASSERT_EQ(fdrLoggingFlush(), XRayLogFlushStatus::XRAY_LOG_FLUSHED); - ASSERT_EQ(fdrLoggingReset(), XRayLogInitStatus::XRAY_LOG_UNINITIALIZED); // To do this properly, we have to close the file descriptor then re-open the // file for reading this time. @@ -68,20 +78,25 @@ TEST(FDRLoggingTest, Simple) { auto Size = lseek(Fd, 0, SEEK_END); ASSERT_NE(Size, 0); // Map the file contents. - const char *Contents = static_cast<const char *>( - mmap(NULL, Size, PROT_READ, MAP_PRIVATE, Fd, 0)); + void *Map = mmap(NULL, Size, PROT_READ, MAP_PRIVATE, Fd, 0); + const char *Contents = static_cast<const char *>(Map); + Guard.registerMap(Map, Size); ASSERT_NE(Contents, nullptr); XRayFileHeader H; memcpy(&H, Contents, sizeof(XRayFileHeader)); - ASSERT_EQ(H.Version, 1); + ASSERT_EQ(H.Version, 2); ASSERT_EQ(H.Type, FileTypes::FDR_LOG); - // We require one buffer at least to have the "start of buffer" metadata - // record. - MetadataRecord MDR; - memcpy(&MDR, Contents + sizeof(XRayFileHeader), sizeof(MetadataRecord)); - ASSERT_EQ(MDR.RecordKind, uint8_t(MetadataRecord::RecordKinds::NewBuffer)); + // We require one buffer at least to have the "extents" metadata record, + // followed by the NewBuffer record. + MetadataRecord MDR0, MDR1; + memcpy(&MDR0, Contents + sizeof(XRayFileHeader), sizeof(MetadataRecord)); + memcpy(&MDR1, Contents + sizeof(XRayFileHeader) + sizeof(MetadataRecord), + sizeof(MetadataRecord)); + ASSERT_EQ(MDR0.RecordKind, + uint8_t(MetadataRecord::RecordKinds::BufferExtents)); + ASSERT_EQ(MDR1.RecordKind, uint8_t(MetadataRecord::RecordKinds::NewBuffer)); } TEST(FDRLoggingTest, Multiple) { @@ -90,7 +105,7 @@ TEST(FDRLoggingTest, Multiple) { Options.Fd = mkstemp(TmpFilename); ASSERT_NE(Options.Fd, -1); ASSERT_EQ(fdrLoggingInit(kBufferSize, kBufferMax, &Options, - sizeof(FDRLoggingOptions)), + sizeof(FDRLoggingOptions)), XRayLogInitStatus::XRAY_LOG_INITIALIZED); for (uint64_t I = 0; I < 100; ++I) { fdrLoggingHandleArg0(1, XRayEntryType::ENTRY); @@ -98,7 +113,6 @@ TEST(FDRLoggingTest, Multiple) { } ASSERT_EQ(fdrLoggingFinalize(), XRayLogInitStatus::XRAY_LOG_FINALIZED); ASSERT_EQ(fdrLoggingFlush(), XRayLogFlushStatus::XRAY_LOG_FLUSHED); - ASSERT_EQ(fdrLoggingReset(), XRayLogInitStatus::XRAY_LOG_UNINITIALIZED); // To do this properly, we have to close the file descriptor then re-open the // file for reading this time. @@ -109,18 +123,77 @@ TEST(FDRLoggingTest, Multiple) { auto Size = lseek(Fd, 0, SEEK_END); ASSERT_NE(Size, 0); // Map the file contents. - const char *Contents = static_cast<const char *>( - mmap(NULL, Size, PROT_READ, MAP_PRIVATE, Fd, 0)); + void *Map = mmap(NULL, Size, PROT_READ, MAP_PRIVATE, Fd, 0); + const char *Contents = static_cast<const char *>(Map); + Guard.registerMap(Map, Size); + ASSERT_NE(Contents, nullptr); + + XRayFileHeader H; + memcpy(&H, Contents, sizeof(XRayFileHeader)); + ASSERT_EQ(H.Version, 2); + ASSERT_EQ(H.Type, FileTypes::FDR_LOG); + + MetadataRecord MDR0, MDR1; + memcpy(&MDR0, Contents + sizeof(XRayFileHeader), sizeof(MetadataRecord)); + memcpy(&MDR1, Contents + sizeof(XRayFileHeader) + sizeof(MetadataRecord), + sizeof(MetadataRecord)); + ASSERT_EQ(MDR0.RecordKind, + uint8_t(MetadataRecord::RecordKinds::BufferExtents)); + ASSERT_EQ(MDR1.RecordKind, uint8_t(MetadataRecord::RecordKinds::NewBuffer)); +} + +TEST(FDRLoggingTest, MultiThreadedCycling) { + FDRLoggingOptions Options; + char TmpFilename[] = "fdr-logging-test.XXXXXX"; + Options.Fd = mkstemp(TmpFilename); + ASSERT_NE(Options.Fd, -1); + ASSERT_EQ(fdrLoggingInit(kBufferSize, 1, &Options, sizeof(FDRLoggingOptions)), + XRayLogInitStatus::XRAY_LOG_INITIALIZED); + + // Now we want to create one thread, do some logging, then create another one, + // in succession and making sure that we're able to get thread records from + // the latest thread (effectively being able to recycle buffers). + std::array<pid_t, 2> Threads; + for (uint64_t I = 0; I < 2; ++I) { + std::thread t{[I, &Threads] { + fdrLoggingHandleArg0(I + 1, XRayEntryType::ENTRY); + fdrLoggingHandleArg0(I + 1, XRayEntryType::EXIT); + Threads[I] = syscall(SYS_gettid); + }}; + t.join(); + } + ASSERT_EQ(fdrLoggingFinalize(), XRayLogInitStatus::XRAY_LOG_FINALIZED); + ASSERT_EQ(fdrLoggingFlush(), XRayLogFlushStatus::XRAY_LOG_FLUSHED); + + // To do this properly, we have to close the file descriptor then re-open the + // file for reading this time. + ASSERT_EQ(close(Options.Fd), 0); + int Fd = open(TmpFilename, O_RDONLY); + ASSERT_NE(-1, Fd); + ScopedFileCloserAndDeleter Guard(Fd, TmpFilename); + auto Size = lseek(Fd, 0, SEEK_END); + ASSERT_NE(Size, 0); + // Map the file contents. + void *Map = mmap(NULL, Size, PROT_READ, MAP_PRIVATE, Fd, 0); + const char *Contents = static_cast<const char *>(Map); + Guard.registerMap(Map, Size); ASSERT_NE(Contents, nullptr); XRayFileHeader H; memcpy(&H, Contents, sizeof(XRayFileHeader)); - ASSERT_EQ(H.Version, 1); + ASSERT_EQ(H.Version, 2); ASSERT_EQ(H.Type, FileTypes::FDR_LOG); - MetadataRecord MDR0; + MetadataRecord MDR0, MDR1; memcpy(&MDR0, Contents + sizeof(XRayFileHeader), sizeof(MetadataRecord)); - ASSERT_EQ(MDR0.RecordKind, uint8_t(MetadataRecord::RecordKinds::NewBuffer)); + memcpy(&MDR1, Contents + sizeof(XRayFileHeader) + sizeof(MetadataRecord), + sizeof(MetadataRecord)); + ASSERT_EQ(MDR0.RecordKind, + uint8_t(MetadataRecord::RecordKinds::BufferExtents)); + ASSERT_EQ(MDR1.RecordKind, uint8_t(MetadataRecord::RecordKinds::NewBuffer)); + pid_t Latest = 0; + memcpy(&Latest, MDR1.Data, sizeof(pid_t)); + ASSERT_EQ(Latest, Threads[1]); } } // namespace diff --git a/lib/xray/weak_symbols.txt b/lib/xray/weak_symbols.txt new file mode 100644 index 0000000000000..963fff2d697ee --- /dev/null +++ b/lib/xray/weak_symbols.txt @@ -0,0 +1,4 @@ +___start_xray_fn_idx +___start_xray_instr_map +___stop_xray_fn_idx +___stop_xray_instr_map diff --git a/lib/xray/xray_buffer_queue.cc b/lib/xray/xray_buffer_queue.cc index 7ba755ac30692..a0018f6b0cba4 100644 --- a/lib/xray/xray_buffer_queue.cc +++ b/lib/xray/xray_buffer_queue.cc @@ -13,28 +13,34 @@ // //===----------------------------------------------------------------------===// #include "xray_buffer_queue.h" +#include "sanitizer_common/sanitizer_allocator_internal.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_libc.h" -#include <cstdlib> -#include <tuple> - using namespace __xray; using namespace __sanitizer; -BufferQueue::BufferQueue(std::size_t B, std::size_t N, bool &Success) - : BufferSize(B), Buffers(N), Mutex(), OwnedBuffers(), Finalizing{0} { - for (auto &T : Buffers) { - void *Tmp = malloc(BufferSize); +BufferQueue::BufferQueue(size_t B, size_t N, bool &Success) + : BufferSize(B), Buffers(new BufferRep[N]()), BufferCount(N), Finalizing{0}, + OwnedBuffers(new void *[N]()), Next(Buffers), First(Buffers), + LiveBuffers(0) { + for (size_t i = 0; i < N; ++i) { + auto &T = Buffers[i]; + void *Tmp = InternalAlloc(BufferSize, nullptr, 64); if (Tmp == nullptr) { Success = false; return; } - - auto &Buf = std::get<0>(T); + void *Extents = InternalAlloc(sizeof(BufferExtents), nullptr, 64); + if (Extents == nullptr) { + Success = false; + return; + } + auto &Buf = T.Buff; Buf.Buffer = Tmp; Buf.Size = B; - OwnedBuffers.emplace(Tmp); + Buf.Extents = reinterpret_cast<BufferExtents *>(Extents); + OwnedBuffers[i] = Tmp; } Success = true; } @@ -42,27 +48,50 @@ BufferQueue::BufferQueue(std::size_t B, std::size_t N, bool &Success) BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) { if (__sanitizer::atomic_load(&Finalizing, __sanitizer::memory_order_acquire)) return ErrorCode::QueueFinalizing; - __sanitizer::BlockingMutexLock Guard(&Mutex); - if (Buffers.empty()) + __sanitizer::SpinMutexLock Guard(&Mutex); + if (LiveBuffers == BufferCount) return ErrorCode::NotEnoughMemory; - auto &T = Buffers.front(); - auto &B = std::get<0>(T); + + auto &T = *Next; + auto &B = T.Buff; Buf = B; - B.Buffer = nullptr; - B.Size = 0; - Buffers.pop_front(); + T.Used = true; + ++LiveBuffers; + + if (++Next == (Buffers + BufferCount)) + Next = Buffers; + return ErrorCode::Ok; } BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) { - if (OwnedBuffers.count(Buf.Buffer) == 0) + // Blitz through the buffers array to find the buffer. + bool Found = false; + for (auto I = OwnedBuffers, E = OwnedBuffers + BufferCount; I != E; ++I) { + if (*I == Buf.Buffer) { + Found = true; + break; + } + } + if (!Found) return ErrorCode::UnrecognizedBuffer; - __sanitizer::BlockingMutexLock Guard(&Mutex); + + __sanitizer::SpinMutexLock Guard(&Mutex); + + // This points to a semantic bug, we really ought to not be releasing more + // buffers than we actually get. + if (LiveBuffers == 0) + return ErrorCode::NotEnoughMemory; // Now that the buffer has been released, we mark it as "used". - Buffers.emplace(Buffers.end(), Buf, true /* used */); + First->Buff = Buf; + First->Used = true; Buf.Buffer = nullptr; Buf.Size = 0; + --LiveBuffers; + if (++First == (Buffers + BufferCount)) + First = Buffers; + return ErrorCode::Ok; } @@ -74,8 +103,12 @@ BufferQueue::ErrorCode BufferQueue::finalize() { } BufferQueue::~BufferQueue() { - for (auto &T : Buffers) { - auto &Buf = std::get<0>(T); - free(Buf.Buffer); + for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) { + auto &T = *I; + auto &Buf = T.Buff; + InternalFree(Buf.Buffer); + InternalFree(Buf.Extents); } + delete[] Buffers; + delete[] OwnedBuffers; } diff --git a/lib/xray/xray_buffer_queue.h b/lib/xray/xray_buffer_queue.h index e051695a297b5..1ceb582746165 100644 --- a/lib/xray/xray_buffer_queue.h +++ b/lib/xray/xray_buffer_queue.h @@ -15,11 +15,9 @@ #ifndef XRAY_BUFFER_QUEUE_H #define XRAY_BUFFER_QUEUE_H +#include <cstddef> #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_mutex.h" -#include <deque> -#include <unordered_set> -#include <utility> namespace __xray { @@ -29,23 +27,50 @@ namespace __xray { /// the "flight data recorder" (FDR) mode to support ongoing XRay function call /// trace collection. class BufferQueue { -public: + public: + struct alignas(64) BufferExtents { + __sanitizer::atomic_uint64_t Size; + }; + struct Buffer { void *Buffer = nullptr; size_t Size = 0; + BufferExtents* Extents; + }; + + private: + struct BufferRep { + // The managed buffer. + Buffer Buff; + + // This is true if the buffer has been returned to the available queue, and + // is considered "used" by another thread. + bool Used = false; }; -private: + // Size of each individual Buffer. size_t BufferSize; - // We use a bool to indicate whether the Buffer has been used in this - // freelist implementation. - std::deque<std::tuple<Buffer, bool>> Buffers; - __sanitizer::BlockingMutex Mutex; - std::unordered_set<void *> OwnedBuffers; + BufferRep *Buffers; + size_t BufferCount; + + __sanitizer::SpinMutex Mutex; __sanitizer::atomic_uint8_t Finalizing; -public: + // Pointers to buffers managed/owned by the BufferQueue. + void **OwnedBuffers; + + // Pointer to the next buffer to be handed out. + BufferRep *Next; + + // Pointer to the entry in the array where the next released buffer will be + // placed. + BufferRep *First; + + // Count of buffers that have been handed out through 'getBuffer'. + size_t LiveBuffers; + + public: enum class ErrorCode : unsigned { Ok, NotEnoughMemory, @@ -56,16 +81,16 @@ public: static const char *getErrorString(ErrorCode E) { switch (E) { - case ErrorCode::Ok: - return "(none)"; - case ErrorCode::NotEnoughMemory: - return "no available buffers in the queue"; - case ErrorCode::QueueFinalizing: - return "queue already finalizing"; - case ErrorCode::UnrecognizedBuffer: - return "buffer being returned not owned by buffer queue"; - case ErrorCode::AlreadyFinalized: - return "queue already finalized"; + case ErrorCode::Ok: + return "(none)"; + case ErrorCode::NotEnoughMemory: + return "no available buffers in the queue"; + case ErrorCode::QueueFinalizing: + return "queue already finalizing"; + case ErrorCode::UnrecognizedBuffer: + return "buffer being returned not owned by buffer queue"; + case ErrorCode::AlreadyFinalized: + return "queue already finalized"; } return "unknown error"; } @@ -82,15 +107,18 @@ public: /// - BufferQueue is not finalising. /// /// Returns: - /// - std::errc::not_enough_memory on exceeding MaxSize. - /// - no error when we find a Buffer. - /// - std::errc::state_not_recoverable on finalising BufferQueue. + /// - ErrorCode::NotEnoughMemory on exceeding MaxSize. + /// - ErrorCode::Ok when we find a Buffer. + /// - ErrorCode::QueueFinalizing or ErrorCode::AlreadyFinalized on + /// a finalizing/finalized BufferQueue. ErrorCode getBuffer(Buffer &Buf); /// Updates |Buf| to point to nullptr, with size 0. /// /// Returns: - /// - ... + /// - ErrorCode::Ok when we successfully release the buffer. + /// - ErrorCode::UnrecognizedBuffer for when this BufferQueue does not own + /// the buffer being released. ErrorCode releaseBuffer(Buffer &Buf); bool finalizing() const { @@ -107,17 +135,18 @@ public: /// - All releaseBuffer operations will not fail. /// /// After a call to finalize succeeds, all subsequent calls to finalize will - /// fail with std::errc::state_not_recoverable. + /// fail with ErrorCode::QueueFinalizing. ErrorCode finalize(); /// Applies the provided function F to each Buffer in the queue, only if the /// Buffer is marked 'used' (i.e. has been the result of getBuffer(...) and a - /// releaseBuffer(...) operation. - template <class F> void apply(F Fn) { - __sanitizer::BlockingMutexLock G(&Mutex); - for (const auto &T : Buffers) { - if (std::get<1>(T)) - Fn(std::get<0>(T)); + /// releaseBuffer(...) operation). + template <class F> + void apply(F Fn) { + __sanitizer::SpinMutexLock G(&Mutex); + for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) { + const auto &T = *I; + if (T.Used) Fn(T.Buff); } } @@ -125,6 +154,6 @@ public: ~BufferQueue(); }; -} // namespace __xray +} // namespace __xray -#endif // XRAY_BUFFER_QUEUE_H +#endif // XRAY_BUFFER_QUEUE_H diff --git a/lib/xray/xray_fdr_log_records.h b/lib/xray/xray_fdr_log_records.h index 3d6d38892c76f..324208db82ca0 100644 --- a/lib/xray/xray_fdr_log_records.h +++ b/lib/xray/xray_fdr_log_records.h @@ -30,7 +30,10 @@ struct alignas(16) MetadataRecord { TSCWrap, WalltimeMarker, CustomEventMarker, + CallArgument, + BufferExtents, }; + // Use 7 bits to identify this record type. /* RecordKinds */ uint8_t RecordKind : 7; char Data[15]; diff --git a/lib/xray/xray_fdr_logging.cc b/lib/xray/xray_fdr_logging.cc index a7e1382c3865b..1bfa10c21f5cd 100644 --- a/lib/xray/xray_fdr_logging.cc +++ b/lib/xray/xray_fdr_logging.cc @@ -15,15 +15,11 @@ // //===----------------------------------------------------------------------===// #include "xray_fdr_logging.h" -#include <algorithm> -#include <bitset> -#include <cerrno> -#include <cstring> +#include <errno.h> #include <sys/syscall.h> #include <sys/time.h> #include <time.h> #include <unistd.h> -#include <unordered_map> #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_common.h" @@ -39,7 +35,7 @@ namespace __xray { // Global BufferQueue. -std::shared_ptr<BufferQueue> BQ; +BufferQueue *BQ = nullptr; __sanitizer::atomic_sint32_t LogFlushStatus = { XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; @@ -52,19 +48,31 @@ __sanitizer::SpinMutex FDROptionsMutex; XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { if (__sanitizer::atomic_load(&LoggingStatus, __sanitizer::memory_order_acquire) != - XRayLogInitStatus::XRAY_LOG_FINALIZED) + XRayLogInitStatus::XRAY_LOG_FINALIZED) { + if (__sanitizer::Verbosity()) + Report("Not flushing log, implementation is not finalized.\n"); return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + } s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; if (!__sanitizer::atomic_compare_exchange_strong( &LogFlushStatus, &Result, XRayLogFlushStatus::XRAY_LOG_FLUSHING, - __sanitizer::memory_order_release)) + __sanitizer::memory_order_release)) { + + if (__sanitizer::Verbosity()) + Report("Not flushing log, implementation is still finalizing.\n"); return static_cast<XRayLogFlushStatus>(Result); + } - // Make a copy of the BufferQueue pointer to prevent other threads that may be - // resetting it from blowing away the queue prematurely while we're dealing - // with it. - auto LocalBQ = BQ; + if (BQ == nullptr) { + if (__sanitizer::Verbosity()) + Report("Cannot flush when global buffer queue is null.\n"); + return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + } + + // We wait a number of milliseconds to allow threads to see that we've + // finalised before attempting to flush the log. + __sanitizer::SleepForMillis(flags()->xray_fdr_log_grace_period_ms); // We write out the file in the following format: // @@ -95,24 +103,44 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond; XRayFileHeader Header; - Header.Version = 1; + + // Version 2 of the log writes the extents of the buffer, instead of relying + // on an end-of-buffer record. + Header.Version = 2; Header.Type = FileTypes::FDR_LOG; Header.CycleFrequency = CycleFrequency; + // FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc' // before setting the values in the header. Header.ConstantTSC = 1; Header.NonstopTSC = 1; - Header.FdrData = FdrAdditionalHeaderData{LocalBQ->ConfiguredBufferSize()}; + Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()}; retryingWriteAll(Fd, reinterpret_cast<char *>(&Header), reinterpret_cast<char *>(&Header) + sizeof(Header)); - LocalBQ->apply([&](const BufferQueue::Buffer &B) { - uint64_t BufferSize = B.Size; - if (BufferSize > 0) { + BQ->apply([&](const BufferQueue::Buffer &B) { + // Starting at version 2 of the FDR logging implementation, we only write + // the records identified by the extents of the buffer. We use the Extents + // from the Buffer and write that out as the first record in the buffer. + // We still use a Metadata record, but fill in the extents instead for the + // data. + MetadataRecord ExtentsRecord; + auto BufferExtents = __sanitizer::atomic_load( + &B.Extents->Size, __sanitizer::memory_order_acquire); + assert(BufferExtents <= B.Size); + ExtentsRecord.Type = uint8_t(RecordType::Metadata); + ExtentsRecord.RecordKind = + uint8_t(MetadataRecord::RecordKinds::BufferExtents); + std::memcpy(ExtentsRecord.Data, &BufferExtents, sizeof(BufferExtents)); + if (BufferExtents > 0) { + retryingWriteAll(Fd, reinterpret_cast<char *>(&ExtentsRecord), + reinterpret_cast<char *>(&ExtentsRecord) + + sizeof(MetadataRecord)); retryingWriteAll(Fd, reinterpret_cast<char *>(B.Buffer), - reinterpret_cast<char *>(B.Buffer) + B.Size); + reinterpret_cast<char *>(B.Buffer) + BufferExtents); } }); + __sanitizer::atomic_store(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, __sanitizer::memory_order_release); @@ -124,8 +152,11 @@ XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT { if (!__sanitizer::atomic_compare_exchange_strong( &LoggingStatus, &CurrentStatus, XRayLogInitStatus::XRAY_LOG_FINALIZING, - __sanitizer::memory_order_release)) + __sanitizer::memory_order_release)) { + if (__sanitizer::Verbosity()) + Report("Cannot finalize log, implementation not initialized.\n"); return static_cast<XRayLogInitStatus>(CurrentStatus); + } // Do special things to make the log finalize itself, and not allow any more // operations to be performed until re-initialized. @@ -146,7 +177,8 @@ XRayLogInitStatus fdrLoggingReset() XRAY_NEVER_INSTRUMENT { return static_cast<XRayLogInitStatus>(CurrentStatus); // Release the in-memory buffer queue. - BQ.reset(); + delete BQ; + BQ = nullptr; // Spin until the flushing status is flushed. s32 CurrentFlushingStatus = XRayLogFlushStatus::XRAY_LOG_FLUSHED; @@ -163,19 +195,22 @@ XRayLogInitStatus fdrLoggingReset() XRAY_NEVER_INSTRUMENT { return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; } -static std::tuple<uint64_t, unsigned char> -getTimestamp() XRAY_NEVER_INSTRUMENT { +struct TSCAndCPU { + uint64_t TSC = 0; + unsigned char CPU = 0; +}; + +static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT { // We want to get the TSC as early as possible, so that we can check whether // we've seen this CPU before. We also do it before we load anything else, to // allow for forward progress with the scheduling. - unsigned char CPU; - uint64_t TSC; + TSCAndCPU Result; // Test once for required CPU features static bool TSCSupported = probeRequiredCPUFeatures(); if (TSCSupported) { - TSC = __xray::readTSC(CPU); + Result.TSC = __xray::readTSC(Result.CPU); } else { // FIXME: This code needs refactoring as it appears in multiple locations timespec TS; @@ -184,32 +219,35 @@ getTimestamp() XRAY_NEVER_INSTRUMENT { Report("clock_gettime(2) return %d, errno=%d", result, int(errno)); TS = {0, 0}; } - CPU = 0; - TSC = TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec; + Result.CPU = 0; + Result.TSC = TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec; } - return std::make_tuple(TSC, CPU); + return Result; } void fdrLoggingHandleArg0(int32_t FuncId, XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { - auto TSC_CPU = getTimestamp(); - __xray_fdr_internal::processFunctionHook(FuncId, Entry, std::get<0>(TSC_CPU), - std::get<1>(TSC_CPU), clock_gettime, - LoggingStatus, BQ); + auto TC = getTimestamp(); + __xray_fdr_internal::processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, 0, + clock_gettime, BQ); +} + +void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType Entry, + uint64_t Arg) XRAY_NEVER_INSTRUMENT { + auto TC = getTimestamp(); + __xray_fdr_internal::processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, Arg, + clock_gettime, BQ); } void fdrLoggingHandleCustomEvent(void *Event, std::size_t EventSize) XRAY_NEVER_INSTRUMENT { using namespace __xray_fdr_internal; - auto TSC_CPU = getTimestamp(); - auto &TSC = std::get<0>(TSC_CPU); - auto &CPU = std::get<1>(TSC_CPU); - thread_local bool Running = false; + auto TC = getTimestamp(); + auto &TSC = TC.TSC; + auto &CPU = TC.CPU; RecursionGuard Guard{Running}; - if (!Guard) { - assert(Running && "RecursionGuard is buggy!"); + if (!Guard) return; - } if (EventSize > std::numeric_limits<int32_t>::max()) { using Empty = struct {}; static Empty Once = [&] { @@ -220,15 +258,16 @@ void fdrLoggingHandleCustomEvent(void *Event, (void)Once; } int32_t ReducedEventSize = static_cast<int32_t>(EventSize); - if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, clock_gettime)) + auto &TLD = getThreadLocalData(); + if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, clock_gettime)) return; // Here we need to prepare the log to handle: // - The metadata record we're going to write. (16 bytes) // - The additional data we're going to write. Currently, that's the size of // the event we're going to dump into the log as free-form bytes. - if (!prepareBuffer(clock_gettime, MetadataRecSize + EventSize)) { - LocalBQ = nullptr; + if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) { + TLD.BQ = nullptr; return; } @@ -240,27 +279,35 @@ void fdrLoggingHandleCustomEvent(void *Event, CustomEvent.Type = uint8_t(RecordType::Metadata); CustomEvent.RecordKind = uint8_t(MetadataRecord::RecordKinds::CustomEventMarker); - constexpr auto TSCSize = sizeof(std::get<0>(TSC_CPU)); + constexpr auto TSCSize = sizeof(TC.TSC); std::memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t)); std::memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize); - std::memcpy(RecordPtr, &CustomEvent, sizeof(CustomEvent)); - RecordPtr += sizeof(CustomEvent); - std::memcpy(RecordPtr, Event, ReducedEventSize); + std::memcpy(TLD.RecordPtr, &CustomEvent, sizeof(CustomEvent)); + TLD.RecordPtr += sizeof(CustomEvent); + std::memcpy(TLD.RecordPtr, Event, ReducedEventSize); + incrementExtents(MetadataRecSize + EventSize); endBufferIfFull(); } XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax, void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT { - if (OptionsSize != sizeof(FDRLoggingOptions)) + if (OptionsSize != sizeof(FDRLoggingOptions)) { + if (__sanitizer::Verbosity()) + Report("Cannot initialize FDR logging; wrong size for options: %d\n", + OptionsSize); return static_cast<XRayLogInitStatus>(__sanitizer::atomic_load( &LoggingStatus, __sanitizer::memory_order_acquire)); + } s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; if (!__sanitizer::atomic_compare_exchange_strong( &LoggingStatus, &CurrentStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZING, - __sanitizer::memory_order_release)) + __sanitizer::memory_order_release)) { + if (__sanitizer::Verbosity()) + Report("Cannot initialize already initialized implementation.\n"); return static_cast<XRayLogInitStatus>(CurrentStatus); + } { __sanitizer::SpinMutexLock Guard(&FDROptionsMutex); @@ -268,12 +315,40 @@ XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax, } bool Success = false; - BQ = std::make_shared<BufferQueue>(BufferSize, BufferMax, Success); + + if (BQ != nullptr) { + delete BQ; + BQ = nullptr; + } + + if (BQ == nullptr) + BQ = new BufferQueue(BufferSize, BufferMax, Success); + if (!Success) { Report("BufferQueue init failed.\n"); + if (BQ != nullptr) { + delete BQ; + BQ = nullptr; + } return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; } + static bool UNUSED Once = [] { + pthread_key_create(&__xray_fdr_internal::Key, +[](void *) { + auto &TLD = __xray_fdr_internal::getThreadLocalData(); + if (TLD.BQ == nullptr) + return; + auto EC = TLD.BQ->releaseBuffer(TLD.Buffer); + if (EC != BufferQueue::ErrorCode::Ok) + Report("At thread exit, failed to release buffer at %p; error=%s\n", + TLD.Buffer.Buffer, BufferQueue::getErrorString(EC)); + }); + return false; + }(); + + // Arg1 handler should go in first to avoid concurrent code accidentally + // falling back to arg0 when it should have ran arg1. + __xray_set_handler_arg1(fdrLoggingHandleArg1); // Install the actual handleArg0 handler after initialising the buffers. __xray_set_handler(fdrLoggingHandleArg0); __xray_set_customevent_handler(fdrLoggingHandleCustomEvent); @@ -281,20 +356,31 @@ XRayLogInitStatus fdrLoggingInit(std::size_t BufferSize, std::size_t BufferMax, __sanitizer::atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZED, __sanitizer::memory_order_release); - Report("XRay FDR init successful.\n"); + + if (__sanitizer::Verbosity()) + Report("XRay FDR init successful.\n"); return XRayLogInitStatus::XRAY_LOG_INITIALIZED; } -} // namespace __xray - -static auto UNUSED Unused = [] { +bool fdrLogDynamicInitializer() XRAY_NEVER_INSTRUMENT { using namespace __xray; - if (flags()->xray_fdr_log) { - XRayLogImpl Impl{ - fdrLoggingInit, fdrLoggingFinalize, fdrLoggingHandleArg0, - fdrLoggingFlush, - }; + XRayLogImpl Impl{ + fdrLoggingInit, + fdrLoggingFinalize, + fdrLoggingHandleArg0, + fdrLoggingFlush, + }; + auto RegistrationResult = __xray_log_register_mode("xray-fdr", Impl); + if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && + __sanitizer::Verbosity()) + Report("Cannot register XRay FDR mode to 'xray-fdr'; error = %d\n", + RegistrationResult); + if (flags()->xray_fdr_log || + !__sanitizer::internal_strcmp(flags()->xray_mode, "xray-fdr")) __xray_set_log_impl(Impl); - } return true; -}(); +} + +} // namespace __xray + +static auto UNUSED Unused = __xray::fdrLogDynamicInitializer(); diff --git a/lib/xray/xray_fdr_logging.h b/lib/xray/xray_fdr_logging.h index 426b54dc78843..1639d550a44c6 100644 --- a/lib/xray/xray_fdr_logging.h +++ b/lib/xray/xray_fdr_logging.h @@ -30,6 +30,7 @@ XRayLogInitStatus fdrLoggingInit(size_t BufferSize, size_t BufferMax, void *Options, size_t OptionsSize); XRayLogInitStatus fdrLoggingFinalize(); void fdrLoggingHandleArg0(int32_t FuncId, XRayEntryType Entry); +void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType Entry, uint64_t Arg1); XRayLogFlushStatus fdrLoggingFlush(); XRayLogInitStatus fdrLoggingReset(); diff --git a/lib/xray/xray_fdr_logging_impl.h b/lib/xray/xray_fdr_logging_impl.h index 4a1d80fd0ebae..59eab55b2573c 100644 --- a/lib/xray/xray_fdr_logging_impl.h +++ b/lib/xray/xray_fdr_logging_impl.h @@ -18,13 +18,13 @@ #define XRAY_XRAY_FDR_LOGGING_IMPL_H #include <cassert> -#include <cstdint> +#include <cstddef> #include <cstring> #include <limits> -#include <memory> -#include <string> +#include <pthread.h> #include <sys/syscall.h> #include <time.h> +#include <type_traits> #include <unistd.h> #include "sanitizer_common/sanitizer_common.h" @@ -52,57 +52,104 @@ __sanitizer::atomic_sint32_t LoggingStatus = { /// cooperation with xray_fdr_logging class, so be careful and think twice. namespace __xray_fdr_internal { -/// Writes the new buffer record and wallclock time that begin a buffer for a -/// thread to MemPtr and increments MemPtr. Bypasses the thread local state -/// machine and writes directly to memory without checks. -static void writeNewBufferPreamble(pid_t Tid, timespec TS, char *&MemPtr); +/// Writes the new buffer record and wallclock time that begin a buffer for the +/// current thread. +static void writeNewBufferPreamble(pid_t Tid, timespec TS); -/// Write a metadata record to switch to a new CPU to MemPtr and increments -/// MemPtr. Bypasses the thread local state machine and writes directly to -/// memory without checks. -static void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC, char *&MemPtr); - -/// Writes an EOB metadata record to MemPtr and increments MemPtr. Bypasses the -/// thread local state machine and writes directly to memory without checks. -static void writeEOBMetadata(char *&MemPtr); - -/// Writes a TSC Wrap metadata record to MemPtr and increments MemPtr. Bypasses -/// the thread local state machine and directly writes to memory without checks. -static void writeTSCWrapMetadata(uint64_t TSC, char *&MemPtr); - -/// Writes a Function Record to MemPtr and increments MemPtr. Bypasses the -/// thread local state machine and writes the function record directly to -/// memory. +/// Writes a Function Record to the buffer associated with the current thread. static void writeFunctionRecord(int FuncId, uint32_t TSCDelta, - XRayEntryType EntryType, char *&MemPtr); + XRayEntryType EntryType); /// Sets up a new buffer in thread_local storage and writes a preamble. The /// wall_clock_reader function is used to populate the WallTimeRecord entry. static void setupNewBuffer(int (*wall_clock_reader)(clockid_t, struct timespec *)); -/// Called to record CPU time for a new CPU within the current thread. -static void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC); - -/// Called to close the buffer when the thread exhausts the buffer or when the -/// thread exits (via a thread local variable destructor). -static void writeEOBMetadata(); - /// TSC Wrap records are written when a TSC delta encoding scheme overflows. static void writeTSCWrapMetadata(uint64_t TSC); -/// Here's where the meat of the processing happens. The writer captures -/// function entry, exit and tail exit points with a time and will create -/// TSCWrap, NewCPUId and Function records as necessary. The writer might -/// walk backward through its buffer and erase trivial functions to avoid -/// polluting the log and may use the buffer queue to obtain or release a -/// buffer. -static void processFunctionHook(int32_t FuncId, XRayEntryType Entry, - uint64_t TSC, unsigned char CPU, - int (*wall_clock_reader)(clockid_t, - struct timespec *), - __sanitizer::atomic_sint32_t &LoggingStatus, - const std::shared_ptr<BufferQueue> &BQ); +// Group together thread-local-data in a struct, then hide it behind a function +// call so that it can be initialized on first use instead of as a global. We +// force the alignment to 64-bytes for x86 cache line alignment, as this +// structure is used in the hot path of implementation. +struct alignas(64) ThreadLocalData { + BufferQueue::Buffer Buffer; + char *RecordPtr = nullptr; + // The number of FunctionEntry records immediately preceding RecordPtr. + uint8_t NumConsecutiveFnEnters = 0; + + // The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit + // records preceding RecordPtr. + uint8_t NumTailCalls = 0; + + // We use a thread_local variable to keep track of which CPUs we've already + // run, and the TSC times for these CPUs. This allows us to stop repeating the + // CPU field in the function records. + // + // We assume that we'll support only 65536 CPUs for x86_64. + uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max(); + uint64_t LastTSC = 0; + uint64_t LastFunctionEntryTSC = 0; + + // Make sure a thread that's ever called handleArg0 has a thread-local + // live reference to the buffer queue for this particular instance of + // FDRLogging, and that we're going to clean it up when the thread exits. + BufferQueue *BQ = nullptr; +}; + +static_assert(std::is_trivially_destructible<ThreadLocalData>::value, + "ThreadLocalData must be trivially destructible"); + +static constexpr auto MetadataRecSize = sizeof(MetadataRecord); +static constexpr auto FunctionRecSize = sizeof(FunctionRecord); + +// Use a global pthread key to identify thread-local data for logging. +static pthread_key_t Key; + +// This function will initialize the thread-local data structure used by the FDR +// logging implementation and return a reference to it. The implementation +// details require a bit of care to maintain. +// +// First, some requirements on the implementation in general: +// +// - XRay handlers should not call any memory allocation routines that may +// delegate to an instrumented implementation. This means functions like +// malloc() and free() should not be called while instrumenting. +// +// - We would like to use some thread-local data initialized on first-use of +// the XRay instrumentation. These allow us to implement unsynchronized +// routines that access resources associated with the thread. +// +// The implementation here uses a few mechanisms that allow us to provide both +// the requirements listed above. We do this by: +// +// 1. Using a thread-local aligned storage buffer for representing the +// ThreadLocalData struct. This data will be uninitialized memory by +// design. +// +// 2. Not requiring a thread exit handler/implementation, keeping the +// thread-local as purely a collection of references/data that do not +// require cleanup. +// +// We're doing this to avoid using a `thread_local` object that has a +// non-trivial destructor, because the C++ runtime might call std::malloc(...) +// to register calls to destructors. Deadlocks may arise when, for example, an +// externally provided malloc implementation is XRay instrumented, and +// initializing the thread-locals involves calling into malloc. A malloc +// implementation that does global synchronization might be holding a lock for a +// critical section, calling a function that might be XRay instrumented (and +// thus in turn calling into malloc by virtue of registration of the +// thread_local's destructor). +static ThreadLocalData &getThreadLocalData() { + static_assert(alignof(ThreadLocalData) >= 64, + "ThreadLocalData must be cache line aligned."); + thread_local ThreadLocalData TLD; + thread_local bool UNUSED ThreadOnce = [] { + pthread_setspecific(Key, &TLD); + return false; + }(); + return TLD; +} //-----------------------------------------------------------------------------| // The rest of the file is implementation. | @@ -113,71 +160,12 @@ static void processFunctionHook(int32_t FuncId, XRayEntryType Entry, namespace { -thread_local BufferQueue::Buffer Buffer; -thread_local char *RecordPtr = nullptr; - -// The number of FunctionEntry records immediately preceding RecordPtr. -thread_local uint8_t NumConsecutiveFnEnters = 0; - -// The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit -// records preceding RecordPtr. -thread_local uint8_t NumTailCalls = 0; - -constexpr auto MetadataRecSize = sizeof(MetadataRecord); -constexpr auto FunctionRecSize = sizeof(FunctionRecord); - -// We use a thread_local variable to keep track of which CPUs we've already -// run, and the TSC times for these CPUs. This allows us to stop repeating the -// CPU field in the function records. -// -// We assume that we'll support only 65536 CPUs for x86_64. -thread_local uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max(); -thread_local uint64_t LastTSC = 0; -thread_local uint64_t LastFunctionEntryTSC = 0; - -class ThreadExitBufferCleanup { - std::shared_ptr<BufferQueue> &Buffers; - BufferQueue::Buffer &Buffer; - -public: - explicit ThreadExitBufferCleanup(std::shared_ptr<BufferQueue> &BQ, - BufferQueue::Buffer &Buffer) - XRAY_NEVER_INSTRUMENT : Buffers(BQ), - Buffer(Buffer) {} - - ~ThreadExitBufferCleanup() noexcept XRAY_NEVER_INSTRUMENT { - if (RecordPtr == nullptr) - return; - - // We make sure that upon exit, a thread will write out the EOB - // MetadataRecord in the thread-local log, and also release the buffer to - // the queue. - assert((RecordPtr + MetadataRecSize) - static_cast<char *>(Buffer.Buffer) >= - static_cast<ptrdiff_t>(MetadataRecSize)); - if (Buffers) { - writeEOBMetadata(); - auto EC = Buffers->releaseBuffer(Buffer); - if (EC != BufferQueue::ErrorCode::Ok) - Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer, - BufferQueue::getErrorString(EC)); - Buffers = nullptr; - return; - } - } -}; - -// Make sure a thread that's ever called handleArg0 has a thread-local -// live reference to the buffer queue for this particular instance of -// FDRLogging, and that we're going to clean it up when the thread exits. -thread_local std::shared_ptr<BufferQueue> LocalBQ = nullptr; -thread_local ThreadExitBufferCleanup Cleanup(LocalBQ, Buffer); - class RecursionGuard { - bool &Running; + volatile bool &Running; const bool Valid; public: - explicit RecursionGuard(bool &R) : Running(R), Valid(!R) { + explicit RecursionGuard(volatile bool &R) : Running(R), Valid(!R) { if (Valid) Running = true; } @@ -195,34 +183,29 @@ public: } }; -inline bool loggingInitialized( - const __sanitizer::atomic_sint32_t &LoggingStatus) XRAY_NEVER_INSTRUMENT { - return __sanitizer::atomic_load(&LoggingStatus, - __sanitizer::memory_order_acquire) == - XRayLogInitStatus::XRAY_LOG_INITIALIZED; -} - } // namespace -inline void writeNewBufferPreamble(pid_t Tid, timespec TS, - char *&MemPtr) XRAY_NEVER_INSTRUMENT { +static void writeNewBufferPreamble(pid_t Tid, + timespec TS) XRAY_NEVER_INSTRUMENT { static constexpr int InitRecordsCount = 2; - std::aligned_storage<sizeof(MetadataRecord)>::type Records[InitRecordsCount]; + auto &TLD = getThreadLocalData(); + MetadataRecord Metadata[InitRecordsCount]; { // Write out a MetadataRecord to signify that this is the start of a new // buffer, associated with a particular thread, with a new CPU. For the // data, we have 15 bytes to squeeze as much information as we can. At this // point we only write down the following bytes: // - Thread ID (pid_t, 4 bytes) - auto &NewBuffer = *reinterpret_cast<MetadataRecord *>(&Records[0]); + auto &NewBuffer = Metadata[0]; NewBuffer.Type = uint8_t(RecordType::Metadata); NewBuffer.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewBuffer); std::memcpy(&NewBuffer.Data, &Tid, sizeof(pid_t)); } + // Also write the WalltimeMarker record. { static_assert(sizeof(time_t) <= 8, "time_t needs to be at most 8 bytes"); - auto &WalltimeMarker = *reinterpret_cast<MetadataRecord *>(&Records[1]); + auto &WalltimeMarker = Metadata[1]; WalltimeMarker.Type = uint8_t(RecordType::Metadata); WalltimeMarker.RecordKind = uint8_t(MetadataRecord::RecordKinds::WalltimeMarker); @@ -235,26 +218,48 @@ inline void writeNewBufferPreamble(pid_t Tid, timespec TS, std::memcpy(WalltimeMarker.Data, &Seconds, sizeof(Seconds)); std::memcpy(WalltimeMarker.Data + sizeof(Seconds), &Micros, sizeof(Micros)); } - std::memcpy(MemPtr, Records, sizeof(MetadataRecord) * InitRecordsCount); - MemPtr += sizeof(MetadataRecord) * InitRecordsCount; - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; + + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; + if (TLD.BQ == nullptr || TLD.BQ->finalizing()) + return; + std::memcpy(TLD.RecordPtr, Metadata, sizeof(Metadata)); + TLD.RecordPtr += sizeof(Metadata); + // Since we write out the extents as the first metadata record of the + // buffer, we need to write out the extents including the extents record. + __sanitizer::atomic_store(&TLD.Buffer.Extents->Size, sizeof(Metadata), + __sanitizer::memory_order_release); } inline void setupNewBuffer(int (*wall_clock_reader)( clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT { - RecordPtr = static_cast<char *>(Buffer.Buffer); + auto &TLD = getThreadLocalData(); + auto &B = TLD.Buffer; + TLD.RecordPtr = static_cast<char *>(B.Buffer); pid_t Tid = syscall(SYS_gettid); timespec TS{0, 0}; // This is typically clock_gettime, but callers have injection ability. wall_clock_reader(CLOCK_MONOTONIC, &TS); - writeNewBufferPreamble(Tid, TS, RecordPtr); - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; + writeNewBufferPreamble(Tid, TS); + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; +} + +static void incrementExtents(size_t Add) { + auto &TLD = getThreadLocalData(); + __sanitizer::atomic_fetch_add(&TLD.Buffer.Extents->Size, Add, + __sanitizer::memory_order_acq_rel); +} + +static void decrementExtents(size_t Subtract) { + auto &TLD = getThreadLocalData(); + __sanitizer::atomic_fetch_sub(&TLD.Buffer.Extents->Size, Subtract, + __sanitizer::memory_order_acq_rel); } -inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC, - char *&MemPtr) XRAY_NEVER_INSTRUMENT { +inline void writeNewCPUIdMetadata(uint16_t CPU, + uint64_t TSC) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); MetadataRecord NewCPUId; NewCPUId.Type = uint8_t(RecordType::Metadata); NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId); @@ -265,34 +270,15 @@ inline void writeNewCPUIdMetadata(uint16_t CPU, uint64_t TSC, // Total = 10 bytes. std::memcpy(&NewCPUId.Data, &CPU, sizeof(CPU)); std::memcpy(&NewCPUId.Data[sizeof(CPU)], &TSC, sizeof(TSC)); - std::memcpy(MemPtr, &NewCPUId, sizeof(MetadataRecord)); - MemPtr += sizeof(MetadataRecord); - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; -} - -inline void writeNewCPUIdMetadata(uint16_t CPU, - uint64_t TSC) XRAY_NEVER_INSTRUMENT { - writeNewCPUIdMetadata(CPU, TSC, RecordPtr); -} - -inline void writeEOBMetadata(char *&MemPtr) XRAY_NEVER_INSTRUMENT { - MetadataRecord EOBMeta; - EOBMeta.Type = uint8_t(RecordType::Metadata); - EOBMeta.RecordKind = uint8_t(MetadataRecord::RecordKinds::EndOfBuffer); - // For now we don't write any bytes into the Data field. - std::memcpy(MemPtr, &EOBMeta, sizeof(MetadataRecord)); - MemPtr += sizeof(MetadataRecord); - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; -} - -inline void writeEOBMetadata() XRAY_NEVER_INSTRUMENT { - writeEOBMetadata(RecordPtr); + std::memcpy(TLD.RecordPtr, &NewCPUId, sizeof(MetadataRecord)); + TLD.RecordPtr += sizeof(MetadataRecord); + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; + incrementExtents(sizeof(MetadataRecord)); } -inline void writeTSCWrapMetadata(uint64_t TSC, - char *&MemPtr) XRAY_NEVER_INSTRUMENT { +inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); MetadataRecord TSCWrap; TSCWrap.Type = uint8_t(RecordType::Metadata); TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap); @@ -301,58 +287,67 @@ inline void writeTSCWrapMetadata(uint64_t TSC, // - Full TSC (uint64_t, 8 bytes) // Total = 8 bytes. std::memcpy(&TSCWrap.Data, &TSC, sizeof(TSC)); - std::memcpy(MemPtr, &TSCWrap, sizeof(MetadataRecord)); - MemPtr += sizeof(MetadataRecord); - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; + std::memcpy(TLD.RecordPtr, &TSCWrap, sizeof(MetadataRecord)); + TLD.RecordPtr += sizeof(MetadataRecord); + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; + incrementExtents(sizeof(MetadataRecord)); } -inline void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT { - writeTSCWrapMetadata(TSC, RecordPtr); +// Call Argument metadata records store the arguments to a function in the +// order of their appearance; holes are not supported by the buffer format. +static inline void writeCallArgumentMetadata(uint64_t A) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); + MetadataRecord CallArg; + CallArg.Type = uint8_t(RecordType::Metadata); + CallArg.RecordKind = uint8_t(MetadataRecord::RecordKinds::CallArgument); + + std::memcpy(CallArg.Data, &A, sizeof(A)); + std::memcpy(TLD.RecordPtr, &CallArg, sizeof(MetadataRecord)); + TLD.RecordPtr += sizeof(MetadataRecord); + incrementExtents(sizeof(MetadataRecord)); } -inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta, - XRayEntryType EntryType, - char *&MemPtr) XRAY_NEVER_INSTRUMENT { - std::aligned_storage<sizeof(FunctionRecord), alignof(FunctionRecord)>::type - AlignedFuncRecordBuffer; - auto &FuncRecord = - *reinterpret_cast<FunctionRecord *>(&AlignedFuncRecordBuffer); +static inline void +writeFunctionRecord(int FuncId, uint32_t TSCDelta, + XRayEntryType EntryType) XRAY_NEVER_INSTRUMENT { + FunctionRecord FuncRecord; FuncRecord.Type = uint8_t(RecordType::Function); // Only take 28 bits of the function id. FuncRecord.FuncId = FuncId & ~(0x0F << 28); FuncRecord.TSCDelta = TSCDelta; + auto &TLD = getThreadLocalData(); switch (EntryType) { case XRayEntryType::ENTRY: - ++NumConsecutiveFnEnters; + ++TLD.NumConsecutiveFnEnters; FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); break; case XRayEntryType::LOG_ARGS_ENTRY: // We should not rewind functions with logged args. - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); break; case XRayEntryType::EXIT: // If we've decided to log the function exit, we will never erase the log // before it. - NumConsecutiveFnEnters = 0; - NumTailCalls = 0; + TLD.NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit); break; case XRayEntryType::TAIL: // If we just entered the function we're tail exiting from or erased every // invocation since then, this function entry tail pair is a candidate to // be erased when the child function exits. - if (NumConsecutiveFnEnters > 0) { - ++NumTailCalls; - NumConsecutiveFnEnters = 0; + if (TLD.NumConsecutiveFnEnters > 0) { + ++TLD.NumTailCalls; + TLD.NumConsecutiveFnEnters = 0; } else { // We will never be able to erase this tail call since we have logged // something in between the function entry and tail exit. - NumTailCalls = 0; - NumConsecutiveFnEnters = 0; + TLD.NumTailCalls = 0; + TLD.NumConsecutiveFnEnters = 0; } FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionTailExit); @@ -370,8 +365,9 @@ inline void writeFunctionRecord(int FuncId, uint32_t TSCDelta, } } - std::memcpy(MemPtr, &AlignedFuncRecordBuffer, sizeof(FunctionRecord)); - MemPtr += sizeof(FunctionRecord); + std::memcpy(TLD.RecordPtr, &FuncRecord, sizeof(FunctionRecord)); + TLD.RecordPtr += sizeof(FunctionRecord); + incrementExtents(sizeof(FunctionRecord)); } static uint64_t thresholdTicks() { @@ -387,23 +383,21 @@ static uint64_t thresholdTicks() { // "Function Entry" record and any "Tail Call Exit" records after that. static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC, uint64_t &LastFunctionEntryTSC, int32_t FuncId) { - using AlignedFuncStorage = - std::aligned_storage<sizeof(FunctionRecord), - alignof(FunctionRecord)>::type; - RecordPtr -= FunctionRecSize; - AlignedFuncStorage AlignedFuncRecordBuffer; - const auto &FuncRecord = *reinterpret_cast<FunctionRecord *>( - std::memcpy(&AlignedFuncRecordBuffer, RecordPtr, FunctionRecSize)); + auto &TLD = getThreadLocalData(); + TLD.RecordPtr -= FunctionRecSize; + decrementExtents(FunctionRecSize); + FunctionRecord FuncRecord; + std::memcpy(&FuncRecord, TLD.RecordPtr, FunctionRecSize); assert(FuncRecord.RecordKind == uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && "Expected to find function entry recording when rewinding."); assert(FuncRecord.FuncId == (FuncId & ~(0x0F << 28)) && "Expected matching function id when rewinding Exit"); - --NumConsecutiveFnEnters; + --TLD.NumConsecutiveFnEnters; LastTSC -= FuncRecord.TSCDelta; // We unwound one call. Update the state and return without writing a log. - if (NumConsecutiveFnEnters != 0) { + if (TLD.NumConsecutiveFnEnters != 0) { LastFunctionEntryTSC -= FuncRecord.TSCDelta; return; } @@ -413,22 +407,19 @@ static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC, // exited from via this exit. LastFunctionEntryTSC = 0; auto RewindingTSC = LastTSC; - auto RewindingRecordPtr = RecordPtr - FunctionRecSize; - while (NumTailCalls > 0) { - AlignedFuncStorage TailExitRecordBuffer; + auto RewindingRecordPtr = TLD.RecordPtr - FunctionRecSize; + while (TLD.NumTailCalls > 0) { // Rewind the TSC back over the TAIL EXIT record. - const auto &ExpectedTailExit = - *reinterpret_cast<FunctionRecord *>(std::memcpy( - &TailExitRecordBuffer, RewindingRecordPtr, FunctionRecSize)); + FunctionRecord ExpectedTailExit; + std::memcpy(&ExpectedTailExit, RewindingRecordPtr, FunctionRecSize); assert(ExpectedTailExit.RecordKind == uint8_t(FunctionRecord::RecordKinds::FunctionTailExit) && "Expected to find tail exit when rewinding."); RewindingRecordPtr -= FunctionRecSize; RewindingTSC -= ExpectedTailExit.TSCDelta; - AlignedFuncStorage FunctionEntryBuffer; - const auto &ExpectedFunctionEntry = *reinterpret_cast<FunctionRecord *>( - std::memcpy(&FunctionEntryBuffer, RewindingRecordPtr, FunctionRecSize)); + FunctionRecord ExpectedFunctionEntry; + std::memcpy(&ExpectedFunctionEntry, RewindingRecordPtr, FunctionRecSize); assert(ExpectedFunctionEntry.RecordKind == uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && "Expected to find function entry when rewinding tail call."); @@ -437,80 +428,87 @@ static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC, // This tail call exceeded the threshold duration. It will not be erased. if ((TSC - RewindingTSC) >= thresholdTicks()) { - NumTailCalls = 0; + TLD.NumTailCalls = 0; return; } // We can erase a tail exit pair that we're exiting through since // its duration is under threshold. - --NumTailCalls; + --TLD.NumTailCalls; RewindingRecordPtr -= FunctionRecSize; RewindingTSC -= ExpectedFunctionEntry.TSCDelta; - RecordPtr -= 2 * FunctionRecSize; + TLD.RecordPtr -= 2 * FunctionRecSize; LastTSC = RewindingTSC; + decrementExtents(2 * FunctionRecSize); } } -inline bool releaseThreadLocalBuffer(BufferQueue *BQ) { - auto EC = BQ->releaseBuffer(Buffer); +inline bool releaseThreadLocalBuffer(BufferQueue &BQArg) { + auto &TLD = getThreadLocalData(); + auto EC = BQArg.releaseBuffer(TLD.Buffer); if (EC != BufferQueue::ErrorCode::Ok) { - Report("Failed to release buffer at %p; error=%s\n", Buffer.Buffer, + Report("Failed to release buffer at %p; error=%s\n", TLD.Buffer.Buffer, BufferQueue::getErrorString(EC)); return false; } return true; } -inline bool prepareBuffer(int (*wall_clock_reader)(clockid_t, +inline bool prepareBuffer(uint64_t TSC, unsigned char CPU, + int (*wall_clock_reader)(clockid_t, struct timespec *), size_t MaxSize) XRAY_NEVER_INSTRUMENT { - char *BufferStart = static_cast<char *>(Buffer.Buffer); - if ((RecordPtr + MaxSize) > (BufferStart + Buffer.Size - MetadataRecSize)) { - writeEOBMetadata(); - if (!releaseThreadLocalBuffer(LocalBQ.get())) + auto &TLD = getThreadLocalData(); + char *BufferStart = static_cast<char *>(TLD.Buffer.Buffer); + if ((TLD.RecordPtr + MaxSize) > (BufferStart + TLD.Buffer.Size)) { + if (!releaseThreadLocalBuffer(*TLD.BQ)) return false; - auto EC = LocalBQ->getBuffer(Buffer); + auto EC = TLD.BQ->getBuffer(TLD.Buffer); if (EC != BufferQueue::ErrorCode::Ok) { Report("Failed to acquire a buffer; error=%s\n", BufferQueue::getErrorString(EC)); return false; } setupNewBuffer(wall_clock_reader); + + // Always write the CPU metadata as the first record in the buffer. + writeNewCPUIdMetadata(CPU, TSC); } return true; } -inline bool isLogInitializedAndReady( - std::shared_ptr<BufferQueue> &LocalBQ, uint64_t TSC, unsigned char CPU, - int (*wall_clock_reader)(clockid_t, - struct timespec *)) XRAY_NEVER_INSTRUMENT { +inline bool +isLogInitializedAndReady(BufferQueue *LBQ, uint64_t TSC, unsigned char CPU, + int (*wall_clock_reader)(clockid_t, struct timespec *)) + XRAY_NEVER_INSTRUMENT { // Bail out right away if logging is not initialized yet. // We should take the opportunity to release the buffer though. auto Status = __sanitizer::atomic_load(&LoggingStatus, __sanitizer::memory_order_acquire); + auto &TLD = getThreadLocalData(); if (Status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) { - if (RecordPtr != nullptr && + if (TLD.RecordPtr != nullptr && (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) { - writeEOBMetadata(); - if (!releaseThreadLocalBuffer(LocalBQ.get())) + if (!releaseThreadLocalBuffer(*LBQ)) return false; - RecordPtr = nullptr; - LocalBQ = nullptr; + TLD.RecordPtr = nullptr; return false; } return false; } - if (!loggingInitialized(LoggingStatus) || LocalBQ->finalizing()) { - writeEOBMetadata(); - if (!releaseThreadLocalBuffer(LocalBQ.get())) + if (__sanitizer::atomic_load(&LoggingStatus, + __sanitizer::memory_order_acquire) != + XRayLogInitStatus::XRAY_LOG_INITIALIZED || + LBQ->finalizing()) { + if (!releaseThreadLocalBuffer(*LBQ)) return false; - RecordPtr = nullptr; + TLD.RecordPtr = nullptr; } - if (Buffer.Buffer == nullptr) { - auto EC = LocalBQ->getBuffer(Buffer); + if (TLD.Buffer.Buffer == nullptr) { + auto EC = LBQ->getBuffer(TLD.Buffer); if (EC != BufferQueue::ErrorCode::Ok) { auto LS = __sanitizer::atomic_load(&LoggingStatus, __sanitizer::memory_order_acquire); @@ -522,51 +520,100 @@ inline bool isLogInitializedAndReady( } setupNewBuffer(wall_clock_reader); + + // Always write the CPU metadata as the first record in the buffer. + writeNewCPUIdMetadata(CPU, TSC); } - if (CurrentCPU == std::numeric_limits<uint16_t>::max()) { + if (TLD.CurrentCPU == std::numeric_limits<uint16_t>::max()) { // This means this is the first CPU this thread has ever run on. We set // the current CPU and record this as the first TSC we've seen. - CurrentCPU = CPU; + TLD.CurrentCPU = CPU; writeNewCPUIdMetadata(CPU, TSC); } return true; } // namespace __xray_fdr_internal +// Compute the TSC difference between the time of measurement and the previous +// event. There are a few interesting situations we need to account for: +// +// - The thread has migrated to a different CPU. If this is the case, then +// we write down the following records: +// +// 1. A 'NewCPUId' Metadata record. +// 2. A FunctionRecord with a 0 for the TSCDelta field. +// +// - The TSC delta is greater than the 32 bits we can store in a +// FunctionRecord. In this case we write down the following records: +// +// 1. A 'TSCWrap' Metadata record. +// 2. A FunctionRecord with a 0 for the TSCDelta field. +// +// - The TSC delta is representable within the 32 bits we can store in a +// FunctionRecord. In this case we write down just a FunctionRecord with +// the correct TSC delta. +inline uint32_t writeCurrentCPUTSC(ThreadLocalData &TLD, uint64_t TSC, + uint8_t CPU) { + if (CPU != TLD.CurrentCPU) { + // We've moved to a new CPU. + writeNewCPUIdMetadata(CPU, TSC); + return 0; + } + // If the delta is greater than the range for a uint32_t, then we write out + // the TSC wrap metadata entry with the full TSC, and the TSC for the + // function record be 0. + uint64_t Delta = TSC - TLD.LastTSC; + if (Delta <= std::numeric_limits<uint32_t>::max()) + return Delta; + + writeTSCWrapMetadata(TSC); + return 0; +} + inline void endBufferIfFull() XRAY_NEVER_INSTRUMENT { - auto BufferStart = static_cast<char *>(Buffer.Buffer); - if ((RecordPtr + MetadataRecSize) - BufferStart == MetadataRecSize) { - writeEOBMetadata(); - if (!releaseThreadLocalBuffer(LocalBQ.get())) + auto &TLD = getThreadLocalData(); + auto BufferStart = static_cast<char *>(TLD.Buffer.Buffer); + if ((TLD.RecordPtr + MetadataRecSize) - BufferStart <= + ptrdiff_t{MetadataRecSize}) { + if (!releaseThreadLocalBuffer(*TLD.BQ)) return; - RecordPtr = nullptr; + TLD.RecordPtr = nullptr; } } -inline void processFunctionHook( - int32_t FuncId, XRayEntryType Entry, uint64_t TSC, unsigned char CPU, - int (*wall_clock_reader)(clockid_t, struct timespec *), - __sanitizer::atomic_sint32_t &LoggingStatus, - const std::shared_ptr<BufferQueue> &BQ) XRAY_NEVER_INSTRUMENT { +thread_local volatile bool Running = false; + +/// Here's where the meat of the processing happens. The writer captures +/// function entry, exit and tail exit points with a time and will create +/// TSCWrap, NewCPUId and Function records as necessary. The writer might +/// walk backward through its buffer and erase trivial functions to avoid +/// polluting the log and may use the buffer queue to obtain or release a +/// buffer. +inline void processFunctionHook(int32_t FuncId, XRayEntryType Entry, + uint64_t TSC, unsigned char CPU, uint64_t Arg1, + int (*wall_clock_reader)(clockid_t, + struct timespec *), + BufferQueue *BQ) XRAY_NEVER_INSTRUMENT { // Prevent signal handler recursion, so in case we're already in a log writing // mode and the signal handler comes in (and is also instrumented) then we // don't want to be clobbering potentially partial writes already happening in // the thread. We use a simple thread_local latch to only allow one on-going // handleArg0 to happen at any given time. - thread_local bool Running = false; RecursionGuard Guard{Running}; if (!Guard) { assert(Running == true && "RecursionGuard is buggy!"); return; } + auto &TLD = getThreadLocalData(); + // In case the reference has been cleaned up before, we make sure we // initialize it to the provided BufferQueue. - if (LocalBQ == nullptr) - LocalBQ = BQ; + if (TLD.BQ == nullptr) + TLD.BQ = BQ; - if (!isLogInitializedAndReady(LocalBQ, TSC, CPU, wall_clock_reader)) + if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, wall_clock_reader)) return; // Before we go setting up writing new function entries, we need to be really @@ -579,10 +626,10 @@ inline void processFunctionHook( // - The least number of bytes we will ever write is 8 // (sizeof(FunctionRecord)) only if the delta between the previous entry // and this entry is within 32 bits. - // - The most number of bytes we will ever write is 8 + 16 = 24. This is - // computed by: + // - The most number of bytes we will ever write is 8 + 16 + 16 = 40. + // This is computed by: // - // sizeof(FunctionRecord) + sizeof(MetadataRecord) + // MaxSize = sizeof(FunctionRecord) + 2 * sizeof(MetadataRecord) // // These arise in the following cases: // @@ -596,77 +643,39 @@ inline void processFunctionHook( // FunctionRecord. // 3. When we learn about a new CPU ID, we need to write down a "new cpu // id" MetadataRecord before writing out the actual FunctionRecord. + // 4. The second MetadataRecord is the optional function call argument. // - // - An End-of-Buffer (EOB) MetadataRecord is 16 bytes. - // - // So the math we need to do is to determine whether writing 24 bytes past the - // current pointer leaves us with enough bytes to write the EOB - // MetadataRecord. If we don't have enough space after writing as much as 24 - // bytes in the end of the buffer, we need to write out the EOB, get a new - // Buffer, set it up properly before doing any further writing. - // - if (!prepareBuffer(wall_clock_reader, FunctionRecSize + MetadataRecSize)) { - LocalBQ = nullptr; + // So the math we need to do is to determine whether writing 40 bytes past the + // current pointer exceeds the buffer's maximum size. If we don't have enough + // space to write 40 bytes in the buffer, we need get a new Buffer, set it up + // properly before doing any further writing. + size_t MaxSize = FunctionRecSize + 2 * MetadataRecSize; + if (!prepareBuffer(TSC, CPU, wall_clock_reader, MaxSize)) { + TLD.BQ = nullptr; return; } - // By this point, we are now ready to write at most 24 bytes (one metadata - // record and one function record). - assert((RecordPtr + (MetadataRecSize + FunctionRecSize)) - - static_cast<char *>(Buffer.Buffer) >= + // By this point, we are now ready to write up to 40 bytes (explained above). + assert((TLD.RecordPtr + MaxSize) - static_cast<char *>(TLD.Buffer.Buffer) >= static_cast<ptrdiff_t>(MetadataRecSize) && "Misconfigured BufferQueue provided; Buffer size not large enough."); - // Here we compute the TSC Delta. There are a few interesting situations we - // need to account for: - // - // - The thread has migrated to a different CPU. If this is the case, then - // we write down the following records: - // - // 1. A 'NewCPUId' Metadata record. - // 2. A FunctionRecord with a 0 for the TSCDelta field. - // - // - The TSC delta is greater than the 32 bits we can store in a - // FunctionRecord. In this case we write down the following records: - // - // 1. A 'TSCWrap' Metadata record. - // 2. A FunctionRecord with a 0 for the TSCDelta field. - // - // - The TSC delta is representable within the 32 bits we can store in a - // FunctionRecord. In this case we write down just a FunctionRecord with - // the correct TSC delta. - // - uint32_t RecordTSCDelta = 0; - if (CPU != CurrentCPU) { - // We've moved to a new CPU. - writeNewCPUIdMetadata(CPU, TSC); - } else { - // If the delta is greater than the range for a uint32_t, then we write out - // the TSC wrap metadata entry with the full TSC, and the TSC for the - // function record be 0. - auto Delta = TSC - LastTSC; - if (Delta > (1ULL << 32) - 1) - writeTSCWrapMetadata(TSC); - else - RecordTSCDelta = Delta; - } - - LastTSC = TSC; - CurrentCPU = CPU; + auto RecordTSCDelta = writeCurrentCPUTSC(TLD, TSC, CPU); + TLD.LastTSC = TSC; + TLD.CurrentCPU = CPU; switch (Entry) { case XRayEntryType::ENTRY: case XRayEntryType::LOG_ARGS_ENTRY: // Update the thread local state for the next invocation. - LastFunctionEntryTSC = TSC; + TLD.LastFunctionEntryTSC = TSC; break; case XRayEntryType::TAIL: - break; case XRayEntryType::EXIT: // Break out and write the exit record if we can't erase any functions. - if (NumConsecutiveFnEnters == 0 || - (TSC - LastFunctionEntryTSC) >= thresholdTicks()) + if (TLD.NumConsecutiveFnEnters == 0 || + (TSC - TLD.LastFunctionEntryTSC) >= thresholdTicks()) break; - rewindRecentCall(TSC, LastTSC, LastFunctionEntryTSC, FuncId); + rewindRecentCall(TSC, TLD.LastTSC, TLD.LastFunctionEntryTSC, FuncId); return; // without writing log. case XRayEntryType::CUSTOM_EVENT: { // This is a bug in patching, so we'll report it once and move on. @@ -681,7 +690,9 @@ inline void processFunctionHook( } } - writeFunctionRecord(FuncId, RecordTSCDelta, Entry, RecordPtr); + writeFunctionRecord(FuncId, RecordTSCDelta, Entry); + if (Entry == XRayEntryType::LOG_ARGS_ENTRY) + writeCallArgumentMetadata(Arg1); // If we've exhausted the buffer by this time, we then release the buffer to // make sure that other threads may start using this buffer. diff --git a/lib/xray/xray_flags.h b/lib/xray/xray_flags.h index f4e30283b8de6..3ed5b8844cb46 100644 --- a/lib/xray/xray_flags.h +++ b/lib/xray/xray_flags.h @@ -16,6 +16,7 @@ #define XRAY_FLAGS_H #include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_internal_defs.h" namespace __xray { diff --git a/lib/xray/xray_flags.inc b/lib/xray/xray_flags.inc index 7ddce78eb413e..29f1fce7d7f4d 100644 --- a/lib/xray/xray_flags.inc +++ b/lib/xray/xray_flags.inc @@ -16,12 +16,34 @@ XRAY_FLAG(bool, patch_premain, false, "Whether to patch instrumentation points before main.") -XRAY_FLAG(bool, xray_naive_log, true, - "Whether to install the naive log implementation.") XRAY_FLAG(const char *, xray_logfile_base, "xray-log.", "Filename base for the xray logfile.") +XRAY_FLAG(const char *, xray_mode, "", "Mode to install by default.") +XRAY_FLAG(uptr, xray_page_size_override, 0, + "Override the default page size for the system, in bytes. The size " + "should be a power-of-two.") + +// Basic (Naive) Mode logging options. +XRAY_FLAG(bool, xray_naive_log, false, + "DEPRECATED: Use xray_mode=xray-basic instead.") +XRAY_FLAG(int, xray_naive_log_func_duration_threshold_us, 5, + "Naive logging will try to skip functions that execute for fewer " + "microseconds than this threshold.") +XRAY_FLAG(int, xray_naive_log_max_stack_depth, 64, + "Naive logging will keep track of at most this deep a call stack, " + "any more and the recordings will be droppped.") +XRAY_FLAG(int, xray_naive_log_thread_buffer_size, 1024, + "The number of entries to keep on a per-thread buffer.") + +// FDR (Flight Data Recorder) Mode logging options. XRAY_FLAG(bool, xray_fdr_log, false, - "Whether to install the flight data recorder logging implementation.") + "DEPRECATED: Use xray_mode=xray-fdr instead.") XRAY_FLAG(int, xray_fdr_log_func_duration_threshold_us, 5, "FDR logging will try to skip functions that execute for fewer " "microseconds than this threshold.") +XRAY_FLAG(int, xray_fdr_log_grace_period_us, 0, + "DEPRECATED: use xray_fdr_log_grace_period_ms instead.") +XRAY_FLAG(int, xray_fdr_log_grace_period_ms, 100, + "FDR logging will wait this much time in microseconds before " + "actually flushing the log; this gives a chance for threads to " + "notice that the log has been finalized and clean up.") diff --git a/lib/xray/xray_init.cc b/lib/xray/xray_init.cc index aa660baa99206..11892cb8b7a31 100644 --- a/lib/xray/xray_init.cc +++ b/lib/xray/xray_init.cc @@ -44,12 +44,31 @@ __sanitizer::atomic_uint8_t XRayInitialized{0}; __sanitizer::SpinMutex XRayInstrMapMutex; XRaySledMap XRayInstrMap; +// Global flag to determine whether the flags have been initialized. +__sanitizer::atomic_uint8_t XRayFlagsInitialized{0}; + +// A mutex to allow only one thread to initialize the XRay data structures. +__sanitizer::SpinMutex XRayInitMutex; + // __xray_init() will do the actual loading of the current process' memory map // and then proceed to look for the .xray_instr_map section/segment. void __xray_init() XRAY_NEVER_INSTRUMENT { - initializeFlags(); + __sanitizer::SpinMutexLock Guard(&XRayInitMutex); + // Short-circuit if we've already initialized XRay before. + if (__sanitizer::atomic_load(&XRayInitialized, + __sanitizer::memory_order_acquire)) + return; + + if (!__sanitizer::atomic_load(&XRayFlagsInitialized, + __sanitizer::memory_order_acquire)) { + initializeFlags(); + __sanitizer::atomic_store(&XRayFlagsInitialized, true, + __sanitizer::memory_order_release); + } + if (__start_xray_instr_map == nullptr) { - Report("XRay instrumentation map missing. Not initializing XRay.\n"); + if (Verbosity()) + Report("XRay instrumentation map missing. Not initializing XRay.\n"); return; } @@ -63,9 +82,21 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { __sanitizer::atomic_store(&XRayInitialized, true, __sanitizer::memory_order_release); +#ifndef XRAY_NO_PREINIT if (flags()->patch_premain) __xray_patch(); +#endif } +#if !defined(XRAY_NO_PREINIT) && SANITIZER_CAN_USE_PREINIT_ARRAY +// Only add the preinit array initialization if the sanitizers can. __attribute__((section(".preinit_array"), used)) void (*__local_xray_preinit)(void) = __xray_init; +#else +// If we cannot use the .preinit_array section, we should instead use dynamic +// initialisation. +static bool UNUSED __local_xray_dyninit = [] { + __xray_init(); + return true; +}(); +#endif diff --git a/lib/xray/xray_inmemory_log.cc b/lib/xray/xray_inmemory_log.cc index 83aecfaf7700a..a27ffbcbd12ee 100644 --- a/lib/xray/xray_inmemory_log.cc +++ b/lib/xray/xray_inmemory_log.cc @@ -16,80 +16,85 @@ //===----------------------------------------------------------------------===// #include <cassert> +#include <cstring> +#include <errno.h> #include <fcntl.h> -#include <mutex> +#include <pthread.h> #include <sys/stat.h> #include <sys/syscall.h> #include <sys/types.h> -#include <thread> +#include <time.h> #include <unistd.h> +#include "sanitizer_common/sanitizer_allocator_internal.h" #include "sanitizer_common/sanitizer_libc.h" #include "xray/xray_records.h" #include "xray_defs.h" #include "xray_flags.h" +#include "xray_inmemory_log.h" #include "xray_interface_internal.h" #include "xray_tsc.h" #include "xray_utils.h" -// __xray_InMemoryRawLog will use a thread-local aligned buffer capped to a -// certain size (32kb by default) and use it as if it were a circular buffer for -// events. We store simple fixed-sized entries in the log for external analysis. +namespace __xray { -extern "C" { -void __xray_InMemoryRawLog(int32_t FuncId, - XRayEntryType Type) XRAY_NEVER_INSTRUMENT; -} +__sanitizer::SpinMutex LogMutex; -namespace __xray { +// We use elements of this type to record the entry TSC of every function ID we +// see as we're tracing a particular thread's execution. +struct alignas(16) StackEntry { + int32_t FuncId; + uint16_t Type; + uint8_t CPU; + uint8_t Padding; + uint64_t TSC; +}; -std::mutex LogMutex; - -class ThreadExitFlusher { - int Fd; - XRayRecord *Start; - size_t &Offset; - -public: - explicit ThreadExitFlusher(int Fd, XRayRecord *Start, - size_t &Offset) XRAY_NEVER_INSTRUMENT - : Fd(Fd), - Start(Start), - Offset(Offset) {} - - ~ThreadExitFlusher() XRAY_NEVER_INSTRUMENT { - std::lock_guard<std::mutex> L(LogMutex); - if (Fd > 0 && Start != nullptr) { - retryingWriteAll(Fd, reinterpret_cast<char *>(Start), - reinterpret_cast<char *>(Start + Offset)); - // Because this thread's exit could be the last one trying to write to the - // file and that we're not able to close out the file properly, we sync - // instead and hope that the pending writes are flushed as the thread - // exits. - fsync(Fd); - } - } +static_assert(sizeof(StackEntry) == 16, "Wrong size for StackEntry"); + +struct alignas(64) ThreadLocalData { + void *InMemoryBuffer = nullptr; + size_t BufferSize = 0; + size_t BufferOffset = 0; + void *ShadowStack = nullptr; + size_t StackSize = 0; + size_t StackEntries = 0; + int Fd = -1; + pid_t TID = 0; }; -} // namespace __xray +static pthread_key_t PThreadKey; + +static __sanitizer::atomic_uint8_t BasicInitialized{0}; + +BasicLoggingOptions GlobalOptions; + +thread_local volatile bool RecursionGuard = false; -using namespace __xray; +static uint64_t thresholdTicks() XRAY_NEVER_INSTRUMENT { + static uint64_t TicksPerSec = probeRequiredCPUFeatures() + ? getTSCFrequency() + : __xray::NanosecondsPerSecond; + static const uint64_t ThresholdTicks = + TicksPerSec * GlobalOptions.DurationFilterMicros / 1000000; + return ThresholdTicks; +} -static int __xray_OpenLogFile() XRAY_NEVER_INSTRUMENT { +static int openLogFile() XRAY_NEVER_INSTRUMENT { int F = getLogFD(); if (F == -1) return -1; // Test for required CPU features and cache the cycle frequency static bool TSCSupported = probeRequiredCPUFeatures(); - static uint64_t CycleFrequency = TSCSupported ? getTSCFrequency() - : __xray::NanosecondsPerSecond; + static uint64_t CycleFrequency = + TSCSupported ? getTSCFrequency() : __xray::NanosecondsPerSecond; // Since we're here, we get to write the header. We set it up so that the // header will only be written once, at the start, and let the threads // logging do writes which just append. XRayFileHeader Header; - Header.Version = 1; + Header.Version = 2; // Version 2 includes tail exit records. Header.Type = FileTypes::NAIVE_LOG; Header.CycleFrequency = CycleFrequency; @@ -102,47 +107,210 @@ static int __xray_OpenLogFile() XRAY_NEVER_INSTRUMENT { return F; } +int getGlobalFd() XRAY_NEVER_INSTRUMENT { + static int Fd = openLogFile(); + return Fd; +} + +ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { + thread_local ThreadLocalData TLD; + thread_local bool UNUSED TOnce = [] { + if (GlobalOptions.ThreadBufferSize == 0) { + if (__sanitizer::Verbosity()) + Report("Not initializing TLD since ThreadBufferSize == 0.\n"); + return false; + } + TLD.TID = __sanitizer::GetTid(); + pthread_setspecific(PThreadKey, &TLD); + TLD.Fd = getGlobalFd(); + TLD.InMemoryBuffer = reinterpret_cast<XRayRecord *>( + InternalAlloc(sizeof(XRayRecord) * GlobalOptions.ThreadBufferSize, + nullptr, alignof(XRayRecord))); + TLD.BufferSize = GlobalOptions.ThreadBufferSize; + TLD.BufferOffset = 0; + if (GlobalOptions.MaxStackDepth == 0) { + if (__sanitizer::Verbosity()) + Report("Not initializing the ShadowStack since MaxStackDepth == 0.\n"); + TLD.StackSize = 0; + TLD.StackEntries = 0; + TLD.ShadowStack = nullptr; + return false; + } + TLD.ShadowStack = reinterpret_cast<StackEntry *>( + InternalAlloc(sizeof(StackEntry) * GlobalOptions.MaxStackDepth, nullptr, + alignof(StackEntry))); + TLD.StackSize = GlobalOptions.MaxStackDepth; + TLD.StackEntries = 0; + if (__sanitizer::Verbosity() >= 2) { + static auto UNUSED Once = [] { + auto ticks = thresholdTicks(); + Report("Ticks threshold: %d\n", ticks); + return false; + }(); + } + return false; + }(); + return TLD; +} + template <class RDTSC> -void __xray_InMemoryRawLog(int32_t FuncId, XRayEntryType Type, - RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT { - using Buffer = - std::aligned_storage<sizeof(XRayRecord), alignof(XRayRecord)>::type; - static constexpr size_t BuffLen = 1024; - thread_local static Buffer InMemoryBuffer[BuffLen] = {}; - thread_local static size_t Offset = 0; - static int Fd = __xray_OpenLogFile(); +void InMemoryRawLog(int32_t FuncId, XRayEntryType Type, + RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); + auto &InMemoryBuffer = TLD.InMemoryBuffer; + int Fd = getGlobalFd(); if (Fd == -1) return; - thread_local __xray::ThreadExitFlusher Flusher( - Fd, reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer), Offset); - thread_local pid_t TId = syscall(SYS_gettid); - // First we get the useful data, and stuff it into the already aligned buffer - // through a pointer offset. - auto &R = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer)[Offset]; + // Use a simple recursion guard, to handle cases where we're already logging + // and for one reason or another, this function gets called again in the same + // thread. + if (RecursionGuard) + return; + RecursionGuard = true; + auto ExitGuard = __sanitizer::at_scope_exit([] { RecursionGuard = false; }); + + uint8_t CPU = 0; + uint64_t TSC = ReadTSC(CPU); + + switch (Type) { + case XRayEntryType::ENTRY: + case XRayEntryType::LOG_ARGS_ENTRY: { + // Short circuit if we've reached the maximum depth of the stack. + if (TLD.StackEntries++ >= TLD.StackSize) + return; + + // When we encounter an entry event, we keep track of the TSC and the CPU, + // and put it in the stack. + StackEntry E; + E.FuncId = FuncId; + E.CPU = CPU; + E.Type = Type; + E.TSC = TSC; + auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) + + (sizeof(StackEntry) * (TLD.StackEntries - 1)); + __sanitizer::internal_memcpy(StackEntryPtr, &E, sizeof(StackEntry)); + break; + } + case XRayEntryType::EXIT: + case XRayEntryType::TAIL: { + if (TLD.StackEntries == 0) + break; + + if (--TLD.StackEntries >= TLD.StackSize) + return; + + // When we encounter an exit event, we check whether all the following are + // true: + // + // - The Function ID is the same as the most recent entry in the stack. + // - The CPU is the same as the most recent entry in the stack. + // - The Delta of the TSCs is less than the threshold amount of time we're + // looking to record. + // + // If all of these conditions are true, we pop the stack and don't write a + // record and move the record offset back. + StackEntry StackTop; + auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) + + (sizeof(StackEntry) * TLD.StackEntries); + __sanitizer::internal_memcpy(&StackTop, StackEntryPtr, sizeof(StackEntry)); + if (StackTop.FuncId == FuncId && StackTop.CPU == CPU && + StackTop.TSC < TSC) { + auto Delta = TSC - StackTop.TSC; + if (Delta < thresholdTicks()) { + assert(TLD.BufferOffset > 0); + TLD.BufferOffset -= StackTop.Type == XRayEntryType::ENTRY ? 1 : 2; + return; + } + } + break; + } + default: + // Should be unreachable. + assert(false && "Unsupported XRayEntryType encountered."); + break; + } + + // First determine whether the delta between the function's enter record and + // the exit record is higher than the threshold. + __xray::XRayRecord R; R.RecordType = RecordTypes::NORMAL; - R.TSC = ReadTSC(R.CPU); - R.TId = TId; + R.CPU = CPU; + R.TSC = TSC; + R.TId = TLD.TID; R.Type = Type; R.FuncId = FuncId; - ++Offset; - if (Offset == BuffLen) { - std::lock_guard<std::mutex> L(LogMutex); + auto EntryPtr = static_cast<char *>(InMemoryBuffer) + + (sizeof(__xray::XRayRecord) * TLD.BufferOffset); + __sanitizer::internal_memcpy(EntryPtr, &R, sizeof(R)); + if (++TLD.BufferOffset == TLD.BufferSize) { + __sanitizer::SpinMutexLock L(&LogMutex); + auto RecordBuffer = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer); + retryingWriteAll(Fd, reinterpret_cast<char *>(RecordBuffer), + reinterpret_cast<char *>(RecordBuffer + TLD.BufferOffset)); + TLD.BufferOffset = 0; + TLD.StackEntries = 0; + } +} + +template <class RDTSC> +void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1, + RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT { + auto &TLD = getThreadLocalData(); + auto &InMemoryBuffer = TLD.InMemoryBuffer; + auto &Offset = TLD.BufferOffset; + const auto &BuffLen = TLD.BufferSize; + int Fd = getGlobalFd(); + if (Fd == -1) + return; + + // First we check whether there's enough space to write the data consecutively + // in the thread-local buffer. If not, we first flush the buffer before + // attempting to write the two records that must be consecutive. + if (Offset + 2 > BuffLen) { + __sanitizer::SpinMutexLock L(&LogMutex); + auto RecordBuffer = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer); + retryingWriteAll(Fd, reinterpret_cast<char *>(RecordBuffer), + reinterpret_cast<char *>(RecordBuffer + Offset)); + Offset = 0; + TLD.StackEntries = 0; + } + + // Then we write the "we have an argument" record. + InMemoryRawLog(FuncId, Type, ReadTSC); + + if (RecursionGuard) + return; + RecursionGuard = true; + auto ExitGuard = __sanitizer::at_scope_exit([] { RecursionGuard = false; }); + + // And from here on write the arg payload. + __xray::XRayArgPayload R; + R.RecordType = RecordTypes::ARG_PAYLOAD; + R.FuncId = FuncId; + R.TId = TLD.TID; + R.Arg = Arg1; + auto EntryPtr = + &reinterpret_cast<__xray::XRayArgPayload *>(&InMemoryBuffer)[Offset]; + std::memcpy(EntryPtr, &R, sizeof(R)); + if (++Offset == BuffLen) { + __sanitizer::SpinMutexLock L(&LogMutex); auto RecordBuffer = reinterpret_cast<__xray::XRayRecord *>(InMemoryBuffer); retryingWriteAll(Fd, reinterpret_cast<char *>(RecordBuffer), reinterpret_cast<char *>(RecordBuffer + Offset)); Offset = 0; + TLD.StackEntries = 0; } } -void __xray_InMemoryRawLogRealTSC(int32_t FuncId, - XRayEntryType Type) XRAY_NEVER_INSTRUMENT { - __xray_InMemoryRawLog(FuncId, Type, __xray::readTSC); +void basicLoggingHandleArg0RealTSC(int32_t FuncId, + XRayEntryType Type) XRAY_NEVER_INSTRUMENT { + InMemoryRawLog(FuncId, Type, __xray::readTSC); } -void __xray_InMemoryEmulateTSC(int32_t FuncId, - XRayEntryType Type) XRAY_NEVER_INSTRUMENT { - __xray_InMemoryRawLog(FuncId, Type, [](uint8_t &CPU) XRAY_NEVER_INSTRUMENT { +void basicLoggingHandleArg0EmulateTSC(int32_t FuncId, XRayEntryType Type) + XRAY_NEVER_INSTRUMENT { + InMemoryRawLog(FuncId, Type, [](uint8_t &CPU) XRAY_NEVER_INSTRUMENT { timespec TS; int result = clock_gettime(CLOCK_REALTIME, &TS); if (result != 0) { @@ -154,13 +322,150 @@ void __xray_InMemoryEmulateTSC(int32_t FuncId, }); } -static auto UNUSED Unused = [] { - auto UseRealTSC = probeRequiredCPUFeatures(); - if (!UseRealTSC) +void basicLoggingHandleArg1RealTSC(int32_t FuncId, XRayEntryType Type, + uint64_t Arg1) XRAY_NEVER_INSTRUMENT { + InMemoryRawLogWithArg(FuncId, Type, Arg1, __xray::readTSC); +} + +void basicLoggingHandleArg1EmulateTSC(int32_t FuncId, XRayEntryType Type, + uint64_t Arg1) XRAY_NEVER_INSTRUMENT { + InMemoryRawLogWithArg( + FuncId, Type, Arg1, [](uint8_t &CPU) XRAY_NEVER_INSTRUMENT { + timespec TS; + int result = clock_gettime(CLOCK_REALTIME, &TS); + if (result != 0) { + Report("clock_gettimg(2) return %d, errno=%d.", result, int(errno)); + TS = {0, 0}; + } + CPU = 0; + return TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec; + }); +} + +static void TLDDestructor(void *P) XRAY_NEVER_INSTRUMENT { + ThreadLocalData &TLD = *reinterpret_cast<ThreadLocalData *>(P); + auto ExitGuard = __sanitizer::at_scope_exit([&TLD] { + // Clean up dynamic resources. + if (TLD.InMemoryBuffer) + InternalFree(TLD.InMemoryBuffer); + if (TLD.ShadowStack) + InternalFree(TLD.ShadowStack); + if (__sanitizer::Verbosity()) + Report("Cleaned up log for TID: %d\n", TLD.TID); + }); + + if (TLD.Fd == -1 || TLD.BufferOffset == 0) { + if (__sanitizer::Verbosity()) + Report("Skipping buffer for TID: %d; Fd = %d; Offset = %llu\n", TLD.TID, + TLD.Fd, TLD.BufferOffset); + return; + } + + { + __sanitizer::SpinMutexLock L(&LogMutex); + retryingWriteAll(TLD.Fd, reinterpret_cast<char *>(TLD.InMemoryBuffer), + reinterpret_cast<char *>(TLD.InMemoryBuffer) + + (sizeof(__xray::XRayRecord) * TLD.BufferOffset)); + } + + // Because this thread's exit could be the last one trying to write to + // the file and that we're not able to close out the file properly, we + // sync instead and hope that the pending writes are flushed as the + // thread exits. + fsync(TLD.Fd); +} + +XRayLogInitStatus basicLoggingInit(size_t BufferSize, size_t BufferMax, + void *Options, + size_t OptionsSize) XRAY_NEVER_INSTRUMENT { + static bool UNUSED Once = [] { + pthread_key_create(&PThreadKey, TLDDestructor); + return false; + }(); + + uint8_t Expected = 0; + if (!__sanitizer::atomic_compare_exchange_strong( + &BasicInitialized, &Expected, 1, __sanitizer::memory_order_acq_rel)) { + if (__sanitizer::Verbosity()) + Report("Basic logging already initialized.\n"); + return XRayLogInitStatus::XRAY_LOG_INITIALIZED; + } + + if (OptionsSize != sizeof(BasicLoggingOptions)) { + Report("Invalid options size, potential ABI mismatch; expected %d got %d", + sizeof(BasicLoggingOptions), OptionsSize); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + } + + static auto UseRealTSC = probeRequiredCPUFeatures(); + if (!UseRealTSC && __sanitizer::Verbosity()) Report("WARNING: Required CPU features missing for XRay instrumentation, " "using emulation instead.\n"); - if (flags()->xray_naive_log) - __xray_set_handler(UseRealTSC ? __xray_InMemoryRawLogRealTSC - : __xray_InMemoryEmulateTSC); + + GlobalOptions = *reinterpret_cast<BasicLoggingOptions *>(Options); + __xray_set_handler_arg1(UseRealTSC ? basicLoggingHandleArg1RealTSC + : basicLoggingHandleArg1EmulateTSC); + __xray_set_handler(UseRealTSC ? basicLoggingHandleArg0RealTSC + : basicLoggingHandleArg0EmulateTSC); + __xray_remove_customevent_handler(); + + return XRayLogInitStatus::XRAY_LOG_INITIALIZED; +} + +XRayLogInitStatus basicLoggingFinalize() XRAY_NEVER_INSTRUMENT { + uint8_t Expected = 0; + if (!__sanitizer::atomic_compare_exchange_strong( + &BasicInitialized, &Expected, 0, __sanitizer::memory_order_acq_rel) && + __sanitizer::Verbosity()) + Report("Basic logging already finalized.\n"); + + // Nothing really to do aside from marking state of the global to be + // uninitialized. + + return XRayLogInitStatus::XRAY_LOG_FINALIZED; +} + +XRayLogFlushStatus basicLoggingFlush() XRAY_NEVER_INSTRUMENT { + // This really does nothing, since flushing the logs happen at the end of a + // thread's lifetime, or when the buffers are full. + return XRayLogFlushStatus::XRAY_LOG_FLUSHED; +} + +// This is a handler that, effectively, does nothing. +void basicLoggingHandleArg0Empty(int32_t, XRayEntryType) XRAY_NEVER_INSTRUMENT { +} + +bool basicLogDynamicInitializer() XRAY_NEVER_INSTRUMENT { + XRayLogImpl Impl{ + basicLoggingInit, + basicLoggingFinalize, + basicLoggingHandleArg0Empty, + basicLoggingFlush, + }; + auto RegistrationResult = __xray_log_register_mode("xray-basic", Impl); + if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && + __sanitizer::Verbosity()) + Report("Cannot register XRay Basic Mode to 'xray-basic'; error = %d\n", + RegistrationResult); + if (flags()->xray_naive_log || + !__sanitizer::internal_strcmp(flags()->xray_mode, "xray-basic")) { + __xray_set_log_impl(Impl); + BasicLoggingOptions Options; + Options.DurationFilterMicros = + flags()->xray_naive_log_func_duration_threshold_us; + Options.MaxStackDepth = flags()->xray_naive_log_max_stack_depth; + Options.ThreadBufferSize = flags()->xray_naive_log_thread_buffer_size; + __xray_log_init(flags()->xray_naive_log_thread_buffer_size, 0, &Options, + sizeof(BasicLoggingOptions)); + static auto UNUSED Once = [] { + static auto UNUSED &TLD = getThreadLocalData(); + __sanitizer::Atexit(+[] { TLDDestructor(&TLD); }); + return false; + }(); + } return true; -}(); +} + +} // namespace __xray + +static auto UNUSED Unused = __xray::basicLogDynamicInitializer(); diff --git a/lib/xray/xray_inmemory_log.h b/lib/xray/xray_inmemory_log.h new file mode 100644 index 0000000000000..e4fcb8ca5ffdc --- /dev/null +++ b/lib/xray/xray_inmemory_log.h @@ -0,0 +1,44 @@ +//===-- xray_inmemory_log.h +//------------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_XRAY_INMEMORY_LOG_H +#define XRAY_XRAY_INMEMORY_LOG_H + +#include "xray/xray_log_interface.h" + +/// Basic (Naive) Mode +/// ================== +/// +/// This implementation hooks in through the XRay logging implementation +/// framework. The Basic Mode implementation will keep appending to a file as +/// soon as the thread-local buffers are full. It keeps minimal in-memory state +/// and does the minimum filtering required to keep log files smaller. + +namespace __xray { + +XRayLogInitStatus basicLoggingInit(size_t BufferSize, size_t BufferMax, + void *Options, size_t OptionsSize); +XRayLogInitStatus basicLoggingFinalize(); + +void basicLoggingHandleArg0RealTSC(int32_t FuncId, XRayEntryType Entry); +void basicLoggingHandleArg0EmulateTSC(int32_t FuncId, XRayEntryType Entry); +void basicLoggingHandleArg1RealTSC(int32_t FuncId, XRayEntryType Entry, + uint64_t Arg1); +void basicLoggingHandleArg1EmulateTSC(int32_t FuncId, XRayEntryType Entry, + uint64_t Arg1); +XRayLogFlushStatus basicLoggingFlush(); +XRayLogInitStatus basicLoggingReset(); + +} // namespace __xray + +#endif // XRAY_XRAY_INMEMORY_LOG_H diff --git a/lib/xray/xray_interface.cc b/lib/xray/xray_interface.cc index 694d34c0102b7..766313e85c584 100644 --- a/lib/xray/xray_interface.cc +++ b/lib/xray/xray_interface.cc @@ -23,12 +23,15 @@ #include "sanitizer_common/sanitizer_common.h" #include "xray_defs.h" +#include "xray_flags.h" + +extern __sanitizer::SpinMutex XRayInstrMapMutex; +extern __sanitizer::atomic_uint8_t XRayInitialized; +extern __xray::XRaySledMap XRayInstrMap; namespace __xray { #if defined(__x86_64__) -// FIXME: The actual length is 11 bytes. Why was length 12 passed to mprotect() -// ? static const int16_t cSledLength = 12; #elif defined(__aarch64__) static const int16_t cSledLength = 32; @@ -53,6 +56,10 @@ __sanitizer::atomic_uintptr_t XRayArgLogger{0}; // This is the function to call when we encounter a custom event log call. __sanitizer::atomic_uintptr_t XRayPatchedCustomEvent{0}; +// This is the global status to determine whether we are currently +// patching/unpatching. +__sanitizer::atomic_uint8_t XRayPatching{0}; + // MProtectHelper is an RAII wrapper for calls to mprotect(...) that will undo // any successful mprotect(...) changes. This is used to make a page writeable // and executable, and upon destruction if it was successful in doing so returns @@ -88,85 +95,10 @@ public: } }; -} // namespace __xray - -extern __sanitizer::SpinMutex XRayInstrMapMutex; -extern __sanitizer::atomic_uint8_t XRayInitialized; -extern __xray::XRaySledMap XRayInstrMap; - -int __xray_set_handler(void (*entry)(int32_t, - XRayEntryType)) XRAY_NEVER_INSTRUMENT { - if (__sanitizer::atomic_load(&XRayInitialized, - __sanitizer::memory_order_acquire)) { - - __sanitizer::atomic_store(&__xray::XRayPatchedFunction, - reinterpret_cast<uintptr_t>(entry), - __sanitizer::memory_order_release); - return 1; - } - return 0; -} - -int __xray_set_customevent_handler(void (*entry)(void *, size_t)) - XRAY_NEVER_INSTRUMENT { - if (__sanitizer::atomic_load(&XRayInitialized, - __sanitizer::memory_order_acquire)) { - __sanitizer::atomic_store(&__xray::XRayPatchedCustomEvent, - reinterpret_cast<uintptr_t>(entry), - __sanitizer::memory_order_release); - return 1; - } - return 0; -} - - -int __xray_remove_handler() XRAY_NEVER_INSTRUMENT { - return __xray_set_handler(nullptr); -} - -int __xray_remove_customevent_handler() XRAY_NEVER_INSTRUMENT { - return __xray_set_customevent_handler(nullptr); -} - -__sanitizer::atomic_uint8_t XRayPatching{0}; - -using namespace __xray; - -// FIXME: Figure out whether we can move this class to sanitizer_common instead -// as a generic "scope guard". -template <class Function> class CleanupInvoker { - Function Fn; - -public: - explicit CleanupInvoker(Function Fn) XRAY_NEVER_INSTRUMENT : Fn(Fn) {} - CleanupInvoker(const CleanupInvoker &) XRAY_NEVER_INSTRUMENT = default; - CleanupInvoker(CleanupInvoker &&) XRAY_NEVER_INSTRUMENT = default; - CleanupInvoker & - operator=(const CleanupInvoker &) XRAY_NEVER_INSTRUMENT = delete; - CleanupInvoker &operator=(CleanupInvoker &&) XRAY_NEVER_INSTRUMENT = delete; - ~CleanupInvoker() XRAY_NEVER_INSTRUMENT { Fn(); } -}; - -template <class Function> -CleanupInvoker<Function> scopeCleanup(Function Fn) XRAY_NEVER_INSTRUMENT { - return CleanupInvoker<Function>{Fn}; -} - -inline bool patchSled(const XRaySledEntry &Sled, bool Enable, - int32_t FuncId) XRAY_NEVER_INSTRUMENT { - // While we're here, we should patch the nop sled. To do that we mprotect - // the page containing the function to be writeable. - const uint64_t PageSize = GetPageSizeCached(); - void *PageAlignedAddr = - reinterpret_cast<void *>(Sled.Address & ~(PageSize - 1)); - std::size_t MProtectLen = (Sled.Address + cSledLength) - - reinterpret_cast<uint64_t>(PageAlignedAddr); - MProtectHelper Protector(PageAlignedAddr, MProtectLen); - if (Protector.MakeWriteable() == -1) { - printf("Failed mprotect: %d\n", errno); - return XRayPatchingStatus::FAILED; - } +namespace { +bool patchSled(const XRaySledEntry &Sled, bool Enable, + int32_t FuncId) XRAY_NEVER_INSTRUMENT { bool Success = false; switch (Sled.Kind) { case XRayEntryType::ENTRY: @@ -191,6 +123,55 @@ inline bool patchSled(const XRaySledEntry &Sled, bool Enable, return Success; } +XRayPatchingStatus patchFunction(int32_t FuncId, + bool Enable) XRAY_NEVER_INSTRUMENT { + if (!__sanitizer::atomic_load(&XRayInitialized, + __sanitizer::memory_order_acquire)) + return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. + + uint8_t NotPatching = false; + if (!__sanitizer::atomic_compare_exchange_strong( + &XRayPatching, &NotPatching, true, __sanitizer::memory_order_acq_rel)) + return XRayPatchingStatus::ONGOING; // Already patching. + + // Next, we look for the function index. + XRaySledMap InstrMap; + { + __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); + InstrMap = XRayInstrMap; + } + + // If we don't have an index, we can't patch individual functions. + if (InstrMap.Functions == 0) + return XRayPatchingStatus::NOT_INITIALIZED; + + // FuncId must be a positive number, less than the number of functions + // instrumented. + if (FuncId <= 0 || static_cast<size_t>(FuncId) > InstrMap.Functions) { + Report("Invalid function id provided: %d\n", FuncId); + return XRayPatchingStatus::FAILED; + } + + // Now we patch ths sleds for this specific function. + auto SledRange = InstrMap.SledsIndex[FuncId - 1]; + auto *f = SledRange.Begin; + auto *e = SledRange.End; + + bool SucceedOnce = false; + while (f != e) + SucceedOnce |= patchSled(*f++, Enable, FuncId); + + __sanitizer::atomic_store(&XRayPatching, false, + __sanitizer::memory_order_release); + + if (!SucceedOnce) { + Report("Failed patching any sled for function '%d'.", FuncId); + return XRayPatchingStatus::FAILED; + } + + return XRayPatchingStatus::SUCCESS; +} + // controlPatching implements the common internals of the patching/unpatching // implementation. |Enable| defines whether we're enabling or disabling the // runtime XRay instrumentation. @@ -205,14 +186,13 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { return XRayPatchingStatus::ONGOING; // Already patching. uint8_t PatchingSuccess = false; - auto XRayPatchingStatusResetter = scopeCleanup([&PatchingSuccess] { - if (!PatchingSuccess) - __sanitizer::atomic_store(&XRayPatching, false, - __sanitizer::memory_order_release); - }); - - // Step 1: Compute the function id, as a unique identifier per function in the - // instrumentation map. + auto XRayPatchingStatusResetter = + __sanitizer::at_scope_exit([&PatchingSuccess] { + if (!PatchingSuccess) + __sanitizer::atomic_store(&XRayPatching, false, + __sanitizer::memory_order_release); + }); + XRaySledMap InstrMap; { __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); @@ -221,16 +201,47 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { if (InstrMap.Entries == 0) return XRayPatchingStatus::NOT_INITIALIZED; - const uint64_t PageSize = GetPageSizeCached(); + uint32_t FuncId = 1; + uint64_t CurFun = 0; + + // First we want to find the bounds for which we have instrumentation points, + // and try to get as few calls to mprotect(...) as possible. We're assuming + // that all the sleds for the instrumentation map are contiguous as a single + // set of pages. When we do support dynamic shared object instrumentation, + // we'll need to do this for each set of page load offsets per DSO loaded. For + // now we're assuming we can mprotect the whole section of text between the + // minimum sled address and the maximum sled address (+ the largest sled + // size). + auto MinSled = InstrMap.Sleds[0]; + auto MaxSled = InstrMap.Sleds[InstrMap.Entries - 1]; + for (std::size_t I = 0; I < InstrMap.Entries; I++) { + const auto &Sled = InstrMap.Sleds[I]; + if (Sled.Address < MinSled.Address) + MinSled = Sled; + if (Sled.Address > MaxSled.Address) + MaxSled = Sled; + } + + const size_t PageSize = flags()->xray_page_size_override > 0 + ? flags()->xray_page_size_override + : GetPageSizeCached(); if ((PageSize == 0) || ((PageSize & (PageSize - 1)) != 0)) { Report("System page size is not a power of two: %lld\n", PageSize); return XRayPatchingStatus::FAILED; } - uint32_t FuncId = 1; - uint64_t CurFun = 0; - for (std::size_t I = 0; I < InstrMap.Entries; I++) { - auto Sled = InstrMap.Sleds[I]; + void *PageAlignedAddr = + reinterpret_cast<void *>(MinSled.Address & ~(PageSize - 1)); + size_t MProtectLen = + (MaxSled.Address - reinterpret_cast<uptr>(PageAlignedAddr)) + cSledLength; + MProtectHelper Protector(PageAlignedAddr, MProtectLen); + if (Protector.MakeWriteable() == -1) { + Report("Failed mprotect: %d\n", errno); + return XRayPatchingStatus::FAILED; + } + + for (std::size_t I = 0; I < InstrMap.Entries; ++I) { + auto &Sled = InstrMap.Sleds[I]; auto F = Sled.Function; if (CurFun == 0) CurFun = F; @@ -246,36 +257,14 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { return XRayPatchingStatus::SUCCESS; } -XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT { - return controlPatching(true); -} - -XRayPatchingStatus __xray_unpatch() XRAY_NEVER_INSTRUMENT { - return controlPatching(false); -} - -XRayPatchingStatus patchFunction(int32_t FuncId, - bool Enable) XRAY_NEVER_INSTRUMENT { - if (!__sanitizer::atomic_load(&XRayInitialized, - __sanitizer::memory_order_acquire)) - return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized. - - uint8_t NotPatching = false; - if (!__sanitizer::atomic_compare_exchange_strong( - &XRayPatching, &NotPatching, true, __sanitizer::memory_order_acq_rel)) - return XRayPatchingStatus::ONGOING; // Already patching. - - // Next, we look for the function index. +XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, + bool Enable) XRAY_NEVER_INSTRUMENT { XRaySledMap InstrMap; { __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); InstrMap = XRayInstrMap; } - // If we don't have an index, we can't patch individual functions. - if (InstrMap.Functions == 0) - return XRayPatchingStatus::NOT_INITIALIZED; - // FuncId must be a positive number, less than the number of functions // instrumented. if (FuncId <= 0 || static_cast<size_t>(FuncId) > InstrMap.Functions) { @@ -283,33 +272,98 @@ XRayPatchingStatus patchFunction(int32_t FuncId, return XRayPatchingStatus::FAILED; } - // Now we patch ths sleds for this specific function. + const size_t PageSize = flags()->xray_page_size_override > 0 + ? flags()->xray_page_size_override + : GetPageSizeCached(); + if ((PageSize == 0) || ((PageSize & (PageSize - 1)) != 0)) { + Report("Provided page size is not a power of two: %lld\n", PageSize); + return XRayPatchingStatus::FAILED; + } + + // Here we compute the minumum sled and maximum sled associated with a + // particular function ID. auto SledRange = InstrMap.SledsIndex[FuncId - 1]; auto *f = SledRange.Begin; auto *e = SledRange.End; + auto MinSled = *f; + auto MaxSled = *(SledRange.End - 1); + while (f != e) { + if (f->Address < MinSled.Address) + MinSled = *f; + if (f->Address > MaxSled.Address) + MaxSled = *f; + ++f; + } - bool SucceedOnce = false; - while (f != e) - SucceedOnce |= patchSled(*f++, Enable, FuncId); + void *PageAlignedAddr = + reinterpret_cast<void *>(MinSled.Address & ~(PageSize - 1)); + size_t MProtectLen = + (MaxSled.Address - reinterpret_cast<uptr>(PageAlignedAddr)) + cSledLength; + MProtectHelper Protector(PageAlignedAddr, MProtectLen); + if (Protector.MakeWriteable() == -1) { + Report("Failed mprotect: %d\n", errno); + return XRayPatchingStatus::FAILED; + } + return patchFunction(FuncId, Enable); +} - __sanitizer::atomic_store(&XRayPatching, false, - __sanitizer::memory_order_release); +} // namespace - if (!SucceedOnce) { - Report("Failed patching any sled for function '%d'.", FuncId); - return XRayPatchingStatus::FAILED; +} // namespace __xray + +using namespace __xray; + +// The following functions are declared `extern "C" {...}` in the header, hence +// they're defined in the global namespace. + +int __xray_set_handler(void (*entry)(int32_t, + XRayEntryType)) XRAY_NEVER_INSTRUMENT { + if (__sanitizer::atomic_load(&XRayInitialized, + __sanitizer::memory_order_acquire)) { + + __sanitizer::atomic_store(&__xray::XRayPatchedFunction, + reinterpret_cast<uintptr_t>(entry), + __sanitizer::memory_order_release); + return 1; } + return 0; +} - return XRayPatchingStatus::SUCCESS; +int __xray_set_customevent_handler(void (*entry)(void *, size_t)) + XRAY_NEVER_INSTRUMENT { + if (__sanitizer::atomic_load(&XRayInitialized, + __sanitizer::memory_order_acquire)) { + __sanitizer::atomic_store(&__xray::XRayPatchedCustomEvent, + reinterpret_cast<uintptr_t>(entry), + __sanitizer::memory_order_release); + return 1; + } + return 0; +} + +int __xray_remove_handler() XRAY_NEVER_INSTRUMENT { + return __xray_set_handler(nullptr); +} + +int __xray_remove_customevent_handler() XRAY_NEVER_INSTRUMENT { + return __xray_set_customevent_handler(nullptr); +} + +XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT { + return controlPatching(true); +} + +XRayPatchingStatus __xray_unpatch() XRAY_NEVER_INSTRUMENT { + return controlPatching(false); } XRayPatchingStatus __xray_patch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - return patchFunction(FuncId, true); + return mprotectAndPatchFunction(FuncId, true); } XRayPatchingStatus __xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { - return patchFunction(FuncId, false); + return mprotectAndPatchFunction(FuncId, false); } int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType, uint64_t)) { @@ -331,7 +385,7 @@ uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { __sanitizer::SpinMutexLock Guard(&XRayInstrMapMutex); if (FuncId <= 0 || static_cast<size_t>(FuncId) > XRayInstrMap.Functions) return 0; - return XRayInstrMap.SledsIndex[FuncId - 1].Begin->Address + return XRayInstrMap.SledsIndex[FuncId - 1].Begin->Function // On PPC, function entries are always aligned to 16 bytes. The beginning of a // sled might be a local entry, which is always +8 based on the global entry. // Always return the global entry. diff --git a/lib/xray/xray_interface_internal.h b/lib/xray/xray_interface_internal.h index 4a2784612fcbb..5811e2b7300a6 100644 --- a/lib/xray/xray_interface_internal.h +++ b/lib/xray/xray_interface_internal.h @@ -28,13 +28,15 @@ struct XRaySledEntry { uint64_t Function; unsigned char Kind; unsigned char AlwaysInstrument; - unsigned char Padding[14]; // Need 32 bytes + unsigned char Version; + unsigned char Padding[13]; // Need 32 bytes #elif SANITIZER_WORDSIZE == 32 uint32_t Address; uint32_t Function; unsigned char Kind; unsigned char AlwaysInstrument; - unsigned char Padding[6]; // Need 16 bytes + unsigned char Version; + unsigned char Padding[5]; // Need 16 bytes #else #error "Unsupported word size." #endif diff --git a/lib/xray/xray_log_interface.cc b/lib/xray/xray_log_interface.cc index ee14ae4b1b624..783f004d292aa 100644 --- a/lib/xray/xray_log_interface.cc +++ b/lib/xray/xray_log_interface.cc @@ -12,35 +12,84 @@ //===----------------------------------------------------------------------===// #include "xray/xray_log_interface.h" +#include "sanitizer_common/sanitizer_allocator_internal.h" #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_mutex.h" #include "xray/xray_interface.h" #include "xray_defs.h" -#include <memory> - __sanitizer::SpinMutex XRayImplMutex; -std::unique_ptr<XRayLogImpl> GlobalXRayImpl; +XRayLogImpl CurrentXRayImpl{nullptr, nullptr, nullptr, nullptr}; +XRayLogImpl *GlobalXRayImpl = nullptr; + +// We use a linked list of Mode to XRayLogImpl mappings. This is a linked list +// when it should be a map because we're avoiding having to depend on C++ +// standard library data structures at this level of the implementation. +struct ModeImpl { + ModeImpl *Next; + const char *Mode; + XRayLogImpl Impl; +}; + +ModeImpl SentinelModeImpl{ + nullptr, nullptr, {nullptr, nullptr, nullptr, nullptr}}; +ModeImpl *ModeImpls = &SentinelModeImpl; + +XRayLogRegisterStatus +__xray_log_register_mode(const char *Mode, + XRayLogImpl Impl) XRAY_NEVER_INSTRUMENT { + if (Impl.flush_log == nullptr || Impl.handle_arg0 == nullptr || + Impl.log_finalize == nullptr || Impl.log_init == nullptr) + return XRayLogRegisterStatus::XRAY_INCOMPLETE_IMPL; + + __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + // First, look for whether the mode already has a registered implementation. + for (ModeImpl *it = ModeImpls; it != &SentinelModeImpl; it = it->Next) { + if (!__sanitizer::internal_strcmp(Mode, it->Mode)) + return XRayLogRegisterStatus::XRAY_DUPLICATE_MODE; + } + auto *NewModeImpl = + static_cast<ModeImpl *>(__sanitizer::InternalAlloc(sizeof(ModeImpl))); + NewModeImpl->Next = ModeImpls; + NewModeImpl->Mode = __sanitizer::internal_strdup(Mode); + NewModeImpl->Impl = Impl; + ModeImpls = NewModeImpl; + return XRayLogRegisterStatus::XRAY_REGISTRATION_OK; +} + +XRayLogRegisterStatus +__xray_log_select_mode(const char *Mode) XRAY_NEVER_INSTRUMENT { + __sanitizer::SpinMutexLock Guard(&XRayImplMutex); + for (ModeImpl *it = ModeImpls; it != &SentinelModeImpl; it = it->Next) { + if (!__sanitizer::internal_strcmp(Mode, it->Mode)) { + CurrentXRayImpl = it->Impl; + GlobalXRayImpl = &CurrentXRayImpl; + __xray_set_handler(it->Impl.handle_arg0); + return XRayLogRegisterStatus::XRAY_REGISTRATION_OK; + } + } + return XRayLogRegisterStatus::XRAY_MODE_NOT_FOUND; +} void __xray_set_log_impl(XRayLogImpl Impl) XRAY_NEVER_INSTRUMENT { if (Impl.log_init == nullptr || Impl.log_finalize == nullptr || Impl.handle_arg0 == nullptr || Impl.flush_log == nullptr) { __sanitizer::SpinMutexLock Guard(&XRayImplMutex); - GlobalXRayImpl.reset(); + GlobalXRayImpl = nullptr; __xray_remove_handler(); __xray_remove_handler_arg1(); return; } __sanitizer::SpinMutexLock Guard(&XRayImplMutex); - GlobalXRayImpl.reset(new XRayLogImpl); - *GlobalXRayImpl = Impl; + CurrentXRayImpl = Impl; + GlobalXRayImpl = &CurrentXRayImpl; __xray_set_handler(Impl.handle_arg0); } void __xray_remove_log_impl() XRAY_NEVER_INSTRUMENT { __sanitizer::SpinMutexLock Guard(&XRayImplMutex); - GlobalXRayImpl.reset(); + GlobalXRayImpl = nullptr; __xray_remove_handler(); __xray_remove_handler_arg1(); } diff --git a/lib/xray/xray_trampoline_x86_64.S b/lib/xray/xray_trampoline_x86_64.S index b59eedc4bb1b8..350afd9265fde 100644 --- a/lib/xray/xray_trampoline_x86_64.S +++ b/lib/xray/xray_trampoline_x86_64.S @@ -14,10 +14,13 @@ //===----------------------------------------------------------------------===// #include "../builtins/assembly.h" +#include "../sanitizer_common/sanitizer_asm.h" + + .macro SAVE_REGISTERS subq $192, %rsp - .cfi_def_cfa_offset 200 + CFI_DEF_CFA_OFFSET(200) // At this point, the stack pointer should be aligned to an 8-byte boundary, // because any call instructions that come after this will add another 8 // bytes and therefore align it to 16-bytes. @@ -57,63 +60,82 @@ movq 8(%rsp), %r8 movq 0(%rsp), %r9 addq $192, %rsp - .cfi_def_cfa_offset 8 + CFI_DEF_CFA_OFFSET(8) +.endm + +.macro ALIGNED_CALL_RAX + // Call the logging handler, after aligning the stack to a 16-byte boundary. + // The approach we're taking here uses additional stack space to stash the + // stack pointer twice before aligning the pointer to 16-bytes. If the stack + // was 8-byte aligned, it will become 16-byte aligned -- when restoring the + // pointer, we can always look -8 bytes from the current position to get + // either of the values we've stashed in the first place. + pushq %rsp + pushq (%rsp) + andq $-0x10, %rsp + callq *%rax + movq 8(%rsp), %rsp .endm .text +#if !defined(__APPLE__) + .section .text +#else + .section __TEXT,__text +#endif .file "xray_trampoline_x86.S" //===----------------------------------------------------------------------===// - .globl __xray_FunctionEntry + .globl ASM_SYMBOL(__xray_FunctionEntry) .align 16, 0x90 - .type __xray_FunctionEntry,@function - -__xray_FunctionEntry: - .cfi_startproc + ASM_TYPE_FUNCTION(__xray_FunctionEntry) +ASM_SYMBOL(__xray_FunctionEntry): + CFI_STARTPROC SAVE_REGISTERS // This load has to be atomic, it's concurrent with __xray_patch(). // On x86/amd64, a simple (type-aligned) MOV instruction is enough. - movq _ZN6__xray19XRayPatchedFunctionE(%rip), %rax + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax, %rax je .Ltmp0 // The patched function prolog puts its xray_instr_map index into %r10d. movl %r10d, %edi xor %esi,%esi - callq *%rax + ALIGNED_CALL_RAX + .Ltmp0: RESTORE_REGISTERS retq -.Ltmp1: - .size __xray_FunctionEntry, .Ltmp1-__xray_FunctionEntry - .cfi_endproc + ASM_SIZE(__xray_FunctionEntry) + CFI_ENDPROC //===----------------------------------------------------------------------===// - .globl __xray_FunctionExit + .globl ASM_SYMBOL(__xray_FunctionExit) .align 16, 0x90 - .type __xray_FunctionExit,@function -__xray_FunctionExit: - .cfi_startproc + ASM_TYPE_FUNCTION(__xray_FunctionExit) +ASM_SYMBOL(__xray_FunctionExit): + CFI_STARTPROC // Save the important registers first. Since we're assuming that this // function is only jumped into, we only preserve the registers for // returning. subq $56, %rsp - .cfi_def_cfa_offset 64 + CFI_DEF_CFA_OFFSET(64) movq %rbp, 48(%rsp) movupd %xmm0, 32(%rsp) movupd %xmm1, 16(%rsp) movq %rax, 8(%rsp) movq %rdx, 0(%rsp) - movq _ZN6__xray19XRayPatchedFunctionE(%rip), %rax + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax,%rax je .Ltmp2 movl %r10d, %edi movl $1, %esi - callq *%rax + ALIGNED_CALL_RAX + .Ltmp2: // Restore the important registers. movq 48(%rsp), %rbp @@ -122,111 +144,94 @@ __xray_FunctionExit: movq 8(%rsp), %rax movq 0(%rsp), %rdx addq $56, %rsp - .cfi_def_cfa_offset 8 + CFI_DEF_CFA_OFFSET(8) retq -.Ltmp3: - .size __xray_FunctionExit, .Ltmp3-__xray_FunctionExit - .cfi_endproc + ASM_SIZE(__xray_FunctionExit) + CFI_ENDPROC //===----------------------------------------------------------------------===// - .global __xray_FunctionTailExit + .globl ASM_SYMBOL(__xray_FunctionTailExit) .align 16, 0x90 - .type __xray_FunctionTailExit,@function -__xray_FunctionTailExit: - .cfi_startproc - // Save the important registers as in the entry trampoline, but indicate that - // this is an exit. In the future, we will introduce a new entry type that - // differentiates between a normal exit and a tail exit, but we'd have to do - // this and increment the version number for the header. + ASM_TYPE_FUNCTION(__xray_FunctionTailExit) +ASM_SYMBOL(__xray_FunctionTailExit): + CFI_STARTPROC SAVE_REGISTERS - movq _ZN6__xray19XRayPatchedFunctionE(%rip), %rax + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax,%rax je .Ltmp4 movl %r10d, %edi - movl $1, %esi - callq *%rax + movl $2, %esi + + ALIGNED_CALL_RAX .Ltmp4: RESTORE_REGISTERS retq -.Ltmp5: - .size __xray_FunctionTailExit, .Ltmp5-__xray_FunctionTailExit - .cfi_endproc + ASM_SIZE(__xray_FunctionTailExit) + CFI_ENDPROC //===----------------------------------------------------------------------===// - .globl __xray_ArgLoggerEntry + .globl ASM_SYMBOL(__xray_ArgLoggerEntry) .align 16, 0x90 - .type __xray_ArgLoggerEntry,@function -__xray_ArgLoggerEntry: - .cfi_startproc + ASM_TYPE_FUNCTION(__xray_ArgLoggerEntry) +ASM_SYMBOL(__xray_ArgLoggerEntry): + CFI_STARTPROC SAVE_REGISTERS // Again, these function pointer loads must be atomic; MOV is fine. - movq _ZN6__xray13XRayArgLoggerE(%rip), %rax + movq ASM_SYMBOL(_ZN6__xray13XRayArgLoggerE)(%rip), %rax testq %rax, %rax jne .Larg1entryLog // If [arg1 logging handler] not set, defer to no-arg logging. - movq _ZN6__xray19XRayPatchedFunctionE(%rip), %rax + movq ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax testq %rax, %rax je .Larg1entryFail .Larg1entryLog: - // First argument will become the third + // First argument will become the third movq %rdi, %rdx - // XRayEntryType::ENTRY into the second - xorq %rsi, %rsi + // XRayEntryType::LOG_ARGS_ENTRY into the second + mov $0x3, %esi // 32-bit function ID becomes the first movl %r10d, %edi - callq *%rax + ALIGNED_CALL_RAX .Larg1entryFail: RESTORE_REGISTERS retq - -.Larg1entryEnd: - .size __xray_ArgLoggerEntry, .Larg1entryEnd-__xray_ArgLoggerEntry - .cfi_endproc + ASM_SIZE(__xray_ArgLoggerEntry) + CFI_ENDPROC //===----------------------------------------------------------------------===// - .global __xray_CustomEvent + .global ASM_SYMBOL(__xray_CustomEvent) .align 16, 0x90 - .type __xray_CustomEvent,@function -__xray_CustomEvent: - .cfi_startproc - subq $16, %rsp - .cfi_def_cfa_offset 24 - movq %rbp, 8(%rsp) - movq %rax, 0(%rsp) + ASM_TYPE_FUNCTION(__xray_CustomEvent) +ASM_SYMBOL(__xray_CustomEvent): + CFI_STARTPROC + SAVE_REGISTERS // We take two arguments to this trampoline, which should be in rdi and rsi // already. We also make sure that we stash %rax because we use that register // to call the logging handler. - movq _ZN6__xray22XRayPatchedCustomEventE(%rip), %rax + movq ASM_SYMBOL(_ZN6__xray22XRayPatchedCustomEventE)(%rip), %rax testq %rax,%rax je .LcustomEventCleanup - // At this point we know that rcx and rdx already has the data, so we just - // call the logging handler. - callq *%rax + ALIGNED_CALL_RAX .LcustomEventCleanup: - movq 0(%rsp), %rax - movq 8(%rsp), %rbp - addq $16, %rsp - .cfi_def_cfa_offset 8 + RESTORE_REGISTERS retq - -.Ltmp8: - .size __xray_CustomEvent, .Ltmp8-__xray_CustomEvent - .cfi_endproc + ASM_SIZE(__xray_CustomEvent) + CFI_ENDPROC NO_EXEC_STACK_DIRECTIVE diff --git a/lib/xray/xray_utils.cc b/lib/xray/xray_utils.cc index b9a38d1b98eb4..cf800d3aeaf88 100644 --- a/lib/xray/xray_utils.cc +++ b/lib/xray/xray_utils.cc @@ -117,7 +117,8 @@ int getLogFD() XRAY_NEVER_INSTRUMENT { TmpFilename); return -1; } - Report("XRay: Log file in '%s'\n", TmpFilename); + if (__sanitizer::Verbosity()) + Report("XRay: Log file in '%s'\n", TmpFilename); return Fd; } diff --git a/lib/xray/xray_x86_64.cc b/lib/xray/xray_x86_64.cc index e34806fa1cea2..e17f00ac3a62a 100644 --- a/lib/xray/xray_x86_64.cc +++ b/lib/xray/xray_x86_64.cc @@ -76,6 +76,7 @@ static constexpr uint8_t CallOpCode = 0xe8; static constexpr uint16_t MovR10Seq = 0xba41; static constexpr uint16_t Jmp9Seq = 0x09eb; static constexpr uint16_t Jmp20Seq = 0x14eb; +static constexpr uint16_t Jmp15Seq = 0x0feb; static constexpr uint8_t JmpOpCode = 0xe9; static constexpr uint8_t RetOpCode = 0xc3; static constexpr uint16_t NopwSeq = 0x9066; @@ -207,8 +208,10 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { // Here we do the dance of replacing the following sled: // + // In Version 0: + // // xray_sled_n: - // jmp +19 // 2 bytes + // jmp +20 // 2 bytes // ... // // With the following: @@ -216,24 +219,35 @@ bool patchCustomEvent(const bool Enable, const uint32_t FuncId, // nopw // 2 bytes* // ... // - // We need to do this in the following order: // - // 1. Overwrite the 5-byte nop with the call (relative), where (relative) is - // the relative offset to the __xray_CustomEvent trampoline. - // 2. Do a two-byte atomic write over the 'jmp +24' to turn it into a 'nopw'. - // This allows us to "enable" this code once the changes have committed. + // The "unpatch" should just turn the 'nopw' back to a 'jmp +20'. + // + // --- // - // The "unpatch" should just turn the 'nopw' back to a 'jmp +24'. + // In Version 1: + // + // The jump offset is now 15 bytes (0x0f), so when restoring the nopw back + // to a jmp, use 15 bytes instead. // if (Enable) { std::atomic_store_explicit( reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), NopwSeq, std::memory_order_release); } else { - std::atomic_store_explicit( - reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp20Seq, - std::memory_order_release); - } + switch (Sled.Version) { + case 1: + std::atomic_store_explicit( + reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp15Seq, + std::memory_order_release); + break; + case 0: + default: + std::atomic_store_explicit( + reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp20Seq, + std::memory_order_release); + break; + } + } return false; } @@ -244,9 +258,9 @@ bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { // We check whether rdtscp support is enabled. According to the x86_64 manual, // level should be set at 0x80000001, and we should have a look at bit 27 in - // EDX. That's 0x8000000 (or 1u << 26). + // EDX. That's 0x8000000 (or 1u << 27). __get_cpuid(0x80000001, &EAX, &EBX, &ECX, &EDX); - if (!(EDX & (1u << 26))) { + if (!(EDX & (1u << 27))) { Report("Missing rdtscp support.\n"); return false; } |