Diffstat (limited to 'contrib/compiler-rt/lib/xray')
22 files changed, 2881 insertions, 1629 deletions
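Much of this import replaces XRay's direct mmap calls with small page-granular helpers in xray_allocator.h (allocate<T>, allocateBuffer, deallocateBuffer, initArray) that back the profiling allocator and the FDR BufferQueue, with a Zircon VMO path on Fuchsia. The following is a simplified, standalone sketch of the POSIX side only, using plain mmap/munmap in place of sanitizer_common's internal_mmap/internal_munmap wrappers used by the actual code below; it is illustrative, not the committed implementation.

// Simplified sketch of the page-granular allocation helpers added in
// xray_allocator.h. Plain mmap/munmap stand in for the sanitizer_common
// wrappers; the Fuchsia (Zircon VMO) branch is omitted.
#include <sys/mman.h>
#include <unistd.h>
#include <cstddef>
#include <new>
#include <utility>

template <class T = unsigned char>
T *allocateBuffer(size_t S) {
  size_t PageSize = static_cast<size_t>(sysconf(_SC_PAGESIZE));
  // Round the request up to a whole number of pages.
  size_t RoundedSize = ((S * sizeof(T)) + PageSize - 1) & ~(PageSize - 1);
  void *B = mmap(nullptr, RoundedSize, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  return B == MAP_FAILED ? nullptr : reinterpret_cast<T *>(B);
}

template <class T>
void deallocateBuffer(T *B, size_t S) {
  if (B == nullptr)
    return;
  size_t PageSize = static_cast<size_t>(sysconf(_SC_PAGESIZE));
  size_t RoundedSize = ((S * sizeof(T)) + PageSize - 1) & ~(PageSize - 1);
  munmap(B, RoundedSize);
}

// Placement-construct N instances in one mapping, mirroring how initArray<T>
// is used for BufferQueue::BufferRep in the diff below.
template <class T, class... U>
T *initArray(size_t N, U &&...Us) {
  auto A = allocateBuffer<T>(N);
  if (A != nullptr)
    for (size_t I = N; I > 0;)
      new (A + (--I)) T(std::forward<U>(Us)...);
  return A;
}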
diff --git a/contrib/compiler-rt/lib/xray/xray_allocator.h b/contrib/compiler-rt/lib/xray/xray_allocator.h index 8244815284a8..907c54542a56 100644 --- a/contrib/compiler-rt/lib/xray/xray_allocator.h +++ b/contrib/compiler-rt/lib/xray/xray_allocator.h @@ -19,18 +19,131 @@ #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_mutex.h" +#if SANITIZER_FUCHSIA +#include <zircon/process.h> +#include <zircon/status.h> +#include <zircon/syscalls.h> +#else #include "sanitizer_common/sanitizer_posix.h" +#endif +#include "xray_defs.h" #include "xray_utils.h" -#include <sys/mman.h> #include <cstddef> #include <cstdint> +#include <sys/mman.h> + +namespace __xray { + +// We implement our own memory allocation routine which will bypass the +// internal allocator. This allows us to manage the memory directly, using +// mmap'ed memory to back the allocators. +template <class T> T *allocate() XRAY_NEVER_INSTRUMENT { + uptr RoundedSize = RoundUpTo(sizeof(T), GetPageSizeCached()); +#if SANITIZER_FUCHSIA + zx_handle_t Vmo; + zx_status_t Status = _zx_vmo_create(RoundedSize, 0, &Vmo); + if (Status != ZX_OK) { + if (Verbosity()) + Report("XRay Profiling: Failed to create VMO of size %zu: %s\n", + sizeof(T), _zx_status_get_string(Status)); + return nullptr; + } + uintptr_t B; + Status = + _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0, + Vmo, 0, sizeof(T), &B); + _zx_handle_close(Vmo); + if (Status != ZX_OK) { + if (Verbosity()) + Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", sizeof(T), + _zx_status_get_string(Status)); + return nullptr; + } + return reinterpret_cast<T *>(B); +#else + uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + int ErrNo = 0; + if (UNLIKELY(internal_iserror(B, &ErrNo))) { + if (Verbosity()) + Report( + "XRay Profiling: Failed to allocate memory of size %d; Error = %d.\n", + RoundedSize, B); + return nullptr; + } +#endif + return reinterpret_cast<T *>(B); +} -#ifndef MAP_NORESERVE -// no-op on NetBSD (at least), unsupported flag on FreeBSD basically because unneeded -#define MAP_NORESERVE 0 +template <class T> void deallocate(T *B) XRAY_NEVER_INSTRUMENT { + if (B == nullptr) + return; + uptr RoundedSize = RoundUpTo(sizeof(T), GetPageSizeCached()); +#if SANITIZER_FUCHSIA + _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B), + RoundedSize); +#else + internal_munmap(B, RoundedSize); #endif +} -namespace __xray { +template <class T = unsigned char> +T *allocateBuffer(size_t S) XRAY_NEVER_INSTRUMENT { + uptr RoundedSize = RoundUpTo(S * sizeof(T), GetPageSizeCached()); +#if SANITIZER_FUCHSIA + zx_handle_t Vmo; + zx_status_t Status = _zx_vmo_create(RoundedSize, 0, &Vmo); + if (Status != ZX_OK) { + if (Verbosity()) + Report("XRay Profiling: Failed to create VMO of size %zu: %s\n", S, + _zx_status_get_string(Status)); + return nullptr; + } + uintptr_t B; + Status = _zx_vmar_map(_zx_vmar_root_self(), + ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0, Vmo, 0, S, &B); + _zx_handle_close(Vmo); + if (Status != ZX_OK) { + if (Verbosity()) + Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", S, + _zx_status_get_string(Status)); + return nullptr; + } +#else + uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + int ErrNo = 0; + if (UNLIKELY(internal_iserror(B, &ErrNo))) { + if (Verbosity()) + Report( + "XRay Profiling: Failed to allocate memory of size %d; 
Error = %d.\n", + RoundedSize, B); + return nullptr; + } +#endif + return reinterpret_cast<T *>(B); +} + +template <class T> void deallocateBuffer(T *B, size_t S) XRAY_NEVER_INSTRUMENT { + if (B == nullptr) + return; + uptr RoundedSize = RoundUpTo(S * sizeof(T), GetPageSizeCached()); +#if SANITIZER_FUCHSIA + _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B), + RoundedSize); +#else + internal_munmap(B, RoundedSize); +#endif +} + +template <class T, class... U> +T *initArray(size_t N, U &&... Us) XRAY_NEVER_INSTRUMENT { + auto A = allocateBuffer<T>(N); + if (A != nullptr) + while (N > 0) + new (A + (--N)) T(std::forward<U>(Us)...); + return A; +} /// The Allocator type hands out fixed-sized chunks of memory that are /// cache-line aligned and sized. This is useful for placement of @@ -58,20 +171,18 @@ template <size_t N> struct Allocator { }; private: - const size_t MaxMemory{0}; - void *BackingStore = nullptr; - void *AlignedNextBlock = nullptr; + size_t MaxMemory{0}; + unsigned char *BackingStore = nullptr; + unsigned char *AlignedNextBlock = nullptr; size_t AllocatedBlocks = 0; + bool Owned; SpinMutex Mutex{}; - void *Alloc() { + void *Alloc() XRAY_NEVER_INSTRUMENT { SpinMutexLock Lock(&Mutex); if (UNLIKELY(BackingStore == nullptr)) { - BackingStore = reinterpret_cast<void *>( - internal_mmap(NULL, MaxMemory, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, 0, 0)); - if (BackingStore == MAP_FAILED) { - BackingStore = nullptr; + BackingStore = allocateBuffer(MaxMemory); + if (BackingStore == nullptr) { if (Verbosity()) Report("XRay Profiling: Failed to allocate memory for allocator.\n"); return nullptr; @@ -84,7 +195,7 @@ private: auto AlignedNextBlockNum = nearest_boundary( reinterpret_cast<uintptr_t>(AlignedNextBlock), kCacheLineSize); if (diff(AlignedNextBlockNum, BackingStoreNum) > ptrdiff_t(MaxMemory)) { - munmap(BackingStore, MaxMemory); + deallocateBuffer(BackingStore, MaxMemory); AlignedNextBlock = BackingStore = nullptr; if (Verbosity()) Report("XRay Profiling: Cannot obtain enough memory from " @@ -92,34 +203,83 @@ private: return nullptr; } - AlignedNextBlock = reinterpret_cast<void *>(AlignedNextBlockNum); + AlignedNextBlock = reinterpret_cast<unsigned char *>(AlignedNextBlockNum); // Assert that AlignedNextBlock is cache-line aligned. DCHECK_EQ(reinterpret_cast<uintptr_t>(AlignedNextBlock) % kCacheLineSize, 0); } - if ((AllocatedBlocks * Block::Size) >= MaxMemory) + if (((AllocatedBlocks + 1) * Block::Size) > MaxMemory) return nullptr; // Align the pointer we'd like to return to an appropriate alignment, then // advance the pointer from where to start allocations. 
void *Result = AlignedNextBlock; - AlignedNextBlock = reinterpret_cast<void *>( - reinterpret_cast<char *>(AlignedNextBlock) + N); + AlignedNextBlock = + reinterpret_cast<unsigned char *>(AlignedNextBlock) + Block::Size; ++AllocatedBlocks; return Result; } public: - explicit Allocator(size_t M) - : MaxMemory(nearest_boundary(M, kCacheLineSize)) {} + explicit Allocator(size_t M) XRAY_NEVER_INSTRUMENT + : MaxMemory(RoundUpTo(M, kCacheLineSize)), + BackingStore(nullptr), + AlignedNextBlock(nullptr), + AllocatedBlocks(0), + Owned(true), + Mutex() {} + + explicit Allocator(void *P, size_t M) XRAY_NEVER_INSTRUMENT + : MaxMemory(M), + BackingStore(reinterpret_cast<unsigned char *>(P)), + AlignedNextBlock(reinterpret_cast<unsigned char *>(P)), + AllocatedBlocks(0), + Owned(false), + Mutex() {} + + Allocator(const Allocator &) = delete; + Allocator &operator=(const Allocator &) = delete; + + Allocator(Allocator &&O) XRAY_NEVER_INSTRUMENT { + SpinMutexLock L0(&Mutex); + SpinMutexLock L1(&O.Mutex); + MaxMemory = O.MaxMemory; + O.MaxMemory = 0; + BackingStore = O.BackingStore; + O.BackingStore = nullptr; + AlignedNextBlock = O.AlignedNextBlock; + O.AlignedNextBlock = nullptr; + AllocatedBlocks = O.AllocatedBlocks; + O.AllocatedBlocks = 0; + Owned = O.Owned; + O.Owned = false; + } + + Allocator &operator=(Allocator &&O) XRAY_NEVER_INSTRUMENT { + SpinMutexLock L0(&Mutex); + SpinMutexLock L1(&O.Mutex); + MaxMemory = O.MaxMemory; + O.MaxMemory = 0; + if (BackingStore != nullptr) + deallocateBuffer(BackingStore, MaxMemory); + BackingStore = O.BackingStore; + O.BackingStore = nullptr; + AlignedNextBlock = O.AlignedNextBlock; + O.AlignedNextBlock = nullptr; + AllocatedBlocks = O.AllocatedBlocks; + O.AllocatedBlocks = 0; + Owned = O.Owned; + O.Owned = false; + return *this; + } - Block Allocate() { return {Alloc()}; } + Block Allocate() XRAY_NEVER_INSTRUMENT { return {Alloc()}; } - ~Allocator() NOEXCEPT { - if (BackingStore != nullptr) { - internal_munmap(BackingStore, MaxMemory); + ~Allocator() NOEXCEPT XRAY_NEVER_INSTRUMENT { + if (Owned && BackingStore != nullptr) { + deallocateBuffer(BackingStore, MaxMemory); } } }; diff --git a/contrib/compiler-rt/lib/xray/xray_basic_logging.cc b/contrib/compiler-rt/lib/xray/xray_basic_logging.cc index 585ca641cd0c..ae1cc0ba79dd 100644 --- a/contrib/compiler-rt/lib/xray/xray_basic_logging.cc +++ b/contrib/compiler-rt/lib/xray/xray_basic_logging.cc @@ -19,7 +19,9 @@ #include <fcntl.h> #include <pthread.h> #include <sys/stat.h> +#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC #include <sys/syscall.h> +#endif #include <sys/types.h> #include <time.h> #include <unistd.h> @@ -38,8 +40,9 @@ namespace __xray { -SpinMutex LogMutex; +static SpinMutex LogMutex; +namespace { // We use elements of this type to record the entry TSC of every function ID we // see as we're tracing a particular thread's execution. 
struct alignas(16) StackEntry { @@ -52,21 +55,28 @@ struct alignas(16) StackEntry { static_assert(sizeof(StackEntry) == 16, "Wrong size for StackEntry"); -struct alignas(64) ThreadLocalData { +struct XRAY_TLS_ALIGNAS(64) ThreadLocalData { void *InMemoryBuffer = nullptr; size_t BufferSize = 0; size_t BufferOffset = 0; void *ShadowStack = nullptr; size_t StackSize = 0; size_t StackEntries = 0; - int Fd = -1; + __xray::LogWriter *LogWriter = nullptr; }; +struct BasicLoggingOptions { + int DurationFilterMicros = 0; + size_t MaxStackDepth = 0; + size_t ThreadBufferSize = 0; +}; +} // namespace + static pthread_key_t PThreadKey; static atomic_uint8_t BasicInitialized{0}; -BasicLoggingOptions GlobalOptions; +struct BasicLoggingOptions GlobalOptions; thread_local atomic_uint8_t Guard{0}; @@ -75,10 +85,10 @@ static atomic_uint64_t ThresholdTicks{0}; static atomic_uint64_t TicksPerSec{0}; static atomic_uint64_t CycleFrequency{NanosecondsPerSecond}; -static int openLogFile() XRAY_NEVER_INSTRUMENT { - int F = getLogFD(); - if (F == -1) - return -1; +static LogWriter *getLog() XRAY_NEVER_INSTRUMENT { + LogWriter* LW = LogWriter::Open(); + if (LW == nullptr) + return LW; static pthread_once_t DetectOnce = PTHREAD_ONCE_INIT; pthread_once(&DetectOnce, +[] { @@ -100,16 +110,16 @@ static int openLogFile() XRAY_NEVER_INSTRUMENT { // before setting the values in the header. Header.ConstantTSC = 1; Header.NonstopTSC = 1; - retryingWriteAll(F, reinterpret_cast<char *>(&Header), - reinterpret_cast<char *>(&Header) + sizeof(Header)); - return F; + LW->WriteAll(reinterpret_cast<char *>(&Header), + reinterpret_cast<char *>(&Header) + sizeof(Header)); + return LW; } -static int getGlobalFd() XRAY_NEVER_INSTRUMENT { +static LogWriter *getGlobalLog() XRAY_NEVER_INSTRUMENT { static pthread_once_t OnceInit = PTHREAD_ONCE_INIT; - static int Fd = 0; - pthread_once(&OnceInit, +[] { Fd = openLogFile(); }); - return Fd; + static LogWriter *LW = nullptr; + pthread_once(&OnceInit, +[] { LW = getLog(); }); + return LW; } static ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { @@ -121,7 +131,7 @@ static ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { return false; } pthread_setspecific(PThreadKey, &TLD); - TLD.Fd = getGlobalFd(); + TLD.LogWriter = getGlobalLog(); TLD.InMemoryBuffer = reinterpret_cast<XRayRecord *>( InternalAlloc(sizeof(XRayRecord) * GlobalOptions.ThreadBufferSize, nullptr, alignof(XRayRecord))); @@ -149,8 +159,8 @@ template <class RDTSC> void InMemoryRawLog(int32_t FuncId, XRayEntryType Type, RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT { auto &TLD = getThreadLocalData(); - int Fd = getGlobalFd(); - if (Fd == -1) + LogWriter *LW = getGlobalLog(); + if (LW == nullptr) return; // Use a simple recursion guard, to handle cases where we're already logging @@ -234,9 +244,9 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type, auto FirstEntry = reinterpret_cast<XRayRecord *>(TLD.InMemoryBuffer); internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R)); if (++TLD.BufferOffset == TLD.BufferSize) { - SpinMutexLock L(&LogMutex); - retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry), - reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); + SpinMutexLock Lock(&LogMutex); + LW->WriteAll(reinterpret_cast<char *>(FirstEntry), + reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); TLD.BufferOffset = 0; TLD.StackEntries = 0; } @@ -249,17 +259,17 @@ void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1, auto FirstEntry = reinterpret_cast<XRayArgPayload 
*>(TLD.InMemoryBuffer); const auto &BuffLen = TLD.BufferSize; - int Fd = getGlobalFd(); - if (Fd == -1) + LogWriter *LW = getGlobalLog(); + if (LW == nullptr) return; // First we check whether there's enough space to write the data consecutively // in the thread-local buffer. If not, we first flush the buffer before // attempting to write the two records that must be consecutive. if (TLD.BufferOffset + 2 > BuffLen) { - SpinMutexLock L(&LogMutex); - retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry), - reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); + SpinMutexLock Lock(&LogMutex); + LW->WriteAll(reinterpret_cast<char *>(FirstEntry), + reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); TLD.BufferOffset = 0; TLD.StackEntries = 0; } @@ -280,9 +290,9 @@ void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1, R.Arg = Arg1; internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R)); if (++TLD.BufferOffset == BuffLen) { - SpinMutexLock L(&LogMutex); - retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry), - reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); + SpinMutexLock Lock(&LogMutex); + LW->WriteAll(reinterpret_cast<char *>(FirstEntry), + reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); TLD.BufferOffset = 0; TLD.StackEntries = 0; } @@ -339,29 +349,29 @@ static void TLDDestructor(void *P) XRAY_NEVER_INSTRUMENT { Report("Cleaned up log for TID: %d\n", GetTid()); }); - if (TLD.Fd == -1 || TLD.BufferOffset == 0) { + if (TLD.LogWriter == nullptr || TLD.BufferOffset == 0) { if (Verbosity()) - Report("Skipping buffer for TID: %d; Fd = %d; Offset = %llu\n", GetTid(), - TLD.Fd, TLD.BufferOffset); + Report("Skipping buffer for TID: %d; Offset = %llu\n", GetTid(), + TLD.BufferOffset); return; } { SpinMutexLock L(&LogMutex); - retryingWriteAll(TLD.Fd, reinterpret_cast<char *>(TLD.InMemoryBuffer), - reinterpret_cast<char *>(TLD.InMemoryBuffer) + - (sizeof(XRayRecord) * TLD.BufferOffset)); + TLD.LogWriter->WriteAll(reinterpret_cast<char *>(TLD.InMemoryBuffer), + reinterpret_cast<char *>(TLD.InMemoryBuffer) + + (sizeof(XRayRecord) * TLD.BufferOffset)); } // Because this thread's exit could be the last one trying to write to // the file and that we're not able to close out the file properly, we // sync instead and hope that the pending writes are flushed as the // thread exits. - fsync(TLD.Fd); + TLD.LogWriter->Flush(); } -XRayLogInitStatus basicLoggingInit(size_t BufferSize, size_t BufferMax, - void *Options, +XRayLogInitStatus basicLoggingInit(UNUSED size_t BufferSize, + UNUSED size_t BufferMax, void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT { uint8_t Expected = 0; if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 1, @@ -385,43 +395,32 @@ XRayLogInitStatus basicLoggingInit(size_t BufferSize, size_t BufferMax, "using emulation instead.\n"); }); - if (BufferSize == 0 && BufferMax == 0 && Options != nullptr) { - FlagParser P; - BasicFlags F; - F.setDefaults(); - registerXRayBasicFlags(&P, &F); - P.ParseString(useCompilerDefinedBasicFlags()); - auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS"); - if (EnvOpts == nullptr) - EnvOpts = ""; - - P.ParseString(EnvOpts); - - // If XRAY_BASIC_OPTIONS was not defined, then we use the deprecated options - // set through XRAY_OPTIONS instead. 
- if (internal_strlen(EnvOpts) == 0) { - F.func_duration_threshold_us = - flags()->xray_naive_log_func_duration_threshold_us; - F.max_stack_depth = flags()->xray_naive_log_max_stack_depth; - F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size; - } - - P.ParseString(static_cast<const char *>(Options)); - GlobalOptions.ThreadBufferSize = F.thread_buffer_size; - GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us; - GlobalOptions.MaxStackDepth = F.max_stack_depth; - *basicFlags() = F; - } else if (OptionsSize != sizeof(BasicLoggingOptions)) { - Report("Invalid options size, potential ABI mismatch; expected %d got %d", - sizeof(BasicLoggingOptions), OptionsSize); - return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; - } else { - if (Verbosity()) - Report("XRay Basic: struct-based init is deprecated, please use " - "string-based configuration instead.\n"); - GlobalOptions = *reinterpret_cast<BasicLoggingOptions *>(Options); + FlagParser P; + BasicFlags F; + F.setDefaults(); + registerXRayBasicFlags(&P, &F); + P.ParseString(useCompilerDefinedBasicFlags()); + auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS"); + if (EnvOpts == nullptr) + EnvOpts = ""; + + P.ParseString(EnvOpts); + + // If XRAY_BASIC_OPTIONS was not defined, then we use the deprecated options + // set through XRAY_OPTIONS instead. + if (internal_strlen(EnvOpts) == 0) { + F.func_duration_threshold_us = + flags()->xray_naive_log_func_duration_threshold_us; + F.max_stack_depth = flags()->xray_naive_log_max_stack_depth; + F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size; } + P.ParseString(static_cast<const char *>(Options)); + GlobalOptions.ThreadBufferSize = F.thread_buffer_size; + GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us; + GlobalOptions.MaxStackDepth = F.max_stack_depth; + *basicFlags() = F; + atomic_store(&ThresholdTicks, atomic_load(&TicksPerSec, memory_order_acquire) * GlobalOptions.DurationFilterMicros / 1000000, diff --git a/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc b/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc index 3ce728900787..7d0e5a1f323c 100644 --- a/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc +++ b/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc @@ -13,141 +13,206 @@ // //===----------------------------------------------------------------------===// #include "xray_buffer_queue.h" +#include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_libc.h" +#if !SANITIZER_FUCHSIA #include "sanitizer_common/sanitizer_posix.h" +#endif +#include "xray_allocator.h" +#include "xray_defs.h" #include <memory> #include <sys/mman.h> -#ifndef MAP_NORESERVE -// no-op on NetBSD (at least), unsupported flag on FreeBSD -#define MAP_NORESERVE 0 -#endif - using namespace __xray; -using namespace __sanitizer; - -template <class T> static T *allocRaw(size_t N) { - // TODO: Report errors? - // We use MAP_NORESERVE on platforms where it's supported to ensure that the - // pages we're allocating for XRay never end up in pages that can be swapped - // in/out. We're doing this because for FDR mode, we want to ensure that - // writes to the buffers stay resident in memory to prevent XRay itself from - // causing swapping/thrashing. - // - // In the case when XRay pages cannot be swapped in/out or there's not enough - // RAM to back these pages, we're willing to cause a segmentation fault - // instead of introducing latency in the measurement. 
We assume here that - // there are enough pages that are swappable in/out outside of the buffers - // being used by FDR mode (which are bounded and configurable anyway) to allow - // us to keep using always-resident memory. - // - // TODO: Make this configurable? - void *A = reinterpret_cast<void *>( - internal_mmap(NULL, N * sizeof(T), PROT_WRITE | PROT_READ, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, -1, 0)); - return (A == MAP_FAILED) ? nullptr : reinterpret_cast<T *>(A); -} -template <class T> static void deallocRaw(T *ptr, size_t N) { - // TODO: Report errors? - if (ptr != nullptr) - internal_munmap(ptr, N); +namespace { + +BufferQueue::ControlBlock *allocControlBlock(size_t Size, size_t Count) { + auto B = + allocateBuffer((sizeof(BufferQueue::ControlBlock) - 1) + (Size * Count)); + return B == nullptr ? nullptr + : reinterpret_cast<BufferQueue::ControlBlock *>(B); } -template <class T> static T *initArray(size_t N) { - auto A = allocRaw<T>(N); - if (A != nullptr) - while (N > 0) - new (A + (--N)) T(); - return A; +void deallocControlBlock(BufferQueue::ControlBlock *C, size_t Size, + size_t Count) { + deallocateBuffer(reinterpret_cast<unsigned char *>(C), + (sizeof(BufferQueue::ControlBlock) - 1) + (Size * Count)); } -BufferQueue::BufferQueue(size_t B, size_t N, bool &Success) - : BufferSize(B), Buffers(initArray<BufferQueue::BufferRep>(N)), - BufferCount(N), Finalizing{0}, OwnedBuffers(initArray<void *>(N)), - Next(Buffers), First(Buffers), LiveBuffers(0) { - if (Buffers == nullptr) { - Success = false; +void decRefCount(BufferQueue::ControlBlock *C, size_t Size, size_t Count) { + if (C == nullptr) return; - } - if (OwnedBuffers == nullptr) { - // Clean up the buffers we've already allocated. - for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B) - B->~BufferRep(); - deallocRaw(Buffers, N); - Success = false; + if (atomic_fetch_sub(&C->RefCount, 1, memory_order_acq_rel) == 1) + deallocControlBlock(C, Size, Count); +} + +void incRefCount(BufferQueue::ControlBlock *C) { + if (C == nullptr) return; + atomic_fetch_add(&C->RefCount, 1, memory_order_acq_rel); +} + +// We use a struct to ensure that we are allocating one atomic_uint64_t per +// cache line. This allows us to not worry about false-sharing among atomic +// objects being updated (constantly) by different threads. 
+struct ExtentsPadded { + union { + atomic_uint64_t Extents; + unsigned char Storage[kCacheLineSize]; }; +}; - for (size_t i = 0; i < N; ++i) { - auto &T = Buffers[i]; - void *Tmp = allocRaw<char>(BufferSize); - if (Tmp == nullptr) { - Success = false; +constexpr size_t kExtentsSize = sizeof(ExtentsPadded); + +} // namespace + +BufferQueue::ErrorCode BufferQueue::init(size_t BS, size_t BC) { + SpinMutexLock Guard(&Mutex); + + if (!finalizing()) + return BufferQueue::ErrorCode::AlreadyInitialized; + + cleanupBuffers(); + + bool Success = false; + BufferSize = BS; + BufferCount = BC; + + BackingStore = allocControlBlock(BufferSize, BufferCount); + if (BackingStore == nullptr) + return BufferQueue::ErrorCode::NotEnoughMemory; + + auto CleanupBackingStore = at_scope_exit([&, this] { + if (Success) return; - } - auto *Extents = allocRaw<BufferExtents>(1); - if (Extents == nullptr) { - Success = false; + deallocControlBlock(BackingStore, BufferSize, BufferCount); + BackingStore = nullptr; + }); + + // Initialize enough atomic_uint64_t instances, each + ExtentsBackingStore = allocControlBlock(kExtentsSize, BufferCount); + if (ExtentsBackingStore == nullptr) + return BufferQueue::ErrorCode::NotEnoughMemory; + + auto CleanupExtentsBackingStore = at_scope_exit([&, this] { + if (Success) return; - } + deallocControlBlock(ExtentsBackingStore, kExtentsSize, BufferCount); + ExtentsBackingStore = nullptr; + }); + + Buffers = initArray<BufferRep>(BufferCount); + if (Buffers == nullptr) + return BufferQueue::ErrorCode::NotEnoughMemory; + + // At this point we increment the generation number to associate the buffers + // to the new generation. + atomic_fetch_add(&Generation, 1, memory_order_acq_rel); + + // First, we initialize the refcount in the ControlBlock, which we treat as + // being at the start of the BackingStore pointer. + atomic_store(&BackingStore->RefCount, 1, memory_order_release); + atomic_store(&ExtentsBackingStore->RefCount, 1, memory_order_release); + + // Then we initialise the individual buffers that sub-divide the whole backing + // store. Each buffer will start at the `Data` member of the ControlBlock, and + // will be offsets from these locations. 
+ for (size_t i = 0; i < BufferCount; ++i) { + auto &T = Buffers[i]; auto &Buf = T.Buff; - Buf.Data = Tmp; - Buf.Size = B; - Buf.Extents = Extents; - OwnedBuffers[i] = Tmp; + auto *E = reinterpret_cast<ExtentsPadded *>(&ExtentsBackingStore->Data + + (kExtentsSize * i)); + Buf.Extents = &E->Extents; + atomic_store(Buf.Extents, 0, memory_order_release); + Buf.Generation = generation(); + Buf.Data = &BackingStore->Data + (BufferSize * i); + Buf.Size = BufferSize; + Buf.BackingStore = BackingStore; + Buf.ExtentsBackingStore = ExtentsBackingStore; + Buf.Count = BufferCount; + T.Used = false; } + + Next = Buffers; + First = Buffers; + LiveBuffers = 0; + atomic_store(&Finalizing, 0, memory_order_release); Success = true; + return BufferQueue::ErrorCode::Ok; +} + +BufferQueue::BufferQueue(size_t B, size_t N, + bool &Success) XRAY_NEVER_INSTRUMENT + : BufferSize(B), + BufferCount(N), + Mutex(), + Finalizing{1}, + BackingStore(nullptr), + ExtentsBackingStore(nullptr), + Buffers(nullptr), + Next(Buffers), + First(Buffers), + LiveBuffers(0), + Generation{0} { + Success = init(B, N) == BufferQueue::ErrorCode::Ok; } BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) { if (atomic_load(&Finalizing, memory_order_acquire)) return ErrorCode::QueueFinalizing; - SpinMutexLock Guard(&Mutex); - if (LiveBuffers == BufferCount) - return ErrorCode::NotEnoughMemory; - auto &T = *Next; - auto &B = T.Buff; - Buf = B; - T.Used = true; - ++LiveBuffers; - - if (++Next == (Buffers + BufferCount)) - Next = Buffers; + BufferRep *B = nullptr; + { + SpinMutexLock Guard(&Mutex); + if (LiveBuffers == BufferCount) + return ErrorCode::NotEnoughMemory; + B = Next++; + if (Next == (Buffers + BufferCount)) + Next = Buffers; + ++LiveBuffers; + } + incRefCount(BackingStore); + incRefCount(ExtentsBackingStore); + Buf = B->Buff; + Buf.Generation = generation(); + B->Used = true; return ErrorCode::Ok; } BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) { - // Blitz through the buffers array to find the buffer. - bool Found = false; - for (auto I = OwnedBuffers, E = OwnedBuffers + BufferCount; I != E; ++I) { - if (*I == Buf.Data) { - Found = true; - break; + // Check whether the buffer being referred to is within the bounds of the + // backing store's range. + BufferRep *B = nullptr; + { + SpinMutexLock Guard(&Mutex); + if (Buf.Generation != generation() || LiveBuffers == 0) { + Buf = {}; + decRefCount(Buf.BackingStore, Buf.Size, Buf.Count); + decRefCount(Buf.ExtentsBackingStore, kExtentsSize, Buf.Count); + return BufferQueue::ErrorCode::Ok; } - } - if (!Found) - return ErrorCode::UnrecognizedBuffer; - SpinMutexLock Guard(&Mutex); + if (Buf.Data < &BackingStore->Data || + Buf.Data > &BackingStore->Data + (BufferCount * BufferSize)) + return BufferQueue::ErrorCode::UnrecognizedBuffer; - // This points to a semantic bug, we really ought to not be releasing more - // buffers than we actually get. - if (LiveBuffers == 0) - return ErrorCode::NotEnoughMemory; + --LiveBuffers; + B = First++; + if (First == (Buffers + BufferCount)) + First = Buffers; + } // Now that the buffer has been released, we mark it as "used". 
- First->Buff = Buf; - First->Used = true; - Buf.Data = nullptr; - Buf.Size = 0; - --LiveBuffers; - if (++First == (Buffers + BufferCount)) - First = Buffers; - + B->Buff = Buf; + B->Used = true; + decRefCount(Buf.BackingStore, Buf.Size, Buf.Count); + decRefCount(Buf.ExtentsBackingStore, kExtentsSize, Buf.Count); + atomic_store(B->Buff.Extents, atomic_load(Buf.Extents, memory_order_acquire), + memory_order_release); + Buf = {}; return ErrorCode::Ok; } @@ -157,15 +222,17 @@ BufferQueue::ErrorCode BufferQueue::finalize() { return ErrorCode::Ok; } -BufferQueue::~BufferQueue() { - for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) { - auto &T = *I; - auto &Buf = T.Buff; - deallocRaw(Buf.Data, Buf.Size); - deallocRaw(Buf.Extents, 1); - } +void BufferQueue::cleanupBuffers() { for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B) B->~BufferRep(); - deallocRaw(Buffers, BufferCount); - deallocRaw(OwnedBuffers, BufferCount); + deallocateBuffer(Buffers, BufferCount); + decRefCount(BackingStore, BufferSize, BufferCount); + decRefCount(ExtentsBackingStore, kExtentsSize, BufferCount); + BackingStore = nullptr; + ExtentsBackingStore = nullptr; + Buffers = nullptr; + BufferCount = 0; + BufferSize = 0; } + +BufferQueue::~BufferQueue() { cleanupBuffers(); } diff --git a/contrib/compiler-rt/lib/xray/xray_buffer_queue.h b/contrib/compiler-rt/lib/xray/xray_buffer_queue.h index e76fa7983c90..ef2b433f9a3f 100644 --- a/contrib/compiler-rt/lib/xray/xray_buffer_queue.h +++ b/contrib/compiler-rt/lib/xray/xray_buffer_queue.h @@ -18,25 +18,51 @@ #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_mutex.h" +#include "xray_defs.h" #include <cstddef> +#include <cstdint> namespace __xray { /// BufferQueue implements a circular queue of fixed sized buffers (much like a -/// freelist) but is concerned mostly with making it really quick to initialise, -/// finalise, and get/return buffers to the queue. This is one key component of -/// the "flight data recorder" (FDR) mode to support ongoing XRay function call +/// freelist) but is concerned with making it quick to initialise, finalise, and +/// get from or return buffers to the queue. This is one key component of the +/// "flight data recorder" (FDR) mode to support ongoing XRay function call /// trace collection. class BufferQueue { public: - struct alignas(64) BufferExtents { - atomic_uint64_t Size; + /// ControlBlock represents the memory layout of how we interpret the backing + /// store for all buffers and extents managed by a BufferQueue instance. The + /// ControlBlock has the reference count as the first member, sized according + /// to platform-specific cache-line size. We never use the Buffer member of + /// the union, which is only there for compiler-supported alignment and + /// sizing. + /// + /// This ensures that the `Data` member will be placed at least kCacheLineSize + /// bytes from the beginning of the structure. + struct ControlBlock { + union { + atomic_uint64_t RefCount; + char Buffer[kCacheLineSize]; + }; + + /// We need to make this size 1, to conform to the C++ rules for array data + /// members. Typically, we want to subtract this 1 byte for sizing + /// information. 
+ char Data[1]; }; struct Buffer { + atomic_uint64_t *Extents = nullptr; + uint64_t Generation{0}; void *Data = nullptr; size_t Size = 0; - BufferExtents *Extents; + + private: + friend class BufferQueue; + ControlBlock *BackingStore = nullptr; + ControlBlock *ExtentsBackingStore = nullptr; + size_t Count = 0; }; struct BufferRep { @@ -76,8 +102,10 @@ private: T *operator->() const { return &(Buffers[Offset].Buff); } - Iterator(BufferRep *Root, size_t O, size_t M) - : Buffers(Root), Offset(O), Max(M) { + Iterator(BufferRep *Root, size_t O, size_t M) XRAY_NEVER_INSTRUMENT + : Buffers(Root), + Offset(O), + Max(M) { // We want to advance to the first Offset where the 'Used' property is // true, or to the end of the list/queue. while (!Buffers[Offset].Used && Offset != Max) { @@ -107,16 +135,20 @@ private: // Size of each individual Buffer. size_t BufferSize; - BufferRep *Buffers; - // Amount of pre-allocated buffers. size_t BufferCount; SpinMutex Mutex; atomic_uint8_t Finalizing; - // Pointers to buffers managed/owned by the BufferQueue. - void **OwnedBuffers; + // The collocated ControlBlock and buffer storage. + ControlBlock *BackingStore; + + // The collocated ControlBlock and extents storage. + ControlBlock *ExtentsBackingStore; + + // A dynamically allocated array of BufferRep instances. + BufferRep *Buffers; // Pointer to the next buffer to be handed out. BufferRep *Next; @@ -128,6 +160,13 @@ private: // Count of buffers that have been handed out through 'getBuffer'. size_t LiveBuffers; + // We use a generation number to identify buffers and which generation they're + // associated with. + atomic_uint64_t Generation; + + /// Releases references to the buffers backed by the current buffer queue. + void cleanupBuffers(); + public: enum class ErrorCode : unsigned { Ok, @@ -135,6 +174,7 @@ public: QueueFinalizing, UnrecognizedBuffer, AlreadyFinalized, + AlreadyInitialized, }; static const char *getErrorString(ErrorCode E) { @@ -149,6 +189,8 @@ public: return "buffer being returned not owned by buffer queue"; case ErrorCode::AlreadyFinalized: return "queue already finalized"; + case ErrorCode::AlreadyInitialized: + return "queue already initialized"; } return "unknown error"; } @@ -179,10 +221,23 @@ public: /// the buffer being released. ErrorCode releaseBuffer(Buffer &Buf); + /// Initializes the buffer queue, starting a new generation. We can re-set the + /// size of buffers with |BS| along with the buffer count with |BC|. + /// + /// Returns: + /// - ErrorCode::Ok when we successfully initialize the buffer. This + /// requires that the buffer queue is previously finalized. + /// - ErrorCode::AlreadyInitialized when the buffer queue is not finalized. + ErrorCode init(size_t BS, size_t BC); + bool finalizing() const { return atomic_load(&Finalizing, memory_order_acquire); } + uint64_t generation() const { + return atomic_load(&Generation, memory_order_acquire); + } + /// Returns the configured size of the buffers in the buffer queue. size_t ConfiguredBufferSize() const { return BufferSize; } @@ -198,7 +253,7 @@ public: /// Applies the provided function F to each Buffer in the queue, only if the /// Buffer is marked 'used' (i.e. has been the result of getBuffer(...) and a /// releaseBuffer(...) operation). 
- template <class F> void apply(F Fn) { + template <class F> void apply(F Fn) XRAY_NEVER_INSTRUMENT { SpinMutexLock G(&Mutex); for (auto I = begin(), E = end(); I != E; ++I) Fn(*I); diff --git a/contrib/compiler-rt/lib/xray/xray_defs.h b/contrib/compiler-rt/lib/xray/xray_defs.h index e5c37c0665db..c009bcc879f1 100644 --- a/contrib/compiler-rt/lib/xray/xray_defs.h +++ b/contrib/compiler-rt/lib/xray/xray_defs.h @@ -19,4 +19,14 @@ #define XRAY_NEVER_INSTRUMENT #endif +#if SANITIZER_NETBSD +// NetBSD: thread_local is not aligned properly, and the code relying +// on it segfaults +#define XRAY_TLS_ALIGNAS(x) +#define XRAY_HAS_TLS_ALIGNAS 0 +#else +#define XRAY_TLS_ALIGNAS(x) alignas(x) +#define XRAY_HAS_TLS_ALIGNAS 1 +#endif + #endif // XRAY_XRAY_DEFS_H diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_controller.h b/contrib/compiler-rt/lib/xray/xray_fdr_controller.h new file mode 100644 index 000000000000..d44d0309b373 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_fdr_controller.h @@ -0,0 +1,373 @@ +//===-- xray_fdr_controller.h ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// +#ifndef COMPILER_RT_LIB_XRAY_XRAY_FDR_CONTROLLER_H_ +#define COMPILER_RT_LIB_XRAY_XRAY_FDR_CONTROLLER_H_ + +#include <limits> +#include <time.h> + +#include "xray/xray_interface.h" +#include "xray/xray_records.h" +#include "xray_buffer_queue.h" +#include "xray_fdr_log_writer.h" + +namespace __xray { + +template <size_t Version = 5> class FDRController { + BufferQueue *BQ; + BufferQueue::Buffer &B; + FDRLogWriter &W; + int (*WallClockReader)(clockid_t, struct timespec *) = 0; + uint64_t CycleThreshold = 0; + + uint64_t LastFunctionEntryTSC = 0; + uint64_t LatestTSC = 0; + uint16_t LatestCPU = 0; + tid_t TId = 0; + pid_t PId = 0; + bool First = true; + + uint32_t UndoableFunctionEnters = 0; + uint32_t UndoableTailExits = 0; + + bool finalized() const XRAY_NEVER_INSTRUMENT { + return BQ == nullptr || BQ->finalizing(); + } + + bool hasSpace(size_t S) XRAY_NEVER_INSTRUMENT { + return B.Data != nullptr && B.Generation == BQ->generation() && + W.getNextRecord() + S <= reinterpret_cast<char *>(B.Data) + B.Size; + } + + constexpr int32_t mask(int32_t FuncId) const XRAY_NEVER_INSTRUMENT { + return FuncId & ((1 << 29) - 1); + } + + bool getNewBuffer() XRAY_NEVER_INSTRUMENT { + if (BQ->getBuffer(B) != BufferQueue::ErrorCode::Ok) + return false; + + W.resetRecord(); + DCHECK_EQ(W.getNextRecord(), B.Data); + LatestTSC = 0; + LatestCPU = 0; + First = true; + UndoableFunctionEnters = 0; + UndoableTailExits = 0; + atomic_store(B.Extents, 0, memory_order_release); + return true; + } + + bool setupNewBuffer() XRAY_NEVER_INSTRUMENT { + if (finalized()) + return false; + + DCHECK(hasSpace(sizeof(MetadataRecord) * 3)); + TId = GetTid(); + PId = internal_getpid(); + struct timespec TS { + 0, 0 + }; + WallClockReader(CLOCK_MONOTONIC, &TS); + + MetadataRecord Metadata[] = { + // Write out a MetadataRecord to signify that this is the start of a new + // buffer, associated with a particular thread, with a new CPU. For the + // data, we have 15 bytes to squeeze as much information as we can. 
At + // this point we only write down the following bytes: + // - Thread ID (tid_t, cast to 4 bytes type due to Darwin being 8 + // bytes) + createMetadataRecord<MetadataRecord::RecordKinds::NewBuffer>( + static_cast<int32_t>(TId)), + + // Also write the WalltimeMarker record. We only really need microsecond + // precision here, and enforce across platforms that we need 64-bit + // seconds and 32-bit microseconds encoded in the Metadata record. + createMetadataRecord<MetadataRecord::RecordKinds::WalltimeMarker>( + static_cast<int64_t>(TS.tv_sec), + static_cast<int32_t>(TS.tv_nsec / 1000)), + + // Also write the Pid record. + createMetadataRecord<MetadataRecord::RecordKinds::Pid>( + static_cast<int32_t>(PId)), + }; + + if (finalized()) + return false; + return W.writeMetadataRecords(Metadata); + } + + bool prepareBuffer(size_t S) XRAY_NEVER_INSTRUMENT { + if (finalized()) + return returnBuffer(); + + if (UNLIKELY(!hasSpace(S))) { + if (!returnBuffer()) + return false; + if (!getNewBuffer()) + return false; + if (!setupNewBuffer()) + return false; + } + + if (First) { + First = false; + W.resetRecord(); + atomic_store(B.Extents, 0, memory_order_release); + return setupNewBuffer(); + } + + return true; + } + + bool returnBuffer() XRAY_NEVER_INSTRUMENT { + if (BQ == nullptr) + return false; + + First = true; + if (finalized()) { + BQ->releaseBuffer(B); // ignore result. + return false; + } + + return BQ->releaseBuffer(B) == BufferQueue::ErrorCode::Ok; + } + + enum class PreambleResult { NoChange, WroteMetadata, InvalidBuffer }; + PreambleResult recordPreamble(uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { + if (UNLIKELY(LatestCPU != CPU || LatestTSC == 0)) { + // We update our internal tracking state for the Latest TSC and CPU we've + // seen, then write out the appropriate metadata and function records. + LatestTSC = TSC; + LatestCPU = CPU; + + if (B.Generation != BQ->generation()) + return PreambleResult::InvalidBuffer; + + W.writeMetadata<MetadataRecord::RecordKinds::NewCPUId>(CPU, TSC); + return PreambleResult::WroteMetadata; + } + + DCHECK_EQ(LatestCPU, CPU); + + if (UNLIKELY(LatestTSC > TSC || + TSC - LatestTSC > + uint64_t{std::numeric_limits<int32_t>::max()})) { + // Either the TSC has wrapped around from the last TSC we've seen or the + // delta is too large to fit in a 32-bit signed integer, so we write a + // wrap-around record. + LatestTSC = TSC; + + if (B.Generation != BQ->generation()) + return PreambleResult::InvalidBuffer; + + W.writeMetadata<MetadataRecord::RecordKinds::TSCWrap>(TSC); + return PreambleResult::WroteMetadata; + } + + return PreambleResult::NoChange; + } + + bool rewindRecords(int32_t FuncId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { + // Undo one enter record, because at this point we are either at the state + // of: + // - We are exiting a function that we recently entered. + // - We are exiting a function that was the result of a sequence of tail + // exits, and we can check whether the tail exits can be re-wound. 
+ // + FunctionRecord F; + W.undoWrites(sizeof(FunctionRecord)); + if (B.Generation != BQ->generation()) + return false; + internal_memcpy(&F, W.getNextRecord(), sizeof(FunctionRecord)); + + DCHECK(F.RecordKind == + uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && + "Expected to find function entry recording when rewinding."); + DCHECK_EQ(F.FuncId, FuncId & ~(0x0F << 28)); + + LatestTSC -= F.TSCDelta; + if (--UndoableFunctionEnters != 0) { + LastFunctionEntryTSC -= F.TSCDelta; + return true; + } + + LastFunctionEntryTSC = 0; + auto RewindingTSC = LatestTSC; + auto RewindingRecordPtr = W.getNextRecord() - sizeof(FunctionRecord); + while (UndoableTailExits) { + if (B.Generation != BQ->generation()) + return false; + internal_memcpy(&F, RewindingRecordPtr, sizeof(FunctionRecord)); + DCHECK_EQ(F.RecordKind, + uint8_t(FunctionRecord::RecordKinds::FunctionTailExit)); + RewindingTSC -= F.TSCDelta; + RewindingRecordPtr -= sizeof(FunctionRecord); + if (B.Generation != BQ->generation()) + return false; + internal_memcpy(&F, RewindingRecordPtr, sizeof(FunctionRecord)); + + // This tail call exceeded the threshold duration. It will not be erased. + if ((TSC - RewindingTSC) >= CycleThreshold) { + UndoableTailExits = 0; + return true; + } + + --UndoableTailExits; + W.undoWrites(sizeof(FunctionRecord) * 2); + LatestTSC = RewindingTSC; + } + return true; + } + +public: + template <class WallClockFunc> + FDRController(BufferQueue *BQ, BufferQueue::Buffer &B, FDRLogWriter &W, + WallClockFunc R, uint64_t C) XRAY_NEVER_INSTRUMENT + : BQ(BQ), + B(B), + W(W), + WallClockReader(R), + CycleThreshold(C) {} + + bool functionEnter(int32_t FuncId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { + if (finalized() || + !prepareBuffer(sizeof(MetadataRecord) + sizeof(FunctionRecord))) + return returnBuffer(); + + auto PreambleStatus = recordPreamble(TSC, CPU); + if (PreambleStatus == PreambleResult::InvalidBuffer) + return returnBuffer(); + + if (PreambleStatus == PreambleResult::WroteMetadata) { + UndoableFunctionEnters = 1; + UndoableTailExits = 0; + } else { + ++UndoableFunctionEnters; + } + + auto Delta = TSC - LatestTSC; + LastFunctionEntryTSC = TSC; + LatestTSC = TSC; + return W.writeFunction(FDRLogWriter::FunctionRecordKind::Enter, + mask(FuncId), Delta); + } + + bool functionTailExit(int32_t FuncId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { + if (finalized()) + return returnBuffer(); + + if (!prepareBuffer(sizeof(MetadataRecord) + sizeof(FunctionRecord))) + return returnBuffer(); + + auto PreambleStatus = recordPreamble(TSC, CPU); + if (PreambleStatus == PreambleResult::InvalidBuffer) + return returnBuffer(); + + if (PreambleStatus == PreambleResult::NoChange && + UndoableFunctionEnters != 0 && + TSC - LastFunctionEntryTSC < CycleThreshold) + return rewindRecords(FuncId, TSC, CPU); + + UndoableTailExits = UndoableFunctionEnters ? 
UndoableTailExits + 1 : 0; + UndoableFunctionEnters = 0; + auto Delta = TSC - LatestTSC; + LatestTSC = TSC; + return W.writeFunction(FDRLogWriter::FunctionRecordKind::TailExit, + mask(FuncId), Delta); + } + + bool functionEnterArg(int32_t FuncId, uint64_t TSC, uint16_t CPU, + uint64_t Arg) XRAY_NEVER_INSTRUMENT { + if (finalized() || + !prepareBuffer((2 * sizeof(MetadataRecord)) + sizeof(FunctionRecord)) || + recordPreamble(TSC, CPU) == PreambleResult::InvalidBuffer) + return returnBuffer(); + + auto Delta = TSC - LatestTSC; + LatestTSC = TSC; + LastFunctionEntryTSC = 0; + UndoableFunctionEnters = 0; + UndoableTailExits = 0; + + return W.writeFunctionWithArg(FDRLogWriter::FunctionRecordKind::EnterArg, + mask(FuncId), Delta, Arg); + } + + bool functionExit(int32_t FuncId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { + if (finalized() || + !prepareBuffer(sizeof(MetadataRecord) + sizeof(FunctionRecord))) + return returnBuffer(); + + auto PreambleStatus = recordPreamble(TSC, CPU); + if (PreambleStatus == PreambleResult::InvalidBuffer) + return returnBuffer(); + + if (PreambleStatus == PreambleResult::NoChange && + UndoableFunctionEnters != 0 && + TSC - LastFunctionEntryTSC < CycleThreshold) + return rewindRecords(FuncId, TSC, CPU); + + auto Delta = TSC - LatestTSC; + LatestTSC = TSC; + UndoableFunctionEnters = 0; + UndoableTailExits = 0; + return W.writeFunction(FDRLogWriter::FunctionRecordKind::Exit, mask(FuncId), + Delta); + } + + bool customEvent(uint64_t TSC, uint16_t CPU, const void *Event, + int32_t EventSize) XRAY_NEVER_INSTRUMENT { + if (finalized() || + !prepareBuffer((2 * sizeof(MetadataRecord)) + EventSize) || + recordPreamble(TSC, CPU) == PreambleResult::InvalidBuffer) + return returnBuffer(); + + auto Delta = TSC - LatestTSC; + LatestTSC = TSC; + UndoableFunctionEnters = 0; + UndoableTailExits = 0; + return W.writeCustomEvent(Delta, Event, EventSize); + } + + bool typedEvent(uint64_t TSC, uint16_t CPU, uint16_t EventType, + const void *Event, int32_t EventSize) XRAY_NEVER_INSTRUMENT { + if (finalized() || + !prepareBuffer((2 * sizeof(MetadataRecord)) + EventSize) || + recordPreamble(TSC, CPU) == PreambleResult::InvalidBuffer) + return returnBuffer(); + + auto Delta = TSC - LatestTSC; + LatestTSC = TSC; + UndoableFunctionEnters = 0; + UndoableTailExits = 0; + return W.writeTypedEvent(Delta, EventType, Event, EventSize); + } + + bool flush() XRAY_NEVER_INSTRUMENT { + if (finalized()) { + returnBuffer(); // ignore result. 
+ return true; + } + return returnBuffer(); + } +}; + +} // namespace __xray + +#endif // COMPILER-RT_LIB_XRAY_XRAY_FDR_CONTROLLER_H_ diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h b/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h index 87096d4fc29e..e7b1ee562e1b 100644 --- a/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h +++ b/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h @@ -12,6 +12,9 @@ //===----------------------------------------------------------------------===// #ifndef XRAY_XRAY_FDR_LOG_RECORDS_H #define XRAY_XRAY_FDR_LOG_RECORDS_H +#include <cstdint> + +namespace __xray { enum class RecordType : uint8_t { Function, Metadata }; @@ -68,4 +71,6 @@ struct alignas(8) FunctionRecord { static_assert(sizeof(FunctionRecord) == 8, "Wrong size for FunctionRecord."); +} // namespace __xray + #endif // XRAY_XRAY_FDR_LOG_RECORDS_H diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_log_writer.h b/contrib/compiler-rt/lib/xray/xray_fdr_log_writer.h new file mode 100644 index 000000000000..7712e1377763 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_fdr_log_writer.h @@ -0,0 +1,232 @@ +//===-- xray_fdr_log_writer.h ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// +#ifndef COMPILER_RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_ +#define COMPILER_RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_ + +#include "xray_buffer_queue.h" +#include "xray_fdr_log_records.h" +#include <functional> +#include <tuple> +#include <type_traits> +#include <utility> + +namespace __xray { + +template <size_t Index> struct SerializerImpl { + template <class Tuple, + typename std::enable_if< + Index<std::tuple_size< + typename std::remove_reference<Tuple>::type>::value, + int>::type = 0> static void serializeTo(char *Buffer, + Tuple &&T) { + auto P = reinterpret_cast<const char *>(&std::get<Index>(T)); + constexpr auto Size = sizeof(std::get<Index>(T)); + internal_memcpy(Buffer, P, Size); + SerializerImpl<Index + 1>::serializeTo(Buffer + Size, + std::forward<Tuple>(T)); + } + + template <class Tuple, + typename std::enable_if< + Index >= std::tuple_size<typename std::remove_reference< + Tuple>::type>::value, + int>::type = 0> + static void serializeTo(char *, Tuple &&) {} +}; + +using Serializer = SerializerImpl<0>; + +template <class Tuple, size_t Index> struct AggregateSizesImpl { + static constexpr size_t value = + sizeof(typename std::tuple_element<Index, Tuple>::type) + + AggregateSizesImpl<Tuple, Index - 1>::value; +}; + +template <class Tuple> struct AggregateSizesImpl<Tuple, 0> { + static constexpr size_t value = + sizeof(typename std::tuple_element<0, Tuple>::type); +}; + +template <class Tuple> struct AggregateSizes { + static constexpr size_t value = + AggregateSizesImpl<Tuple, std::tuple_size<Tuple>::value - 1>::value; +}; + +template <MetadataRecord::RecordKinds Kind, class... DataTypes> +MetadataRecord createMetadataRecord(DataTypes &&... 
Ds) { + static_assert(AggregateSizes<std::tuple<DataTypes...>>::value <= + sizeof(MetadataRecord) - 1, + "Metadata payload longer than metadata buffer!"); + MetadataRecord R; + R.Type = 1; + R.RecordKind = static_cast<uint8_t>(Kind); + Serializer::serializeTo(R.Data, + std::make_tuple(std::forward<DataTypes>(Ds)...)); + return R; +} + +class FDRLogWriter { + BufferQueue::Buffer &Buffer; + char *NextRecord = nullptr; + + template <class T> void writeRecord(const T &R) { + internal_memcpy(NextRecord, reinterpret_cast<const char *>(&R), sizeof(T)); + NextRecord += sizeof(T); + // We need this atomic fence here to ensure that other threads attempting to + // read the bytes in the buffer will see the writes committed before the + // extents are updated. + atomic_thread_fence(memory_order_release); + atomic_fetch_add(Buffer.Extents, sizeof(T), memory_order_acq_rel); + } + +public: + explicit FDRLogWriter(BufferQueue::Buffer &B, char *P) + : Buffer(B), NextRecord(P) { + DCHECK_NE(Buffer.Data, nullptr); + DCHECK_NE(NextRecord, nullptr); + } + + explicit FDRLogWriter(BufferQueue::Buffer &B) + : FDRLogWriter(B, static_cast<char *>(B.Data)) {} + + template <MetadataRecord::RecordKinds Kind, class... Data> + bool writeMetadata(Data &&... Ds) { + // TODO: Check boundary conditions: + // 1) Buffer is full, and cannot handle one metadata record. + // 2) Buffer queue is finalising. + writeRecord(createMetadataRecord<Kind>(std::forward<Data>(Ds)...)); + return true; + } + + template <size_t N> size_t writeMetadataRecords(MetadataRecord (&Recs)[N]) { + constexpr auto Size = sizeof(MetadataRecord) * N; + internal_memcpy(NextRecord, reinterpret_cast<const char *>(Recs), Size); + NextRecord += Size; + // We need this atomic fence here to ensure that other threads attempting to + // read the bytes in the buffer will see the writes committed before the + // extents are updated. + atomic_thread_fence(memory_order_release); + atomic_fetch_add(Buffer.Extents, Size, memory_order_acq_rel); + return Size; + } + + enum class FunctionRecordKind : uint8_t { + Enter = 0x00, + Exit = 0x01, + TailExit = 0x02, + EnterArg = 0x03, + }; + + bool writeFunction(FunctionRecordKind Kind, int32_t FuncId, int32_t Delta) { + FunctionRecord R; + R.Type = 0; + R.RecordKind = uint8_t(Kind); + R.FuncId = FuncId; + R.TSCDelta = Delta; + writeRecord(R); + return true; + } + + bool writeFunctionWithArg(FunctionRecordKind Kind, int32_t FuncId, + int32_t Delta, uint64_t Arg) { + // We need to write the function with arg into the buffer, and then + // atomically update the buffer extents. This ensures that any reads + // synchronised on the buffer extents record will always see the writes + // that happen before the atomic update. + FunctionRecord R; + R.Type = 0; + R.RecordKind = uint8_t(Kind); + R.FuncId = FuncId; + R.TSCDelta = Delta; + MetadataRecord A = + createMetadataRecord<MetadataRecord::RecordKinds::CallArgument>(Arg); + NextRecord = reinterpret_cast<char *>(internal_memcpy( + NextRecord, reinterpret_cast<char *>(&R), sizeof(R))) + + sizeof(R); + NextRecord = reinterpret_cast<char *>(internal_memcpy( + NextRecord, reinterpret_cast<char *>(&A), sizeof(A))) + + sizeof(A); + // We need this atomic fence here to ensure that other threads attempting to + // read the bytes in the buffer will see the writes committed before the + // extents are updated. 
+ atomic_thread_fence(memory_order_release); + atomic_fetch_add(Buffer.Extents, sizeof(R) + sizeof(A), + memory_order_acq_rel); + return true; + } + + bool writeCustomEvent(int32_t Delta, const void *Event, int32_t EventSize) { + // We write the metadata record and the custom event data into the buffer + // first, before we atomically update the extents for the buffer. This + // allows us to ensure that any threads reading the extents of the buffer + // will only ever see the full metadata and custom event payload accounted + // (no partial writes accounted). + MetadataRecord R = + createMetadataRecord<MetadataRecord::RecordKinds::CustomEventMarker>( + EventSize, Delta); + NextRecord = reinterpret_cast<char *>(internal_memcpy( + NextRecord, reinterpret_cast<char *>(&R), sizeof(R))) + + sizeof(R); + NextRecord = reinterpret_cast<char *>( + internal_memcpy(NextRecord, Event, EventSize)) + + EventSize; + + // We need this atomic fence here to ensure that other threads attempting to + // read the bytes in the buffer will see the writes committed before the + // extents are updated. + atomic_thread_fence(memory_order_release); + atomic_fetch_add(Buffer.Extents, sizeof(R) + EventSize, + memory_order_acq_rel); + return true; + } + + bool writeTypedEvent(int32_t Delta, uint16_t EventType, const void *Event, + int32_t EventSize) { + // We do something similar when writing out typed events, see + // writeCustomEvent(...) above for details. + MetadataRecord R = + createMetadataRecord<MetadataRecord::RecordKinds::TypedEventMarker>( + EventSize, Delta, EventType); + NextRecord = reinterpret_cast<char *>(internal_memcpy( + NextRecord, reinterpret_cast<char *>(&R), sizeof(R))) + + sizeof(R); + NextRecord = reinterpret_cast<char *>( + internal_memcpy(NextRecord, Event, EventSize)) + + EventSize; + + // We need this atomic fence here to ensure that other threads attempting to + // read the bytes in the buffer will see the writes committed before the + // extents are updated. 
+ atomic_thread_fence(memory_order_release); + atomic_fetch_add(Buffer.Extents, EventSize, memory_order_acq_rel); + return true; + } + + char *getNextRecord() const { return NextRecord; } + + void resetRecord() { + NextRecord = reinterpret_cast<char *>(Buffer.Data); + atomic_store(Buffer.Extents, 0, memory_order_release); + } + + void undoWrites(size_t B) { + DCHECK_GE(NextRecord - B, reinterpret_cast<char *>(Buffer.Data)); + NextRecord -= B; + atomic_fetch_sub(Buffer.Extents, B, memory_order_acq_rel); + } + +}; // namespace __xray + +} // namespace __xray + +#endif // COMPILER-RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_ diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc b/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc index 6cb2dfa0c658..1eda26df7a85 100644 --- a/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc +++ b/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc @@ -20,7 +20,6 @@ #include <limits> #include <memory> #include <pthread.h> -#include <sys/syscall.h> #include <sys/time.h> #include <time.h> #include <unistd.h> @@ -30,9 +29,12 @@ #include "sanitizer_common/sanitizer_common.h" #include "xray/xray_interface.h" #include "xray/xray_records.h" +#include "xray_allocator.h" #include "xray_buffer_queue.h" #include "xray_defs.h" +#include "xray_fdr_controller.h" #include "xray_fdr_flags.h" +#include "xray_fdr_log_writer.h" #include "xray_flags.h" #include "xray_recursion_guard.h" #include "xray_tsc.h" @@ -40,55 +42,53 @@ namespace __xray { -atomic_sint32_t LoggingStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; +static atomic_sint32_t LoggingStatus = { + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; + +namespace { // Group together thread-local-data in a struct, then hide it behind a function // call so that it can be initialized on first use instead of as a global. We // force the alignment to 64-bytes for x86 cache line alignment, as this // structure is used in the hot path of implementation. -struct alignas(64) ThreadLocalData { - BufferQueue::Buffer Buffer; - char *RecordPtr = nullptr; - // The number of FunctionEntry records immediately preceding RecordPtr. - uint8_t NumConsecutiveFnEnters = 0; - - // The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit - // records preceding RecordPtr. - uint8_t NumTailCalls = 0; - - // We use a thread_local variable to keep track of which CPUs we've already - // run, and the TSC times for these CPUs. This allows us to stop repeating the - // CPU field in the function records. - // - // We assume that we'll support only 65536 CPUs for x86_64. - uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max(); - uint64_t LastTSC = 0; - uint64_t LastFunctionEntryTSC = 0; - - // Make sure a thread that's ever called handleArg0 has a thread-local - // live reference to the buffer queue for this particular instance of - // FDRLogging, and that we're going to clean it up when the thread exits. 
+struct XRAY_TLS_ALIGNAS(64) ThreadLocalData { + BufferQueue::Buffer Buffer{}; BufferQueue *BQ = nullptr; + + using LogWriterStorage = + typename std::aligned_storage<sizeof(FDRLogWriter), + alignof(FDRLogWriter)>::type; + + LogWriterStorage LWStorage; + FDRLogWriter *Writer = nullptr; + + using ControllerStorage = + typename std::aligned_storage<sizeof(FDRController<>), + alignof(FDRController<>)>::type; + ControllerStorage CStorage; + FDRController<> *Controller = nullptr; }; +} // namespace + static_assert(std::is_trivially_destructible<ThreadLocalData>::value, "ThreadLocalData must be trivially destructible"); -static constexpr auto MetadataRecSize = sizeof(MetadataRecord); -static constexpr auto FunctionRecSize = sizeof(FunctionRecord); - // Use a global pthread key to identify thread-local data for logging. static pthread_key_t Key; // Global BufferQueue. +static std::aligned_storage<sizeof(BufferQueue)>::type BufferQueueStorage; static BufferQueue *BQ = nullptr; -static atomic_sint32_t LogFlushStatus = { - XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; +// Global thresholds for function durations. +static atomic_uint64_t ThresholdTicks{0}; -static FDRLoggingOptions FDROptions; +// Global for ticks per second. +static atomic_uint64_t TicksPerSec{0}; -static SpinMutex FDROptionsMutex; +static atomic_sint32_t LogFlushStatus = { + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; // This function will initialize the thread-local data structure used by the FDR // logging implementation and return a reference to it. The implementation @@ -124,8 +124,10 @@ static SpinMutex FDROptionsMutex; // critical section, calling a function that might be XRay instrumented (and // thus in turn calling into malloc by virtue of registration of the // thread_local's destructor). +#if XRAY_HAS_TLS_ALIGNAS static_assert(alignof(ThreadLocalData) >= 64, "ThreadLocalData must be cache line aligned."); +#endif static ThreadLocalData &getThreadLocalData() { thread_local typename std::aligned_storage< sizeof(ThreadLocalData), alignof(ThreadLocalData)>::type TLDStorage{}; @@ -138,559 +140,36 @@ static ThreadLocalData &getThreadLocalData() { return *reinterpret_cast<ThreadLocalData *>(&TLDStorage); } -static void writeNewBufferPreamble(tid_t Tid, timespec TS, - pid_t Pid) XRAY_NEVER_INSTRUMENT { - static constexpr int InitRecordsCount = 3; - auto &TLD = getThreadLocalData(); - MetadataRecord Metadata[InitRecordsCount]; - { - // Write out a MetadataRecord to signify that this is the start of a new - // buffer, associated with a particular thread, with a new CPU. For the - // data, we have 15 bytes to squeeze as much information as we can. At this - // point we only write down the following bytes: - // - Thread ID (tid_t, cast to 4 bytes type due to Darwin being 8 bytes) - auto &NewBuffer = Metadata[0]; - NewBuffer.Type = uint8_t(RecordType::Metadata); - NewBuffer.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewBuffer); - int32_t tid = static_cast<int32_t>(Tid); - internal_memcpy(&NewBuffer.Data, &tid, sizeof(tid)); - } - - // Also write the WalltimeMarker record. - { - static_assert(sizeof(time_t) <= 8, "time_t needs to be at most 8 bytes"); - auto &WalltimeMarker = Metadata[1]; - WalltimeMarker.Type = uint8_t(RecordType::Metadata); - WalltimeMarker.RecordKind = - uint8_t(MetadataRecord::RecordKinds::WalltimeMarker); - - // We only really need microsecond precision here, and enforce across - // platforms that we need 64-bit seconds and 32-bit microseconds encoded in - // the Metadata record. 
- int32_t Micros = TS.tv_nsec / 1000; - int64_t Seconds = TS.tv_sec; - internal_memcpy(WalltimeMarker.Data, &Seconds, sizeof(Seconds)); - internal_memcpy(WalltimeMarker.Data + sizeof(Seconds), &Micros, - sizeof(Micros)); - } - - // Also write the Pid record. - { - // Write out a MetadataRecord that contains the current pid - auto &PidMetadata = Metadata[2]; - PidMetadata.Type = uint8_t(RecordType::Metadata); - PidMetadata.RecordKind = uint8_t(MetadataRecord::RecordKinds::Pid); - int32_t pid = static_cast<int32_t>(Pid); - internal_memcpy(&PidMetadata.Data, &pid, sizeof(pid)); - } - - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - if (TLD.BQ == nullptr || TLD.BQ->finalizing()) - return; - internal_memcpy(TLD.RecordPtr, Metadata, sizeof(Metadata)); - TLD.RecordPtr += sizeof(Metadata); - // Since we write out the extents as the first metadata record of the - // buffer, we need to write out the extents including the extents record. - atomic_store(&TLD.Buffer.Extents->Size, sizeof(Metadata), - memory_order_release); -} - -static void setupNewBuffer(int (*wall_clock_reader)( - clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - auto &B = TLD.Buffer; - TLD.RecordPtr = static_cast<char *>(B.Data); - tid_t Tid = GetTid(); - timespec TS{0, 0}; - pid_t Pid = internal_getpid(); - // This is typically clock_gettime, but callers have injection ability. - wall_clock_reader(CLOCK_MONOTONIC, &TS); - writeNewBufferPreamble(Tid, TS, Pid); - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; -} - -static void incrementExtents(size_t Add) { - auto &TLD = getThreadLocalData(); - atomic_fetch_add(&TLD.Buffer.Extents->Size, Add, memory_order_acq_rel); -} - -static void decrementExtents(size_t Subtract) { - auto &TLD = getThreadLocalData(); - atomic_fetch_sub(&TLD.Buffer.Extents->Size, Subtract, memory_order_acq_rel); -} - -static void writeNewCPUIdMetadata(uint16_t CPU, - uint64_t TSC) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - MetadataRecord NewCPUId; - NewCPUId.Type = uint8_t(RecordType::Metadata); - NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId); - - // The data for the New CPU will contain the following bytes: - // - CPU ID (uint16_t, 2 bytes) - // - Full TSC (uint64_t, 8 bytes) - // Total = 10 bytes. - internal_memcpy(&NewCPUId.Data, &CPU, sizeof(CPU)); - internal_memcpy(&NewCPUId.Data[sizeof(CPU)], &TSC, sizeof(TSC)); - internal_memcpy(TLD.RecordPtr, &NewCPUId, sizeof(MetadataRecord)); - TLD.RecordPtr += sizeof(MetadataRecord); - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - incrementExtents(sizeof(MetadataRecord)); -} - -static void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - MetadataRecord TSCWrap; - TSCWrap.Type = uint8_t(RecordType::Metadata); - TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap); - - // The data for the TSCWrap record contains the following bytes: - // - Full TSC (uint64_t, 8 bytes) - // Total = 8 bytes. - internal_memcpy(&TSCWrap.Data, &TSC, sizeof(TSC)); - internal_memcpy(TLD.RecordPtr, &TSCWrap, sizeof(MetadataRecord)); - TLD.RecordPtr += sizeof(MetadataRecord); - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - incrementExtents(sizeof(MetadataRecord)); -} - -// Call Argument metadata records store the arguments to a function in the -// order of their appearance; holes are not supported by the buffer format. 
-static void writeCallArgumentMetadata(uint64_t A) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - MetadataRecord CallArg; - CallArg.Type = uint8_t(RecordType::Metadata); - CallArg.RecordKind = uint8_t(MetadataRecord::RecordKinds::CallArgument); - - internal_memcpy(CallArg.Data, &A, sizeof(A)); - internal_memcpy(TLD.RecordPtr, &CallArg, sizeof(MetadataRecord)); - TLD.RecordPtr += sizeof(MetadataRecord); - incrementExtents(sizeof(MetadataRecord)); -} - -static void writeFunctionRecord(int FuncId, uint32_t TSCDelta, - XRayEntryType EntryType) XRAY_NEVER_INSTRUMENT { - FunctionRecord FuncRecord; - FuncRecord.Type = uint8_t(RecordType::Function); - // Only take 28 bits of the function id. - FuncRecord.FuncId = FuncId & ~(0x0F << 28); - FuncRecord.TSCDelta = TSCDelta; - - auto &TLD = getThreadLocalData(); - switch (EntryType) { - case XRayEntryType::ENTRY: - ++TLD.NumConsecutiveFnEnters; - FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); - break; - case XRayEntryType::LOG_ARGS_ENTRY: - // We should not rewind functions with logged args. - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); - break; - case XRayEntryType::EXIT: - // If we've decided to log the function exit, we will never erase the log - // before it. - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit); - break; - case XRayEntryType::TAIL: - // If we just entered the function we're tail exiting from or erased every - // invocation since then, this function entry tail pair is a candidate to - // be erased when the child function exits. - if (TLD.NumConsecutiveFnEnters > 0) { - ++TLD.NumTailCalls; - TLD.NumConsecutiveFnEnters = 0; - } else { - // We will never be able to erase this tail call since we have logged - // something in between the function entry and tail exit. - TLD.NumTailCalls = 0; - TLD.NumConsecutiveFnEnters = 0; - } - FuncRecord.RecordKind = - uint8_t(FunctionRecord::RecordKinds::FunctionTailExit); - break; - case XRayEntryType::CUSTOM_EVENT: { - // This is a bug in patching, so we'll report it once and move on. - static atomic_uint8_t ErrorLatch{0}; - if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel)) - Report("Internal error: patched an XRay custom event call as a function; " - "func id = %d\n", - FuncId); - return; - } - case XRayEntryType::TYPED_EVENT: { - static atomic_uint8_t ErrorLatch{0}; - if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel)) - Report("Internal error: patched an XRay typed event call as a function; " - "func id = %d\n", - FuncId); - return; - } - } - - internal_memcpy(TLD.RecordPtr, &FuncRecord, sizeof(FunctionRecord)); - TLD.RecordPtr += sizeof(FunctionRecord); - incrementExtents(sizeof(FunctionRecord)); -} - -static atomic_uint64_t TicksPerSec{0}; -static atomic_uint64_t ThresholdTicks{0}; - -// Re-point the thread local pointer into this thread's Buffer before the recent -// "Function Entry" record and any "Tail Call Exit" records after that. 
-static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC, - uint64_t &LastFunctionEntryTSC, int32_t FuncId) { - auto &TLD = getThreadLocalData(); - TLD.RecordPtr -= FunctionRecSize; - decrementExtents(FunctionRecSize); - FunctionRecord FuncRecord; - internal_memcpy(&FuncRecord, TLD.RecordPtr, FunctionRecSize); - DCHECK(FuncRecord.RecordKind == - uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && - "Expected to find function entry recording when rewinding."); - DCHECK(FuncRecord.FuncId == (FuncId & ~(0x0F << 28)) && - "Expected matching function id when rewinding Exit"); - --TLD.NumConsecutiveFnEnters; - LastTSC -= FuncRecord.TSCDelta; - - // We unwound one call. Update the state and return without writing a log. - if (TLD.NumConsecutiveFnEnters != 0) { - LastFunctionEntryTSC -= FuncRecord.TSCDelta; - return; - } - - // Otherwise we've rewound the stack of all function entries, we might be - // able to rewind further by erasing tail call functions that are being - // exited from via this exit. - LastFunctionEntryTSC = 0; - auto RewindingTSC = LastTSC; - auto RewindingRecordPtr = TLD.RecordPtr - FunctionRecSize; - while (TLD.NumTailCalls > 0) { - // Rewind the TSC back over the TAIL EXIT record. - FunctionRecord ExpectedTailExit; - internal_memcpy(&ExpectedTailExit, RewindingRecordPtr, FunctionRecSize); - - DCHECK(ExpectedTailExit.RecordKind == - uint8_t(FunctionRecord::RecordKinds::FunctionTailExit) && - "Expected to find tail exit when rewinding."); - RewindingRecordPtr -= FunctionRecSize; - RewindingTSC -= ExpectedTailExit.TSCDelta; - FunctionRecord ExpectedFunctionEntry; - internal_memcpy(&ExpectedFunctionEntry, RewindingRecordPtr, - FunctionRecSize); - DCHECK(ExpectedFunctionEntry.RecordKind == - uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && - "Expected to find function entry when rewinding tail call."); - DCHECK(ExpectedFunctionEntry.FuncId == ExpectedTailExit.FuncId && - "Expected funcids to match when rewinding tail call."); - - // This tail call exceeded the threshold duration. It will not be erased. - if ((TSC - RewindingTSC) >= atomic_load_relaxed(&ThresholdTicks)) { - TLD.NumTailCalls = 0; - return; - } - - // We can erase a tail exit pair that we're exiting through since - // its duration is under threshold. - --TLD.NumTailCalls; - RewindingRecordPtr -= FunctionRecSize; - RewindingTSC -= ExpectedFunctionEntry.TSCDelta; - TLD.RecordPtr -= 2 * FunctionRecSize; - LastTSC = RewindingTSC; - decrementExtents(2 * FunctionRecSize); - } -} - -static bool releaseThreadLocalBuffer(BufferQueue &BQArg) { - auto &TLD = getThreadLocalData(); - auto EC = BQArg.releaseBuffer(TLD.Buffer); - if (EC != BufferQueue::ErrorCode::Ok) { - Report("Failed to release buffer at %p; error=%s\n", TLD.Buffer.Data, - BufferQueue::getErrorString(EC)); - return false; - } - return true; -} - -static bool prepareBuffer(uint64_t TSC, unsigned char CPU, - int (*wall_clock_reader)(clockid_t, - struct timespec *), - size_t MaxSize) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - char *BufferStart = static_cast<char *>(TLD.Buffer.Data); - if ((TLD.RecordPtr + MaxSize) > (BufferStart + TLD.Buffer.Size)) { - if (!releaseThreadLocalBuffer(*TLD.BQ)) - return false; - auto EC = TLD.BQ->getBuffer(TLD.Buffer); - if (EC != BufferQueue::ErrorCode::Ok) { - Report("Failed to prepare a buffer; error = '%s'\n", - BufferQueue::getErrorString(EC)); - return false; - } - setupNewBuffer(wall_clock_reader); - - // Always write the CPU metadata as the first record in the buffer. 
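The rewindRecentCall logic above erases the records of calls that completed in less time than the configured threshold, provided nothing else was logged since the matching entry. A toy sketch of that idea only, using std::vector as a stand-in for the in-buffer records; ToyLog and ToyRecord are invented names, and the real buffer stores fixed-size binary records rather than a container.

#include <cstdint>
#include <vector>

struct ToyRecord {
  int32_t FId;
  uint64_t TSC;
  bool IsEntry;
};

struct ToyLog {
  std::vector<ToyRecord> Records;
  uint64_t ThresholdTicks = 0;

  void onEntry(int32_t FId, uint64_t TSC) {
    Records.push_back({FId, TSC, true});
  }

  // If the call being exited is still the most recent record and ran for less
  // than the threshold, drop its entry instead of logging an exit.
  void onExit(int32_t FId, uint64_t TSC) {
    if (!Records.empty() && Records.back().IsEntry &&
        Records.back().FId == FId &&
        (TSC - Records.back().TSC) < ThresholdTicks) {
      Records.pop_back();
      return;
    }
    Records.push_back({FId, TSC, false});
  }
};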
- writeNewCPUIdMetadata(CPU, TSC); - } - return true; -} - -static bool -isLogInitializedAndReady(BufferQueue *LBQ, uint64_t TSC, unsigned char CPU, - int (*wall_clock_reader)(clockid_t, struct timespec *)) - XRAY_NEVER_INSTRUMENT { - // Bail out right away if logging is not initialized yet. - // We should take the opportunity to release the buffer though. - auto Status = atomic_load(&LoggingStatus, memory_order_acquire); - auto &TLD = getThreadLocalData(); - if (Status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) { - if (TLD.RecordPtr != nullptr && - (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || - Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) { - if (!releaseThreadLocalBuffer(*LBQ)) - return false; - TLD.RecordPtr = nullptr; - return false; - } - return false; - } - - if (atomic_load(&LoggingStatus, memory_order_acquire) != - XRayLogInitStatus::XRAY_LOG_INITIALIZED || - LBQ->finalizing()) { - if (!releaseThreadLocalBuffer(*LBQ)) - return false; - TLD.RecordPtr = nullptr; - } - - if (TLD.Buffer.Data == nullptr) { - auto EC = LBQ->getBuffer(TLD.Buffer); - if (EC != BufferQueue::ErrorCode::Ok) { - auto LS = atomic_load(&LoggingStatus, memory_order_acquire); - if (LS != XRayLogInitStatus::XRAY_LOG_FINALIZING && - LS != XRayLogInitStatus::XRAY_LOG_FINALIZED) - Report("Failed to acquire a buffer; error = '%s'\n", - BufferQueue::getErrorString(EC)); - return false; - } - - setupNewBuffer(wall_clock_reader); - - // Always write the CPU metadata as the first record in the buffer. - writeNewCPUIdMetadata(CPU, TSC); - } - - if (TLD.CurrentCPU == std::numeric_limits<uint16_t>::max()) { - // This means this is the first CPU this thread has ever run on. We set - // the current CPU and record this as the first TSC we've seen. - TLD.CurrentCPU = CPU; - writeNewCPUIdMetadata(CPU, TSC); - } - - return true; -} - -// Compute the TSC difference between the time of measurement and the previous -// event. There are a few interesting situations we need to account for: -// -// - The thread has migrated to a different CPU. If this is the case, then -// we write down the following records: -// -// 1. A 'NewCPUId' Metadata record. -// 2. A FunctionRecord with a 0 for the TSCDelta field. -// -// - The TSC delta is greater than the 32 bits we can store in a -// FunctionRecord. In this case we write down the following records: -// -// 1. A 'TSCWrap' Metadata record. -// 2. A FunctionRecord with a 0 for the TSCDelta field. -// -// - The TSC delta is representable within the 32 bits we can store in a -// FunctionRecord. In this case we write down just a FunctionRecord with -// the correct TSC delta. -static uint32_t writeCurrentCPUTSC(ThreadLocalData &TLD, uint64_t TSC, - uint8_t CPU) { - if (CPU != TLD.CurrentCPU) { - // We've moved to a new CPU. - writeNewCPUIdMetadata(CPU, TSC); - return 0; - } - // If the delta is greater than the range for a uint32_t, then we write out - // the TSC wrap metadata entry with the full TSC, and the TSC for the - // function record be 0. 
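A small sketch of the delta-versus-wrap decision described above, assuming the same policy: a delta that fits in 32 bits goes into the function record, anything larger forces a TSCWrap metadata record carrying the full TSC and a zero delta. EncodedDelta and encodeTSCDelta are illustrative names only.

#include <cstdint>
#include <limits>

struct EncodedDelta {
  bool NeedsTSCWrap; // emit a TSCWrap metadata record carrying the full TSC
  uint32_t Delta;    // value stored in the FunctionRecord (0 when wrapping)
};

inline EncodedDelta encodeTSCDelta(uint64_t LastTSC, uint64_t TSC) {
  uint64_t Delta = TSC - LastTSC;
  if (Delta <= std::numeric_limits<uint32_t>::max())
    return {false, static_cast<uint32_t>(Delta)};
  return {true, 0};
}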
- uint64_t Delta = TSC - TLD.LastTSC; - if (Delta <= std::numeric_limits<uint32_t>::max()) - return Delta; - - writeTSCWrapMetadata(TSC); - return 0; -} - -static void endBufferIfFull() XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - auto BufferStart = static_cast<char *>(TLD.Buffer.Data); - if ((TLD.RecordPtr + MetadataRecSize) - BufferStart <= - ptrdiff_t{MetadataRecSize}) { - if (!releaseThreadLocalBuffer(*TLD.BQ)) - return; - TLD.RecordPtr = nullptr; - } -} - -thread_local atomic_uint8_t Running{0}; - -/// Here's where the meat of the processing happens. The writer captures -/// function entry, exit and tail exit points with a time and will create -/// TSCWrap, NewCPUId and Function records as necessary. The writer might -/// walk backward through its buffer and erase trivial functions to avoid -/// polluting the log and may use the buffer queue to obtain or release a -/// buffer. -static void processFunctionHook(int32_t FuncId, XRayEntryType Entry, - uint64_t TSC, unsigned char CPU, uint64_t Arg1, - int (*wall_clock_reader)(clockid_t, - struct timespec *)) - XRAY_NEVER_INSTRUMENT { - __asm volatile("# LLVM-MCA-BEGIN processFunctionHook"); - // Prevent signal handler recursion, so in case we're already in a log writing - // mode and the signal handler comes in (and is also instrumented) then we - // don't want to be clobbering potentially partial writes already happening in - // the thread. We use a simple thread_local latch to only allow one on-going - // handleArg0 to happen at any given time. - RecursionGuard Guard{Running}; - if (!Guard) { - DCHECK(atomic_load_relaxed(&Running) && "RecursionGuard is buggy!"); - return; - } - - auto &TLD = getThreadLocalData(); - - if (TLD.BQ == nullptr) - TLD.BQ = BQ; - - if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, wall_clock_reader)) - return; - - // Before we go setting up writing new function entries, we need to be really - // careful about the pointer math we're doing. This means we need to ensure - // that the record we are about to write is going to fit into the buffer, - // without overflowing the buffer. - // - // To do this properly, we use the following assumptions: - // - // - The least number of bytes we will ever write is 8 - // (sizeof(FunctionRecord)) only if the delta between the previous entry - // and this entry is within 32 bits. - // - The most number of bytes we will ever write is 8 + 16 + 16 = 40. - // This is computed by: - // - // MaxSize = sizeof(FunctionRecord) + 2 * sizeof(MetadataRecord) - // - // These arise in the following cases: - // - // 1. When the delta between the TSC we get and the previous TSC for the - // same CPU is outside of the uint32_t range, we end up having to - // write a MetadataRecord to indicate a "tsc wrap" before the actual - // FunctionRecord. - // 2. When we learn that we've moved CPUs, we need to write a - // MetadataRecord to indicate a "cpu change", and thus write out the - // current TSC for that CPU before writing out the actual - // FunctionRecord. - // 3. When we learn about a new CPU ID, we need to write down a "new cpu - // id" MetadataRecord before writing out the actual FunctionRecord. - // 4. The second MetadataRecord is the optional function call argument. - // - // So the math we need to do is to determine whether writing 40 bytes past the - // current pointer exceeds the buffer's maximum size. If we don't have enough - // space to write 40 bytes in the buffer, we need get a new Buffer, set it up - // properly before doing any further writing. 
- size_t MaxSize = FunctionRecSize + 2 * MetadataRecSize; - if (!prepareBuffer(TSC, CPU, wall_clock_reader, MaxSize)) { - TLD.BQ = nullptr; - return; - } - - // By this point, we are now ready to write up to 40 bytes (explained above). - DCHECK((TLD.RecordPtr + MaxSize) - static_cast<char *>(TLD.Buffer.Data) >= - static_cast<ptrdiff_t>(MetadataRecSize) && - "Misconfigured BufferQueue provided; Buffer size not large enough."); - - auto RecordTSCDelta = writeCurrentCPUTSC(TLD, TSC, CPU); - TLD.LastTSC = TSC; - TLD.CurrentCPU = CPU; - switch (Entry) { - case XRayEntryType::ENTRY: - case XRayEntryType::LOG_ARGS_ENTRY: - // Update the thread local state for the next invocation. - TLD.LastFunctionEntryTSC = TSC; - break; - case XRayEntryType::TAIL: - case XRayEntryType::EXIT: - // Break out and write the exit record if we can't erase any functions. - if (TLD.NumConsecutiveFnEnters == 0 || - (TSC - TLD.LastFunctionEntryTSC) >= - atomic_load_relaxed(&ThresholdTicks)) - break; - rewindRecentCall(TSC, TLD.LastTSC, TLD.LastFunctionEntryTSC, FuncId); - return; // without writing log. - case XRayEntryType::CUSTOM_EVENT: { - // This is a bug in patching, so we'll report it once and move on. - static atomic_uint8_t ErrorLatch{0}; - if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel)) - Report("Internal error: patched an XRay custom event call as a function; " - "func id = %d\n", - FuncId); - return; - } - case XRayEntryType::TYPED_EVENT: { - static atomic_uint8_t ErrorLatch{0}; - if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel)) - Report("Internal error: patched an XRay typed event call as a function; " - "func id = %d\n", - FuncId); - return; - } - } - - writeFunctionRecord(FuncId, RecordTSCDelta, Entry); - if (Entry == XRayEntryType::LOG_ARGS_ENTRY) - writeCallArgumentMetadata(Arg1); - - // If we've exhausted the buffer by this time, we then release the buffer to - // make sure that other threads may start using this buffer. - endBufferIfFull(); - __asm volatile("# LLVM-MCA-END"); -} - static XRayFileHeader &fdrCommonHeaderInfo() { static std::aligned_storage<sizeof(XRayFileHeader)>::type HStorage; static pthread_once_t OnceInit = PTHREAD_ONCE_INIT; static bool TSCSupported = true; static uint64_t CycleFrequency = NanosecondsPerSecond; - pthread_once(&OnceInit, +[] { - XRayFileHeader &H = reinterpret_cast<XRayFileHeader &>(HStorage); - // Version 2 of the log writes the extents of the buffer, instead of - // relying on an end-of-buffer record. - // Version 3 includes PID metadata record - H.Version = 3; - H.Type = FileTypes::FDR_LOG; - - // Test for required CPU features and cache the cycle frequency - TSCSupported = probeRequiredCPUFeatures(); - if (TSCSupported) - CycleFrequency = getTSCFrequency(); - H.CycleFrequency = CycleFrequency; - - // FIXME: Actually check whether we have 'constant_tsc' and - // 'nonstop_tsc' before setting the values in the header. - H.ConstantTSC = 1; - H.NonstopTSC = 1; - }); + pthread_once( + &OnceInit, +[] { + XRayFileHeader &H = reinterpret_cast<XRayFileHeader &>(HStorage); + // Version 2 of the log writes the extents of the buffer, instead of + // relying on an end-of-buffer record. + // Version 3 includes PID metadata record. + // Version 4 includes CPU data in the custom event records. 
+ // Version 5 uses relative deltas for custom and typed event records, + // and removes the CPU data in custom event records (similar to how + // function records use deltas instead of full TSCs and rely on other + // metadata records for TSC wraparound and CPU migration). + H.Version = 5; + H.Type = FileTypes::FDR_LOG; + + // Test for required CPU features and cache the cycle frequency + TSCSupported = probeRequiredCPUFeatures(); + if (TSCSupported) + CycleFrequency = getTSCFrequency(); + H.CycleFrequency = CycleFrequency; + + // FIXME: Actually check whether we have 'constant_tsc' and + // 'nonstop_tsc' before setting the values in the header. + H.ConstantTSC = 1; + H.NonstopTSC = 1; + }); return reinterpret_cast<XRayFileHeader &>(HStorage); } @@ -728,9 +207,11 @@ XRayBuffer fdrIterator(const XRayBuffer B) { // buffers to expect). static std::aligned_storage<sizeof(XRayFileHeader)>::type HeaderStorage; static pthread_once_t HeaderOnce = PTHREAD_ONCE_INIT; - pthread_once(&HeaderOnce, +[] { - reinterpret_cast<XRayFileHeader &>(HeaderStorage) = fdrCommonHeaderInfo(); - }); + pthread_once( + &HeaderOnce, +[] { + reinterpret_cast<XRayFileHeader &>(HeaderStorage) = + fdrCommonHeaderInfo(); + }); // We use a convenience alias for code referring to Header from here on out. auto &Header = reinterpret_cast<XRayFileHeader &>(HeaderStorage); @@ -741,7 +222,8 @@ XRayBuffer fdrIterator(const XRayBuffer B) { static BufferQueue::const_iterator It{}; static BufferQueue::const_iterator End{}; - static void *CurrentBuffer{nullptr}; + static uint8_t *CurrentBuffer{nullptr}; + static size_t SerializedBufferSize = 0; if (B.Data == static_cast<void *>(&Header) && B.Size == sizeof(Header)) { // From this point on, we provide raw access to the raw buffer we're getting // from the BufferQueue. We're relying on the iterators from the current @@ -751,7 +233,7 @@ XRayBuffer fdrIterator(const XRayBuffer B) { } if (CurrentBuffer != nullptr) { - InternalFree(CurrentBuffer); + deallocateBuffer(CurrentBuffer, SerializedBufferSize); CurrentBuffer = nullptr; } @@ -762,9 +244,16 @@ XRayBuffer fdrIterator(const XRayBuffer B) { // out to disk. The difference here would be that we still write "empty" // buffers, or at least go through the iterators faithfully to let the // handlers see the empty buffers in the queue. - auto BufferSize = atomic_load(&It->Extents->Size, memory_order_acquire); - auto SerializedBufferSize = BufferSize + sizeof(MetadataRecord); - CurrentBuffer = InternalAlloc(SerializedBufferSize); + // + // We need this atomic fence here to ensure that writes happening to the + // buffer have been committed before we load the extents atomically. Because + // the buffer is not explicitly synchronised across threads, we rely on the + // fence ordering to ensure that writes we expect to have been completed + // before the fence are fully committed before we read the extents. 
+ atomic_thread_fence(memory_order_acquire); + auto BufferSize = atomic_load(It->Extents, memory_order_acquire); + SerializedBufferSize = BufferSize + sizeof(MetadataRecord); + CurrentBuffer = allocateBuffer(SerializedBufferSize); if (CurrentBuffer == nullptr) return {nullptr, 0}; @@ -827,14 +316,9 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { }); auto CleanupBuffers = at_scope_exit([] { - if (BQ != nullptr) { - auto &TLD = getThreadLocalData(); - if (TLD.RecordPtr != nullptr && TLD.BQ != nullptr) - releaseThreadLocalBuffer(*TLD.BQ); - BQ->~BufferQueue(); - InternalFree(BQ); - BQ = nullptr; - } + auto &TLD = getThreadLocalData(); + if (TLD.Controller != nullptr) + TLD.Controller->flush(); }); if (fdrFlags()->no_file_flush) { @@ -855,16 +339,8 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { // (fixed-sized) and let the tools reading the buffers deal with the data // afterwards. // - int Fd = -1; - { - // FIXME: Remove this section of the code, when we remove the struct-based - // configuration API. - SpinMutexLock Guard(&FDROptionsMutex); - Fd = FDROptions.Fd; - } - if (Fd == -1) - Fd = getLogFD(); - if (Fd == -1) { + LogWriter *LW = LogWriter::Open(); + if (LW == nullptr) { auto Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; atomic_store(&LogFlushStatus, Result, memory_order_release); return Result; @@ -872,8 +348,15 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { XRayFileHeader Header = fdrCommonHeaderInfo(); Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()}; - retryingWriteAll(Fd, reinterpret_cast<char *>(&Header), - reinterpret_cast<char *>(&Header) + sizeof(Header)); + LW->WriteAll(reinterpret_cast<char *>(&Header), + reinterpret_cast<char *>(&Header) + sizeof(Header)); + + // Release the current thread's buffer before we attempt to write out all the + // buffers. This ensures that in case we had only a single thread going, that + // we are able to capture the data nonetheless. + auto &TLD = getThreadLocalData(); + if (TLD.Controller != nullptr) + TLD.Controller->flush(); BQ->apply([&](const BufferQueue::Buffer &B) { // Starting at version 2 of the FDR logging implementation, we only write @@ -882,18 +365,18 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { // still use a Metadata record, but fill in the extents instead for the // data. MetadataRecord ExtentsRecord; - auto BufferExtents = atomic_load(&B.Extents->Size, memory_order_acquire); + auto BufferExtents = atomic_load(B.Extents, memory_order_acquire); DCHECK(BufferExtents <= B.Size); ExtentsRecord.Type = uint8_t(RecordType::Metadata); ExtentsRecord.RecordKind = uint8_t(MetadataRecord::RecordKinds::BufferExtents); internal_memcpy(ExtentsRecord.Data, &BufferExtents, sizeof(BufferExtents)); if (BufferExtents > 0) { - retryingWriteAll(Fd, reinterpret_cast<char *>(&ExtentsRecord), - reinterpret_cast<char *>(&ExtentsRecord) + - sizeof(MetadataRecord)); - retryingWriteAll(Fd, reinterpret_cast<char *>(B.Data), - reinterpret_cast<char *>(B.Data) + BufferExtents); + LW->WriteAll(reinterpret_cast<char *>(&ExtentsRecord), + reinterpret_cast<char *>(&ExtentsRecord) + + sizeof(MetadataRecord)); + LW->WriteAll(reinterpret_cast<char *>(B.Data), + reinterpret_cast<char *>(B.Data) + BufferExtents); } }); @@ -914,7 +397,12 @@ XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT { // Do special things to make the log finalize itself, and not allow any more // operations to be performed until re-initialized. 
- BQ->finalize(); + if (BQ == nullptr) { + if (Verbosity()) + Report("Attempting to finalize an uninitialized global buffer!\n"); + } else { + BQ->finalize(); + } atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED, memory_order_release); @@ -935,7 +423,8 @@ static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT { // Test once for required CPU features static pthread_once_t OnceProbe = PTHREAD_ONCE_INIT; static bool TSCSupported = true; - pthread_once(&OnceProbe, +[] { TSCSupported = probeRequiredCPUFeatures(); }); + pthread_once( + &OnceProbe, +[] { TSCSupported = probeRequiredCPUFeatures(); }); if (TSCSupported) { Result.TSC = __xray::readTSC(Result.CPU); @@ -953,16 +442,115 @@ static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT { return Result; } +thread_local atomic_uint8_t Running{0}; + +static bool setupTLD(ThreadLocalData &TLD) XRAY_NEVER_INSTRUMENT { + // Check if we're finalizing, before proceeding. + { + auto Status = atomic_load(&LoggingStatus, memory_order_acquire); + if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || + Status == XRayLogInitStatus::XRAY_LOG_FINALIZED) { + if (TLD.Controller != nullptr) { + TLD.Controller->flush(); + TLD.Controller = nullptr; + } + return false; + } + } + + if (UNLIKELY(TLD.Controller == nullptr)) { + // Set up the TLD buffer queue. + if (UNLIKELY(BQ == nullptr)) + return false; + TLD.BQ = BQ; + + // Check that we have a valid buffer. + if (TLD.Buffer.Generation != BQ->generation() && + TLD.BQ->releaseBuffer(TLD.Buffer) != BufferQueue::ErrorCode::Ok) + return false; + + // Set up a buffer, before setting up the log writer. Bail out on failure. + if (TLD.BQ->getBuffer(TLD.Buffer) != BufferQueue::ErrorCode::Ok) + return false; + + // Set up the Log Writer for this thread. + if (UNLIKELY(TLD.Writer == nullptr)) { + auto *LWStorage = reinterpret_cast<FDRLogWriter *>(&TLD.LWStorage); + new (LWStorage) FDRLogWriter(TLD.Buffer); + TLD.Writer = LWStorage; + } else { + TLD.Writer->resetRecord(); + } + + auto *CStorage = reinterpret_cast<FDRController<> *>(&TLD.CStorage); + new (CStorage) + FDRController<>(TLD.BQ, TLD.Buffer, *TLD.Writer, clock_gettime, + atomic_load_relaxed(&ThresholdTicks)); + TLD.Controller = CStorage; + } + + DCHECK_NE(TLD.Controller, nullptr); + return true; +} + void fdrLoggingHandleArg0(int32_t FuncId, XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { auto TC = getTimestamp(); - processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, 0, clock_gettime); + auto &TSC = TC.TSC; + auto &CPU = TC.CPU; + RecursionGuard Guard{Running}; + if (!Guard) + return; + + auto &TLD = getThreadLocalData(); + if (!setupTLD(TLD)) + return; + + switch (Entry) { + case XRayEntryType::ENTRY: + case XRayEntryType::LOG_ARGS_ENTRY: + TLD.Controller->functionEnter(FuncId, TSC, CPU); + return; + case XRayEntryType::EXIT: + TLD.Controller->functionExit(FuncId, TSC, CPU); + return; + case XRayEntryType::TAIL: + TLD.Controller->functionTailExit(FuncId, TSC, CPU); + return; + case XRayEntryType::CUSTOM_EVENT: + case XRayEntryType::TYPED_EVENT: + break; + } } void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType Entry, uint64_t Arg) XRAY_NEVER_INSTRUMENT { auto TC = getTimestamp(); - processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, Arg, clock_gettime); + auto &TSC = TC.TSC; + auto &CPU = TC.CPU; + RecursionGuard Guard{Running}; + if (!Guard) + return; + + auto &TLD = getThreadLocalData(); + if (!setupTLD(TLD)) + return; + + switch (Entry) { + case XRayEntryType::ENTRY: + case XRayEntryType::LOG_ARGS_ENTRY: + TLD.Controller->functionEnterArg(FuncId, 
TSC, CPU, Arg); + return; + case XRayEntryType::EXIT: + TLD.Controller->functionExit(FuncId, TSC, CPU); + return; + case XRayEntryType::TAIL: + TLD.Controller->functionTailExit(FuncId, TSC, CPU); + return; + case XRayEntryType::CUSTOM_EVENT: + case XRayEntryType::TYPED_EVENT: + break; + } } void fdrLoggingHandleCustomEvent(void *Event, @@ -973,40 +561,25 @@ void fdrLoggingHandleCustomEvent(void *Event, RecursionGuard Guard{Running}; if (!Guard) return; - if (EventSize > std::numeric_limits<int32_t>::max()) { + + // Complain when we ever get at least one custom event that's larger than what + // we can possibly support. + if (EventSize > + static_cast<std::size_t>(std::numeric_limits<int32_t>::max())) { static pthread_once_t Once = PTHREAD_ONCE_INIT; - pthread_once(&Once, +[] { Report("Event size too large.\n"); }); + pthread_once( + &Once, +[] { + Report("Custom event size too large; truncating to %d.\n", + std::numeric_limits<int32_t>::max()); + }); } - int32_t ReducedEventSize = static_cast<int32_t>(EventSize); - auto &TLD = getThreadLocalData(); - if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, clock_gettime)) - return; - // Here we need to prepare the log to handle: - // - The metadata record we're going to write. (16 bytes) - // - The additional data we're going to write. Currently, that's the size - // of the event we're going to dump into the log as free-form bytes. - if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) { - TLD.BQ = nullptr; + auto &TLD = getThreadLocalData(); + if (!setupTLD(TLD)) return; - } - // Write the custom event metadata record, which consists of the following - // information: - // - 8 bytes (64-bits) for the full TSC when the event started. - // - 4 bytes (32-bits) for the length of the data. - MetadataRecord CustomEvent; - CustomEvent.Type = uint8_t(RecordType::Metadata); - CustomEvent.RecordKind = - uint8_t(MetadataRecord::RecordKinds::CustomEventMarker); - constexpr auto TSCSize = sizeof(TC.TSC); - internal_memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t)); - internal_memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize); - internal_memcpy(TLD.RecordPtr, &CustomEvent, sizeof(CustomEvent)); - TLD.RecordPtr += sizeof(CustomEvent); - internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize); - incrementExtents(MetadataRecSize + EventSize); - endBufferIfFull(); + int32_t ReducedEventSize = static_cast<int32_t>(EventSize); + TLD.Controller->customEvent(TSC, CPU, Event, ReducedEventSize); } void fdrLoggingHandleTypedEvent( @@ -1018,50 +591,28 @@ void fdrLoggingHandleTypedEvent( RecursionGuard Guard{Running}; if (!Guard) return; - if (EventSize > std::numeric_limits<int32_t>::max()) { + + // Complain when we ever get at least one typed event that's larger than what + // we can possibly support. + if (EventSize > + static_cast<std::size_t>(std::numeric_limits<int32_t>::max())) { static pthread_once_t Once = PTHREAD_ONCE_INIT; - pthread_once(&Once, +[] { Report("Event size too large.\n"); }); + pthread_once( + &Once, +[] { + Report("Typed event size too large; truncating to %d.\n", + std::numeric_limits<int32_t>::max()); + }); } - int32_t ReducedEventSize = static_cast<int32_t>(EventSize); + auto &TLD = getThreadLocalData(); - if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, clock_gettime)) + if (!setupTLD(TLD)) return; - // Here we need to prepare the log to handle: - // - The metadata record we're going to write. (16 bytes) - // - The additional data we're going to write. 
Currently, that's the size - // of the event we're going to dump into the log as free-form bytes. - if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) { - TLD.BQ = nullptr; - return; - } - // Write the custom event metadata record, which consists of the following - // information: - // - 8 bytes (64-bits) for the full TSC when the event started. - // - 4 bytes (32-bits) for the length of the data. - // - 2 bytes (16-bits) for the event type. 3 bytes remain since one of the - // bytes has the record type (Metadata Record) and kind (TypedEvent). - // We'll log the error if the event type is greater than 2 bytes. - // Event types are generated sequentially, so 2^16 is enough. - MetadataRecord TypedEvent; - TypedEvent.Type = uint8_t(RecordType::Metadata); - TypedEvent.RecordKind = - uint8_t(MetadataRecord::RecordKinds::TypedEventMarker); - constexpr auto TSCSize = sizeof(TC.TSC); - internal_memcpy(&TypedEvent.Data, &ReducedEventSize, sizeof(int32_t)); - internal_memcpy(&TypedEvent.Data[sizeof(int32_t)], &TSC, TSCSize); - internal_memcpy(&TypedEvent.Data[sizeof(int32_t) + TSCSize], &EventType, - sizeof(EventType)); - internal_memcpy(TLD.RecordPtr, &TypedEvent, sizeof(TypedEvent)); - - TLD.RecordPtr += sizeof(TypedEvent); - internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize); - incrementExtents(MetadataRecSize + EventSize); - endBufferIfFull(); + int32_t ReducedEventSize = static_cast<int32_t>(EventSize); + TLD.Controller->typedEvent(TSC, CPU, EventType, Event, ReducedEventSize); } -XRayLogInitStatus fdrLoggingInit(size_t BufferSize, size_t BufferMax, - void *Options, +XRayLogInitStatus fdrLoggingInit(size_t, size_t, void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT { if (Options == nullptr) return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; @@ -1075,107 +626,81 @@ XRayLogInitStatus fdrLoggingInit(size_t BufferSize, size_t BufferMax, return static_cast<XRayLogInitStatus>(CurrentStatus); } - // Because of __xray_log_init_mode(...) which guarantees that this will be - // called with BufferSize == 0 and BufferMax == 0 we parse the configuration - // provided in the Options pointer as a string instead. - if (BufferSize == 0 && BufferMax == 0) { - if (Verbosity()) - Report("Initializing FDR mode with options: %s\n", - static_cast<const char *>(Options)); - - // TODO: Factor out the flags specific to the FDR mode implementation. For - // now, use the global/single definition of the flags, since the FDR mode - // flags are already defined there. - FlagParser FDRParser; - FDRFlags FDRFlags; - registerXRayFDRFlags(&FDRParser, &FDRFlags); - FDRFlags.setDefaults(); - - // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided - // options until we migrate everyone to use the XRAY_FDR_OPTIONS - // compiler-provided options. - FDRParser.ParseString(useCompilerDefinedFlags()); - FDRParser.ParseString(useCompilerDefinedFDRFlags()); - auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS"); - if (EnvOpts == nullptr) - EnvOpts = ""; - FDRParser.ParseString(EnvOpts); - - // FIXME: Remove this when we fully remove the deprecated flags. - if (internal_strlen(EnvOpts) == 0) { - FDRFlags.func_duration_threshold_us = - flags()->xray_fdr_log_func_duration_threshold_us; - FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms; - } - - // The provided options should always override the compiler-provided and - // environment-variable defined options. 
- FDRParser.ParseString(static_cast<const char *>(Options)); - *fdrFlags() = FDRFlags; - BufferSize = FDRFlags.buffer_size; - BufferMax = FDRFlags.buffer_max; - SpinMutexLock Guard(&FDROptionsMutex); - FDROptions.Fd = -1; - FDROptions.ReportErrors = true; - } else if (OptionsSize != sizeof(FDRLoggingOptions)) { - // FIXME: This is deprecated, and should really be removed. - // At this point we use the flag parser specific to the FDR mode - // implementation. - if (Verbosity()) - Report("Cannot initialize FDR logging; wrong size for options: %d\n", - OptionsSize); - return static_cast<XRayLogInitStatus>( - atomic_load(&LoggingStatus, memory_order_acquire)); - } else { - if (Verbosity()) - Report("XRay FDR: struct-based init is deprecated, please use " - "string-based configuration instead.\n"); - SpinMutexLock Guard(&FDROptionsMutex); - internal_memcpy(&FDROptions, Options, OptionsSize); - } - - bool Success = false; - - if (BQ != nullptr) { - BQ->~BufferQueue(); - InternalFree(BQ); - BQ = nullptr; - } + if (Verbosity()) + Report("Initializing FDR mode with options: %s\n", + static_cast<const char *>(Options)); + + // TODO: Factor out the flags specific to the FDR mode implementation. For + // now, use the global/single definition of the flags, since the FDR mode + // flags are already defined there. + FlagParser FDRParser; + FDRFlags FDRFlags; + registerXRayFDRFlags(&FDRParser, &FDRFlags); + FDRFlags.setDefaults(); + + // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided + // options until we migrate everyone to use the XRAY_FDR_OPTIONS + // compiler-provided options. + FDRParser.ParseString(useCompilerDefinedFlags()); + FDRParser.ParseString(useCompilerDefinedFDRFlags()); + auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS"); + if (EnvOpts == nullptr) + EnvOpts = ""; + FDRParser.ParseString(EnvOpts); + + // FIXME: Remove this when we fully remove the deprecated flags. + if (internal_strlen(EnvOpts) == 0) { + FDRFlags.func_duration_threshold_us = + flags()->xray_fdr_log_func_duration_threshold_us; + FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms; + } + + // The provided options should always override the compiler-provided and + // environment-variable defined options. + FDRParser.ParseString(static_cast<const char *>(Options)); + *fdrFlags() = FDRFlags; + auto BufferSize = FDRFlags.buffer_size; + auto BufferMax = FDRFlags.buffer_max; if (BQ == nullptr) { - BQ = reinterpret_cast<BufferQueue *>( - InternalAlloc(sizeof(BufferQueue), nullptr, 64)); + bool Success = false; + BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage); new (BQ) BufferQueue(BufferSize, BufferMax, Success); - } - - if (!Success) { - Report("BufferQueue init failed.\n"); - if (BQ != nullptr) { - BQ->~BufferQueue(); - InternalFree(BQ); - BQ = nullptr; + if (!Success) { + Report("BufferQueue init failed.\n"); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + } + } else { + if (BQ->init(BufferSize, BufferMax) != BufferQueue::ErrorCode::Ok) { + if (Verbosity()) + Report("Failed to re-initialize global buffer queue. Init failed.\n"); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; } - return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; } static pthread_once_t OnceInit = PTHREAD_ONCE_INIT; - pthread_once(&OnceInit, +[] { - atomic_store(&TicksPerSec, - probeRequiredCPUFeatures() ? 
getTSCFrequency() - : __xray::NanosecondsPerSecond, - memory_order_release); - pthread_key_create(&Key, +[](void *TLDPtr) { - if (TLDPtr == nullptr) - return; - auto &TLD = *reinterpret_cast<ThreadLocalData *>(TLDPtr); - if (TLD.BQ == nullptr) - return; - auto EC = TLD.BQ->releaseBuffer(TLD.Buffer); - if (EC != BufferQueue::ErrorCode::Ok) - Report("At thread exit, failed to release buffer at %p; error=%s\n", - TLD.Buffer.Data, BufferQueue::getErrorString(EC)); - }); - }); + pthread_once( + &OnceInit, +[] { + atomic_store(&TicksPerSec, + probeRequiredCPUFeatures() ? getTSCFrequency() + : __xray::NanosecondsPerSecond, + memory_order_release); + pthread_key_create( + &Key, +[](void *TLDPtr) { + if (TLDPtr == nullptr) + return; + auto &TLD = *reinterpret_cast<ThreadLocalData *>(TLDPtr); + if (TLD.BQ == nullptr) + return; + if (TLD.Buffer.Data == nullptr) + return; + auto EC = TLD.BQ->releaseBuffer(TLD.Buffer); + if (EC != BufferQueue::ErrorCode::Ok) + Report("At thread exit, failed to release buffer at %p; " + "error=%s\n", + TLD.Buffer.Data, BufferQueue::getErrorString(EC)); + }); + }); atomic_store(&ThresholdTicks, atomic_load_relaxed(&TicksPerSec) * @@ -1209,11 +734,22 @@ bool fdrLogDynamicInitializer() XRAY_NEVER_INSTRUMENT { }; auto RegistrationResult = __xray_log_register_mode("xray-fdr", Impl); if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && - Verbosity()) + Verbosity()) { Report("Cannot register XRay FDR mode to 'xray-fdr'; error = %d\n", RegistrationResult); - if (flags()->xray_fdr_log || !internal_strcmp(flags()->xray_mode, "xray-fdr")) - __xray_set_log_impl(Impl); + return false; + } + + if (flags()->xray_fdr_log || + !internal_strcmp(flags()->xray_mode, "xray-fdr")) { + auto SelectResult = __xray_log_select_mode("xray-fdr"); + if (SelectResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && + Verbosity()) { + Report("Cannot select XRay FDR mode as 'xray-fdr'; error = %d\n", + SelectResult); + return false; + } + } return true; } diff --git a/contrib/compiler-rt/lib/xray/xray_function_call_trie.h b/contrib/compiler-rt/lib/xray/xray_function_call_trie.h index 2acf14aa5625..d01ad20e3d71 100644 --- a/contrib/compiler-rt/lib/xray/xray_function_call_trie.h +++ b/contrib/compiler-rt/lib/xray/xray_function_call_trie.h @@ -15,9 +15,11 @@ #ifndef XRAY_FUNCTION_CALL_TRIE_H #define XRAY_FUNCTION_CALL_TRIE_H -#include "sanitizer_common/sanitizer_allocator_internal.h" +#include "xray_buffer_queue.h" +#include "xray_defs.h" #include "xray_profiling_flags.h" #include "xray_segmented_array.h" +#include <limits> #include <memory> // For placement new. #include <utility> @@ -97,9 +99,6 @@ public: struct NodeIdPair { Node *NodePtr; int32_t FId; - - // Constructor for inplace-construction. - NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {} }; using NodeIdPairArray = Array<NodeIdPair>; @@ -113,17 +112,10 @@ public: struct Node { Node *Parent; NodeIdPairArray Callees; - int64_t CallCount; - int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time. + uint64_t CallCount; + uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time. int32_t FId; - // We add a constructor here to allow us to inplace-construct through - // Array<...>'s AppendEmplace. - Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT, - int32_t F) - : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT), - FId(F) {} - // TODO: Include the compact histogram. 
}; @@ -131,10 +123,7 @@ private: struct ShadowStackEntry { uint64_t EntryTSC; Node *NodePtr; - - // We add a constructor here to allow us to inplace-construct through - // Array<...>'s AppendEmplace. - ShadowStackEntry(uint64_t T, Node *N) : EntryTSC{T}, NodePtr{N} {} + uint16_t EntryCPU; }; using NodeArray = Array<Node>; @@ -149,103 +138,184 @@ public: using RootAllocatorType = RootArray::AllocatorType; using ShadowStackAllocatorType = ShadowStackArray::AllocatorType; + // Use hosted aligned storage members to allow for trivial move and init. + // This also allows us to sidestep the potential-failing allocation issue. + typename std::aligned_storage<sizeof(NodeAllocatorType), + alignof(NodeAllocatorType)>::type + NodeAllocatorStorage; + typename std::aligned_storage<sizeof(RootAllocatorType), + alignof(RootAllocatorType)>::type + RootAllocatorStorage; + typename std::aligned_storage<sizeof(ShadowStackAllocatorType), + alignof(ShadowStackAllocatorType)>::type + ShadowStackAllocatorStorage; + typename std::aligned_storage<sizeof(NodeIdPairAllocatorType), + alignof(NodeIdPairAllocatorType)>::type + NodeIdPairAllocatorStorage; + NodeAllocatorType *NodeAllocator = nullptr; RootAllocatorType *RootAllocator = nullptr; ShadowStackAllocatorType *ShadowStackAllocator = nullptr; NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr; - Allocators() {} + Allocators() = default; Allocators(const Allocators &) = delete; Allocators &operator=(const Allocators &) = delete; - Allocators(Allocators &&O) - : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator), - ShadowStackAllocator(O.ShadowStackAllocator), - NodeIdPairAllocator(O.NodeIdPairAllocator) { + struct Buffers { + BufferQueue::Buffer NodeBuffer; + BufferQueue::Buffer RootsBuffer; + BufferQueue::Buffer ShadowStackBuffer; + BufferQueue::Buffer NodeIdPairBuffer; + }; + + explicit Allocators(Buffers &B) XRAY_NEVER_INSTRUMENT { + new (&NodeAllocatorStorage) + NodeAllocatorType(B.NodeBuffer.Data, B.NodeBuffer.Size); + NodeAllocator = + reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + + new (&RootAllocatorStorage) + RootAllocatorType(B.RootsBuffer.Data, B.RootsBuffer.Size); + RootAllocator = + reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + + new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType( + B.ShadowStackBuffer.Data, B.ShadowStackBuffer.Size); + ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( + &ShadowStackAllocatorStorage); + + new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType( + B.NodeIdPairBuffer.Data, B.NodeIdPairBuffer.Size); + NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( + &NodeIdPairAllocatorStorage); + } + + explicit Allocators(uptr Max) XRAY_NEVER_INSTRUMENT { + new (&NodeAllocatorStorage) NodeAllocatorType(Max); + NodeAllocator = + reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + + new (&RootAllocatorStorage) RootAllocatorType(Max); + RootAllocator = + reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + + new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(Max); + ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( + &ShadowStackAllocatorStorage); + + new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(Max); + NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( + &NodeIdPairAllocatorStorage); + } + + Allocators(Allocators &&O) XRAY_NEVER_INSTRUMENT { + // Here we rely on the safety of memcpy'ing contents of the storage + // members, and then pointing the source 
pointers to nullptr. + internal_memcpy(&NodeAllocatorStorage, &O.NodeAllocatorStorage, + sizeof(NodeAllocatorType)); + internal_memcpy(&RootAllocatorStorage, &O.RootAllocatorStorage, + sizeof(RootAllocatorType)); + internal_memcpy(&ShadowStackAllocatorStorage, + &O.ShadowStackAllocatorStorage, + sizeof(ShadowStackAllocatorType)); + internal_memcpy(&NodeIdPairAllocatorStorage, + &O.NodeIdPairAllocatorStorage, + sizeof(NodeIdPairAllocatorType)); + + NodeAllocator = + reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + RootAllocator = + reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( + &ShadowStackAllocatorStorage); + NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( + &NodeIdPairAllocatorStorage); + O.NodeAllocator = nullptr; O.RootAllocator = nullptr; O.ShadowStackAllocator = nullptr; O.NodeIdPairAllocator = nullptr; } - Allocators &operator=(Allocators &&O) { - { - auto Tmp = O.NodeAllocator; - O.NodeAllocator = this->NodeAllocator; - this->NodeAllocator = Tmp; - } - { - auto Tmp = O.RootAllocator; - O.RootAllocator = this->RootAllocator; - this->RootAllocator = Tmp; - } - { - auto Tmp = O.ShadowStackAllocator; - O.ShadowStackAllocator = this->ShadowStackAllocator; - this->ShadowStackAllocator = Tmp; - } - { - auto Tmp = O.NodeIdPairAllocator; - O.NodeIdPairAllocator = this->NodeIdPairAllocator; - this->NodeIdPairAllocator = Tmp; - } - return *this; - } - - ~Allocators() { - // Note that we cannot use delete on these pointers, as they need to be - // returned to the sanitizer_common library's internal memory tracking - // system. - if (NodeAllocator != nullptr) { + Allocators &operator=(Allocators &&O) XRAY_NEVER_INSTRUMENT { + // When moving into an existing instance, we ensure that we clean up the + // current allocators. 
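The Allocators move operations above rely on placement-constructing the live allocator into member aligned storage, tearing down whatever was held before and nulling out the source. A reduced sketch of that pattern under the assumption of a single hosted object; Widget and Holder are hypothetical stand-ins.

#include <new>
#include <type_traits>
#include <utility>

struct Widget {
  Widget() = default;
  Widget(Widget &&) = default;
};

class Holder {
  typename std::aligned_storage<sizeof(Widget), alignof(Widget)>::type Storage;
  Widget *Ptr = nullptr;

public:
  Holder() = default;
  Holder(const Holder &) = delete;
  Holder &operator=(const Holder &) = delete;

  Holder &operator=(Holder &&O) {
    if (this == &O)
      return *this;
    if (Ptr != nullptr) // destroy whatever we currently host
      Ptr->~Widget();
    if (O.Ptr != nullptr) {
      Ptr = new (&Storage) Widget(std::move(*O.Ptr)); // adopt the other's object
      O.Ptr = nullptr;
    } else {
      Ptr = nullptr;
    }
    return *this;
  }

  ~Holder() {
    if (Ptr != nullptr)
      Ptr->~Widget();
  }
};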
+ if (NodeAllocator) NodeAllocator->~NodeAllocatorType(); - InternalFree(NodeAllocator); + if (O.NodeAllocator) { + new (&NodeAllocatorStorage) + NodeAllocatorType(std::move(*O.NodeAllocator)); + NodeAllocator = + reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + O.NodeAllocator = nullptr; + } else { NodeAllocator = nullptr; } - if (RootAllocator != nullptr) { + + if (RootAllocator) RootAllocator->~RootAllocatorType(); - InternalFree(RootAllocator); + if (O.RootAllocator) { + new (&RootAllocatorStorage) + RootAllocatorType(std::move(*O.RootAllocator)); + RootAllocator = + reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + O.RootAllocator = nullptr; + } else { RootAllocator = nullptr; } - if (ShadowStackAllocator != nullptr) { + + if (ShadowStackAllocator) ShadowStackAllocator->~ShadowStackAllocatorType(); - InternalFree(ShadowStackAllocator); + if (O.ShadowStackAllocator) { + new (&ShadowStackAllocatorStorage) + ShadowStackAllocatorType(std::move(*O.ShadowStackAllocator)); + ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( + &ShadowStackAllocatorStorage); + O.ShadowStackAllocator = nullptr; + } else { ShadowStackAllocator = nullptr; } - if (NodeIdPairAllocator != nullptr) { + + if (NodeIdPairAllocator) NodeIdPairAllocator->~NodeIdPairAllocatorType(); - InternalFree(NodeIdPairAllocator); + if (O.NodeIdPairAllocator) { + new (&NodeIdPairAllocatorStorage) + NodeIdPairAllocatorType(std::move(*O.NodeIdPairAllocator)); + NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( + &NodeIdPairAllocatorStorage); + O.NodeIdPairAllocator = nullptr; + } else { NodeIdPairAllocator = nullptr; } + + return *this; + } + + ~Allocators() XRAY_NEVER_INSTRUMENT { + if (NodeAllocator != nullptr) + NodeAllocator->~NodeAllocatorType(); + if (RootAllocator != nullptr) + RootAllocator->~RootAllocatorType(); + if (ShadowStackAllocator != nullptr) + ShadowStackAllocator->~ShadowStackAllocatorType(); + if (NodeIdPairAllocator != nullptr) + NodeIdPairAllocator->~NodeIdPairAllocatorType(); } }; - // TODO: Support configuration of options through the arguments. 
- static Allocators InitAllocators() { + static Allocators InitAllocators() XRAY_NEVER_INSTRUMENT { return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max); } - static Allocators InitAllocatorsCustom(uptr Max) { - Allocators A; - auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>( - InternalAlloc(sizeof(Allocators::NodeAllocatorType))); - new (NodeAllocator) Allocators::NodeAllocatorType(Max); - A.NodeAllocator = NodeAllocator; - - auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>( - InternalAlloc(sizeof(Allocators::RootAllocatorType))); - new (RootAllocator) Allocators::RootAllocatorType(Max); - A.RootAllocator = RootAllocator; - - auto ShadowStackAllocator = - reinterpret_cast<Allocators::ShadowStackAllocatorType *>( - InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType))); - new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max); - A.ShadowStackAllocator = ShadowStackAllocator; - - auto NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( - InternalAlloc(sizeof(NodeIdPairAllocatorType))); - new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max); - A.NodeIdPairAllocator = NodeIdPairAllocator; + static Allocators InitAllocatorsCustom(uptr Max) XRAY_NEVER_INSTRUMENT { + Allocators A(Max); + return A; + } + + static Allocators + InitAllocatorsFromBuffers(Allocators::Buffers &Bufs) XRAY_NEVER_INSTRUMENT { + Allocators A(Bufs); return A; } @@ -253,65 +323,135 @@ private: NodeArray Nodes; RootArray Roots; ShadowStackArray ShadowStack; - NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr; + NodeIdPairAllocatorType *NodeIdPairAllocator; + uint32_t OverflowedFunctions; public: - explicit FunctionCallTrie(const Allocators &A) - : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator), + explicit FunctionCallTrie(const Allocators &A) XRAY_NEVER_INSTRUMENT + : Nodes(*A.NodeAllocator), + Roots(*A.RootAllocator), ShadowStack(*A.ShadowStackAllocator), - NodeIdPairAllocator(A.NodeIdPairAllocator) {} + NodeIdPairAllocator(A.NodeIdPairAllocator), + OverflowedFunctions(0) {} + + FunctionCallTrie() = delete; + FunctionCallTrie(const FunctionCallTrie &) = delete; + FunctionCallTrie &operator=(const FunctionCallTrie &) = delete; + + FunctionCallTrie(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT + : Nodes(std::move(O.Nodes)), + Roots(std::move(O.Roots)), + ShadowStack(std::move(O.ShadowStack)), + NodeIdPairAllocator(O.NodeIdPairAllocator), + OverflowedFunctions(O.OverflowedFunctions) {} + + FunctionCallTrie &operator=(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT { + Nodes = std::move(O.Nodes); + Roots = std::move(O.Roots); + ShadowStack = std::move(O.ShadowStack); + NodeIdPairAllocator = O.NodeIdPairAllocator; + OverflowedFunctions = O.OverflowedFunctions; + return *this; + } + + ~FunctionCallTrie() XRAY_NEVER_INSTRUMENT {} - void enterFunction(const int32_t FId, uint64_t TSC) { + void enterFunction(const int32_t FId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { DCHECK_NE(FId, 0); - // This function primarily deals with ensuring that the ShadowStack is - // consistent and ready for when an exit event is encountered. + + // If we're already overflowed the function call stack, do not bother + // attempting to record any more function entries. + if (UNLIKELY(OverflowedFunctions)) { + ++OverflowedFunctions; + return; + } + + // If this is the first function we've encountered, we want to set up the + // node(s) and treat it as a root. 
if (UNLIKELY(ShadowStack.empty())) { - auto NewRoot = - Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId); + auto *NewRoot = Nodes.AppendEmplace( + nullptr, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId); if (UNLIKELY(NewRoot == nullptr)) return; - Roots.Append(NewRoot); - ShadowStack.AppendEmplace(TSC, NewRoot); + if (Roots.AppendEmplace(NewRoot) == nullptr) { + Nodes.trim(1); + return; + } + if (ShadowStack.AppendEmplace(TSC, NewRoot, CPU) == nullptr) { + Nodes.trim(1); + Roots.trim(1); + ++OverflowedFunctions; + return; + } return; } - auto &Top = ShadowStack.back(); - auto TopNode = Top.NodePtr; + // From this point on, we require that the stack is not empty. + DCHECK(!ShadowStack.empty()); + auto TopNode = ShadowStack.back().NodePtr; DCHECK_NE(TopNode, nullptr); - // If we've seen this callee before, then we just access that node and place - // that on the top of the stack. - auto Callee = TopNode->Callees.find_element( + // If we've seen this callee before, then we access that node and place that + // on the top of the stack. + auto* Callee = TopNode->Callees.find_element( [FId](const NodeIdPair &NR) { return NR.FId == FId; }); if (Callee != nullptr) { CHECK_NE(Callee->NodePtr, nullptr); - ShadowStack.AppendEmplace(TSC, Callee->NodePtr); + if (ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU) == nullptr) + ++OverflowedFunctions; return; } // This means we've never seen this stack before, create a new node here. - auto NewNode = - Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId); + auto* NewNode = Nodes.AppendEmplace( + TopNode, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId); if (UNLIKELY(NewNode == nullptr)) return; DCHECK_NE(NewNode, nullptr); TopNode->Callees.AppendEmplace(NewNode, FId); - ShadowStack.AppendEmplace(TSC, NewNode); - DCHECK_NE(ShadowStack.back().NodePtr, nullptr); + if (ShadowStack.AppendEmplace(TSC, NewNode, CPU) == nullptr) + ++OverflowedFunctions; return; } - void exitFunction(int32_t FId, uint64_t TSC) { + void exitFunction(int32_t FId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { + // If we're exiting functions that have "overflowed" or don't fit into the + // stack due to allocator constraints, we then decrement that count first. + if (OverflowedFunctions) { + --OverflowedFunctions; + return; + } + // When we exit a function, we look up the ShadowStack to see whether we've // entered this function before. We do as little processing here as we can, // since most of the hard work would have already been done at function // entry. uint64_t CumulativeTreeTime = 0; + while (!ShadowStack.empty()) { const auto &Top = ShadowStack.back(); auto TopNode = Top.NodePtr; DCHECK_NE(TopNode, nullptr); - auto LocalTime = TSC - Top.EntryTSC; + + // We may encounter overflow on the TSC we're provided, which may end up + // being less than the TSC when we first entered the function. + // + // To get the accurate measurement of cycles, we need to check whether + // we've overflowed (TSC < Top.EntryTSC) and then account the difference + // between the entry TSC and the max for the TSC counter (max of uint64_t) + // then add the value of TSC. We can prove that the maximum delta we will + // get is at most the 64-bit unsigned value, since the difference between + // a TSC of 0 and a Top.EntryTSC of 1 is (numeric_limits<uint64_t>::max() + // - 1) + 1. + // + // NOTE: This assumes that TSCs are synchronised across CPUs. + // TODO: Count the number of times we've seen CPU migrations. + uint64_t LocalTime = + Top.EntryTSC > TSC + ? 
(std::numeric_limits<uint64_t>::max() - Top.EntryTSC) + TSC + : TSC - Top.EntryTSC; TopNode->CallCount++; TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime; CumulativeTreeTime += LocalTime; @@ -323,7 +463,7 @@ public: } } - const RootArray &getRoots() const { return Roots; } + const RootArray &getRoots() const XRAY_NEVER_INSTRUMENT { return Roots; } // The deepCopyInto operation will update the provided FunctionCallTrie by // re-creating the contents of this particular FunctionCallTrie in the other @@ -338,7 +478,7 @@ public: // synchronisation of both "this" and |O|. // // This function must *not* be called with a non-empty FunctionCallTrie |O|. - void deepCopyInto(FunctionCallTrie &O) const { + void deepCopyInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT { DCHECK(O.getRoots().empty()); // We then push the root into a stack, to use as the parent marker for new @@ -356,18 +496,20 @@ public: for (const auto Root : getRoots()) { // Add a node in O for this root. auto NewRoot = O.Nodes.AppendEmplace( - nullptr, *O.NodeIdPairAllocator, Root->CallCount, + nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), Root->CallCount, Root->CumulativeLocalTime, Root->FId); // Because we cannot allocate more memory we should bail out right away. if (UNLIKELY(NewRoot == nullptr)) return; - O.Roots.Append(NewRoot); + if (UNLIKELY(O.Roots.Append(NewRoot) == nullptr)) + return; // TODO: Figure out what to do if we fail to allocate any more stack // space. Maybe warn or report once? - DFSStack.AppendEmplace(Root, NewRoot); + if (DFSStack.AppendEmplace(Root, NewRoot) == nullptr) + return; while (!DFSStack.empty()) { NodeAndParent NP = DFSStack.back(); DCHECK_NE(NP.Node, nullptr); @@ -375,12 +517,17 @@ public: DFSStack.trim(1); for (const auto Callee : NP.Node->Callees) { auto NewNode = O.Nodes.AppendEmplace( - NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount, - Callee.NodePtr->CumulativeLocalTime, Callee.FId); + NP.NewNode, NodeIdPairArray(*O.NodeIdPairAllocator), + Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime, + Callee.FId); if (UNLIKELY(NewNode == nullptr)) return; - NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId); - DFSStack.AppendEmplace(Callee.NodePtr, NewNode); + if (UNLIKELY(NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId) == + nullptr)) + return; + if (UNLIKELY(DFSStack.AppendEmplace(Callee.NodePtr, NewNode) == + nullptr)) + return; } } } @@ -394,7 +541,7 @@ public: // // This function is *not* thread-safe, and may require external // synchronisation of both "this" and |O|. 
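Stepping back to the LocalTime computation at the top of this hunk: the branch guards against the TSC wrapping between function entry and exit. A small standalone check of what that expression yields, with made-up counter values:

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Mirrors the branch used in exitFunction above.
    static uint64_t localTime(uint64_t EntryTSC, uint64_t ExitTSC) {
      return EntryTSC > ExitTSC
                 ? (std::numeric_limits<uint64_t>::max() - EntryTSC) + ExitTSC
                 : ExitTSC - EntryTSC;
    }

    int main() {
      // No wrap: entered at 1000, exited at 1500 -> 500 cycles.
      std::printf("%llu\n", (unsigned long long)localTime(1000, 1500));
      // Wrap: entered near the top of the counter, exited after it rolled over.
      uint64_t Max = std::numeric_limits<uint64_t>::max();
      std::printf("%llu\n", (unsigned long long)localTime(Max - 10, 5)); // 15
      return 0;
    }

For comparison, plain unsigned subtraction (ExitTSC - EntryTSC) would give 16 in the wrapped case here, since modular arithmetic absorbs the rollover on its own; the explicit branch comes out one tick lower in that case, which is negligible at TSC granularity.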
- void mergeInto(FunctionCallTrie &O) const { + void mergeInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT { struct NodeAndTarget { FunctionCallTrie::Node *OrigNode; FunctionCallTrie::Node *TargetNode; @@ -409,8 +556,9 @@ public: auto R = O.Roots.find_element( [&](const Node *Node) { return Node->FId == Root->FId; }); if (R == nullptr) { - TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0, - 0, Root->FId); + TargetRoot = O.Nodes.AppendEmplace( + nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u, + Root->FId); if (UNLIKELY(TargetRoot == nullptr)) return; @@ -419,7 +567,7 @@ public: TargetRoot = *R; } - DFSStack.Append(NodeAndTarget{Root, TargetRoot}); + DFSStack.AppendEmplace(Root, TargetRoot); while (!DFSStack.empty()) { NodeAndTarget NT = DFSStack.back(); DCHECK_NE(NT.OrigNode, nullptr); @@ -435,7 +583,8 @@ public: }); if (TargetCallee == nullptr) { auto NewTargetNode = O.Nodes.AppendEmplace( - NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId); + NT.TargetNode, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u, + Callee.FId); if (UNLIKELY(NewTargetNode == nullptr)) return; diff --git a/contrib/compiler-rt/lib/xray/xray_init.cc b/contrib/compiler-rt/lib/xray/xray_init.cc index b4e069795195..b0922aa8e379 100644 --- a/contrib/compiler-rt/lib/xray/xray_init.cc +++ b/contrib/compiler-rt/lib/xray/xray_init.cc @@ -27,6 +27,15 @@ extern const XRaySledEntry __start_xray_instr_map[] __attribute__((weak)); extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak)); extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)); extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)); + +#if SANITIZER_MAC +// HACK: This is a temporary workaround to make XRay build on +// Darwin, but it will probably not work at runtime. +const XRaySledEntry __start_xray_instr_map[] = {}; +extern const XRaySledEntry __stop_xray_instr_map[] = {}; +extern const XRayFunctionSledIndex __start_xray_fn_idx[] = {}; +extern const XRayFunctionSledIndex __stop_xray_fn_idx[] = {}; +#endif } using namespace __xray; @@ -58,6 +67,9 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { if (atomic_load(&XRayInitialized, memory_order_acquire)) return; + // XRAY is not compatible with PaX MPROTECT + CheckMPROTECT(); + if (!atomic_load(&XRayFlagsInitialized, memory_order_acquire)) { initializeFlags(); atomic_store(&XRayFlagsInitialized, true, memory_order_release); @@ -97,8 +109,8 @@ __attribute__((section(".preinit_array"), #else // If we cannot use the .preinit_array section, we should instead use dynamic // initialisation. 
-static bool UNUSED __local_xray_dyninit = [] { +__attribute__ ((constructor (0))) +static void __local_xray_dyninit() { __xray_init(); - return true; -}(); +} #endif diff --git a/contrib/compiler-rt/lib/xray/xray_interface.cc b/contrib/compiler-rt/lib/xray/xray_interface.cc index 01bf6ddc607e..6f7b6615b2c0 100644 --- a/contrib/compiler-rt/lib/xray/xray_interface.cc +++ b/contrib/compiler-rt/lib/xray/xray_interface.cc @@ -22,6 +22,13 @@ #include <string.h> #include <sys/mman.h> +#if SANITIZER_FUCHSIA +#include <zircon/process.h> +#include <zircon/sanitizer.h> +#include <zircon/status.h> +#include <zircon/syscalls.h> +#endif + #include "sanitizer_common/sanitizer_addrhashmap.h" #include "sanitizer_common/sanitizer_common.h" @@ -92,22 +99,48 @@ class MProtectHelper { public: explicit MProtectHelper(void *PageAlignedAddr, - std::size_t MProtectLen) XRAY_NEVER_INSTRUMENT + std::size_t MProtectLen, + std::size_t PageSize) XRAY_NEVER_INSTRUMENT : PageAlignedAddr(PageAlignedAddr), MProtectLen(MProtectLen), - MustCleanup(false) {} + MustCleanup(false) { +#if SANITIZER_FUCHSIA + MProtectLen = RoundUpTo(MProtectLen, PageSize); +#endif + } int MakeWriteable() XRAY_NEVER_INSTRUMENT { +#if SANITIZER_FUCHSIA + auto R = __sanitizer_change_code_protection( + reinterpret_cast<uintptr_t>(PageAlignedAddr), MProtectLen, true); + if (R != ZX_OK) { + Report("XRay: cannot change code protection: %s\n", + _zx_status_get_string(R)); + return -1; + } + MustCleanup = true; + return 0; +#else auto R = mprotect(PageAlignedAddr, MProtectLen, PROT_READ | PROT_WRITE | PROT_EXEC); if (R != -1) MustCleanup = true; return R; +#endif } ~MProtectHelper() XRAY_NEVER_INSTRUMENT { if (MustCleanup) { +#if SANITIZER_FUCHSIA + auto R = __sanitizer_change_code_protection( + reinterpret_cast<uintptr_t>(PageAlignedAddr), MProtectLen, false); + if (R != ZX_OK) { + Report("XRay: cannot change code protection: %s\n", + _zx_status_get_string(R)); + } +#else mprotect(PageAlignedAddr, MProtectLen, PROT_READ | PROT_EXEC); +#endif } } }; @@ -254,7 +287,7 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { reinterpret_cast<void *>(MinSled.Address & ~(PageSize - 1)); size_t MProtectLen = (MaxSled.Address - reinterpret_cast<uptr>(PageAlignedAddr)) + cSledLength; - MProtectHelper Protector(PageAlignedAddr, MProtectLen); + MProtectHelper Protector(PageAlignedAddr, MProtectLen, PageSize); if (Protector.MakeWriteable() == -1) { Report("Failed mprotect: %d\n", errno); return XRayPatchingStatus::FAILED; @@ -319,7 +352,7 @@ XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, reinterpret_cast<void *>(MinSled.Address & ~(PageSize - 1)); size_t MProtectLen = (MaxSled.Address - reinterpret_cast<uptr>(PageAlignedAddr)) + cSledLength; - MProtectHelper Protector(PageAlignedAddr, MProtectLen); + MProtectHelper Protector(PageAlignedAddr, MProtectLen, PageSize); if (Protector.MakeWriteable() == -1) { Report("Failed mprotect: %d\n", errno); return XRayPatchingStatus::FAILED; diff --git a/contrib/compiler-rt/lib/xray/xray_profile_collector.cc b/contrib/compiler-rt/lib/xray/xray_profile_collector.cc index 17a611eeacb8..dc3a82069840 100644 --- a/contrib/compiler-rt/lib/xray/xray_profile_collector.cc +++ b/contrib/compiler-rt/lib/xray/xray_profile_collector.cc @@ -13,10 +13,11 @@ // //===----------------------------------------------------------------------===// #include "xray_profile_collector.h" -#include "sanitizer_common/sanitizer_allocator_internal.h" #include "sanitizer_common/sanitizer_common.h" -#include 
"sanitizer_common/sanitizer_vector.h" +#include "xray_allocator.h" +#include "xray_defs.h" #include "xray_profiling_flags.h" +#include "xray_segmented_array.h" #include <memory> #include <pthread.h> #include <utility> @@ -29,7 +30,7 @@ namespace { SpinMutex GlobalMutex; struct ThreadTrie { tid_t TId; - FunctionCallTrie *Trie; + typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage; }; struct ProfileBuffer { @@ -56,65 +57,91 @@ struct BlockHeader { u64 ThreadId; }; -// These need to be pointers that point to heap/internal-allocator-allocated -// objects because these are accessed even at program exit. -Vector<ThreadTrie> *ThreadTries = nullptr; -Vector<ProfileBuffer> *ProfileBuffers = nullptr; -FunctionCallTrie::Allocators *GlobalAllocators = nullptr; +struct ThreadData { + BufferQueue *BQ; + FunctionCallTrie::Allocators::Buffers Buffers; + FunctionCallTrie::Allocators Allocators; + FunctionCallTrie FCT; + tid_t TId; +}; + +using ThreadDataArray = Array<ThreadData>; +using ThreadDataAllocator = ThreadDataArray::AllocatorType; + +// We use a separate buffer queue for the backing store for the allocator used +// by the ThreadData array. This lets us host the buffers, allocators, and tries +// associated with a thread by moving the data into the array instead of +// attempting to copy the data to a separately backed set of tries. +static typename std::aligned_storage< + sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage; +static BufferQueue *BQ = nullptr; +static BufferQueue::Buffer Buffer; +static typename std::aligned_storage<sizeof(ThreadDataAllocator), + alignof(ThreadDataAllocator)>::type + ThreadDataAllocatorStorage; +static typename std::aligned_storage<sizeof(ThreadDataArray), + alignof(ThreadDataArray)>::type + ThreadDataArrayStorage; + +static ThreadDataAllocator *TDAllocator = nullptr; +static ThreadDataArray *TDArray = nullptr; + +using ProfileBufferArray = Array<ProfileBuffer>; +using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType; + +// These need to be global aligned storage to avoid dynamic initialization. We +// need these to be aligned to allow us to placement new objects into the +// storage, and have pointers to those objects be appropriately aligned. +static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type + ProfileBuffersStorage; +static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type + ProfileBufferArrayAllocatorStorage; + +static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr; +static ProfileBufferArray *ProfileBuffers = nullptr; + +// Use a global flag to determine whether the collector implementation has been +// initialized. 
+static atomic_uint8_t CollectorInitialized{0}; } // namespace -void post(const FunctionCallTrie &T, tid_t TId) { - static pthread_once_t Once = PTHREAD_ONCE_INIT; - pthread_once(&Once, +[] { - SpinMutexLock Lock(&GlobalMutex); - GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>( - InternalAlloc(sizeof(FunctionCallTrie::Allocators))); - new (GlobalAllocators) FunctionCallTrie::Allocators(); - *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom( - profilingFlags()->global_allocator_max); - ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>( - InternalAlloc(sizeof(Vector<ThreadTrie>))); - new (ThreadTries) Vector<ThreadTrie>(); - ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>( - InternalAlloc(sizeof(Vector<ProfileBuffer>))); - new (ProfileBuffers) Vector<ProfileBuffer>(); - }); - DCHECK_NE(GlobalAllocators, nullptr); - DCHECK_NE(ThreadTries, nullptr); - DCHECK_NE(ProfileBuffers, nullptr); - - ThreadTrie *Item = nullptr; +void post(BufferQueue *Q, FunctionCallTrie &&T, + FunctionCallTrie::Allocators &&A, + FunctionCallTrie::Allocators::Buffers &&B, + tid_t TId) XRAY_NEVER_INSTRUMENT { + DCHECK_NE(Q, nullptr); + + // Bail out early if the collector has not been initialized. + if (!atomic_load(&CollectorInitialized, memory_order_acquire)) { + T.~FunctionCallTrie(); + A.~Allocators(); + Q->releaseBuffer(B.NodeBuffer); + Q->releaseBuffer(B.RootsBuffer); + Q->releaseBuffer(B.ShadowStackBuffer); + Q->releaseBuffer(B.NodeIdPairBuffer); + B.~Buffers(); + return; + } + { SpinMutexLock Lock(&GlobalMutex); - if (GlobalAllocators == nullptr) - return; - - Item = ThreadTries->PushBack(); - Item->TId = TId; - - // Here we're using the internal allocator instead of the managed allocator - // because: - // - // 1) We're not using the segmented array data structure to host - // FunctionCallTrie objects. We're using a Vector (from sanitizer_common) - // which works like a std::vector<...> keeping elements contiguous in - // memory. The segmented array data structure assumes that elements are - // trivially destructible, where FunctionCallTrie isn't. - // - // 2) Using a managed allocator means we need to manage that separately, - // which complicates the nature of this code. To get around that, we're - // using the internal allocator instead, which has its own global state - // and is decoupled from the lifetime management required by the managed - // allocator we have in XRay. - // - Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc( - sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie))); - DCHECK_NE(Item->Trie, nullptr); - new (Item->Trie) FunctionCallTrie(*GlobalAllocators); + DCHECK_NE(TDAllocator, nullptr); + DCHECK_NE(TDArray, nullptr); + + if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T), + TId) == nullptr) { + // If we fail to add the data to the array, we should destroy the objects + // handed us. + T.~FunctionCallTrie(); + A.~Allocators(); + Q->releaseBuffer(B.NodeBuffer); + Q->releaseBuffer(B.RootsBuffer); + Q->releaseBuffer(B.ShadowStackBuffer); + Q->releaseBuffer(B.NodeIdPairBuffer); + B.~Buffers(); + } } - - T.deepCopyInto(*Item->Trie); } // A PathArray represents the function id's representing a stack trace. In this @@ -127,18 +154,8 @@ struct ProfileRecord { // The Path in this record is the function id's from the leaf to the root of // the function call stack as represented from a FunctionCallTrie. 
- PathArray *Path = nullptr; - const FunctionCallTrie::Node *Node = nullptr; - - // Constructor for in-place construction. - ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N) - : Path([&] { - auto P = - reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray))); - new (P) PathArray(A); - return P; - }()), - Node(N) {} + PathArray Path; + const FunctionCallTrie::Node *Node; }; namespace { @@ -147,19 +164,21 @@ using ProfileRecordArray = Array<ProfileRecord>; // Walk a depth-first traversal of each root of the FunctionCallTrie to generate // the path(s) and the data associated with the path. -static void populateRecords(ProfileRecordArray &PRs, - ProfileRecord::PathAllocator &PA, - const FunctionCallTrie &Trie) { +static void +populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA, + const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT { using StackArray = Array<const FunctionCallTrie::Node *>; using StackAllocator = typename StackArray::AllocatorType; StackAllocator StackAlloc(profilingFlags()->stack_allocator_max); StackArray DFSStack(StackAlloc); - for (const auto R : Trie.getRoots()) { + for (const auto *R : Trie.getRoots()) { DFSStack.Append(R); while (!DFSStack.empty()) { - auto Node = DFSStack.back(); + auto *Node = DFSStack.back(); DFSStack.trim(1); - auto Record = PRs.AppendEmplace(PA, Node); + if (Node == nullptr) + continue; + auto Record = PRs.AppendEmplace(PathArray{PA}, Node); if (Record == nullptr) return; DCHECK_NE(Record, nullptr); @@ -167,8 +186,8 @@ static void populateRecords(ProfileRecordArray &PRs, // Traverse the Node's parents and as we're doing so, get the FIds in // the order they appear. for (auto N = Node; N != nullptr; N = N->Parent) - Record->Path->Append(N->FId); - DCHECK(!Record->Path->empty()); + Record->Path.Append(N->FId); + DCHECK(!Record->Path.empty()); for (const auto C : Node->Callees) DFSStack.Append(C.NodePtr); @@ -177,67 +196,89 @@ static void populateRecords(ProfileRecordArray &PRs, } static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header, - const ProfileRecordArray &ProfileRecords) { - auto NextPtr = static_cast<char *>( + const ProfileRecordArray &ProfileRecords) + XRAY_NEVER_INSTRUMENT { + auto NextPtr = static_cast<uint8_t *>( internal_memcpy(Buffer->Data, &Header, sizeof(Header))) + sizeof(Header); for (const auto &Record : ProfileRecords) { // List of IDs follow: - for (const auto FId : *Record.Path) + for (const auto FId : Record.Path) NextPtr = - static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) + + static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) + sizeof(FId); // Add the sentinel here. constexpr int32_t SentinelFId = 0; - NextPtr = static_cast<char *>( + NextPtr = static_cast<uint8_t *>( internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) + sizeof(SentinelFId); // Add the node data here. 
NextPtr = - static_cast<char *>(internal_memcpy(NextPtr, &Record.Node->CallCount, - sizeof(Record.Node->CallCount))) + + static_cast<uint8_t *>(internal_memcpy( + NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) + sizeof(Record.Node->CallCount); - NextPtr = static_cast<char *>( + NextPtr = static_cast<uint8_t *>( internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime, sizeof(Record.Node->CumulativeLocalTime))) + sizeof(Record.Node->CumulativeLocalTime); } - DCHECK_EQ(NextPtr - static_cast<char *>(Buffer->Data), Buffer->Size); + DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size); } } // namespace -void serialize() { +void serialize() XRAY_NEVER_INSTRUMENT { + if (!atomic_load(&CollectorInitialized, memory_order_acquire)) + return; + SpinMutexLock Lock(&GlobalMutex); - // Clear out the global ProfileBuffers. - for (uptr I = 0; I < ProfileBuffers->Size(); ++I) - InternalFree((*ProfileBuffers)[I].Data); - ProfileBuffers->Reset(); + // Clear out the global ProfileBuffers, if it's not empty. + for (auto &B : *ProfileBuffers) + deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size); + ProfileBuffers->trim(ProfileBuffers->size()); - if (ThreadTries->Size() == 0) + DCHECK_NE(TDArray, nullptr); + if (TDArray->empty()) return; // Then repopulate the global ProfileBuffers. - for (u32 I = 0; I < ThreadTries->Size(); ++I) { + u32 I = 0; + auto MaxSize = profilingFlags()->global_allocator_max; + auto ProfileArena = allocateBuffer(MaxSize); + if (ProfileArena == nullptr) + return; + + auto ProfileArenaCleanup = at_scope_exit( + [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); }); + + auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max); + if (PathArena == nullptr) + return; + + auto PathArenaCleanup = at_scope_exit( + [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); }); + + for (const auto &ThreadTrie : *TDArray) { using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType; - ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max); + ProfileRecordAllocator PRAlloc(ProfileArena, + profilingFlags()->global_allocator_max); ProfileRecord::PathAllocator PathAlloc( - profilingFlags()->global_allocator_max); + PathArena, profilingFlags()->global_allocator_max); ProfileRecordArray ProfileRecords(PRAlloc); // First, we want to compute the amount of space we're going to need. We'll // use a local allocator and an __xray::Array<...> to store the intermediary // data, then compute the size as we're going along. Then we'll allocate the // contiguous space to contain the thread buffer data. - const auto &Trie = *(*ThreadTries)[I].Trie; - if (Trie.getRoots().empty()) + if (ThreadTrie.FCT.getRoots().empty()) continue; - populateRecords(ProfileRecords, PathAlloc, Trie); - DCHECK(!Trie.getRoots().empty()); + + populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT); + DCHECK(!ThreadTrie.FCT.getRoots().empty()); DCHECK(!ProfileRecords.empty()); // Go through each record, to compute the sizes. 
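serializeRecords above fixes the on-wire shape of a single record: one 4-byte function id per path element, a 4-byte zero sentinel, then CallCount and CumulativeLocalTime (8 bytes each, per the 20 + 4 * path-length accounting that follows), with one 16-byte BlockHeader per block. A small sanity check of the per-record size:

    #include <cstdint>

    // Serialized size of one profile record, following the layout that
    // serializeRecords emits: 4 bytes per function id on the path, a 4-byte
    // sentinel, then two 8-byte counters.
    constexpr uint32_t recordBytes(uint32_t PathLen) {
      return 4 * PathLen + 4 + 8 + 8; // == 20 + 4 * PathLen
    }

    static_assert(recordBytes(1) == 24, "a root-only path");
    static_assert(recordBytes(3) == 32, "a three-deep call path");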
@@ -251,75 +292,103 @@ void serialize() { // + end of record (8 bytes) u32 CumulativeSizes = 0; for (const auto &Record : ProfileRecords) - CumulativeSizes += 20 + (4 * Record.Path->size()); - - BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId}; - auto Buffer = ProfileBuffers->PushBack(); - Buffer->Size = sizeof(Header) + CumulativeSizes; - Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64); - DCHECK_NE(Buffer->Data, nullptr); - serializeRecords(Buffer, Header, ProfileRecords); - - // Now clean up the ProfileRecords array, one at a time. - for (auto &Record : ProfileRecords) { - Record.Path->~PathArray(); - InternalFree(Record.Path); - } + CumulativeSizes += 20 + (4 * Record.Path.size()); + + BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId}; + auto B = ProfileBuffers->Append({}); + B->Size = sizeof(Header) + CumulativeSizes; + B->Data = allocateBuffer(B->Size); + DCHECK_NE(B->Data, nullptr); + serializeRecords(B, Header, ProfileRecords); } } -void reset() { +void reset() XRAY_NEVER_INSTRUMENT { + atomic_store(&CollectorInitialized, 0, memory_order_release); SpinMutexLock Lock(&GlobalMutex); + if (ProfileBuffers != nullptr) { // Clear out the profile buffers that have been serialized. - for (uptr I = 0; I < ProfileBuffers->Size(); ++I) - InternalFree((*ProfileBuffers)[I].Data); - ProfileBuffers->Reset(); - InternalFree(ProfileBuffers); + for (auto &B : *ProfileBuffers) + deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size); + ProfileBuffers->trim(ProfileBuffers->size()); ProfileBuffers = nullptr; } - if (ThreadTries != nullptr) { - // Clear out the function call tries per thread. - for (uptr I = 0; I < ThreadTries->Size(); ++I) { - auto &T = (*ThreadTries)[I]; - T.Trie->~FunctionCallTrie(); - InternalFree(T.Trie); + if (TDArray != nullptr) { + // Release the resources as required. + for (auto &TD : *TDArray) { + TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer); + TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer); + TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer); + TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer); } - ThreadTries->Reset(); - InternalFree(ThreadTries); - ThreadTries = nullptr; + // We don't bother destroying the array here because we've already + // potentially freed the backing store for the array. Instead we're going to + // reset the pointer to nullptr, and re-use the storage later instead + // (placement-new'ing into the storage as-is). + TDArray = nullptr; + } + + if (TDAllocator != nullptr) { + TDAllocator->~Allocator(); + TDAllocator = nullptr; + } + + if (Buffer.Data != nullptr) { + BQ->releaseBuffer(Buffer); } - // Reset the global allocators. 
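The new reset() continues below by re-creating every piece of collector state in place: each object lives in a static aligned_storage block, gets placement-new'ed during reset, and CollectorInitialized is only flipped back to 1 once everything is ready. A minimal sketch of that idiom, using std::atomic in place of the sanitizer atomics and hypothetical names (Registry, RegistryStorage, resetRegistry are illustrative only):

    #include <atomic>
    #include <cstdint>
    #include <new>
    #include <type_traits>

    struct Registry {
      int Count = 0;
    };

    // Static storage is zero-initialised at load time, so no dynamic
    // initialiser has to run; the object is built (and rebuilt) lazily.
    static typename std::aligned_storage<sizeof(Registry),
                                         alignof(Registry)>::type RegistryStorage;
    static Registry *RegistryPtr = nullptr;
    static std::atomic<uint8_t> RegistryReady{0};

    static void resetRegistry() {
      // Unpublish first so readers back off while we tear down and rebuild.
      RegistryReady.store(0, std::memory_order_release);
      if (RegistryPtr != nullptr)
        RegistryPtr->~Registry();
      RegistryPtr = new (&RegistryStorage) Registry();
      // Publish only after the object is fully constructed.
      RegistryReady.store(1, std::memory_order_release);
    }

    static Registry *getRegistry() {
      if (!RegistryReady.load(std::memory_order_acquire))
        return nullptr;
      return RegistryPtr;
    }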
- if (GlobalAllocators != nullptr) { - GlobalAllocators->~Allocators(); - InternalFree(GlobalAllocators); - GlobalAllocators = nullptr; + if (BQ == nullptr) { + bool Success = false; + new (&BufferQueueStorage) + BufferQueue(profilingFlags()->global_allocator_max, 1, Success); + if (!Success) + return; + BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage); + } else { + BQ->finalize(); + + if (BQ->init(profilingFlags()->global_allocator_max, 1) != + BufferQueue::ErrorCode::Ok) + return; } - GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>( - InternalAlloc(sizeof(FunctionCallTrie::Allocators))); - new (GlobalAllocators) FunctionCallTrie::Allocators(); - *GlobalAllocators = FunctionCallTrie::InitAllocators(); - ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>( - InternalAlloc(sizeof(Vector<ThreadTrie>))); - new (ThreadTries) Vector<ThreadTrie>(); - ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>( - InternalAlloc(sizeof(Vector<ProfileBuffer>))); - new (ProfileBuffers) Vector<ProfileBuffer>(); + + if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok) + return; + + new (&ProfileBufferArrayAllocatorStorage) + ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max); + ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>( + &ProfileBufferArrayAllocatorStorage); + + new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator); + ProfileBuffers = + reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage); + + new (&ThreadDataAllocatorStorage) + ThreadDataAllocator(Buffer.Data, Buffer.Size); + TDAllocator = + reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage); + new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator); + TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage); + + atomic_store(&CollectorInitialized, 1, memory_order_release); } -XRayBuffer nextBuffer(XRayBuffer B) { +XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT { SpinMutexLock Lock(&GlobalMutex); - if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0) + if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0) return {nullptr, 0}; static pthread_once_t Once = PTHREAD_ONCE_INIT; static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type FileHeaderStorage; - pthread_once(&Once, - +[] { new (&FileHeaderStorage) XRayProfilingFileHeader{}; }); + pthread_once( + &Once, +[]() XRAY_NEVER_INSTRUMENT { + new (&FileHeaderStorage) XRayProfilingFileHeader{}; + }); if (UNLIKELY(B.Data == nullptr)) { // The first buffer should always contain the file header information. @@ -336,7 +405,7 @@ XRayBuffer nextBuffer(XRayBuffer B) { BlockHeader Header; internal_memcpy(&Header, B.Data, sizeof(BlockHeader)); auto NextBlock = Header.BlockNum + 1; - if (NextBlock < ProfileBuffers->Size()) + if (NextBlock < ProfileBuffers->size()) return {(*ProfileBuffers)[NextBlock].Data, (*ProfileBuffers)[NextBlock].Size}; return {nullptr, 0}; diff --git a/contrib/compiler-rt/lib/xray/xray_profile_collector.h b/contrib/compiler-rt/lib/xray/xray_profile_collector.h index 335043db9526..86c4ce853797 100644 --- a/contrib/compiler-rt/lib/xray/xray_profile_collector.h +++ b/contrib/compiler-rt/lib/xray/xray_profile_collector.h @@ -33,27 +33,13 @@ namespace profileCollectorService { /// Posts the FunctionCallTrie associated with a specific Thread ID. This /// will: /// -/// - Make a copy of the FunctionCallTrie and store that against the Thread -/// ID. 
This will use the global allocator for the service-managed -/// FunctionCallTrie instances. -/// - Queue up a pointer to the FunctionCallTrie. -/// - If the queue is long enough (longer than some arbitrary threshold) we -/// then pre-calculate a single FunctionCallTrie for the whole process. +/// Moves the collection of FunctionCallTrie, Allocators, and Buffers associated +/// with a thread's data to the queue. This takes ownership of the memory +/// associated with a thread, and manages those exclusively. /// -/// -/// We are making a copy of the FunctionCallTrie because the intent is to have -/// this function be called at thread exit, or soon after the profiling -/// handler is finalized through the XRay APIs. By letting threads each -/// process their own thread-local FunctionCallTrie instances, we're removing -/// the need for synchronisation across threads while we're profiling. -/// However, once we're done profiling, we can then collect copies of these -/// FunctionCallTrie instances and pay the cost of the copy. -/// -/// NOTE: In the future, if this turns out to be more costly than "moving" the -/// FunctionCallTrie instances from the owning thread to the collector -/// service, then we can change the implementation to do it this way (moving) -/// instead. -void post(const FunctionCallTrie &T, tid_t TId); +void post(BufferQueue *Q, FunctionCallTrie &&T, + FunctionCallTrie::Allocators &&A, + FunctionCallTrie::Allocators::Buffers &&B, tid_t TId); /// The serialize will process all FunctionCallTrie instances in memory, and /// turn those into specifically formatted blocks, each describing the diff --git a/contrib/compiler-rt/lib/xray/xray_profiling.cc b/contrib/compiler-rt/lib/xray/xray_profiling.cc index d4b4345d764a..4323170cd1bb 100644 --- a/contrib/compiler-rt/lib/xray/xray_profiling.cc +++ b/contrib/compiler-rt/lib/xray/xray_profiling.cc @@ -19,7 +19,7 @@ #include "sanitizer_common/sanitizer_flags.h" #include "xray/xray_interface.h" #include "xray/xray_log_interface.h" - +#include "xray_buffer_queue.h" #include "xray_flags.h" #include "xray_profile_collector.h" #include "xray_profiling_flags.h" @@ -32,62 +32,167 @@ namespace __xray { namespace { -atomic_sint32_t ProfilerLogFlushStatus = { +static atomic_sint32_t ProfilerLogFlushStatus = { XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; -atomic_sint32_t ProfilerLogStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; +static atomic_sint32_t ProfilerLogStatus = { + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; -SpinMutex ProfilerOptionsMutex; +static SpinMutex ProfilerOptionsMutex; -struct alignas(64) ProfilingData { - FunctionCallTrie::Allocators *Allocators = nullptr; - FunctionCallTrie *FCT = nullptr; +struct ProfilingData { + atomic_uintptr_t Allocators; + atomic_uintptr_t FCT; }; static pthread_key_t ProfilingKey; -thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage{}; -static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { - thread_local auto ThreadOnce = [] { - new (&ThreadStorage) ProfilingData{}; - pthread_setspecific(ProfilingKey, &ThreadStorage); +// We use a global buffer queue, which gets initialized once at initialisation +// time, and gets reset when profiling is "done". 
+static std::aligned_storage<sizeof(BufferQueue), alignof(BufferQueue)>::type + BufferQueueStorage; +static BufferQueue *BQ = nullptr; + +thread_local FunctionCallTrie::Allocators::Buffers ThreadBuffers; +thread_local std::aligned_storage<sizeof(FunctionCallTrie::Allocators), + alignof(FunctionCallTrie::Allocators)>::type + AllocatorsStorage; +thread_local std::aligned_storage<sizeof(FunctionCallTrie), + alignof(FunctionCallTrie)>::type + FunctionCallTrieStorage; +thread_local ProfilingData TLD{{0}, {0}}; +thread_local atomic_uint8_t ReentranceGuard{0}; + +// We use a separate guard for ensuring that for this thread, if we're already +// cleaning up, that any signal handlers don't attempt to cleanup nor +// initialise. +thread_local atomic_uint8_t TLDInitGuard{0}; + +// We also use a separate latch to signal that the thread is exiting, and +// non-essential work should be ignored (things like recording events, etc.). +thread_local atomic_uint8_t ThreadExitingLatch{0}; + +static ProfilingData *getThreadLocalData() XRAY_NEVER_INSTRUMENT { + thread_local auto ThreadOnce = []() XRAY_NEVER_INSTRUMENT { + pthread_setspecific(ProfilingKey, &TLD); return false; }(); (void)ThreadOnce; - auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage); + RecursionGuard TLDInit(TLDInitGuard); + if (!TLDInit) + return nullptr; - // We need to check whether the global flag to finalizing/finalized has been - // switched. If it is, then we ought to not actually initialise the data. - auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire); - if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || - Status == XRayLogInitStatus::XRAY_LOG_FINALIZED) - return TLD; - - // If we're live, then we re-initialize TLD if the pointers are not null. - if (UNLIKELY(TLD.Allocators == nullptr && TLD.FCT == nullptr)) { - TLD.Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>( - InternalAlloc(sizeof(FunctionCallTrie::Allocators))); - new (TLD.Allocators) FunctionCallTrie::Allocators(); - *TLD.Allocators = FunctionCallTrie::InitAllocators(); - TLD.FCT = reinterpret_cast<FunctionCallTrie *>( - InternalAlloc(sizeof(FunctionCallTrie))); - new (TLD.FCT) FunctionCallTrie(*TLD.Allocators); + if (atomic_load_relaxed(&ThreadExitingLatch)) + return nullptr; + + uptr Allocators = 0; + if (atomic_compare_exchange_strong(&TLD.Allocators, &Allocators, 1, + memory_order_acq_rel)) { + bool Success = false; + auto AllocatorsUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { + if (!Success) + atomic_store(&TLD.Allocators, 0, memory_order_release); + }); + + // Acquire a set of buffers for this thread. 
+ if (BQ == nullptr) + return nullptr; + + if (BQ->getBuffer(ThreadBuffers.NodeBuffer) != BufferQueue::ErrorCode::Ok) + return nullptr; + auto NodeBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { + if (!Success) + BQ->releaseBuffer(ThreadBuffers.NodeBuffer); + }); + + if (BQ->getBuffer(ThreadBuffers.RootsBuffer) != BufferQueue::ErrorCode::Ok) + return nullptr; + auto RootsBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { + if (!Success) + BQ->releaseBuffer(ThreadBuffers.RootsBuffer); + }); + + if (BQ->getBuffer(ThreadBuffers.ShadowStackBuffer) != + BufferQueue::ErrorCode::Ok) + return nullptr; + auto ShadowStackBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { + if (!Success) + BQ->releaseBuffer(ThreadBuffers.ShadowStackBuffer); + }); + + if (BQ->getBuffer(ThreadBuffers.NodeIdPairBuffer) != + BufferQueue::ErrorCode::Ok) + return nullptr; + + Success = true; + new (&AllocatorsStorage) FunctionCallTrie::Allocators( + FunctionCallTrie::InitAllocatorsFromBuffers(ThreadBuffers)); + Allocators = reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage)); + atomic_store(&TLD.Allocators, Allocators, memory_order_release); + } + + if (Allocators == 1) + return nullptr; + + uptr FCT = 0; + if (atomic_compare_exchange_strong(&TLD.FCT, &FCT, 1, memory_order_acq_rel)) { + new (&FunctionCallTrieStorage) + FunctionCallTrie(*reinterpret_cast<FunctionCallTrie::Allocators *>( + atomic_load_relaxed(&TLD.Allocators))); + FCT = reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage)); + atomic_store(&TLD.FCT, FCT, memory_order_release); } - return TLD; + if (FCT == 1) + return nullptr; + + return &TLD; } static void cleanupTLD() XRAY_NEVER_INSTRUMENT { - auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage); - if (TLD.Allocators != nullptr && TLD.FCT != nullptr) { - TLD.FCT->~FunctionCallTrie(); - TLD.Allocators->~Allocators(); - InternalFree(TLD.FCT); - InternalFree(TLD.Allocators); - TLD.FCT = nullptr; - TLD.Allocators = nullptr; - } + auto FCT = atomic_exchange(&TLD.FCT, 0, memory_order_acq_rel); + if (FCT == reinterpret_cast<uptr>(reinterpret_cast<FunctionCallTrie *>( + &FunctionCallTrieStorage))) + reinterpret_cast<FunctionCallTrie *>(FCT)->~FunctionCallTrie(); + + auto Allocators = atomic_exchange(&TLD.Allocators, 0, memory_order_acq_rel); + if (Allocators == + reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage))) + reinterpret_cast<FunctionCallTrie::Allocators *>(Allocators)->~Allocators(); +} + +static void postCurrentThreadFCT(ProfilingData &T) XRAY_NEVER_INSTRUMENT { + RecursionGuard TLDInit(TLDInitGuard); + if (!TLDInit) + return; + + uptr P = atomic_exchange(&T.FCT, 0, memory_order_acq_rel); + if (P != reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage))) + return; + + auto FCT = reinterpret_cast<FunctionCallTrie *>(P); + DCHECK_NE(FCT, nullptr); + + uptr A = atomic_exchange(&T.Allocators, 0, memory_order_acq_rel); + if (A != + reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage))) + return; + + auto Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>(A); + DCHECK_NE(Allocators, nullptr); + + // Always move the data into the profile collector. + profileCollectorService::post(BQ, std::move(*FCT), std::move(*Allocators), + std::move(ThreadBuffers), GetTid()); + + // Re-initialize the ThreadBuffers object to a known "default" state. 
+ ThreadBuffers = FunctionCallTrie::Allocators::Buffers{}; } } // namespace @@ -100,9 +205,6 @@ const char *profilingCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT { #endif } -atomic_sint32_t ProfileFlushStatus = { - XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; - XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { if (atomic_load(&ProfilerLogStatus, memory_order_acquire) != XRayLogInitStatus::XRAY_LOG_FINALIZED) { @@ -111,12 +213,23 @@ XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; } - s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; - if (!atomic_compare_exchange_strong(&ProfilerLogFlushStatus, &Result, - XRayLogFlushStatus::XRAY_LOG_FLUSHING, - memory_order_acq_rel)) { + RecursionGuard SignalGuard(ReentranceGuard); + if (!SignalGuard) { + if (Verbosity()) + Report("Cannot finalize properly inside a signal handler!\n"); + atomic_store(&ProfilerLogFlushStatus, + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING, + memory_order_release); + return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + } + + s32 Previous = atomic_exchange(&ProfilerLogFlushStatus, + XRayLogFlushStatus::XRAY_LOG_FLUSHING, + memory_order_acq_rel); + if (Previous == XRayLogFlushStatus::XRAY_LOG_FLUSHING) { if (Verbosity()) - Report("Not flushing profiles, implementation still finalizing.\n"); + Report("Not flushing profiles, implementation still flushing.\n"); + return XRayLogFlushStatus::XRAY_LOG_FLUSHING; } // At this point, we'll create the file that will contain the profile, but @@ -129,49 +242,33 @@ XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { if (Verbosity()) Report("profiling: No data to flush.\n"); } else { - int Fd = getLogFD(); - if (Fd == -1) { + LogWriter *LW = LogWriter::Open(); + if (LW == nullptr) { if (Verbosity()) Report("profiling: Failed to flush to file, dropping data.\n"); } else { // Now for each of the buffers, write out the profile data as we would // see it in memory, verbatim. while (B.Data != nullptr && B.Size != 0) { - retryingWriteAll(Fd, reinterpret_cast<const char *>(B.Data), - reinterpret_cast<const char *>(B.Data) + B.Size); + LW->WriteAll(reinterpret_cast<const char *>(B.Data), + reinterpret_cast<const char *>(B.Data) + B.Size); B = profileCollectorService::nextBuffer(B); } - // Then we close out the file. - internal_close(Fd); } + LogWriter::Close(LW); } } profileCollectorService::reset(); - // Flush the current thread's local data structures as well. 
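Several of the paths in this file guard themselves with RecursionGuard over a thread_local atomic_uint8_t (ReentranceGuard, TLDInitGuard) so that a signal handler firing mid-operation bails out instead of re-entering. The guard class itself is defined elsewhere in the XRay sources; an equivalent, self-contained sketch (using std::atomic rather than the sanitizer atomics) would look roughly like this:

    #include <atomic>
    #include <cstdint>

    // Scoped reentrancy guard: the first construction on a given flag "wins";
    // nested constructions on the same flag convert to false so callers can
    // bail out instead of recursing (e.g. from a signal handler).
    class RecursionGuardSketch {
      std::atomic<uint8_t> &Flag;
      const bool Acquired;

    public:
      explicit RecursionGuardSketch(std::atomic<uint8_t> &F)
          : Flag(F), Acquired(F.exchange(1, std::memory_order_acq_rel) == 0) {}

      explicit operator bool() const { return Acquired; }

      ~RecursionGuardSketch() {
        if (Acquired)
          Flag.store(0, std::memory_order_release);
      }

      RecursionGuardSketch(const RecursionGuardSketch &) = delete;
      RecursionGuardSketch &operator=(const RecursionGuardSketch &) = delete;
    };

    // Usage mirrors the call sites above:
    //   thread_local std::atomic<uint8_t> Guard{0};
    //   RecursionGuardSketch G(Guard);
    //   if (!G)
    //     return;  // already inside this code path on this thread

Only the outermost guard that actually flipped the flag resets it on destruction, so nested scopes on the same thread simply observe a false guard and return early, which mirrors how profilingFlush and postCurrentThreadFCT above use it.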
- cleanupTLD(); - - atomic_store(&ProfilerLogStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, + atomic_store(&ProfilerLogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, + memory_order_release); + atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, memory_order_release); return XRayLogFlushStatus::XRAY_LOG_FLUSHED; } -namespace { - -thread_local atomic_uint8_t ReentranceGuard{0}; - -static void postCurrentThreadFCT(ProfilingData &TLD) { - if (TLD.Allocators == nullptr || TLD.FCT == nullptr) - return; - - profileCollectorService::post(*TLD.FCT, GetTid()); - cleanupTLD(); -} - -} // namespace - void profilingHandleArg0(int32_t FuncId, XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { unsigned char CPU; @@ -181,21 +278,29 @@ void profilingHandleArg0(int32_t FuncId, return; auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire); - auto &TLD = getThreadLocalData(); + if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_UNINITIALIZED || + Status == XRayLogInitStatus::XRAY_LOG_INITIALIZING)) + return; + if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED || Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) { postCurrentThreadFCT(TLD); return; } + auto T = getThreadLocalData(); + if (T == nullptr) + return; + + auto FCT = reinterpret_cast<FunctionCallTrie *>(atomic_load_relaxed(&T->FCT)); switch (Entry) { case XRayEntryType::ENTRY: case XRayEntryType::LOG_ARGS_ENTRY: - TLD.FCT->enterFunction(FuncId, TSC); + FCT->enterFunction(FuncId, TSC, CPU); break; case XRayEntryType::EXIT: case XRayEntryType::TAIL: - TLD.FCT->exitFunction(FuncId, TSC); + FCT->exitFunction(FuncId, TSC, CPU); break; default: // FIXME: Handle bugs. @@ -218,12 +323,22 @@ XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT { return static_cast<XRayLogInitStatus>(CurrentStatus); } + // Mark then finalize the current generation of buffers. This allows us to let + // the threads currently holding onto new buffers still use them, but let the + // last reference do the memory cleanup. + DCHECK_NE(BQ, nullptr); + BQ->finalize(); + // Wait a grace period to allow threads to see that we're finalizing. SleepForMillis(profilingFlags()->grace_period_ms); - // We also want to make sure that the current thread's data is cleaned up, - // if we have any. - auto &TLD = getThreadLocalData(); + // If we for some reason are entering this function from an instrumented + // handler, we bail out. + RecursionGuard G(ReentranceGuard); + if (!G) + return static_cast<XRayLogInitStatus>(CurrentStatus); + + // Post the current thread's data if we have any. postCurrentThreadFCT(TLD); // Then we force serialize the log data. @@ -235,19 +350,16 @@ XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT { } XRayLogInitStatus -profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options, +profilingLoggingInit(size_t, size_t, void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT { - if (BufferSize != 0 || BufferMax != 0) { - if (Verbosity()) - Report("__xray_log_init() being used, and is unsupported. Use " - "__xray_log_init_mode(...) instead. 
Bailing out."); + RecursionGuard G(ReentranceGuard); + if (!G) return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; - } s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZING, - memory_order_release)) { + memory_order_acq_rel)) { if (Verbosity()) Report("Cannot initialize already initialised profiling " "implementation.\n"); @@ -276,35 +388,88 @@ profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options, // We need to reset the profile data collection implementation now. profileCollectorService::reset(); - // We need to set up the exit handlers. - static pthread_once_t Once = PTHREAD_ONCE_INIT; - pthread_once(&Once, +[] { - pthread_key_create(&ProfilingKey, +[](void *P) { - // This is the thread-exit handler. - auto &TLD = *reinterpret_cast<ProfilingData *>(P); - if (TLD.Allocators == nullptr && TLD.FCT == nullptr) - return; - - postCurrentThreadFCT(TLD); - }); + // Then also reset the buffer queue implementation. + if (BQ == nullptr) { + bool Success = false; + new (&BufferQueueStorage) + BufferQueue(profilingFlags()->per_thread_allocator_max, + profilingFlags()->buffers_max, Success); + if (!Success) { + if (Verbosity()) + Report("Failed to initialize preallocated memory buffers!"); + atomic_store(&ProfilerLogStatus, + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, + memory_order_release); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + } - // We also need to set up an exit handler, so that we can get the profile - // information at exit time. We use the C API to do this, to not rely on C++ - // ABI functions for registering exit handlers. - Atexit(+[] { - // Finalize and flush. - if (profilingFinalize() != XRAY_LOG_FINALIZED) { - cleanupTLD(); - return; - } - if (profilingFlush() != XRAY_LOG_FLUSHED) { - cleanupTLD(); - return; - } + // If we've succeded, set the global pointer to the initialised storage. + BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage); + } else { + BQ->finalize(); + auto InitStatus = BQ->init(profilingFlags()->per_thread_allocator_max, + profilingFlags()->buffers_max); + + if (InitStatus != BufferQueue::ErrorCode::Ok) { if (Verbosity()) - Report("XRay Profile flushed at exit."); - }); - }); + Report("Failed to initialize preallocated memory buffers; error: %s", + BufferQueue::getErrorString(InitStatus)); + atomic_store(&ProfilerLogStatus, + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, + memory_order_release); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + } + + DCHECK(!BQ->finalizing()); + } + + // We need to set up the exit handlers. + static pthread_once_t Once = PTHREAD_ONCE_INIT; + pthread_once( + &Once, +[] { + pthread_key_create( + &ProfilingKey, +[](void *P) XRAY_NEVER_INSTRUMENT { + if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel)) + return; + + if (P == nullptr) + return; + + auto T = reinterpret_cast<ProfilingData *>(P); + if (atomic_load_relaxed(&T->Allocators) == 0) + return; + + { + // If we're somehow executing this while inside a + // non-reentrant-friendly context, we skip attempting to post + // the current thread's data. + RecursionGuard G(ReentranceGuard); + if (!G) + return; + + postCurrentThreadFCT(*T); + } + }); + + // We also need to set up an exit handler, so that we can get the + // profile information at exit time. We use the C API to do this, to not + // rely on C++ ABI functions for registering exit handlers. 
+ Atexit(+[]() XRAY_NEVER_INSTRUMENT { + if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel)) + return; + + auto Cleanup = + at_scope_exit([]() XRAY_NEVER_INSTRUMENT { cleanupTLD(); }); + + // Finalize and flush. + if (profilingFinalize() != XRAY_LOG_FINALIZED || + profilingFlush() != XRAY_LOG_FLUSHED) + return; + + if (Verbosity()) + Report("XRay Profile flushed at exit."); + }); + }); __xray_log_set_buffer_iterator(profileCollectorService::nextBuffer); __xray_set_handler(profilingHandleArg0); diff --git a/contrib/compiler-rt/lib/xray/xray_profiling_flags.inc b/contrib/compiler-rt/lib/xray/xray_profiling_flags.inc index e9230ae64187..ccd70860bf61 100644 --- a/contrib/compiler-rt/lib/xray/xray_profiling_flags.inc +++ b/contrib/compiler-rt/lib/xray/xray_profiling_flags.inc @@ -14,7 +14,7 @@ #error "Define XRAY_FLAG prior to including this file!" #endif -XRAY_FLAG(uptr, per_thread_allocator_max, 2 << 20, +XRAY_FLAG(uptr, per_thread_allocator_max, 16384, "Maximum size of any single per-thread allocator.") XRAY_FLAG(uptr, global_allocator_max, 2 << 24, "Maximum size of the global allocator for profile storage.") @@ -27,3 +27,6 @@ XRAY_FLAG(int, grace_period_ms, 1, XRAY_FLAG(bool, no_flush, false, "Set to true if we want the profiling implementation to not write " "out files.") +XRAY_FLAG(int, buffers_max, 128, + "The number of buffers to pre-allocate used by the profiling " + "implementation.") diff --git a/contrib/compiler-rt/lib/xray/xray_segmented_array.h b/contrib/compiler-rt/lib/xray/xray_segmented_array.h index 11dd794fa520..bc7e9379f63b 100644 --- a/contrib/compiler-rt/lib/xray/xray_segmented_array.h +++ b/contrib/compiler-rt/lib/xray/xray_segmented_array.h @@ -32,14 +32,9 @@ namespace __xray { /// is destroyed. When an Array is destroyed, it will destroy elements in the /// backing store but will not free the memory. template <class T> class Array { - struct SegmentBase { - SegmentBase *Prev; - SegmentBase *Next; - }; - - // We want each segment of the array to be cache-line aligned, and elements of - // the array be offset from the beginning of the segment. - struct Segment : SegmentBase { + struct Segment { + Segment *Prev; + Segment *Next; char Data[1]; }; @@ -62,98 +57,46 @@ public: // kCacheLineSize-multiple segments, minus the size of two pointers. // // - Request cacheline-multiple sized elements from the allocator. - static constexpr size_t AlignedElementStorageSize = + static constexpr uint64_t AlignedElementStorageSize = sizeof(typename std::aligned_storage<sizeof(T), alignof(T)>::type); - static constexpr size_t SegmentSize = - nearest_boundary(sizeof(Segment) + next_pow2(sizeof(T)), kCacheLineSize); + static constexpr uint64_t SegmentControlBlockSize = sizeof(Segment *) * 2; + + static constexpr uint64_t SegmentSize = nearest_boundary( + SegmentControlBlockSize + next_pow2(sizeof(T)), kCacheLineSize); using AllocatorType = Allocator<SegmentSize>; - static constexpr size_t ElementsPerSegment = - (SegmentSize - sizeof(Segment)) / next_pow2(sizeof(T)); + static constexpr uint64_t ElementsPerSegment = + (SegmentSize - SegmentControlBlockSize) / next_pow2(sizeof(T)); static_assert(ElementsPerSegment > 0, "Must have at least 1 element per segment."); - static SegmentBase SentinelSegment; - -private: - AllocatorType *Alloc; - SegmentBase *Head = &SentinelSegment; - SegmentBase *Tail = &SentinelSegment; - size_t Size = 0; - - // Here we keep track of segments in the freelist, to allow us to re-use - // segments when elements are trimmed off the end. 
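To make the sizing constants above concrete, here is the same arithmetic spelled out for one illustrative case: a 64-bit target, 64-byte cache lines, and a 16-byte element type (all three are assumptions for the example; nearest_boundary is taken to round up to the next multiple and next_pow2 to the next power of two):

    #include <cstdint>

    // Illustrative stand-ins for the helpers used above (assumed semantics:
    // round up to a power of two / to a multiple of Boundary).
    constexpr uint64_t nextPow2(uint64_t N, uint64_t P = 1) {
      return P >= N ? P : nextPow2(N, P * 2);
    }
    constexpr uint64_t roundUpTo(uint64_t N, uint64_t Boundary) {
      return ((N + Boundary - 1) / Boundary) * Boundary;
    }

    // Example element: 16 bytes (say, a pointer plus a 64-bit counter).
    constexpr uint64_t ElementSize = 16;
    constexpr uint64_t CacheLine = 64;                    // assumed
    constexpr uint64_t ControlBlock = 2 * sizeof(void *); // Prev + Next = 16
    constexpr uint64_t SegSize =
        roundUpTo(ControlBlock + nextPow2(ElementSize), CacheLine); // 64
    constexpr uint64_t PerSegment =
        (SegSize - ControlBlock) / nextPow2(ElementSize); // 3

    static_assert(SegSize == 64, "one cache line per segment in this example");
    static_assert(PerSegment == 3, "three 16-byte slots after the two pointers");

With these example numbers each segment is exactly one cache line: two list pointers followed by three element slots.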
- SegmentBase *Freelist = &SentinelSegment; - - Segment *NewSegment() { - // We need to handle the case in which enough elements have been trimmed to - // allow us to re-use segments we've allocated before. For this we look into - // the Freelist, to see whether we need to actually allocate new blocks or - // just re-use blocks we've already seen before. - if (Freelist != &SentinelSegment) { - auto *FreeSegment = Freelist; - Freelist = FreeSegment->Next; - FreeSegment->Next = &SentinelSegment; - Freelist->Prev = &SentinelSegment; - return static_cast<Segment *>(FreeSegment); - } - - auto SegmentBlock = Alloc->Allocate(); - if (SegmentBlock.Data == nullptr) - return nullptr; - - // Placement-new the Segment element at the beginning of the SegmentBlock. - auto S = reinterpret_cast<Segment *>(SegmentBlock.Data); - new (S) SegmentBase{&SentinelSegment, &SentinelSegment}; - return S; - } - - Segment *InitHeadAndTail() { - DCHECK_EQ(Head, &SentinelSegment); - DCHECK_EQ(Tail, &SentinelSegment); - auto Segment = NewSegment(); - if (Segment == nullptr) - return nullptr; - DCHECK_EQ(Segment->Next, &SentinelSegment); - DCHECK_EQ(Segment->Prev, &SentinelSegment); - Head = Tail = static_cast<SegmentBase *>(Segment); - return Segment; - } + static Segment SentinelSegment; - Segment *AppendNewSegment() { - auto S = NewSegment(); - if (S == nullptr) - return nullptr; - DCHECK_NE(Tail, &SentinelSegment); - DCHECK_EQ(Tail->Next, &SentinelSegment); - DCHECK_EQ(S->Prev, &SentinelSegment); - DCHECK_EQ(S->Next, &SentinelSegment); - Tail->Next = S; - S->Prev = Tail; - Tail = S; - return static_cast<Segment *>(Tail); - } + using size_type = uint64_t; +private: // This Iterator models a BidirectionalIterator. template <class U> class Iterator { - SegmentBase *S = &SentinelSegment; - size_t Offset = 0; - size_t Size = 0; + Segment *S = &SentinelSegment; + uint64_t Offset = 0; + uint64_t Size = 0; public: - Iterator(SegmentBase *IS, size_t Off, size_t S) - : S(IS), Offset(Off), Size(S) {} - Iterator(const Iterator &) noexcept = default; - Iterator() noexcept = default; - Iterator(Iterator &&) noexcept = default; - Iterator &operator=(const Iterator &) = default; - Iterator &operator=(Iterator &&) = default; - ~Iterator() = default; - - Iterator &operator++() { + Iterator(Segment *IS, uint64_t Off, uint64_t S) XRAY_NEVER_INSTRUMENT + : S(IS), + Offset(Off), + Size(S) {} + Iterator(const Iterator &) NOEXCEPT XRAY_NEVER_INSTRUMENT = default; + Iterator() NOEXCEPT XRAY_NEVER_INSTRUMENT = default; + Iterator(Iterator &&) NOEXCEPT XRAY_NEVER_INSTRUMENT = default; + Iterator &operator=(const Iterator &) XRAY_NEVER_INSTRUMENT = default; + Iterator &operator=(Iterator &&) XRAY_NEVER_INSTRUMENT = default; + ~Iterator() XRAY_NEVER_INSTRUMENT = default; + + Iterator &operator++() XRAY_NEVER_INSTRUMENT { if (++Offset % ElementsPerSegment || Offset == Size) return *this; @@ -168,7 +111,7 @@ private: return *this; } - Iterator &operator--() { + Iterator &operator--() XRAY_NEVER_INSTRUMENT { DCHECK_NE(S, &SentinelSegment); DCHECK_GT(Offset, 0); @@ -181,107 +124,295 @@ private: return *this; } - Iterator operator++(int) { + Iterator operator++(int) XRAY_NEVER_INSTRUMENT { Iterator Copy(*this); ++(*this); return Copy; } - Iterator operator--(int) { + Iterator operator--(int) XRAY_NEVER_INSTRUMENT { Iterator Copy(*this); --(*this); return Copy; } template <class V, class W> - friend bool operator==(const Iterator<V> &L, const Iterator<W> &R) { + friend bool operator==(const Iterator<V> &L, + const Iterator<W> &R) XRAY_NEVER_INSTRUMENT 
{ return L.S == R.S && L.Offset == R.Offset; } template <class V, class W> - friend bool operator!=(const Iterator<V> &L, const Iterator<W> &R) { + friend bool operator!=(const Iterator<V> &L, + const Iterator<W> &R) XRAY_NEVER_INSTRUMENT { return !(L == R); } - U &operator*() const { + U &operator*() const XRAY_NEVER_INSTRUMENT { DCHECK_NE(S, &SentinelSegment); auto RelOff = Offset % ElementsPerSegment; // We need to compute the character-aligned pointer, offset from the // segment's Data location to get the element in the position of Offset. - auto Base = static_cast<Segment *>(S)->Data; + auto Base = &S->Data; auto AlignedOffset = Base + (RelOff * AlignedElementStorageSize); return *reinterpret_cast<U *>(AlignedOffset); } - U *operator->() const { return &(**this); } + U *operator->() const XRAY_NEVER_INSTRUMENT { return &(**this); } }; + AllocatorType *Alloc; + Segment *Head; + Segment *Tail; + + // Here we keep track of segments in the freelist, to allow us to re-use + // segments when elements are trimmed off the end. + Segment *Freelist; + uint64_t Size; + + // =============================== + // In the following implementation, we work through the algorithms and the + // list operations using the following notation: + // + // - pred(s) is the predecessor (previous node accessor) and succ(s) is + // the successor (next node accessor). + // + // - S is a sentinel segment, which has the following property: + // + // pred(S) == succ(S) == S + // + // - @ is a loop operator, which can imply pred(s) == s if it appears on + // the left of s, or succ(s) == S if it appears on the right of s. + // + // - sL <-> sR : means a bidirectional relation between sL and sR, which + // means: + // + // succ(sL) == sR && pred(SR) == sL + // + // - sL -> sR : implies a unidirectional relation between sL and SR, + // with the following properties: + // + // succ(sL) == sR + // + // sL <- sR : implies a unidirectional relation between sR and sL, + // with the following properties: + // + // pred(sR) == sL + // + // =============================== + + Segment *NewSegment() XRAY_NEVER_INSTRUMENT { + // We need to handle the case in which enough elements have been trimmed to + // allow us to re-use segments we've allocated before. For this we look into + // the Freelist, to see whether we need to actually allocate new blocks or + // just re-use blocks we've already seen before. + if (Freelist != &SentinelSegment) { + // The current state of lists resemble something like this at this point: + // + // Freelist: @S@<-f0->...<->fN->@S@ + // ^ Freelist + // + // We want to perform a splice of `f0` from Freelist to a temporary list, + // which looks like: + // + // Templist: @S@<-f0->@S@ + // ^ FreeSegment + // + // Our algorithm preconditions are: + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + + // Then the algorithm we implement is: + // + // SFS = Freelist + // Freelist = succ(Freelist) + // if (Freelist != S) + // pred(Freelist) = S + // succ(SFS) = S + // pred(SFS) = S + // + auto *FreeSegment = Freelist; + Freelist = Freelist->Next; + + // Note that we need to handle the case where Freelist is now pointing to + // S, which we don't want to be overwriting. + // TODO: Determine whether the cost of the branch is higher than the cost + // of the blind assignment. 
+ if (Freelist != &SentinelSegment) + Freelist->Prev = &SentinelSegment; + + FreeSegment->Next = &SentinelSegment; + FreeSegment->Prev = &SentinelSegment; + + // Our postconditions are: + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + DCHECK_NE(FreeSegment, &SentinelSegment); + return FreeSegment; + } + + auto SegmentBlock = Alloc->Allocate(); + if (SegmentBlock.Data == nullptr) + return nullptr; + + // Placement-new the Segment element at the beginning of the SegmentBlock. + new (SegmentBlock.Data) Segment{&SentinelSegment, &SentinelSegment, {0}}; + auto SB = reinterpret_cast<Segment *>(SegmentBlock.Data); + return SB; + } + + Segment *InitHeadAndTail() XRAY_NEVER_INSTRUMENT { + DCHECK_EQ(Head, &SentinelSegment); + DCHECK_EQ(Tail, &SentinelSegment); + auto S = NewSegment(); + if (S == nullptr) + return nullptr; + DCHECK_EQ(S->Next, &SentinelSegment); + DCHECK_EQ(S->Prev, &SentinelSegment); + DCHECK_NE(S, &SentinelSegment); + Head = S; + Tail = S; + DCHECK_EQ(Head, Tail); + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(Tail->Prev, &SentinelSegment); + return S; + } + + Segment *AppendNewSegment() XRAY_NEVER_INSTRUMENT { + auto S = NewSegment(); + if (S == nullptr) + return nullptr; + DCHECK_NE(Tail, &SentinelSegment); + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(S->Prev, &SentinelSegment); + DCHECK_EQ(S->Next, &SentinelSegment); + S->Prev = Tail; + Tail->Next = S; + Tail = S; + DCHECK_EQ(S, S->Prev->Next); + DCHECK_EQ(Tail->Next, &SentinelSegment); + return S; + } + public: - explicit Array(AllocatorType &A) : Alloc(&A) {} + explicit Array(AllocatorType &A) XRAY_NEVER_INSTRUMENT + : Alloc(&A), + Head(&SentinelSegment), + Tail(&SentinelSegment), + Freelist(&SentinelSegment), + Size(0) {} + + Array() XRAY_NEVER_INSTRUMENT : Alloc(nullptr), + Head(&SentinelSegment), + Tail(&SentinelSegment), + Freelist(&SentinelSegment), + Size(0) {} Array(const Array &) = delete; - Array(Array &&O) NOEXCEPT : Alloc(O.Alloc), - Head(O.Head), - Tail(O.Tail), - Size(O.Size) { + Array &operator=(const Array &) = delete; + + Array(Array &&O) XRAY_NEVER_INSTRUMENT : Alloc(O.Alloc), + Head(O.Head), + Tail(O.Tail), + Freelist(O.Freelist), + Size(O.Size) { + O.Alloc = nullptr; O.Head = &SentinelSegment; O.Tail = &SentinelSegment; O.Size = 0; + O.Freelist = &SentinelSegment; } - bool empty() const { return Size == 0; } + Array &operator=(Array &&O) XRAY_NEVER_INSTRUMENT { + Alloc = O.Alloc; + O.Alloc = nullptr; + Head = O.Head; + O.Head = &SentinelSegment; + Tail = O.Tail; + O.Tail = &SentinelSegment; + Freelist = O.Freelist; + O.Freelist = &SentinelSegment; + Size = O.Size; + O.Size = 0; + return *this; + } + + ~Array() XRAY_NEVER_INSTRUMENT { + for (auto &E : *this) + (&E)->~T(); + } - AllocatorType &allocator() const { + bool empty() const XRAY_NEVER_INSTRUMENT { return Size == 0; } + + AllocatorType &allocator() const XRAY_NEVER_INSTRUMENT { DCHECK_NE(Alloc, nullptr); return *Alloc; } - size_t size() const { return Size; } + uint64_t size() const XRAY_NEVER_INSTRUMENT { return Size; } - T *Append(const T &E) { - if (UNLIKELY(Head == &SentinelSegment)) - if (InitHeadAndTail() == nullptr) + template <class... Args> + T *AppendEmplace(Args &&... 
args) XRAY_NEVER_INSTRUMENT { + DCHECK((Size == 0 && Head == &SentinelSegment && Head == Tail) || + (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment)); + if (UNLIKELY(Head == &SentinelSegment)) { + auto R = InitHeadAndTail(); + if (R == nullptr) return nullptr; + } + + DCHECK_NE(Head, &SentinelSegment); + DCHECK_NE(Tail, &SentinelSegment); auto Offset = Size % ElementsPerSegment; if (UNLIKELY(Size != 0 && Offset == 0)) if (AppendNewSegment() == nullptr) return nullptr; - auto Base = static_cast<Segment *>(Tail)->Data; + DCHECK_NE(Tail, &SentinelSegment); + auto Base = &Tail->Data; auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); - auto Position = reinterpret_cast<T *>(AlignedOffset); - *Position = E; + DCHECK_LE(AlignedOffset + sizeof(T), + reinterpret_cast<unsigned char *>(Base) + SegmentSize); + + // In-place construct at Position. + new (AlignedOffset) T{std::forward<Args>(args)...}; ++Size; - return Position; + return reinterpret_cast<T *>(AlignedOffset); } - template <class... Args> T *AppendEmplace(Args &&... args) { - if (UNLIKELY(Head == &SentinelSegment)) - if (InitHeadAndTail() == nullptr) + T *Append(const T &E) XRAY_NEVER_INSTRUMENT { + // FIXME: This is a duplication of AppenEmplace with the copy semantics + // explicitly used, as a work-around to GCC 4.8 not invoking the copy + // constructor with the placement new with braced-init syntax. + DCHECK((Size == 0 && Head == &SentinelSegment && Head == Tail) || + (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment)); + if (UNLIKELY(Head == &SentinelSegment)) { + auto R = InitHeadAndTail(); + if (R == nullptr) return nullptr; + } + + DCHECK_NE(Head, &SentinelSegment); + DCHECK_NE(Tail, &SentinelSegment); auto Offset = Size % ElementsPerSegment; - auto *LatestSegment = Tail; - if (UNLIKELY(Size != 0 && Offset == 0)) { - LatestSegment = AppendNewSegment(); - if (LatestSegment == nullptr) + if (UNLIKELY(Size != 0 && Offset == 0)) + if (AppendNewSegment() == nullptr) return nullptr; - } DCHECK_NE(Tail, &SentinelSegment); - auto Base = static_cast<Segment *>(LatestSegment)->Data; + auto Base = &Tail->Data; auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); - auto Position = reinterpret_cast<T *>(AlignedOffset); + DCHECK_LE(AlignedOffset + sizeof(T), + reinterpret_cast<unsigned char *>(Tail) + SegmentSize); // In-place construct at Position. - new (Position) T{std::forward<Args>(args)...}; + new (AlignedOffset) T(E); ++Size; - return reinterpret_cast<T *>(Position); + return reinterpret_cast<T *>(AlignedOffset); } - T &operator[](size_t Offset) const { + T &operator[](uint64_t Offset) const XRAY_NEVER_INSTRUMENT { DCHECK_LE(Offset, Size); // We need to traverse the array enough times to find the element at Offset. 
auto S = Head; @@ -290,19 +421,19 @@ public: Offset -= ElementsPerSegment; DCHECK_NE(S, &SentinelSegment); } - auto Base = static_cast<Segment *>(S)->Data; + auto Base = &S->Data; auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); auto Position = reinterpret_cast<T *>(AlignedOffset); return *reinterpret_cast<T *>(Position); } - T &front() const { + T &front() const XRAY_NEVER_INSTRUMENT { DCHECK_NE(Head, &SentinelSegment); DCHECK_NE(Size, 0u); return *begin(); } - T &back() const { + T &back() const XRAY_NEVER_INSTRUMENT { DCHECK_NE(Tail, &SentinelSegment); DCHECK_NE(Size, 0u); auto It = end(); @@ -310,7 +441,8 @@ public: return *It; } - template <class Predicate> T *find_element(Predicate P) const { + template <class Predicate> + T *find_element(Predicate P) const XRAY_NEVER_INSTRUMENT { if (empty()) return nullptr; @@ -324,51 +456,195 @@ public: /// Remove N Elements from the end. This leaves the blocks behind, and not /// require allocation of new blocks for new elements added after trimming. - void trim(size_t Elements) { - DCHECK_LE(Elements, Size); - DCHECK_GT(Size, 0); + void trim(uint64_t Elements) XRAY_NEVER_INSTRUMENT { auto OldSize = Size; + Elements = Elements > Size ? Size : Elements; Size -= Elements; - DCHECK_NE(Head, &SentinelSegment); - DCHECK_NE(Tail, &SentinelSegment); - - for (auto SegmentsToTrim = (nearest_boundary(OldSize, ElementsPerSegment) - - nearest_boundary(Size, ElementsPerSegment)) / - ElementsPerSegment; - SegmentsToTrim > 0; --SegmentsToTrim) { + // We compute the number of segments we're going to return from the tail by + // counting how many elements have been trimmed. Given the following: + // + // - Each segment has N valid positions, where N > 0 + // - The previous size > current size + // + // To compute the number of segments to return, we need to perform the + // following calculations for the number of segments required given 'x' + // elements: + // + // f(x) = { + // x == 0 : 0 + // , 0 < x <= N : 1 + // , N < x <= max : x / N + (x % N ? 1 : 0) + // } + // + // We can simplify this down to: + // + // f(x) = { + // x == 0 : 0, + // , 0 < x <= max : x / N + (x < N || x % N ? 1 : 0) + // } + // + // And further down to: + // + // f(x) = x ? x / N + (x < N || x % N ? 1 : 0) : 0 + // + // We can then perform the following calculation `s` which counts the number + // of segments we need to remove from the end of the data structure: + // + // s(p, c) = f(p) - f(c) + // + // If we treat p = previous size, and c = current size, and given the + // properties above, the possible range for s(...) is [0..max(typeof(p))/N] + // given that typeof(p) == typeof(c). + auto F = [](uint64_t X) { + return X ? (X / ElementsPerSegment) + + (X < ElementsPerSegment || X % ElementsPerSegment ? 1 : 0) + : 0; + }; + auto PS = F(OldSize); + auto CS = F(Size); + DCHECK_GE(PS, CS); + auto SegmentsToTrim = PS - CS; + for (auto I = 0uL; I < SegmentsToTrim; ++I) { + // Here we place the current tail segment to the freelist. To do this + // appropriately, we need to perform a splice operation on two + // bidirectional linked-lists. In particular, we have the current state of + // the doubly-linked list of segments: + // + // @S@ <- s0 <-> s1 <-> ... <-> sT -> @S@ + // DCHECK_NE(Head, &SentinelSegment); DCHECK_NE(Tail, &SentinelSegment); - // Put the tail into the Freelist. 
- auto *FreeSegment = Tail; - Tail = Tail->Prev; - if (Tail == &SentinelSegment) - Head = Tail; - else - Tail->Next = &SentinelSegment; - DCHECK_EQ(Tail->Next, &SentinelSegment); - FreeSegment->Next = Freelist; - FreeSegment->Prev = &SentinelSegment; - if (Freelist != &SentinelSegment) - Freelist->Prev = FreeSegment; - Freelist = FreeSegment; + + if (Freelist == &SentinelSegment) { + // Our two lists at this point are in this configuration: + // + // Freelist: (potentially) @S@ + // Mainlist: @S@<-s0<->s1<->...<->sPT<->sT->@S@ + // ^ Head ^ Tail + // + // The end state for us will be this configuration: + // + // Freelist: @S@<-sT->@S@ + // Mainlist: @S@<-s0<->s1<->...<->sPT->@S@ + // ^ Head ^ Tail + // + // The first step for us is to hold a reference to the tail of Mainlist, + // which in our notation is represented by sT. We call this our "free + // segment" which is the segment we are placing on the Freelist. + // + // sF = sT + // + // Then, we also hold a reference to the "pre-tail" element, which we + // call sPT: + // + // sPT = pred(sT) + // + // We want to splice sT into the beginning of the Freelist, which in + // an empty Freelist means placing a segment whose predecessor and + // successor is the sentinel segment. + // + // The splice operation then can be performed in the following + // algorithm: + // + // succ(sPT) = S + // pred(sT) = S + // succ(sT) = Freelist + // Freelist = sT + // Tail = sPT + // + auto SPT = Tail->Prev; + SPT->Next = &SentinelSegment; + Tail->Prev = &SentinelSegment; + Tail->Next = Freelist; + Freelist = Tail; + Tail = SPT; + + // Our post-conditions here are: + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + } else { + // In the other case, where the Freelist is not empty, we perform the + // following transformation instead: + // + // This transforms the current state: + // + // Freelist: @S@<-f0->@S@ + // ^ Freelist + // Mainlist: @S@<-s0<->s1<->...<->sPT<->sT->@S@ + // ^ Head ^ Tail + // + // Into the following: + // + // Freelist: @S@<-sT<->f0->@S@ + // ^ Freelist + // Mainlist: @S@<-s0<->s1<->...<->sPT->@S@ + // ^ Head ^ Tail + // + // The algorithm is: + // + // sFH = Freelist + // sPT = pred(sT) + // pred(SFH) = sT + // succ(sT) = Freelist + // pred(sT) = S + // succ(sPT) = S + // Tail = sPT + // Freelist = sT + // + auto SFH = Freelist; + auto SPT = Tail->Prev; + auto ST = Tail; + SFH->Prev = ST; + ST->Next = Freelist; + ST->Prev = &SentinelSegment; + SPT->Next = &SentinelSegment; + Tail = SPT; + Freelist = ST; + + // Our post-conditions here are: + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + DCHECK_EQ(Freelist->Next->Prev, Freelist); + } } + + // Now in case we've spliced all the segments in the end, we ensure that the + // main list is "empty", or both the head and tail pointing to the sentinel + // segment. + if (Tail == &SentinelSegment) + Head = Tail; + + DCHECK( + (Size == 0 && Head == &SentinelSegment && Tail == &SentinelSegment) || + (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment)); + DCHECK( + (Freelist != &SentinelSegment && Freelist->Prev == &SentinelSegment) || + (Freelist == &SentinelSegment && Tail->Next == &SentinelSegment)); } // Provide iterators. 
- Iterator<T> begin() const { return Iterator<T>(Head, 0, Size); } - Iterator<T> end() const { return Iterator<T>(Tail, Size, Size); } - Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0, Size); } - Iterator<const T> cend() const { return Iterator<const T>(Tail, Size, Size); } + Iterator<T> begin() const XRAY_NEVER_INSTRUMENT { + return Iterator<T>(Head, 0, Size); + } + Iterator<T> end() const XRAY_NEVER_INSTRUMENT { + return Iterator<T>(Tail, Size, Size); + } + Iterator<const T> cbegin() const XRAY_NEVER_INSTRUMENT { + return Iterator<const T>(Head, 0, Size); + } + Iterator<const T> cend() const XRAY_NEVER_INSTRUMENT { + return Iterator<const T>(Tail, Size, Size); + } }; // We need to have this storage definition out-of-line so that the compiler can // ensure that storage for the SentinelSegment is defined and has a single // address. template <class T> -typename Array<T>::SegmentBase Array<T>::SentinelSegment{ - &Array<T>::SentinelSegment, &Array<T>::SentinelSegment}; +typename Array<T>::Segment Array<T>::SentinelSegment{ + &Array<T>::SentinelSegment, &Array<T>::SentinelSegment, {'\0'}}; } // namespace __xray diff --git a/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S b/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S index 99ad3966ee3a..52985ffd19ab 100644 --- a/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S +++ b/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S @@ -19,6 +19,7 @@ .macro SAVE_REGISTERS + pushfq subq $240, %rsp CFI_DEF_CFA_OFFSET(248) movq %rbp, 232(%rsp) @@ -69,6 +70,7 @@ movq 8(%rsp), %r14 movq 0(%rsp), %r15 addq $240, %rsp + popfq CFI_DEF_CFA_OFFSET(8) .endm @@ -89,10 +91,10 @@ .text #if !defined(__APPLE__) .section .text + .file "xray_trampoline_x86.S" #else .section __TEXT,__text #endif - .file "xray_trampoline_x86.S" //===----------------------------------------------------------------------===// diff --git a/contrib/compiler-rt/lib/xray/xray_tsc.h b/contrib/compiler-rt/lib/xray/xray_tsc.h index 4507564e7cd2..180d6df188c1 100644 --- a/contrib/compiler-rt/lib/xray/xray_tsc.h +++ b/contrib/compiler-rt/lib/xray/xray_tsc.h @@ -13,10 +13,32 @@ #ifndef XRAY_EMULATE_TSC_H #define XRAY_EMULATE_TSC_H +#include "sanitizer_common/sanitizer_common.h" + namespace __xray { static constexpr uint64_t NanosecondsPerSecond = 1000ULL * 1000 * 1000; } +#if SANITIZER_FUCHSIA +#include <zircon/syscalls.h> + +namespace __xray { + +inline bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } + +ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT { + CPU = 0; + return _zx_ticks_get(); +} + +inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { + return _zx_ticks_per_second(); +} + +} // namespace __xray + +#else // SANITIZER_FUCHSIA + #if defined(__x86_64__) #include "xray_x86_64.inc" #elif defined(__powerpc64__) @@ -64,5 +86,6 @@ inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { #else #error Target architecture is not supported. 
#endif // CPU architecture +#endif // SANITIZER_FUCHSIA #endif // XRAY_EMULATE_TSC_H diff --git a/contrib/compiler-rt/lib/xray/xray_utils.cc b/contrib/compiler-rt/lib/xray/xray_utils.cc index 68f4e8c1094c..59ba6c3082b2 100644 --- a/contrib/compiler-rt/lib/xray/xray_utils.cc +++ b/contrib/compiler-rt/lib/xray/xray_utils.cc @@ -12,7 +12,9 @@ //===----------------------------------------------------------------------===// #include "xray_utils.h" +#include "sanitizer_common/sanitizer_allocator_internal.h" #include "sanitizer_common/sanitizer_common.h" +#include "xray_allocator.h" #include "xray_defs.h" #include "xray_flags.h" #include <cstdio> @@ -25,13 +27,113 @@ #include <unistd.h> #include <utility> +#if SANITIZER_FUCHSIA +#include "sanitizer_common/sanitizer_symbolizer_fuchsia.h" + +#include <inttypes.h> +#include <zircon/process.h> +#include <zircon/sanitizer.h> +#include <zircon/status.h> +#include <zircon/syscalls.h> +#endif + namespace __xray { -void printToStdErr(const char *Buffer) XRAY_NEVER_INSTRUMENT { - fprintf(stderr, "%s", Buffer); +#if SANITIZER_FUCHSIA +constexpr const char* ProfileSinkName = "llvm-xray"; + +LogWriter::~LogWriter() { + _zx_handle_close(Vmo); +} + +void LogWriter::WriteAll(const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT { + if (Begin == End) + return; + auto TotalBytes = std::distance(Begin, End); + + const size_t PageSize = flags()->xray_page_size_override > 0 + ? flags()->xray_page_size_override + : GetPageSizeCached(); + if (RoundUpTo(Offset, PageSize) != RoundUpTo(Offset + TotalBytes, PageSize)) { + // Resize the VMO to ensure there's sufficient space for the data. + zx_status_t Status = _zx_vmo_set_size(Vmo, Offset + TotalBytes); + if (Status != ZX_OK) { + Report("Failed to resize VMO: %s\n", _zx_status_get_string(Status)); + return; + } + } + + // Write the data into VMO. + zx_status_t Status = _zx_vmo_write(Vmo, Begin, Offset, TotalBytes); + if (Status != ZX_OK) { + Report("Failed to write: %s\n", _zx_status_get_string(Status)); + return; + } + Offset += TotalBytes; +} + +void LogWriter::Flush() XRAY_NEVER_INSTRUMENT { + // Nothing to do here since WriteAll writes directly into the VMO. +} + +LogWriter *LogWriter::Open() XRAY_NEVER_INSTRUMENT { + // Create VMO to hold the profile data. + zx_handle_t Vmo; + zx_status_t Status = _zx_vmo_create(0, 0, &Vmo); + if (Status != ZX_OK) { + Report("XRay: cannot create VMO: %s\n", _zx_status_get_string(Status)); + return nullptr; + } + + // Get the KOID of the current process to use in the VMO name. + zx_info_handle_basic_t Info; + Status = _zx_object_get_info(_zx_process_self(), ZX_INFO_HANDLE_BASIC, &Info, + sizeof(Info), NULL, NULL); + if (Status != ZX_OK) { + Report("XRay: cannot get basic info about current process handle: %s\n", + _zx_status_get_string(Status)); + return nullptr; + } + + // Give the VMO a name including our process KOID so it's easy to spot. + char VmoName[ZX_MAX_NAME_LEN]; + internal_snprintf(VmoName, sizeof(VmoName), "%s.%zu", ProfileSinkName, + Info.koid); + _zx_object_set_property(Vmo, ZX_PROP_NAME, VmoName, strlen(VmoName)); + + // Duplicate the handle since __sanitizer_publish_data consumes it and + // LogWriter needs to hold onto it. + zx_handle_t Handle; + Status =_zx_handle_duplicate(Vmo, ZX_RIGHT_SAME_RIGHTS, &Handle); + if (Status != ZX_OK) { + Report("XRay: cannot duplicate VMO handle: %s\n", + _zx_status_get_string(Status)); + return nullptr; + } + + // Publish the VMO that receives the logging. Note the VMO's contents can + // grow and change after publication. 
The contents won't be read out until + // after the process exits. + __sanitizer_publish_data(ProfileSinkName, Handle); + + // Use the dumpfile symbolizer markup element to write the name of the VMO. + Report("XRay: " FORMAT_DUMPFILE "\n", ProfileSinkName, VmoName); + + LogWriter *LW = reinterpret_cast<LogWriter *>(InternalAlloc(sizeof(LogWriter))); + new (LW) LogWriter(Vmo); + return LW; +} + +void LogWriter::Close(LogWriter *LW) { + LW->~LogWriter(); + InternalFree(LW); +} +#else // SANITIZER_FUCHSIA +LogWriter::~LogWriter() { + internal_close(Fd); } -void retryingWriteAll(int Fd, const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT { +void LogWriter::WriteAll(const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT { if (Begin == End) return; auto TotalBytes = std::distance(Begin, End); @@ -49,50 +151,11 @@ void retryingWriteAll(int Fd, const char *Begin, const char *End) XRAY_NEVER_INS } } -std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin, - char *End) XRAY_NEVER_INSTRUMENT { - auto BytesToRead = std::distance(Begin, End); - ssize_t BytesRead; - ssize_t TotalBytesRead = 0; - while (BytesToRead && (BytesRead = read(Fd, Begin, BytesToRead))) { - if (BytesRead == -1) { - if (errno == EINTR) - continue; - Report("Read error; errno = %d\n", errno); - return std::make_pair(TotalBytesRead, false); - } - - TotalBytesRead += BytesRead; - BytesToRead -= BytesRead; - Begin += BytesRead; - } - return std::make_pair(TotalBytesRead, true); -} - -bool readValueFromFile(const char *Filename, - long long *Value) XRAY_NEVER_INSTRUMENT { - int Fd = open(Filename, O_RDONLY | O_CLOEXEC); - if (Fd == -1) - return false; - static constexpr size_t BufSize = 256; - char Line[BufSize] = {}; - ssize_t BytesRead; - bool Success; - std::tie(BytesRead, Success) = retryingReadSome(Fd, Line, Line + BufSize); - if (!Success) - return false; - close(Fd); - const char *End = nullptr; - long long Tmp = internal_simple_strtoll(Line, &End, 10); - bool Result = false; - if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) { - *Value = Tmp; - Result = true; - } - return Result; +void LogWriter::Flush() XRAY_NEVER_INSTRUMENT { + fsync(Fd); } -int getLogFD() XRAY_NEVER_INSTRUMENT { +LogWriter *LogWriter::Open() XRAY_NEVER_INSTRUMENT { // Open a temporary file once for the log. 
char TmpFilename[256] = {}; char TmpWildcardPattern[] = "XXXXXX"; @@ -103,24 +166,31 @@ int getLogFD() XRAY_NEVER_INSTRUMENT { if (LastSlash != nullptr) Progname = LastSlash + 1; - const int HalfLength = sizeof(TmpFilename) / 2 - sizeof(TmpWildcardPattern); int NeededLength = internal_snprintf( - TmpFilename, sizeof(TmpFilename), "%.*s%.*s.%s", HalfLength, - flags()->xray_logfile_base, HalfLength, Progname, TmpWildcardPattern); + TmpFilename, sizeof(TmpFilename), "%s%s.%s", + flags()->xray_logfile_base, Progname, TmpWildcardPattern); if (NeededLength > int(sizeof(TmpFilename))) { Report("XRay log file name too long (%d): %s\n", NeededLength, TmpFilename); - return -1; + return nullptr; } int Fd = mkstemp(TmpFilename); if (Fd == -1) { Report("XRay: Failed opening temporary file '%s'; not logging events.\n", TmpFilename); - return -1; + return nullptr; } if (Verbosity()) Report("XRay: Log file in '%s'\n", TmpFilename); - return Fd; + LogWriter *LW = allocate<LogWriter>(); + new (LW) LogWriter(Fd); + return LW; +} + +void LogWriter::Close(LogWriter *LW) { + LW->~LogWriter(); + deallocate(LW); } +#endif // SANITIZER_FUCHSIA } // namespace __xray diff --git a/contrib/compiler-rt/lib/xray/xray_utils.h b/contrib/compiler-rt/lib/xray/xray_utils.h index eafa16e1a9d5..60438973fbd0 100644 --- a/contrib/compiler-rt/lib/xray/xray_utils.h +++ b/contrib/compiler-rt/lib/xray/xray_utils.h @@ -20,23 +20,40 @@ #include <sys/types.h> #include <utility> -namespace __xray { - -// Default implementation of the reporting interface for sanitizer errors. -void printToStdErr(const char *Buffer); - -// EINTR-safe write routine, provided a file descriptor and a character range. -void retryingWriteAll(int Fd, const char *Begin, const char *End); +#include "sanitizer_common/sanitizer_common.h" +#if SANITIZER_FUCHSIA +#include <zircon/types.h> +#endif -// Reads a long long value from a provided file. -bool readValueFromFile(const char *Filename, long long *Value); - -// EINTR-safe read routine, providing a file descriptor and a character range. -std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin, char *End); +namespace __xray { -// EINTR-safe open routine, uses flag-provided values for initialising a log -// file. -int getLogFD(); +class LogWriter { +public: +#if SANITIZER_FUCHSIA + LogWriter(zx_handle_t Vmo) : Vmo(Vmo) {} +#else + explicit LogWriter(int Fd) : Fd(Fd) {} +#endif + ~LogWriter(); + + // Write a character range into a log. + void WriteAll(const char *Begin, const char *End); + + void Flush(); + + // Returns a new log instance initialized using the flag-provided values. + static LogWriter *Open(); + // Closes and deallocates the log instance. + static void Close(LogWriter *LogWriter); + +private: +#if SANITIZER_FUCHSIA + zx_handle_t Vmo = ZX_HANDLE_INVALID; + uint64_t Offset = 0; +#else + int Fd = -1; +#endif +}; constexpr size_t gcd(size_t a, size_t b) { return (b == 0) ? 
a : gcd(b, a % b); diff --git a/contrib/compiler-rt/lib/xray/xray_x86_64.cc b/contrib/compiler-rt/lib/xray/xray_x86_64.cc index 51dc4ce43b1c..e63ee1b3bd02 100644 --- a/contrib/compiler-rt/lib/xray/xray_x86_64.cc +++ b/contrib/compiler-rt/lib/xray/xray_x86_64.cc @@ -1,15 +1,20 @@ #include "cpuid.h" #include "sanitizer_common/sanitizer_common.h" +#if !SANITIZER_FUCHSIA +#include "sanitizer_common/sanitizer_posix.h" +#endif #include "xray_defs.h" #include "xray_interface_internal.h" -#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD +#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC #include <sys/types.h> #if SANITIZER_OPENBSD #include <sys/time.h> #include <machine/cpu.h> #endif #include <sys/sysctl.h> +#elif SANITIZER_FUCHSIA +#include <zircon/syscalls.h> #endif #include <atomic> @@ -81,17 +86,20 @@ uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { } return TSCFrequency == -1 ? 0 : static_cast<uint64_t>(TSCFrequency); } -#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD +#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { long long TSCFrequency = -1; size_t tscfreqsz = sizeof(TSCFrequency); #if SANITIZER_OPENBSD int Mib[2] = { CTL_MACHDEP, CPU_TSCFREQ }; - if (sysctl(Mib, 2, &TSCFrequency, &tscfreqsz, NULL, 0) != -1) { + if (internal_sysctl(Mib, 2, &TSCFrequency, &tscfreqsz, NULL, 0) != -1) { +#elif SANITIZER_MAC + if (internal_sysctlbyname("machdep.tsc.frequency", &TSCFrequency, + &tscfreqsz, NULL, 0) != -1) { #else - if (sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz, - NULL, 0) != -1) { + if (internal_sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz, + NULL, 0) != -1) { #endif return static_cast<uint64_t>(TSCFrequency); } else { @@ -100,7 +108,7 @@ uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { return 0; } -#else +#elif !SANITIZER_FUCHSIA uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { /* Not supported */ return 0; @@ -317,6 +325,7 @@ bool patchTypedEvent(const bool Enable, const uint32_t FuncId, return false; } +#if !SANITIZER_FUCHSIA // We determine whether the CPU we're running on has the correct features we // need. In x86_64 this will be rdtscp support. bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { @@ -339,5 +348,6 @@ bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { } return true; } +#endif } // namespace __xray |
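Both Iterator::operator*() and Array::operator[] in the segmented-array changes above translate a logical index into a number of segment hops plus an aligned byte offset from Segment::Data. A small standalone sketch of that arithmetic with hypothetical constants (the real values are derived from the Array's element type, so the numbers below are illustrative only):

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical stand-ins for the Array template's constants.
  const uint64_t ElementsPerSegment = 8;
  const uint64_t AlignedElementStorageSize = 24; // sizeof(T) rounded up for alignment

  const uint64_t Offset = 19;                               // logical element index
  const uint64_t SegmentHops = Offset / ElementsPerSegment; // segments to walk from Head
  const uint64_t RelOff = Offset % ElementsPerSegment;      // slot within that segment
  const uint64_t ByteOffset = RelOff * AlignedElementStorageSize; // offset from Data

  std::printf("element %llu: %llu segments from Head, Data + %llu bytes (slot %llu)\n",
              (unsigned long long)Offset, (unsigned long long)SegmentHops,
              (unsigned long long)ByteOffset, (unsigned long long)RelOff);
  return 0;
}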
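The list-splicing steps spelled out in the NewSegment() and trim() comments above (moving a segment between the main list and the freelist while preserving the sentinel invariant pred(S) == succ(S) == S) are easier to follow with a concrete, self-contained sketch. The Node type and function names below are illustrative only, and the sketch folds the patch's empty- and non-empty-freelist branches into a single guard; it is not the Segment/Array code itself.

#include <cassert>

struct Node {
  Node *Prev;
  Node *Next;
};

// The sentinel loops back onto itself, mirroring pred(S) == succ(S) == S.
static Node Sentinel{&Sentinel, &Sentinel};

// trim()-style splice of the tail onto the freelist:
//   succ(sPT) = S; pred(sT) = S; succ(sT) = Freelist; Freelist = sT; Tail = sPT
void spliceTailToFreelist(Node *&Tail, Node *&Freelist) {
  Node *SPT = Tail->Prev; // the "pre-tail" segment
  Node *ST = Tail;        // the segment being returned to the freelist
  SPT->Next = &Sentinel;
  ST->Prev = &Sentinel;
  ST->Next = Freelist;
  if (Freelist != &Sentinel)
    Freelist->Prev = ST;
  Freelist = ST;
  Tail = SPT;
}

// NewSegment()-style pop from the freelist:
//   SFS = Freelist; Freelist = succ(Freelist); if (Freelist != S) pred(Freelist) = S
Node *popFromFreelist(Node *&Freelist) {
  assert(Freelist != &Sentinel && "caller already checked for an empty freelist");
  Node *Free = Freelist;
  Freelist = Freelist->Next;
  if (Freelist != &Sentinel)
    Freelist->Prev = &Sentinel;
  Free->Next = &Sentinel;
  Free->Prev = &Sentinel;
  return Free;
}

int main() {
  // Main list: S <- A <-> B -> S, i.e. Head == A and Tail == B.
  Node A{&Sentinel, &Sentinel}, B{&Sentinel, &Sentinel};
  A.Next = &B;
  B.Prev = &A;
  Node *Tail = &B, *Freelist = &Sentinel;

  spliceTailToFreelist(Tail, Freelist); // B moves onto the freelist
  assert(Tail == &A && Freelist == &B && A.Next == &Sentinel);

  Node *Reused = popFromFreelist(Freelist); // and can be handed back out later
  assert(Reused == &B && Freelist == &Sentinel);
  return 0;
}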
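The segment-count formula in the trim() comment above, f(x) = x ? x / N + (x < N || x % N ? 1 : 0) : 0 with N = ElementsPerSegment, is equivalent to ceiling division ceil(x / N) for x > 0, and 0 for x == 0. A minimal standalone check of that identity (the names and the loop bound are illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

// N stands in for ElementsPerSegment; any positive value works.
constexpr uint64_t N = 8;

// f(x) exactly as written in the trim() comment.
constexpr uint64_t SegmentsFor(uint64_t X) {
  return X ? (X / N) + (X < N || X % N ? 1 : 0) : 0;
}

// The claimed simplification: ceiling division, with 0 segments for 0 elements.
constexpr uint64_t CeilDiv(uint64_t X) { return X ? (X + N - 1) / N : 0; }

int main() {
  for (uint64_t X = 0; X <= 4 * N + 3; ++X)
    assert(SegmentsFor(X) == CeilDiv(X));
  // SegmentsToTrim is then F(OldSize) - F(Size): trimming from 17 elements
  // down to 8 with N == 8 returns ceil(17/8) - ceil(8/8) == 3 - 1 == 2
  // segments to the freelist.
  assert(SegmentsFor(17) - SegmentsFor(8) == 2);
  return 0;
}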
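The xray_utils changes above replace the old free functions (getLogFD, retryingWriteAll, retryingReadSome, readValueFromFile) with the LogWriter interface, so log producers no longer care whether the backend is a POSIX temporary file or a Fuchsia VMO. A hedged usage sketch against the interface as declared in xray_utils.h; the payload and the wrapper function are made up for illustration:

#include "xray_utils.h"

namespace __xray {

// Illustrative only: push one small buffer through whichever backend
// LogWriter::Open() selected for this platform.
bool writeExampleRecord() {
  LogWriter *LW = LogWriter::Open();
  if (LW == nullptr) // e.g. mkstemp failed, or the VMO could not be created
    return false;

  static const char Payload[] = "example payload";
  LW->WriteAll(Payload, Payload + sizeof(Payload) - 1);
  LW->Flush();          // fsync(2) on POSIX; a no-op on Fuchsia, where WriteAll
                        // already writes straight into the VMO
  LogWriter::Close(LW); // runs the destructor and frees the instance
  return true;
}

} // namespace __xray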
