Diffstat (limited to 'contrib/compiler-rt/lib/xray')
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_allocator.h           210
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_basic_logging.cc      145
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_buffer_queue.cc       281
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_buffer_queue.h         81
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_defs.h                 10
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_fdr_controller.h      373
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_fdr_log_records.h       5
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_fdr_log_writer.h      232
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_fdr_logging.cc       1096
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_function_call_trie.h  387
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_init.cc                18
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_interface.cc           41
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_profile_collector.cc  361
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_profile_collector.h    26
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_profiling.cc          387
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_profiling_flags.inc     5
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_segmented_array.h     584
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S     4
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_tsc.h                  23
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_utils.cc              172
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_utils.h                47
-rw-r--r--  contrib/compiler-rt/lib/xray/xray_x86_64.cc              22
22 files changed, 2881 insertions, 1629 deletions
diff --git a/contrib/compiler-rt/lib/xray/xray_allocator.h b/contrib/compiler-rt/lib/xray/xray_allocator.h
index 8244815284a8..907c54542a56 100644
--- a/contrib/compiler-rt/lib/xray/xray_allocator.h
+++ b/contrib/compiler-rt/lib/xray/xray_allocator.h
@@ -19,18 +19,131 @@
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_internal_defs.h"
#include "sanitizer_common/sanitizer_mutex.h"
+#if SANITIZER_FUCHSIA
+#include <zircon/process.h>
+#include <zircon/status.h>
+#include <zircon/syscalls.h>
+#else
#include "sanitizer_common/sanitizer_posix.h"
+#endif
+#include "xray_defs.h"
#include "xray_utils.h"
-#include <sys/mman.h>
#include <cstddef>
#include <cstdint>
+#include <sys/mman.h>
+
+namespace __xray {
+
+// We implement our own memory allocation routine which will bypass the
+// internal allocator. This allows us to manage the memory directly, using
+// mmap'ed memory to back the allocators.
+template <class T> T *allocate() XRAY_NEVER_INSTRUMENT {
+ uptr RoundedSize = RoundUpTo(sizeof(T), GetPageSizeCached());
+#if SANITIZER_FUCHSIA
+ zx_handle_t Vmo;
+ zx_status_t Status = _zx_vmo_create(RoundedSize, 0, &Vmo);
+ if (Status != ZX_OK) {
+ if (Verbosity())
+ Report("XRay Profiling: Failed to create VMO of size %zu: %s\n",
+ sizeof(T), _zx_status_get_string(Status));
+ return nullptr;
+ }
+ uintptr_t B;
+ Status =
+ _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0,
+ Vmo, 0, sizeof(T), &B);
+ _zx_handle_close(Vmo);
+ if (Status != ZX_OK) {
+ if (Verbosity())
+ Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", sizeof(T),
+ _zx_status_get_string(Status));
+ return nullptr;
+ }
+ return reinterpret_cast<T *>(B);
+#else
+ uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ int ErrNo = 0;
+ if (UNLIKELY(internal_iserror(B, &ErrNo))) {
+ if (Verbosity())
+ Report(
+ "XRay Profiling: Failed to allocate memory of size %d; Error = %d.\n",
+ RoundedSize, B);
+ return nullptr;
+ }
+#endif
+ return reinterpret_cast<T *>(B);
+}
-#ifndef MAP_NORESERVE
-// no-op on NetBSD (at least), unsupported flag on FreeBSD basically because unneeded
-#define MAP_NORESERVE 0
+template <class T> void deallocate(T *B) XRAY_NEVER_INSTRUMENT {
+ if (B == nullptr)
+ return;
+ uptr RoundedSize = RoundUpTo(sizeof(T), GetPageSizeCached());
+#if SANITIZER_FUCHSIA
+ _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B),
+ RoundedSize);
+#else
+ internal_munmap(B, RoundedSize);
#endif
+}
-namespace __xray {
+template <class T = unsigned char>
+T *allocateBuffer(size_t S) XRAY_NEVER_INSTRUMENT {
+ uptr RoundedSize = RoundUpTo(S * sizeof(T), GetPageSizeCached());
+#if SANITIZER_FUCHSIA
+ zx_handle_t Vmo;
+ zx_status_t Status = _zx_vmo_create(RoundedSize, 0, &Vmo);
+ if (Status != ZX_OK) {
+ if (Verbosity())
+ Report("XRay Profiling: Failed to create VMO of size %zu: %s\n", S,
+ _zx_status_get_string(Status));
+ return nullptr;
+ }
+ uintptr_t B;
+ Status = _zx_vmar_map(_zx_vmar_root_self(),
+ ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0, Vmo, 0, S, &B);
+ _zx_handle_close(Vmo);
+ if (Status != ZX_OK) {
+ if (Verbosity())
+ Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", S,
+ _zx_status_get_string(Status));
+ return nullptr;
+ }
+#else
+ uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
+ int ErrNo = 0;
+ if (UNLIKELY(internal_iserror(B, &ErrNo))) {
+ if (Verbosity())
+ Report(
+ "XRay Profiling: Failed to allocate memory of size %d; Error = %d.\n",
+ RoundedSize, B);
+ return nullptr;
+ }
+#endif
+ return reinterpret_cast<T *>(B);
+}
+
+template <class T> void deallocateBuffer(T *B, size_t S) XRAY_NEVER_INSTRUMENT {
+ if (B == nullptr)
+ return;
+ uptr RoundedSize = RoundUpTo(S * sizeof(T), GetPageSizeCached());
+#if SANITIZER_FUCHSIA
+ _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B),
+ RoundedSize);
+#else
+ internal_munmap(B, RoundedSize);
+#endif
+}
+
+template <class T, class... U>
+T *initArray(size_t N, U &&... Us) XRAY_NEVER_INSTRUMENT {
+ auto A = allocateBuffer<T>(N);
+ if (A != nullptr)
+ while (N > 0)
+ new (A + (--N)) T(std::forward<U>(Us)...);
+ return A;
+}
/// The Allocator type hands out fixed-sized chunks of memory that are
/// cache-line aligned and sized. This is useful for placement of
@@ -58,20 +171,18 @@ template <size_t N> struct Allocator {
};
private:
- const size_t MaxMemory{0};
- void *BackingStore = nullptr;
- void *AlignedNextBlock = nullptr;
+ size_t MaxMemory{0};
+ unsigned char *BackingStore = nullptr;
+ unsigned char *AlignedNextBlock = nullptr;
size_t AllocatedBlocks = 0;
+ bool Owned;
SpinMutex Mutex{};
- void *Alloc() {
+ void *Alloc() XRAY_NEVER_INSTRUMENT {
SpinMutexLock Lock(&Mutex);
if (UNLIKELY(BackingStore == nullptr)) {
- BackingStore = reinterpret_cast<void *>(
- internal_mmap(NULL, MaxMemory, PROT_READ | PROT_WRITE,
- MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, 0, 0));
- if (BackingStore == MAP_FAILED) {
- BackingStore = nullptr;
+ BackingStore = allocateBuffer(MaxMemory);
+ if (BackingStore == nullptr) {
if (Verbosity())
Report("XRay Profiling: Failed to allocate memory for allocator.\n");
return nullptr;
@@ -84,7 +195,7 @@ private:
auto AlignedNextBlockNum = nearest_boundary(
reinterpret_cast<uintptr_t>(AlignedNextBlock), kCacheLineSize);
if (diff(AlignedNextBlockNum, BackingStoreNum) > ptrdiff_t(MaxMemory)) {
- munmap(BackingStore, MaxMemory);
+ deallocateBuffer(BackingStore, MaxMemory);
AlignedNextBlock = BackingStore = nullptr;
if (Verbosity())
Report("XRay Profiling: Cannot obtain enough memory from "
@@ -92,34 +203,83 @@ private:
return nullptr;
}
- AlignedNextBlock = reinterpret_cast<void *>(AlignedNextBlockNum);
+ AlignedNextBlock = reinterpret_cast<unsigned char *>(AlignedNextBlockNum);
// Assert that AlignedNextBlock is cache-line aligned.
DCHECK_EQ(reinterpret_cast<uintptr_t>(AlignedNextBlock) % kCacheLineSize,
0);
}
- if ((AllocatedBlocks * Block::Size) >= MaxMemory)
+ if (((AllocatedBlocks + 1) * Block::Size) > MaxMemory)
return nullptr;
// Align the pointer we'd like to return to an appropriate alignment, then
// advance the pointer from where to start allocations.
void *Result = AlignedNextBlock;
- AlignedNextBlock = reinterpret_cast<void *>(
- reinterpret_cast<char *>(AlignedNextBlock) + N);
+ AlignedNextBlock =
+ reinterpret_cast<unsigned char *>(AlignedNextBlock) + Block::Size;
++AllocatedBlocks;
return Result;
}
public:
- explicit Allocator(size_t M)
- : MaxMemory(nearest_boundary(M, kCacheLineSize)) {}
+ explicit Allocator(size_t M) XRAY_NEVER_INSTRUMENT
+ : MaxMemory(RoundUpTo(M, kCacheLineSize)),
+ BackingStore(nullptr),
+ AlignedNextBlock(nullptr),
+ AllocatedBlocks(0),
+ Owned(true),
+ Mutex() {}
+
+ explicit Allocator(void *P, size_t M) XRAY_NEVER_INSTRUMENT
+ : MaxMemory(M),
+ BackingStore(reinterpret_cast<unsigned char *>(P)),
+ AlignedNextBlock(reinterpret_cast<unsigned char *>(P)),
+ AllocatedBlocks(0),
+ Owned(false),
+ Mutex() {}
+
+ Allocator(const Allocator &) = delete;
+ Allocator &operator=(const Allocator &) = delete;
+
+ Allocator(Allocator &&O) XRAY_NEVER_INSTRUMENT {
+ SpinMutexLock L0(&Mutex);
+ SpinMutexLock L1(&O.Mutex);
+ MaxMemory = O.MaxMemory;
+ O.MaxMemory = 0;
+ BackingStore = O.BackingStore;
+ O.BackingStore = nullptr;
+ AlignedNextBlock = O.AlignedNextBlock;
+ O.AlignedNextBlock = nullptr;
+ AllocatedBlocks = O.AllocatedBlocks;
+ O.AllocatedBlocks = 0;
+ Owned = O.Owned;
+ O.Owned = false;
+ }
+
+ Allocator &operator=(Allocator &&O) XRAY_NEVER_INSTRUMENT {
+ SpinMutexLock L0(&Mutex);
+ SpinMutexLock L1(&O.Mutex);
+ MaxMemory = O.MaxMemory;
+ O.MaxMemory = 0;
+ if (BackingStore != nullptr)
+ deallocateBuffer(BackingStore, MaxMemory);
+ BackingStore = O.BackingStore;
+ O.BackingStore = nullptr;
+ AlignedNextBlock = O.AlignedNextBlock;
+ O.AlignedNextBlock = nullptr;
+ AllocatedBlocks = O.AllocatedBlocks;
+ O.AllocatedBlocks = 0;
+ Owned = O.Owned;
+ O.Owned = false;
+ return *this;
+ }
- Block Allocate() { return {Alloc()}; }
+ Block Allocate() XRAY_NEVER_INSTRUMENT { return {Alloc()}; }
- ~Allocator() NOEXCEPT {
- if (BackingStore != nullptr) {
- internal_munmap(BackingStore, MaxMemory);
+ ~Allocator() NOEXCEPT XRAY_NEVER_INSTRUMENT {
+ if (Owned && BackingStore != nullptr) {
+ deallocateBuffer(BackingStore, MaxMemory);
}
}
};
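
The hunks above replace direct internal_mmap/munmap calls with a small family of page-granular helpers (allocate, deallocate, allocateBuffer, deallocateBuffer, initArray) and teach Allocator to work over either an owned or a caller-provided backing store. A minimal usage sketch, not part of the patch; Counter is a hypothetical payload type, and the sketch is meant to show the intended call pattern inside compiler-rt rather than compile standalone:

    #include <cstdint>
    #include "xray_allocator.h"

    struct Counter { uint64_t Value = 0; };  // hypothetical element type

    void allocatorSketch() {
      // Single object backed by its own page-rounded mapping, bypassing the
      // internal sanitizer allocator.
      auto *C = __xray::allocate<Counter>();
      if (C != nullptr)
        __xray::deallocate(C);

      // N default-constructed elements in one mapping; extra arguments to
      // initArray are forwarded to each element's constructor.
      auto *Cs = __xray::initArray<Counter>(64);
      if (Cs != nullptr)
        __xray::deallocateBuffer(Cs, 64);

      // The fixed-chunk Allocator now takes either an owned size (first
      // constructor) or an unowned, caller-provided backing store (second).
      __xray::Allocator<64> A(1 << 16);
      auto Blk = A.Allocate();  // cache-line-aligned chunk, or empty on failure
      (void)Blk;
    }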
diff --git a/contrib/compiler-rt/lib/xray/xray_basic_logging.cc b/contrib/compiler-rt/lib/xray/xray_basic_logging.cc
index 585ca641cd0c..ae1cc0ba79dd 100644
--- a/contrib/compiler-rt/lib/xray/xray_basic_logging.cc
+++ b/contrib/compiler-rt/lib/xray/xray_basic_logging.cc
@@ -19,7 +19,9 @@
#include <fcntl.h>
#include <pthread.h>
#include <sys/stat.h>
+#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC
#include <sys/syscall.h>
+#endif
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
@@ -38,8 +40,9 @@
namespace __xray {
-SpinMutex LogMutex;
+static SpinMutex LogMutex;
+namespace {
// We use elements of this type to record the entry TSC of every function ID we
// see as we're tracing a particular thread's execution.
struct alignas(16) StackEntry {
@@ -52,21 +55,28 @@ struct alignas(16) StackEntry {
static_assert(sizeof(StackEntry) == 16, "Wrong size for StackEntry");
-struct alignas(64) ThreadLocalData {
+struct XRAY_TLS_ALIGNAS(64) ThreadLocalData {
void *InMemoryBuffer = nullptr;
size_t BufferSize = 0;
size_t BufferOffset = 0;
void *ShadowStack = nullptr;
size_t StackSize = 0;
size_t StackEntries = 0;
- int Fd = -1;
+ __xray::LogWriter *LogWriter = nullptr;
};
+struct BasicLoggingOptions {
+ int DurationFilterMicros = 0;
+ size_t MaxStackDepth = 0;
+ size_t ThreadBufferSize = 0;
+};
+} // namespace
+
static pthread_key_t PThreadKey;
static atomic_uint8_t BasicInitialized{0};
-BasicLoggingOptions GlobalOptions;
+struct BasicLoggingOptions GlobalOptions;
thread_local atomic_uint8_t Guard{0};
@@ -75,10 +85,10 @@ static atomic_uint64_t ThresholdTicks{0};
static atomic_uint64_t TicksPerSec{0};
static atomic_uint64_t CycleFrequency{NanosecondsPerSecond};
-static int openLogFile() XRAY_NEVER_INSTRUMENT {
- int F = getLogFD();
- if (F == -1)
- return -1;
+static LogWriter *getLog() XRAY_NEVER_INSTRUMENT {
+ LogWriter* LW = LogWriter::Open();
+ if (LW == nullptr)
+ return LW;
static pthread_once_t DetectOnce = PTHREAD_ONCE_INIT;
pthread_once(&DetectOnce, +[] {
@@ -100,16 +110,16 @@ static int openLogFile() XRAY_NEVER_INSTRUMENT {
// before setting the values in the header.
Header.ConstantTSC = 1;
Header.NonstopTSC = 1;
- retryingWriteAll(F, reinterpret_cast<char *>(&Header),
- reinterpret_cast<char *>(&Header) + sizeof(Header));
- return F;
+ LW->WriteAll(reinterpret_cast<char *>(&Header),
+ reinterpret_cast<char *>(&Header) + sizeof(Header));
+ return LW;
}
-static int getGlobalFd() XRAY_NEVER_INSTRUMENT {
+static LogWriter *getGlobalLog() XRAY_NEVER_INSTRUMENT {
static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
- static int Fd = 0;
- pthread_once(&OnceInit, +[] { Fd = openLogFile(); });
- return Fd;
+ static LogWriter *LW = nullptr;
+ pthread_once(&OnceInit, +[] { LW = getLog(); });
+ return LW;
}
static ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
@@ -121,7 +131,7 @@ static ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
return false;
}
pthread_setspecific(PThreadKey, &TLD);
- TLD.Fd = getGlobalFd();
+ TLD.LogWriter = getGlobalLog();
TLD.InMemoryBuffer = reinterpret_cast<XRayRecord *>(
InternalAlloc(sizeof(XRayRecord) * GlobalOptions.ThreadBufferSize,
nullptr, alignof(XRayRecord)));
@@ -149,8 +159,8 @@ template <class RDTSC>
void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT {
auto &TLD = getThreadLocalData();
- int Fd = getGlobalFd();
- if (Fd == -1)
+ LogWriter *LW = getGlobalLog();
+ if (LW == nullptr)
return;
// Use a simple recursion guard, to handle cases where we're already logging
@@ -234,9 +244,9 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
auto FirstEntry = reinterpret_cast<XRayRecord *>(TLD.InMemoryBuffer);
internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R));
if (++TLD.BufferOffset == TLD.BufferSize) {
- SpinMutexLock L(&LogMutex);
- retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry),
- reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
+ SpinMutexLock Lock(&LogMutex);
+ LW->WriteAll(reinterpret_cast<char *>(FirstEntry),
+ reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
TLD.BufferOffset = 0;
TLD.StackEntries = 0;
}
@@ -249,17 +259,17 @@ void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1,
auto FirstEntry =
reinterpret_cast<XRayArgPayload *>(TLD.InMemoryBuffer);
const auto &BuffLen = TLD.BufferSize;
- int Fd = getGlobalFd();
- if (Fd == -1)
+ LogWriter *LW = getGlobalLog();
+ if (LW == nullptr)
return;
// First we check whether there's enough space to write the data consecutively
// in the thread-local buffer. If not, we first flush the buffer before
// attempting to write the two records that must be consecutive.
if (TLD.BufferOffset + 2 > BuffLen) {
- SpinMutexLock L(&LogMutex);
- retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry),
- reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
+ SpinMutexLock Lock(&LogMutex);
+ LW->WriteAll(reinterpret_cast<char *>(FirstEntry),
+ reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
TLD.BufferOffset = 0;
TLD.StackEntries = 0;
}
@@ -280,9 +290,9 @@ void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1,
R.Arg = Arg1;
internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R));
if (++TLD.BufferOffset == BuffLen) {
- SpinMutexLock L(&LogMutex);
- retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry),
- reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
+ SpinMutexLock Lock(&LogMutex);
+ LW->WriteAll(reinterpret_cast<char *>(FirstEntry),
+ reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
TLD.BufferOffset = 0;
TLD.StackEntries = 0;
}
@@ -339,29 +349,29 @@ static void TLDDestructor(void *P) XRAY_NEVER_INSTRUMENT {
Report("Cleaned up log for TID: %d\n", GetTid());
});
- if (TLD.Fd == -1 || TLD.BufferOffset == 0) {
+ if (TLD.LogWriter == nullptr || TLD.BufferOffset == 0) {
if (Verbosity())
- Report("Skipping buffer for TID: %d; Fd = %d; Offset = %llu\n", GetTid(),
- TLD.Fd, TLD.BufferOffset);
+ Report("Skipping buffer for TID: %d; Offset = %llu\n", GetTid(),
+ TLD.BufferOffset);
return;
}
{
SpinMutexLock L(&LogMutex);
- retryingWriteAll(TLD.Fd, reinterpret_cast<char *>(TLD.InMemoryBuffer),
- reinterpret_cast<char *>(TLD.InMemoryBuffer) +
- (sizeof(XRayRecord) * TLD.BufferOffset));
+ TLD.LogWriter->WriteAll(reinterpret_cast<char *>(TLD.InMemoryBuffer),
+ reinterpret_cast<char *>(TLD.InMemoryBuffer) +
+ (sizeof(XRayRecord) * TLD.BufferOffset));
}
// Because this thread's exit could be the last one trying to write to
// the file and that we're not able to close out the file properly, we
// sync instead and hope that the pending writes are flushed as the
// thread exits.
- fsync(TLD.Fd);
+ TLD.LogWriter->Flush();
}
-XRayLogInitStatus basicLoggingInit(size_t BufferSize, size_t BufferMax,
- void *Options,
+XRayLogInitStatus basicLoggingInit(UNUSED size_t BufferSize,
+ UNUSED size_t BufferMax, void *Options,
size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
uint8_t Expected = 0;
if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 1,
@@ -385,43 +395,32 @@ XRayLogInitStatus basicLoggingInit(size_t BufferSize, size_t BufferMax,
"using emulation instead.\n");
});
- if (BufferSize == 0 && BufferMax == 0 && Options != nullptr) {
- FlagParser P;
- BasicFlags F;
- F.setDefaults();
- registerXRayBasicFlags(&P, &F);
- P.ParseString(useCompilerDefinedBasicFlags());
- auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS");
- if (EnvOpts == nullptr)
- EnvOpts = "";
-
- P.ParseString(EnvOpts);
-
- // If XRAY_BASIC_OPTIONS was not defined, then we use the deprecated options
- // set through XRAY_OPTIONS instead.
- if (internal_strlen(EnvOpts) == 0) {
- F.func_duration_threshold_us =
- flags()->xray_naive_log_func_duration_threshold_us;
- F.max_stack_depth = flags()->xray_naive_log_max_stack_depth;
- F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size;
- }
-
- P.ParseString(static_cast<const char *>(Options));
- GlobalOptions.ThreadBufferSize = F.thread_buffer_size;
- GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us;
- GlobalOptions.MaxStackDepth = F.max_stack_depth;
- *basicFlags() = F;
- } else if (OptionsSize != sizeof(BasicLoggingOptions)) {
- Report("Invalid options size, potential ABI mismatch; expected %d got %d",
- sizeof(BasicLoggingOptions), OptionsSize);
- return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
- } else {
- if (Verbosity())
- Report("XRay Basic: struct-based init is deprecated, please use "
- "string-based configuration instead.\n");
- GlobalOptions = *reinterpret_cast<BasicLoggingOptions *>(Options);
+ FlagParser P;
+ BasicFlags F;
+ F.setDefaults();
+ registerXRayBasicFlags(&P, &F);
+ P.ParseString(useCompilerDefinedBasicFlags());
+ auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS");
+ if (EnvOpts == nullptr)
+ EnvOpts = "";
+
+ P.ParseString(EnvOpts);
+
+ // If XRAY_BASIC_OPTIONS was not defined, then we use the deprecated options
+ // set through XRAY_OPTIONS instead.
+ if (internal_strlen(EnvOpts) == 0) {
+ F.func_duration_threshold_us =
+ flags()->xray_naive_log_func_duration_threshold_us;
+ F.max_stack_depth = flags()->xray_naive_log_max_stack_depth;
+ F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size;
}
+ P.ParseString(static_cast<const char *>(Options));
+ GlobalOptions.ThreadBufferSize = F.thread_buffer_size;
+ GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us;
+ GlobalOptions.MaxStackDepth = F.max_stack_depth;
+ *basicFlags() = F;
+
atomic_store(&ThresholdTicks,
atomic_load(&TicksPerSec, memory_order_acquire) *
GlobalOptions.DurationFilterMicros / 1000000,
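
The basic-logging hunks above swap raw file descriptors (getLogFD, retryingWriteAll, fsync) for a LogWriter abstraction. A sketch of the resulting write path, assuming LogWriter is the Open/WriteAll/Flush interface visible in these hunks; its declaration appears to live in xray_utils.h, which the diffstat shows changing as well:

    #include "xray_utils.h"

    // Flush a range of serialized records; mirrors the TLDDestructor path above.
    static void flushRange(char *Begin, char *End) {
      __xray::LogWriter *LW = __xray::LogWriter::Open();  // was: getLogFD()
      if (LW == nullptr)
        return;                    // no log sink available; records are dropped
      LW->WriteAll(Begin, End);    // was: retryingWriteAll(Fd, Begin, End)
      LW->Flush();                 // was: fsync(Fd) at thread exit
    }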
diff --git a/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc b/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc
index 3ce728900787..7d0e5a1f323c 100644
--- a/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc
+++ b/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc
@@ -13,141 +13,206 @@
//
//===----------------------------------------------------------------------===//
#include "xray_buffer_queue.h"
+#include "sanitizer_common/sanitizer_atomic.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_libc.h"
+#if !SANITIZER_FUCHSIA
#include "sanitizer_common/sanitizer_posix.h"
+#endif
+#include "xray_allocator.h"
+#include "xray_defs.h"
#include <memory>
#include <sys/mman.h>
-#ifndef MAP_NORESERVE
-// no-op on NetBSD (at least), unsupported flag on FreeBSD
-#define MAP_NORESERVE 0
-#endif
-
using namespace __xray;
-using namespace __sanitizer;
-
-template <class T> static T *allocRaw(size_t N) {
- // TODO: Report errors?
- // We use MAP_NORESERVE on platforms where it's supported to ensure that the
- // pages we're allocating for XRay never end up in pages that can be swapped
- // in/out. We're doing this because for FDR mode, we want to ensure that
- // writes to the buffers stay resident in memory to prevent XRay itself from
- // causing swapping/thrashing.
- //
- // In the case when XRay pages cannot be swapped in/out or there's not enough
- // RAM to back these pages, we're willing to cause a segmentation fault
- // instead of introducing latency in the measurement. We assume here that
- // there are enough pages that are swappable in/out outside of the buffers
- // being used by FDR mode (which are bounded and configurable anyway) to allow
- // us to keep using always-resident memory.
- //
- // TODO: Make this configurable?
- void *A = reinterpret_cast<void *>(
- internal_mmap(NULL, N * sizeof(T), PROT_WRITE | PROT_READ,
- MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, -1, 0));
- return (A == MAP_FAILED) ? nullptr : reinterpret_cast<T *>(A);
-}
-template <class T> static void deallocRaw(T *ptr, size_t N) {
- // TODO: Report errors?
- if (ptr != nullptr)
- internal_munmap(ptr, N);
+namespace {
+
+BufferQueue::ControlBlock *allocControlBlock(size_t Size, size_t Count) {
+ auto B =
+ allocateBuffer((sizeof(BufferQueue::ControlBlock) - 1) + (Size * Count));
+ return B == nullptr ? nullptr
+ : reinterpret_cast<BufferQueue::ControlBlock *>(B);
}
-template <class T> static T *initArray(size_t N) {
- auto A = allocRaw<T>(N);
- if (A != nullptr)
- while (N > 0)
- new (A + (--N)) T();
- return A;
+void deallocControlBlock(BufferQueue::ControlBlock *C, size_t Size,
+ size_t Count) {
+ deallocateBuffer(reinterpret_cast<unsigned char *>(C),
+ (sizeof(BufferQueue::ControlBlock) - 1) + (Size * Count));
}
-BufferQueue::BufferQueue(size_t B, size_t N, bool &Success)
- : BufferSize(B), Buffers(initArray<BufferQueue::BufferRep>(N)),
- BufferCount(N), Finalizing{0}, OwnedBuffers(initArray<void *>(N)),
- Next(Buffers), First(Buffers), LiveBuffers(0) {
- if (Buffers == nullptr) {
- Success = false;
+void decRefCount(BufferQueue::ControlBlock *C, size_t Size, size_t Count) {
+ if (C == nullptr)
return;
- }
- if (OwnedBuffers == nullptr) {
- // Clean up the buffers we've already allocated.
- for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
- B->~BufferRep();
- deallocRaw(Buffers, N);
- Success = false;
+ if (atomic_fetch_sub(&C->RefCount, 1, memory_order_acq_rel) == 1)
+ deallocControlBlock(C, Size, Count);
+}
+
+void incRefCount(BufferQueue::ControlBlock *C) {
+ if (C == nullptr)
return;
+ atomic_fetch_add(&C->RefCount, 1, memory_order_acq_rel);
+}
+
+// We use a struct to ensure that we are allocating one atomic_uint64_t per
+// cache line. This allows us to not worry about false-sharing among atomic
+// objects being updated (constantly) by different threads.
+struct ExtentsPadded {
+ union {
+ atomic_uint64_t Extents;
+ unsigned char Storage[kCacheLineSize];
};
+};
- for (size_t i = 0; i < N; ++i) {
- auto &T = Buffers[i];
- void *Tmp = allocRaw<char>(BufferSize);
- if (Tmp == nullptr) {
- Success = false;
+constexpr size_t kExtentsSize = sizeof(ExtentsPadded);
+
+} // namespace
+
+BufferQueue::ErrorCode BufferQueue::init(size_t BS, size_t BC) {
+ SpinMutexLock Guard(&Mutex);
+
+ if (!finalizing())
+ return BufferQueue::ErrorCode::AlreadyInitialized;
+
+ cleanupBuffers();
+
+ bool Success = false;
+ BufferSize = BS;
+ BufferCount = BC;
+
+ BackingStore = allocControlBlock(BufferSize, BufferCount);
+ if (BackingStore == nullptr)
+ return BufferQueue::ErrorCode::NotEnoughMemory;
+
+ auto CleanupBackingStore = at_scope_exit([&, this] {
+ if (Success)
return;
- }
- auto *Extents = allocRaw<BufferExtents>(1);
- if (Extents == nullptr) {
- Success = false;
+ deallocControlBlock(BackingStore, BufferSize, BufferCount);
+ BackingStore = nullptr;
+ });
+
+ // Initialize enough atomic_uint64_t instances, each
+ ExtentsBackingStore = allocControlBlock(kExtentsSize, BufferCount);
+ if (ExtentsBackingStore == nullptr)
+ return BufferQueue::ErrorCode::NotEnoughMemory;
+
+ auto CleanupExtentsBackingStore = at_scope_exit([&, this] {
+ if (Success)
return;
- }
+ deallocControlBlock(ExtentsBackingStore, kExtentsSize, BufferCount);
+ ExtentsBackingStore = nullptr;
+ });
+
+ Buffers = initArray<BufferRep>(BufferCount);
+ if (Buffers == nullptr)
+ return BufferQueue::ErrorCode::NotEnoughMemory;
+
+ // At this point we increment the generation number to associate the buffers
+ // to the new generation.
+ atomic_fetch_add(&Generation, 1, memory_order_acq_rel);
+
+ // First, we initialize the refcount in the ControlBlock, which we treat as
+ // being at the start of the BackingStore pointer.
+ atomic_store(&BackingStore->RefCount, 1, memory_order_release);
+ atomic_store(&ExtentsBackingStore->RefCount, 1, memory_order_release);
+
+ // Then we initialise the individual buffers that sub-divide the whole backing
+ // store. Each buffer will start at the `Data` member of the ControlBlock, and
+ // will be offsets from these locations.
+ for (size_t i = 0; i < BufferCount; ++i) {
+ auto &T = Buffers[i];
auto &Buf = T.Buff;
- Buf.Data = Tmp;
- Buf.Size = B;
- Buf.Extents = Extents;
- OwnedBuffers[i] = Tmp;
+ auto *E = reinterpret_cast<ExtentsPadded *>(&ExtentsBackingStore->Data +
+ (kExtentsSize * i));
+ Buf.Extents = &E->Extents;
+ atomic_store(Buf.Extents, 0, memory_order_release);
+ Buf.Generation = generation();
+ Buf.Data = &BackingStore->Data + (BufferSize * i);
+ Buf.Size = BufferSize;
+ Buf.BackingStore = BackingStore;
+ Buf.ExtentsBackingStore = ExtentsBackingStore;
+ Buf.Count = BufferCount;
+ T.Used = false;
}
+
+ Next = Buffers;
+ First = Buffers;
+ LiveBuffers = 0;
+ atomic_store(&Finalizing, 0, memory_order_release);
Success = true;
+ return BufferQueue::ErrorCode::Ok;
+}
+
+BufferQueue::BufferQueue(size_t B, size_t N,
+ bool &Success) XRAY_NEVER_INSTRUMENT
+ : BufferSize(B),
+ BufferCount(N),
+ Mutex(),
+ Finalizing{1},
+ BackingStore(nullptr),
+ ExtentsBackingStore(nullptr),
+ Buffers(nullptr),
+ Next(Buffers),
+ First(Buffers),
+ LiveBuffers(0),
+ Generation{0} {
+ Success = init(B, N) == BufferQueue::ErrorCode::Ok;
}
BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) {
if (atomic_load(&Finalizing, memory_order_acquire))
return ErrorCode::QueueFinalizing;
- SpinMutexLock Guard(&Mutex);
- if (LiveBuffers == BufferCount)
- return ErrorCode::NotEnoughMemory;
- auto &T = *Next;
- auto &B = T.Buff;
- Buf = B;
- T.Used = true;
- ++LiveBuffers;
-
- if (++Next == (Buffers + BufferCount))
- Next = Buffers;
+ BufferRep *B = nullptr;
+ {
+ SpinMutexLock Guard(&Mutex);
+ if (LiveBuffers == BufferCount)
+ return ErrorCode::NotEnoughMemory;
+ B = Next++;
+ if (Next == (Buffers + BufferCount))
+ Next = Buffers;
+ ++LiveBuffers;
+ }
+ incRefCount(BackingStore);
+ incRefCount(ExtentsBackingStore);
+ Buf = B->Buff;
+ Buf.Generation = generation();
+ B->Used = true;
return ErrorCode::Ok;
}
BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
- // Blitz through the buffers array to find the buffer.
- bool Found = false;
- for (auto I = OwnedBuffers, E = OwnedBuffers + BufferCount; I != E; ++I) {
- if (*I == Buf.Data) {
- Found = true;
- break;
+ // Check whether the buffer being referred to is within the bounds of the
+ // backing store's range.
+ BufferRep *B = nullptr;
+ {
+ SpinMutexLock Guard(&Mutex);
+ if (Buf.Generation != generation() || LiveBuffers == 0) {
+ Buf = {};
+ decRefCount(Buf.BackingStore, Buf.Size, Buf.Count);
+ decRefCount(Buf.ExtentsBackingStore, kExtentsSize, Buf.Count);
+ return BufferQueue::ErrorCode::Ok;
}
- }
- if (!Found)
- return ErrorCode::UnrecognizedBuffer;
- SpinMutexLock Guard(&Mutex);
+ if (Buf.Data < &BackingStore->Data ||
+ Buf.Data > &BackingStore->Data + (BufferCount * BufferSize))
+ return BufferQueue::ErrorCode::UnrecognizedBuffer;
- // This points to a semantic bug, we really ought to not be releasing more
- // buffers than we actually get.
- if (LiveBuffers == 0)
- return ErrorCode::NotEnoughMemory;
+ --LiveBuffers;
+ B = First++;
+ if (First == (Buffers + BufferCount))
+ First = Buffers;
+ }
// Now that the buffer has been released, we mark it as "used".
- First->Buff = Buf;
- First->Used = true;
- Buf.Data = nullptr;
- Buf.Size = 0;
- --LiveBuffers;
- if (++First == (Buffers + BufferCount))
- First = Buffers;
-
+ B->Buff = Buf;
+ B->Used = true;
+ decRefCount(Buf.BackingStore, Buf.Size, Buf.Count);
+ decRefCount(Buf.ExtentsBackingStore, kExtentsSize, Buf.Count);
+ atomic_store(B->Buff.Extents, atomic_load(Buf.Extents, memory_order_acquire),
+ memory_order_release);
+ Buf = {};
return ErrorCode::Ok;
}
@@ -157,15 +222,17 @@ BufferQueue::ErrorCode BufferQueue::finalize() {
return ErrorCode::Ok;
}
-BufferQueue::~BufferQueue() {
- for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) {
- auto &T = *I;
- auto &Buf = T.Buff;
- deallocRaw(Buf.Data, Buf.Size);
- deallocRaw(Buf.Extents, 1);
- }
+void BufferQueue::cleanupBuffers() {
for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
B->~BufferRep();
- deallocRaw(Buffers, BufferCount);
- deallocRaw(OwnedBuffers, BufferCount);
+ deallocateBuffer(Buffers, BufferCount);
+ decRefCount(BackingStore, BufferSize, BufferCount);
+ decRefCount(ExtentsBackingStore, kExtentsSize, BufferCount);
+ BackingStore = nullptr;
+ ExtentsBackingStore = nullptr;
+ Buffers = nullptr;
+ BufferCount = 0;
+ BufferSize = 0;
}
+
+BufferQueue::~BufferQueue() { cleanupBuffers(); }
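
Because the backing store is now reference-counted and buffers carry a generation tag, a finalized queue can be re-initialized in place rather than destroyed and rebuilt. A sketch of that lifecycle, using only the entry points added in this file:

    #include "xray_buffer_queue.h"

    bool restartQueue(__xray::BufferQueue &Q, size_t BufferSize,
                      size_t BufferCount) {
      // init() refuses to run on a queue that is not finalized
      // (ErrorCode::AlreadyInitialized), so finalize first.
      if (Q.finalize() != __xray::BufferQueue::ErrorCode::Ok)
        return false;
      // Bumps the generation, allocates fresh reference-counted control blocks
      // for the buffers and extents, and rebuilds the BufferRep array.
      return Q.init(BufferSize, BufferCount) ==
             __xray::BufferQueue::ErrorCode::Ok;
    }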
diff --git a/contrib/compiler-rt/lib/xray/xray_buffer_queue.h b/contrib/compiler-rt/lib/xray/xray_buffer_queue.h
index e76fa7983c90..ef2b433f9a3f 100644
--- a/contrib/compiler-rt/lib/xray/xray_buffer_queue.h
+++ b/contrib/compiler-rt/lib/xray/xray_buffer_queue.h
@@ -18,25 +18,51 @@
#include "sanitizer_common/sanitizer_atomic.h"
#include "sanitizer_common/sanitizer_common.h"
#include "sanitizer_common/sanitizer_mutex.h"
+#include "xray_defs.h"
#include <cstddef>
+#include <cstdint>
namespace __xray {
/// BufferQueue implements a circular queue of fixed sized buffers (much like a
-/// freelist) but is concerned mostly with making it really quick to initialise,
-/// finalise, and get/return buffers to the queue. This is one key component of
-/// the "flight data recorder" (FDR) mode to support ongoing XRay function call
+/// freelist) but is concerned with making it quick to initialise, finalise, and
+/// get from or return buffers to the queue. This is one key component of the
+/// "flight data recorder" (FDR) mode to support ongoing XRay function call
/// trace collection.
class BufferQueue {
public:
- struct alignas(64) BufferExtents {
- atomic_uint64_t Size;
+ /// ControlBlock represents the memory layout of how we interpret the backing
+ /// store for all buffers and extents managed by a BufferQueue instance. The
+ /// ControlBlock has the reference count as the first member, sized according
+ /// to platform-specific cache-line size. We never use the Buffer member of
+ /// the union, which is only there for compiler-supported alignment and
+ /// sizing.
+ ///
+ /// This ensures that the `Data` member will be placed at least kCacheLineSize
+ /// bytes from the beginning of the structure.
+ struct ControlBlock {
+ union {
+ atomic_uint64_t RefCount;
+ char Buffer[kCacheLineSize];
+ };
+
+ /// We need to make this size 1, to conform to the C++ rules for array data
+ /// members. Typically, we want to subtract this 1 byte for sizing
+ /// information.
+ char Data[1];
};
struct Buffer {
+ atomic_uint64_t *Extents = nullptr;
+ uint64_t Generation{0};
void *Data = nullptr;
size_t Size = 0;
- BufferExtents *Extents;
+
+ private:
+ friend class BufferQueue;
+ ControlBlock *BackingStore = nullptr;
+ ControlBlock *ExtentsBackingStore = nullptr;
+ size_t Count = 0;
};
struct BufferRep {
@@ -76,8 +102,10 @@ private:
T *operator->() const { return &(Buffers[Offset].Buff); }
- Iterator(BufferRep *Root, size_t O, size_t M)
- : Buffers(Root), Offset(O), Max(M) {
+ Iterator(BufferRep *Root, size_t O, size_t M) XRAY_NEVER_INSTRUMENT
+ : Buffers(Root),
+ Offset(O),
+ Max(M) {
// We want to advance to the first Offset where the 'Used' property is
// true, or to the end of the list/queue.
while (!Buffers[Offset].Used && Offset != Max) {
@@ -107,16 +135,20 @@ private:
// Size of each individual Buffer.
size_t BufferSize;
- BufferRep *Buffers;
-
// Amount of pre-allocated buffers.
size_t BufferCount;
SpinMutex Mutex;
atomic_uint8_t Finalizing;
- // Pointers to buffers managed/owned by the BufferQueue.
- void **OwnedBuffers;
+ // The collocated ControlBlock and buffer storage.
+ ControlBlock *BackingStore;
+
+ // The collocated ControlBlock and extents storage.
+ ControlBlock *ExtentsBackingStore;
+
+ // A dynamically allocated array of BufferRep instances.
+ BufferRep *Buffers;
// Pointer to the next buffer to be handed out.
BufferRep *Next;
@@ -128,6 +160,13 @@ private:
// Count of buffers that have been handed out through 'getBuffer'.
size_t LiveBuffers;
+ // We use a generation number to identify buffers and which generation they're
+ // associated with.
+ atomic_uint64_t Generation;
+
+ /// Releases references to the buffers backed by the current buffer queue.
+ void cleanupBuffers();
+
public:
enum class ErrorCode : unsigned {
Ok,
@@ -135,6 +174,7 @@ public:
QueueFinalizing,
UnrecognizedBuffer,
AlreadyFinalized,
+ AlreadyInitialized,
};
static const char *getErrorString(ErrorCode E) {
@@ -149,6 +189,8 @@ public:
return "buffer being returned not owned by buffer queue";
case ErrorCode::AlreadyFinalized:
return "queue already finalized";
+ case ErrorCode::AlreadyInitialized:
+ return "queue already initialized";
}
return "unknown error";
}
@@ -179,10 +221,23 @@ public:
/// the buffer being released.
ErrorCode releaseBuffer(Buffer &Buf);
+ /// Initializes the buffer queue, starting a new generation. We can re-set the
+ /// size of buffers with |BS| along with the buffer count with |BC|.
+ ///
+ /// Returns:
+ /// - ErrorCode::Ok when we successfully initialize the buffer. This
+ /// requires that the buffer queue is previously finalized.
+ /// - ErrorCode::AlreadyInitialized when the buffer queue is not finalized.
+ ErrorCode init(size_t BS, size_t BC);
+
bool finalizing() const {
return atomic_load(&Finalizing, memory_order_acquire);
}
+ uint64_t generation() const {
+ return atomic_load(&Generation, memory_order_acquire);
+ }
+
/// Returns the configured size of the buffers in the buffer queue.
size_t ConfiguredBufferSize() const { return BufferSize; }
@@ -198,7 +253,7 @@ public:
/// Applies the provided function F to each Buffer in the queue, only if the
/// Buffer is marked 'used' (i.e. has been the result of getBuffer(...) and a
/// releaseBuffer(...) operation).
- template <class F> void apply(F Fn) {
+ template <class F> void apply(F Fn) XRAY_NEVER_INSTRUMENT {
SpinMutexLock G(&Mutex);
for (auto I = begin(), E = end(); I != E; ++I)
Fn(*I);
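
Taken together, the interface changes above mean each Buffer carries its generation and, privately, references to the control blocks it came from. A hedged round-trip sketch against this interface; record writing and extents updates are elided:

    #include "xray_buffer_queue.h"

    bool useOneBuffer(size_t Size, size_t Count) {
      bool Success = false;
      __xray::BufferQueue Q(Size, Count, Success);
      if (!Success)
        return false;

      __xray::BufferQueue::Buffer B;
      if (Q.getBuffer(B) != __xray::BufferQueue::ErrorCode::Ok)
        return false;

      // ... write records into B.Data and advance *B.Extents accordingly ...

      // releaseBuffer() checks the generation stamped by getBuffer(); buffers
      // from an older generation are quietly retired instead of re-enqueued.
      return Q.releaseBuffer(B) == __xray::BufferQueue::ErrorCode::Ok;
    }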
diff --git a/contrib/compiler-rt/lib/xray/xray_defs.h b/contrib/compiler-rt/lib/xray/xray_defs.h
index e5c37c0665db..c009bcc879f1 100644
--- a/contrib/compiler-rt/lib/xray/xray_defs.h
+++ b/contrib/compiler-rt/lib/xray/xray_defs.h
@@ -19,4 +19,14 @@
#define XRAY_NEVER_INSTRUMENT
#endif
+#if SANITIZER_NETBSD
+// NetBSD: thread_local is not aligned properly, and the code relying
+// on it segfaults
+#define XRAY_TLS_ALIGNAS(x)
+#define XRAY_HAS_TLS_ALIGNAS 0
+#else
+#define XRAY_TLS_ALIGNAS(x) alignas(x)
+#define XRAY_HAS_TLS_ALIGNAS 1
+#endif
+
#endif // XRAY_XRAY_DEFS_H
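
The macro pair above lets thread-local structures request cache-line alignment everywhere except NetBSD, where aligned TLS is known to misbehave. The intended pattern, sketched with a hypothetical ExampleTLD standing in for the real ThreadLocalData structs later in this patch:

    #include <cstddef>
    #include "xray_defs.h"

    struct XRAY_TLS_ALIGNAS(64) ExampleTLD {  // hypothetical name
      void *Buffer = nullptr;
      size_t Offset = 0;
    };

    #if XRAY_HAS_TLS_ALIGNAS
    // Only assert the alignment where the attribute was actually applied; on
    // NetBSD both the attribute and the assert compile away.
    static_assert(alignof(ExampleTLD) >= 64,
                  "thread-local data should be cache-line aligned");
    #endif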
diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_controller.h b/contrib/compiler-rt/lib/xray/xray_fdr_controller.h
new file mode 100644
index 000000000000..d44d0309b373
--- /dev/null
+++ b/contrib/compiler-rt/lib/xray/xray_fdr_controller.h
@@ -0,0 +1,373 @@
+//===-- xray_fdr_controller.h ---------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+#ifndef COMPILER_RT_LIB_XRAY_XRAY_FDR_CONTROLLER_H_
+#define COMPILER_RT_LIB_XRAY_XRAY_FDR_CONTROLLER_H_
+
+#include <limits>
+#include <time.h>
+
+#include "xray/xray_interface.h"
+#include "xray/xray_records.h"
+#include "xray_buffer_queue.h"
+#include "xray_fdr_log_writer.h"
+
+namespace __xray {
+
+template <size_t Version = 5> class FDRController {
+ BufferQueue *BQ;
+ BufferQueue::Buffer &B;
+ FDRLogWriter &W;
+ int (*WallClockReader)(clockid_t, struct timespec *) = 0;
+ uint64_t CycleThreshold = 0;
+
+ uint64_t LastFunctionEntryTSC = 0;
+ uint64_t LatestTSC = 0;
+ uint16_t LatestCPU = 0;
+ tid_t TId = 0;
+ pid_t PId = 0;
+ bool First = true;
+
+ uint32_t UndoableFunctionEnters = 0;
+ uint32_t UndoableTailExits = 0;
+
+ bool finalized() const XRAY_NEVER_INSTRUMENT {
+ return BQ == nullptr || BQ->finalizing();
+ }
+
+ bool hasSpace(size_t S) XRAY_NEVER_INSTRUMENT {
+ return B.Data != nullptr && B.Generation == BQ->generation() &&
+ W.getNextRecord() + S <= reinterpret_cast<char *>(B.Data) + B.Size;
+ }
+
+ constexpr int32_t mask(int32_t FuncId) const XRAY_NEVER_INSTRUMENT {
+ return FuncId & ((1 << 29) - 1);
+ }
+
+ bool getNewBuffer() XRAY_NEVER_INSTRUMENT {
+ if (BQ->getBuffer(B) != BufferQueue::ErrorCode::Ok)
+ return false;
+
+ W.resetRecord();
+ DCHECK_EQ(W.getNextRecord(), B.Data);
+ LatestTSC = 0;
+ LatestCPU = 0;
+ First = true;
+ UndoableFunctionEnters = 0;
+ UndoableTailExits = 0;
+ atomic_store(B.Extents, 0, memory_order_release);
+ return true;
+ }
+
+ bool setupNewBuffer() XRAY_NEVER_INSTRUMENT {
+ if (finalized())
+ return false;
+
+ DCHECK(hasSpace(sizeof(MetadataRecord) * 3));
+ TId = GetTid();
+ PId = internal_getpid();
+ struct timespec TS {
+ 0, 0
+ };
+ WallClockReader(CLOCK_MONOTONIC, &TS);
+
+ MetadataRecord Metadata[] = {
+ // Write out a MetadataRecord to signify that this is the start of a new
+ // buffer, associated with a particular thread, with a new CPU. For the
+ // data, we have 15 bytes to squeeze as much information as we can. At
+ // this point we only write down the following bytes:
+ // - Thread ID (tid_t, cast to 4 bytes type due to Darwin being 8
+ // bytes)
+ createMetadataRecord<MetadataRecord::RecordKinds::NewBuffer>(
+ static_cast<int32_t>(TId)),
+
+ // Also write the WalltimeMarker record. We only really need microsecond
+ // precision here, and enforce across platforms that we need 64-bit
+ // seconds and 32-bit microseconds encoded in the Metadata record.
+ createMetadataRecord<MetadataRecord::RecordKinds::WalltimeMarker>(
+ static_cast<int64_t>(TS.tv_sec),
+ static_cast<int32_t>(TS.tv_nsec / 1000)),
+
+ // Also write the Pid record.
+ createMetadataRecord<MetadataRecord::RecordKinds::Pid>(
+ static_cast<int32_t>(PId)),
+ };
+
+ if (finalized())
+ return false;
+ return W.writeMetadataRecords(Metadata);
+ }
+
+ bool prepareBuffer(size_t S) XRAY_NEVER_INSTRUMENT {
+ if (finalized())
+ return returnBuffer();
+
+ if (UNLIKELY(!hasSpace(S))) {
+ if (!returnBuffer())
+ return false;
+ if (!getNewBuffer())
+ return false;
+ if (!setupNewBuffer())
+ return false;
+ }
+
+ if (First) {
+ First = false;
+ W.resetRecord();
+ atomic_store(B.Extents, 0, memory_order_release);
+ return setupNewBuffer();
+ }
+
+ return true;
+ }
+
+ bool returnBuffer() XRAY_NEVER_INSTRUMENT {
+ if (BQ == nullptr)
+ return false;
+
+ First = true;
+ if (finalized()) {
+ BQ->releaseBuffer(B); // ignore result.
+ return false;
+ }
+
+ return BQ->releaseBuffer(B) == BufferQueue::ErrorCode::Ok;
+ }
+
+ enum class PreambleResult { NoChange, WroteMetadata, InvalidBuffer };
+ PreambleResult recordPreamble(uint64_t TSC,
+ uint16_t CPU) XRAY_NEVER_INSTRUMENT {
+ if (UNLIKELY(LatestCPU != CPU || LatestTSC == 0)) {
+ // We update our internal tracking state for the Latest TSC and CPU we've
+ // seen, then write out the appropriate metadata and function records.
+ LatestTSC = TSC;
+ LatestCPU = CPU;
+
+ if (B.Generation != BQ->generation())
+ return PreambleResult::InvalidBuffer;
+
+ W.writeMetadata<MetadataRecord::RecordKinds::NewCPUId>(CPU, TSC);
+ return PreambleResult::WroteMetadata;
+ }
+
+ DCHECK_EQ(LatestCPU, CPU);
+
+ if (UNLIKELY(LatestTSC > TSC ||
+ TSC - LatestTSC >
+ uint64_t{std::numeric_limits<int32_t>::max()})) {
+ // Either the TSC has wrapped around from the last TSC we've seen or the
+ // delta is too large to fit in a 32-bit signed integer, so we write a
+ // wrap-around record.
+ LatestTSC = TSC;
+
+ if (B.Generation != BQ->generation())
+ return PreambleResult::InvalidBuffer;
+
+ W.writeMetadata<MetadataRecord::RecordKinds::TSCWrap>(TSC);
+ return PreambleResult::WroteMetadata;
+ }
+
+ return PreambleResult::NoChange;
+ }
+
+ bool rewindRecords(int32_t FuncId, uint64_t TSC,
+ uint16_t CPU) XRAY_NEVER_INSTRUMENT {
+ // Undo one enter record, because at this point we are either at the state
+ // of:
+ // - We are exiting a function that we recently entered.
+ // - We are exiting a function that was the result of a sequence of tail
+ // exits, and we can check whether the tail exits can be re-wound.
+ //
+ FunctionRecord F;
+ W.undoWrites(sizeof(FunctionRecord));
+ if (B.Generation != BQ->generation())
+ return false;
+ internal_memcpy(&F, W.getNextRecord(), sizeof(FunctionRecord));
+
+ DCHECK(F.RecordKind ==
+ uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
+ "Expected to find function entry recording when rewinding.");
+ DCHECK_EQ(F.FuncId, FuncId & ~(0x0F << 28));
+
+ LatestTSC -= F.TSCDelta;
+ if (--UndoableFunctionEnters != 0) {
+ LastFunctionEntryTSC -= F.TSCDelta;
+ return true;
+ }
+
+ LastFunctionEntryTSC = 0;
+ auto RewindingTSC = LatestTSC;
+ auto RewindingRecordPtr = W.getNextRecord() - sizeof(FunctionRecord);
+ while (UndoableTailExits) {
+ if (B.Generation != BQ->generation())
+ return false;
+ internal_memcpy(&F, RewindingRecordPtr, sizeof(FunctionRecord));
+ DCHECK_EQ(F.RecordKind,
+ uint8_t(FunctionRecord::RecordKinds::FunctionTailExit));
+ RewindingTSC -= F.TSCDelta;
+ RewindingRecordPtr -= sizeof(FunctionRecord);
+ if (B.Generation != BQ->generation())
+ return false;
+ internal_memcpy(&F, RewindingRecordPtr, sizeof(FunctionRecord));
+
+ // This tail call exceeded the threshold duration. It will not be erased.
+ if ((TSC - RewindingTSC) >= CycleThreshold) {
+ UndoableTailExits = 0;
+ return true;
+ }
+
+ --UndoableTailExits;
+ W.undoWrites(sizeof(FunctionRecord) * 2);
+ LatestTSC = RewindingTSC;
+ }
+ return true;
+ }
+
+public:
+ template <class WallClockFunc>
+ FDRController(BufferQueue *BQ, BufferQueue::Buffer &B, FDRLogWriter &W,
+ WallClockFunc R, uint64_t C) XRAY_NEVER_INSTRUMENT
+ : BQ(BQ),
+ B(B),
+ W(W),
+ WallClockReader(R),
+ CycleThreshold(C) {}
+
+ bool functionEnter(int32_t FuncId, uint64_t TSC,
+ uint16_t CPU) XRAY_NEVER_INSTRUMENT {
+ if (finalized() ||
+ !prepareBuffer(sizeof(MetadataRecord) + sizeof(FunctionRecord)))
+ return returnBuffer();
+
+ auto PreambleStatus = recordPreamble(TSC, CPU);
+ if (PreambleStatus == PreambleResult::InvalidBuffer)
+ return returnBuffer();
+
+ if (PreambleStatus == PreambleResult::WroteMetadata) {
+ UndoableFunctionEnters = 1;
+ UndoableTailExits = 0;
+ } else {
+ ++UndoableFunctionEnters;
+ }
+
+ auto Delta = TSC - LatestTSC;
+ LastFunctionEntryTSC = TSC;
+ LatestTSC = TSC;
+ return W.writeFunction(FDRLogWriter::FunctionRecordKind::Enter,
+ mask(FuncId), Delta);
+ }
+
+ bool functionTailExit(int32_t FuncId, uint64_t TSC,
+ uint16_t CPU) XRAY_NEVER_INSTRUMENT {
+ if (finalized())
+ return returnBuffer();
+
+ if (!prepareBuffer(sizeof(MetadataRecord) + sizeof(FunctionRecord)))
+ return returnBuffer();
+
+ auto PreambleStatus = recordPreamble(TSC, CPU);
+ if (PreambleStatus == PreambleResult::InvalidBuffer)
+ return returnBuffer();
+
+ if (PreambleStatus == PreambleResult::NoChange &&
+ UndoableFunctionEnters != 0 &&
+ TSC - LastFunctionEntryTSC < CycleThreshold)
+ return rewindRecords(FuncId, TSC, CPU);
+
+ UndoableTailExits = UndoableFunctionEnters ? UndoableTailExits + 1 : 0;
+ UndoableFunctionEnters = 0;
+ auto Delta = TSC - LatestTSC;
+ LatestTSC = TSC;
+ return W.writeFunction(FDRLogWriter::FunctionRecordKind::TailExit,
+ mask(FuncId), Delta);
+ }
+
+ bool functionEnterArg(int32_t FuncId, uint64_t TSC, uint16_t CPU,
+ uint64_t Arg) XRAY_NEVER_INSTRUMENT {
+ if (finalized() ||
+ !prepareBuffer((2 * sizeof(MetadataRecord)) + sizeof(FunctionRecord)) ||
+ recordPreamble(TSC, CPU) == PreambleResult::InvalidBuffer)
+ return returnBuffer();
+
+ auto Delta = TSC - LatestTSC;
+ LatestTSC = TSC;
+ LastFunctionEntryTSC = 0;
+ UndoableFunctionEnters = 0;
+ UndoableTailExits = 0;
+
+ return W.writeFunctionWithArg(FDRLogWriter::FunctionRecordKind::EnterArg,
+ mask(FuncId), Delta, Arg);
+ }
+
+ bool functionExit(int32_t FuncId, uint64_t TSC,
+ uint16_t CPU) XRAY_NEVER_INSTRUMENT {
+ if (finalized() ||
+ !prepareBuffer(sizeof(MetadataRecord) + sizeof(FunctionRecord)))
+ return returnBuffer();
+
+ auto PreambleStatus = recordPreamble(TSC, CPU);
+ if (PreambleStatus == PreambleResult::InvalidBuffer)
+ return returnBuffer();
+
+ if (PreambleStatus == PreambleResult::NoChange &&
+ UndoableFunctionEnters != 0 &&
+ TSC - LastFunctionEntryTSC < CycleThreshold)
+ return rewindRecords(FuncId, TSC, CPU);
+
+ auto Delta = TSC - LatestTSC;
+ LatestTSC = TSC;
+ UndoableFunctionEnters = 0;
+ UndoableTailExits = 0;
+ return W.writeFunction(FDRLogWriter::FunctionRecordKind::Exit, mask(FuncId),
+ Delta);
+ }
+
+ bool customEvent(uint64_t TSC, uint16_t CPU, const void *Event,
+ int32_t EventSize) XRAY_NEVER_INSTRUMENT {
+ if (finalized() ||
+ !prepareBuffer((2 * sizeof(MetadataRecord)) + EventSize) ||
+ recordPreamble(TSC, CPU) == PreambleResult::InvalidBuffer)
+ return returnBuffer();
+
+ auto Delta = TSC - LatestTSC;
+ LatestTSC = TSC;
+ UndoableFunctionEnters = 0;
+ UndoableTailExits = 0;
+ return W.writeCustomEvent(Delta, Event, EventSize);
+ }
+
+ bool typedEvent(uint64_t TSC, uint16_t CPU, uint16_t EventType,
+ const void *Event, int32_t EventSize) XRAY_NEVER_INSTRUMENT {
+ if (finalized() ||
+ !prepareBuffer((2 * sizeof(MetadataRecord)) + EventSize) ||
+ recordPreamble(TSC, CPU) == PreambleResult::InvalidBuffer)
+ return returnBuffer();
+
+ auto Delta = TSC - LatestTSC;
+ LatestTSC = TSC;
+ UndoableFunctionEnters = 0;
+ UndoableTailExits = 0;
+ return W.writeTypedEvent(Delta, EventType, Event, EventSize);
+ }
+
+ bool flush() XRAY_NEVER_INSTRUMENT {
+ if (finalized()) {
+ returnBuffer(); // ignore result.
+ return true;
+ }
+ return returnBuffer();
+ }
+};
+
+} // namespace __xray
+
+#endif // COMPILER-RT_LIB_XRAY_XRAY_FDR_CONTROLLER_H_
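
A hedged sketch of how a handler might drive the controller defined above. The queue, checked-out buffer, writer, TSC values, and CPU id are assumed to be supplied by the caller; clock_gettime matches the WallClockFunc signature the constructor expects:

    #include <time.h>
    #include "xray_fdr_controller.h"

    bool recordCall(__xray::BufferQueue *BQ, __xray::BufferQueue::Buffer &B,
                    __xray::FDRLogWriter &W, int32_t FuncId, uint64_t EnterTSC,
                    uint64_t ExitTSC, uint16_t CPU, uint64_t CycleThreshold) {
      __xray::FDRController<> C(BQ, B, W, clock_gettime, CycleThreshold);
      // functionEnter() emits NewBuffer/WalltimeMarker/Pid metadata on first
      // use and NewCPUId/TSCWrap metadata when the CPU changes or the TSC
      // delta cannot be encoded; enter/exit pairs shorter than CycleThreshold
      // are rewound rather than recorded.
      if (!C.functionEnter(FuncId, EnterTSC, CPU))
        return false;
      if (!C.functionExit(FuncId, ExitTSC, CPU))
        return false;
      return C.flush();  // hand the buffer back to the queue
    }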
diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h b/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h
index 87096d4fc29e..e7b1ee562e1b 100644
--- a/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h
+++ b/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h
@@ -12,6 +12,9 @@
//===----------------------------------------------------------------------===//
#ifndef XRAY_XRAY_FDR_LOG_RECORDS_H
#define XRAY_XRAY_FDR_LOG_RECORDS_H
+#include <cstdint>
+
+namespace __xray {
enum class RecordType : uint8_t { Function, Metadata };
@@ -68,4 +71,6 @@ struct alignas(8) FunctionRecord {
static_assert(sizeof(FunctionRecord) == 8, "Wrong size for FunctionRecord.");
+} // namespace __xray
+
#endif // XRAY_XRAY_FDR_LOG_RECORDS_H
diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_log_writer.h b/contrib/compiler-rt/lib/xray/xray_fdr_log_writer.h
new file mode 100644
index 000000000000..7712e1377763
--- /dev/null
+++ b/contrib/compiler-rt/lib/xray/xray_fdr_log_writer.h
@@ -0,0 +1,232 @@
+//===-- xray_fdr_log_writer.h ---------------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+#ifndef COMPILER_RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_
+#define COMPILER_RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_
+
+#include "xray_buffer_queue.h"
+#include "xray_fdr_log_records.h"
+#include <functional>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace __xray {
+
+template <size_t Index> struct SerializerImpl {
+ template <class Tuple,
+ typename std::enable_if<
+ Index<std::tuple_size<
+ typename std::remove_reference<Tuple>::type>::value,
+ int>::type = 0> static void serializeTo(char *Buffer,
+ Tuple &&T) {
+ auto P = reinterpret_cast<const char *>(&std::get<Index>(T));
+ constexpr auto Size = sizeof(std::get<Index>(T));
+ internal_memcpy(Buffer, P, Size);
+ SerializerImpl<Index + 1>::serializeTo(Buffer + Size,
+ std::forward<Tuple>(T));
+ }
+
+ template <class Tuple,
+ typename std::enable_if<
+ Index >= std::tuple_size<typename std::remove_reference<
+ Tuple>::type>::value,
+ int>::type = 0>
+ static void serializeTo(char *, Tuple &&) {}
+};
+
+using Serializer = SerializerImpl<0>;
+
+template <class Tuple, size_t Index> struct AggregateSizesImpl {
+ static constexpr size_t value =
+ sizeof(typename std::tuple_element<Index, Tuple>::type) +
+ AggregateSizesImpl<Tuple, Index - 1>::value;
+};
+
+template <class Tuple> struct AggregateSizesImpl<Tuple, 0> {
+ static constexpr size_t value =
+ sizeof(typename std::tuple_element<0, Tuple>::type);
+};
+
+template <class Tuple> struct AggregateSizes {
+ static constexpr size_t value =
+ AggregateSizesImpl<Tuple, std::tuple_size<Tuple>::value - 1>::value;
+};
+
+template <MetadataRecord::RecordKinds Kind, class... DataTypes>
+MetadataRecord createMetadataRecord(DataTypes &&... Ds) {
+ static_assert(AggregateSizes<std::tuple<DataTypes...>>::value <=
+ sizeof(MetadataRecord) - 1,
+ "Metadata payload longer than metadata buffer!");
+ MetadataRecord R;
+ R.Type = 1;
+ R.RecordKind = static_cast<uint8_t>(Kind);
+ Serializer::serializeTo(R.Data,
+ std::make_tuple(std::forward<DataTypes>(Ds)...));
+ return R;
+}
+
+class FDRLogWriter {
+ BufferQueue::Buffer &Buffer;
+ char *NextRecord = nullptr;
+
+ template <class T> void writeRecord(const T &R) {
+ internal_memcpy(NextRecord, reinterpret_cast<const char *>(&R), sizeof(T));
+ NextRecord += sizeof(T);
+ // We need this atomic fence here to ensure that other threads attempting to
+ // read the bytes in the buffer will see the writes committed before the
+ // extents are updated.
+ atomic_thread_fence(memory_order_release);
+ atomic_fetch_add(Buffer.Extents, sizeof(T), memory_order_acq_rel);
+ }
+
+public:
+ explicit FDRLogWriter(BufferQueue::Buffer &B, char *P)
+ : Buffer(B), NextRecord(P) {
+ DCHECK_NE(Buffer.Data, nullptr);
+ DCHECK_NE(NextRecord, nullptr);
+ }
+
+ explicit FDRLogWriter(BufferQueue::Buffer &B)
+ : FDRLogWriter(B, static_cast<char *>(B.Data)) {}
+
+ template <MetadataRecord::RecordKinds Kind, class... Data>
+ bool writeMetadata(Data &&... Ds) {
+ // TODO: Check boundary conditions:
+ // 1) Buffer is full, and cannot handle one metadata record.
+ // 2) Buffer queue is finalising.
+ writeRecord(createMetadataRecord<Kind>(std::forward<Data>(Ds)...));
+ return true;
+ }
+
+ template <size_t N> size_t writeMetadataRecords(MetadataRecord (&Recs)[N]) {
+ constexpr auto Size = sizeof(MetadataRecord) * N;
+ internal_memcpy(NextRecord, reinterpret_cast<const char *>(Recs), Size);
+ NextRecord += Size;
+ // We need this atomic fence here to ensure that other threads attempting to
+ // read the bytes in the buffer will see the writes committed before the
+ // extents are updated.
+ atomic_thread_fence(memory_order_release);
+ atomic_fetch_add(Buffer.Extents, Size, memory_order_acq_rel);
+ return Size;
+ }
+
+ enum class FunctionRecordKind : uint8_t {
+ Enter = 0x00,
+ Exit = 0x01,
+ TailExit = 0x02,
+ EnterArg = 0x03,
+ };
+
+ bool writeFunction(FunctionRecordKind Kind, int32_t FuncId, int32_t Delta) {
+ FunctionRecord R;
+ R.Type = 0;
+ R.RecordKind = uint8_t(Kind);
+ R.FuncId = FuncId;
+ R.TSCDelta = Delta;
+ writeRecord(R);
+ return true;
+ }
+
+ bool writeFunctionWithArg(FunctionRecordKind Kind, int32_t FuncId,
+ int32_t Delta, uint64_t Arg) {
+ // We need to write the function with arg into the buffer, and then
+ // atomically update the buffer extents. This ensures that any reads
+ // synchronised on the buffer extents record will always see the writes
+ // that happen before the atomic update.
+ FunctionRecord R;
+ R.Type = 0;
+ R.RecordKind = uint8_t(Kind);
+ R.FuncId = FuncId;
+ R.TSCDelta = Delta;
+ MetadataRecord A =
+ createMetadataRecord<MetadataRecord::RecordKinds::CallArgument>(Arg);
+ NextRecord = reinterpret_cast<char *>(internal_memcpy(
+ NextRecord, reinterpret_cast<char *>(&R), sizeof(R))) +
+ sizeof(R);
+ NextRecord = reinterpret_cast<char *>(internal_memcpy(
+ NextRecord, reinterpret_cast<char *>(&A), sizeof(A))) +
+ sizeof(A);
+ // We need this atomic fence here to ensure that other threads attempting to
+ // read the bytes in the buffer will see the writes committed before the
+ // extents are updated.
+ atomic_thread_fence(memory_order_release);
+ atomic_fetch_add(Buffer.Extents, sizeof(R) + sizeof(A),
+ memory_order_acq_rel);
+ return true;
+ }
+
+ bool writeCustomEvent(int32_t Delta, const void *Event, int32_t EventSize) {
+ // We write the metadata record and the custom event data into the buffer
+ // first, before we atomically update the extents for the buffer. This
+ // allows us to ensure that any threads reading the extents of the buffer
+ // will only ever see the full metadata and custom event payload accounted
+ // (no partial writes accounted).
+ MetadataRecord R =
+ createMetadataRecord<MetadataRecord::RecordKinds::CustomEventMarker>(
+ EventSize, Delta);
+ NextRecord = reinterpret_cast<char *>(internal_memcpy(
+ NextRecord, reinterpret_cast<char *>(&R), sizeof(R))) +
+ sizeof(R);
+ NextRecord = reinterpret_cast<char *>(
+ internal_memcpy(NextRecord, Event, EventSize)) +
+ EventSize;
+
+ // We need this atomic fence here to ensure that other threads attempting to
+ // read the bytes in the buffer will see the writes committed before the
+ // extents are updated.
+ atomic_thread_fence(memory_order_release);
+ atomic_fetch_add(Buffer.Extents, sizeof(R) + EventSize,
+ memory_order_acq_rel);
+ return true;
+ }
+
+ bool writeTypedEvent(int32_t Delta, uint16_t EventType, const void *Event,
+ int32_t EventSize) {
+ // We do something similar when writing out typed events, see
+ // writeCustomEvent(...) above for details.
+ MetadataRecord R =
+ createMetadataRecord<MetadataRecord::RecordKinds::TypedEventMarker>(
+ EventSize, Delta, EventType);
+ NextRecord = reinterpret_cast<char *>(internal_memcpy(
+ NextRecord, reinterpret_cast<char *>(&R), sizeof(R))) +
+ sizeof(R);
+ NextRecord = reinterpret_cast<char *>(
+ internal_memcpy(NextRecord, Event, EventSize)) +
+ EventSize;
+
+ // We need this atomic fence here to ensure that other threads attempting to
+ // read the bytes in the buffer will see the writes committed before the
+ // extents are updated.
+ atomic_thread_fence(memory_order_release);
+ atomic_fetch_add(Buffer.Extents, EventSize, memory_order_acq_rel);
+ return true;
+ }
+
+ char *getNextRecord() const { return NextRecord; }
+
+ void resetRecord() {
+ NextRecord = reinterpret_cast<char *>(Buffer.Data);
+ atomic_store(Buffer.Extents, 0, memory_order_release);
+ }
+
+ void undoWrites(size_t B) {
+ DCHECK_GE(NextRecord - B, reinterpret_cast<char *>(Buffer.Data));
+ NextRecord -= B;
+ atomic_fetch_sub(Buffer.Extents, B, memory_order_acq_rel);
+ }
+
+}; // namespace __xray
+
+} // namespace __xray
+
+#endif // COMPILER-RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_
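
A minimal sketch of the writer in isolation, assuming the Buffer has already been checked out from a BufferQueue. Each write serializes at NextRecord and then advances the buffer's atomic extents, so readers synchronizing on the extents never observe a partially accounted record:

    #include "xray_fdr_log_writer.h"

    void writeSmallTrace(__xray::BufferQueue::Buffer &B) {
      __xray::FDRLogWriter W(B);
      W.resetRecord();  // NextRecord = B.Data, extents reset to zero

      // Metadata first: associate subsequent function records with a CPU/TSC
      // base, as the FDR controller does in its preamble.
      W.writeMetadata<__xray::MetadataRecord::RecordKinds::NewCPUId>(
          uint16_t{0}, uint64_t{0});

      // Then a function-enter record with a zero TSC delta.
      W.writeFunction(__xray::FDRLogWriter::FunctionRecordKind::Enter,
                      /*FuncId=*/1, /*Delta=*/0);

      // Writes can be undone before the buffer is handed back, which is what
      // the controller's rewind path relies on.
      W.undoWrites(sizeof(__xray::FunctionRecord));
    }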
diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc b/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc
index 6cb2dfa0c658..1eda26df7a85 100644
--- a/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc
+++ b/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc
@@ -20,7 +20,6 @@
#include <limits>
#include <memory>
#include <pthread.h>
-#include <sys/syscall.h>
#include <sys/time.h>
#include <time.h>
#include <unistd.h>
@@ -30,9 +29,12 @@
#include "sanitizer_common/sanitizer_common.h"
#include "xray/xray_interface.h"
#include "xray/xray_records.h"
+#include "xray_allocator.h"
#include "xray_buffer_queue.h"
#include "xray_defs.h"
+#include "xray_fdr_controller.h"
#include "xray_fdr_flags.h"
+#include "xray_fdr_log_writer.h"
#include "xray_flags.h"
#include "xray_recursion_guard.h"
#include "xray_tsc.h"
@@ -40,55 +42,53 @@
namespace __xray {
-atomic_sint32_t LoggingStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
+static atomic_sint32_t LoggingStatus = {
+ XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
+
+namespace {
// Group together thread-local-data in a struct, then hide it behind a function
// call so that it can be initialized on first use instead of as a global. We
// force the alignment to 64-bytes for x86 cache line alignment, as this
// structure is used in the hot path of implementation.
-struct alignas(64) ThreadLocalData {
- BufferQueue::Buffer Buffer;
- char *RecordPtr = nullptr;
- // The number of FunctionEntry records immediately preceding RecordPtr.
- uint8_t NumConsecutiveFnEnters = 0;
-
- // The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit
- // records preceding RecordPtr.
- uint8_t NumTailCalls = 0;
-
- // We use a thread_local variable to keep track of which CPUs we've already
- // run, and the TSC times for these CPUs. This allows us to stop repeating the
- // CPU field in the function records.
- //
- // We assume that we'll support only 65536 CPUs for x86_64.
- uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
- uint64_t LastTSC = 0;
- uint64_t LastFunctionEntryTSC = 0;
-
- // Make sure a thread that's ever called handleArg0 has a thread-local
- // live reference to the buffer queue for this particular instance of
- // FDRLogging, and that we're going to clean it up when the thread exits.
+struct XRAY_TLS_ALIGNAS(64) ThreadLocalData {
+ BufferQueue::Buffer Buffer{};
BufferQueue *BQ = nullptr;
+
+ using LogWriterStorage =
+ typename std::aligned_storage<sizeof(FDRLogWriter),
+ alignof(FDRLogWriter)>::type;
+
+ LogWriterStorage LWStorage;
+ FDRLogWriter *Writer = nullptr;
+
+ using ControllerStorage =
+ typename std::aligned_storage<sizeof(FDRController<>),
+ alignof(FDRController<>)>::type;
+ ControllerStorage CStorage;
+ FDRController<> *Controller = nullptr;
};
+} // namespace
+
static_assert(std::is_trivially_destructible<ThreadLocalData>::value,
"ThreadLocalData must be trivially destructible");
-static constexpr auto MetadataRecSize = sizeof(MetadataRecord);
-static constexpr auto FunctionRecSize = sizeof(FunctionRecord);
-
// Use a global pthread key to identify thread-local data for logging.
static pthread_key_t Key;
// Global BufferQueue.
+static std::aligned_storage<sizeof(BufferQueue)>::type BufferQueueStorage;
static BufferQueue *BQ = nullptr;
-static atomic_sint32_t LogFlushStatus = {
- XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
+// Global thresholds for function durations.
+static atomic_uint64_t ThresholdTicks{0};
-static FDRLoggingOptions FDROptions;
+// Global for ticks per second.
+static atomic_uint64_t TicksPerSec{0};
-static SpinMutex FDROptionsMutex;
+static atomic_sint32_t LogFlushStatus = {
+ XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
// This function will initialize the thread-local data structure used by the FDR
// logging implementation and return a reference to it. The implementation
@@ -124,8 +124,10 @@ static SpinMutex FDROptionsMutex;
// critical section, calling a function that might be XRay instrumented (and
// thus in turn calling into malloc by virtue of registration of the
// thread_local's destructor).
+#if XRAY_HAS_TLS_ALIGNAS
static_assert(alignof(ThreadLocalData) >= 64,
"ThreadLocalData must be cache line aligned.");
+#endif
static ThreadLocalData &getThreadLocalData() {
thread_local typename std::aligned_storage<
sizeof(ThreadLocalData), alignof(ThreadLocalData)>::type TLDStorage{};
@@ -138,559 +140,36 @@ static ThreadLocalData &getThreadLocalData() {
return *reinterpret_cast<ThreadLocalData *>(&TLDStorage);
}
-static void writeNewBufferPreamble(tid_t Tid, timespec TS,
- pid_t Pid) XRAY_NEVER_INSTRUMENT {
- static constexpr int InitRecordsCount = 3;
- auto &TLD = getThreadLocalData();
- MetadataRecord Metadata[InitRecordsCount];
- {
- // Write out a MetadataRecord to signify that this is the start of a new
- // buffer, associated with a particular thread, with a new CPU. For the
- // data, we have 15 bytes to squeeze as much information as we can. At this
- // point we only write down the following bytes:
- // - Thread ID (tid_t, cast to 4 bytes type due to Darwin being 8 bytes)
- auto &NewBuffer = Metadata[0];
- NewBuffer.Type = uint8_t(RecordType::Metadata);
- NewBuffer.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewBuffer);
- int32_t tid = static_cast<int32_t>(Tid);
- internal_memcpy(&NewBuffer.Data, &tid, sizeof(tid));
- }
-
- // Also write the WalltimeMarker record.
- {
- static_assert(sizeof(time_t) <= 8, "time_t needs to be at most 8 bytes");
- auto &WalltimeMarker = Metadata[1];
- WalltimeMarker.Type = uint8_t(RecordType::Metadata);
- WalltimeMarker.RecordKind =
- uint8_t(MetadataRecord::RecordKinds::WalltimeMarker);
-
- // We only really need microsecond precision here, and enforce across
- // platforms that we need 64-bit seconds and 32-bit microseconds encoded in
- // the Metadata record.
- int32_t Micros = TS.tv_nsec / 1000;
- int64_t Seconds = TS.tv_sec;
- internal_memcpy(WalltimeMarker.Data, &Seconds, sizeof(Seconds));
- internal_memcpy(WalltimeMarker.Data + sizeof(Seconds), &Micros,
- sizeof(Micros));
- }
-
- // Also write the Pid record.
- {
- // Write out a MetadataRecord that contains the current pid
- auto &PidMetadata = Metadata[2];
- PidMetadata.Type = uint8_t(RecordType::Metadata);
- PidMetadata.RecordKind = uint8_t(MetadataRecord::RecordKinds::Pid);
- int32_t pid = static_cast<int32_t>(Pid);
- internal_memcpy(&PidMetadata.Data, &pid, sizeof(pid));
- }
-
- TLD.NumConsecutiveFnEnters = 0;
- TLD.NumTailCalls = 0;
- if (TLD.BQ == nullptr || TLD.BQ->finalizing())
- return;
- internal_memcpy(TLD.RecordPtr, Metadata, sizeof(Metadata));
- TLD.RecordPtr += sizeof(Metadata);
- // Since we write out the extents as the first metadata record of the
- // buffer, we need to write out the extents including the extents record.
- atomic_store(&TLD.Buffer.Extents->Size, sizeof(Metadata),
- memory_order_release);
-}
-
-static void setupNewBuffer(int (*wall_clock_reader)(
- clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT {
- auto &TLD = getThreadLocalData();
- auto &B = TLD.Buffer;
- TLD.RecordPtr = static_cast<char *>(B.Data);
- tid_t Tid = GetTid();
- timespec TS{0, 0};
- pid_t Pid = internal_getpid();
- // This is typically clock_gettime, but callers have injection ability.
- wall_clock_reader(CLOCK_MONOTONIC, &TS);
- writeNewBufferPreamble(Tid, TS, Pid);
- TLD.NumConsecutiveFnEnters = 0;
- TLD.NumTailCalls = 0;
-}
-
-static void incrementExtents(size_t Add) {
- auto &TLD = getThreadLocalData();
- atomic_fetch_add(&TLD.Buffer.Extents->Size, Add, memory_order_acq_rel);
-}
-
-static void decrementExtents(size_t Subtract) {
- auto &TLD = getThreadLocalData();
- atomic_fetch_sub(&TLD.Buffer.Extents->Size, Subtract, memory_order_acq_rel);
-}
-
-static void writeNewCPUIdMetadata(uint16_t CPU,
- uint64_t TSC) XRAY_NEVER_INSTRUMENT {
- auto &TLD = getThreadLocalData();
- MetadataRecord NewCPUId;
- NewCPUId.Type = uint8_t(RecordType::Metadata);
- NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId);
-
- // The data for the New CPU will contain the following bytes:
- // - CPU ID (uint16_t, 2 bytes)
- // - Full TSC (uint64_t, 8 bytes)
- // Total = 10 bytes.
- internal_memcpy(&NewCPUId.Data, &CPU, sizeof(CPU));
- internal_memcpy(&NewCPUId.Data[sizeof(CPU)], &TSC, sizeof(TSC));
- internal_memcpy(TLD.RecordPtr, &NewCPUId, sizeof(MetadataRecord));
- TLD.RecordPtr += sizeof(MetadataRecord);
- TLD.NumConsecutiveFnEnters = 0;
- TLD.NumTailCalls = 0;
- incrementExtents(sizeof(MetadataRecord));
-}
-
-static void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT {
- auto &TLD = getThreadLocalData();
- MetadataRecord TSCWrap;
- TSCWrap.Type = uint8_t(RecordType::Metadata);
- TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap);
-
- // The data for the TSCWrap record contains the following bytes:
- // - Full TSC (uint64_t, 8 bytes)
- // Total = 8 bytes.
- internal_memcpy(&TSCWrap.Data, &TSC, sizeof(TSC));
- internal_memcpy(TLD.RecordPtr, &TSCWrap, sizeof(MetadataRecord));
- TLD.RecordPtr += sizeof(MetadataRecord);
- TLD.NumConsecutiveFnEnters = 0;
- TLD.NumTailCalls = 0;
- incrementExtents(sizeof(MetadataRecord));
-}
-
-// Call Argument metadata records store the arguments to a function in the
-// order of their appearance; holes are not supported by the buffer format.
-static void writeCallArgumentMetadata(uint64_t A) XRAY_NEVER_INSTRUMENT {
- auto &TLD = getThreadLocalData();
- MetadataRecord CallArg;
- CallArg.Type = uint8_t(RecordType::Metadata);
- CallArg.RecordKind = uint8_t(MetadataRecord::RecordKinds::CallArgument);
-
- internal_memcpy(CallArg.Data, &A, sizeof(A));
- internal_memcpy(TLD.RecordPtr, &CallArg, sizeof(MetadataRecord));
- TLD.RecordPtr += sizeof(MetadataRecord);
- incrementExtents(sizeof(MetadataRecord));
-}
-
-static void writeFunctionRecord(int FuncId, uint32_t TSCDelta,
- XRayEntryType EntryType) XRAY_NEVER_INSTRUMENT {
- FunctionRecord FuncRecord;
- FuncRecord.Type = uint8_t(RecordType::Function);
- // Only take 28 bits of the function id.
- FuncRecord.FuncId = FuncId & ~(0x0F << 28);
- FuncRecord.TSCDelta = TSCDelta;
-
- auto &TLD = getThreadLocalData();
- switch (EntryType) {
- case XRayEntryType::ENTRY:
- ++TLD.NumConsecutiveFnEnters;
- FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
- break;
- case XRayEntryType::LOG_ARGS_ENTRY:
- // We should not rewind functions with logged args.
- TLD.NumConsecutiveFnEnters = 0;
- TLD.NumTailCalls = 0;
- FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter);
- break;
- case XRayEntryType::EXIT:
- // If we've decided to log the function exit, we will never erase the log
- // before it.
- TLD.NumConsecutiveFnEnters = 0;
- TLD.NumTailCalls = 0;
- FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit);
- break;
- case XRayEntryType::TAIL:
- // If we just entered the function we're tail exiting from or erased every
- // invocation since then, this function entry tail pair is a candidate to
- // be erased when the child function exits.
- if (TLD.NumConsecutiveFnEnters > 0) {
- ++TLD.NumTailCalls;
- TLD.NumConsecutiveFnEnters = 0;
- } else {
- // We will never be able to erase this tail call since we have logged
- // something in between the function entry and tail exit.
- TLD.NumTailCalls = 0;
- TLD.NumConsecutiveFnEnters = 0;
- }
- FuncRecord.RecordKind =
- uint8_t(FunctionRecord::RecordKinds::FunctionTailExit);
- break;
- case XRayEntryType::CUSTOM_EVENT: {
- // This is a bug in patching, so we'll report it once and move on.
- static atomic_uint8_t ErrorLatch{0};
- if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel))
- Report("Internal error: patched an XRay custom event call as a function; "
- "func id = %d\n",
- FuncId);
- return;
- }
- case XRayEntryType::TYPED_EVENT: {
- static atomic_uint8_t ErrorLatch{0};
- if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel))
- Report("Internal error: patched an XRay typed event call as a function; "
- "func id = %d\n",
- FuncId);
- return;
- }
- }
-
- internal_memcpy(TLD.RecordPtr, &FuncRecord, sizeof(FunctionRecord));
- TLD.RecordPtr += sizeof(FunctionRecord);
- incrementExtents(sizeof(FunctionRecord));
-}
-
-static atomic_uint64_t TicksPerSec{0};
-static atomic_uint64_t ThresholdTicks{0};
-
-// Re-point the thread local pointer into this thread's Buffer before the recent
-// "Function Entry" record and any "Tail Call Exit" records after that.
-static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC,
- uint64_t &LastFunctionEntryTSC, int32_t FuncId) {
- auto &TLD = getThreadLocalData();
- TLD.RecordPtr -= FunctionRecSize;
- decrementExtents(FunctionRecSize);
- FunctionRecord FuncRecord;
- internal_memcpy(&FuncRecord, TLD.RecordPtr, FunctionRecSize);
- DCHECK(FuncRecord.RecordKind ==
- uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
- "Expected to find function entry recording when rewinding.");
- DCHECK(FuncRecord.FuncId == (FuncId & ~(0x0F << 28)) &&
- "Expected matching function id when rewinding Exit");
- --TLD.NumConsecutiveFnEnters;
- LastTSC -= FuncRecord.TSCDelta;
-
- // We unwound one call. Update the state and return without writing a log.
- if (TLD.NumConsecutiveFnEnters != 0) {
- LastFunctionEntryTSC -= FuncRecord.TSCDelta;
- return;
- }
-
- // Otherwise we've rewound the stack of all function entries, we might be
- // able to rewind further by erasing tail call functions that are being
- // exited from via this exit.
- LastFunctionEntryTSC = 0;
- auto RewindingTSC = LastTSC;
- auto RewindingRecordPtr = TLD.RecordPtr - FunctionRecSize;
- while (TLD.NumTailCalls > 0) {
- // Rewind the TSC back over the TAIL EXIT record.
- FunctionRecord ExpectedTailExit;
- internal_memcpy(&ExpectedTailExit, RewindingRecordPtr, FunctionRecSize);
-
- DCHECK(ExpectedTailExit.RecordKind ==
- uint8_t(FunctionRecord::RecordKinds::FunctionTailExit) &&
- "Expected to find tail exit when rewinding.");
- RewindingRecordPtr -= FunctionRecSize;
- RewindingTSC -= ExpectedTailExit.TSCDelta;
- FunctionRecord ExpectedFunctionEntry;
- internal_memcpy(&ExpectedFunctionEntry, RewindingRecordPtr,
- FunctionRecSize);
- DCHECK(ExpectedFunctionEntry.RecordKind ==
- uint8_t(FunctionRecord::RecordKinds::FunctionEnter) &&
- "Expected to find function entry when rewinding tail call.");
- DCHECK(ExpectedFunctionEntry.FuncId == ExpectedTailExit.FuncId &&
- "Expected funcids to match when rewinding tail call.");
-
- // This tail call exceeded the threshold duration. It will not be erased.
- if ((TSC - RewindingTSC) >= atomic_load_relaxed(&ThresholdTicks)) {
- TLD.NumTailCalls = 0;
- return;
- }
-
- // We can erase a tail exit pair that we're exiting through since
- // its duration is under threshold.
- --TLD.NumTailCalls;
- RewindingRecordPtr -= FunctionRecSize;
- RewindingTSC -= ExpectedFunctionEntry.TSCDelta;
- TLD.RecordPtr -= 2 * FunctionRecSize;
- LastTSC = RewindingTSC;
- decrementExtents(2 * FunctionRecSize);
- }
-}
-
-static bool releaseThreadLocalBuffer(BufferQueue &BQArg) {
- auto &TLD = getThreadLocalData();
- auto EC = BQArg.releaseBuffer(TLD.Buffer);
- if (EC != BufferQueue::ErrorCode::Ok) {
- Report("Failed to release buffer at %p; error=%s\n", TLD.Buffer.Data,
- BufferQueue::getErrorString(EC));
- return false;
- }
- return true;
-}
-
-static bool prepareBuffer(uint64_t TSC, unsigned char CPU,
- int (*wall_clock_reader)(clockid_t,
- struct timespec *),
- size_t MaxSize) XRAY_NEVER_INSTRUMENT {
- auto &TLD = getThreadLocalData();
- char *BufferStart = static_cast<char *>(TLD.Buffer.Data);
- if ((TLD.RecordPtr + MaxSize) > (BufferStart + TLD.Buffer.Size)) {
- if (!releaseThreadLocalBuffer(*TLD.BQ))
- return false;
- auto EC = TLD.BQ->getBuffer(TLD.Buffer);
- if (EC != BufferQueue::ErrorCode::Ok) {
- Report("Failed to prepare a buffer; error = '%s'\n",
- BufferQueue::getErrorString(EC));
- return false;
- }
- setupNewBuffer(wall_clock_reader);
-
- // Always write the CPU metadata as the first record in the buffer.
- writeNewCPUIdMetadata(CPU, TSC);
- }
- return true;
-}
-
-static bool
-isLogInitializedAndReady(BufferQueue *LBQ, uint64_t TSC, unsigned char CPU,
- int (*wall_clock_reader)(clockid_t, struct timespec *))
- XRAY_NEVER_INSTRUMENT {
- // Bail out right away if logging is not initialized yet.
- // We should take the opportunity to release the buffer though.
- auto Status = atomic_load(&LoggingStatus, memory_order_acquire);
- auto &TLD = getThreadLocalData();
- if (Status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) {
- if (TLD.RecordPtr != nullptr &&
- (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
- Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) {
- if (!releaseThreadLocalBuffer(*LBQ))
- return false;
- TLD.RecordPtr = nullptr;
- return false;
- }
- return false;
- }
-
- if (atomic_load(&LoggingStatus, memory_order_acquire) !=
- XRayLogInitStatus::XRAY_LOG_INITIALIZED ||
- LBQ->finalizing()) {
- if (!releaseThreadLocalBuffer(*LBQ))
- return false;
- TLD.RecordPtr = nullptr;
- }
-
- if (TLD.Buffer.Data == nullptr) {
- auto EC = LBQ->getBuffer(TLD.Buffer);
- if (EC != BufferQueue::ErrorCode::Ok) {
- auto LS = atomic_load(&LoggingStatus, memory_order_acquire);
- if (LS != XRayLogInitStatus::XRAY_LOG_FINALIZING &&
- LS != XRayLogInitStatus::XRAY_LOG_FINALIZED)
- Report("Failed to acquire a buffer; error = '%s'\n",
- BufferQueue::getErrorString(EC));
- return false;
- }
-
- setupNewBuffer(wall_clock_reader);
-
- // Always write the CPU metadata as the first record in the buffer.
- writeNewCPUIdMetadata(CPU, TSC);
- }
-
- if (TLD.CurrentCPU == std::numeric_limits<uint16_t>::max()) {
- // This means this is the first CPU this thread has ever run on. We set
- // the current CPU and record this as the first TSC we've seen.
- TLD.CurrentCPU = CPU;
- writeNewCPUIdMetadata(CPU, TSC);
- }
-
- return true;
-}
-
-// Compute the TSC difference between the time of measurement and the previous
-// event. There are a few interesting situations we need to account for:
-//
-// - The thread has migrated to a different CPU. If this is the case, then
-// we write down the following records:
-//
-// 1. A 'NewCPUId' Metadata record.
-// 2. A FunctionRecord with a 0 for the TSCDelta field.
-//
-// - The TSC delta is greater than the 32 bits we can store in a
-// FunctionRecord. In this case we write down the following records:
-//
-// 1. A 'TSCWrap' Metadata record.
-// 2. A FunctionRecord with a 0 for the TSCDelta field.
-//
-// - The TSC delta is representable within the 32 bits we can store in a
-// FunctionRecord. In this case we write down just a FunctionRecord with
-// the correct TSC delta.
-static uint32_t writeCurrentCPUTSC(ThreadLocalData &TLD, uint64_t TSC,
- uint8_t CPU) {
- if (CPU != TLD.CurrentCPU) {
- // We've moved to a new CPU.
- writeNewCPUIdMetadata(CPU, TSC);
- return 0;
- }
- // If the delta is greater than the range for a uint32_t, then we write out
- // the TSC wrap metadata entry with the full TSC, and the TSC for the
- // function record be 0.
- uint64_t Delta = TSC - TLD.LastTSC;
- if (Delta <= std::numeric_limits<uint32_t>::max())
- return Delta;
-
- writeTSCWrapMetadata(TSC);
- return 0;
-}
-
-static void endBufferIfFull() XRAY_NEVER_INSTRUMENT {
- auto &TLD = getThreadLocalData();
- auto BufferStart = static_cast<char *>(TLD.Buffer.Data);
- if ((TLD.RecordPtr + MetadataRecSize) - BufferStart <=
- ptrdiff_t{MetadataRecSize}) {
- if (!releaseThreadLocalBuffer(*TLD.BQ))
- return;
- TLD.RecordPtr = nullptr;
- }
-}
-
-thread_local atomic_uint8_t Running{0};
-
-/// Here's where the meat of the processing happens. The writer captures
-/// function entry, exit and tail exit points with a time and will create
-/// TSCWrap, NewCPUId and Function records as necessary. The writer might
-/// walk backward through its buffer and erase trivial functions to avoid
-/// polluting the log and may use the buffer queue to obtain or release a
-/// buffer.
-static void processFunctionHook(int32_t FuncId, XRayEntryType Entry,
- uint64_t TSC, unsigned char CPU, uint64_t Arg1,
- int (*wall_clock_reader)(clockid_t,
- struct timespec *))
- XRAY_NEVER_INSTRUMENT {
- __asm volatile("# LLVM-MCA-BEGIN processFunctionHook");
- // Prevent signal handler recursion, so in case we're already in a log writing
- // mode and the signal handler comes in (and is also instrumented) then we
- // don't want to be clobbering potentially partial writes already happening in
- // the thread. We use a simple thread_local latch to only allow one on-going
- // handleArg0 to happen at any given time.
- RecursionGuard Guard{Running};
- if (!Guard) {
- DCHECK(atomic_load_relaxed(&Running) && "RecursionGuard is buggy!");
- return;
- }
-
- auto &TLD = getThreadLocalData();
-
- if (TLD.BQ == nullptr)
- TLD.BQ = BQ;
-
- if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, wall_clock_reader))
- return;
-
- // Before we go setting up writing new function entries, we need to be really
- // careful about the pointer math we're doing. This means we need to ensure
- // that the record we are about to write is going to fit into the buffer,
- // without overflowing the buffer.
- //
- // To do this properly, we use the following assumptions:
- //
- // - The least number of bytes we will ever write is 8
- // (sizeof(FunctionRecord)) only if the delta between the previous entry
- // and this entry is within 32 bits.
- // - The most number of bytes we will ever write is 8 + 16 + 16 = 40.
- // This is computed by:
- //
- // MaxSize = sizeof(FunctionRecord) + 2 * sizeof(MetadataRecord)
- //
- // These arise in the following cases:
- //
- // 1. When the delta between the TSC we get and the previous TSC for the
- // same CPU is outside of the uint32_t range, we end up having to
- // write a MetadataRecord to indicate a "tsc wrap" before the actual
- // FunctionRecord.
- // 2. When we learn that we've moved CPUs, we need to write a
- // MetadataRecord to indicate a "cpu change", and thus write out the
- // current TSC for that CPU before writing out the actual
- // FunctionRecord.
- // 3. When we learn about a new CPU ID, we need to write down a "new cpu
- // id" MetadataRecord before writing out the actual FunctionRecord.
- // 4. The second MetadataRecord is the optional function call argument.
- //
- // So the math we need to do is to determine whether writing 40 bytes past the
- // current pointer exceeds the buffer's maximum size. If we don't have enough
- // space to write 40 bytes in the buffer, we need get a new Buffer, set it up
- // properly before doing any further writing.
- size_t MaxSize = FunctionRecSize + 2 * MetadataRecSize;
- if (!prepareBuffer(TSC, CPU, wall_clock_reader, MaxSize)) {
- TLD.BQ = nullptr;
- return;
- }
-
- // By this point, we are now ready to write up to 40 bytes (explained above).
- DCHECK((TLD.RecordPtr + MaxSize) - static_cast<char *>(TLD.Buffer.Data) >=
- static_cast<ptrdiff_t>(MetadataRecSize) &&
- "Misconfigured BufferQueue provided; Buffer size not large enough.");
-
- auto RecordTSCDelta = writeCurrentCPUTSC(TLD, TSC, CPU);
- TLD.LastTSC = TSC;
- TLD.CurrentCPU = CPU;
- switch (Entry) {
- case XRayEntryType::ENTRY:
- case XRayEntryType::LOG_ARGS_ENTRY:
- // Update the thread local state for the next invocation.
- TLD.LastFunctionEntryTSC = TSC;
- break;
- case XRayEntryType::TAIL:
- case XRayEntryType::EXIT:
- // Break out and write the exit record if we can't erase any functions.
- if (TLD.NumConsecutiveFnEnters == 0 ||
- (TSC - TLD.LastFunctionEntryTSC) >=
- atomic_load_relaxed(&ThresholdTicks))
- break;
- rewindRecentCall(TSC, TLD.LastTSC, TLD.LastFunctionEntryTSC, FuncId);
- return; // without writing log.
- case XRayEntryType::CUSTOM_EVENT: {
- // This is a bug in patching, so we'll report it once and move on.
- static atomic_uint8_t ErrorLatch{0};
- if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel))
- Report("Internal error: patched an XRay custom event call as a function; "
- "func id = %d\n",
- FuncId);
- return;
- }
- case XRayEntryType::TYPED_EVENT: {
- static atomic_uint8_t ErrorLatch{0};
- if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel))
- Report("Internal error: patched an XRay typed event call as a function; "
- "func id = %d\n",
- FuncId);
- return;
- }
- }
-
- writeFunctionRecord(FuncId, RecordTSCDelta, Entry);
- if (Entry == XRayEntryType::LOG_ARGS_ENTRY)
- writeCallArgumentMetadata(Arg1);
-
- // If we've exhausted the buffer by this time, we then release the buffer to
- // make sure that other threads may start using this buffer.
- endBufferIfFull();
- __asm volatile("# LLVM-MCA-END");
-}
-
static XRayFileHeader &fdrCommonHeaderInfo() {
static std::aligned_storage<sizeof(XRayFileHeader)>::type HStorage;
static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
static bool TSCSupported = true;
static uint64_t CycleFrequency = NanosecondsPerSecond;
- pthread_once(&OnceInit, +[] {
- XRayFileHeader &H = reinterpret_cast<XRayFileHeader &>(HStorage);
- // Version 2 of the log writes the extents of the buffer, instead of
- // relying on an end-of-buffer record.
- // Version 3 includes PID metadata record
- H.Version = 3;
- H.Type = FileTypes::FDR_LOG;
-
- // Test for required CPU features and cache the cycle frequency
- TSCSupported = probeRequiredCPUFeatures();
- if (TSCSupported)
- CycleFrequency = getTSCFrequency();
- H.CycleFrequency = CycleFrequency;
-
- // FIXME: Actually check whether we have 'constant_tsc' and
- // 'nonstop_tsc' before setting the values in the header.
- H.ConstantTSC = 1;
- H.NonstopTSC = 1;
- });
+ pthread_once(
+ &OnceInit, +[] {
+ XRayFileHeader &H = reinterpret_cast<XRayFileHeader &>(HStorage);
+ // Version 2 of the log writes the extents of the buffer, instead of
+ // relying on an end-of-buffer record.
+ // Version 3 includes PID metadata record.
+ // Version 4 includes CPU data in the custom event records.
+ // Version 5 uses relative deltas for custom and typed event records,
+ // and removes the CPU data in custom event records (similar to how
+ // function records use deltas instead of full TSCs and rely on other
+ // metadata records for TSC wraparound and CPU migration).
+ H.Version = 5;
+ H.Type = FileTypes::FDR_LOG;
+
+ // Test for required CPU features and cache the cycle frequency
+ TSCSupported = probeRequiredCPUFeatures();
+ if (TSCSupported)
+ CycleFrequency = getTSCFrequency();
+ H.CycleFrequency = CycleFrequency;
+
+ // FIXME: Actually check whether we have 'constant_tsc' and
+ // 'nonstop_tsc' before setting the values in the header.
+ H.ConstantTSC = 1;
+ H.NonstopTSC = 1;
+ });
return reinterpret_cast<XRayFileHeader &>(HStorage);
}
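
The version comment above relies on the delta encoding used by FDR records: a record only carries a 32-bit TSC delta, and a full 64-bit TSC appears in a NewCPUId or TSCWrap metadata record when the thread migrates CPUs or the delta no longer fits in 32 bits. A rough sketch of that decision with a hypothetical DeltaEncoder name (the real logic now lives behind the xray_fdr_controller.h include added in this change):

    #include <cstdint>
    #include <limits>

    // Illustrative only: decide whether a record can carry a 32-bit delta or
    // whether a metadata record with the full TSC must be emitted first.
    struct DeltaEncoder {
      uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max();
      uint64_t LastTSC = 0;

      // Returns the delta to store in the record; WriteFullTSC is set when a
      // NewCPUId/TSCWrap-style metadata record must precede it.
      uint32_t encode(uint64_t TSC, uint16_t CPU, bool &WriteFullTSC) {
        if (CPU != CurrentCPU ||
            TSC - LastTSC > std::numeric_limits<uint32_t>::max()) {
          WriteFullTSC = true;
          CurrentCPU = CPU;
          LastTSC = TSC;
          return 0;
        }
        WriteFullTSC = false;
        uint32_t Delta = static_cast<uint32_t>(TSC - LastTSC);
        LastTSC = TSC;
        return Delta;
      }
    };
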
@@ -728,9 +207,11 @@ XRayBuffer fdrIterator(const XRayBuffer B) {
// buffers to expect).
static std::aligned_storage<sizeof(XRayFileHeader)>::type HeaderStorage;
static pthread_once_t HeaderOnce = PTHREAD_ONCE_INIT;
- pthread_once(&HeaderOnce, +[] {
- reinterpret_cast<XRayFileHeader &>(HeaderStorage) = fdrCommonHeaderInfo();
- });
+ pthread_once(
+ &HeaderOnce, +[] {
+ reinterpret_cast<XRayFileHeader &>(HeaderStorage) =
+ fdrCommonHeaderInfo();
+ });
// We use a convenience alias for code referring to Header from here on out.
auto &Header = reinterpret_cast<XRayFileHeader &>(HeaderStorage);
@@ -741,7 +222,8 @@ XRayBuffer fdrIterator(const XRayBuffer B) {
static BufferQueue::const_iterator It{};
static BufferQueue::const_iterator End{};
- static void *CurrentBuffer{nullptr};
+ static uint8_t *CurrentBuffer{nullptr};
+ static size_t SerializedBufferSize = 0;
if (B.Data == static_cast<void *>(&Header) && B.Size == sizeof(Header)) {
// From this point on, we provide raw access to the raw buffer we're getting
// from the BufferQueue. We're relying on the iterators from the current
@@ -751,7 +233,7 @@ XRayBuffer fdrIterator(const XRayBuffer B) {
}
if (CurrentBuffer != nullptr) {
- InternalFree(CurrentBuffer);
+ deallocateBuffer(CurrentBuffer, SerializedBufferSize);
CurrentBuffer = nullptr;
}
@@ -762,9 +244,16 @@ XRayBuffer fdrIterator(const XRayBuffer B) {
// out to disk. The difference here would be that we still write "empty"
// buffers, or at least go through the iterators faithfully to let the
// handlers see the empty buffers in the queue.
- auto BufferSize = atomic_load(&It->Extents->Size, memory_order_acquire);
- auto SerializedBufferSize = BufferSize + sizeof(MetadataRecord);
- CurrentBuffer = InternalAlloc(SerializedBufferSize);
+ //
+  // We need this atomic fence here to ensure that writes to the buffer have
+  // been committed before we load the extents atomically. The buffer itself is
+  // not synchronised across threads, so this acquire fence pairs with the
+  // writers' release fences to guarantee that every byte below the loaded
+  // extents value is fully committed before we read it.
+ atomic_thread_fence(memory_order_acquire);
+ auto BufferSize = atomic_load(It->Extents, memory_order_acquire);
+ SerializedBufferSize = BufferSize + sizeof(MetadataRecord);
+ CurrentBuffer = allocateBuffer(SerializedBufferSize);
if (CurrentBuffer == nullptr)
return {nullptr, 0};
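
On the reader side, fdrIterator pairs the acquire fence with an acquire load of the extents before copying the buffer out, so it only ever serializes bytes that have been committed. A simplified sketch of that snapshot step (illustrative only: the real code prepends a BufferExtents metadata record and uses the XRay allocator rather than the C++ heap):

    #include <atomic>
    #include <cstddef>
    #include <cstdint>
    #include <cstring>
    #include <vector>

    // Snapshot the committed portion of a buffer, prefixed by its length.
    // The length header stands in for the BufferExtents metadata record.
    std::vector<char> snapshot(const char *Data,
                               const std::atomic<size_t> &Extents) {
      std::atomic_thread_fence(std::memory_order_acquire);
      uint64_t Size = Extents.load(std::memory_order_acquire);
      std::vector<char> Out(sizeof(Size) + Size);
      std::memcpy(Out.data(), &Size, sizeof(Size));        // length header
      std::memcpy(Out.data() + sizeof(Size), Data, Size);  // committed bytes
      return Out;
    }
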
@@ -827,14 +316,9 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
});
auto CleanupBuffers = at_scope_exit([] {
- if (BQ != nullptr) {
- auto &TLD = getThreadLocalData();
- if (TLD.RecordPtr != nullptr && TLD.BQ != nullptr)
- releaseThreadLocalBuffer(*TLD.BQ);
- BQ->~BufferQueue();
- InternalFree(BQ);
- BQ = nullptr;
- }
+ auto &TLD = getThreadLocalData();
+ if (TLD.Controller != nullptr)
+ TLD.Controller->flush();
});
if (fdrFlags()->no_file_flush) {
@@ -855,16 +339,8 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
// (fixed-sized) and let the tools reading the buffers deal with the data
// afterwards.
//
- int Fd = -1;
- {
- // FIXME: Remove this section of the code, when we remove the struct-based
- // configuration API.
- SpinMutexLock Guard(&FDROptionsMutex);
- Fd = FDROptions.Fd;
- }
- if (Fd == -1)
- Fd = getLogFD();
- if (Fd == -1) {
+ LogWriter *LW = LogWriter::Open();
+ if (LW == nullptr) {
auto Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
atomic_store(&LogFlushStatus, Result, memory_order_release);
return Result;
@@ -872,8 +348,15 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
XRayFileHeader Header = fdrCommonHeaderInfo();
Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()};
- retryingWriteAll(Fd, reinterpret_cast<char *>(&Header),
- reinterpret_cast<char *>(&Header) + sizeof(Header));
+ LW->WriteAll(reinterpret_cast<char *>(&Header),
+ reinterpret_cast<char *>(&Header) + sizeof(Header));
+
+  // Release the current thread's buffer before we attempt to write out all the
+  // buffers. This ensures that even if only a single thread was running, we
+  // still capture its data.
+ auto &TLD = getThreadLocalData();
+ if (TLD.Controller != nullptr)
+ TLD.Controller->flush();
BQ->apply([&](const BufferQueue::Buffer &B) {
// Starting at version 2 of the FDR logging implementation, we only write
@@ -882,18 +365,18 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
// still use a Metadata record, but fill in the extents instead for the
// data.
MetadataRecord ExtentsRecord;
- auto BufferExtents = atomic_load(&B.Extents->Size, memory_order_acquire);
+ auto BufferExtents = atomic_load(B.Extents, memory_order_acquire);
DCHECK(BufferExtents <= B.Size);
ExtentsRecord.Type = uint8_t(RecordType::Metadata);
ExtentsRecord.RecordKind =
uint8_t(MetadataRecord::RecordKinds::BufferExtents);
internal_memcpy(ExtentsRecord.Data, &BufferExtents, sizeof(BufferExtents));
if (BufferExtents > 0) {
- retryingWriteAll(Fd, reinterpret_cast<char *>(&ExtentsRecord),
- reinterpret_cast<char *>(&ExtentsRecord) +
- sizeof(MetadataRecord));
- retryingWriteAll(Fd, reinterpret_cast<char *>(B.Data),
- reinterpret_cast<char *>(B.Data) + BufferExtents);
+ LW->WriteAll(reinterpret_cast<char *>(&ExtentsRecord),
+ reinterpret_cast<char *>(&ExtentsRecord) +
+ sizeof(MetadataRecord));
+ LW->WriteAll(reinterpret_cast<char *>(B.Data),
+ reinterpret_cast<char *>(B.Data) + BufferExtents);
}
});
@@ -914,7 +397,12 @@ XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT {
// Do special things to make the log finalize itself, and not allow any more
// operations to be performed until re-initialized.
- BQ->finalize();
+ if (BQ == nullptr) {
+ if (Verbosity())
+ Report("Attempting to finalize an uninitialized global buffer!\n");
+ } else {
+ BQ->finalize();
+ }
atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED,
memory_order_release);
@@ -935,7 +423,8 @@ static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT {
// Test once for required CPU features
static pthread_once_t OnceProbe = PTHREAD_ONCE_INIT;
static bool TSCSupported = true;
- pthread_once(&OnceProbe, +[] { TSCSupported = probeRequiredCPUFeatures(); });
+ pthread_once(
+ &OnceProbe, +[] { TSCSupported = probeRequiredCPUFeatures(); });
if (TSCSupported) {
Result.TSC = __xray::readTSC(Result.CPU);
@@ -953,16 +442,115 @@ static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT {
return Result;
}
+thread_local atomic_uint8_t Running{0};
+
+static bool setupTLD(ThreadLocalData &TLD) XRAY_NEVER_INSTRUMENT {
+ // Check if we're finalizing, before proceeding.
+ {
+ auto Status = atomic_load(&LoggingStatus, memory_order_acquire);
+ if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
+ Status == XRayLogInitStatus::XRAY_LOG_FINALIZED) {
+ if (TLD.Controller != nullptr) {
+ TLD.Controller->flush();
+ TLD.Controller = nullptr;
+ }
+ return false;
+ }
+ }
+
+ if (UNLIKELY(TLD.Controller == nullptr)) {
+ // Set up the TLD buffer queue.
+ if (UNLIKELY(BQ == nullptr))
+ return false;
+ TLD.BQ = BQ;
+
+ // Check that we have a valid buffer.
+ if (TLD.Buffer.Generation != BQ->generation() &&
+ TLD.BQ->releaseBuffer(TLD.Buffer) != BufferQueue::ErrorCode::Ok)
+ return false;
+
+ // Set up a buffer, before setting up the log writer. Bail out on failure.
+ if (TLD.BQ->getBuffer(TLD.Buffer) != BufferQueue::ErrorCode::Ok)
+ return false;
+
+ // Set up the Log Writer for this thread.
+ if (UNLIKELY(TLD.Writer == nullptr)) {
+ auto *LWStorage = reinterpret_cast<FDRLogWriter *>(&TLD.LWStorage);
+ new (LWStorage) FDRLogWriter(TLD.Buffer);
+ TLD.Writer = LWStorage;
+ } else {
+ TLD.Writer->resetRecord();
+ }
+
+ auto *CStorage = reinterpret_cast<FDRController<> *>(&TLD.CStorage);
+ new (CStorage)
+ FDRController<>(TLD.BQ, TLD.Buffer, *TLD.Writer, clock_gettime,
+ atomic_load_relaxed(&ThresholdTicks));
+ TLD.Controller = CStorage;
+ }
+
+ DCHECK_NE(TLD.Controller, nullptr);
+ return true;
+}
+
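
setupTLD constructs the per-thread writer and controller lazily, placement-new'ing them into aligned storage that lives inside the trivially destructible ThreadLocalData. The same pattern, reduced to a generic sketch (LazySlot is a made-up name, not part of the XRay sources):

    #include <new>
    #include <type_traits>
    #include <utility>

    // Raw storage plus a typed pointer: the object is constructed on first
    // use and never destroyed implicitly, keeping the enclosing struct
    // trivially destructible.
    template <typename T> struct LazySlot {
      typename std::aligned_storage<sizeof(T), alignof(T)>::type Storage;
      T *Ptr = nullptr;

      template <typename... Args> T &getOrCreate(Args &&... A) {
        if (Ptr == nullptr)
          Ptr = new (&Storage) T(std::forward<Args>(A)...);
        return *Ptr;
      }
    };
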
void fdrLoggingHandleArg0(int32_t FuncId,
XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
auto TC = getTimestamp();
- processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, 0, clock_gettime);
+ auto &TSC = TC.TSC;
+ auto &CPU = TC.CPU;
+ RecursionGuard Guard{Running};
+ if (!Guard)
+ return;
+
+ auto &TLD = getThreadLocalData();
+ if (!setupTLD(TLD))
+ return;
+
+ switch (Entry) {
+ case XRayEntryType::ENTRY:
+ case XRayEntryType::LOG_ARGS_ENTRY:
+ TLD.Controller->functionEnter(FuncId, TSC, CPU);
+ return;
+ case XRayEntryType::EXIT:
+ TLD.Controller->functionExit(FuncId, TSC, CPU);
+ return;
+ case XRayEntryType::TAIL:
+ TLD.Controller->functionTailExit(FuncId, TSC, CPU);
+ return;
+ case XRayEntryType::CUSTOM_EVENT:
+ case XRayEntryType::TYPED_EVENT:
+ break;
+ }
}
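
Both handlers gate on RecursionGuard before touching thread-local state; the guard (defined in xray_recursion_guard.h) is essentially a RAII latch over a thread_local atomic flag, so an instrumented signal handler that interrupts the logger cannot re-enter it and corrupt a half-written record. A minimal stand-in for the idea, not the actual class:

    #include <atomic>

    // RAII latch: only the first construction on a given flag "acquires" it.
    class ScopedLatch {
      std::atomic<bool> &Flag;
      bool Acquired;

    public:
      explicit ScopedLatch(std::atomic<bool> &F)
          : Flag(F), Acquired(!F.exchange(true, std::memory_order_acq_rel)) {}
      ~ScopedLatch() {
        if (Acquired)
          Flag.store(false, std::memory_order_release);
      }
      explicit operator bool() const { return Acquired; }
    };

    // Usage in a handler:
    //   thread_local std::atomic<bool> Running{false};
    //   ScopedLatch Guard{Running};
    //   if (!Guard) return;  // already inside the handler on this thread
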
void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType Entry,
uint64_t Arg) XRAY_NEVER_INSTRUMENT {
auto TC = getTimestamp();
- processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, Arg, clock_gettime);
+ auto &TSC = TC.TSC;
+ auto &CPU = TC.CPU;
+ RecursionGuard Guard{Running};
+ if (!Guard)
+ return;
+
+ auto &TLD = getThreadLocalData();
+ if (!setupTLD(TLD))
+ return;
+
+ switch (Entry) {
+ case XRayEntryType::ENTRY:
+ case XRayEntryType::LOG_ARGS_ENTRY:
+ TLD.Controller->functionEnterArg(FuncId, TSC, CPU, Arg);
+ return;
+ case XRayEntryType::EXIT:
+ TLD.Controller->functionExit(FuncId, TSC, CPU);
+ return;
+ case XRayEntryType::TAIL:
+ TLD.Controller->functionTailExit(FuncId, TSC, CPU);
+ return;
+ case XRayEntryType::CUSTOM_EVENT:
+ case XRayEntryType::TYPED_EVENT:
+ break;
+ }
}
void fdrLoggingHandleCustomEvent(void *Event,
@@ -973,40 +561,25 @@ void fdrLoggingHandleCustomEvent(void *Event,
RecursionGuard Guard{Running};
if (!Guard)
return;
- if (EventSize > std::numeric_limits<int32_t>::max()) {
+
+  // Complain (once) if we ever get a custom event that's larger than what we
+  // can possibly support.
+ if (EventSize >
+ static_cast<std::size_t>(std::numeric_limits<int32_t>::max())) {
static pthread_once_t Once = PTHREAD_ONCE_INIT;
- pthread_once(&Once, +[] { Report("Event size too large.\n"); });
+ pthread_once(
+ &Once, +[] {
+ Report("Custom event size too large; truncating to %d.\n",
+ std::numeric_limits<int32_t>::max());
+ });
}
- int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
- auto &TLD = getThreadLocalData();
- if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, clock_gettime))
- return;
- // Here we need to prepare the log to handle:
- // - The metadata record we're going to write. (16 bytes)
- // - The additional data we're going to write. Currently, that's the size
- // of the event we're going to dump into the log as free-form bytes.
- if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) {
- TLD.BQ = nullptr;
+ auto &TLD = getThreadLocalData();
+ if (!setupTLD(TLD))
return;
- }
- // Write the custom event metadata record, which consists of the following
- // information:
- // - 8 bytes (64-bits) for the full TSC when the event started.
- // - 4 bytes (32-bits) for the length of the data.
- MetadataRecord CustomEvent;
- CustomEvent.Type = uint8_t(RecordType::Metadata);
- CustomEvent.RecordKind =
- uint8_t(MetadataRecord::RecordKinds::CustomEventMarker);
- constexpr auto TSCSize = sizeof(TC.TSC);
- internal_memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t));
- internal_memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
- internal_memcpy(TLD.RecordPtr, &CustomEvent, sizeof(CustomEvent));
- TLD.RecordPtr += sizeof(CustomEvent);
- internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize);
- incrementExtents(MetadataRecSize + EventSize);
- endBufferIfFull();
+ int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
+ TLD.Controller->customEvent(TSC, CPU, Event, ReducedEventSize);
}
void fdrLoggingHandleTypedEvent(
@@ -1018,50 +591,28 @@ void fdrLoggingHandleTypedEvent(
RecursionGuard Guard{Running};
if (!Guard)
return;
- if (EventSize > std::numeric_limits<int32_t>::max()) {
+
+  // Complain (once) if we ever get a typed event that's larger than what we
+  // can possibly support.
+ if (EventSize >
+ static_cast<std::size_t>(std::numeric_limits<int32_t>::max())) {
static pthread_once_t Once = PTHREAD_ONCE_INIT;
- pthread_once(&Once, +[] { Report("Event size too large.\n"); });
+ pthread_once(
+ &Once, +[] {
+ Report("Typed event size too large; truncating to %d.\n",
+ std::numeric_limits<int32_t>::max());
+ });
}
- int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
+
auto &TLD = getThreadLocalData();
- if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, clock_gettime))
+ if (!setupTLD(TLD))
return;
- // Here we need to prepare the log to handle:
- // - The metadata record we're going to write. (16 bytes)
- // - The additional data we're going to write. Currently, that's the size
- // of the event we're going to dump into the log as free-form bytes.
- if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) {
- TLD.BQ = nullptr;
- return;
- }
- // Write the custom event metadata record, which consists of the following
- // information:
- // - 8 bytes (64-bits) for the full TSC when the event started.
- // - 4 bytes (32-bits) for the length of the data.
- // - 2 bytes (16-bits) for the event type. 3 bytes remain since one of the
- // bytes has the record type (Metadata Record) and kind (TypedEvent).
- // We'll log the error if the event type is greater than 2 bytes.
- // Event types are generated sequentially, so 2^16 is enough.
- MetadataRecord TypedEvent;
- TypedEvent.Type = uint8_t(RecordType::Metadata);
- TypedEvent.RecordKind =
- uint8_t(MetadataRecord::RecordKinds::TypedEventMarker);
- constexpr auto TSCSize = sizeof(TC.TSC);
- internal_memcpy(&TypedEvent.Data, &ReducedEventSize, sizeof(int32_t));
- internal_memcpy(&TypedEvent.Data[sizeof(int32_t)], &TSC, TSCSize);
- internal_memcpy(&TypedEvent.Data[sizeof(int32_t) + TSCSize], &EventType,
- sizeof(EventType));
- internal_memcpy(TLD.RecordPtr, &TypedEvent, sizeof(TypedEvent));
-
- TLD.RecordPtr += sizeof(TypedEvent);
- internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize);
- incrementExtents(MetadataRecSize + EventSize);
- endBufferIfFull();
+ int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
+ TLD.Controller->typedEvent(TSC, CPU, EventType, Event, ReducedEventSize);
}
-XRayLogInitStatus fdrLoggingInit(size_t BufferSize, size_t BufferMax,
- void *Options,
+XRayLogInitStatus fdrLoggingInit(size_t, size_t, void *Options,
size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
if (Options == nullptr)
return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
@@ -1075,107 +626,81 @@ XRayLogInitStatus fdrLoggingInit(size_t BufferSize, size_t BufferMax,
return static_cast<XRayLogInitStatus>(CurrentStatus);
}
- // Because of __xray_log_init_mode(...) which guarantees that this will be
- // called with BufferSize == 0 and BufferMax == 0 we parse the configuration
- // provided in the Options pointer as a string instead.
- if (BufferSize == 0 && BufferMax == 0) {
- if (Verbosity())
- Report("Initializing FDR mode with options: %s\n",
- static_cast<const char *>(Options));
-
- // TODO: Factor out the flags specific to the FDR mode implementation. For
- // now, use the global/single definition of the flags, since the FDR mode
- // flags are already defined there.
- FlagParser FDRParser;
- FDRFlags FDRFlags;
- registerXRayFDRFlags(&FDRParser, &FDRFlags);
- FDRFlags.setDefaults();
-
- // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided
- // options until we migrate everyone to use the XRAY_FDR_OPTIONS
- // compiler-provided options.
- FDRParser.ParseString(useCompilerDefinedFlags());
- FDRParser.ParseString(useCompilerDefinedFDRFlags());
- auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS");
- if (EnvOpts == nullptr)
- EnvOpts = "";
- FDRParser.ParseString(EnvOpts);
-
- // FIXME: Remove this when we fully remove the deprecated flags.
- if (internal_strlen(EnvOpts) == 0) {
- FDRFlags.func_duration_threshold_us =
- flags()->xray_fdr_log_func_duration_threshold_us;
- FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms;
- }
-
- // The provided options should always override the compiler-provided and
- // environment-variable defined options.
- FDRParser.ParseString(static_cast<const char *>(Options));
- *fdrFlags() = FDRFlags;
- BufferSize = FDRFlags.buffer_size;
- BufferMax = FDRFlags.buffer_max;
- SpinMutexLock Guard(&FDROptionsMutex);
- FDROptions.Fd = -1;
- FDROptions.ReportErrors = true;
- } else if (OptionsSize != sizeof(FDRLoggingOptions)) {
- // FIXME: This is deprecated, and should really be removed.
- // At this point we use the flag parser specific to the FDR mode
- // implementation.
- if (Verbosity())
- Report("Cannot initialize FDR logging; wrong size for options: %d\n",
- OptionsSize);
- return static_cast<XRayLogInitStatus>(
- atomic_load(&LoggingStatus, memory_order_acquire));
- } else {
- if (Verbosity())
- Report("XRay FDR: struct-based init is deprecated, please use "
- "string-based configuration instead.\n");
- SpinMutexLock Guard(&FDROptionsMutex);
- internal_memcpy(&FDROptions, Options, OptionsSize);
- }
-
- bool Success = false;
-
- if (BQ != nullptr) {
- BQ->~BufferQueue();
- InternalFree(BQ);
- BQ = nullptr;
- }
+ if (Verbosity())
+ Report("Initializing FDR mode with options: %s\n",
+ static_cast<const char *>(Options));
+
+ // TODO: Factor out the flags specific to the FDR mode implementation. For
+ // now, use the global/single definition of the flags, since the FDR mode
+ // flags are already defined there.
+ FlagParser FDRParser;
+ FDRFlags FDRFlags;
+ registerXRayFDRFlags(&FDRParser, &FDRFlags);
+ FDRFlags.setDefaults();
+
+ // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided
+ // options until we migrate everyone to use the XRAY_FDR_OPTIONS
+ // compiler-provided options.
+ FDRParser.ParseString(useCompilerDefinedFlags());
+ FDRParser.ParseString(useCompilerDefinedFDRFlags());
+ auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS");
+ if (EnvOpts == nullptr)
+ EnvOpts = "";
+ FDRParser.ParseString(EnvOpts);
+
+ // FIXME: Remove this when we fully remove the deprecated flags.
+ if (internal_strlen(EnvOpts) == 0) {
+ FDRFlags.func_duration_threshold_us =
+ flags()->xray_fdr_log_func_duration_threshold_us;
+ FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms;
+ }
+
+ // The provided options should always override the compiler-provided and
+ // environment-variable defined options.
+ FDRParser.ParseString(static_cast<const char *>(Options));
+ *fdrFlags() = FDRFlags;
+ auto BufferSize = FDRFlags.buffer_size;
+ auto BufferMax = FDRFlags.buffer_max;
if (BQ == nullptr) {
- BQ = reinterpret_cast<BufferQueue *>(
- InternalAlloc(sizeof(BufferQueue), nullptr, 64));
+ bool Success = false;
+ BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
new (BQ) BufferQueue(BufferSize, BufferMax, Success);
- }
-
- if (!Success) {
- Report("BufferQueue init failed.\n");
- if (BQ != nullptr) {
- BQ->~BufferQueue();
- InternalFree(BQ);
- BQ = nullptr;
+ if (!Success) {
+ Report("BufferQueue init failed.\n");
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+ }
+ } else {
+ if (BQ->init(BufferSize, BufferMax) != BufferQueue::ErrorCode::Ok) {
+ if (Verbosity())
+ Report("Failed to re-initialize global buffer queue. Init failed.\n");
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
}
- return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
}
static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
- pthread_once(&OnceInit, +[] {
- atomic_store(&TicksPerSec,
- probeRequiredCPUFeatures() ? getTSCFrequency()
- : __xray::NanosecondsPerSecond,
- memory_order_release);
- pthread_key_create(&Key, +[](void *TLDPtr) {
- if (TLDPtr == nullptr)
- return;
- auto &TLD = *reinterpret_cast<ThreadLocalData *>(TLDPtr);
- if (TLD.BQ == nullptr)
- return;
- auto EC = TLD.BQ->releaseBuffer(TLD.Buffer);
- if (EC != BufferQueue::ErrorCode::Ok)
- Report("At thread exit, failed to release buffer at %p; error=%s\n",
- TLD.Buffer.Data, BufferQueue::getErrorString(EC));
- });
- });
+ pthread_once(
+ &OnceInit, +[] {
+ atomic_store(&TicksPerSec,
+ probeRequiredCPUFeatures() ? getTSCFrequency()
+ : __xray::NanosecondsPerSecond,
+ memory_order_release);
+ pthread_key_create(
+ &Key, +[](void *TLDPtr) {
+ if (TLDPtr == nullptr)
+ return;
+ auto &TLD = *reinterpret_cast<ThreadLocalData *>(TLDPtr);
+ if (TLD.BQ == nullptr)
+ return;
+ if (TLD.Buffer.Data == nullptr)
+ return;
+ auto EC = TLD.BQ->releaseBuffer(TLD.Buffer);
+ if (EC != BufferQueue::ErrorCode::Ok)
+ Report("At thread exit, failed to release buffer at %p; "
+ "error=%s\n",
+ TLD.Buffer.Data, BufferQueue::getErrorString(EC));
+ });
+ });
atomic_store(&ThresholdTicks,
atomic_load_relaxed(&TicksPerSec) *
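
The pthread_once/pthread_key_create pair above registers a per-thread destructor that releases the thread's buffer at exit, instead of relying on C++ thread_local destructors (which could call back into the allocator from instrumented code, as the earlier comment on getThreadLocalData explains). A small sketch of the mechanism with hypothetical names:

    #include <pthread.h>
    #include <cstdio>

    struct PerThreadState { /* e.g. a buffer to hand back to a queue */ };

    static pthread_key_t DemoKey;
    static pthread_once_t DemoOnce = PTHREAD_ONCE_INIT;

    static void initKey() {
      // The destructor runs at thread exit for every thread that stored a
      // value for the key via pthread_setspecific.
      pthread_key_create(&DemoKey, +[](void *P) {
        auto *State = static_cast<PerThreadState *>(P);
        if (State != nullptr)
          std::fputs("releasing per-thread state\n", stderr);
      });
    }

    static void touchPerThreadState(PerThreadState &S) {
      pthread_once(&DemoOnce, initKey);
      pthread_setspecific(DemoKey, &S); // arm the destructor for this thread
    }
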
@@ -1209,11 +734,22 @@ bool fdrLogDynamicInitializer() XRAY_NEVER_INSTRUMENT {
};
auto RegistrationResult = __xray_log_register_mode("xray-fdr", Impl);
if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK &&
- Verbosity())
+ Verbosity()) {
Report("Cannot register XRay FDR mode to 'xray-fdr'; error = %d\n",
RegistrationResult);
- if (flags()->xray_fdr_log || !internal_strcmp(flags()->xray_mode, "xray-fdr"))
- __xray_set_log_impl(Impl);
+ return false;
+ }
+
+ if (flags()->xray_fdr_log ||
+ !internal_strcmp(flags()->xray_mode, "xray-fdr")) {
+ auto SelectResult = __xray_log_select_mode("xray-fdr");
+ if (SelectResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK &&
+ Verbosity()) {
+ Report("Cannot select XRay FDR mode as 'xray-fdr'; error = %d\n",
+ SelectResult);
+ return false;
+ }
+ }
return true;
}
diff --git a/contrib/compiler-rt/lib/xray/xray_function_call_trie.h b/contrib/compiler-rt/lib/xray/xray_function_call_trie.h
index 2acf14aa5625..d01ad20e3d71 100644
--- a/contrib/compiler-rt/lib/xray/xray_function_call_trie.h
+++ b/contrib/compiler-rt/lib/xray/xray_function_call_trie.h
@@ -15,9 +15,11 @@
#ifndef XRAY_FUNCTION_CALL_TRIE_H
#define XRAY_FUNCTION_CALL_TRIE_H
-#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "xray_buffer_queue.h"
+#include "xray_defs.h"
#include "xray_profiling_flags.h"
#include "xray_segmented_array.h"
+#include <limits>
#include <memory> // For placement new.
#include <utility>
@@ -97,9 +99,6 @@ public:
struct NodeIdPair {
Node *NodePtr;
int32_t FId;
-
- // Constructor for inplace-construction.
- NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {}
};
using NodeIdPairArray = Array<NodeIdPair>;
@@ -113,17 +112,10 @@ public:
struct Node {
Node *Parent;
NodeIdPairArray Callees;
- int64_t CallCount;
- int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
+ uint64_t CallCount;
+ uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time.
int32_t FId;
- // We add a constructor here to allow us to inplace-construct through
- // Array<...>'s AppendEmplace.
- Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT,
- int32_t F)
- : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT),
- FId(F) {}
-
// TODO: Include the compact histogram.
};
@@ -131,10 +123,7 @@ private:
struct ShadowStackEntry {
uint64_t EntryTSC;
Node *NodePtr;
-
- // We add a constructor here to allow us to inplace-construct through
- // Array<...>'s AppendEmplace.
- ShadowStackEntry(uint64_t T, Node *N) : EntryTSC{T}, NodePtr{N} {}
+ uint16_t EntryCPU;
};
using NodeArray = Array<Node>;
@@ -149,103 +138,184 @@ public:
using RootAllocatorType = RootArray::AllocatorType;
using ShadowStackAllocatorType = ShadowStackArray::AllocatorType;
+ // Use hosted aligned storage members to allow for trivial move and init.
+ // This also allows us to sidestep the potential-failing allocation issue.
+ typename std::aligned_storage<sizeof(NodeAllocatorType),
+ alignof(NodeAllocatorType)>::type
+ NodeAllocatorStorage;
+ typename std::aligned_storage<sizeof(RootAllocatorType),
+ alignof(RootAllocatorType)>::type
+ RootAllocatorStorage;
+ typename std::aligned_storage<sizeof(ShadowStackAllocatorType),
+ alignof(ShadowStackAllocatorType)>::type
+ ShadowStackAllocatorStorage;
+ typename std::aligned_storage<sizeof(NodeIdPairAllocatorType),
+ alignof(NodeIdPairAllocatorType)>::type
+ NodeIdPairAllocatorStorage;
+
NodeAllocatorType *NodeAllocator = nullptr;
RootAllocatorType *RootAllocator = nullptr;
ShadowStackAllocatorType *ShadowStackAllocator = nullptr;
NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
- Allocators() {}
+ Allocators() = default;
Allocators(const Allocators &) = delete;
Allocators &operator=(const Allocators &) = delete;
- Allocators(Allocators &&O)
- : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator),
- ShadowStackAllocator(O.ShadowStackAllocator),
- NodeIdPairAllocator(O.NodeIdPairAllocator) {
+ struct Buffers {
+ BufferQueue::Buffer NodeBuffer;
+ BufferQueue::Buffer RootsBuffer;
+ BufferQueue::Buffer ShadowStackBuffer;
+ BufferQueue::Buffer NodeIdPairBuffer;
+ };
+
+ explicit Allocators(Buffers &B) XRAY_NEVER_INSTRUMENT {
+ new (&NodeAllocatorStorage)
+ NodeAllocatorType(B.NodeBuffer.Data, B.NodeBuffer.Size);
+ NodeAllocator =
+ reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
+
+ new (&RootAllocatorStorage)
+ RootAllocatorType(B.RootsBuffer.Data, B.RootsBuffer.Size);
+ RootAllocator =
+ reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
+
+ new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(
+ B.ShadowStackBuffer.Data, B.ShadowStackBuffer.Size);
+ ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
+ &ShadowStackAllocatorStorage);
+
+ new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(
+ B.NodeIdPairBuffer.Data, B.NodeIdPairBuffer.Size);
+ NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
+ &NodeIdPairAllocatorStorage);
+ }
+
+ explicit Allocators(uptr Max) XRAY_NEVER_INSTRUMENT {
+ new (&NodeAllocatorStorage) NodeAllocatorType(Max);
+ NodeAllocator =
+ reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
+
+ new (&RootAllocatorStorage) RootAllocatorType(Max);
+ RootAllocator =
+ reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
+
+ new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(Max);
+ ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
+ &ShadowStackAllocatorStorage);
+
+ new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(Max);
+ NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
+ &NodeIdPairAllocatorStorage);
+ }
+
+ Allocators(Allocators &&O) XRAY_NEVER_INSTRUMENT {
+ // Here we rely on the safety of memcpy'ing contents of the storage
+ // members, and then pointing the source pointers to nullptr.
+ internal_memcpy(&NodeAllocatorStorage, &O.NodeAllocatorStorage,
+ sizeof(NodeAllocatorType));
+ internal_memcpy(&RootAllocatorStorage, &O.RootAllocatorStorage,
+ sizeof(RootAllocatorType));
+ internal_memcpy(&ShadowStackAllocatorStorage,
+ &O.ShadowStackAllocatorStorage,
+ sizeof(ShadowStackAllocatorType));
+ internal_memcpy(&NodeIdPairAllocatorStorage,
+ &O.NodeIdPairAllocatorStorage,
+ sizeof(NodeIdPairAllocatorType));
+
+ NodeAllocator =
+ reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
+ RootAllocator =
+ reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
+ ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
+ &ShadowStackAllocatorStorage);
+ NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
+ &NodeIdPairAllocatorStorage);
+
O.NodeAllocator = nullptr;
O.RootAllocator = nullptr;
O.ShadowStackAllocator = nullptr;
O.NodeIdPairAllocator = nullptr;
}
- Allocators &operator=(Allocators &&O) {
- {
- auto Tmp = O.NodeAllocator;
- O.NodeAllocator = this->NodeAllocator;
- this->NodeAllocator = Tmp;
- }
- {
- auto Tmp = O.RootAllocator;
- O.RootAllocator = this->RootAllocator;
- this->RootAllocator = Tmp;
- }
- {
- auto Tmp = O.ShadowStackAllocator;
- O.ShadowStackAllocator = this->ShadowStackAllocator;
- this->ShadowStackAllocator = Tmp;
- }
- {
- auto Tmp = O.NodeIdPairAllocator;
- O.NodeIdPairAllocator = this->NodeIdPairAllocator;
- this->NodeIdPairAllocator = Tmp;
- }
- return *this;
- }
-
- ~Allocators() {
- // Note that we cannot use delete on these pointers, as they need to be
- // returned to the sanitizer_common library's internal memory tracking
- // system.
- if (NodeAllocator != nullptr) {
+ Allocators &operator=(Allocators &&O) XRAY_NEVER_INSTRUMENT {
+ // When moving into an existing instance, we ensure that we clean up the
+ // current allocators.
+ if (NodeAllocator)
NodeAllocator->~NodeAllocatorType();
- InternalFree(NodeAllocator);
+ if (O.NodeAllocator) {
+ new (&NodeAllocatorStorage)
+ NodeAllocatorType(std::move(*O.NodeAllocator));
+ NodeAllocator =
+ reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage);
+ O.NodeAllocator = nullptr;
+ } else {
NodeAllocator = nullptr;
}
- if (RootAllocator != nullptr) {
+
+ if (RootAllocator)
RootAllocator->~RootAllocatorType();
- InternalFree(RootAllocator);
+ if (O.RootAllocator) {
+ new (&RootAllocatorStorage)
+ RootAllocatorType(std::move(*O.RootAllocator));
+ RootAllocator =
+ reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage);
+ O.RootAllocator = nullptr;
+ } else {
RootAllocator = nullptr;
}
- if (ShadowStackAllocator != nullptr) {
+
+ if (ShadowStackAllocator)
ShadowStackAllocator->~ShadowStackAllocatorType();
- InternalFree(ShadowStackAllocator);
+ if (O.ShadowStackAllocator) {
+ new (&ShadowStackAllocatorStorage)
+ ShadowStackAllocatorType(std::move(*O.ShadowStackAllocator));
+ ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>(
+ &ShadowStackAllocatorStorage);
+ O.ShadowStackAllocator = nullptr;
+ } else {
ShadowStackAllocator = nullptr;
}
- if (NodeIdPairAllocator != nullptr) {
+
+ if (NodeIdPairAllocator)
NodeIdPairAllocator->~NodeIdPairAllocatorType();
- InternalFree(NodeIdPairAllocator);
+ if (O.NodeIdPairAllocator) {
+ new (&NodeIdPairAllocatorStorage)
+ NodeIdPairAllocatorType(std::move(*O.NodeIdPairAllocator));
+ NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
+ &NodeIdPairAllocatorStorage);
+ O.NodeIdPairAllocator = nullptr;
+ } else {
NodeIdPairAllocator = nullptr;
}
+
+ return *this;
+ }
+
+ ~Allocators() XRAY_NEVER_INSTRUMENT {
+ if (NodeAllocator != nullptr)
+ NodeAllocator->~NodeAllocatorType();
+ if (RootAllocator != nullptr)
+ RootAllocator->~RootAllocatorType();
+ if (ShadowStackAllocator != nullptr)
+ ShadowStackAllocator->~ShadowStackAllocatorType();
+ if (NodeIdPairAllocator != nullptr)
+ NodeIdPairAllocator->~NodeIdPairAllocatorType();
}
};
- // TODO: Support configuration of options through the arguments.
- static Allocators InitAllocators() {
+ static Allocators InitAllocators() XRAY_NEVER_INSTRUMENT {
return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max);
}
- static Allocators InitAllocatorsCustom(uptr Max) {
- Allocators A;
- auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>(
- InternalAlloc(sizeof(Allocators::NodeAllocatorType)));
- new (NodeAllocator) Allocators::NodeAllocatorType(Max);
- A.NodeAllocator = NodeAllocator;
-
- auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>(
- InternalAlloc(sizeof(Allocators::RootAllocatorType)));
- new (RootAllocator) Allocators::RootAllocatorType(Max);
- A.RootAllocator = RootAllocator;
-
- auto ShadowStackAllocator =
- reinterpret_cast<Allocators::ShadowStackAllocatorType *>(
- InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType)));
- new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max);
- A.ShadowStackAllocator = ShadowStackAllocator;
-
- auto NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>(
- InternalAlloc(sizeof(NodeIdPairAllocatorType)));
- new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max);
- A.NodeIdPairAllocator = NodeIdPairAllocator;
+ static Allocators InitAllocatorsCustom(uptr Max) XRAY_NEVER_INSTRUMENT {
+ Allocators A(Max);
+ return A;
+ }
+
+ static Allocators
+ InitAllocatorsFromBuffers(Allocators::Buffers &Bufs) XRAY_NEVER_INSTRUMENT {
+ Allocators A(Bufs);
return A;
}
@@ -253,65 +323,135 @@ private:
NodeArray Nodes;
RootArray Roots;
ShadowStackArray ShadowStack;
- NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr;
+ NodeIdPairAllocatorType *NodeIdPairAllocator;
+ uint32_t OverflowedFunctions;
public:
- explicit FunctionCallTrie(const Allocators &A)
- : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator),
+ explicit FunctionCallTrie(const Allocators &A) XRAY_NEVER_INSTRUMENT
+ : Nodes(*A.NodeAllocator),
+ Roots(*A.RootAllocator),
ShadowStack(*A.ShadowStackAllocator),
- NodeIdPairAllocator(A.NodeIdPairAllocator) {}
+ NodeIdPairAllocator(A.NodeIdPairAllocator),
+ OverflowedFunctions(0) {}
+
+ FunctionCallTrie() = delete;
+ FunctionCallTrie(const FunctionCallTrie &) = delete;
+ FunctionCallTrie &operator=(const FunctionCallTrie &) = delete;
+
+ FunctionCallTrie(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT
+ : Nodes(std::move(O.Nodes)),
+ Roots(std::move(O.Roots)),
+ ShadowStack(std::move(O.ShadowStack)),
+ NodeIdPairAllocator(O.NodeIdPairAllocator),
+ OverflowedFunctions(O.OverflowedFunctions) {}
+
+ FunctionCallTrie &operator=(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT {
+ Nodes = std::move(O.Nodes);
+ Roots = std::move(O.Roots);
+ ShadowStack = std::move(O.ShadowStack);
+ NodeIdPairAllocator = O.NodeIdPairAllocator;
+ OverflowedFunctions = O.OverflowedFunctions;
+ return *this;
+ }
+
+ ~FunctionCallTrie() XRAY_NEVER_INSTRUMENT {}
- void enterFunction(const int32_t FId, uint64_t TSC) {
+ void enterFunction(const int32_t FId, uint64_t TSC,
+ uint16_t CPU) XRAY_NEVER_INSTRUMENT {
DCHECK_NE(FId, 0);
- // This function primarily deals with ensuring that the ShadowStack is
- // consistent and ready for when an exit event is encountered.
+
+ // If we've already overflowed the function call stack, do not bother
+ // attempting to record any more function entries.
+ if (UNLIKELY(OverflowedFunctions)) {
+ ++OverflowedFunctions;
+ return;
+ }
+
+ // If this is the first function we've encountered, we want to set up the
+ // node(s) and treat it as a root.
if (UNLIKELY(ShadowStack.empty())) {
- auto NewRoot =
- Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId);
+ auto *NewRoot = Nodes.AppendEmplace(
+ nullptr, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
if (UNLIKELY(NewRoot == nullptr))
return;
- Roots.Append(NewRoot);
- ShadowStack.AppendEmplace(TSC, NewRoot);
+ if (Roots.AppendEmplace(NewRoot) == nullptr) {
+ Nodes.trim(1);
+ return;
+ }
+ if (ShadowStack.AppendEmplace(TSC, NewRoot, CPU) == nullptr) {
+ Nodes.trim(1);
+ Roots.trim(1);
+ ++OverflowedFunctions;
+ return;
+ }
return;
}
- auto &Top = ShadowStack.back();
- auto TopNode = Top.NodePtr;
+ // From this point on, we require that the stack is not empty.
+ DCHECK(!ShadowStack.empty());
+ auto TopNode = ShadowStack.back().NodePtr;
DCHECK_NE(TopNode, nullptr);
- // If we've seen this callee before, then we just access that node and place
- // that on the top of the stack.
- auto Callee = TopNode->Callees.find_element(
+ // If we've seen this callee before, then we access that node and place that
+ // on the top of the stack.
+ auto* Callee = TopNode->Callees.find_element(
[FId](const NodeIdPair &NR) { return NR.FId == FId; });
if (Callee != nullptr) {
CHECK_NE(Callee->NodePtr, nullptr);
- ShadowStack.AppendEmplace(TSC, Callee->NodePtr);
+ if (ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU) == nullptr)
+ ++OverflowedFunctions;
return;
}
// This means we've never seen this stack before, create a new node here.
- auto NewNode =
- Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId);
+ auto* NewNode = Nodes.AppendEmplace(
+ TopNode, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId);
if (UNLIKELY(NewNode == nullptr))
return;
DCHECK_NE(NewNode, nullptr);
TopNode->Callees.AppendEmplace(NewNode, FId);
- ShadowStack.AppendEmplace(TSC, NewNode);
- DCHECK_NE(ShadowStack.back().NodePtr, nullptr);
+ if (ShadowStack.AppendEmplace(TSC, NewNode, CPU) == nullptr)
+ ++OverflowedFunctions;
return;
}
- void exitFunction(int32_t FId, uint64_t TSC) {
+ void exitFunction(int32_t FId, uint64_t TSC,
+ uint16_t CPU) XRAY_NEVER_INSTRUMENT {
+ // If we're exiting functions that have "overflowed" or don't fit into the
+ // stack due to allocator constraints, we then decrement that count first.
+ if (OverflowedFunctions) {
+ --OverflowedFunctions;
+ return;
+ }
+
// When we exit a function, we look up the ShadowStack to see whether we've
// entered this function before. We do as little processing here as we can,
// since most of the hard work would have already been done at function
// entry.
uint64_t CumulativeTreeTime = 0;
+
while (!ShadowStack.empty()) {
const auto &Top = ShadowStack.back();
auto TopNode = Top.NodePtr;
DCHECK_NE(TopNode, nullptr);
- auto LocalTime = TSC - Top.EntryTSC;
+
+ // We may encounter overflow on the TSC we're provided, which may end up
+ // being less than the TSC when we first entered the function.
+ //
+ // To get the accurate measurement of cycles, we need to check whether
+ // we've overflowed (TSC < Top.EntryTSC) and then account the difference
+ // between the entry TSC and the max for the TSC counter (max of uint64_t)
+ // then add the value of TSC. We can prove that the maximum delta we will
+ // get is at most the 64-bit unsigned value, since the difference between
+ // a TSC of 0 and a Top.EntryTSC of 1 is (numeric_limits<uint64_t>::max()
+ // - 1) + 1.
+ //
+ // NOTE: This assumes that TSCs are synchronised across CPUs.
+ // TODO: Count the number of times we've seen CPU migrations.
+ uint64_t LocalTime =
+ Top.EntryTSC > TSC
+ ? (std::numeric_limits<uint64_t>::max() - Top.EntryTSC) + TSC
+ : TSC - Top.EntryTSC;
TopNode->CallCount++;
TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime;
CumulativeTreeTime += LocalTime;
@@ -323,7 +463,7 @@ public:
}
}
- const RootArray &getRoots() const { return Roots; }
+ const RootArray &getRoots() const XRAY_NEVER_INSTRUMENT { return Roots; }
// The deepCopyInto operation will update the provided FunctionCallTrie by
// re-creating the contents of this particular FunctionCallTrie in the other
@@ -338,7 +478,7 @@ public:
// synchronisation of both "this" and |O|.
//
// This function must *not* be called with a non-empty FunctionCallTrie |O|.
- void deepCopyInto(FunctionCallTrie &O) const {
+ void deepCopyInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT {
DCHECK(O.getRoots().empty());
// We then push the root into a stack, to use as the parent marker for new
@@ -356,18 +496,20 @@ public:
for (const auto Root : getRoots()) {
// Add a node in O for this root.
auto NewRoot = O.Nodes.AppendEmplace(
- nullptr, *O.NodeIdPairAllocator, Root->CallCount,
+ nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), Root->CallCount,
Root->CumulativeLocalTime, Root->FId);
// Because we cannot allocate more memory we should bail out right away.
if (UNLIKELY(NewRoot == nullptr))
return;
- O.Roots.Append(NewRoot);
+ if (UNLIKELY(O.Roots.Append(NewRoot) == nullptr))
+ return;
// TODO: Figure out what to do if we fail to allocate any more stack
// space. Maybe warn or report once?
- DFSStack.AppendEmplace(Root, NewRoot);
+ if (DFSStack.AppendEmplace(Root, NewRoot) == nullptr)
+ return;
while (!DFSStack.empty()) {
NodeAndParent NP = DFSStack.back();
DCHECK_NE(NP.Node, nullptr);
@@ -375,12 +517,17 @@ public:
DFSStack.trim(1);
for (const auto Callee : NP.Node->Callees) {
auto NewNode = O.Nodes.AppendEmplace(
- NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount,
- Callee.NodePtr->CumulativeLocalTime, Callee.FId);
+ NP.NewNode, NodeIdPairArray(*O.NodeIdPairAllocator),
+ Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime,
+ Callee.FId);
if (UNLIKELY(NewNode == nullptr))
return;
- NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId);
- DFSStack.AppendEmplace(Callee.NodePtr, NewNode);
+ if (UNLIKELY(NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId) ==
+ nullptr))
+ return;
+ if (UNLIKELY(DFSStack.AppendEmplace(Callee.NodePtr, NewNode) ==
+ nullptr))
+ return;
}
}
}
@@ -394,7 +541,7 @@ public:
//
// This function is *not* thread-safe, and may require external
// synchronisation of both "this" and |O|.
- void mergeInto(FunctionCallTrie &O) const {
+ void mergeInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT {
struct NodeAndTarget {
FunctionCallTrie::Node *OrigNode;
FunctionCallTrie::Node *TargetNode;
@@ -409,8 +556,9 @@ public:
auto R = O.Roots.find_element(
[&](const Node *Node) { return Node->FId == Root->FId; });
if (R == nullptr) {
- TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0,
- 0, Root->FId);
+ TargetRoot = O.Nodes.AppendEmplace(
+ nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u,
+ Root->FId);
if (UNLIKELY(TargetRoot == nullptr))
return;
@@ -419,7 +567,7 @@ public:
TargetRoot = *R;
}
- DFSStack.Append(NodeAndTarget{Root, TargetRoot});
+ DFSStack.AppendEmplace(Root, TargetRoot);
while (!DFSStack.empty()) {
NodeAndTarget NT = DFSStack.back();
DCHECK_NE(NT.OrigNode, nullptr);
@@ -435,7 +583,8 @@ public:
});
if (TargetCallee == nullptr) {
auto NewTargetNode = O.Nodes.AppendEmplace(
- NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId);
+ NT.TargetNode, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u,
+ Callee.FId);
if (UNLIKELY(NewTargetNode == nullptr))
return;
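
[Editor's note] The wraparound-aware timing in exitFunction above boils down to one expression. The sketch below is illustrative only (the helper name is not part of the patch); it mirrors the ternary in the hunk, which treats an exit TSC smaller than the entry TSC as a single wrap of the 64-bit counter:

#include <cstdint>
#include <limits>

// Cycles elapsed between entry and exit readings of a free-running 64-bit TSC
// that may have wrapped once in between; mirrors the computation in
// FunctionCallTrie::exitFunction above.
static inline uint64_t cyclesBetween(uint64_t EntryTSC, uint64_t ExitTSC) {
  return EntryTSC > ExitTSC
             ? (std::numeric_limits<uint64_t>::max() - EntryTSC) + ExitTSC
             : ExitTSC - EntryTSC;
}

Note that plain unsigned subtraction (ExitTSC - EntryTSC) already wraps modulo 2^64 and would yield a value one cycle larger in the wrapped case; the explicit branch keeps the intent visible and matches the accounting described in the comment.
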
diff --git a/contrib/compiler-rt/lib/xray/xray_init.cc b/contrib/compiler-rt/lib/xray/xray_init.cc
index b4e069795195..b0922aa8e379 100644
--- a/contrib/compiler-rt/lib/xray/xray_init.cc
+++ b/contrib/compiler-rt/lib/xray/xray_init.cc
@@ -27,6 +27,15 @@ extern const XRaySledEntry __start_xray_instr_map[] __attribute__((weak));
extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak));
extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak));
extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak));
+
+#if SANITIZER_MAC
+// HACK: This is a temporary workaround to make XRay build on
+// Darwin, but it will probably not work at runtime.
+const XRaySledEntry __start_xray_instr_map[] = {};
+extern const XRaySledEntry __stop_xray_instr_map[] = {};
+extern const XRayFunctionSledIndex __start_xray_fn_idx[] = {};
+extern const XRayFunctionSledIndex __stop_xray_fn_idx[] = {};
+#endif
}
using namespace __xray;
@@ -58,6 +67,9 @@ void __xray_init() XRAY_NEVER_INSTRUMENT {
if (atomic_load(&XRayInitialized, memory_order_acquire))
return;
+ // XRAY is not compatible with PaX MPROTECT
+ CheckMPROTECT();
+
if (!atomic_load(&XRayFlagsInitialized, memory_order_acquire)) {
initializeFlags();
atomic_store(&XRayFlagsInitialized, true, memory_order_release);
@@ -97,8 +109,8 @@ __attribute__((section(".preinit_array"),
#else
// If we cannot use the .preinit_array section, we should instead use dynamic
// initialisation.
-static bool UNUSED __local_xray_dyninit = [] {
+__attribute__ ((constructor (0)))
+static void __local_xray_dyninit() {
__xray_init();
- return true;
-}();
+}
#endif
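
[Editor's note] For platforms without .preinit_array support, the hunk above replaces a lambda-initialised static with a priority-0 constructor so that __xray_init() runs during early dynamic initialisation without materialising an unused global. A minimal standalone sketch of the attribute follows (names are illustrative; priorities 0 to 100 are conventionally reserved for the implementation, which is why the runtime itself may use 0 while ordinary code should use 101 or higher):

#include <cstdio>

// Runs before main(), ahead of constructors with larger priority values.
__attribute__((constructor(101))) static void earlyInit() {
  std::puts("early dynamic initialisation");
}

int main() { std::puts("main"); }
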
diff --git a/contrib/compiler-rt/lib/xray/xray_interface.cc b/contrib/compiler-rt/lib/xray/xray_interface.cc
index 01bf6ddc607e..6f7b6615b2c0 100644
--- a/contrib/compiler-rt/lib/xray/xray_interface.cc
+++ b/contrib/compiler-rt/lib/xray/xray_interface.cc
@@ -22,6 +22,13 @@
#include <string.h>
#include <sys/mman.h>
+#if SANITIZER_FUCHSIA
+#include <zircon/process.h>
+#include <zircon/sanitizer.h>
+#include <zircon/status.h>
+#include <zircon/syscalls.h>
+#endif
+
#include "sanitizer_common/sanitizer_addrhashmap.h"
#include "sanitizer_common/sanitizer_common.h"
@@ -92,22 +99,48 @@ class MProtectHelper {
public:
explicit MProtectHelper(void *PageAlignedAddr,
- std::size_t MProtectLen) XRAY_NEVER_INSTRUMENT
+ std::size_t MProtectLen,
+ std::size_t PageSize) XRAY_NEVER_INSTRUMENT
: PageAlignedAddr(PageAlignedAddr),
MProtectLen(MProtectLen),
- MustCleanup(false) {}
+ MustCleanup(false) {
+#if SANITIZER_FUCHSIA
+ MProtectLen = RoundUpTo(MProtectLen, PageSize);
+#endif
+ }
int MakeWriteable() XRAY_NEVER_INSTRUMENT {
+#if SANITIZER_FUCHSIA
+ auto R = __sanitizer_change_code_protection(
+ reinterpret_cast<uintptr_t>(PageAlignedAddr), MProtectLen, true);
+ if (R != ZX_OK) {
+ Report("XRay: cannot change code protection: %s\n",
+ _zx_status_get_string(R));
+ return -1;
+ }
+ MustCleanup = true;
+ return 0;
+#else
auto R = mprotect(PageAlignedAddr, MProtectLen,
PROT_READ | PROT_WRITE | PROT_EXEC);
if (R != -1)
MustCleanup = true;
return R;
+#endif
}
~MProtectHelper() XRAY_NEVER_INSTRUMENT {
if (MustCleanup) {
+#if SANITIZER_FUCHSIA
+ auto R = __sanitizer_change_code_protection(
+ reinterpret_cast<uintptr_t>(PageAlignedAddr), MProtectLen, false);
+ if (R != ZX_OK) {
+ Report("XRay: cannot change code protection: %s\n",
+ _zx_status_get_string(R));
+ }
+#else
mprotect(PageAlignedAddr, MProtectLen, PROT_READ | PROT_EXEC);
+#endif
}
}
};
@@ -254,7 +287,7 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT {
reinterpret_cast<void *>(MinSled.Address & ~(PageSize - 1));
size_t MProtectLen =
(MaxSled.Address - reinterpret_cast<uptr>(PageAlignedAddr)) + cSledLength;
- MProtectHelper Protector(PageAlignedAddr, MProtectLen);
+ MProtectHelper Protector(PageAlignedAddr, MProtectLen, PageSize);
if (Protector.MakeWriteable() == -1) {
Report("Failed mprotect: %d\n", errno);
return XRayPatchingStatus::FAILED;
@@ -319,7 +352,7 @@ XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId,
reinterpret_cast<void *>(MinSled.Address & ~(PageSize - 1));
size_t MProtectLen =
(MaxSled.Address - reinterpret_cast<uptr>(PageAlignedAddr)) + cSledLength;
- MProtectHelper Protector(PageAlignedAddr, MProtectLen);
+ MProtectHelper Protector(PageAlignedAddr, MProtectLen, PageSize);
if (Protector.MakeWriteable() == -1) {
Report("Failed mprotect: %d\n", errno);
return XRayPatchingStatus::FAILED;
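
[Editor's note] MProtectHelper above is an RAII guard: MakeWriteable() flips the sled pages to read/write/execute for the duration of patching, and the destructor restores read/execute (on Fuchsia the same is done through __sanitizer_change_code_protection, with the length rounded up to a page multiple). A reduced POSIX-only sketch of that shape, with hypothetical names:

#include <cstddef>
#include <sys/mman.h>

class ScopedCodeWritable {
  void *Addr;              // page-aligned start of the region to patch
  std::size_t Len;         // length of the region, in bytes
  bool MustRestore = false;

public:
  ScopedCodeWritable(void *PageAlignedAddr, std::size_t Length)
      : Addr(PageAlignedAddr), Len(Length) {}

  // Returns 0 on success, -1 on failure (errno is set by mprotect).
  int makeWritable() {
    int R = mprotect(Addr, Len, PROT_READ | PROT_WRITE | PROT_EXEC);
    if (R == 0)
      MustRestore = true;
    return R;
  }

  ~ScopedCodeWritable() {
    if (MustRestore)
      mprotect(Addr, Len, PROT_READ | PROT_EXEC);
  }
};
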
diff --git a/contrib/compiler-rt/lib/xray/xray_profile_collector.cc b/contrib/compiler-rt/lib/xray/xray_profile_collector.cc
index 17a611eeacb8..dc3a82069840 100644
--- a/contrib/compiler-rt/lib/xray/xray_profile_collector.cc
+++ b/contrib/compiler-rt/lib/xray/xray_profile_collector.cc
@@ -13,10 +13,11 @@
//
//===----------------------------------------------------------------------===//
#include "xray_profile_collector.h"
-#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_vector.h"
+#include "xray_allocator.h"
+#include "xray_defs.h"
#include "xray_profiling_flags.h"
+#include "xray_segmented_array.h"
#include <memory>
#include <pthread.h>
#include <utility>
@@ -29,7 +30,7 @@ namespace {
SpinMutex GlobalMutex;
struct ThreadTrie {
tid_t TId;
- FunctionCallTrie *Trie;
+ typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
};
struct ProfileBuffer {
@@ -56,65 +57,91 @@ struct BlockHeader {
u64 ThreadId;
};
-// These need to be pointers that point to heap/internal-allocator-allocated
-// objects because these are accessed even at program exit.
-Vector<ThreadTrie> *ThreadTries = nullptr;
-Vector<ProfileBuffer> *ProfileBuffers = nullptr;
-FunctionCallTrie::Allocators *GlobalAllocators = nullptr;
+struct ThreadData {
+ BufferQueue *BQ;
+ FunctionCallTrie::Allocators::Buffers Buffers;
+ FunctionCallTrie::Allocators Allocators;
+ FunctionCallTrie FCT;
+ tid_t TId;
+};
+
+using ThreadDataArray = Array<ThreadData>;
+using ThreadDataAllocator = ThreadDataArray::AllocatorType;
+
+// We use a separate buffer queue for the backing store for the allocator used
+// by the ThreadData array. This lets us host the buffers, allocators, and tries
+// associated with a thread by moving the data into the array instead of
+// attempting to copy the data to a separately backed set of tries.
+static typename std::aligned_storage<
+ sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
+static BufferQueue *BQ = nullptr;
+static BufferQueue::Buffer Buffer;
+static typename std::aligned_storage<sizeof(ThreadDataAllocator),
+ alignof(ThreadDataAllocator)>::type
+ ThreadDataAllocatorStorage;
+static typename std::aligned_storage<sizeof(ThreadDataArray),
+ alignof(ThreadDataArray)>::type
+ ThreadDataArrayStorage;
+
+static ThreadDataAllocator *TDAllocator = nullptr;
+static ThreadDataArray *TDArray = nullptr;
+
+using ProfileBufferArray = Array<ProfileBuffer>;
+using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
+
+// These need to be global aligned storage to avoid dynamic initialization. We
+// need these to be aligned to allow us to placement new objects into the
+// storage, and have pointers to those objects be appropriately aligned.
+static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
+ ProfileBuffersStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
+ ProfileBufferArrayAllocatorStorage;
+
+static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
+static ProfileBufferArray *ProfileBuffers = nullptr;
+
+// Use a global flag to determine whether the collector implementation has been
+// initialized.
+static atomic_uint8_t CollectorInitialized{0};
} // namespace
-void post(const FunctionCallTrie &T, tid_t TId) {
- static pthread_once_t Once = PTHREAD_ONCE_INIT;
- pthread_once(&Once, +[] {
- SpinMutexLock Lock(&GlobalMutex);
- GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
- InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
- new (GlobalAllocators) FunctionCallTrie::Allocators();
- *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom(
- profilingFlags()->global_allocator_max);
- ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
- InternalAlloc(sizeof(Vector<ThreadTrie>)));
- new (ThreadTries) Vector<ThreadTrie>();
- ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
- InternalAlloc(sizeof(Vector<ProfileBuffer>)));
- new (ProfileBuffers) Vector<ProfileBuffer>();
- });
- DCHECK_NE(GlobalAllocators, nullptr);
- DCHECK_NE(ThreadTries, nullptr);
- DCHECK_NE(ProfileBuffers, nullptr);
-
- ThreadTrie *Item = nullptr;
+void post(BufferQueue *Q, FunctionCallTrie &&T,
+ FunctionCallTrie::Allocators &&A,
+ FunctionCallTrie::Allocators::Buffers &&B,
+ tid_t TId) XRAY_NEVER_INSTRUMENT {
+ DCHECK_NE(Q, nullptr);
+
+ // Bail out early if the collector has not been initialized.
+ if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
+ T.~FunctionCallTrie();
+ A.~Allocators();
+ Q->releaseBuffer(B.NodeBuffer);
+ Q->releaseBuffer(B.RootsBuffer);
+ Q->releaseBuffer(B.ShadowStackBuffer);
+ Q->releaseBuffer(B.NodeIdPairBuffer);
+ B.~Buffers();
+ return;
+ }
+
{
SpinMutexLock Lock(&GlobalMutex);
- if (GlobalAllocators == nullptr)
- return;
-
- Item = ThreadTries->PushBack();
- Item->TId = TId;
-
- // Here we're using the internal allocator instead of the managed allocator
- // because:
- //
- // 1) We're not using the segmented array data structure to host
- // FunctionCallTrie objects. We're using a Vector (from sanitizer_common)
- // which works like a std::vector<...> keeping elements contiguous in
- // memory. The segmented array data structure assumes that elements are
- // trivially destructible, where FunctionCallTrie isn't.
- //
- // 2) Using a managed allocator means we need to manage that separately,
- // which complicates the nature of this code. To get around that, we're
- // using the internal allocator instead, which has its own global state
- // and is decoupled from the lifetime management required by the managed
- // allocator we have in XRay.
- //
- Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc(
- sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie)));
- DCHECK_NE(Item->Trie, nullptr);
- new (Item->Trie) FunctionCallTrie(*GlobalAllocators);
+ DCHECK_NE(TDAllocator, nullptr);
+ DCHECK_NE(TDArray, nullptr);
+
+ if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
+ TId) == nullptr) {
+ // If we fail to add the data to the array, we should destroy the objects
+ // handed us.
+ T.~FunctionCallTrie();
+ A.~Allocators();
+ Q->releaseBuffer(B.NodeBuffer);
+ Q->releaseBuffer(B.RootsBuffer);
+ Q->releaseBuffer(B.ShadowStackBuffer);
+ Q->releaseBuffer(B.NodeIdPairBuffer);
+ B.~Buffers();
+ }
}
-
- T.deepCopyInto(*Item->Trie);
}
// A PathArray represents the function id's representing a stack trace. In this
@@ -127,18 +154,8 @@ struct ProfileRecord {
// The Path in this record is the function id's from the leaf to the root of
// the function call stack as represented from a FunctionCallTrie.
- PathArray *Path = nullptr;
- const FunctionCallTrie::Node *Node = nullptr;
-
- // Constructor for in-place construction.
- ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N)
- : Path([&] {
- auto P =
- reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray)));
- new (P) PathArray(A);
- return P;
- }()),
- Node(N) {}
+ PathArray Path;
+ const FunctionCallTrie::Node *Node;
};
namespace {
@@ -147,19 +164,21 @@ using ProfileRecordArray = Array<ProfileRecord>;
// Walk a depth-first traversal of each root of the FunctionCallTrie to generate
// the path(s) and the data associated with the path.
-static void populateRecords(ProfileRecordArray &PRs,
- ProfileRecord::PathAllocator &PA,
- const FunctionCallTrie &Trie) {
+static void
+populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
+ const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
using StackArray = Array<const FunctionCallTrie::Node *>;
using StackAllocator = typename StackArray::AllocatorType;
StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
StackArray DFSStack(StackAlloc);
- for (const auto R : Trie.getRoots()) {
+ for (const auto *R : Trie.getRoots()) {
DFSStack.Append(R);
while (!DFSStack.empty()) {
- auto Node = DFSStack.back();
+ auto *Node = DFSStack.back();
DFSStack.trim(1);
- auto Record = PRs.AppendEmplace(PA, Node);
+ if (Node == nullptr)
+ continue;
+ auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
if (Record == nullptr)
return;
DCHECK_NE(Record, nullptr);
@@ -167,8 +186,8 @@ static void populateRecords(ProfileRecordArray &PRs,
// Traverse the Node's parents and as we're doing so, get the FIds in
// the order they appear.
for (auto N = Node; N != nullptr; N = N->Parent)
- Record->Path->Append(N->FId);
- DCHECK(!Record->Path->empty());
+ Record->Path.Append(N->FId);
+ DCHECK(!Record->Path.empty());
for (const auto C : Node->Callees)
DFSStack.Append(C.NodePtr);
@@ -177,67 +196,89 @@ static void populateRecords(ProfileRecordArray &PRs,
}
static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
- const ProfileRecordArray &ProfileRecords) {
- auto NextPtr = static_cast<char *>(
+ const ProfileRecordArray &ProfileRecords)
+ XRAY_NEVER_INSTRUMENT {
+ auto NextPtr = static_cast<uint8_t *>(
internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
sizeof(Header);
for (const auto &Record : ProfileRecords) {
// List of IDs follow:
- for (const auto FId : *Record.Path)
+ for (const auto FId : Record.Path)
NextPtr =
- static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
+ static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
sizeof(FId);
// Add the sentinel here.
constexpr int32_t SentinelFId = 0;
- NextPtr = static_cast<char *>(
+ NextPtr = static_cast<uint8_t *>(
internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
sizeof(SentinelFId);
// Add the node data here.
NextPtr =
- static_cast<char *>(internal_memcpy(NextPtr, &Record.Node->CallCount,
- sizeof(Record.Node->CallCount))) +
+ static_cast<uint8_t *>(internal_memcpy(
+ NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
sizeof(Record.Node->CallCount);
- NextPtr = static_cast<char *>(
+ NextPtr = static_cast<uint8_t *>(
internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
sizeof(Record.Node->CumulativeLocalTime))) +
sizeof(Record.Node->CumulativeLocalTime);
}
- DCHECK_EQ(NextPtr - static_cast<char *>(Buffer->Data), Buffer->Size);
+ DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
}
} // namespace
-void serialize() {
+void serialize() XRAY_NEVER_INSTRUMENT {
+ if (!atomic_load(&CollectorInitialized, memory_order_acquire))
+ return;
+
SpinMutexLock Lock(&GlobalMutex);
- // Clear out the global ProfileBuffers.
- for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
- InternalFree((*ProfileBuffers)[I].Data);
- ProfileBuffers->Reset();
+ // Clear out the global ProfileBuffers, if it's not empty.
+ for (auto &B : *ProfileBuffers)
+ deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
+ ProfileBuffers->trim(ProfileBuffers->size());
- if (ThreadTries->Size() == 0)
+ DCHECK_NE(TDArray, nullptr);
+ if (TDArray->empty())
return;
// Then repopulate the global ProfileBuffers.
- for (u32 I = 0; I < ThreadTries->Size(); ++I) {
+ u32 I = 0;
+ auto MaxSize = profilingFlags()->global_allocator_max;
+ auto ProfileArena = allocateBuffer(MaxSize);
+ if (ProfileArena == nullptr)
+ return;
+
+ auto ProfileArenaCleanup = at_scope_exit(
+ [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });
+
+ auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
+ if (PathArena == nullptr)
+ return;
+
+ auto PathArenaCleanup = at_scope_exit(
+ [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });
+
+ for (const auto &ThreadTrie : *TDArray) {
using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
- ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max);
+ ProfileRecordAllocator PRAlloc(ProfileArena,
+ profilingFlags()->global_allocator_max);
ProfileRecord::PathAllocator PathAlloc(
- profilingFlags()->global_allocator_max);
+ PathArena, profilingFlags()->global_allocator_max);
ProfileRecordArray ProfileRecords(PRAlloc);
// First, we want to compute the amount of space we're going to need. We'll
// use a local allocator and an __xray::Array<...> to store the intermediary
// data, then compute the size as we're going along. Then we'll allocate the
// contiguous space to contain the thread buffer data.
- const auto &Trie = *(*ThreadTries)[I].Trie;
- if (Trie.getRoots().empty())
+ if (ThreadTrie.FCT.getRoots().empty())
continue;
- populateRecords(ProfileRecords, PathAlloc, Trie);
- DCHECK(!Trie.getRoots().empty());
+
+ populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
+ DCHECK(!ThreadTrie.FCT.getRoots().empty());
DCHECK(!ProfileRecords.empty());
// Go through each record, to compute the sizes.
@@ -251,75 +292,103 @@ void serialize() {
// + end of record (8 bytes)
u32 CumulativeSizes = 0;
for (const auto &Record : ProfileRecords)
- CumulativeSizes += 20 + (4 * Record.Path->size());
-
- BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId};
- auto Buffer = ProfileBuffers->PushBack();
- Buffer->Size = sizeof(Header) + CumulativeSizes;
- Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64);
- DCHECK_NE(Buffer->Data, nullptr);
- serializeRecords(Buffer, Header, ProfileRecords);
-
- // Now clean up the ProfileRecords array, one at a time.
- for (auto &Record : ProfileRecords) {
- Record.Path->~PathArray();
- InternalFree(Record.Path);
- }
+ CumulativeSizes += 20 + (4 * Record.Path.size());
+
+ BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
+ auto B = ProfileBuffers->Append({});
+ B->Size = sizeof(Header) + CumulativeSizes;
+ B->Data = allocateBuffer(B->Size);
+ DCHECK_NE(B->Data, nullptr);
+ serializeRecords(B, Header, ProfileRecords);
}
}
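
[Editor's note] The per-record size accounting above is worth spelling out: a serialized record is the path's function ids (4 bytes each), a 4-byte sentinel id, an 8-byte call count, and an 8-byte cumulative local time, and each block is preceded by a 16-byte BlockHeader. A quick compile-time check of that arithmetic for a hypothetical record whose path is three functions deep (helper name is illustrative):

#include <cstdint>

constexpr uint32_t recordBytes(uint32_t PathLen) {
  return PathLen * 4 /* function ids */ + 4 /* sentinel */ +
         8 /* CallCount */ + 8 /* CumulativeLocalTime */;
}
static_assert(recordBytes(3) == 20 + 4 * 3, "matches CumulativeSizes above");
static_assert(16 + recordBytes(3) == 48,
              "one-record block: BlockHeader plus the record payload");
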
-void reset() {
+void reset() XRAY_NEVER_INSTRUMENT {
+ atomic_store(&CollectorInitialized, 0, memory_order_release);
SpinMutexLock Lock(&GlobalMutex);
+
if (ProfileBuffers != nullptr) {
// Clear out the profile buffers that have been serialized.
- for (uptr I = 0; I < ProfileBuffers->Size(); ++I)
- InternalFree((*ProfileBuffers)[I].Data);
- ProfileBuffers->Reset();
- InternalFree(ProfileBuffers);
+ for (auto &B : *ProfileBuffers)
+ deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
+ ProfileBuffers->trim(ProfileBuffers->size());
ProfileBuffers = nullptr;
}
- if (ThreadTries != nullptr) {
- // Clear out the function call tries per thread.
- for (uptr I = 0; I < ThreadTries->Size(); ++I) {
- auto &T = (*ThreadTries)[I];
- T.Trie->~FunctionCallTrie();
- InternalFree(T.Trie);
+ if (TDArray != nullptr) {
+ // Release the resources as required.
+ for (auto &TD : *TDArray) {
+ TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
+ TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
+ TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
+ TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
}
- ThreadTries->Reset();
- InternalFree(ThreadTries);
- ThreadTries = nullptr;
+ // We don't bother destroying the array here because we've already
+ // potentially freed the backing store for the array. Instead we reset the
+ // pointer to nullptr and re-use the storage later (placement-new'ing into
+ // the storage as-is).
+ TDArray = nullptr;
+ }
+
+ if (TDAllocator != nullptr) {
+ TDAllocator->~Allocator();
+ TDAllocator = nullptr;
+ }
+
+ if (Buffer.Data != nullptr) {
+ BQ->releaseBuffer(Buffer);
}
- // Reset the global allocators.
- if (GlobalAllocators != nullptr) {
- GlobalAllocators->~Allocators();
- InternalFree(GlobalAllocators);
- GlobalAllocators = nullptr;
+ if (BQ == nullptr) {
+ bool Success = false;
+ new (&BufferQueueStorage)
+ BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
+ if (!Success)
+ return;
+ BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
+ } else {
+ BQ->finalize();
+
+ if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
+ BufferQueue::ErrorCode::Ok)
+ return;
}
- GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
- InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
- new (GlobalAllocators) FunctionCallTrie::Allocators();
- *GlobalAllocators = FunctionCallTrie::InitAllocators();
- ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>(
- InternalAlloc(sizeof(Vector<ThreadTrie>)));
- new (ThreadTries) Vector<ThreadTrie>();
- ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>(
- InternalAlloc(sizeof(Vector<ProfileBuffer>)));
- new (ProfileBuffers) Vector<ProfileBuffer>();
+
+ if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
+ return;
+
+ new (&ProfileBufferArrayAllocatorStorage)
+ ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
+ ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
+ &ProfileBufferArrayAllocatorStorage);
+
+ new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
+ ProfileBuffers =
+ reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
+
+ new (&ThreadDataAllocatorStorage)
+ ThreadDataAllocator(Buffer.Data, Buffer.Size);
+ TDAllocator =
+ reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
+ new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
+ TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);
+
+ atomic_store(&CollectorInitialized, 1, memory_order_release);
}
-XRayBuffer nextBuffer(XRayBuffer B) {
+XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
SpinMutexLock Lock(&GlobalMutex);
- if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0)
+ if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
return {nullptr, 0};
static pthread_once_t Once = PTHREAD_ONCE_INIT;
static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
FileHeaderStorage;
- pthread_once(&Once,
- +[] { new (&FileHeaderStorage) XRayProfilingFileHeader{}; });
+ pthread_once(
+ &Once, +[]() XRAY_NEVER_INSTRUMENT {
+ new (&FileHeaderStorage) XRayProfilingFileHeader{};
+ });
if (UNLIKELY(B.Data == nullptr)) {
// The first buffer should always contain the file header information.
@@ -336,7 +405,7 @@ XRayBuffer nextBuffer(XRayBuffer B) {
BlockHeader Header;
internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
auto NextBlock = Header.BlockNum + 1;
- if (NextBlock < ProfileBuffers->Size())
+ if (NextBlock < ProfileBuffers->size())
return {(*ProfileBuffers)[NextBlock].Data,
(*ProfileBuffers)[NextBlock].Size};
return {nullptr, 0};
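
[Editor's note] A recurring pattern in this rewrite is replacing InternalAlloc'd globals with static aligned storage plus placement new: the storage is zero-initialised at load time (so no dynamic initialiser runs), an object is constructed into it on demand, a plain pointer publishes it, and reset() can destroy and re-construct into the same storage. A generic sketch of that pattern with an illustrative type:

#include <new>
#include <type_traits>

struct Collector {
  explicit Collector(int Capacity) : Capacity(Capacity) {}
  int Capacity;
};

// Zero-initialised at load time; no dynamic initialiser is required.
static typename std::aligned_storage<sizeof(Collector),
                                     alignof(Collector)>::type CollectorStorage;
static Collector *C = nullptr;

static void initCollector(int Capacity) {
  if (C != nullptr)
    C->~Collector();               // re-use the same storage across resets
  new (&CollectorStorage) Collector(Capacity);
  C = reinterpret_cast<Collector *>(&CollectorStorage);
}
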
diff --git a/contrib/compiler-rt/lib/xray/xray_profile_collector.h b/contrib/compiler-rt/lib/xray/xray_profile_collector.h
index 335043db9526..86c4ce853797 100644
--- a/contrib/compiler-rt/lib/xray/xray_profile_collector.h
+++ b/contrib/compiler-rt/lib/xray/xray_profile_collector.h
@@ -33,27 +33,13 @@ namespace profileCollectorService {
/// Posts the FunctionCallTrie associated with a specific Thread ID. This
/// will:
///
-/// - Make a copy of the FunctionCallTrie and store that against the Thread
-/// ID. This will use the global allocator for the service-managed
-/// FunctionCallTrie instances.
-/// - Queue up a pointer to the FunctionCallTrie.
-/// - If the queue is long enough (longer than some arbitrary threshold) we
-/// then pre-calculate a single FunctionCallTrie for the whole process.
+/// Moves the collection of FunctionCallTrie, Allocators, and Buffers associated
+/// with a thread's data to the queue. This takes ownership of the memory
+/// associated with a thread, and manages those exclusively.
///
-///
-/// We are making a copy of the FunctionCallTrie because the intent is to have
-/// this function be called at thread exit, or soon after the profiling
-/// handler is finalized through the XRay APIs. By letting threads each
-/// process their own thread-local FunctionCallTrie instances, we're removing
-/// the need for synchronisation across threads while we're profiling.
-/// However, once we're done profiling, we can then collect copies of these
-/// FunctionCallTrie instances and pay the cost of the copy.
-///
-/// NOTE: In the future, if this turns out to be more costly than "moving" the
-/// FunctionCallTrie instances from the owning thread to the collector
-/// service, then we can change the implementation to do it this way (moving)
-/// instead.
-void post(const FunctionCallTrie &T, tid_t TId);
+void post(BufferQueue *Q, FunctionCallTrie &&T,
+ FunctionCallTrie::Allocators &&A,
+ FunctionCallTrie::Allocators::Buffers &&B, tid_t TId);
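
[Editor's note] The new post() contract above is a "sink": the caller surrenders its thread-local trie, allocators, and buffers by rvalue reference and must not reuse them afterwards. A generic illustration of the idiom (the types here are placeholders, not the XRay ones):

#include <utility>
#include <vector>

struct ThreadProfile {
  std::vector<int> Samples;
};

struct Collector {
  // Takes ownership; the caller's object is left in a valid but moved-from state.
  void post(ThreadProfile &&P) { Posted.push_back(std::move(P)); }
  std::vector<ThreadProfile> Posted;
};

// Usage, e.g. from a thread-exit handler:
//   Collector C;
//   ThreadProfile P{{1, 2, 3}};
//   C.post(std::move(P));   // P's samples now live in the collector
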
/// The serialize will process all FunctionCallTrie instances in memory, and
/// turn those into specifically formatted blocks, each describing the
diff --git a/contrib/compiler-rt/lib/xray/xray_profiling.cc b/contrib/compiler-rt/lib/xray/xray_profiling.cc
index d4b4345d764a..4323170cd1bb 100644
--- a/contrib/compiler-rt/lib/xray/xray_profiling.cc
+++ b/contrib/compiler-rt/lib/xray/xray_profiling.cc
@@ -19,7 +19,7 @@
#include "sanitizer_common/sanitizer_flags.h"
#include "xray/xray_interface.h"
#include "xray/xray_log_interface.h"
-
+#include "xray_buffer_queue.h"
#include "xray_flags.h"
#include "xray_profile_collector.h"
#include "xray_profiling_flags.h"
@@ -32,62 +32,167 @@ namespace __xray {
namespace {
-atomic_sint32_t ProfilerLogFlushStatus = {
+static atomic_sint32_t ProfilerLogFlushStatus = {
XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
-atomic_sint32_t ProfilerLogStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
+static atomic_sint32_t ProfilerLogStatus = {
+ XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
-SpinMutex ProfilerOptionsMutex;
+static SpinMutex ProfilerOptionsMutex;
-struct alignas(64) ProfilingData {
- FunctionCallTrie::Allocators *Allocators = nullptr;
- FunctionCallTrie *FCT = nullptr;
+struct ProfilingData {
+ atomic_uintptr_t Allocators;
+ atomic_uintptr_t FCT;
};
static pthread_key_t ProfilingKey;
-thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage{};
-static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
- thread_local auto ThreadOnce = [] {
- new (&ThreadStorage) ProfilingData{};
- pthread_setspecific(ProfilingKey, &ThreadStorage);
+// We use a global buffer queue, which gets initialized once at initialisation
+// time, and gets reset when profiling is "done".
+static std::aligned_storage<sizeof(BufferQueue), alignof(BufferQueue)>::type
+ BufferQueueStorage;
+static BufferQueue *BQ = nullptr;
+
+thread_local FunctionCallTrie::Allocators::Buffers ThreadBuffers;
+thread_local std::aligned_storage<sizeof(FunctionCallTrie::Allocators),
+ alignof(FunctionCallTrie::Allocators)>::type
+ AllocatorsStorage;
+thread_local std::aligned_storage<sizeof(FunctionCallTrie),
+ alignof(FunctionCallTrie)>::type
+ FunctionCallTrieStorage;
+thread_local ProfilingData TLD{{0}, {0}};
+thread_local atomic_uint8_t ReentranceGuard{0};
+
+// We use a separate guard to ensure that, for this thread, if we're already
+// cleaning up, signal handlers don't attempt to clean up or initialise.
+thread_local atomic_uint8_t TLDInitGuard{0};
+
+// We also use a separate latch to signal that the thread is exiting, and
+// non-essential work should be ignored (things like recording events, etc.).
+thread_local atomic_uint8_t ThreadExitingLatch{0};
+
+static ProfilingData *getThreadLocalData() XRAY_NEVER_INSTRUMENT {
+ thread_local auto ThreadOnce = []() XRAY_NEVER_INSTRUMENT {
+ pthread_setspecific(ProfilingKey, &TLD);
return false;
}();
(void)ThreadOnce;
- auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage);
+ RecursionGuard TLDInit(TLDInitGuard);
+ if (!TLDInit)
+ return nullptr;
- // We need to check whether the global flag to finalizing/finalized has been
- // switched. If it is, then we ought to not actually initialise the data.
- auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire);
- if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
- Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)
- return TLD;
-
- // If we're live, then we re-initialize TLD if the pointers are not null.
- if (UNLIKELY(TLD.Allocators == nullptr && TLD.FCT == nullptr)) {
- TLD.Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>(
- InternalAlloc(sizeof(FunctionCallTrie::Allocators)));
- new (TLD.Allocators) FunctionCallTrie::Allocators();
- *TLD.Allocators = FunctionCallTrie::InitAllocators();
- TLD.FCT = reinterpret_cast<FunctionCallTrie *>(
- InternalAlloc(sizeof(FunctionCallTrie)));
- new (TLD.FCT) FunctionCallTrie(*TLD.Allocators);
+ if (atomic_load_relaxed(&ThreadExitingLatch))
+ return nullptr;
+
+ uptr Allocators = 0;
+ if (atomic_compare_exchange_strong(&TLD.Allocators, &Allocators, 1,
+ memory_order_acq_rel)) {
+ bool Success = false;
+ auto AllocatorsUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+ if (!Success)
+ atomic_store(&TLD.Allocators, 0, memory_order_release);
+ });
+
+ // Acquire a set of buffers for this thread.
+ if (BQ == nullptr)
+ return nullptr;
+
+ if (BQ->getBuffer(ThreadBuffers.NodeBuffer) != BufferQueue::ErrorCode::Ok)
+ return nullptr;
+ auto NodeBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+ if (!Success)
+ BQ->releaseBuffer(ThreadBuffers.NodeBuffer);
+ });
+
+ if (BQ->getBuffer(ThreadBuffers.RootsBuffer) != BufferQueue::ErrorCode::Ok)
+ return nullptr;
+ auto RootsBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+ if (!Success)
+ BQ->releaseBuffer(ThreadBuffers.RootsBuffer);
+ });
+
+ if (BQ->getBuffer(ThreadBuffers.ShadowStackBuffer) !=
+ BufferQueue::ErrorCode::Ok)
+ return nullptr;
+ auto ShadowStackBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+ if (!Success)
+ BQ->releaseBuffer(ThreadBuffers.ShadowStackBuffer);
+ });
+
+ if (BQ->getBuffer(ThreadBuffers.NodeIdPairBuffer) !=
+ BufferQueue::ErrorCode::Ok)
+ return nullptr;
+
+ Success = true;
+ new (&AllocatorsStorage) FunctionCallTrie::Allocators(
+ FunctionCallTrie::InitAllocatorsFromBuffers(ThreadBuffers));
+ Allocators = reinterpret_cast<uptr>(
+ reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage));
+ atomic_store(&TLD.Allocators, Allocators, memory_order_release);
+ }
+
+ if (Allocators == 1)
+ return nullptr;
+
+ uptr FCT = 0;
+ if (atomic_compare_exchange_strong(&TLD.FCT, &FCT, 1, memory_order_acq_rel)) {
+ new (&FunctionCallTrieStorage)
+ FunctionCallTrie(*reinterpret_cast<FunctionCallTrie::Allocators *>(
+ atomic_load_relaxed(&TLD.Allocators)));
+ FCT = reinterpret_cast<uptr>(
+ reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage));
+ atomic_store(&TLD.FCT, FCT, memory_order_release);
}
- return TLD;
+ if (FCT == 1)
+ return nullptr;
+
+ return &TLD;
}
static void cleanupTLD() XRAY_NEVER_INSTRUMENT {
- auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage);
- if (TLD.Allocators != nullptr && TLD.FCT != nullptr) {
- TLD.FCT->~FunctionCallTrie();
- TLD.Allocators->~Allocators();
- InternalFree(TLD.FCT);
- InternalFree(TLD.Allocators);
- TLD.FCT = nullptr;
- TLD.Allocators = nullptr;
- }
+ auto FCT = atomic_exchange(&TLD.FCT, 0, memory_order_acq_rel);
+ if (FCT == reinterpret_cast<uptr>(reinterpret_cast<FunctionCallTrie *>(
+ &FunctionCallTrieStorage)))
+ reinterpret_cast<FunctionCallTrie *>(FCT)->~FunctionCallTrie();
+
+ auto Allocators = atomic_exchange(&TLD.Allocators, 0, memory_order_acq_rel);
+ if (Allocators ==
+ reinterpret_cast<uptr>(
+ reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage)))
+ reinterpret_cast<FunctionCallTrie::Allocators *>(Allocators)->~Allocators();
+}
+
+static void postCurrentThreadFCT(ProfilingData &T) XRAY_NEVER_INSTRUMENT {
+ RecursionGuard TLDInit(TLDInitGuard);
+ if (!TLDInit)
+ return;
+
+ uptr P = atomic_exchange(&T.FCT, 0, memory_order_acq_rel);
+ if (P != reinterpret_cast<uptr>(
+ reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage)))
+ return;
+
+ auto FCT = reinterpret_cast<FunctionCallTrie *>(P);
+ DCHECK_NE(FCT, nullptr);
+
+ uptr A = atomic_exchange(&T.Allocators, 0, memory_order_acq_rel);
+ if (A !=
+ reinterpret_cast<uptr>(
+ reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage)))
+ return;
+
+ auto Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>(A);
+ DCHECK_NE(Allocators, nullptr);
+
+ // Always move the data into the profile collector.
+ profileCollectorService::post(BQ, std::move(*FCT), std::move(*Allocators),
+ std::move(ThreadBuffers), GetTid());
+
+ // Re-initialize the ThreadBuffers object to a known "default" state.
+ ThreadBuffers = FunctionCallTrie::Allocators::Buffers{};
}
} // namespace
@@ -100,9 +205,6 @@ const char *profilingCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
#endif
}
-atomic_sint32_t ProfileFlushStatus = {
- XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
-
XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT {
if (atomic_load(&ProfilerLogStatus, memory_order_acquire) !=
XRayLogInitStatus::XRAY_LOG_FINALIZED) {
@@ -111,12 +213,23 @@ XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT {
return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
}
- s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
- if (!atomic_compare_exchange_strong(&ProfilerLogFlushStatus, &Result,
- XRayLogFlushStatus::XRAY_LOG_FLUSHING,
- memory_order_acq_rel)) {
+ RecursionGuard SignalGuard(ReentranceGuard);
+ if (!SignalGuard) {
+ if (Verbosity())
+ Report("Cannot finalize properly inside a signal handler!\n");
+ atomic_store(&ProfilerLogFlushStatus,
+ XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING,
+ memory_order_release);
+ return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+ }
+
+ s32 Previous = atomic_exchange(&ProfilerLogFlushStatus,
+ XRayLogFlushStatus::XRAY_LOG_FLUSHING,
+ memory_order_acq_rel);
+ if (Previous == XRayLogFlushStatus::XRAY_LOG_FLUSHING) {
if (Verbosity())
- Report("Not flushing profiles, implementation still finalizing.\n");
+ Report("Not flushing profiles, implementation still flushing.\n");
+ return XRayLogFlushStatus::XRAY_LOG_FLUSHING;
}
// At this point, we'll create the file that will contain the profile, but
@@ -129,49 +242,33 @@ XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT {
if (Verbosity())
Report("profiling: No data to flush.\n");
} else {
- int Fd = getLogFD();
- if (Fd == -1) {
+ LogWriter *LW = LogWriter::Open();
+ if (LW == nullptr) {
if (Verbosity())
Report("profiling: Failed to flush to file, dropping data.\n");
} else {
// Now for each of the buffers, write out the profile data as we would
// see it in memory, verbatim.
while (B.Data != nullptr && B.Size != 0) {
- retryingWriteAll(Fd, reinterpret_cast<const char *>(B.Data),
- reinterpret_cast<const char *>(B.Data) + B.Size);
+ LW->WriteAll(reinterpret_cast<const char *>(B.Data),
+ reinterpret_cast<const char *>(B.Data) + B.Size);
B = profileCollectorService::nextBuffer(B);
}
- // Then we close out the file.
- internal_close(Fd);
}
+ LogWriter::Close(LW);
}
}
profileCollectorService::reset();
- // Flush the current thread's local data structures as well.
- cleanupTLD();
-
- atomic_store(&ProfilerLogStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
+ atomic_store(&ProfilerLogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
+ memory_order_release);
+ atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
memory_order_release);
return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
}
-namespace {
-
-thread_local atomic_uint8_t ReentranceGuard{0};
-
-static void postCurrentThreadFCT(ProfilingData &TLD) {
- if (TLD.Allocators == nullptr || TLD.FCT == nullptr)
- return;
-
- profileCollectorService::post(*TLD.FCT, GetTid());
- cleanupTLD();
-}
-
-} // namespace
-
void profilingHandleArg0(int32_t FuncId,
XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
unsigned char CPU;
@@ -181,21 +278,29 @@ void profilingHandleArg0(int32_t FuncId,
return;
auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire);
- auto &TLD = getThreadLocalData();
+ if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_UNINITIALIZED ||
+ Status == XRayLogInitStatus::XRAY_LOG_INITIALIZING))
+ return;
+
if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED ||
Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) {
postCurrentThreadFCT(TLD);
return;
}
+ auto T = getThreadLocalData();
+ if (T == nullptr)
+ return;
+
+ auto FCT = reinterpret_cast<FunctionCallTrie *>(atomic_load_relaxed(&T->FCT));
switch (Entry) {
case XRayEntryType::ENTRY:
case XRayEntryType::LOG_ARGS_ENTRY:
- TLD.FCT->enterFunction(FuncId, TSC);
+ FCT->enterFunction(FuncId, TSC, CPU);
break;
case XRayEntryType::EXIT:
case XRayEntryType::TAIL:
- TLD.FCT->exitFunction(FuncId, TSC);
+ FCT->exitFunction(FuncId, TSC, CPU);
break;
default:
// FIXME: Handle bugs.
@@ -218,12 +323,22 @@ XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT {
return static_cast<XRayLogInitStatus>(CurrentStatus);
}
+ // Mark, then finalize, the current generation of buffers. This lets threads
+ // currently holding buffers keep using them, while the last reference does
+ // the memory cleanup.
+ DCHECK_NE(BQ, nullptr);
+ BQ->finalize();
+
// Wait a grace period to allow threads to see that we're finalizing.
SleepForMillis(profilingFlags()->grace_period_ms);
- // We also want to make sure that the current thread's data is cleaned up,
- // if we have any.
- auto &TLD = getThreadLocalData();
+ // If we for some reason are entering this function from an instrumented
+ // handler, we bail out.
+ RecursionGuard G(ReentranceGuard);
+ if (!G)
+ return static_cast<XRayLogInitStatus>(CurrentStatus);
+
+ // Post the current thread's data if we have any.
postCurrentThreadFCT(TLD);
// Then we force serialize the log data.
@@ -235,19 +350,16 @@ XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT {
}
XRayLogInitStatus
-profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options,
+profilingLoggingInit(size_t, size_t, void *Options,
size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
- if (BufferSize != 0 || BufferMax != 0) {
- if (Verbosity())
- Report("__xray_log_init() being used, and is unsupported. Use "
- "__xray_log_init_mode(...) instead. Bailing out.");
+ RecursionGuard G(ReentranceGuard);
+ if (!G)
return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
- }
s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus,
XRayLogInitStatus::XRAY_LOG_INITIALIZING,
- memory_order_release)) {
+ memory_order_acq_rel)) {
if (Verbosity())
Report("Cannot initialize already initialised profiling "
"implementation.\n");
@@ -276,35 +388,88 @@ profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options,
// We need to reset the profile data collection implementation now.
profileCollectorService::reset();
- // We need to set up the exit handlers.
- static pthread_once_t Once = PTHREAD_ONCE_INIT;
- pthread_once(&Once, +[] {
- pthread_key_create(&ProfilingKey, +[](void *P) {
- // This is the thread-exit handler.
- auto &TLD = *reinterpret_cast<ProfilingData *>(P);
- if (TLD.Allocators == nullptr && TLD.FCT == nullptr)
- return;
-
- postCurrentThreadFCT(TLD);
- });
+ // Then also reset the buffer queue implementation.
+ if (BQ == nullptr) {
+ bool Success = false;
+ new (&BufferQueueStorage)
+ BufferQueue(profilingFlags()->per_thread_allocator_max,
+ profilingFlags()->buffers_max, Success);
+ if (!Success) {
+ if (Verbosity())
+ Report("Failed to initialize preallocated memory buffers!");
+ atomic_store(&ProfilerLogStatus,
+ XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+ memory_order_release);
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+ }
- // We also need to set up an exit handler, so that we can get the profile
- // information at exit time. We use the C API to do this, to not rely on C++
- // ABI functions for registering exit handlers.
- Atexit(+[] {
- // Finalize and flush.
- if (profilingFinalize() != XRAY_LOG_FINALIZED) {
- cleanupTLD();
- return;
- }
- if (profilingFlush() != XRAY_LOG_FLUSHED) {
- cleanupTLD();
- return;
- }
+ // If we've succeeded, set the global pointer to the initialised storage.
+ BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
+ } else {
+ BQ->finalize();
+ auto InitStatus = BQ->init(profilingFlags()->per_thread_allocator_max,
+ profilingFlags()->buffers_max);
+
+ if (InitStatus != BufferQueue::ErrorCode::Ok) {
if (Verbosity())
- Report("XRay Profile flushed at exit.");
- });
- });
+ Report("Failed to initialize preallocated memory buffers; error: %s",
+ BufferQueue::getErrorString(InitStatus));
+ atomic_store(&ProfilerLogStatus,
+ XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+ memory_order_release);
+ return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+ }
+
+ DCHECK(!BQ->finalizing());
+ }
+
+ // We need to set up the exit handlers.
+ static pthread_once_t Once = PTHREAD_ONCE_INIT;
+ pthread_once(
+ &Once, +[] {
+ pthread_key_create(
+ &ProfilingKey, +[](void *P) XRAY_NEVER_INSTRUMENT {
+ if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel))
+ return;
+
+ if (P == nullptr)
+ return;
+
+ auto T = reinterpret_cast<ProfilingData *>(P);
+ if (atomic_load_relaxed(&T->Allocators) == 0)
+ return;
+
+ {
+ // If we're somehow executing this while inside a
+ // non-reentrant-friendly context, we skip attempting to post
+ // the current thread's data.
+ RecursionGuard G(ReentranceGuard);
+ if (!G)
+ return;
+
+ postCurrentThreadFCT(*T);
+ }
+ });
+
+ // We also need to set up an exit handler, so that we can get the
+ // profile information at exit time. We use the C API to do this, to not
+ // rely on C++ ABI functions for registering exit handlers.
+ Atexit(+[]() XRAY_NEVER_INSTRUMENT {
+ if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel))
+ return;
+
+ auto Cleanup =
+ at_scope_exit([]() XRAY_NEVER_INSTRUMENT { cleanupTLD(); });
+
+ // Finalize and flush.
+ if (profilingFinalize() != XRAY_LOG_FINALIZED ||
+ profilingFlush() != XRAY_LOG_FLUSHED)
+ return;
+
+ if (Verbosity())
+ Report("XRay Profile flushed at exit.");
+ });
+ });
__xray_log_set_buffer_iterator(profileCollectorService::nextBuffer);
__xray_set_handler(profilingHandleArg0);
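
[Editor's note] getThreadLocalData() above guards its lazy setup with a small state machine: the per-thread atomic is 0 when uninitialised, 1 while initialisation is in flight (so a signal handler re-entering the path backs off instead of recursing), and otherwise holds the pointer to the constructed object; each acquisition step also registers an undo action via at_scope_exit that only runs if a later step fails. A condensed, self-contained sketch of that shape using <atomic> (all names are illustrative; the local Undo struct stands in for the at_scope_exit helper):

#include <atomic>
#include <cstdint>

struct Resource { /* e.g. placement-new'ed allocators */ };

static thread_local std::atomic<std::uintptr_t> Slot{0};
static thread_local Resource Storage;   // stand-in for aligned storage

static Resource *getOrInit() {
  std::uintptr_t Expected = 0;
  // 0 -> 1 claims the slot; whoever wins does the initialisation.
  if (Slot.compare_exchange_strong(Expected, 1, std::memory_order_acq_rel)) {
    bool Success = false;
    // Undo: if any later step fails, release the claim so a retry can happen.
    struct Undo {
      bool &Success;
      std::atomic<std::uintptr_t> &Slot;
      ~Undo() {
        if (!Success)
          Slot.store(0, std::memory_order_release);
      }
    } U{Success, Slot};

    // ... acquire buffers / construct objects here; return nullptr on failure ...

    Success = true;
    Expected = reinterpret_cast<std::uintptr_t>(&Storage);
    Slot.store(Expected, std::memory_order_release);
  }
  // 1 means a racing path or signal handler is mid-initialisation: back off.
  return Expected == 1 ? nullptr : reinterpret_cast<Resource *>(Expected);
}
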
diff --git a/contrib/compiler-rt/lib/xray/xray_profiling_flags.inc b/contrib/compiler-rt/lib/xray/xray_profiling_flags.inc
index e9230ae64187..ccd70860bf61 100644
--- a/contrib/compiler-rt/lib/xray/xray_profiling_flags.inc
+++ b/contrib/compiler-rt/lib/xray/xray_profiling_flags.inc
@@ -14,7 +14,7 @@
#error "Define XRAY_FLAG prior to including this file!"
#endif
-XRAY_FLAG(uptr, per_thread_allocator_max, 2 << 20,
+XRAY_FLAG(uptr, per_thread_allocator_max, 16384,
"Maximum size of any single per-thread allocator.")
XRAY_FLAG(uptr, global_allocator_max, 2 << 24,
"Maximum size of the global allocator for profile storage.")
@@ -27,3 +27,6 @@ XRAY_FLAG(int, grace_period_ms, 1,
XRAY_FLAG(bool, no_flush, false,
"Set to true if we want the profiling implementation to not write "
"out files.")
+XRAY_FLAG(int, buffers_max, 128,
+ "The number of buffers to pre-allocate used by the profiling "
+ "implementation.")
diff --git a/contrib/compiler-rt/lib/xray/xray_segmented_array.h b/contrib/compiler-rt/lib/xray/xray_segmented_array.h
index 11dd794fa520..bc7e9379f63b 100644
--- a/contrib/compiler-rt/lib/xray/xray_segmented_array.h
+++ b/contrib/compiler-rt/lib/xray/xray_segmented_array.h
@@ -32,14 +32,9 @@ namespace __xray {
/// is destroyed. When an Array is destroyed, it will destroy elements in the
/// backing store but will not free the memory.
template <class T> class Array {
- struct SegmentBase {
- SegmentBase *Prev;
- SegmentBase *Next;
- };
-
- // We want each segment of the array to be cache-line aligned, and elements of
- // the array be offset from the beginning of the segment.
- struct Segment : SegmentBase {
+ struct Segment {
+ Segment *Prev;
+ Segment *Next;
char Data[1];
};
@@ -62,98 +57,46 @@ public:
// kCacheLineSize-multiple segments, minus the size of two pointers.
//
// - Request cacheline-multiple sized elements from the allocator.
- static constexpr size_t AlignedElementStorageSize =
+ static constexpr uint64_t AlignedElementStorageSize =
sizeof(typename std::aligned_storage<sizeof(T), alignof(T)>::type);
- static constexpr size_t SegmentSize =
- nearest_boundary(sizeof(Segment) + next_pow2(sizeof(T)), kCacheLineSize);
+ static constexpr uint64_t SegmentControlBlockSize = sizeof(Segment *) * 2;
+
+ static constexpr uint64_t SegmentSize = nearest_boundary(
+ SegmentControlBlockSize + next_pow2(sizeof(T)), kCacheLineSize);
using AllocatorType = Allocator<SegmentSize>;
- static constexpr size_t ElementsPerSegment =
- (SegmentSize - sizeof(Segment)) / next_pow2(sizeof(T));
+ static constexpr uint64_t ElementsPerSegment =
+ (SegmentSize - SegmentControlBlockSize) / next_pow2(sizeof(T));
static_assert(ElementsPerSegment > 0,
"Must have at least 1 element per segment.");
- static SegmentBase SentinelSegment;
-
-private:
- AllocatorType *Alloc;
- SegmentBase *Head = &SentinelSegment;
- SegmentBase *Tail = &SentinelSegment;
- size_t Size = 0;
-
- // Here we keep track of segments in the freelist, to allow us to re-use
- // segments when elements are trimmed off the end.
- SegmentBase *Freelist = &SentinelSegment;
-
- Segment *NewSegment() {
- // We need to handle the case in which enough elements have been trimmed to
- // allow us to re-use segments we've allocated before. For this we look into
- // the Freelist, to see whether we need to actually allocate new blocks or
- // just re-use blocks we've already seen before.
- if (Freelist != &SentinelSegment) {
- auto *FreeSegment = Freelist;
- Freelist = FreeSegment->Next;
- FreeSegment->Next = &SentinelSegment;
- Freelist->Prev = &SentinelSegment;
- return static_cast<Segment *>(FreeSegment);
- }
-
- auto SegmentBlock = Alloc->Allocate();
- if (SegmentBlock.Data == nullptr)
- return nullptr;
-
- // Placement-new the Segment element at the beginning of the SegmentBlock.
- auto S = reinterpret_cast<Segment *>(SegmentBlock.Data);
- new (S) SegmentBase{&SentinelSegment, &SentinelSegment};
- return S;
- }
-
- Segment *InitHeadAndTail() {
- DCHECK_EQ(Head, &SentinelSegment);
- DCHECK_EQ(Tail, &SentinelSegment);
- auto Segment = NewSegment();
- if (Segment == nullptr)
- return nullptr;
- DCHECK_EQ(Segment->Next, &SentinelSegment);
- DCHECK_EQ(Segment->Prev, &SentinelSegment);
- Head = Tail = static_cast<SegmentBase *>(Segment);
- return Segment;
- }
+ static Segment SentinelSegment;
- Segment *AppendNewSegment() {
- auto S = NewSegment();
- if (S == nullptr)
- return nullptr;
- DCHECK_NE(Tail, &SentinelSegment);
- DCHECK_EQ(Tail->Next, &SentinelSegment);
- DCHECK_EQ(S->Prev, &SentinelSegment);
- DCHECK_EQ(S->Next, &SentinelSegment);
- Tail->Next = S;
- S->Prev = Tail;
- Tail = S;
- return static_cast<Segment *>(Tail);
- }
+ using size_type = uint64_t;
+private:
// This Iterator models a BidirectionalIterator.
template <class U> class Iterator {
- SegmentBase *S = &SentinelSegment;
- size_t Offset = 0;
- size_t Size = 0;
+ Segment *S = &SentinelSegment;
+ uint64_t Offset = 0;
+ uint64_t Size = 0;
public:
- Iterator(SegmentBase *IS, size_t Off, size_t S)
- : S(IS), Offset(Off), Size(S) {}
- Iterator(const Iterator &) noexcept = default;
- Iterator() noexcept = default;
- Iterator(Iterator &&) noexcept = default;
- Iterator &operator=(const Iterator &) = default;
- Iterator &operator=(Iterator &&) = default;
- ~Iterator() = default;
-
- Iterator &operator++() {
+ Iterator(Segment *IS, uint64_t Off, uint64_t S) XRAY_NEVER_INSTRUMENT
+ : S(IS),
+ Offset(Off),
+ Size(S) {}
+ Iterator(const Iterator &) NOEXCEPT XRAY_NEVER_INSTRUMENT = default;
+ Iterator() NOEXCEPT XRAY_NEVER_INSTRUMENT = default;
+ Iterator(Iterator &&) NOEXCEPT XRAY_NEVER_INSTRUMENT = default;
+ Iterator &operator=(const Iterator &) XRAY_NEVER_INSTRUMENT = default;
+ Iterator &operator=(Iterator &&) XRAY_NEVER_INSTRUMENT = default;
+ ~Iterator() XRAY_NEVER_INSTRUMENT = default;
+
+ Iterator &operator++() XRAY_NEVER_INSTRUMENT {
if (++Offset % ElementsPerSegment || Offset == Size)
return *this;
@@ -168,7 +111,7 @@ private:
return *this;
}
- Iterator &operator--() {
+ Iterator &operator--() XRAY_NEVER_INSTRUMENT {
DCHECK_NE(S, &SentinelSegment);
DCHECK_GT(Offset, 0);
@@ -181,107 +124,295 @@ private:
return *this;
}
- Iterator operator++(int) {
+ Iterator operator++(int) XRAY_NEVER_INSTRUMENT {
Iterator Copy(*this);
++(*this);
return Copy;
}
- Iterator operator--(int) {
+ Iterator operator--(int) XRAY_NEVER_INSTRUMENT {
Iterator Copy(*this);
--(*this);
return Copy;
}
template <class V, class W>
- friend bool operator==(const Iterator<V> &L, const Iterator<W> &R) {
+ friend bool operator==(const Iterator<V> &L,
+ const Iterator<W> &R) XRAY_NEVER_INSTRUMENT {
return L.S == R.S && L.Offset == R.Offset;
}
template <class V, class W>
- friend bool operator!=(const Iterator<V> &L, const Iterator<W> &R) {
+ friend bool operator!=(const Iterator<V> &L,
+ const Iterator<W> &R) XRAY_NEVER_INSTRUMENT {
return !(L == R);
}
- U &operator*() const {
+ U &operator*() const XRAY_NEVER_INSTRUMENT {
DCHECK_NE(S, &SentinelSegment);
auto RelOff = Offset % ElementsPerSegment;
// We need to compute the character-aligned pointer, offset from the
// segment's Data location to get the element in the position of Offset.
- auto Base = static_cast<Segment *>(S)->Data;
+ auto Base = &S->Data;
auto AlignedOffset = Base + (RelOff * AlignedElementStorageSize);
return *reinterpret_cast<U *>(AlignedOffset);
}
- U *operator->() const { return &(**this); }
+ U *operator->() const XRAY_NEVER_INSTRUMENT { return &(**this); }
};
+ AllocatorType *Alloc;
+ Segment *Head;
+ Segment *Tail;
+
+ // Here we keep track of segments in the freelist, to allow us to re-use
+ // segments when elements are trimmed off the end.
+ Segment *Freelist;
+ uint64_t Size;
+
+ // ===============================
+ // In the following implementation, we work through the algorithms and the
+ // list operations using the following notation:
+ //
+ // - pred(s) is the predecessor (previous node accessor) and succ(s) is
+ // the successor (next node accessor).
+ //
+ // - S is a sentinel segment, which has the following property:
+ //
+ // pred(S) == succ(S) == S
+ //
+ // - @ is a loop operator, which can imply pred(s) == s if it appears on
+  //     the left of s, or succ(s) == s if it appears on the right of s.
+ //
+ // - sL <-> sR : means a bidirectional relation between sL and sR, which
+ // means:
+ //
+  //         succ(sL) == sR && pred(sR) == sL
+ //
+  //   - sL -> sR : implies a unidirectional relation between sL and sR,
+ // with the following properties:
+ //
+ // succ(sL) == sR
+ //
+ // sL <- sR : implies a unidirectional relation between sR and sL,
+ // with the following properties:
+ //
+ // pred(sR) == sL
+ //
+ // ===============================
+
+ Segment *NewSegment() XRAY_NEVER_INSTRUMENT {
+ // We need to handle the case in which enough elements have been trimmed to
+ // allow us to re-use segments we've allocated before. For this we look into
+ // the Freelist, to see whether we need to actually allocate new blocks or
+ // just re-use blocks we've already seen before.
+ if (Freelist != &SentinelSegment) {
+      // The current state of the lists resembles something like this:
+ //
+ // Freelist: @S@<-f0->...<->fN->@S@
+ // ^ Freelist
+ //
+ // We want to perform a splice of `f0` from Freelist to a temporary list,
+ // which looks like:
+ //
+ // Templist: @S@<-f0->@S@
+ // ^ FreeSegment
+ //
+ // Our algorithm preconditions are:
+ DCHECK_EQ(Freelist->Prev, &SentinelSegment);
+
+ // Then the algorithm we implement is:
+ //
+ // SFS = Freelist
+ // Freelist = succ(Freelist)
+ // if (Freelist != S)
+ // pred(Freelist) = S
+ // succ(SFS) = S
+ // pred(SFS) = S
+ //
+ auto *FreeSegment = Freelist;
+ Freelist = Freelist->Next;
+
+ // Note that we need to handle the case where Freelist is now pointing to
+ // S, which we don't want to be overwriting.
+ // TODO: Determine whether the cost of the branch is higher than the cost
+ // of the blind assignment.
+ if (Freelist != &SentinelSegment)
+ Freelist->Prev = &SentinelSegment;
+
+ FreeSegment->Next = &SentinelSegment;
+ FreeSegment->Prev = &SentinelSegment;
+
+ // Our postconditions are:
+ DCHECK_EQ(Freelist->Prev, &SentinelSegment);
+ DCHECK_NE(FreeSegment, &SentinelSegment);
+ return FreeSegment;
+ }
+
+ auto SegmentBlock = Alloc->Allocate();
+ if (SegmentBlock.Data == nullptr)
+ return nullptr;
+
+ // Placement-new the Segment element at the beginning of the SegmentBlock.
+ new (SegmentBlock.Data) Segment{&SentinelSegment, &SentinelSegment, {0}};
+ auto SB = reinterpret_cast<Segment *>(SegmentBlock.Data);
+ return SB;
+ }
+
+ Segment *InitHeadAndTail() XRAY_NEVER_INSTRUMENT {
+ DCHECK_EQ(Head, &SentinelSegment);
+ DCHECK_EQ(Tail, &SentinelSegment);
+ auto S = NewSegment();
+ if (S == nullptr)
+ return nullptr;
+ DCHECK_EQ(S->Next, &SentinelSegment);
+ DCHECK_EQ(S->Prev, &SentinelSegment);
+ DCHECK_NE(S, &SentinelSegment);
+ Head = S;
+ Tail = S;
+ DCHECK_EQ(Head, Tail);
+ DCHECK_EQ(Tail->Next, &SentinelSegment);
+ DCHECK_EQ(Tail->Prev, &SentinelSegment);
+ return S;
+ }
+
+ Segment *AppendNewSegment() XRAY_NEVER_INSTRUMENT {
+ auto S = NewSegment();
+ if (S == nullptr)
+ return nullptr;
+ DCHECK_NE(Tail, &SentinelSegment);
+ DCHECK_EQ(Tail->Next, &SentinelSegment);
+ DCHECK_EQ(S->Prev, &SentinelSegment);
+ DCHECK_EQ(S->Next, &SentinelSegment);
+ S->Prev = Tail;
+ Tail->Next = S;
+ Tail = S;
+ DCHECK_EQ(S, S->Prev->Next);
+ DCHECK_EQ(Tail->Next, &SentinelSegment);
+ return S;
+ }
+
public:
- explicit Array(AllocatorType &A) : Alloc(&A) {}
+ explicit Array(AllocatorType &A) XRAY_NEVER_INSTRUMENT
+ : Alloc(&A),
+ Head(&SentinelSegment),
+ Tail(&SentinelSegment),
+ Freelist(&SentinelSegment),
+ Size(0) {}
+
+ Array() XRAY_NEVER_INSTRUMENT : Alloc(nullptr),
+ Head(&SentinelSegment),
+ Tail(&SentinelSegment),
+ Freelist(&SentinelSegment),
+ Size(0) {}
Array(const Array &) = delete;
- Array(Array &&O) NOEXCEPT : Alloc(O.Alloc),
- Head(O.Head),
- Tail(O.Tail),
- Size(O.Size) {
+ Array &operator=(const Array &) = delete;
+
+ Array(Array &&O) XRAY_NEVER_INSTRUMENT : Alloc(O.Alloc),
+ Head(O.Head),
+ Tail(O.Tail),
+ Freelist(O.Freelist),
+ Size(O.Size) {
+ O.Alloc = nullptr;
O.Head = &SentinelSegment;
O.Tail = &SentinelSegment;
O.Size = 0;
+ O.Freelist = &SentinelSegment;
}
- bool empty() const { return Size == 0; }
+ Array &operator=(Array &&O) XRAY_NEVER_INSTRUMENT {
+ Alloc = O.Alloc;
+ O.Alloc = nullptr;
+ Head = O.Head;
+ O.Head = &SentinelSegment;
+ Tail = O.Tail;
+ O.Tail = &SentinelSegment;
+ Freelist = O.Freelist;
+ O.Freelist = &SentinelSegment;
+ Size = O.Size;
+ O.Size = 0;
+ return *this;
+ }
+
+ ~Array() XRAY_NEVER_INSTRUMENT {
+ for (auto &E : *this)
+ (&E)->~T();
+ }
- AllocatorType &allocator() const {
+ bool empty() const XRAY_NEVER_INSTRUMENT { return Size == 0; }
+
+ AllocatorType &allocator() const XRAY_NEVER_INSTRUMENT {
DCHECK_NE(Alloc, nullptr);
return *Alloc;
}
- size_t size() const { return Size; }
+ uint64_t size() const XRAY_NEVER_INSTRUMENT { return Size; }
- T *Append(const T &E) {
- if (UNLIKELY(Head == &SentinelSegment))
- if (InitHeadAndTail() == nullptr)
+ template <class... Args>
+ T *AppendEmplace(Args &&... args) XRAY_NEVER_INSTRUMENT {
+ DCHECK((Size == 0 && Head == &SentinelSegment && Head == Tail) ||
+ (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment));
+ if (UNLIKELY(Head == &SentinelSegment)) {
+ auto R = InitHeadAndTail();
+ if (R == nullptr)
return nullptr;
+ }
+
+ DCHECK_NE(Head, &SentinelSegment);
+ DCHECK_NE(Tail, &SentinelSegment);
auto Offset = Size % ElementsPerSegment;
if (UNLIKELY(Size != 0 && Offset == 0))
if (AppendNewSegment() == nullptr)
return nullptr;
- auto Base = static_cast<Segment *>(Tail)->Data;
+ DCHECK_NE(Tail, &SentinelSegment);
+ auto Base = &Tail->Data;
auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
- auto Position = reinterpret_cast<T *>(AlignedOffset);
- *Position = E;
+ DCHECK_LE(AlignedOffset + sizeof(T),
+ reinterpret_cast<unsigned char *>(Base) + SegmentSize);
+
+ // In-place construct at Position.
+ new (AlignedOffset) T{std::forward<Args>(args)...};
++Size;
- return Position;
+ return reinterpret_cast<T *>(AlignedOffset);
}
- template <class... Args> T *AppendEmplace(Args &&... args) {
- if (UNLIKELY(Head == &SentinelSegment))
- if (InitHeadAndTail() == nullptr)
+ T *Append(const T &E) XRAY_NEVER_INSTRUMENT {
+    // FIXME: This is a duplication of AppendEmplace with the copy semantics
+    // explicitly used, as a workaround for GCC 4.8 not invoking the copy
+    // constructor when using placement new with braced-init syntax.
+ DCHECK((Size == 0 && Head == &SentinelSegment && Head == Tail) ||
+ (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment));
+ if (UNLIKELY(Head == &SentinelSegment)) {
+ auto R = InitHeadAndTail();
+ if (R == nullptr)
return nullptr;
+ }
+
+ DCHECK_NE(Head, &SentinelSegment);
+ DCHECK_NE(Tail, &SentinelSegment);
auto Offset = Size % ElementsPerSegment;
- auto *LatestSegment = Tail;
- if (UNLIKELY(Size != 0 && Offset == 0)) {
- LatestSegment = AppendNewSegment();
- if (LatestSegment == nullptr)
+ if (UNLIKELY(Size != 0 && Offset == 0))
+ if (AppendNewSegment() == nullptr)
return nullptr;
- }
DCHECK_NE(Tail, &SentinelSegment);
- auto Base = static_cast<Segment *>(LatestSegment)->Data;
+ auto Base = &Tail->Data;
auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
- auto Position = reinterpret_cast<T *>(AlignedOffset);
+ DCHECK_LE(AlignedOffset + sizeof(T),
+ reinterpret_cast<unsigned char *>(Tail) + SegmentSize);
// In-place construct at Position.
- new (Position) T{std::forward<Args>(args)...};
+ new (AlignedOffset) T(E);
++Size;
- return reinterpret_cast<T *>(Position);
+ return reinterpret_cast<T *>(AlignedOffset);
}
- T &operator[](size_t Offset) const {
+ T &operator[](uint64_t Offset) const XRAY_NEVER_INSTRUMENT {
DCHECK_LE(Offset, Size);
// We need to traverse the array enough times to find the element at Offset.
auto S = Head;
@@ -290,19 +421,19 @@ public:
Offset -= ElementsPerSegment;
DCHECK_NE(S, &SentinelSegment);
}
- auto Base = static_cast<Segment *>(S)->Data;
+ auto Base = &S->Data;
auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
auto Position = reinterpret_cast<T *>(AlignedOffset);
return *reinterpret_cast<T *>(Position);
}
- T &front() const {
+ T &front() const XRAY_NEVER_INSTRUMENT {
DCHECK_NE(Head, &SentinelSegment);
DCHECK_NE(Size, 0u);
return *begin();
}
- T &back() const {
+ T &back() const XRAY_NEVER_INSTRUMENT {
DCHECK_NE(Tail, &SentinelSegment);
DCHECK_NE(Size, 0u);
auto It = end();
@@ -310,7 +441,8 @@ public:
return *It;
}
- template <class Predicate> T *find_element(Predicate P) const {
+ template <class Predicate>
+ T *find_element(Predicate P) const XRAY_NEVER_INSTRUMENT {
if (empty())
return nullptr;
@@ -324,51 +456,195 @@ public:
   /// Remove N elements from the end. This leaves the blocks behind, and does
   /// not require allocating new blocks for elements added after trimming.
- void trim(size_t Elements) {
- DCHECK_LE(Elements, Size);
- DCHECK_GT(Size, 0);
+ void trim(uint64_t Elements) XRAY_NEVER_INSTRUMENT {
auto OldSize = Size;
+ Elements = Elements > Size ? Size : Elements;
Size -= Elements;
- DCHECK_NE(Head, &SentinelSegment);
- DCHECK_NE(Tail, &SentinelSegment);
-
- for (auto SegmentsToTrim = (nearest_boundary(OldSize, ElementsPerSegment) -
- nearest_boundary(Size, ElementsPerSegment)) /
- ElementsPerSegment;
- SegmentsToTrim > 0; --SegmentsToTrim) {
+ // We compute the number of segments we're going to return from the tail by
+ // counting how many elements have been trimmed. Given the following:
+ //
+ // - Each segment has N valid positions, where N > 0
+ // - The previous size > current size
+ //
+ // To compute the number of segments to return, we need to perform the
+ // following calculations for the number of segments required given 'x'
+ // elements:
+ //
+ // f(x) = {
+ // x == 0 : 0
+ // , 0 < x <= N : 1
+ // , N < x <= max : x / N + (x % N ? 1 : 0)
+ // }
+ //
+ // We can simplify this down to:
+ //
+ // f(x) = {
+ // x == 0 : 0,
+ // , 0 < x <= max : x / N + (x < N || x % N ? 1 : 0)
+ // }
+ //
+ // And further down to:
+ //
+ // f(x) = x ? x / N + (x < N || x % N ? 1 : 0) : 0
+ //
+ // We can then perform the following calculation `s` which counts the number
+ // of segments we need to remove from the end of the data structure:
+ //
+ // s(p, c) = f(p) - f(c)
+ //
+ // If we treat p = previous size, and c = current size, and given the
+ // properties above, the possible range for s(...) is [0..max(typeof(p))/N]
+ // given that typeof(p) == typeof(c).
+ auto F = [](uint64_t X) {
+ return X ? (X / ElementsPerSegment) +
+ (X < ElementsPerSegment || X % ElementsPerSegment ? 1 : 0)
+ : 0;
+ };
+ auto PS = F(OldSize);
+ auto CS = F(Size);
+ DCHECK_GE(PS, CS);
+ auto SegmentsToTrim = PS - CS;
+ for (auto I = 0uL; I < SegmentsToTrim; ++I) {
+      // Here we place the current tail segment onto the freelist. To do this
+      // appropriately, we need to perform a splice operation on two
+      // doubly-linked lists. In particular, we have the current state of
+      // the doubly-linked list of segments:
+ //
+ // @S@ <- s0 <-> s1 <-> ... <-> sT -> @S@
+ //
DCHECK_NE(Head, &SentinelSegment);
DCHECK_NE(Tail, &SentinelSegment);
- // Put the tail into the Freelist.
- auto *FreeSegment = Tail;
- Tail = Tail->Prev;
- if (Tail == &SentinelSegment)
- Head = Tail;
- else
- Tail->Next = &SentinelSegment;
-
DCHECK_EQ(Tail->Next, &SentinelSegment);
- FreeSegment->Next = Freelist;
- FreeSegment->Prev = &SentinelSegment;
- if (Freelist != &SentinelSegment)
- Freelist->Prev = FreeSegment;
- Freelist = FreeSegment;
+
+ if (Freelist == &SentinelSegment) {
+ // Our two lists at this point are in this configuration:
+ //
+ // Freelist: (potentially) @S@
+ // Mainlist: @S@<-s0<->s1<->...<->sPT<->sT->@S@
+ // ^ Head ^ Tail
+ //
+ // The end state for us will be this configuration:
+ //
+ // Freelist: @S@<-sT->@S@
+ // Mainlist: @S@<-s0<->s1<->...<->sPT->@S@
+ // ^ Head ^ Tail
+ //
+ // The first step for us is to hold a reference to the tail of Mainlist,
+ // which in our notation is represented by sT. We call this our "free
+ // segment" which is the segment we are placing on the Freelist.
+ //
+ // sF = sT
+ //
+ // Then, we also hold a reference to the "pre-tail" element, which we
+ // call sPT:
+ //
+ // sPT = pred(sT)
+ //
+ // We want to splice sT into the beginning of the Freelist, which in
+ // an empty Freelist means placing a segment whose predecessor and
+ // successor is the sentinel segment.
+ //
+ // The splice operation then can be performed in the following
+ // algorithm:
+ //
+ // succ(sPT) = S
+ // pred(sT) = S
+ // succ(sT) = Freelist
+ // Freelist = sT
+ // Tail = sPT
+ //
+ auto SPT = Tail->Prev;
+ SPT->Next = &SentinelSegment;
+ Tail->Prev = &SentinelSegment;
+ Tail->Next = Freelist;
+ Freelist = Tail;
+ Tail = SPT;
+
+ // Our post-conditions here are:
+ DCHECK_EQ(Tail->Next, &SentinelSegment);
+ DCHECK_EQ(Freelist->Prev, &SentinelSegment);
+ } else {
+ // In the other case, where the Freelist is not empty, we perform the
+ // following transformation instead:
+ //
+ // This transforms the current state:
+ //
+ // Freelist: @S@<-f0->@S@
+ // ^ Freelist
+ // Mainlist: @S@<-s0<->s1<->...<->sPT<->sT->@S@
+ // ^ Head ^ Tail
+ //
+ // Into the following:
+ //
+ // Freelist: @S@<-sT<->f0->@S@
+ // ^ Freelist
+ // Mainlist: @S@<-s0<->s1<->...<->sPT->@S@
+ // ^ Head ^ Tail
+ //
+ // The algorithm is:
+ //
+ // sFH = Freelist
+ // sPT = pred(sT)
+      //   pred(sFH) = sT
+ // succ(sT) = Freelist
+ // pred(sT) = S
+ // succ(sPT) = S
+ // Tail = sPT
+ // Freelist = sT
+ //
+ auto SFH = Freelist;
+ auto SPT = Tail->Prev;
+ auto ST = Tail;
+ SFH->Prev = ST;
+ ST->Next = Freelist;
+ ST->Prev = &SentinelSegment;
+ SPT->Next = &SentinelSegment;
+ Tail = SPT;
+ Freelist = ST;
+
+ // Our post-conditions here are:
+ DCHECK_EQ(Tail->Next, &SentinelSegment);
+ DCHECK_EQ(Freelist->Prev, &SentinelSegment);
+ DCHECK_EQ(Freelist->Next->Prev, Freelist);
+ }
}
+
+ // Now in case we've spliced all the segments in the end, we ensure that the
+ // main list is "empty", or both the head and tail pointing to the sentinel
+ // segment.
+ if (Tail == &SentinelSegment)
+ Head = Tail;
+
+ DCHECK(
+ (Size == 0 && Head == &SentinelSegment && Tail == &SentinelSegment) ||
+ (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment));
+ DCHECK(
+ (Freelist != &SentinelSegment && Freelist->Prev == &SentinelSegment) ||
+ (Freelist == &SentinelSegment && Tail->Next == &SentinelSegment));
}
// Provide iterators.
- Iterator<T> begin() const { return Iterator<T>(Head, 0, Size); }
- Iterator<T> end() const { return Iterator<T>(Tail, Size, Size); }
- Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0, Size); }
- Iterator<const T> cend() const { return Iterator<const T>(Tail, Size, Size); }
+ Iterator<T> begin() const XRAY_NEVER_INSTRUMENT {
+ return Iterator<T>(Head, 0, Size);
+ }
+ Iterator<T> end() const XRAY_NEVER_INSTRUMENT {
+ return Iterator<T>(Tail, Size, Size);
+ }
+ Iterator<const T> cbegin() const XRAY_NEVER_INSTRUMENT {
+ return Iterator<const T>(Head, 0, Size);
+ }
+ Iterator<const T> cend() const XRAY_NEVER_INSTRUMENT {
+ return Iterator<const T>(Tail, Size, Size);
+ }
};
// We need to have this storage definition out-of-line so that the compiler can
// ensure that storage for the SentinelSegment is defined and has a single
// address.
template <class T>
-typename Array<T>::SegmentBase Array<T>::SentinelSegment{
- &Array<T>::SentinelSegment, &Array<T>::SentinelSegment};
+typename Array<T>::Segment Array<T>::SentinelSegment{
+ &Array<T>::SentinelSegment, &Array<T>::SentinelSegment, {'\0'}};
} // namespace __xray
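
The trim() comment above derives a closed form for the number of whole segments backing x elements, f(x) = x ? x / N + (x < N || x % N ? 1 : 0) : 0, which equals ceil(x / N) for non-zero x. The standalone sketch below is not part of the patch; it uses a hypothetical ElementsPerSegment of 4 to check that the "segments to trim" count f(OldSize) - f(NewSize) matches what the loop in trim() is expected to return to the freelist.

// Standalone sketch (illustration only): verify the segment-count function
// used by Array<T>::trim(), with a hypothetical N = ElementsPerSegment = 4.
#include <cassert>
#include <cstdint>
#include <cstdio>

static constexpr uint64_t N = 4; // stand-in for ElementsPerSegment

static uint64_t SegmentsFor(uint64_t X) {
  // f(x) = x ? x / N + (x < N || x % N ? 1 : 0) : 0, i.e. ceil(x / N) for x > 0.
  return X ? (X / N) + (X < N || X % N ? 1 : 0) : 0;
}

int main() {
  // Trimming from Old elements down to New elements should return
  // SegmentsFor(Old) - SegmentsFor(New) whole segments to the freelist.
  struct { uint64_t Old, New, Expected; } Cases[] = {
      {0, 0, 0}, {3, 1, 0}, {4, 4, 0}, {5, 4, 1}, {9, 2, 2}, {12, 0, 3}};
  for (const auto &C : Cases) {
    uint64_t ToTrim = SegmentsFor(C.Old) - SegmentsFor(C.New);
    std::printf("old=%llu new=%llu -> trim %llu segment(s)\n",
                (unsigned long long)C.Old, (unsigned long long)C.New,
                (unsigned long long)ToTrim);
    assert(ToTrim == C.Expected);
  }
  return 0;
}
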
diff --git a/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S b/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S
index 99ad3966ee3a..52985ffd19ab 100644
--- a/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S
+++ b/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S
@@ -19,6 +19,7 @@
.macro SAVE_REGISTERS
+ pushfq
subq $240, %rsp
CFI_DEF_CFA_OFFSET(248)
movq %rbp, 232(%rsp)
@@ -69,6 +70,7 @@
movq 8(%rsp), %r14
movq 0(%rsp), %r15
addq $240, %rsp
+ popfq
CFI_DEF_CFA_OFFSET(8)
.endm
@@ -89,10 +91,10 @@
.text
#if !defined(__APPLE__)
.section .text
+ .file "xray_trampoline_x86.S"
#else
.section __TEXT,__text
#endif
- .file "xray_trampoline_x86.S"
//===----------------------------------------------------------------------===//
diff --git a/contrib/compiler-rt/lib/xray/xray_tsc.h b/contrib/compiler-rt/lib/xray/xray_tsc.h
index 4507564e7cd2..180d6df188c1 100644
--- a/contrib/compiler-rt/lib/xray/xray_tsc.h
+++ b/contrib/compiler-rt/lib/xray/xray_tsc.h
@@ -13,10 +13,32 @@
#ifndef XRAY_EMULATE_TSC_H
#define XRAY_EMULATE_TSC_H
+#include "sanitizer_common/sanitizer_common.h"
+
namespace __xray {
static constexpr uint64_t NanosecondsPerSecond = 1000ULL * 1000 * 1000;
}
+#if SANITIZER_FUCHSIA
+#include <zircon/syscalls.h>
+
+namespace __xray {
+
+inline bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
+
+ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT {
+ CPU = 0;
+ return _zx_ticks_get();
+}
+
+inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
+ return _zx_ticks_per_second();
+}
+
+} // namespace __xray
+
+#else // SANITIZER_FUCHSIA
+
#if defined(__x86_64__)
#include "xray_x86_64.inc"
#elif defined(__powerpc64__)
@@ -64,5 +86,6 @@ inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
#else
#error Target architecture is not supported.
#endif // CPU architecture
+#endif // SANITIZER_FUCHSIA
#endif // XRAY_EMULATE_TSC_H
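
On Fuchsia this header maps readTSC() to _zx_ticks_get() with the CPU id pinned to 0, and getTSCFrequency() to _zx_ticks_per_second(); other targets keep their architecture-specific implementations. The sketch below only illustrates how a caller turns a tick delta into nanoseconds using NanosecondsPerSecond from this header; the stand-in frequency and tick values are invented for the example.

// Illustration only: converting a readTSC()-style tick delta to nanoseconds.
// The frequency and tick samples below are made up; a real caller would use
// __xray::readTSC() and __xray::getTSCFrequency().
#include <cstdint>
#include <cstdio>

static constexpr uint64_t NanosecondsPerSecond = 1000ULL * 1000 * 1000;

static uint64_t ticksToNs(uint64_t TickDelta, uint64_t TicksPerSecond) {
  // Multiply before dividing to keep precision; switch to 128-bit math if the
  // product can overflow 64 bits for very long intervals.
  return TickDelta * NanosecondsPerSecond / TicksPerSecond;
}

int main() {
  const uint64_t TicksPerSecond = 3000000000ULL; // assumed 3 GHz tick source
  const uint64_t Start = 1000000, End = 1003000; // hypothetical TSC samples
  std::printf("elapsed: %llu ns\n",
              (unsigned long long)ticksToNs(End - Start, TicksPerSecond));
  return 0;
}
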
diff --git a/contrib/compiler-rt/lib/xray/xray_utils.cc b/contrib/compiler-rt/lib/xray/xray_utils.cc
index 68f4e8c1094c..59ba6c3082b2 100644
--- a/contrib/compiler-rt/lib/xray/xray_utils.cc
+++ b/contrib/compiler-rt/lib/xray/xray_utils.cc
@@ -12,7 +12,9 @@
//===----------------------------------------------------------------------===//
#include "xray_utils.h"
+#include "sanitizer_common/sanitizer_allocator_internal.h"
#include "sanitizer_common/sanitizer_common.h"
+#include "xray_allocator.h"
#include "xray_defs.h"
#include "xray_flags.h"
#include <cstdio>
@@ -25,13 +27,113 @@
#include <unistd.h>
#include <utility>
+#if SANITIZER_FUCHSIA
+#include "sanitizer_common/sanitizer_symbolizer_fuchsia.h"
+
+#include <inttypes.h>
+#include <zircon/process.h>
+#include <zircon/sanitizer.h>
+#include <zircon/status.h>
+#include <zircon/syscalls.h>
+#endif
+
namespace __xray {
-void printToStdErr(const char *Buffer) XRAY_NEVER_INSTRUMENT {
- fprintf(stderr, "%s", Buffer);
+#if SANITIZER_FUCHSIA
+constexpr const char* ProfileSinkName = "llvm-xray";
+
+LogWriter::~LogWriter() {
+ _zx_handle_close(Vmo);
+}
+
+void LogWriter::WriteAll(const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT {
+ if (Begin == End)
+ return;
+ auto TotalBytes = std::distance(Begin, End);
+
+ const size_t PageSize = flags()->xray_page_size_override > 0
+ ? flags()->xray_page_size_override
+ : GetPageSizeCached();
+ if (RoundUpTo(Offset, PageSize) != RoundUpTo(Offset + TotalBytes, PageSize)) {
+ // Resize the VMO to ensure there's sufficient space for the data.
+ zx_status_t Status = _zx_vmo_set_size(Vmo, Offset + TotalBytes);
+ if (Status != ZX_OK) {
+ Report("Failed to resize VMO: %s\n", _zx_status_get_string(Status));
+ return;
+ }
+ }
+
+ // Write the data into VMO.
+ zx_status_t Status = _zx_vmo_write(Vmo, Begin, Offset, TotalBytes);
+ if (Status != ZX_OK) {
+ Report("Failed to write: %s\n", _zx_status_get_string(Status));
+ return;
+ }
+ Offset += TotalBytes;
+}
+
+void LogWriter::Flush() XRAY_NEVER_INSTRUMENT {
+ // Nothing to do here since WriteAll writes directly into the VMO.
+}
+
+LogWriter *LogWriter::Open() XRAY_NEVER_INSTRUMENT {
+ // Create VMO to hold the profile data.
+ zx_handle_t Vmo;
+ zx_status_t Status = _zx_vmo_create(0, 0, &Vmo);
+ if (Status != ZX_OK) {
+ Report("XRay: cannot create VMO: %s\n", _zx_status_get_string(Status));
+ return nullptr;
+ }
+
+ // Get the KOID of the current process to use in the VMO name.
+ zx_info_handle_basic_t Info;
+ Status = _zx_object_get_info(_zx_process_self(), ZX_INFO_HANDLE_BASIC, &Info,
+ sizeof(Info), NULL, NULL);
+ if (Status != ZX_OK) {
+ Report("XRay: cannot get basic info about current process handle: %s\n",
+ _zx_status_get_string(Status));
+ return nullptr;
+ }
+
+ // Give the VMO a name including our process KOID so it's easy to spot.
+ char VmoName[ZX_MAX_NAME_LEN];
+ internal_snprintf(VmoName, sizeof(VmoName), "%s.%zu", ProfileSinkName,
+ Info.koid);
+ _zx_object_set_property(Vmo, ZX_PROP_NAME, VmoName, strlen(VmoName));
+
+ // Duplicate the handle since __sanitizer_publish_data consumes it and
+ // LogWriter needs to hold onto it.
+ zx_handle_t Handle;
+  Status = _zx_handle_duplicate(Vmo, ZX_RIGHT_SAME_RIGHTS, &Handle);
+ if (Status != ZX_OK) {
+ Report("XRay: cannot duplicate VMO handle: %s\n",
+ _zx_status_get_string(Status));
+ return nullptr;
+ }
+
+ // Publish the VMO that receives the logging. Note the VMO's contents can
+ // grow and change after publication. The contents won't be read out until
+ // after the process exits.
+ __sanitizer_publish_data(ProfileSinkName, Handle);
+
+ // Use the dumpfile symbolizer markup element to write the name of the VMO.
+ Report("XRay: " FORMAT_DUMPFILE "\n", ProfileSinkName, VmoName);
+
+ LogWriter *LW = reinterpret_cast<LogWriter *>(InternalAlloc(sizeof(LogWriter)));
+ new (LW) LogWriter(Vmo);
+ return LW;
+}
+
+void LogWriter::Close(LogWriter *LW) {
+ LW->~LogWriter();
+ InternalFree(LW);
+}
+#else // SANITIZER_FUCHSIA
+LogWriter::~LogWriter() {
+ internal_close(Fd);
}
-void retryingWriteAll(int Fd, const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT {
+void LogWriter::WriteAll(const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT {
if (Begin == End)
return;
auto TotalBytes = std::distance(Begin, End);
@@ -49,50 +151,11 @@ void retryingWriteAll(int Fd, const char *Begin, const char *End) XRAY_NEVER_INS
}
}
-std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin,
- char *End) XRAY_NEVER_INSTRUMENT {
- auto BytesToRead = std::distance(Begin, End);
- ssize_t BytesRead;
- ssize_t TotalBytesRead = 0;
- while (BytesToRead && (BytesRead = read(Fd, Begin, BytesToRead))) {
- if (BytesRead == -1) {
- if (errno == EINTR)
- continue;
- Report("Read error; errno = %d\n", errno);
- return std::make_pair(TotalBytesRead, false);
- }
-
- TotalBytesRead += BytesRead;
- BytesToRead -= BytesRead;
- Begin += BytesRead;
- }
- return std::make_pair(TotalBytesRead, true);
-}
-
-bool readValueFromFile(const char *Filename,
- long long *Value) XRAY_NEVER_INSTRUMENT {
- int Fd = open(Filename, O_RDONLY | O_CLOEXEC);
- if (Fd == -1)
- return false;
- static constexpr size_t BufSize = 256;
- char Line[BufSize] = {};
- ssize_t BytesRead;
- bool Success;
- std::tie(BytesRead, Success) = retryingReadSome(Fd, Line, Line + BufSize);
- if (!Success)
- return false;
- close(Fd);
- const char *End = nullptr;
- long long Tmp = internal_simple_strtoll(Line, &End, 10);
- bool Result = false;
- if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) {
- *Value = Tmp;
- Result = true;
- }
- return Result;
+void LogWriter::Flush() XRAY_NEVER_INSTRUMENT {
+ fsync(Fd);
}
-int getLogFD() XRAY_NEVER_INSTRUMENT {
+LogWriter *LogWriter::Open() XRAY_NEVER_INSTRUMENT {
// Open a temporary file once for the log.
char TmpFilename[256] = {};
char TmpWildcardPattern[] = "XXXXXX";
@@ -103,24 +166,31 @@ int getLogFD() XRAY_NEVER_INSTRUMENT {
if (LastSlash != nullptr)
Progname = LastSlash + 1;
- const int HalfLength = sizeof(TmpFilename) / 2 - sizeof(TmpWildcardPattern);
int NeededLength = internal_snprintf(
- TmpFilename, sizeof(TmpFilename), "%.*s%.*s.%s", HalfLength,
- flags()->xray_logfile_base, HalfLength, Progname, TmpWildcardPattern);
+ TmpFilename, sizeof(TmpFilename), "%s%s.%s",
+ flags()->xray_logfile_base, Progname, TmpWildcardPattern);
if (NeededLength > int(sizeof(TmpFilename))) {
Report("XRay log file name too long (%d): %s\n", NeededLength, TmpFilename);
- return -1;
+ return nullptr;
}
int Fd = mkstemp(TmpFilename);
if (Fd == -1) {
Report("XRay: Failed opening temporary file '%s'; not logging events.\n",
TmpFilename);
- return -1;
+ return nullptr;
}
if (Verbosity())
Report("XRay: Log file in '%s'\n", TmpFilename);
- return Fd;
+ LogWriter *LW = allocate<LogWriter>();
+ new (LW) LogWriter(Fd);
+ return LW;
+}
+
+void LogWriter::Close(LogWriter *LW) {
+ LW->~LogWriter();
+ deallocate(LW);
}
+#endif // SANITIZER_FUCHSIA
} // namespace __xray
diff --git a/contrib/compiler-rt/lib/xray/xray_utils.h b/contrib/compiler-rt/lib/xray/xray_utils.h
index eafa16e1a9d5..60438973fbd0 100644
--- a/contrib/compiler-rt/lib/xray/xray_utils.h
+++ b/contrib/compiler-rt/lib/xray/xray_utils.h
@@ -20,23 +20,40 @@
#include <sys/types.h>
#include <utility>
-namespace __xray {
-
-// Default implementation of the reporting interface for sanitizer errors.
-void printToStdErr(const char *Buffer);
-
-// EINTR-safe write routine, provided a file descriptor and a character range.
-void retryingWriteAll(int Fd, const char *Begin, const char *End);
+#include "sanitizer_common/sanitizer_common.h"
+#if SANITIZER_FUCHSIA
+#include <zircon/types.h>
+#endif
-// Reads a long long value from a provided file.
-bool readValueFromFile(const char *Filename, long long *Value);
-
-// EINTR-safe read routine, providing a file descriptor and a character range.
-std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin, char *End);
+namespace __xray {
-// EINTR-safe open routine, uses flag-provided values for initialising a log
-// file.
-int getLogFD();
+class LogWriter {
+public:
+#if SANITIZER_FUCHSIA
+ LogWriter(zx_handle_t Vmo) : Vmo(Vmo) {}
+#else
+ explicit LogWriter(int Fd) : Fd(Fd) {}
+#endif
+ ~LogWriter();
+
+ // Write a character range into a log.
+ void WriteAll(const char *Begin, const char *End);
+
+ void Flush();
+
+ // Returns a new log instance initialized using the flag-provided values.
+ static LogWriter *Open();
+ // Closes and deallocates the log instance.
+ static void Close(LogWriter *LogWriter);
+
+private:
+#if SANITIZER_FUCHSIA
+ zx_handle_t Vmo = ZX_HANDLE_INVALID;
+ uint64_t Offset = 0;
+#else
+ int Fd = -1;
+#endif
+};
constexpr size_t gcd(size_t a, size_t b) {
return (b == 0) ? a : gcd(b, a % b);
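
The header now replaces the old free functions (getLogFD, retryingWriteAll, and friends) with a LogWriter class that hides whether the log is backed by a POSIX file descriptor or a Fuchsia VMO. A minimal sketch of how a logging mode would drive this interface follows; the payload strings and the helper name writeExampleLog are invented for illustration, and error handling is reduced to early returns.

// Illustration only: driving the new LogWriter interface from a logging
// implementation. Builds against the patched xray_utils.h; the data written
// here is a made-up placeholder, not a real XRay log format.
#include "xray_utils.h"

namespace __xray {

bool writeExampleLog() {
  LogWriter *LW = LogWriter::Open(); // temp file on POSIX, VMO on Fuchsia
  if (LW == nullptr)
    return false;

  const char Header[] = "example-header";
  const char Payload[] = "example-payload";

  // WriteAll() takes a [Begin, End) character range; drop the trailing NUL.
  LW->WriteAll(Header, Header + sizeof(Header) - 1);
  LW->WriteAll(Payload, Payload + sizeof(Payload) - 1);
  LW->Flush();

  // Close() destroys the writer and releases the underlying fd/VMO handle.
  LogWriter::Close(LW);
  return true;
}

} // namespace __xray
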
diff --git a/contrib/compiler-rt/lib/xray/xray_x86_64.cc b/contrib/compiler-rt/lib/xray/xray_x86_64.cc
index 51dc4ce43b1c..e63ee1b3bd02 100644
--- a/contrib/compiler-rt/lib/xray/xray_x86_64.cc
+++ b/contrib/compiler-rt/lib/xray/xray_x86_64.cc
@@ -1,15 +1,20 @@
#include "cpuid.h"
#include "sanitizer_common/sanitizer_common.h"
+#if !SANITIZER_FUCHSIA
+#include "sanitizer_common/sanitizer_posix.h"
+#endif
#include "xray_defs.h"
#include "xray_interface_internal.h"
-#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD
+#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC
#include <sys/types.h>
#if SANITIZER_OPENBSD
#include <sys/time.h>
#include <machine/cpu.h>
#endif
#include <sys/sysctl.h>
+#elif SANITIZER_FUCHSIA
+#include <zircon/syscalls.h>
#endif
#include <atomic>
@@ -81,17 +86,20 @@ uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
}
return TSCFrequency == -1 ? 0 : static_cast<uint64_t>(TSCFrequency);
}
-#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD
+#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC
uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
long long TSCFrequency = -1;
size_t tscfreqsz = sizeof(TSCFrequency);
#if SANITIZER_OPENBSD
int Mib[2] = { CTL_MACHDEP, CPU_TSCFREQ };
- if (sysctl(Mib, 2, &TSCFrequency, &tscfreqsz, NULL, 0) != -1) {
+ if (internal_sysctl(Mib, 2, &TSCFrequency, &tscfreqsz, NULL, 0) != -1) {
+#elif SANITIZER_MAC
+ if (internal_sysctlbyname("machdep.tsc.frequency", &TSCFrequency,
+ &tscfreqsz, NULL, 0) != -1) {
#else
- if (sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz,
- NULL, 0) != -1) {
+ if (internal_sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz,
+ NULL, 0) != -1) {
#endif
return static_cast<uint64_t>(TSCFrequency);
} else {
@@ -100,7 +108,7 @@ uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
return 0;
}
-#else
+#elif !SANITIZER_FUCHSIA
uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
/* Not supported */
return 0;
@@ -317,6 +325,7 @@ bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
return false;
}
+#if !SANITIZER_FUCHSIA
// We determine whether the CPU we're running on has the correct features we
// need. In x86_64 this will be rdtscp support.
bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {
@@ -339,5 +348,6 @@ bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {
}
return true;
}
+#endif
} // namespace __xray
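
The last hunk compiles probeRequiredCPUFeatures() out on Fuchsia, which does not need the probe. As the surrounding comment notes, on x86_64 the required feature is rdtscp; the standalone sketch below shows the standard CPUID check for that feature (leaf 0x80000001, EDX bit 27) purely as an illustration of what is being probed, not as the runtime's own code.

// Illustration only: a standalone rdtscp feature check using the standard
// CPUID encoding (leaf 0x80000001, EDX bit 27).
#include <cpuid.h>
#include <cstdio>

static bool hasRdtscp() {
  unsigned int EAX = 0, EBX = 0, ECX = 0, EDX = 0;
  if (!__get_cpuid(0x80000001, &EAX, &EBX, &ECX, &EDX))
    return false; // extended leaf not available
  return (EDX & (1u << 27)) != 0;
}

int main() {
  std::printf("rdtscp: %s\n", hasRdtscp() ? "supported" : "missing");
  return 0;
}
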