Diffstat (limited to 'contrib/compiler-rt/lib/xray')
22 files changed, 2881 insertions, 1629 deletions
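Much of this import replaces XRay's direct mmap calls with small page-granular helpers in xray_allocator.h (allocate<T>, allocateBuffer, deallocateBuffer, initArray) that back the profiling allocator and the FDR BufferQueue, with a Zircon VMO path on Fuchsia. The following is a simplified, standalone sketch of the POSIX side only, using plain mmap/munmap in place of sanitizer_common's internal_mmap/internal_munmap wrappers used by the actual code below; it is illustrative, not the committed implementation.

// Simplified sketch of the page-granular allocation helpers added in
// xray_allocator.h. Plain mmap/munmap stand in for the sanitizer_common
// wrappers; the Fuchsia (Zircon VMO) branch is omitted.
#include <sys/mman.h>
#include <unistd.h>
#include <cstddef>
#include <new>
#include <utility>

template <class T = unsigned char>
T *allocateBuffer(size_t S) {
  size_t PageSize = static_cast<size_t>(sysconf(_SC_PAGESIZE));
  // Round the request up to a whole number of pages.
  size_t RoundedSize = ((S * sizeof(T)) + PageSize - 1) & ~(PageSize - 1);
  void *B = mmap(nullptr, RoundedSize, PROT_READ | PROT_WRITE,
                 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  return B == MAP_FAILED ? nullptr : reinterpret_cast<T *>(B);
}

template <class T>
void deallocateBuffer(T *B, size_t S) {
  if (B == nullptr)
    return;
  size_t PageSize = static_cast<size_t>(sysconf(_SC_PAGESIZE));
  size_t RoundedSize = ((S * sizeof(T)) + PageSize - 1) & ~(PageSize - 1);
  munmap(B, RoundedSize);
}

// Placement-construct N instances in one mapping, mirroring how initArray<T>
// is used for BufferQueue::BufferRep in the diff below.
template <class T, class... U>
T *initArray(size_t N, U &&...Us) {
  auto A = allocateBuffer<T>(N);
  if (A != nullptr)
    for (size_t I = N; I > 0;)
      new (A + (--I)) T(std::forward<U>(Us)...);
  return A;
}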
diff --git a/contrib/compiler-rt/lib/xray/xray_allocator.h b/contrib/compiler-rt/lib/xray/xray_allocator.h index 8244815284a8..907c54542a56 100644 --- a/contrib/compiler-rt/lib/xray/xray_allocator.h +++ b/contrib/compiler-rt/lib/xray/xray_allocator.h @@ -19,18 +19,131 @@ #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_internal_defs.h" #include "sanitizer_common/sanitizer_mutex.h" +#if SANITIZER_FUCHSIA +#include <zircon/process.h> +#include <zircon/status.h> +#include <zircon/syscalls.h> +#else #include "sanitizer_common/sanitizer_posix.h" +#endif +#include "xray_defs.h" #include "xray_utils.h" -#include <sys/mman.h> #include <cstddef> #include <cstdint> +#include <sys/mman.h> + +namespace __xray { + +// We implement our own memory allocation routine which will bypass the +// internal allocator. This allows us to manage the memory directly, using +// mmap'ed memory to back the allocators. +template <class T> T *allocate() XRAY_NEVER_INSTRUMENT { + uptr RoundedSize = RoundUpTo(sizeof(T), GetPageSizeCached()); +#if SANITIZER_FUCHSIA + zx_handle_t Vmo; + zx_status_t Status = _zx_vmo_create(RoundedSize, 0, &Vmo); + if (Status != ZX_OK) { + if (Verbosity()) + Report("XRay Profiling: Failed to create VMO of size %zu: %s\n", + sizeof(T), _zx_status_get_string(Status)); + return nullptr; + } + uintptr_t B; + Status = + _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0, + Vmo, 0, sizeof(T), &B); + _zx_handle_close(Vmo); + if (Status != ZX_OK) { + if (Verbosity()) + Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", sizeof(T), + _zx_status_get_string(Status)); + return nullptr; + } + return reinterpret_cast<T *>(B); +#else + uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + int ErrNo = 0; + if (UNLIKELY(internal_iserror(B, &ErrNo))) { + if (Verbosity()) + Report( + "XRay Profiling: Failed to allocate memory of size %d; Error = %d.\n", + RoundedSize, B); + return nullptr; + } +#endif + return reinterpret_cast<T *>(B); +} -#ifndef MAP_NORESERVE -// no-op on NetBSD (at least), unsupported flag on FreeBSD basically because unneeded -#define MAP_NORESERVE 0 +template <class T> void deallocate(T *B) XRAY_NEVER_INSTRUMENT { + if (B == nullptr) + return; + uptr RoundedSize = RoundUpTo(sizeof(T), GetPageSizeCached()); +#if SANITIZER_FUCHSIA + _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B), + RoundedSize); +#else + internal_munmap(B, RoundedSize); #endif +} -namespace __xray { +template <class T = unsigned char> +T *allocateBuffer(size_t S) XRAY_NEVER_INSTRUMENT { + uptr RoundedSize = RoundUpTo(S * sizeof(T), GetPageSizeCached()); +#if SANITIZER_FUCHSIA + zx_handle_t Vmo; + zx_status_t Status = _zx_vmo_create(RoundedSize, 0, &Vmo); + if (Status != ZX_OK) { + if (Verbosity()) + Report("XRay Profiling: Failed to create VMO of size %zu: %s\n", S, + _zx_status_get_string(Status)); + return nullptr; + } + uintptr_t B; + Status = _zx_vmar_map(_zx_vmar_root_self(), + ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0, Vmo, 0, S, &B); + _zx_handle_close(Vmo); + if (Status != ZX_OK) { + if (Verbosity()) + Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", S, + _zx_status_get_string(Status)); + return nullptr; + } +#else + uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE, + MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); + int ErrNo = 0; + if (UNLIKELY(internal_iserror(B, &ErrNo))) { + if (Verbosity()) + Report( + "XRay Profiling: Failed to allocate memory of size %d; 
Error = %d.\n", + RoundedSize, B); + return nullptr; + } +#endif + return reinterpret_cast<T *>(B); +} + +template <class T> void deallocateBuffer(T *B, size_t S) XRAY_NEVER_INSTRUMENT { + if (B == nullptr) + return; + uptr RoundedSize = RoundUpTo(S * sizeof(T), GetPageSizeCached()); +#if SANITIZER_FUCHSIA + _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B), + RoundedSize); +#else + internal_munmap(B, RoundedSize); +#endif +} + +template <class T, class... U> +T *initArray(size_t N, U &&... Us) XRAY_NEVER_INSTRUMENT { + auto A = allocateBuffer<T>(N); + if (A != nullptr) + while (N > 0) + new (A + (--N)) T(std::forward<U>(Us)...); + return A; +} /// The Allocator type hands out fixed-sized chunks of memory that are /// cache-line aligned and sized. This is useful for placement of @@ -58,20 +171,18 @@ template <size_t N> struct Allocator { }; private: - const size_t MaxMemory{0}; - void *BackingStore = nullptr; - void *AlignedNextBlock = nullptr; + size_t MaxMemory{0}; + unsigned char *BackingStore = nullptr; + unsigned char *AlignedNextBlock = nullptr; size_t AllocatedBlocks = 0; + bool Owned; SpinMutex Mutex{}; - void *Alloc() { + void *Alloc() XRAY_NEVER_INSTRUMENT { SpinMutexLock Lock(&Mutex); if (UNLIKELY(BackingStore == nullptr)) { - BackingStore = reinterpret_cast<void *>( - internal_mmap(NULL, MaxMemory, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE, 0, 0)); - if (BackingStore == MAP_FAILED) { - BackingStore = nullptr; + BackingStore = allocateBuffer(MaxMemory); + if (BackingStore == nullptr) { if (Verbosity()) Report("XRay Profiling: Failed to allocate memory for allocator.\n"); return nullptr; @@ -84,7 +195,7 @@ private: auto AlignedNextBlockNum = nearest_boundary( reinterpret_cast<uintptr_t>(AlignedNextBlock), kCacheLineSize); if (diff(AlignedNextBlockNum, BackingStoreNum) > ptrdiff_t(MaxMemory)) { - munmap(BackingStore, MaxMemory); + deallocateBuffer(BackingStore, MaxMemory); AlignedNextBlock = BackingStore = nullptr; if (Verbosity()) Report("XRay Profiling: Cannot obtain enough memory from " @@ -92,34 +203,83 @@ private: return nullptr; } - AlignedNextBlock = reinterpret_cast<void *>(AlignedNextBlockNum); + AlignedNextBlock = reinterpret_cast<unsigned char *>(AlignedNextBlockNum); // Assert that AlignedNextBlock is cache-line aligned. DCHECK_EQ(reinterpret_cast<uintptr_t>(AlignedNextBlock) % kCacheLineSize, 0); } - if ((AllocatedBlocks * Block::Size) >= MaxMemory) + if (((AllocatedBlocks + 1) * Block::Size) > MaxMemory) return nullptr; // Align the pointer we'd like to return to an appropriate alignment, then // advance the pointer from where to start allocations. 
void *Result = AlignedNextBlock; - AlignedNextBlock = reinterpret_cast<void *>( - reinterpret_cast<char *>(AlignedNextBlock) + N); + AlignedNextBlock = + reinterpret_cast<unsigned char *>(AlignedNextBlock) + Block::Size; ++AllocatedBlocks; return Result; } public: - explicit Allocator(size_t M) - : MaxMemory(nearest_boundary(M, kCacheLineSize)) {} + explicit Allocator(size_t M) XRAY_NEVER_INSTRUMENT + : MaxMemory(RoundUpTo(M, kCacheLineSize)), + BackingStore(nullptr), + AlignedNextBlock(nullptr), + AllocatedBlocks(0), + Owned(true), + Mutex() {} + + explicit Allocator(void *P, size_t M) XRAY_NEVER_INSTRUMENT + : MaxMemory(M), + BackingStore(reinterpret_cast<unsigned char *>(P)), + AlignedNextBlock(reinterpret_cast<unsigned char *>(P)), + AllocatedBlocks(0), + Owned(false), + Mutex() {} + + Allocator(const Allocator &) = delete; + Allocator &operator=(const Allocator &) = delete; + + Allocator(Allocator &&O) XRAY_NEVER_INSTRUMENT { + SpinMutexLock L0(&Mutex); + SpinMutexLock L1(&O.Mutex); + MaxMemory = O.MaxMemory; + O.MaxMemory = 0; + BackingStore = O.BackingStore; + O.BackingStore = nullptr; + AlignedNextBlock = O.AlignedNextBlock; + O.AlignedNextBlock = nullptr; + AllocatedBlocks = O.AllocatedBlocks; + O.AllocatedBlocks = 0; + Owned = O.Owned; + O.Owned = false; + } + + Allocator &operator=(Allocator &&O) XRAY_NEVER_INSTRUMENT { + SpinMutexLock L0(&Mutex); + SpinMutexLock L1(&O.Mutex); + MaxMemory = O.MaxMemory; + O.MaxMemory = 0; + if (BackingStore != nullptr) + deallocateBuffer(BackingStore, MaxMemory); + BackingStore = O.BackingStore; + O.BackingStore = nullptr; + AlignedNextBlock = O.AlignedNextBlock; + O.AlignedNextBlock = nullptr; + AllocatedBlocks = O.AllocatedBlocks; + O.AllocatedBlocks = 0; + Owned = O.Owned; + O.Owned = false; + return *this; + } - Block Allocate() { return {Alloc()}; } + Block Allocate() XRAY_NEVER_INSTRUMENT { return {Alloc()}; } - ~Allocator() NOEXCEPT { - if (BackingStore != nullptr) { - internal_munmap(BackingStore, MaxMemory); + ~Allocator() NOEXCEPT XRAY_NEVER_INSTRUMENT { + if (Owned && BackingStore != nullptr) { + deallocateBuffer(BackingStore, MaxMemory); } } }; diff --git a/contrib/compiler-rt/lib/xray/xray_basic_logging.cc b/contrib/compiler-rt/lib/xray/xray_basic_logging.cc index 585ca641cd0c..ae1cc0ba79dd 100644 --- a/contrib/compiler-rt/lib/xray/xray_basic_logging.cc +++ b/contrib/compiler-rt/lib/xray/xray_basic_logging.cc @@ -19,7 +19,9 @@ #include <fcntl.h> #include <pthread.h> #include <sys/stat.h> +#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC #include <sys/syscall.h> +#endif #include <sys/types.h> #include <time.h> #include <unistd.h> @@ -38,8 +40,9 @@ namespace __xray { -SpinMutex LogMutex; +static SpinMutex LogMutex; +namespace { // We use elements of this type to record the entry TSC of every function ID we // see as we're tracing a particular thread's execution. 
struct alignas(16) StackEntry { @@ -52,21 +55,28 @@ struct alignas(16) StackEntry { static_assert(sizeof(StackEntry) == 16, "Wrong size for StackEntry"); -struct alignas(64) ThreadLocalData { +struct XRAY_TLS_ALIGNAS(64) ThreadLocalData { void *InMemoryBuffer = nullptr; size_t BufferSize = 0; size_t BufferOffset = 0; void *ShadowStack = nullptr; size_t StackSize = 0; size_t StackEntries = 0; - int Fd = -1; + __xray::LogWriter *LogWriter = nullptr; }; +struct BasicLoggingOptions { + int DurationFilterMicros = 0; + size_t MaxStackDepth = 0; + size_t ThreadBufferSize = 0; +}; +} // namespace + static pthread_key_t PThreadKey; static atomic_uint8_t BasicInitialized{0}; -BasicLoggingOptions GlobalOptions; +struct BasicLoggingOptions GlobalOptions; thread_local atomic_uint8_t Guard{0}; @@ -75,10 +85,10 @@ static atomic_uint64_t ThresholdTicks{0}; static atomic_uint64_t TicksPerSec{0}; static atomic_uint64_t CycleFrequency{NanosecondsPerSecond}; -static int openLogFile() XRAY_NEVER_INSTRUMENT { - int F = getLogFD(); - if (F == -1) - return -1; +static LogWriter *getLog() XRAY_NEVER_INSTRUMENT { + LogWriter* LW = LogWriter::Open(); + if (LW == nullptr) + return LW; static pthread_once_t DetectOnce = PTHREAD_ONCE_INIT; pthread_once(&DetectOnce, +[] { @@ -100,16 +110,16 @@ static int openLogFile() XRAY_NEVER_INSTRUMENT { // before setting the values in the header. Header.ConstantTSC = 1; Header.NonstopTSC = 1; - retryingWriteAll(F, reinterpret_cast<char *>(&Header), - reinterpret_cast<char *>(&Header) + sizeof(Header)); - return F; + LW->WriteAll(reinterpret_cast<char *>(&Header), + reinterpret_cast<char *>(&Header) + sizeof(Header)); + return LW; } -static int getGlobalFd() XRAY_NEVER_INSTRUMENT { +static LogWriter *getGlobalLog() XRAY_NEVER_INSTRUMENT { static pthread_once_t OnceInit = PTHREAD_ONCE_INIT; - static int Fd = 0; - pthread_once(&OnceInit, +[] { Fd = openLogFile(); }); - return Fd; + static LogWriter *LW = nullptr; + pthread_once(&OnceInit, +[] { LW = getLog(); }); + return LW; } static ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { @@ -121,7 +131,7 @@ static ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { return false; } pthread_setspecific(PThreadKey, &TLD); - TLD.Fd = getGlobalFd(); + TLD.LogWriter = getGlobalLog(); TLD.InMemoryBuffer = reinterpret_cast<XRayRecord *>( InternalAlloc(sizeof(XRayRecord) * GlobalOptions.ThreadBufferSize, nullptr, alignof(XRayRecord))); @@ -149,8 +159,8 @@ template <class RDTSC> void InMemoryRawLog(int32_t FuncId, XRayEntryType Type, RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT { auto &TLD = getThreadLocalData(); - int Fd = getGlobalFd(); - if (Fd == -1) + LogWriter *LW = getGlobalLog(); + if (LW == nullptr) return; // Use a simple recursion guard, to handle cases where we're already logging @@ -234,9 +244,9 @@ void InMemoryRawLog(int32_t FuncId, XRayEntryType Type, auto FirstEntry = reinterpret_cast<XRayRecord *>(TLD.InMemoryBuffer); internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R)); if (++TLD.BufferOffset == TLD.BufferSize) { - SpinMutexLock L(&LogMutex); - retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry), - reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); + SpinMutexLock Lock(&LogMutex); + LW->WriteAll(reinterpret_cast<char *>(FirstEntry), + reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); TLD.BufferOffset = 0; TLD.StackEntries = 0; } @@ -249,17 +259,17 @@ void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1, auto FirstEntry = reinterpret_cast<XRayArgPayload 
*>(TLD.InMemoryBuffer); const auto &BuffLen = TLD.BufferSize; - int Fd = getGlobalFd(); - if (Fd == -1) + LogWriter *LW = getGlobalLog(); + if (LW == nullptr) return; // First we check whether there's enough space to write the data consecutively // in the thread-local buffer. If not, we first flush the buffer before // attempting to write the two records that must be consecutive. if (TLD.BufferOffset + 2 > BuffLen) { - SpinMutexLock L(&LogMutex); - retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry), - reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); + SpinMutexLock Lock(&LogMutex); + LW->WriteAll(reinterpret_cast<char *>(FirstEntry), + reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); TLD.BufferOffset = 0; TLD.StackEntries = 0; } @@ -280,9 +290,9 @@ void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1, R.Arg = Arg1; internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R)); if (++TLD.BufferOffset == BuffLen) { - SpinMutexLock L(&LogMutex); - retryingWriteAll(Fd, reinterpret_cast<char *>(FirstEntry), - reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); + SpinMutexLock Lock(&LogMutex); + LW->WriteAll(reinterpret_cast<char *>(FirstEntry), + reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); TLD.BufferOffset = 0; TLD.StackEntries = 0; } @@ -339,29 +349,29 @@ static void TLDDestructor(void *P) XRAY_NEVER_INSTRUMENT { Report("Cleaned up log for TID: %d\n", GetTid()); }); - if (TLD.Fd == -1 || TLD.BufferOffset == 0) { + if (TLD.LogWriter == nullptr || TLD.BufferOffset == 0) { if (Verbosity()) - Report("Skipping buffer for TID: %d; Fd = %d; Offset = %llu\n", GetTid(), - TLD.Fd, TLD.BufferOffset); + Report("Skipping buffer for TID: %d; Offset = %llu\n", GetTid(), + TLD.BufferOffset); return; } { SpinMutexLock L(&LogMutex); - retryingWriteAll(TLD.Fd, reinterpret_cast<char *>(TLD.InMemoryBuffer), - reinterpret_cast<char *>(TLD.InMemoryBuffer) + - (sizeof(XRayRecord) * TLD.BufferOffset)); + TLD.LogWriter->WriteAll(reinterpret_cast<char *>(TLD.InMemoryBuffer), + reinterpret_cast<char *>(TLD.InMemoryBuffer) + + (sizeof(XRayRecord) * TLD.BufferOffset)); } // Because this thread's exit could be the last one trying to write to // the file and that we're not able to close out the file properly, we // sync instead and hope that the pending writes are flushed as the // thread exits. - fsync(TLD.Fd); + TLD.LogWriter->Flush(); } -XRayLogInitStatus basicLoggingInit(size_t BufferSize, size_t BufferMax, - void *Options, +XRayLogInitStatus basicLoggingInit(UNUSED size_t BufferSize, + UNUSED size_t BufferMax, void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT { uint8_t Expected = 0; if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 1, @@ -385,43 +395,32 @@ XRayLogInitStatus basicLoggingInit(size_t BufferSize, size_t BufferMax, "using emulation instead.\n"); }); - if (BufferSize == 0 && BufferMax == 0 && Options != nullptr) { - FlagParser P; - BasicFlags F; - F.setDefaults(); - registerXRayBasicFlags(&P, &F); - P.ParseString(useCompilerDefinedBasicFlags()); - auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS"); - if (EnvOpts == nullptr) - EnvOpts = ""; - - P.ParseString(EnvOpts); - - // If XRAY_BASIC_OPTIONS was not defined, then we use the deprecated options - // set through XRAY_OPTIONS instead. 
- if (internal_strlen(EnvOpts) == 0) { - F.func_duration_threshold_us = - flags()->xray_naive_log_func_duration_threshold_us; - F.max_stack_depth = flags()->xray_naive_log_max_stack_depth; - F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size; - } - - P.ParseString(static_cast<const char *>(Options)); - GlobalOptions.ThreadBufferSize = F.thread_buffer_size; - GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us; - GlobalOptions.MaxStackDepth = F.max_stack_depth; - *basicFlags() = F; - } else if (OptionsSize != sizeof(BasicLoggingOptions)) { - Report("Invalid options size, potential ABI mismatch; expected %d got %d", - sizeof(BasicLoggingOptions), OptionsSize); - return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; - } else { - if (Verbosity()) - Report("XRay Basic: struct-based init is deprecated, please use " - "string-based configuration instead.\n"); - GlobalOptions = *reinterpret_cast<BasicLoggingOptions *>(Options); + FlagParser P; + BasicFlags F; + F.setDefaults(); + registerXRayBasicFlags(&P, &F); + P.ParseString(useCompilerDefinedBasicFlags()); + auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS"); + if (EnvOpts == nullptr) + EnvOpts = ""; + + P.ParseString(EnvOpts); + + // If XRAY_BASIC_OPTIONS was not defined, then we use the deprecated options + // set through XRAY_OPTIONS instead. + if (internal_strlen(EnvOpts) == 0) { + F.func_duration_threshold_us = + flags()->xray_naive_log_func_duration_threshold_us; + F.max_stack_depth = flags()->xray_naive_log_max_stack_depth; + F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size; } + P.ParseString(static_cast<const char *>(Options)); + GlobalOptions.ThreadBufferSize = F.thread_buffer_size; + GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us; + GlobalOptions.MaxStackDepth = F.max_stack_depth; + *basicFlags() = F; + atomic_store(&ThresholdTicks, atomic_load(&TicksPerSec, memory_order_acquire) * GlobalOptions.DurationFilterMicros / 1000000, diff --git a/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc b/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc index 3ce728900787..7d0e5a1f323c 100644 --- a/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc +++ b/contrib/compiler-rt/lib/xray/xray_buffer_queue.cc @@ -13,141 +13,206 @@ // //===----------------------------------------------------------------------===// #include "xray_buffer_queue.h" +#include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_libc.h" +#if !SANITIZER_FUCHSIA #include "sanitizer_common/sanitizer_posix.h" +#endif +#include "xray_allocator.h" +#include "xray_defs.h" #include <memory> #include <sys/mman.h> -#ifndef MAP_NORESERVE -// no-op on NetBSD (at least), unsupported flag on FreeBSD -#define MAP_NORESERVE 0 -#endif - using namespace __xray; -using namespace __sanitizer; - -template <class T> static T *allocRaw(size_t N) { - // TODO: Report errors? - // We use MAP_NORESERVE on platforms where it's supported to ensure that the - // pages we're allocating for XRay never end up in pages that can be swapped - // in/out. We're doing this because for FDR mode, we want to ensure that - // writes to the buffers stay resident in memory to prevent XRay itself from - // causing swapping/thrashing. - // - // In the case when XRay pages cannot be swapped in/out or there's not enough - // RAM to back these pages, we're willing to cause a segmentation fault - // instead of introducing latency in the measurement. 
We assume here that - // there are enough pages that are swappable in/out outside of the buffers - // being used by FDR mode (which are bounded and configurable anyway) to allow - // us to keep using always-resident memory. - // - // TODO: Make this configurable? - void *A = reinterpret_cast<void *>( - internal_mmap(NULL, N * sizeof(T), PROT_WRITE | PROT_READ, - MAP_ANONYMOUS | MAP_PRIVATE | MAP_NORESERVE, -1, 0)); - return (A == MAP_FAILED) ? nullptr : reinterpret_cast<T *>(A); -} -template <class T> static void deallocRaw(T *ptr, size_t N) { - // TODO: Report errors? - if (ptr != nullptr) - internal_munmap(ptr, N); +namespace { + +BufferQueue::ControlBlock *allocControlBlock(size_t Size, size_t Count) { + auto B = + allocateBuffer((sizeof(BufferQueue::ControlBlock) - 1) + (Size * Count)); + return B == nullptr ? nullptr + : reinterpret_cast<BufferQueue::ControlBlock *>(B); } -template <class T> static T *initArray(size_t N) { - auto A = allocRaw<T>(N); - if (A != nullptr) - while (N > 0) - new (A + (--N)) T(); - return A; +void deallocControlBlock(BufferQueue::ControlBlock *C, size_t Size, + size_t Count) { + deallocateBuffer(reinterpret_cast<unsigned char *>(C), + (sizeof(BufferQueue::ControlBlock) - 1) + (Size * Count)); } -BufferQueue::BufferQueue(size_t B, size_t N, bool &Success) - : BufferSize(B), Buffers(initArray<BufferQueue::BufferRep>(N)), - BufferCount(N), Finalizing{0}, OwnedBuffers(initArray<void *>(N)), - Next(Buffers), First(Buffers), LiveBuffers(0) { - if (Buffers == nullptr) { - Success = false; +void decRefCount(BufferQueue::ControlBlock *C, size_t Size, size_t Count) { + if (C == nullptr) return; - } - if (OwnedBuffers == nullptr) { - // Clean up the buffers we've already allocated. - for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B) - B->~BufferRep(); - deallocRaw(Buffers, N); - Success = false; + if (atomic_fetch_sub(&C->RefCount, 1, memory_order_acq_rel) == 1) + deallocControlBlock(C, Size, Count); +} + +void incRefCount(BufferQueue::ControlBlock *C) { + if (C == nullptr) return; + atomic_fetch_add(&C->RefCount, 1, memory_order_acq_rel); +} + +// We use a struct to ensure that we are allocating one atomic_uint64_t per +// cache line. This allows us to not worry about false-sharing among atomic +// objects being updated (constantly) by different threads. 
+struct ExtentsPadded { + union { + atomic_uint64_t Extents; + unsigned char Storage[kCacheLineSize]; }; +}; - for (size_t i = 0; i < N; ++i) { - auto &T = Buffers[i]; - void *Tmp = allocRaw<char>(BufferSize); - if (Tmp == nullptr) { - Success = false; +constexpr size_t kExtentsSize = sizeof(ExtentsPadded); + +} // namespace + +BufferQueue::ErrorCode BufferQueue::init(size_t BS, size_t BC) { + SpinMutexLock Guard(&Mutex); + + if (!finalizing()) + return BufferQueue::ErrorCode::AlreadyInitialized; + + cleanupBuffers(); + + bool Success = false; + BufferSize = BS; + BufferCount = BC; + + BackingStore = allocControlBlock(BufferSize, BufferCount); + if (BackingStore == nullptr) + return BufferQueue::ErrorCode::NotEnoughMemory; + + auto CleanupBackingStore = at_scope_exit([&, this] { + if (Success) return; - } - auto *Extents = allocRaw<BufferExtents>(1); - if (Extents == nullptr) { - Success = false; + deallocControlBlock(BackingStore, BufferSize, BufferCount); + BackingStore = nullptr; + }); + + // Initialize enough atomic_uint64_t instances, each + ExtentsBackingStore = allocControlBlock(kExtentsSize, BufferCount); + if (ExtentsBackingStore == nullptr) + return BufferQueue::ErrorCode::NotEnoughMemory; + + auto CleanupExtentsBackingStore = at_scope_exit([&, this] { + if (Success) return; - } + deallocControlBlock(ExtentsBackingStore, kExtentsSize, BufferCount); + ExtentsBackingStore = nullptr; + }); + + Buffers = initArray<BufferRep>(BufferCount); + if (Buffers == nullptr) + return BufferQueue::ErrorCode::NotEnoughMemory; + + // At this point we increment the generation number to associate the buffers + // to the new generation. + atomic_fetch_add(&Generation, 1, memory_order_acq_rel); + + // First, we initialize the refcount in the ControlBlock, which we treat as + // being at the start of the BackingStore pointer. + atomic_store(&BackingStore->RefCount, 1, memory_order_release); + atomic_store(&ExtentsBackingStore->RefCount, 1, memory_order_release); + + // Then we initialise the individual buffers that sub-divide the whole backing + // store. Each buffer will start at the `Data` member of the ControlBlock, and + // will be offsets from these locations. 
+ for (size_t i = 0; i < BufferCount; ++i) { + auto &T = Buffers[i]; auto &Buf = T.Buff; - Buf.Data = Tmp; - Buf.Size = B; - Buf.Extents = Extents; - OwnedBuffers[i] = Tmp; + auto *E = reinterpret_cast<ExtentsPadded *>(&ExtentsBackingStore->Data + + (kExtentsSize * i)); + Buf.Extents = &E->Extents; + atomic_store(Buf.Extents, 0, memory_order_release); + Buf.Generation = generation(); + Buf.Data = &BackingStore->Data + (BufferSize * i); + Buf.Size = BufferSize; + Buf.BackingStore = BackingStore; + Buf.ExtentsBackingStore = ExtentsBackingStore; + Buf.Count = BufferCount; + T.Used = false; } + + Next = Buffers; + First = Buffers; + LiveBuffers = 0; + atomic_store(&Finalizing, 0, memory_order_release); Success = true; + return BufferQueue::ErrorCode::Ok; +} + +BufferQueue::BufferQueue(size_t B, size_t N, + bool &Success) XRAY_NEVER_INSTRUMENT + : BufferSize(B), + BufferCount(N), + Mutex(), + Finalizing{1}, + BackingStore(nullptr), + ExtentsBackingStore(nullptr), + Buffers(nullptr), + Next(Buffers), + First(Buffers), + LiveBuffers(0), + Generation{0} { + Success = init(B, N) == BufferQueue::ErrorCode::Ok; } BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) { if (atomic_load(&Finalizing, memory_order_acquire)) return ErrorCode::QueueFinalizing; - SpinMutexLock Guard(&Mutex); - if (LiveBuffers == BufferCount) - return ErrorCode::NotEnoughMemory; - auto &T = *Next; - auto &B = T.Buff; - Buf = B; - T.Used = true; - ++LiveBuffers; - - if (++Next == (Buffers + BufferCount)) - Next = Buffers; + BufferRep *B = nullptr; + { + SpinMutexLock Guard(&Mutex); + if (LiveBuffers == BufferCount) + return ErrorCode::NotEnoughMemory; + B = Next++; + if (Next == (Buffers + BufferCount)) + Next = Buffers; + ++LiveBuffers; + } + incRefCount(BackingStore); + incRefCount(ExtentsBackingStore); + Buf = B->Buff; + Buf.Generation = generation(); + B->Used = true; return ErrorCode::Ok; } BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) { - // Blitz through the buffers array to find the buffer. - bool Found = false; - for (auto I = OwnedBuffers, E = OwnedBuffers + BufferCount; I != E; ++I) { - if (*I == Buf.Data) { - Found = true; - break; + // Check whether the buffer being referred to is within the bounds of the + // backing store's range. + BufferRep *B = nullptr; + { + SpinMutexLock Guard(&Mutex); + if (Buf.Generation != generation() || LiveBuffers == 0) { + Buf = {}; + decRefCount(Buf.BackingStore, Buf.Size, Buf.Count); + decRefCount(Buf.ExtentsBackingStore, kExtentsSize, Buf.Count); + return BufferQueue::ErrorCode::Ok; } - } - if (!Found) - return ErrorCode::UnrecognizedBuffer; - SpinMutexLock Guard(&Mutex); + if (Buf.Data < &BackingStore->Data || + Buf.Data > &BackingStore->Data + (BufferCount * BufferSize)) + return BufferQueue::ErrorCode::UnrecognizedBuffer; - // This points to a semantic bug, we really ought to not be releasing more - // buffers than we actually get. - if (LiveBuffers == 0) - return ErrorCode::NotEnoughMemory; + --LiveBuffers; + B = First++; + if (First == (Buffers + BufferCount)) + First = Buffers; + } // Now that the buffer has been released, we mark it as "used". 
- First->Buff = Buf; - First->Used = true; - Buf.Data = nullptr; - Buf.Size = 0; - --LiveBuffers; - if (++First == (Buffers + BufferCount)) - First = Buffers; - + B->Buff = Buf; + B->Used = true; + decRefCount(Buf.BackingStore, Buf.Size, Buf.Count); + decRefCount(Buf.ExtentsBackingStore, kExtentsSize, Buf.Count); + atomic_store(B->Buff.Extents, atomic_load(Buf.Extents, memory_order_acquire), + memory_order_release); + Buf = {}; return ErrorCode::Ok; } @@ -157,15 +222,17 @@ BufferQueue::ErrorCode BufferQueue::finalize() { return ErrorCode::Ok; } -BufferQueue::~BufferQueue() { - for (auto I = Buffers, E = Buffers + BufferCount; I != E; ++I) { - auto &T = *I; - auto &Buf = T.Buff; - deallocRaw(Buf.Data, Buf.Size); - deallocRaw(Buf.Extents, 1); - } +void BufferQueue::cleanupBuffers() { for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B) B->~BufferRep(); - deallocRaw(Buffers, BufferCount); - deallocRaw(OwnedBuffers, BufferCount); + deallocateBuffer(Buffers, BufferCount); + decRefCount(BackingStore, BufferSize, BufferCount); + decRefCount(ExtentsBackingStore, kExtentsSize, BufferCount); + BackingStore = nullptr; + ExtentsBackingStore = nullptr; + Buffers = nullptr; + BufferCount = 0; + BufferSize = 0; } + +BufferQueue::~BufferQueue() { cleanupBuffers(); } diff --git a/contrib/compiler-rt/lib/xray/xray_buffer_queue.h b/contrib/compiler-rt/lib/xray/xray_buffer_queue.h index e76fa7983c90..ef2b433f9a3f 100644 --- a/contrib/compiler-rt/lib/xray/xray_buffer_queue.h +++ b/contrib/compiler-rt/lib/xray/xray_buffer_queue.h @@ -18,25 +18,51 @@ #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_common.h" #include "sanitizer_common/sanitizer_mutex.h" +#include "xray_defs.h" #include <cstddef> +#include <cstdint> namespace __xray { /// BufferQueue implements a circular queue of fixed sized buffers (much like a -/// freelist) but is concerned mostly with making it really quick to initialise, -/// finalise, and get/return buffers to the queue. This is one key component of -/// the "flight data recorder" (FDR) mode to support ongoing XRay function call +/// freelist) but is concerned with making it quick to initialise, finalise, and +/// get from or return buffers to the queue. This is one key component of the +/// "flight data recorder" (FDR) mode to support ongoing XRay function call /// trace collection. class BufferQueue { public: - struct alignas(64) BufferExtents { - atomic_uint64_t Size; + /// ControlBlock represents the memory layout of how we interpret the backing + /// store for all buffers and extents managed by a BufferQueue instance. The + /// ControlBlock has the reference count as the first member, sized according + /// to platform-specific cache-line size. We never use the Buffer member of + /// the union, which is only there for compiler-supported alignment and + /// sizing. + /// + /// This ensures that the `Data` member will be placed at least kCacheLineSize + /// bytes from the beginning of the structure. + struct ControlBlock { + union { + atomic_uint64_t RefCount; + char Buffer[kCacheLineSize]; + }; + + /// We need to make this size 1, to conform to the C++ rules for array data + /// members. Typically, we want to subtract this 1 byte for sizing + /// information. 
+ char Data[1]; }; struct Buffer { + atomic_uint64_t *Extents = nullptr; + uint64_t Generation{0}; void *Data = nullptr; size_t Size = 0; - BufferExtents *Extents; + + private: + friend class BufferQueue; + ControlBlock *BackingStore = nullptr; + ControlBlock *ExtentsBackingStore = nullptr; + size_t Count = 0; }; struct BufferRep { @@ -76,8 +102,10 @@ private: T *operator->() const { return &(Buffers[Offset].Buff); } - Iterator(BufferRep *Root, size_t O, size_t M) - : Buffers(Root), Offset(O), Max(M) { + Iterator(BufferRep *Root, size_t O, size_t M) XRAY_NEVER_INSTRUMENT + : Buffers(Root), + Offset(O), + Max(M) { // We want to advance to the first Offset where the 'Used' property is // true, or to the end of the list/queue. while (!Buffers[Offset].Used && Offset != Max) { @@ -107,16 +135,20 @@ private: // Size of each individual Buffer. size_t BufferSize; - BufferRep *Buffers; - // Amount of pre-allocated buffers. size_t BufferCount; SpinMutex Mutex; atomic_uint8_t Finalizing; - // Pointers to buffers managed/owned by the BufferQueue. - void **OwnedBuffers; + // The collocated ControlBlock and buffer storage. + ControlBlock *BackingStore; + + // The collocated ControlBlock and extents storage. + ControlBlock *ExtentsBackingStore; + + // A dynamically allocated array of BufferRep instances. + BufferRep *Buffers; // Pointer to the next buffer to be handed out. BufferRep *Next; @@ -128,6 +160,13 @@ private: // Count of buffers that have been handed out through 'getBuffer'. size_t LiveBuffers; + // We use a generation number to identify buffers and which generation they're + // associated with. + atomic_uint64_t Generation; + + /// Releases references to the buffers backed by the current buffer queue. + void cleanupBuffers(); + public: enum class ErrorCode : unsigned { Ok, @@ -135,6 +174,7 @@ public: QueueFinalizing, UnrecognizedBuffer, AlreadyFinalized, + AlreadyInitialized, }; static const char *getErrorString(ErrorCode E) { @@ -149,6 +189,8 @@ public: return "buffer being returned not owned by buffer queue"; case ErrorCode::AlreadyFinalized: return "queue already finalized"; + case ErrorCode::AlreadyInitialized: + return "queue already initialized"; } return "unknown error"; } @@ -179,10 +221,23 @@ public: /// the buffer being released. ErrorCode releaseBuffer(Buffer &Buf); + /// Initializes the buffer queue, starting a new generation. We can re-set the + /// size of buffers with |BS| along with the buffer count with |BC|. + /// + /// Returns: + /// - ErrorCode::Ok when we successfully initialize the buffer. This + /// requires that the buffer queue is previously finalized. + /// - ErrorCode::AlreadyInitialized when the buffer queue is not finalized. + ErrorCode init(size_t BS, size_t BC); + bool finalizing() const { return atomic_load(&Finalizing, memory_order_acquire); } + uint64_t generation() const { + return atomic_load(&Generation, memory_order_acquire); + } + /// Returns the configured size of the buffers in the buffer queue. size_t ConfiguredBufferSize() const { return BufferSize; } @@ -198,7 +253,7 @@ public: /// Applies the provided function F to each Buffer in the queue, only if the /// Buffer is marked 'used' (i.e. has been the result of getBuffer(...) and a /// releaseBuffer(...) operation). 
- template <class F> void apply(F Fn) { + template <class F> void apply(F Fn) XRAY_NEVER_INSTRUMENT { SpinMutexLock G(&Mutex); for (auto I = begin(), E = end(); I != E; ++I) Fn(*I); diff --git a/contrib/compiler-rt/lib/xray/xray_defs.h b/contrib/compiler-rt/lib/xray/xray_defs.h index e5c37c0665db..c009bcc879f1 100644 --- a/contrib/compiler-rt/lib/xray/xray_defs.h +++ b/contrib/compiler-rt/lib/xray/xray_defs.h @@ -19,4 +19,14 @@ #define XRAY_NEVER_INSTRUMENT #endif +#if SANITIZER_NETBSD +// NetBSD: thread_local is not aligned properly, and the code relying +// on it segfaults +#define XRAY_TLS_ALIGNAS(x) +#define XRAY_HAS_TLS_ALIGNAS 0 +#else +#define XRAY_TLS_ALIGNAS(x) alignas(x) +#define XRAY_HAS_TLS_ALIGNAS 1 +#endif + #endif // XRAY_XRAY_DEFS_H diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_controller.h b/contrib/compiler-rt/lib/xray/xray_fdr_controller.h new file mode 100644 index 000000000000..d44d0309b373 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_fdr_controller.h @@ -0,0 +1,373 @@ +//===-- xray_fdr_controller.h ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// +#ifndef COMPILER_RT_LIB_XRAY_XRAY_FDR_CONTROLLER_H_ +#define COMPILER_RT_LIB_XRAY_XRAY_FDR_CONTROLLER_H_ + +#include <limits> +#include <time.h> + +#include "xray/xray_interface.h" +#include "xray/xray_records.h" +#include "xray_buffer_queue.h" +#include "xray_fdr_log_writer.h" + +namespace __xray { + +template <size_t Version = 5> class FDRController { + BufferQueue *BQ; + BufferQueue::Buffer &B; + FDRLogWriter &W; + int (*WallClockReader)(clockid_t, struct timespec *) = 0; + uint64_t CycleThreshold = 0; + + uint64_t LastFunctionEntryTSC = 0; + uint64_t LatestTSC = 0; + uint16_t LatestCPU = 0; + tid_t TId = 0; + pid_t PId = 0; + bool First = true; + + uint32_t UndoableFunctionEnters = 0; + uint32_t UndoableTailExits = 0; + + bool finalized() const XRAY_NEVER_INSTRUMENT { + return BQ == nullptr || BQ->finalizing(); + } + + bool hasSpace(size_t S) XRAY_NEVER_INSTRUMENT { + return B.Data != nullptr && B.Generation == BQ->generation() && + W.getNextRecord() + S <= reinterpret_cast<char *>(B.Data) + B.Size; + } + + constexpr int32_t mask(int32_t FuncId) const XRAY_NEVER_INSTRUMENT { + return FuncId & ((1 << 29) - 1); + } + + bool getNewBuffer() XRAY_NEVER_INSTRUMENT { + if (BQ->getBuffer(B) != BufferQueue::ErrorCode::Ok) + return false; + + W.resetRecord(); + DCHECK_EQ(W.getNextRecord(), B.Data); + LatestTSC = 0; + LatestCPU = 0; + First = true; + UndoableFunctionEnters = 0; + UndoableTailExits = 0; + atomic_store(B.Extents, 0, memory_order_release); + return true; + } + + bool setupNewBuffer() XRAY_NEVER_INSTRUMENT { + if (finalized()) + return false; + + DCHECK(hasSpace(sizeof(MetadataRecord) * 3)); + TId = GetTid(); + PId = internal_getpid(); + struct timespec TS { + 0, 0 + }; + WallClockReader(CLOCK_MONOTONIC, &TS); + + MetadataRecord Metadata[] = { + // Write out a MetadataRecord to signify that this is the start of a new + // buffer, associated with a particular thread, with a new CPU. For the + // data, we have 15 bytes to squeeze as much information as we can. 
At + // this point we only write down the following bytes: + // - Thread ID (tid_t, cast to 4 bytes type due to Darwin being 8 + // bytes) + createMetadataRecord<MetadataRecord::RecordKinds::NewBuffer>( + static_cast<int32_t>(TId)), + + // Also write the WalltimeMarker record. We only really need microsecond + // precision here, and enforce across platforms that we need 64-bit + // seconds and 32-bit microseconds encoded in the Metadata record. + createMetadataRecord<MetadataRecord::RecordKinds::WalltimeMarker>( + static_cast<int64_t>(TS.tv_sec), + static_cast<int32_t>(TS.tv_nsec / 1000)), + + // Also write the Pid record. + createMetadataRecord<MetadataRecord::RecordKinds::Pid>( + static_cast<int32_t>(PId)), + }; + + if (finalized()) + return false; + return W.writeMetadataRecords(Metadata); + } + + bool prepareBuffer(size_t S) XRAY_NEVER_INSTRUMENT { + if (finalized()) + return returnBuffer(); + + if (UNLIKELY(!hasSpace(S))) { + if (!returnBuffer()) + return false; + if (!getNewBuffer()) + return false; + if (!setupNewBuffer()) + return false; + } + + if (First) { + First = false; + W.resetRecord(); + atomic_store(B.Extents, 0, memory_order_release); + return setupNewBuffer(); + } + + return true; + } + + bool returnBuffer() XRAY_NEVER_INSTRUMENT { + if (BQ == nullptr) + return false; + + First = true; + if (finalized()) { + BQ->releaseBuffer(B); // ignore result. + return false; + } + + return BQ->releaseBuffer(B) == BufferQueue::ErrorCode::Ok; + } + + enum class PreambleResult { NoChange, WroteMetadata, InvalidBuffer }; + PreambleResult recordPreamble(uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { + if (UNLIKELY(LatestCPU != CPU || LatestTSC == 0)) { + // We update our internal tracking state for the Latest TSC and CPU we've + // seen, then write out the appropriate metadata and function records. + LatestTSC = TSC; + LatestCPU = CPU; + + if (B.Generation != BQ->generation()) + return PreambleResult::InvalidBuffer; + + W.writeMetadata<MetadataRecord::RecordKinds::NewCPUId>(CPU, TSC); + return PreambleResult::WroteMetadata; + } + + DCHECK_EQ(LatestCPU, CPU); + + if (UNLIKELY(LatestTSC > TSC || + TSC - LatestTSC > + uint64_t{std::numeric_limits<int32_t>::max()})) { + // Either the TSC has wrapped around from the last TSC we've seen or the + // delta is too large to fit in a 32-bit signed integer, so we write a + // wrap-around record. + LatestTSC = TSC; + + if (B.Generation != BQ->generation()) + return PreambleResult::InvalidBuffer; + + W.writeMetadata<MetadataRecord::RecordKinds::TSCWrap>(TSC); + return PreambleResult::WroteMetadata; + } + + return PreambleResult::NoChange; + } + + bool rewindRecords(int32_t FuncId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { + // Undo one enter record, because at this point we are either at the state + // of: + // - We are exiting a function that we recently entered. + // - We are exiting a function that was the result of a sequence of tail + // exits, and we can check whether the tail exits can be re-wound. 
+ // + FunctionRecord F; + W.undoWrites(sizeof(FunctionRecord)); + if (B.Generation != BQ->generation()) + return false; + internal_memcpy(&F, W.getNextRecord(), sizeof(FunctionRecord)); + + DCHECK(F.RecordKind == + uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && + "Expected to find function entry recording when rewinding."); + DCHECK_EQ(F.FuncId, FuncId & ~(0x0F << 28)); + + LatestTSC -= F.TSCDelta; + if (--UndoableFunctionEnters != 0) { + LastFunctionEntryTSC -= F.TSCDelta; + return true; + } + + LastFunctionEntryTSC = 0; + auto RewindingTSC = LatestTSC; + auto RewindingRecordPtr = W.getNextRecord() - sizeof(FunctionRecord); + while (UndoableTailExits) { + if (B.Generation != BQ->generation()) + return false; + internal_memcpy(&F, RewindingRecordPtr, sizeof(FunctionRecord)); + DCHECK_EQ(F.RecordKind, + uint8_t(FunctionRecord::RecordKinds::FunctionTailExit)); + RewindingTSC -= F.TSCDelta; + RewindingRecordPtr -= sizeof(FunctionRecord); + if (B.Generation != BQ->generation()) + return false; + internal_memcpy(&F, RewindingRecordPtr, sizeof(FunctionRecord)); + + // This tail call exceeded the threshold duration. It will not be erased. + if ((TSC - RewindingTSC) >= CycleThreshold) { + UndoableTailExits = 0; + return true; + } + + --UndoableTailExits; + W.undoWrites(sizeof(FunctionRecord) * 2); + LatestTSC = RewindingTSC; + } + return true; + } + +public: + template <class WallClockFunc> + FDRController(BufferQueue *BQ, BufferQueue::Buffer &B, FDRLogWriter &W, + WallClockFunc R, uint64_t C) XRAY_NEVER_INSTRUMENT + : BQ(BQ), + B(B), + W(W), + WallClockReader(R), + CycleThreshold(C) {} + + bool functionEnter(int32_t FuncId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { + if (finalized() || + !prepareBuffer(sizeof(MetadataRecord) + sizeof(FunctionRecord))) + return returnBuffer(); + + auto PreambleStatus = recordPreamble(TSC, CPU); + if (PreambleStatus == PreambleResult::InvalidBuffer) + return returnBuffer(); + + if (PreambleStatus == PreambleResult::WroteMetadata) { + UndoableFunctionEnters = 1; + UndoableTailExits = 0; + } else { + ++UndoableFunctionEnters; + } + + auto Delta = TSC - LatestTSC; + LastFunctionEntryTSC = TSC; + LatestTSC = TSC; + return W.writeFunction(FDRLogWriter::FunctionRecordKind::Enter, + mask(FuncId), Delta); + } + + bool functionTailExit(int32_t FuncId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { + if (finalized()) + return returnBuffer(); + + if (!prepareBuffer(sizeof(MetadataRecord) + sizeof(FunctionRecord))) + return returnBuffer(); + + auto PreambleStatus = recordPreamble(TSC, CPU); + if (PreambleStatus == PreambleResult::InvalidBuffer) + return returnBuffer(); + + if (PreambleStatus == PreambleResult::NoChange && + UndoableFunctionEnters != 0 && + TSC - LastFunctionEntryTSC < CycleThreshold) + return rewindRecords(FuncId, TSC, CPU); + + UndoableTailExits = UndoableFunctionEnters ? 
UndoableTailExits + 1 : 0; + UndoableFunctionEnters = 0; + auto Delta = TSC - LatestTSC; + LatestTSC = TSC; + return W.writeFunction(FDRLogWriter::FunctionRecordKind::TailExit, + mask(FuncId), Delta); + } + + bool functionEnterArg(int32_t FuncId, uint64_t TSC, uint16_t CPU, + uint64_t Arg) XRAY_NEVER_INSTRUMENT { + if (finalized() || + !prepareBuffer((2 * sizeof(MetadataRecord)) + sizeof(FunctionRecord)) || + recordPreamble(TSC, CPU) == PreambleResult::InvalidBuffer) + return returnBuffer(); + + auto Delta = TSC - LatestTSC; + LatestTSC = TSC; + LastFunctionEntryTSC = 0; + UndoableFunctionEnters = 0; + UndoableTailExits = 0; + + return W.writeFunctionWithArg(FDRLogWriter::FunctionRecordKind::EnterArg, + mask(FuncId), Delta, Arg); + } + + bool functionExit(int32_t FuncId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { + if (finalized() || + !prepareBuffer(sizeof(MetadataRecord) + sizeof(FunctionRecord))) + return returnBuffer(); + + auto PreambleStatus = recordPreamble(TSC, CPU); + if (PreambleStatus == PreambleResult::InvalidBuffer) + return returnBuffer(); + + if (PreambleStatus == PreambleResult::NoChange && + UndoableFunctionEnters != 0 && + TSC - LastFunctionEntryTSC < CycleThreshold) + return rewindRecords(FuncId, TSC, CPU); + + auto Delta = TSC - LatestTSC; + LatestTSC = TSC; + UndoableFunctionEnters = 0; + UndoableTailExits = 0; + return W.writeFunction(FDRLogWriter::FunctionRecordKind::Exit, mask(FuncId), + Delta); + } + + bool customEvent(uint64_t TSC, uint16_t CPU, const void *Event, + int32_t EventSize) XRAY_NEVER_INSTRUMENT { + if (finalized() || + !prepareBuffer((2 * sizeof(MetadataRecord)) + EventSize) || + recordPreamble(TSC, CPU) == PreambleResult::InvalidBuffer) + return returnBuffer(); + + auto Delta = TSC - LatestTSC; + LatestTSC = TSC; + UndoableFunctionEnters = 0; + UndoableTailExits = 0; + return W.writeCustomEvent(Delta, Event, EventSize); + } + + bool typedEvent(uint64_t TSC, uint16_t CPU, uint16_t EventType, + const void *Event, int32_t EventSize) XRAY_NEVER_INSTRUMENT { + if (finalized() || + !prepareBuffer((2 * sizeof(MetadataRecord)) + EventSize) || + recordPreamble(TSC, CPU) == PreambleResult::InvalidBuffer) + return returnBuffer(); + + auto Delta = TSC - LatestTSC; + LatestTSC = TSC; + UndoableFunctionEnters = 0; + UndoableTailExits = 0; + return W.writeTypedEvent(Delta, EventType, Event, EventSize); + } + + bool flush() XRAY_NEVER_INSTRUMENT { + if (finalized()) { + returnBuffer(); // ignore result. 
+ return true; + } + return returnBuffer(); + } +}; + +} // namespace __xray + +#endif // COMPILER-RT_LIB_XRAY_XRAY_FDR_CONTROLLER_H_ diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h b/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h index 87096d4fc29e..e7b1ee562e1b 100644 --- a/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h +++ b/contrib/compiler-rt/lib/xray/xray_fdr_log_records.h @@ -12,6 +12,9 @@ //===----------------------------------------------------------------------===// #ifndef XRAY_XRAY_FDR_LOG_RECORDS_H #define XRAY_XRAY_FDR_LOG_RECORDS_H +#include <cstdint> + +namespace __xray { enum class RecordType : uint8_t { Function, Metadata }; @@ -68,4 +71,6 @@ struct alignas(8) FunctionRecord { static_assert(sizeof(FunctionRecord) == 8, "Wrong size for FunctionRecord."); +} // namespace __xray + #endif // XRAY_XRAY_FDR_LOG_RECORDS_H diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_log_writer.h b/contrib/compiler-rt/lib/xray/xray_fdr_log_writer.h new file mode 100644 index 000000000000..7712e1377763 --- /dev/null +++ b/contrib/compiler-rt/lib/xray/xray_fdr_log_writer.h @@ -0,0 +1,232 @@ +//===-- xray_fdr_log_writer.h ---------------------------------------------===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// +#ifndef COMPILER_RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_ +#define COMPILER_RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_ + +#include "xray_buffer_queue.h" +#include "xray_fdr_log_records.h" +#include <functional> +#include <tuple> +#include <type_traits> +#include <utility> + +namespace __xray { + +template <size_t Index> struct SerializerImpl { + template <class Tuple, + typename std::enable_if< + Index<std::tuple_size< + typename std::remove_reference<Tuple>::type>::value, + int>::type = 0> static void serializeTo(char *Buffer, + Tuple &&T) { + auto P = reinterpret_cast<const char *>(&std::get<Index>(T)); + constexpr auto Size = sizeof(std::get<Index>(T)); + internal_memcpy(Buffer, P, Size); + SerializerImpl<Index + 1>::serializeTo(Buffer + Size, + std::forward<Tuple>(T)); + } + + template <class Tuple, + typename std::enable_if< + Index >= std::tuple_size<typename std::remove_reference< + Tuple>::type>::value, + int>::type = 0> + static void serializeTo(char *, Tuple &&) {} +}; + +using Serializer = SerializerImpl<0>; + +template <class Tuple, size_t Index> struct AggregateSizesImpl { + static constexpr size_t value = + sizeof(typename std::tuple_element<Index, Tuple>::type) + + AggregateSizesImpl<Tuple, Index - 1>::value; +}; + +template <class Tuple> struct AggregateSizesImpl<Tuple, 0> { + static constexpr size_t value = + sizeof(typename std::tuple_element<0, Tuple>::type); +}; + +template <class Tuple> struct AggregateSizes { + static constexpr size_t value = + AggregateSizesImpl<Tuple, std::tuple_size<Tuple>::value - 1>::value; +}; + +template <MetadataRecord::RecordKinds Kind, class... DataTypes> +MetadataRecord createMetadataRecord(DataTypes &&... 
Ds) { + static_assert(AggregateSizes<std::tuple<DataTypes...>>::value <= + sizeof(MetadataRecord) - 1, + "Metadata payload longer than metadata buffer!"); + MetadataRecord R; + R.Type = 1; + R.RecordKind = static_cast<uint8_t>(Kind); + Serializer::serializeTo(R.Data, + std::make_tuple(std::forward<DataTypes>(Ds)...)); + return R; +} + +class FDRLogWriter { + BufferQueue::Buffer &Buffer; + char *NextRecord = nullptr; + + template <class T> void writeRecord(const T &R) { + internal_memcpy(NextRecord, reinterpret_cast<const char *>(&R), sizeof(T)); + NextRecord += sizeof(T); + // We need this atomic fence here to ensure that other threads attempting to + // read the bytes in the buffer will see the writes committed before the + // extents are updated. + atomic_thread_fence(memory_order_release); + atomic_fetch_add(Buffer.Extents, sizeof(T), memory_order_acq_rel); + } + +public: + explicit FDRLogWriter(BufferQueue::Buffer &B, char *P) + : Buffer(B), NextRecord(P) { + DCHECK_NE(Buffer.Data, nullptr); + DCHECK_NE(NextRecord, nullptr); + } + + explicit FDRLogWriter(BufferQueue::Buffer &B) + : FDRLogWriter(B, static_cast<char *>(B.Data)) {} + + template <MetadataRecord::RecordKinds Kind, class... Data> + bool writeMetadata(Data &&... Ds) { + // TODO: Check boundary conditions: + // 1) Buffer is full, and cannot handle one metadata record. + // 2) Buffer queue is finalising. + writeRecord(createMetadataRecord<Kind>(std::forward<Data>(Ds)...)); + return true; + } + + template <size_t N> size_t writeMetadataRecords(MetadataRecord (&Recs)[N]) { + constexpr auto Size = sizeof(MetadataRecord) * N; + internal_memcpy(NextRecord, reinterpret_cast<const char *>(Recs), Size); + NextRecord += Size; + // We need this atomic fence here to ensure that other threads attempting to + // read the bytes in the buffer will see the writes committed before the + // extents are updated. + atomic_thread_fence(memory_order_release); + atomic_fetch_add(Buffer.Extents, Size, memory_order_acq_rel); + return Size; + } + + enum class FunctionRecordKind : uint8_t { + Enter = 0x00, + Exit = 0x01, + TailExit = 0x02, + EnterArg = 0x03, + }; + + bool writeFunction(FunctionRecordKind Kind, int32_t FuncId, int32_t Delta) { + FunctionRecord R; + R.Type = 0; + R.RecordKind = uint8_t(Kind); + R.FuncId = FuncId; + R.TSCDelta = Delta; + writeRecord(R); + return true; + } + + bool writeFunctionWithArg(FunctionRecordKind Kind, int32_t FuncId, + int32_t Delta, uint64_t Arg) { + // We need to write the function with arg into the buffer, and then + // atomically update the buffer extents. This ensures that any reads + // synchronised on the buffer extents record will always see the writes + // that happen before the atomic update. + FunctionRecord R; + R.Type = 0; + R.RecordKind = uint8_t(Kind); + R.FuncId = FuncId; + R.TSCDelta = Delta; + MetadataRecord A = + createMetadataRecord<MetadataRecord::RecordKinds::CallArgument>(Arg); + NextRecord = reinterpret_cast<char *>(internal_memcpy( + NextRecord, reinterpret_cast<char *>(&R), sizeof(R))) + + sizeof(R); + NextRecord = reinterpret_cast<char *>(internal_memcpy( + NextRecord, reinterpret_cast<char *>(&A), sizeof(A))) + + sizeof(A); + // We need this atomic fence here to ensure that other threads attempting to + // read the bytes in the buffer will see the writes committed before the + // extents are updated. 
+ atomic_thread_fence(memory_order_release); + atomic_fetch_add(Buffer.Extents, sizeof(R) + sizeof(A), + memory_order_acq_rel); + return true; + } + + bool writeCustomEvent(int32_t Delta, const void *Event, int32_t EventSize) { + // We write the metadata record and the custom event data into the buffer + // first, before we atomically update the extents for the buffer. This + // allows us to ensure that any threads reading the extents of the buffer + // will only ever see the full metadata and custom event payload accounted + // (no partial writes accounted). + MetadataRecord R = + createMetadataRecord<MetadataRecord::RecordKinds::CustomEventMarker>( + EventSize, Delta); + NextRecord = reinterpret_cast<char *>(internal_memcpy( + NextRecord, reinterpret_cast<char *>(&R), sizeof(R))) + + sizeof(R); + NextRecord = reinterpret_cast<char *>( + internal_memcpy(NextRecord, Event, EventSize)) + + EventSize; + + // We need this atomic fence here to ensure that other threads attempting to + // read the bytes in the buffer will see the writes committed before the + // extents are updated. + atomic_thread_fence(memory_order_release); + atomic_fetch_add(Buffer.Extents, sizeof(R) + EventSize, + memory_order_acq_rel); + return true; + } + + bool writeTypedEvent(int32_t Delta, uint16_t EventType, const void *Event, + int32_t EventSize) { + // We do something similar when writing out typed events, see + // writeCustomEvent(...) above for details. + MetadataRecord R = + createMetadataRecord<MetadataRecord::RecordKinds::TypedEventMarker>( + EventSize, Delta, EventType); + NextRecord = reinterpret_cast<char *>(internal_memcpy( + NextRecord, reinterpret_cast<char *>(&R), sizeof(R))) + + sizeof(R); + NextRecord = reinterpret_cast<char *>( + internal_memcpy(NextRecord, Event, EventSize)) + + EventSize; + + // We need this atomic fence here to ensure that other threads attempting to + // read the bytes in the buffer will see the writes committed before the + // extents are updated. 
+ atomic_thread_fence(memory_order_release); + atomic_fetch_add(Buffer.Extents, EventSize, memory_order_acq_rel); + return true; + } + + char *getNextRecord() const { return NextRecord; } + + void resetRecord() { + NextRecord = reinterpret_cast<char *>(Buffer.Data); + atomic_store(Buffer.Extents, 0, memory_order_release); + } + + void undoWrites(size_t B) { + DCHECK_GE(NextRecord - B, reinterpret_cast<char *>(Buffer.Data)); + NextRecord -= B; + atomic_fetch_sub(Buffer.Extents, B, memory_order_acq_rel); + } + +}; // namespace __xray + +} // namespace __xray + +#endif // COMPILER-RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_ diff --git a/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc b/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc index 6cb2dfa0c658..1eda26df7a85 100644 --- a/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc +++ b/contrib/compiler-rt/lib/xray/xray_fdr_logging.cc @@ -20,7 +20,6 @@ #include <limits> #include <memory> #include <pthread.h> -#include <sys/syscall.h> #include <sys/time.h> #include <time.h> #include <unistd.h> @@ -30,9 +29,12 @@ #include "sanitizer_common/sanitizer_common.h" #include "xray/xray_interface.h" #include "xray/xray_records.h" +#include "xray_allocator.h" #include "xray_buffer_queue.h" #include "xray_defs.h" +#include "xray_fdr_controller.h" #include "xray_fdr_flags.h" +#include "xray_fdr_log_writer.h" #include "xray_flags.h" #include "xray_recursion_guard.h" #include "xray_tsc.h" @@ -40,55 +42,53 @@ namespace __xray { -atomic_sint32_t LoggingStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; +static atomic_sint32_t LoggingStatus = { + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; + +namespace { // Group together thread-local-data in a struct, then hide it behind a function // call so that it can be initialized on first use instead of as a global. We // force the alignment to 64-bytes for x86 cache line alignment, as this // structure is used in the hot path of implementation. -struct alignas(64) ThreadLocalData { - BufferQueue::Buffer Buffer; - char *RecordPtr = nullptr; - // The number of FunctionEntry records immediately preceding RecordPtr. - uint8_t NumConsecutiveFnEnters = 0; - - // The number of adjacent, consecutive pairs of FunctionEntry, Tail Exit - // records preceding RecordPtr. - uint8_t NumTailCalls = 0; - - // We use a thread_local variable to keep track of which CPUs we've already - // run, and the TSC times for these CPUs. This allows us to stop repeating the - // CPU field in the function records. - // - // We assume that we'll support only 65536 CPUs for x86_64. - uint16_t CurrentCPU = std::numeric_limits<uint16_t>::max(); - uint64_t LastTSC = 0; - uint64_t LastFunctionEntryTSC = 0; - - // Make sure a thread that's ever called handleArg0 has a thread-local - // live reference to the buffer queue for this particular instance of - // FDRLogging, and that we're going to clean it up when the thread exits. 
+struct XRAY_TLS_ALIGNAS(64) ThreadLocalData { + BufferQueue::Buffer Buffer{}; BufferQueue *BQ = nullptr; + + using LogWriterStorage = + typename std::aligned_storage<sizeof(FDRLogWriter), + alignof(FDRLogWriter)>::type; + + LogWriterStorage LWStorage; + FDRLogWriter *Writer = nullptr; + + using ControllerStorage = + typename std::aligned_storage<sizeof(FDRController<>), + alignof(FDRController<>)>::type; + ControllerStorage CStorage; + FDRController<> *Controller = nullptr; }; +} // namespace + static_assert(std::is_trivially_destructible<ThreadLocalData>::value, "ThreadLocalData must be trivially destructible"); -static constexpr auto MetadataRecSize = sizeof(MetadataRecord); -static constexpr auto FunctionRecSize = sizeof(FunctionRecord); - // Use a global pthread key to identify thread-local data for logging. static pthread_key_t Key; // Global BufferQueue. +static std::aligned_storage<sizeof(BufferQueue)>::type BufferQueueStorage; static BufferQueue *BQ = nullptr; -static atomic_sint32_t LogFlushStatus = { - XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; +// Global thresholds for function durations. +static atomic_uint64_t ThresholdTicks{0}; -static FDRLoggingOptions FDROptions; +// Global for ticks per second. +static atomic_uint64_t TicksPerSec{0}; -static SpinMutex FDROptionsMutex; +static atomic_sint32_t LogFlushStatus = { + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; // This function will initialize the thread-local data structure used by the FDR // logging implementation and return a reference to it. The implementation @@ -124,8 +124,10 @@ static SpinMutex FDROptionsMutex; // critical section, calling a function that might be XRay instrumented (and // thus in turn calling into malloc by virtue of registration of the // thread_local's destructor). +#if XRAY_HAS_TLS_ALIGNAS static_assert(alignof(ThreadLocalData) >= 64, "ThreadLocalData must be cache line aligned."); +#endif static ThreadLocalData &getThreadLocalData() { thread_local typename std::aligned_storage< sizeof(ThreadLocalData), alignof(ThreadLocalData)>::type TLDStorage{}; @@ -138,559 +140,36 @@ static ThreadLocalData &getThreadLocalData() { return *reinterpret_cast<ThreadLocalData *>(&TLDStorage); } -static void writeNewBufferPreamble(tid_t Tid, timespec TS, - pid_t Pid) XRAY_NEVER_INSTRUMENT { - static constexpr int InitRecordsCount = 3; - auto &TLD = getThreadLocalData(); - MetadataRecord Metadata[InitRecordsCount]; - { - // Write out a MetadataRecord to signify that this is the start of a new - // buffer, associated with a particular thread, with a new CPU. For the - // data, we have 15 bytes to squeeze as much information as we can. At this - // point we only write down the following bytes: - // - Thread ID (tid_t, cast to 4 bytes type due to Darwin being 8 bytes) - auto &NewBuffer = Metadata[0]; - NewBuffer.Type = uint8_t(RecordType::Metadata); - NewBuffer.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewBuffer); - int32_t tid = static_cast<int32_t>(Tid); - internal_memcpy(&NewBuffer.Data, &tid, sizeof(tid)); - } - - // Also write the WalltimeMarker record. - { - static_assert(sizeof(time_t) <= 8, "time_t needs to be at most 8 bytes"); - auto &WalltimeMarker = Metadata[1]; - WalltimeMarker.Type = uint8_t(RecordType::Metadata); - WalltimeMarker.RecordKind = - uint8_t(MetadataRecord::RecordKinds::WalltimeMarker); - - // We only really need microsecond precision here, and enforce across - // platforms that we need 64-bit seconds and 32-bit microseconds encoded in - // the Metadata record. 
- int32_t Micros = TS.tv_nsec / 1000; - int64_t Seconds = TS.tv_sec; - internal_memcpy(WalltimeMarker.Data, &Seconds, sizeof(Seconds)); - internal_memcpy(WalltimeMarker.Data + sizeof(Seconds), &Micros, - sizeof(Micros)); - } - - // Also write the Pid record. - { - // Write out a MetadataRecord that contains the current pid - auto &PidMetadata = Metadata[2]; - PidMetadata.Type = uint8_t(RecordType::Metadata); - PidMetadata.RecordKind = uint8_t(MetadataRecord::RecordKinds::Pid); - int32_t pid = static_cast<int32_t>(Pid); - internal_memcpy(&PidMetadata.Data, &pid, sizeof(pid)); - } - - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - if (TLD.BQ == nullptr || TLD.BQ->finalizing()) - return; - internal_memcpy(TLD.RecordPtr, Metadata, sizeof(Metadata)); - TLD.RecordPtr += sizeof(Metadata); - // Since we write out the extents as the first metadata record of the - // buffer, we need to write out the extents including the extents record. - atomic_store(&TLD.Buffer.Extents->Size, sizeof(Metadata), - memory_order_release); -} - -static void setupNewBuffer(int (*wall_clock_reader)( - clockid_t, struct timespec *)) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - auto &B = TLD.Buffer; - TLD.RecordPtr = static_cast<char *>(B.Data); - tid_t Tid = GetTid(); - timespec TS{0, 0}; - pid_t Pid = internal_getpid(); - // This is typically clock_gettime, but callers have injection ability. - wall_clock_reader(CLOCK_MONOTONIC, &TS); - writeNewBufferPreamble(Tid, TS, Pid); - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; -} - -static void incrementExtents(size_t Add) { - auto &TLD = getThreadLocalData(); - atomic_fetch_add(&TLD.Buffer.Extents->Size, Add, memory_order_acq_rel); -} - -static void decrementExtents(size_t Subtract) { - auto &TLD = getThreadLocalData(); - atomic_fetch_sub(&TLD.Buffer.Extents->Size, Subtract, memory_order_acq_rel); -} - -static void writeNewCPUIdMetadata(uint16_t CPU, - uint64_t TSC) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - MetadataRecord NewCPUId; - NewCPUId.Type = uint8_t(RecordType::Metadata); - NewCPUId.RecordKind = uint8_t(MetadataRecord::RecordKinds::NewCPUId); - - // The data for the New CPU will contain the following bytes: - // - CPU ID (uint16_t, 2 bytes) - // - Full TSC (uint64_t, 8 bytes) - // Total = 10 bytes. - internal_memcpy(&NewCPUId.Data, &CPU, sizeof(CPU)); - internal_memcpy(&NewCPUId.Data[sizeof(CPU)], &TSC, sizeof(TSC)); - internal_memcpy(TLD.RecordPtr, &NewCPUId, sizeof(MetadataRecord)); - TLD.RecordPtr += sizeof(MetadataRecord); - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - incrementExtents(sizeof(MetadataRecord)); -} - -static void writeTSCWrapMetadata(uint64_t TSC) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - MetadataRecord TSCWrap; - TSCWrap.Type = uint8_t(RecordType::Metadata); - TSCWrap.RecordKind = uint8_t(MetadataRecord::RecordKinds::TSCWrap); - - // The data for the TSCWrap record contains the following bytes: - // - Full TSC (uint64_t, 8 bytes) - // Total = 8 bytes. - internal_memcpy(&TSCWrap.Data, &TSC, sizeof(TSC)); - internal_memcpy(TLD.RecordPtr, &TSCWrap, sizeof(MetadataRecord)); - TLD.RecordPtr += sizeof(MetadataRecord); - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - incrementExtents(sizeof(MetadataRecord)); -} - -// Call Argument metadata records store the arguments to a function in the -// order of their appearance; holes are not supported by the buffer format. 
-static void writeCallArgumentMetadata(uint64_t A) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - MetadataRecord CallArg; - CallArg.Type = uint8_t(RecordType::Metadata); - CallArg.RecordKind = uint8_t(MetadataRecord::RecordKinds::CallArgument); - - internal_memcpy(CallArg.Data, &A, sizeof(A)); - internal_memcpy(TLD.RecordPtr, &CallArg, sizeof(MetadataRecord)); - TLD.RecordPtr += sizeof(MetadataRecord); - incrementExtents(sizeof(MetadataRecord)); -} - -static void writeFunctionRecord(int FuncId, uint32_t TSCDelta, - XRayEntryType EntryType) XRAY_NEVER_INSTRUMENT { - FunctionRecord FuncRecord; - FuncRecord.Type = uint8_t(RecordType::Function); - // Only take 28 bits of the function id. - FuncRecord.FuncId = FuncId & ~(0x0F << 28); - FuncRecord.TSCDelta = TSCDelta; - - auto &TLD = getThreadLocalData(); - switch (EntryType) { - case XRayEntryType::ENTRY: - ++TLD.NumConsecutiveFnEnters; - FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); - break; - case XRayEntryType::LOG_ARGS_ENTRY: - // We should not rewind functions with logged args. - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionEnter); - break; - case XRayEntryType::EXIT: - // If we've decided to log the function exit, we will never erase the log - // before it. - TLD.NumConsecutiveFnEnters = 0; - TLD.NumTailCalls = 0; - FuncRecord.RecordKind = uint8_t(FunctionRecord::RecordKinds::FunctionExit); - break; - case XRayEntryType::TAIL: - // If we just entered the function we're tail exiting from or erased every - // invocation since then, this function entry tail pair is a candidate to - // be erased when the child function exits. - if (TLD.NumConsecutiveFnEnters > 0) { - ++TLD.NumTailCalls; - TLD.NumConsecutiveFnEnters = 0; - } else { - // We will never be able to erase this tail call since we have logged - // something in between the function entry and tail exit. - TLD.NumTailCalls = 0; - TLD.NumConsecutiveFnEnters = 0; - } - FuncRecord.RecordKind = - uint8_t(FunctionRecord::RecordKinds::FunctionTailExit); - break; - case XRayEntryType::CUSTOM_EVENT: { - // This is a bug in patching, so we'll report it once and move on. - static atomic_uint8_t ErrorLatch{0}; - if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel)) - Report("Internal error: patched an XRay custom event call as a function; " - "func id = %d\n", - FuncId); - return; - } - case XRayEntryType::TYPED_EVENT: { - static atomic_uint8_t ErrorLatch{0}; - if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel)) - Report("Internal error: patched an XRay typed event call as a function; " - "func id = %d\n", - FuncId); - return; - } - } - - internal_memcpy(TLD.RecordPtr, &FuncRecord, sizeof(FunctionRecord)); - TLD.RecordPtr += sizeof(FunctionRecord); - incrementExtents(sizeof(FunctionRecord)); -} - -static atomic_uint64_t TicksPerSec{0}; -static atomic_uint64_t ThresholdTicks{0}; - -// Re-point the thread local pointer into this thread's Buffer before the recent -// "Function Entry" record and any "Tail Call Exit" records after that. 
-static void rewindRecentCall(uint64_t TSC, uint64_t &LastTSC, - uint64_t &LastFunctionEntryTSC, int32_t FuncId) { - auto &TLD = getThreadLocalData(); - TLD.RecordPtr -= FunctionRecSize; - decrementExtents(FunctionRecSize); - FunctionRecord FuncRecord; - internal_memcpy(&FuncRecord, TLD.RecordPtr, FunctionRecSize); - DCHECK(FuncRecord.RecordKind == - uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && - "Expected to find function entry recording when rewinding."); - DCHECK(FuncRecord.FuncId == (FuncId & ~(0x0F << 28)) && - "Expected matching function id when rewinding Exit"); - --TLD.NumConsecutiveFnEnters; - LastTSC -= FuncRecord.TSCDelta; - - // We unwound one call. Update the state and return without writing a log. - if (TLD.NumConsecutiveFnEnters != 0) { - LastFunctionEntryTSC -= FuncRecord.TSCDelta; - return; - } - - // Otherwise we've rewound the stack of all function entries, we might be - // able to rewind further by erasing tail call functions that are being - // exited from via this exit. - LastFunctionEntryTSC = 0; - auto RewindingTSC = LastTSC; - auto RewindingRecordPtr = TLD.RecordPtr - FunctionRecSize; - while (TLD.NumTailCalls > 0) { - // Rewind the TSC back over the TAIL EXIT record. - FunctionRecord ExpectedTailExit; - internal_memcpy(&ExpectedTailExit, RewindingRecordPtr, FunctionRecSize); - - DCHECK(ExpectedTailExit.RecordKind == - uint8_t(FunctionRecord::RecordKinds::FunctionTailExit) && - "Expected to find tail exit when rewinding."); - RewindingRecordPtr -= FunctionRecSize; - RewindingTSC -= ExpectedTailExit.TSCDelta; - FunctionRecord ExpectedFunctionEntry; - internal_memcpy(&ExpectedFunctionEntry, RewindingRecordPtr, - FunctionRecSize); - DCHECK(ExpectedFunctionEntry.RecordKind == - uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && - "Expected to find function entry when rewinding tail call."); - DCHECK(ExpectedFunctionEntry.FuncId == ExpectedTailExit.FuncId && - "Expected funcids to match when rewinding tail call."); - - // This tail call exceeded the threshold duration. It will not be erased. - if ((TSC - RewindingTSC) >= atomic_load_relaxed(&ThresholdTicks)) { - TLD.NumTailCalls = 0; - return; - } - - // We can erase a tail exit pair that we're exiting through since - // its duration is under threshold. - --TLD.NumTailCalls; - RewindingRecordPtr -= FunctionRecSize; - RewindingTSC -= ExpectedFunctionEntry.TSCDelta; - TLD.RecordPtr -= 2 * FunctionRecSize; - LastTSC = RewindingTSC; - decrementExtents(2 * FunctionRecSize); - } -} - -static bool releaseThreadLocalBuffer(BufferQueue &BQArg) { - auto &TLD = getThreadLocalData(); - auto EC = BQArg.releaseBuffer(TLD.Buffer); - if (EC != BufferQueue::ErrorCode::Ok) { - Report("Failed to release buffer at %p; error=%s\n", TLD.Buffer.Data, - BufferQueue::getErrorString(EC)); - return false; - } - return true; -} - -static bool prepareBuffer(uint64_t TSC, unsigned char CPU, - int (*wall_clock_reader)(clockid_t, - struct timespec *), - size_t MaxSize) XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - char *BufferStart = static_cast<char *>(TLD.Buffer.Data); - if ((TLD.RecordPtr + MaxSize) > (BufferStart + TLD.Buffer.Size)) { - if (!releaseThreadLocalBuffer(*TLD.BQ)) - return false; - auto EC = TLD.BQ->getBuffer(TLD.Buffer); - if (EC != BufferQueue::ErrorCode::Ok) { - Report("Failed to prepare a buffer; error = '%s'\n", - BufferQueue::getErrorString(EC)); - return false; - } - setupNewBuffer(wall_clock_reader); - - // Always write the CPU metadata as the first record in the buffer. 
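The rewindRecentCall logic above erases the records of calls that completed in less time than the configured threshold, provided nothing else was logged since the matching entry. A toy sketch of that idea only, using std::vector as a stand-in for the in-buffer records; ToyLog and ToyRecord are invented names, and the real buffer stores fixed-size binary records rather than a container.

#include <cstdint>
#include <vector>

struct ToyRecord {
  int32_t FId;
  uint64_t TSC;
  bool IsEntry;
};

struct ToyLog {
  std::vector<ToyRecord> Records;
  uint64_t ThresholdTicks = 0;

  void onEntry(int32_t FId, uint64_t TSC) {
    Records.push_back({FId, TSC, true});
  }

  // If the call being exited is still the most recent record and ran for less
  // than the threshold, drop its entry instead of logging an exit.
  void onExit(int32_t FId, uint64_t TSC) {
    if (!Records.empty() && Records.back().IsEntry &&
        Records.back().FId == FId &&
        (TSC - Records.back().TSC) < ThresholdTicks) {
      Records.pop_back();
      return;
    }
    Records.push_back({FId, TSC, false});
  }
};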
- writeNewCPUIdMetadata(CPU, TSC); - } - return true; -} - -static bool -isLogInitializedAndReady(BufferQueue *LBQ, uint64_t TSC, unsigned char CPU, - int (*wall_clock_reader)(clockid_t, struct timespec *)) - XRAY_NEVER_INSTRUMENT { - // Bail out right away if logging is not initialized yet. - // We should take the opportunity to release the buffer though. - auto Status = atomic_load(&LoggingStatus, memory_order_acquire); - auto &TLD = getThreadLocalData(); - if (Status != XRayLogInitStatus::XRAY_LOG_INITIALIZED) { - if (TLD.RecordPtr != nullptr && - (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || - Status == XRayLogInitStatus::XRAY_LOG_FINALIZED)) { - if (!releaseThreadLocalBuffer(*LBQ)) - return false; - TLD.RecordPtr = nullptr; - return false; - } - return false; - } - - if (atomic_load(&LoggingStatus, memory_order_acquire) != - XRayLogInitStatus::XRAY_LOG_INITIALIZED || - LBQ->finalizing()) { - if (!releaseThreadLocalBuffer(*LBQ)) - return false; - TLD.RecordPtr = nullptr; - } - - if (TLD.Buffer.Data == nullptr) { - auto EC = LBQ->getBuffer(TLD.Buffer); - if (EC != BufferQueue::ErrorCode::Ok) { - auto LS = atomic_load(&LoggingStatus, memory_order_acquire); - if (LS != XRayLogInitStatus::XRAY_LOG_FINALIZING && - LS != XRayLogInitStatus::XRAY_LOG_FINALIZED) - Report("Failed to acquire a buffer; error = '%s'\n", - BufferQueue::getErrorString(EC)); - return false; - } - - setupNewBuffer(wall_clock_reader); - - // Always write the CPU metadata as the first record in the buffer. - writeNewCPUIdMetadata(CPU, TSC); - } - - if (TLD.CurrentCPU == std::numeric_limits<uint16_t>::max()) { - // This means this is the first CPU this thread has ever run on. We set - // the current CPU and record this as the first TSC we've seen. - TLD.CurrentCPU = CPU; - writeNewCPUIdMetadata(CPU, TSC); - } - - return true; -} - -// Compute the TSC difference between the time of measurement and the previous -// event. There are a few interesting situations we need to account for: -// -// - The thread has migrated to a different CPU. If this is the case, then -// we write down the following records: -// -// 1. A 'NewCPUId' Metadata record. -// 2. A FunctionRecord with a 0 for the TSCDelta field. -// -// - The TSC delta is greater than the 32 bits we can store in a -// FunctionRecord. In this case we write down the following records: -// -// 1. A 'TSCWrap' Metadata record. -// 2. A FunctionRecord with a 0 for the TSCDelta field. -// -// - The TSC delta is representable within the 32 bits we can store in a -// FunctionRecord. In this case we write down just a FunctionRecord with -// the correct TSC delta. -static uint32_t writeCurrentCPUTSC(ThreadLocalData &TLD, uint64_t TSC, - uint8_t CPU) { - if (CPU != TLD.CurrentCPU) { - // We've moved to a new CPU. - writeNewCPUIdMetadata(CPU, TSC); - return 0; - } - // If the delta is greater than the range for a uint32_t, then we write out - // the TSC wrap metadata entry with the full TSC, and the TSC for the - // function record be 0. 
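A small sketch of the delta-versus-wrap decision described above, assuming the same policy: a delta that fits in 32 bits goes into the function record, anything larger forces a TSCWrap metadata record carrying the full TSC and a zero delta. EncodedDelta and encodeTSCDelta are illustrative names only.

#include <cstdint>
#include <limits>

struct EncodedDelta {
  bool NeedsTSCWrap; // emit a TSCWrap metadata record carrying the full TSC
  uint32_t Delta;    // value stored in the FunctionRecord (0 when wrapping)
};

inline EncodedDelta encodeTSCDelta(uint64_t LastTSC, uint64_t TSC) {
  uint64_t Delta = TSC - LastTSC;
  if (Delta <= std::numeric_limits<uint32_t>::max())
    return {false, static_cast<uint32_t>(Delta)};
  return {true, 0};
}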
- uint64_t Delta = TSC - TLD.LastTSC; - if (Delta <= std::numeric_limits<uint32_t>::max()) - return Delta; - - writeTSCWrapMetadata(TSC); - return 0; -} - -static void endBufferIfFull() XRAY_NEVER_INSTRUMENT { - auto &TLD = getThreadLocalData(); - auto BufferStart = static_cast<char *>(TLD.Buffer.Data); - if ((TLD.RecordPtr + MetadataRecSize) - BufferStart <= - ptrdiff_t{MetadataRecSize}) { - if (!releaseThreadLocalBuffer(*TLD.BQ)) - return; - TLD.RecordPtr = nullptr; - } -} - -thread_local atomic_uint8_t Running{0}; - -/// Here's where the meat of the processing happens. The writer captures -/// function entry, exit and tail exit points with a time and will create -/// TSCWrap, NewCPUId and Function records as necessary. The writer might -/// walk backward through its buffer and erase trivial functions to avoid -/// polluting the log and may use the buffer queue to obtain or release a -/// buffer. -static void processFunctionHook(int32_t FuncId, XRayEntryType Entry, - uint64_t TSC, unsigned char CPU, uint64_t Arg1, - int (*wall_clock_reader)(clockid_t, - struct timespec *)) - XRAY_NEVER_INSTRUMENT { - __asm volatile("# LLVM-MCA-BEGIN processFunctionHook"); - // Prevent signal handler recursion, so in case we're already in a log writing - // mode and the signal handler comes in (and is also instrumented) then we - // don't want to be clobbering potentially partial writes already happening in - // the thread. We use a simple thread_local latch to only allow one on-going - // handleArg0 to happen at any given time. - RecursionGuard Guard{Running}; - if (!Guard) { - DCHECK(atomic_load_relaxed(&Running) && "RecursionGuard is buggy!"); - return; - } - - auto &TLD = getThreadLocalData(); - - if (TLD.BQ == nullptr) - TLD.BQ = BQ; - - if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, wall_clock_reader)) - return; - - // Before we go setting up writing new function entries, we need to be really - // careful about the pointer math we're doing. This means we need to ensure - // that the record we are about to write is going to fit into the buffer, - // without overflowing the buffer. - // - // To do this properly, we use the following assumptions: - // - // - The least number of bytes we will ever write is 8 - // (sizeof(FunctionRecord)) only if the delta between the previous entry - // and this entry is within 32 bits. - // - The most number of bytes we will ever write is 8 + 16 + 16 = 40. - // This is computed by: - // - // MaxSize = sizeof(FunctionRecord) + 2 * sizeof(MetadataRecord) - // - // These arise in the following cases: - // - // 1. When the delta between the TSC we get and the previous TSC for the - // same CPU is outside of the uint32_t range, we end up having to - // write a MetadataRecord to indicate a "tsc wrap" before the actual - // FunctionRecord. - // 2. When we learn that we've moved CPUs, we need to write a - // MetadataRecord to indicate a "cpu change", and thus write out the - // current TSC for that CPU before writing out the actual - // FunctionRecord. - // 3. When we learn about a new CPU ID, we need to write down a "new cpu - // id" MetadataRecord before writing out the actual FunctionRecord. - // 4. The second MetadataRecord is the optional function call argument. - // - // So the math we need to do is to determine whether writing 40 bytes past the - // current pointer exceeds the buffer's maximum size. If we don't have enough - // space to write 40 bytes in the buffer, we need get a new Buffer, set it up - // properly before doing any further writing. 
- size_t MaxSize = FunctionRecSize + 2 * MetadataRecSize; - if (!prepareBuffer(TSC, CPU, wall_clock_reader, MaxSize)) { - TLD.BQ = nullptr; - return; - } - - // By this point, we are now ready to write up to 40 bytes (explained above). - DCHECK((TLD.RecordPtr + MaxSize) - static_cast<char *>(TLD.Buffer.Data) >= - static_cast<ptrdiff_t>(MetadataRecSize) && - "Misconfigured BufferQueue provided; Buffer size not large enough."); - - auto RecordTSCDelta = writeCurrentCPUTSC(TLD, TSC, CPU); - TLD.LastTSC = TSC; - TLD.CurrentCPU = CPU; - switch (Entry) { - case XRayEntryType::ENTRY: - case XRayEntryType::LOG_ARGS_ENTRY: - // Update the thread local state for the next invocation. - TLD.LastFunctionEntryTSC = TSC; - break; - case XRayEntryType::TAIL: - case XRayEntryType::EXIT: - // Break out and write the exit record if we can't erase any functions. - if (TLD.NumConsecutiveFnEnters == 0 || - (TSC - TLD.LastFunctionEntryTSC) >= - atomic_load_relaxed(&ThresholdTicks)) - break; - rewindRecentCall(TSC, TLD.LastTSC, TLD.LastFunctionEntryTSC, FuncId); - return; // without writing log. - case XRayEntryType::CUSTOM_EVENT: { - // This is a bug in patching, so we'll report it once and move on. - static atomic_uint8_t ErrorLatch{0}; - if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel)) - Report("Internal error: patched an XRay custom event call as a function; " - "func id = %d\n", - FuncId); - return; - } - case XRayEntryType::TYPED_EVENT: { - static atomic_uint8_t ErrorLatch{0}; - if (!atomic_exchange(&ErrorLatch, 1, memory_order_acq_rel)) - Report("Internal error: patched an XRay typed event call as a function; " - "func id = %d\n", - FuncId); - return; - } - } - - writeFunctionRecord(FuncId, RecordTSCDelta, Entry); - if (Entry == XRayEntryType::LOG_ARGS_ENTRY) - writeCallArgumentMetadata(Arg1); - - // If we've exhausted the buffer by this time, we then release the buffer to - // make sure that other threads may start using this buffer. - endBufferIfFull(); - __asm volatile("# LLVM-MCA-END"); -} - static XRayFileHeader &fdrCommonHeaderInfo() { static std::aligned_storage<sizeof(XRayFileHeader)>::type HStorage; static pthread_once_t OnceInit = PTHREAD_ONCE_INIT; static bool TSCSupported = true; static uint64_t CycleFrequency = NanosecondsPerSecond; - pthread_once(&OnceInit, +[] { - XRayFileHeader &H = reinterpret_cast<XRayFileHeader &>(HStorage); - // Version 2 of the log writes the extents of the buffer, instead of - // relying on an end-of-buffer record. - // Version 3 includes PID metadata record - H.Version = 3; - H.Type = FileTypes::FDR_LOG; - - // Test for required CPU features and cache the cycle frequency - TSCSupported = probeRequiredCPUFeatures(); - if (TSCSupported) - CycleFrequency = getTSCFrequency(); - H.CycleFrequency = CycleFrequency; - - // FIXME: Actually check whether we have 'constant_tsc' and - // 'nonstop_tsc' before setting the values in the header. - H.ConstantTSC = 1; - H.NonstopTSC = 1; - }); + pthread_once( + &OnceInit, +[] { + XRayFileHeader &H = reinterpret_cast<XRayFileHeader &>(HStorage); + // Version 2 of the log writes the extents of the buffer, instead of + // relying on an end-of-buffer record. + // Version 3 includes PID metadata record. + // Version 4 includes CPU data in the custom event records. 
+ // Version 5 uses relative deltas for custom and typed event records, + // and removes the CPU data in custom event records (similar to how + // function records use deltas instead of full TSCs and rely on other + // metadata records for TSC wraparound and CPU migration). + H.Version = 5; + H.Type = FileTypes::FDR_LOG; + + // Test for required CPU features and cache the cycle frequency + TSCSupported = probeRequiredCPUFeatures(); + if (TSCSupported) + CycleFrequency = getTSCFrequency(); + H.CycleFrequency = CycleFrequency; + + // FIXME: Actually check whether we have 'constant_tsc' and + // 'nonstop_tsc' before setting the values in the header. + H.ConstantTSC = 1; + H.NonstopTSC = 1; + }); return reinterpret_cast<XRayFileHeader &>(HStorage); } @@ -728,9 +207,11 @@ XRayBuffer fdrIterator(const XRayBuffer B) { // buffers to expect). static std::aligned_storage<sizeof(XRayFileHeader)>::type HeaderStorage; static pthread_once_t HeaderOnce = PTHREAD_ONCE_INIT; - pthread_once(&HeaderOnce, +[] { - reinterpret_cast<XRayFileHeader &>(HeaderStorage) = fdrCommonHeaderInfo(); - }); + pthread_once( + &HeaderOnce, +[] { + reinterpret_cast<XRayFileHeader &>(HeaderStorage) = + fdrCommonHeaderInfo(); + }); // We use a convenience alias for code referring to Header from here on out. auto &Header = reinterpret_cast<XRayFileHeader &>(HeaderStorage); @@ -741,7 +222,8 @@ XRayBuffer fdrIterator(const XRayBuffer B) { static BufferQueue::const_iterator It{}; static BufferQueue::const_iterator End{}; - static void *CurrentBuffer{nullptr}; + static uint8_t *CurrentBuffer{nullptr}; + static size_t SerializedBufferSize = 0; if (B.Data == static_cast<void *>(&Header) && B.Size == sizeof(Header)) { // From this point on, we provide raw access to the raw buffer we're getting // from the BufferQueue. We're relying on the iterators from the current @@ -751,7 +233,7 @@ XRayBuffer fdrIterator(const XRayBuffer B) { } if (CurrentBuffer != nullptr) { - InternalFree(CurrentBuffer); + deallocateBuffer(CurrentBuffer, SerializedBufferSize); CurrentBuffer = nullptr; } @@ -762,9 +244,16 @@ XRayBuffer fdrIterator(const XRayBuffer B) { // out to disk. The difference here would be that we still write "empty" // buffers, or at least go through the iterators faithfully to let the // handlers see the empty buffers in the queue. - auto BufferSize = atomic_load(&It->Extents->Size, memory_order_acquire); - auto SerializedBufferSize = BufferSize + sizeof(MetadataRecord); - CurrentBuffer = InternalAlloc(SerializedBufferSize); + // + // We need this atomic fence here to ensure that writes happening to the + // buffer have been committed before we load the extents atomically. Because + // the buffer is not explicitly synchronised across threads, we rely on the + // fence ordering to ensure that writes we expect to have been completed + // before the fence are fully committed before we read the extents. 
+ atomic_thread_fence(memory_order_acquire); + auto BufferSize = atomic_load(It->Extents, memory_order_acquire); + SerializedBufferSize = BufferSize + sizeof(MetadataRecord); + CurrentBuffer = allocateBuffer(SerializedBufferSize); if (CurrentBuffer == nullptr) return {nullptr, 0}; @@ -827,14 +316,9 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { }); auto CleanupBuffers = at_scope_exit([] { - if (BQ != nullptr) { - auto &TLD = getThreadLocalData(); - if (TLD.RecordPtr != nullptr && TLD.BQ != nullptr) - releaseThreadLocalBuffer(*TLD.BQ); - BQ->~BufferQueue(); - InternalFree(BQ); - BQ = nullptr; - } + auto &TLD = getThreadLocalData(); + if (TLD.Controller != nullptr) + TLD.Controller->flush(); }); if (fdrFlags()->no_file_flush) { @@ -855,16 +339,8 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { // (fixed-sized) and let the tools reading the buffers deal with the data // afterwards. // - int Fd = -1; - { - // FIXME: Remove this section of the code, when we remove the struct-based - // configuration API. - SpinMutexLock Guard(&FDROptionsMutex); - Fd = FDROptions.Fd; - } - if (Fd == -1) - Fd = getLogFD(); - if (Fd == -1) { + LogWriter *LW = LogWriter::Open(); + if (LW == nullptr) { auto Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; atomic_store(&LogFlushStatus, Result, memory_order_release); return Result; @@ -872,8 +348,15 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { XRayFileHeader Header = fdrCommonHeaderInfo(); Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()}; - retryingWriteAll(Fd, reinterpret_cast<char *>(&Header), - reinterpret_cast<char *>(&Header) + sizeof(Header)); + LW->WriteAll(reinterpret_cast<char *>(&Header), + reinterpret_cast<char *>(&Header) + sizeof(Header)); + + // Release the current thread's buffer before we attempt to write out all the + // buffers. This ensures that in case we had only a single thread going, that + // we are able to capture the data nonetheless. + auto &TLD = getThreadLocalData(); + if (TLD.Controller != nullptr) + TLD.Controller->flush(); BQ->apply([&](const BufferQueue::Buffer &B) { // Starting at version 2 of the FDR logging implementation, we only write @@ -882,18 +365,18 @@ XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { // still use a Metadata record, but fill in the extents instead for the // data. MetadataRecord ExtentsRecord; - auto BufferExtents = atomic_load(&B.Extents->Size, memory_order_acquire); + auto BufferExtents = atomic_load(B.Extents, memory_order_acquire); DCHECK(BufferExtents <= B.Size); ExtentsRecord.Type = uint8_t(RecordType::Metadata); ExtentsRecord.RecordKind = uint8_t(MetadataRecord::RecordKinds::BufferExtents); internal_memcpy(ExtentsRecord.Data, &BufferExtents, sizeof(BufferExtents)); if (BufferExtents > 0) { - retryingWriteAll(Fd, reinterpret_cast<char *>(&ExtentsRecord), - reinterpret_cast<char *>(&ExtentsRecord) + - sizeof(MetadataRecord)); - retryingWriteAll(Fd, reinterpret_cast<char *>(B.Data), - reinterpret_cast<char *>(B.Data) + BufferExtents); + LW->WriteAll(reinterpret_cast<char *>(&ExtentsRecord), + reinterpret_cast<char *>(&ExtentsRecord) + + sizeof(MetadataRecord)); + LW->WriteAll(reinterpret_cast<char *>(B.Data), + reinterpret_cast<char *>(B.Data) + BufferExtents); } }); @@ -914,7 +397,12 @@ XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT { // Do special things to make the log finalize itself, and not allow any more // operations to be performed until re-initialized. 
- BQ->finalize(); + if (BQ == nullptr) { + if (Verbosity()) + Report("Attempting to finalize an uninitialized global buffer!\n"); + } else { + BQ->finalize(); + } atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED, memory_order_release); @@ -935,7 +423,8 @@ static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT { // Test once for required CPU features static pthread_once_t OnceProbe = PTHREAD_ONCE_INIT; static bool TSCSupported = true; - pthread_once(&OnceProbe, +[] { TSCSupported = probeRequiredCPUFeatures(); }); + pthread_once( + &OnceProbe, +[] { TSCSupported = probeRequiredCPUFeatures(); }); if (TSCSupported) { Result.TSC = __xray::readTSC(Result.CPU); @@ -953,16 +442,115 @@ static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT { return Result; } +thread_local atomic_uint8_t Running{0}; + +static bool setupTLD(ThreadLocalData &TLD) XRAY_NEVER_INSTRUMENT { + // Check if we're finalizing, before proceeding. + { + auto Status = atomic_load(&LoggingStatus, memory_order_acquire); + if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || + Status == XRayLogInitStatus::XRAY_LOG_FINALIZED) { + if (TLD.Controller != nullptr) { + TLD.Controller->flush(); + TLD.Controller = nullptr; + } + return false; + } + } + + if (UNLIKELY(TLD.Controller == nullptr)) { + // Set up the TLD buffer queue. + if (UNLIKELY(BQ == nullptr)) + return false; + TLD.BQ = BQ; + + // Check that we have a valid buffer. + if (TLD.Buffer.Generation != BQ->generation() && + TLD.BQ->releaseBuffer(TLD.Buffer) != BufferQueue::ErrorCode::Ok) + return false; + + // Set up a buffer, before setting up the log writer. Bail out on failure. + if (TLD.BQ->getBuffer(TLD.Buffer) != BufferQueue::ErrorCode::Ok) + return false; + + // Set up the Log Writer for this thread. + if (UNLIKELY(TLD.Writer == nullptr)) { + auto *LWStorage = reinterpret_cast<FDRLogWriter *>(&TLD.LWStorage); + new (LWStorage) FDRLogWriter(TLD.Buffer); + TLD.Writer = LWStorage; + } else { + TLD.Writer->resetRecord(); + } + + auto *CStorage = reinterpret_cast<FDRController<> *>(&TLD.CStorage); + new (CStorage) + FDRController<>(TLD.BQ, TLD.Buffer, *TLD.Writer, clock_gettime, + atomic_load_relaxed(&ThresholdTicks)); + TLD.Controller = CStorage; + } + + DCHECK_NE(TLD.Controller, nullptr); + return true; +} + void fdrLoggingHandleArg0(int32_t FuncId, XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { auto TC = getTimestamp(); - processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, 0, clock_gettime); + auto &TSC = TC.TSC; + auto &CPU = TC.CPU; + RecursionGuard Guard{Running}; + if (!Guard) + return; + + auto &TLD = getThreadLocalData(); + if (!setupTLD(TLD)) + return; + + switch (Entry) { + case XRayEntryType::ENTRY: + case XRayEntryType::LOG_ARGS_ENTRY: + TLD.Controller->functionEnter(FuncId, TSC, CPU); + return; + case XRayEntryType::EXIT: + TLD.Controller->functionExit(FuncId, TSC, CPU); + return; + case XRayEntryType::TAIL: + TLD.Controller->functionTailExit(FuncId, TSC, CPU); + return; + case XRayEntryType::CUSTOM_EVENT: + case XRayEntryType::TYPED_EVENT: + break; + } } void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType Entry, uint64_t Arg) XRAY_NEVER_INSTRUMENT { auto TC = getTimestamp(); - processFunctionHook(FuncId, Entry, TC.TSC, TC.CPU, Arg, clock_gettime); + auto &TSC = TC.TSC; + auto &CPU = TC.CPU; + RecursionGuard Guard{Running}; + if (!Guard) + return; + + auto &TLD = getThreadLocalData(); + if (!setupTLD(TLD)) + return; + + switch (Entry) { + case XRayEntryType::ENTRY: + case XRayEntryType::LOG_ARGS_ENTRY: + TLD.Controller->functionEnterArg(FuncId, 
TSC, CPU, Arg); + return; + case XRayEntryType::EXIT: + TLD.Controller->functionExit(FuncId, TSC, CPU); + return; + case XRayEntryType::TAIL: + TLD.Controller->functionTailExit(FuncId, TSC, CPU); + return; + case XRayEntryType::CUSTOM_EVENT: + case XRayEntryType::TYPED_EVENT: + break; + } } void fdrLoggingHandleCustomEvent(void *Event, @@ -973,40 +561,25 @@ void fdrLoggingHandleCustomEvent(void *Event, RecursionGuard Guard{Running}; if (!Guard) return; - if (EventSize > std::numeric_limits<int32_t>::max()) { + + // Complain when we ever get at least one custom event that's larger than what + // we can possibly support. + if (EventSize > + static_cast<std::size_t>(std::numeric_limits<int32_t>::max())) { static pthread_once_t Once = PTHREAD_ONCE_INIT; - pthread_once(&Once, +[] { Report("Event size too large.\n"); }); + pthread_once( + &Once, +[] { + Report("Custom event size too large; truncating to %d.\n", + std::numeric_limits<int32_t>::max()); + }); } - int32_t ReducedEventSize = static_cast<int32_t>(EventSize); - auto &TLD = getThreadLocalData(); - if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, clock_gettime)) - return; - // Here we need to prepare the log to handle: - // - The metadata record we're going to write. (16 bytes) - // - The additional data we're going to write. Currently, that's the size - // of the event we're going to dump into the log as free-form bytes. - if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) { - TLD.BQ = nullptr; + auto &TLD = getThreadLocalData(); + if (!setupTLD(TLD)) return; - } - // Write the custom event metadata record, which consists of the following - // information: - // - 8 bytes (64-bits) for the full TSC when the event started. - // - 4 bytes (32-bits) for the length of the data. - MetadataRecord CustomEvent; - CustomEvent.Type = uint8_t(RecordType::Metadata); - CustomEvent.RecordKind = - uint8_t(MetadataRecord::RecordKinds::CustomEventMarker); - constexpr auto TSCSize = sizeof(TC.TSC); - internal_memcpy(&CustomEvent.Data, &ReducedEventSize, sizeof(int32_t)); - internal_memcpy(&CustomEvent.Data[sizeof(int32_t)], &TSC, TSCSize); - internal_memcpy(TLD.RecordPtr, &CustomEvent, sizeof(CustomEvent)); - TLD.RecordPtr += sizeof(CustomEvent); - internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize); - incrementExtents(MetadataRecSize + EventSize); - endBufferIfFull(); + int32_t ReducedEventSize = static_cast<int32_t>(EventSize); + TLD.Controller->customEvent(TSC, CPU, Event, ReducedEventSize); } void fdrLoggingHandleTypedEvent( @@ -1018,50 +591,28 @@ void fdrLoggingHandleTypedEvent( RecursionGuard Guard{Running}; if (!Guard) return; - if (EventSize > std::numeric_limits<int32_t>::max()) { + + // Complain when we ever get at least one typed event that's larger than what + // we can possibly support. + if (EventSize > + static_cast<std::size_t>(std::numeric_limits<int32_t>::max())) { static pthread_once_t Once = PTHREAD_ONCE_INIT; - pthread_once(&Once, +[] { Report("Event size too large.\n"); }); + pthread_once( + &Once, +[] { + Report("Typed event size too large; truncating to %d.\n", + std::numeric_limits<int32_t>::max()); + }); } - int32_t ReducedEventSize = static_cast<int32_t>(EventSize); + auto &TLD = getThreadLocalData(); - if (!isLogInitializedAndReady(TLD.BQ, TSC, CPU, clock_gettime)) + if (!setupTLD(TLD)) return; - // Here we need to prepare the log to handle: - // - The metadata record we're going to write. (16 bytes) - // - The additional data we're going to write. 
Currently, that's the size - // of the event we're going to dump into the log as free-form bytes. - if (!prepareBuffer(TSC, CPU, clock_gettime, MetadataRecSize + EventSize)) { - TLD.BQ = nullptr; - return; - } - // Write the custom event metadata record, which consists of the following - // information: - // - 8 bytes (64-bits) for the full TSC when the event started. - // - 4 bytes (32-bits) for the length of the data. - // - 2 bytes (16-bits) for the event type. 3 bytes remain since one of the - // bytes has the record type (Metadata Record) and kind (TypedEvent). - // We'll log the error if the event type is greater than 2 bytes. - // Event types are generated sequentially, so 2^16 is enough. - MetadataRecord TypedEvent; - TypedEvent.Type = uint8_t(RecordType::Metadata); - TypedEvent.RecordKind = - uint8_t(MetadataRecord::RecordKinds::TypedEventMarker); - constexpr auto TSCSize = sizeof(TC.TSC); - internal_memcpy(&TypedEvent.Data, &ReducedEventSize, sizeof(int32_t)); - internal_memcpy(&TypedEvent.Data[sizeof(int32_t)], &TSC, TSCSize); - internal_memcpy(&TypedEvent.Data[sizeof(int32_t) + TSCSize], &EventType, - sizeof(EventType)); - internal_memcpy(TLD.RecordPtr, &TypedEvent, sizeof(TypedEvent)); - - TLD.RecordPtr += sizeof(TypedEvent); - internal_memcpy(TLD.RecordPtr, Event, ReducedEventSize); - incrementExtents(MetadataRecSize + EventSize); - endBufferIfFull(); + int32_t ReducedEventSize = static_cast<int32_t>(EventSize); + TLD.Controller->typedEvent(TSC, CPU, EventType, Event, ReducedEventSize); } -XRayLogInitStatus fdrLoggingInit(size_t BufferSize, size_t BufferMax, - void *Options, +XRayLogInitStatus fdrLoggingInit(size_t, size_t, void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT { if (Options == nullptr) return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; @@ -1075,107 +626,81 @@ XRayLogInitStatus fdrLoggingInit(size_t BufferSize, size_t BufferMax, return static_cast<XRayLogInitStatus>(CurrentStatus); } - // Because of __xray_log_init_mode(...) which guarantees that this will be - // called with BufferSize == 0 and BufferMax == 0 we parse the configuration - // provided in the Options pointer as a string instead. - if (BufferSize == 0 && BufferMax == 0) { - if (Verbosity()) - Report("Initializing FDR mode with options: %s\n", - static_cast<const char *>(Options)); - - // TODO: Factor out the flags specific to the FDR mode implementation. For - // now, use the global/single definition of the flags, since the FDR mode - // flags are already defined there. - FlagParser FDRParser; - FDRFlags FDRFlags; - registerXRayFDRFlags(&FDRParser, &FDRFlags); - FDRFlags.setDefaults(); - - // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided - // options until we migrate everyone to use the XRAY_FDR_OPTIONS - // compiler-provided options. - FDRParser.ParseString(useCompilerDefinedFlags()); - FDRParser.ParseString(useCompilerDefinedFDRFlags()); - auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS"); - if (EnvOpts == nullptr) - EnvOpts = ""; - FDRParser.ParseString(EnvOpts); - - // FIXME: Remove this when we fully remove the deprecated flags. - if (internal_strlen(EnvOpts) == 0) { - FDRFlags.func_duration_threshold_us = - flags()->xray_fdr_log_func_duration_threshold_us; - FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms; - } - - // The provided options should always override the compiler-provided and - // environment-variable defined options. 
- FDRParser.ParseString(static_cast<const char *>(Options)); - *fdrFlags() = FDRFlags; - BufferSize = FDRFlags.buffer_size; - BufferMax = FDRFlags.buffer_max; - SpinMutexLock Guard(&FDROptionsMutex); - FDROptions.Fd = -1; - FDROptions.ReportErrors = true; - } else if (OptionsSize != sizeof(FDRLoggingOptions)) { - // FIXME: This is deprecated, and should really be removed. - // At this point we use the flag parser specific to the FDR mode - // implementation. - if (Verbosity()) - Report("Cannot initialize FDR logging; wrong size for options: %d\n", - OptionsSize); - return static_cast<XRayLogInitStatus>( - atomic_load(&LoggingStatus, memory_order_acquire)); - } else { - if (Verbosity()) - Report("XRay FDR: struct-based init is deprecated, please use " - "string-based configuration instead.\n"); - SpinMutexLock Guard(&FDROptionsMutex); - internal_memcpy(&FDROptions, Options, OptionsSize); - } - - bool Success = false; - - if (BQ != nullptr) { - BQ->~BufferQueue(); - InternalFree(BQ); - BQ = nullptr; - } + if (Verbosity()) + Report("Initializing FDR mode with options: %s\n", + static_cast<const char *>(Options)); + + // TODO: Factor out the flags specific to the FDR mode implementation. For + // now, use the global/single definition of the flags, since the FDR mode + // flags are already defined there. + FlagParser FDRParser; + FDRFlags FDRFlags; + registerXRayFDRFlags(&FDRParser, &FDRFlags); + FDRFlags.setDefaults(); + + // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided + // options until we migrate everyone to use the XRAY_FDR_OPTIONS + // compiler-provided options. + FDRParser.ParseString(useCompilerDefinedFlags()); + FDRParser.ParseString(useCompilerDefinedFDRFlags()); + auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS"); + if (EnvOpts == nullptr) + EnvOpts = ""; + FDRParser.ParseString(EnvOpts); + + // FIXME: Remove this when we fully remove the deprecated flags. + if (internal_strlen(EnvOpts) == 0) { + FDRFlags.func_duration_threshold_us = + flags()->xray_fdr_log_func_duration_threshold_us; + FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms; + } + + // The provided options should always override the compiler-provided and + // environment-variable defined options. + FDRParser.ParseString(static_cast<const char *>(Options)); + *fdrFlags() = FDRFlags; + auto BufferSize = FDRFlags.buffer_size; + auto BufferMax = FDRFlags.buffer_max; if (BQ == nullptr) { - BQ = reinterpret_cast<BufferQueue *>( - InternalAlloc(sizeof(BufferQueue), nullptr, 64)); + bool Success = false; + BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage); new (BQ) BufferQueue(BufferSize, BufferMax, Success); - } - - if (!Success) { - Report("BufferQueue init failed.\n"); - if (BQ != nullptr) { - BQ->~BufferQueue(); - InternalFree(BQ); - BQ = nullptr; + if (!Success) { + Report("BufferQueue init failed.\n"); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + } + } else { + if (BQ->init(BufferSize, BufferMax) != BufferQueue::ErrorCode::Ok) { + if (Verbosity()) + Report("Failed to re-initialize global buffer queue. Init failed.\n"); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; } - return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; } static pthread_once_t OnceInit = PTHREAD_ONCE_INIT; - pthread_once(&OnceInit, +[] { - atomic_store(&TicksPerSec, - probeRequiredCPUFeatures() ? 
getTSCFrequency() - : __xray::NanosecondsPerSecond, - memory_order_release); - pthread_key_create(&Key, +[](void *TLDPtr) { - if (TLDPtr == nullptr) - return; - auto &TLD = *reinterpret_cast<ThreadLocalData *>(TLDPtr); - if (TLD.BQ == nullptr) - return; - auto EC = TLD.BQ->releaseBuffer(TLD.Buffer); - if (EC != BufferQueue::ErrorCode::Ok) - Report("At thread exit, failed to release buffer at %p; error=%s\n", - TLD.Buffer.Data, BufferQueue::getErrorString(EC)); - }); - }); + pthread_once( + &OnceInit, +[] { + atomic_store(&TicksPerSec, + probeRequiredCPUFeatures() ? getTSCFrequency() + : __xray::NanosecondsPerSecond, + memory_order_release); + pthread_key_create( + &Key, +[](void *TLDPtr) { + if (TLDPtr == nullptr) + return; + auto &TLD = *reinterpret_cast<ThreadLocalData *>(TLDPtr); + if (TLD.BQ == nullptr) + return; + if (TLD.Buffer.Data == nullptr) + return; + auto EC = TLD.BQ->releaseBuffer(TLD.Buffer); + if (EC != BufferQueue::ErrorCode::Ok) + Report("At thread exit, failed to release buffer at %p; " + "error=%s\n", + TLD.Buffer.Data, BufferQueue::getErrorString(EC)); + }); + }); atomic_store(&ThresholdTicks, atomic_load_relaxed(&TicksPerSec) * @@ -1209,11 +734,22 @@ bool fdrLogDynamicInitializer() XRAY_NEVER_INSTRUMENT { }; auto RegistrationResult = __xray_log_register_mode("xray-fdr", Impl); if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && - Verbosity()) + Verbosity()) { Report("Cannot register XRay FDR mode to 'xray-fdr'; error = %d\n", RegistrationResult); - if (flags()->xray_fdr_log || !internal_strcmp(flags()->xray_mode, "xray-fdr")) - __xray_set_log_impl(Impl); + return false; + } + + if (flags()->xray_fdr_log || + !internal_strcmp(flags()->xray_mode, "xray-fdr")) { + auto SelectResult = __xray_log_select_mode("xray-fdr"); + if (SelectResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && + Verbosity()) { + Report("Cannot select XRay FDR mode as 'xray-fdr'; error = %d\n", + SelectResult); + return false; + } + } return true; } diff --git a/contrib/compiler-rt/lib/xray/xray_function_call_trie.h b/contrib/compiler-rt/lib/xray/xray_function_call_trie.h index 2acf14aa5625..d01ad20e3d71 100644 --- a/contrib/compiler-rt/lib/xray/xray_function_call_trie.h +++ b/contrib/compiler-rt/lib/xray/xray_function_call_trie.h @@ -15,9 +15,11 @@ #ifndef XRAY_FUNCTION_CALL_TRIE_H #define XRAY_FUNCTION_CALL_TRIE_H -#include "sanitizer_common/sanitizer_allocator_internal.h" +#include "xray_buffer_queue.h" +#include "xray_defs.h" #include "xray_profiling_flags.h" #include "xray_segmented_array.h" +#include <limits> #include <memory> // For placement new. #include <utility> @@ -97,9 +99,6 @@ public: struct NodeIdPair { Node *NodePtr; int32_t FId; - - // Constructor for inplace-construction. - NodeIdPair(Node *N, int32_t F) : NodePtr(N), FId(F) {} }; using NodeIdPairArray = Array<NodeIdPair>; @@ -113,17 +112,10 @@ public: struct Node { Node *Parent; NodeIdPairArray Callees; - int64_t CallCount; - int64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time. + uint64_t CallCount; + uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time. int32_t FId; - // We add a constructor here to allow us to inplace-construct through - // Array<...>'s AppendEmplace. - Node(Node *P, NodeIdPairAllocatorType &A, int64_t CC, int64_t CLT, - int32_t F) - : Parent(P), Callees(A), CallCount(CC), CumulativeLocalTime(CLT), - FId(F) {} - // TODO: Include the compact histogram. 
}; @@ -131,10 +123,7 @@ private: struct ShadowStackEntry { uint64_t EntryTSC; Node *NodePtr; - - // We add a constructor here to allow us to inplace-construct through - // Array<...>'s AppendEmplace. - ShadowStackEntry(uint64_t T, Node *N) : EntryTSC{T}, NodePtr{N} {} + uint16_t EntryCPU; }; using NodeArray = Array<Node>; @@ -149,103 +138,184 @@ public: using RootAllocatorType = RootArray::AllocatorType; using ShadowStackAllocatorType = ShadowStackArray::AllocatorType; + // Use hosted aligned storage members to allow for trivial move and init. + // This also allows us to sidestep the potential-failing allocation issue. + typename std::aligned_storage<sizeof(NodeAllocatorType), + alignof(NodeAllocatorType)>::type + NodeAllocatorStorage; + typename std::aligned_storage<sizeof(RootAllocatorType), + alignof(RootAllocatorType)>::type + RootAllocatorStorage; + typename std::aligned_storage<sizeof(ShadowStackAllocatorType), + alignof(ShadowStackAllocatorType)>::type + ShadowStackAllocatorStorage; + typename std::aligned_storage<sizeof(NodeIdPairAllocatorType), + alignof(NodeIdPairAllocatorType)>::type + NodeIdPairAllocatorStorage; + NodeAllocatorType *NodeAllocator = nullptr; RootAllocatorType *RootAllocator = nullptr; ShadowStackAllocatorType *ShadowStackAllocator = nullptr; NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr; - Allocators() {} + Allocators() = default; Allocators(const Allocators &) = delete; Allocators &operator=(const Allocators &) = delete; - Allocators(Allocators &&O) - : NodeAllocator(O.NodeAllocator), RootAllocator(O.RootAllocator), - ShadowStackAllocator(O.ShadowStackAllocator), - NodeIdPairAllocator(O.NodeIdPairAllocator) { + struct Buffers { + BufferQueue::Buffer NodeBuffer; + BufferQueue::Buffer RootsBuffer; + BufferQueue::Buffer ShadowStackBuffer; + BufferQueue::Buffer NodeIdPairBuffer; + }; + + explicit Allocators(Buffers &B) XRAY_NEVER_INSTRUMENT { + new (&NodeAllocatorStorage) + NodeAllocatorType(B.NodeBuffer.Data, B.NodeBuffer.Size); + NodeAllocator = + reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + + new (&RootAllocatorStorage) + RootAllocatorType(B.RootsBuffer.Data, B.RootsBuffer.Size); + RootAllocator = + reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + + new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType( + B.ShadowStackBuffer.Data, B.ShadowStackBuffer.Size); + ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( + &ShadowStackAllocatorStorage); + + new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType( + B.NodeIdPairBuffer.Data, B.NodeIdPairBuffer.Size); + NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( + &NodeIdPairAllocatorStorage); + } + + explicit Allocators(uptr Max) XRAY_NEVER_INSTRUMENT { + new (&NodeAllocatorStorage) NodeAllocatorType(Max); + NodeAllocator = + reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + + new (&RootAllocatorStorage) RootAllocatorType(Max); + RootAllocator = + reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + + new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(Max); + ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( + &ShadowStackAllocatorStorage); + + new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(Max); + NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( + &NodeIdPairAllocatorStorage); + } + + Allocators(Allocators &&O) XRAY_NEVER_INSTRUMENT { + // Here we rely on the safety of memcpy'ing contents of the storage + // members, and then pointing the source 
pointers to nullptr. + internal_memcpy(&NodeAllocatorStorage, &O.NodeAllocatorStorage, + sizeof(NodeAllocatorType)); + internal_memcpy(&RootAllocatorStorage, &O.RootAllocatorStorage, + sizeof(RootAllocatorType)); + internal_memcpy(&ShadowStackAllocatorStorage, + &O.ShadowStackAllocatorStorage, + sizeof(ShadowStackAllocatorType)); + internal_memcpy(&NodeIdPairAllocatorStorage, + &O.NodeIdPairAllocatorStorage, + sizeof(NodeIdPairAllocatorType)); + + NodeAllocator = + reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + RootAllocator = + reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( + &ShadowStackAllocatorStorage); + NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( + &NodeIdPairAllocatorStorage); + O.NodeAllocator = nullptr; O.RootAllocator = nullptr; O.ShadowStackAllocator = nullptr; O.NodeIdPairAllocator = nullptr; } - Allocators &operator=(Allocators &&O) { - { - auto Tmp = O.NodeAllocator; - O.NodeAllocator = this->NodeAllocator; - this->NodeAllocator = Tmp; - } - { - auto Tmp = O.RootAllocator; - O.RootAllocator = this->RootAllocator; - this->RootAllocator = Tmp; - } - { - auto Tmp = O.ShadowStackAllocator; - O.ShadowStackAllocator = this->ShadowStackAllocator; - this->ShadowStackAllocator = Tmp; - } - { - auto Tmp = O.NodeIdPairAllocator; - O.NodeIdPairAllocator = this->NodeIdPairAllocator; - this->NodeIdPairAllocator = Tmp; - } - return *this; - } - - ~Allocators() { - // Note that we cannot use delete on these pointers, as they need to be - // returned to the sanitizer_common library's internal memory tracking - // system. - if (NodeAllocator != nullptr) { + Allocators &operator=(Allocators &&O) XRAY_NEVER_INSTRUMENT { + // When moving into an existing instance, we ensure that we clean up the + // current allocators. 
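The Allocators move operations above rely on placement-constructing the live allocator into member aligned storage, tearing down whatever was held before and nulling out the source. A reduced sketch of that pattern under the assumption of a single hosted object; Widget and Holder are hypothetical stand-ins.

#include <new>
#include <type_traits>
#include <utility>

struct Widget {
  Widget() = default;
  Widget(Widget &&) = default;
};

class Holder {
  typename std::aligned_storage<sizeof(Widget), alignof(Widget)>::type Storage;
  Widget *Ptr = nullptr;

public:
  Holder() = default;
  Holder(const Holder &) = delete;
  Holder &operator=(const Holder &) = delete;

  Holder &operator=(Holder &&O) {
    if (this == &O)
      return *this;
    if (Ptr != nullptr) // destroy whatever we currently host
      Ptr->~Widget();
    if (O.Ptr != nullptr) {
      Ptr = new (&Storage) Widget(std::move(*O.Ptr)); // adopt the other's object
      O.Ptr = nullptr;
    } else {
      Ptr = nullptr;
    }
    return *this;
  }

  ~Holder() {
    if (Ptr != nullptr)
      Ptr->~Widget();
  }
};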
+ if (NodeAllocator) NodeAllocator->~NodeAllocatorType(); - InternalFree(NodeAllocator); + if (O.NodeAllocator) { + new (&NodeAllocatorStorage) + NodeAllocatorType(std::move(*O.NodeAllocator)); + NodeAllocator = + reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + O.NodeAllocator = nullptr; + } else { NodeAllocator = nullptr; } - if (RootAllocator != nullptr) { + + if (RootAllocator) RootAllocator->~RootAllocatorType(); - InternalFree(RootAllocator); + if (O.RootAllocator) { + new (&RootAllocatorStorage) + RootAllocatorType(std::move(*O.RootAllocator)); + RootAllocator = + reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + O.RootAllocator = nullptr; + } else { RootAllocator = nullptr; } - if (ShadowStackAllocator != nullptr) { + + if (ShadowStackAllocator) ShadowStackAllocator->~ShadowStackAllocatorType(); - InternalFree(ShadowStackAllocator); + if (O.ShadowStackAllocator) { + new (&ShadowStackAllocatorStorage) + ShadowStackAllocatorType(std::move(*O.ShadowStackAllocator)); + ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( + &ShadowStackAllocatorStorage); + O.ShadowStackAllocator = nullptr; + } else { ShadowStackAllocator = nullptr; } - if (NodeIdPairAllocator != nullptr) { + + if (NodeIdPairAllocator) NodeIdPairAllocator->~NodeIdPairAllocatorType(); - InternalFree(NodeIdPairAllocator); + if (O.NodeIdPairAllocator) { + new (&NodeIdPairAllocatorStorage) + NodeIdPairAllocatorType(std::move(*O.NodeIdPairAllocator)); + NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( + &NodeIdPairAllocatorStorage); + O.NodeIdPairAllocator = nullptr; + } else { NodeIdPairAllocator = nullptr; } + + return *this; + } + + ~Allocators() XRAY_NEVER_INSTRUMENT { + if (NodeAllocator != nullptr) + NodeAllocator->~NodeAllocatorType(); + if (RootAllocator != nullptr) + RootAllocator->~RootAllocatorType(); + if (ShadowStackAllocator != nullptr) + ShadowStackAllocator->~ShadowStackAllocatorType(); + if (NodeIdPairAllocator != nullptr) + NodeIdPairAllocator->~NodeIdPairAllocatorType(); } }; - // TODO: Support configuration of options through the arguments. 
- static Allocators InitAllocators() { + static Allocators InitAllocators() XRAY_NEVER_INSTRUMENT { return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max); } - static Allocators InitAllocatorsCustom(uptr Max) { - Allocators A; - auto NodeAllocator = reinterpret_cast<Allocators::NodeAllocatorType *>( - InternalAlloc(sizeof(Allocators::NodeAllocatorType))); - new (NodeAllocator) Allocators::NodeAllocatorType(Max); - A.NodeAllocator = NodeAllocator; - - auto RootAllocator = reinterpret_cast<Allocators::RootAllocatorType *>( - InternalAlloc(sizeof(Allocators::RootAllocatorType))); - new (RootAllocator) Allocators::RootAllocatorType(Max); - A.RootAllocator = RootAllocator; - - auto ShadowStackAllocator = - reinterpret_cast<Allocators::ShadowStackAllocatorType *>( - InternalAlloc(sizeof(Allocators::ShadowStackAllocatorType))); - new (ShadowStackAllocator) Allocators::ShadowStackAllocatorType(Max); - A.ShadowStackAllocator = ShadowStackAllocator; - - auto NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( - InternalAlloc(sizeof(NodeIdPairAllocatorType))); - new (NodeIdPairAllocator) NodeIdPairAllocatorType(Max); - A.NodeIdPairAllocator = NodeIdPairAllocator; + static Allocators InitAllocatorsCustom(uptr Max) XRAY_NEVER_INSTRUMENT { + Allocators A(Max); + return A; + } + + static Allocators + InitAllocatorsFromBuffers(Allocators::Buffers &Bufs) XRAY_NEVER_INSTRUMENT { + Allocators A(Bufs); return A; } @@ -253,65 +323,135 @@ private: NodeArray Nodes; RootArray Roots; ShadowStackArray ShadowStack; - NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr; + NodeIdPairAllocatorType *NodeIdPairAllocator; + uint32_t OverflowedFunctions; public: - explicit FunctionCallTrie(const Allocators &A) - : Nodes(*A.NodeAllocator), Roots(*A.RootAllocator), + explicit FunctionCallTrie(const Allocators &A) XRAY_NEVER_INSTRUMENT + : Nodes(*A.NodeAllocator), + Roots(*A.RootAllocator), ShadowStack(*A.ShadowStackAllocator), - NodeIdPairAllocator(A.NodeIdPairAllocator) {} + NodeIdPairAllocator(A.NodeIdPairAllocator), + OverflowedFunctions(0) {} + + FunctionCallTrie() = delete; + FunctionCallTrie(const FunctionCallTrie &) = delete; + FunctionCallTrie &operator=(const FunctionCallTrie &) = delete; + + FunctionCallTrie(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT + : Nodes(std::move(O.Nodes)), + Roots(std::move(O.Roots)), + ShadowStack(std::move(O.ShadowStack)), + NodeIdPairAllocator(O.NodeIdPairAllocator), + OverflowedFunctions(O.OverflowedFunctions) {} + + FunctionCallTrie &operator=(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT { + Nodes = std::move(O.Nodes); + Roots = std::move(O.Roots); + ShadowStack = std::move(O.ShadowStack); + NodeIdPairAllocator = O.NodeIdPairAllocator; + OverflowedFunctions = O.OverflowedFunctions; + return *this; + } + + ~FunctionCallTrie() XRAY_NEVER_INSTRUMENT {} - void enterFunction(const int32_t FId, uint64_t TSC) { + void enterFunction(const int32_t FId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { DCHECK_NE(FId, 0); - // This function primarily deals with ensuring that the ShadowStack is - // consistent and ready for when an exit event is encountered. + + // If we're already overflowed the function call stack, do not bother + // attempting to record any more function entries. + if (UNLIKELY(OverflowedFunctions)) { + ++OverflowedFunctions; + return; + } + + // If this is the first function we've encountered, we want to set up the + // node(s) and treat it as a root. 
if (UNLIKELY(ShadowStack.empty())) { - auto NewRoot = - Nodes.AppendEmplace(nullptr, *NodeIdPairAllocator, 0, 0, FId); + auto *NewRoot = Nodes.AppendEmplace( + nullptr, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId); if (UNLIKELY(NewRoot == nullptr)) return; - Roots.Append(NewRoot); - ShadowStack.AppendEmplace(TSC, NewRoot); + if (Roots.AppendEmplace(NewRoot) == nullptr) { + Nodes.trim(1); + return; + } + if (ShadowStack.AppendEmplace(TSC, NewRoot, CPU) == nullptr) { + Nodes.trim(1); + Roots.trim(1); + ++OverflowedFunctions; + return; + } return; } - auto &Top = ShadowStack.back(); - auto TopNode = Top.NodePtr; + // From this point on, we require that the stack is not empty. + DCHECK(!ShadowStack.empty()); + auto TopNode = ShadowStack.back().NodePtr; DCHECK_NE(TopNode, nullptr); - // If we've seen this callee before, then we just access that node and place - // that on the top of the stack. - auto Callee = TopNode->Callees.find_element( + // If we've seen this callee before, then we access that node and place that + // on the top of the stack. + auto* Callee = TopNode->Callees.find_element( [FId](const NodeIdPair &NR) { return NR.FId == FId; }); if (Callee != nullptr) { CHECK_NE(Callee->NodePtr, nullptr); - ShadowStack.AppendEmplace(TSC, Callee->NodePtr); + if (ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU) == nullptr) + ++OverflowedFunctions; return; } // This means we've never seen this stack before, create a new node here. - auto NewNode = - Nodes.AppendEmplace(TopNode, *NodeIdPairAllocator, 0, 0, FId); + auto* NewNode = Nodes.AppendEmplace( + TopNode, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId); if (UNLIKELY(NewNode == nullptr)) return; DCHECK_NE(NewNode, nullptr); TopNode->Callees.AppendEmplace(NewNode, FId); - ShadowStack.AppendEmplace(TSC, NewNode); - DCHECK_NE(ShadowStack.back().NodePtr, nullptr); + if (ShadowStack.AppendEmplace(TSC, NewNode, CPU) == nullptr) + ++OverflowedFunctions; return; } - void exitFunction(int32_t FId, uint64_t TSC) { + void exitFunction(int32_t FId, uint64_t TSC, + uint16_t CPU) XRAY_NEVER_INSTRUMENT { + // If we're exiting functions that have "overflowed" or don't fit into the + // stack due to allocator constraints, we then decrement that count first. + if (OverflowedFunctions) { + --OverflowedFunctions; + return; + } + // When we exit a function, we look up the ShadowStack to see whether we've // entered this function before. We do as little processing here as we can, // since most of the hard work would have already been done at function // entry. uint64_t CumulativeTreeTime = 0; + while (!ShadowStack.empty()) { const auto &Top = ShadowStack.back(); auto TopNode = Top.NodePtr; DCHECK_NE(TopNode, nullptr); - auto LocalTime = TSC - Top.EntryTSC; + + // We may encounter overflow on the TSC we're provided, which may end up + // being less than the TSC when we first entered the function. + // + // To get the accurate measurement of cycles, we need to check whether + // we've overflowed (TSC < Top.EntryTSC) and then account the difference + // between the entry TSC and the max for the TSC counter (max of uint64_t) + // then add the value of TSC. We can prove that the maximum delta we will + // get is at most the 64-bit unsigned value, since the difference between + // a TSC of 0 and a Top.EntryTSC of 1 is (numeric_limits<uint64_t>::max() + // - 1) + 1. + // + // NOTE: This assumes that TSCs are synchronised across CPUs. + // TODO: Count the number of times we've seen CPU migrations. + uint64_t LocalTime = + Top.EntryTSC > TSC + ? 
(std::numeric_limits<uint64_t>::max() - Top.EntryTSC) + TSC + : TSC - Top.EntryTSC; TopNode->CallCount++; TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime; CumulativeTreeTime += LocalTime; @@ -323,7 +463,7 @@ public: } } - const RootArray &getRoots() const { return Roots; } + const RootArray &getRoots() const XRAY_NEVER_INSTRUMENT { return Roots; } // The deepCopyInto operation will update the provided FunctionCallTrie by // re-creating the contents of this particular FunctionCallTrie in the other @@ -338,7 +478,7 @@ public: // synchronisation of both "this" and |O|. // // This function must *not* be called with a non-empty FunctionCallTrie |O|. - void deepCopyInto(FunctionCallTrie &O) const { + void deepCopyInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT { DCHECK(O.getRoots().empty()); // We then push the root into a stack, to use as the parent marker for new @@ -356,18 +496,20 @@ public: for (const auto Root : getRoots()) { // Add a node in O for this root. auto NewRoot = O.Nodes.AppendEmplace( - nullptr, *O.NodeIdPairAllocator, Root->CallCount, + nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), Root->CallCount, Root->CumulativeLocalTime, Root->FId); // Because we cannot allocate more memory we should bail out right away. if (UNLIKELY(NewRoot == nullptr)) return; - O.Roots.Append(NewRoot); + if (UNLIKELY(O.Roots.Append(NewRoot) == nullptr)) + return; // TODO: Figure out what to do if we fail to allocate any more stack // space. Maybe warn or report once? - DFSStack.AppendEmplace(Root, NewRoot); + if (DFSStack.AppendEmplace(Root, NewRoot) == nullptr) + return; while (!DFSStack.empty()) { NodeAndParent NP = DFSStack.back(); DCHECK_NE(NP.Node, nullptr); @@ -375,12 +517,17 @@ public: DFSStack.trim(1); for (const auto Callee : NP.Node->Callees) { auto NewNode = O.Nodes.AppendEmplace( - NP.NewNode, *O.NodeIdPairAllocator, Callee.NodePtr->CallCount, - Callee.NodePtr->CumulativeLocalTime, Callee.FId); + NP.NewNode, NodeIdPairArray(*O.NodeIdPairAllocator), + Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime, + Callee.FId); if (UNLIKELY(NewNode == nullptr)) return; - NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId); - DFSStack.AppendEmplace(Callee.NodePtr, NewNode); + if (UNLIKELY(NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId) == + nullptr)) + return; + if (UNLIKELY(DFSStack.AppendEmplace(Callee.NodePtr, NewNode) == + nullptr)) + return; } } } @@ -394,7 +541,7 @@ public: // // This function is *not* thread-safe, and may require external // synchronisation of both "this" and |O|. 
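Stepping back to the LocalTime computation at the top of this hunk: the branch guards against the TSC wrapping between function entry and exit. A small standalone check of what that expression yields, with made-up counter values:

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    // Mirrors the branch used in exitFunction above.
    static uint64_t localTime(uint64_t EntryTSC, uint64_t ExitTSC) {
      return EntryTSC > ExitTSC
                 ? (std::numeric_limits<uint64_t>::max() - EntryTSC) + ExitTSC
                 : ExitTSC - EntryTSC;
    }

    int main() {
      // No wrap: entered at 1000, exited at 1500 -> 500 cycles.
      std::printf("%llu\n", (unsigned long long)localTime(1000, 1500));
      // Wrap: entered near the top of the counter, exited after it rolled over.
      uint64_t Max = std::numeric_limits<uint64_t>::max();
      std::printf("%llu\n", (unsigned long long)localTime(Max - 10, 5)); // 15
      return 0;
    }

For comparison, plain unsigned subtraction (ExitTSC - EntryTSC) would give 16 in the wrapped case here, since modular arithmetic absorbs the rollover on its own; the explicit branch comes out one tick lower in that case, which is negligible at TSC granularity.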
- void mergeInto(FunctionCallTrie &O) const { + void mergeInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT { struct NodeAndTarget { FunctionCallTrie::Node *OrigNode; FunctionCallTrie::Node *TargetNode; @@ -409,8 +556,9 @@ public: auto R = O.Roots.find_element( [&](const Node *Node) { return Node->FId == Root->FId; }); if (R == nullptr) { - TargetRoot = O.Nodes.AppendEmplace(nullptr, *O.NodeIdPairAllocator, 0, - 0, Root->FId); + TargetRoot = O.Nodes.AppendEmplace( + nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u, + Root->FId); if (UNLIKELY(TargetRoot == nullptr)) return; @@ -419,7 +567,7 @@ public: TargetRoot = *R; } - DFSStack.Append(NodeAndTarget{Root, TargetRoot}); + DFSStack.AppendEmplace(Root, TargetRoot); while (!DFSStack.empty()) { NodeAndTarget NT = DFSStack.back(); DCHECK_NE(NT.OrigNode, nullptr); @@ -435,7 +583,8 @@ public: }); if (TargetCallee == nullptr) { auto NewTargetNode = O.Nodes.AppendEmplace( - NT.TargetNode, *O.NodeIdPairAllocator, 0, 0, Callee.FId); + NT.TargetNode, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u, + Callee.FId); if (UNLIKELY(NewTargetNode == nullptr)) return; diff --git a/contrib/compiler-rt/lib/xray/xray_init.cc b/contrib/compiler-rt/lib/xray/xray_init.cc index b4e069795195..b0922aa8e379 100644 --- a/contrib/compiler-rt/lib/xray/xray_init.cc +++ b/contrib/compiler-rt/lib/xray/xray_init.cc @@ -27,6 +27,15 @@ extern const XRaySledEntry __start_xray_instr_map[] __attribute__((weak)); extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak)); extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak)); extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak)); + +#if SANITIZER_MAC +// HACK: This is a temporary workaround to make XRay build on +// Darwin, but it will probably not work at runtime. +const XRaySledEntry __start_xray_instr_map[] = {}; +extern const XRaySledEntry __stop_xray_instr_map[] = {}; +extern const XRayFunctionSledIndex __start_xray_fn_idx[] = {}; +extern const XRayFunctionSledIndex __stop_xray_fn_idx[] = {}; +#endif } using namespace __xray; @@ -58,6 +67,9 @@ void __xray_init() XRAY_NEVER_INSTRUMENT { if (atomic_load(&XRayInitialized, memory_order_acquire)) return; + // XRAY is not compatible with PaX MPROTECT + CheckMPROTECT(); + if (!atomic_load(&XRayFlagsInitialized, memory_order_acquire)) { initializeFlags(); atomic_store(&XRayFlagsInitialized, true, memory_order_release); @@ -97,8 +109,8 @@ __attribute__((section(".preinit_array"), #else // If we cannot use the .preinit_array section, we should instead use dynamic // initialisation. 
-static bool UNUSED __local_xray_dyninit = [] { +__attribute__ ((constructor (0))) +static void __local_xray_dyninit() { __xray_init(); - return true; -}(); +} #endif diff --git a/contrib/compiler-rt/lib/xray/xray_interface.cc b/contrib/compiler-rt/lib/xray/xray_interface.cc index 01bf6ddc607e..6f7b6615b2c0 100644 --- a/contrib/compiler-rt/lib/xray/xray_interface.cc +++ b/contrib/compiler-rt/lib/xray/xray_interface.cc @@ -22,6 +22,13 @@ #include <string.h> #include <sys/mman.h> +#if SANITIZER_FUCHSIA +#include <zircon/process.h> +#include <zircon/sanitizer.h> +#include <zircon/status.h> +#include <zircon/syscalls.h> +#endif + #include "sanitizer_common/sanitizer_addrhashmap.h" #include "sanitizer_common/sanitizer_common.h" @@ -92,22 +99,48 @@ class MProtectHelper { public: explicit MProtectHelper(void *PageAlignedAddr, - std::size_t MProtectLen) XRAY_NEVER_INSTRUMENT + std::size_t MProtectLen, + std::size_t PageSize) XRAY_NEVER_INSTRUMENT : PageAlignedAddr(PageAlignedAddr), MProtectLen(MProtectLen), - MustCleanup(false) {} + MustCleanup(false) { +#if SANITIZER_FUCHSIA + MProtectLen = RoundUpTo(MProtectLen, PageSize); +#endif + } int MakeWriteable() XRAY_NEVER_INSTRUMENT { +#if SANITIZER_FUCHSIA + auto R = __sanitizer_change_code_protection( + reinterpret_cast<uintptr_t>(PageAlignedAddr), MProtectLen, true); + if (R != ZX_OK) { + Report("XRay: cannot change code protection: %s\n", + _zx_status_get_string(R)); + return -1; + } + MustCleanup = true; + return 0; +#else auto R = mprotect(PageAlignedAddr, MProtectLen, PROT_READ | PROT_WRITE | PROT_EXEC); if (R != -1) MustCleanup = true; return R; +#endif } ~MProtectHelper() XRAY_NEVER_INSTRUMENT { if (MustCleanup) { +#if SANITIZER_FUCHSIA + auto R = __sanitizer_change_code_protection( + reinterpret_cast<uintptr_t>(PageAlignedAddr), MProtectLen, false); + if (R != ZX_OK) { + Report("XRay: cannot change code protection: %s\n", + _zx_status_get_string(R)); + } +#else mprotect(PageAlignedAddr, MProtectLen, PROT_READ | PROT_EXEC); +#endif } } }; @@ -254,7 +287,7 @@ XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT { reinterpret_cast<void *>(MinSled.Address & ~(PageSize - 1)); size_t MProtectLen = (MaxSled.Address - reinterpret_cast<uptr>(PageAlignedAddr)) + cSledLength; - MProtectHelper Protector(PageAlignedAddr, MProtectLen); + MProtectHelper Protector(PageAlignedAddr, MProtectLen, PageSize); if (Protector.MakeWriteable() == -1) { Report("Failed mprotect: %d\n", errno); return XRayPatchingStatus::FAILED; @@ -319,7 +352,7 @@ XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, reinterpret_cast<void *>(MinSled.Address & ~(PageSize - 1)); size_t MProtectLen = (MaxSled.Address - reinterpret_cast<uptr>(PageAlignedAddr)) + cSledLength; - MProtectHelper Protector(PageAlignedAddr, MProtectLen); + MProtectHelper Protector(PageAlignedAddr, MProtectLen, PageSize); if (Protector.MakeWriteable() == -1) { Report("Failed mprotect: %d\n", errno); return XRayPatchingStatus::FAILED; diff --git a/contrib/compiler-rt/lib/xray/xray_profile_collector.cc b/contrib/compiler-rt/lib/xray/xray_profile_collector.cc index 17a611eeacb8..dc3a82069840 100644 --- a/contrib/compiler-rt/lib/xray/xray_profile_collector.cc +++ b/contrib/compiler-rt/lib/xray/xray_profile_collector.cc @@ -13,10 +13,11 @@ // //===----------------------------------------------------------------------===// #include "xray_profile_collector.h" -#include "sanitizer_common/sanitizer_allocator_internal.h" #include "sanitizer_common/sanitizer_common.h" -#include 
"sanitizer_common/sanitizer_vector.h" +#include "xray_allocator.h" +#include "xray_defs.h" #include "xray_profiling_flags.h" +#include "xray_segmented_array.h" #include <memory> #include <pthread.h> #include <utility> @@ -29,7 +30,7 @@ namespace { SpinMutex GlobalMutex; struct ThreadTrie { tid_t TId; - FunctionCallTrie *Trie; + typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage; }; struct ProfileBuffer { @@ -56,65 +57,91 @@ struct BlockHeader { u64 ThreadId; }; -// These need to be pointers that point to heap/internal-allocator-allocated -// objects because these are accessed even at program exit. -Vector<ThreadTrie> *ThreadTries = nullptr; -Vector<ProfileBuffer> *ProfileBuffers = nullptr; -FunctionCallTrie::Allocators *GlobalAllocators = nullptr; +struct ThreadData { + BufferQueue *BQ; + FunctionCallTrie::Allocators::Buffers Buffers; + FunctionCallTrie::Allocators Allocators; + FunctionCallTrie FCT; + tid_t TId; +}; + +using ThreadDataArray = Array<ThreadData>; +using ThreadDataAllocator = ThreadDataArray::AllocatorType; + +// We use a separate buffer queue for the backing store for the allocator used +// by the ThreadData array. This lets us host the buffers, allocators, and tries +// associated with a thread by moving the data into the array instead of +// attempting to copy the data to a separately backed set of tries. +static typename std::aligned_storage< + sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage; +static BufferQueue *BQ = nullptr; +static BufferQueue::Buffer Buffer; +static typename std::aligned_storage<sizeof(ThreadDataAllocator), + alignof(ThreadDataAllocator)>::type + ThreadDataAllocatorStorage; +static typename std::aligned_storage<sizeof(ThreadDataArray), + alignof(ThreadDataArray)>::type + ThreadDataArrayStorage; + +static ThreadDataAllocator *TDAllocator = nullptr; +static ThreadDataArray *TDArray = nullptr; + +using ProfileBufferArray = Array<ProfileBuffer>; +using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType; + +// These need to be global aligned storage to avoid dynamic initialization. We +// need these to be aligned to allow us to placement new objects into the +// storage, and have pointers to those objects be appropriately aligned. +static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type + ProfileBuffersStorage; +static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type + ProfileBufferArrayAllocatorStorage; + +static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr; +static ProfileBufferArray *ProfileBuffers = nullptr; + +// Use a global flag to determine whether the collector implementation has been +// initialized. 
+static atomic_uint8_t CollectorInitialized{0}; } // namespace -void post(const FunctionCallTrie &T, tid_t TId) { - static pthread_once_t Once = PTHREAD_ONCE_INIT; - pthread_once(&Once, +[] { - SpinMutexLock Lock(&GlobalMutex); - GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>( - InternalAlloc(sizeof(FunctionCallTrie::Allocators))); - new (GlobalAllocators) FunctionCallTrie::Allocators(); - *GlobalAllocators = FunctionCallTrie::InitAllocatorsCustom( - profilingFlags()->global_allocator_max); - ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>( - InternalAlloc(sizeof(Vector<ThreadTrie>))); - new (ThreadTries) Vector<ThreadTrie>(); - ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>( - InternalAlloc(sizeof(Vector<ProfileBuffer>))); - new (ProfileBuffers) Vector<ProfileBuffer>(); - }); - DCHECK_NE(GlobalAllocators, nullptr); - DCHECK_NE(ThreadTries, nullptr); - DCHECK_NE(ProfileBuffers, nullptr); - - ThreadTrie *Item = nullptr; +void post(BufferQueue *Q, FunctionCallTrie &&T, + FunctionCallTrie::Allocators &&A, + FunctionCallTrie::Allocators::Buffers &&B, + tid_t TId) XRAY_NEVER_INSTRUMENT { + DCHECK_NE(Q, nullptr); + + // Bail out early if the collector has not been initialized. + if (!atomic_load(&CollectorInitialized, memory_order_acquire)) { + T.~FunctionCallTrie(); + A.~Allocators(); + Q->releaseBuffer(B.NodeBuffer); + Q->releaseBuffer(B.RootsBuffer); + Q->releaseBuffer(B.ShadowStackBuffer); + Q->releaseBuffer(B.NodeIdPairBuffer); + B.~Buffers(); + return; + } + { SpinMutexLock Lock(&GlobalMutex); - if (GlobalAllocators == nullptr) - return; - - Item = ThreadTries->PushBack(); - Item->TId = TId; - - // Here we're using the internal allocator instead of the managed allocator - // because: - // - // 1) We're not using the segmented array data structure to host - // FunctionCallTrie objects. We're using a Vector (from sanitizer_common) - // which works like a std::vector<...> keeping elements contiguous in - // memory. The segmented array data structure assumes that elements are - // trivially destructible, where FunctionCallTrie isn't. - // - // 2) Using a managed allocator means we need to manage that separately, - // which complicates the nature of this code. To get around that, we're - // using the internal allocator instead, which has its own global state - // and is decoupled from the lifetime management required by the managed - // allocator we have in XRay. - // - Item->Trie = reinterpret_cast<FunctionCallTrie *>(InternalAlloc( - sizeof(FunctionCallTrie), nullptr, alignof(FunctionCallTrie))); - DCHECK_NE(Item->Trie, nullptr); - new (Item->Trie) FunctionCallTrie(*GlobalAllocators); + DCHECK_NE(TDAllocator, nullptr); + DCHECK_NE(TDArray, nullptr); + + if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T), + TId) == nullptr) { + // If we fail to add the data to the array, we should destroy the objects + // handed us. + T.~FunctionCallTrie(); + A.~Allocators(); + Q->releaseBuffer(B.NodeBuffer); + Q->releaseBuffer(B.RootsBuffer); + Q->releaseBuffer(B.ShadowStackBuffer); + Q->releaseBuffer(B.NodeIdPairBuffer); + B.~Buffers(); + } } - - T.deepCopyInto(*Item->Trie); } // A PathArray represents the function id's representing a stack trace. In this @@ -127,18 +154,8 @@ struct ProfileRecord { // The Path in this record is the function id's from the leaf to the root of // the function call stack as represented from a FunctionCallTrie. 
- PathArray *Path = nullptr; - const FunctionCallTrie::Node *Node = nullptr; - - // Constructor for in-place construction. - ProfileRecord(PathAllocator &A, const FunctionCallTrie::Node *N) - : Path([&] { - auto P = - reinterpret_cast<PathArray *>(InternalAlloc(sizeof(PathArray))); - new (P) PathArray(A); - return P; - }()), - Node(N) {} + PathArray Path; + const FunctionCallTrie::Node *Node; }; namespace { @@ -147,19 +164,21 @@ using ProfileRecordArray = Array<ProfileRecord>; // Walk a depth-first traversal of each root of the FunctionCallTrie to generate // the path(s) and the data associated with the path. -static void populateRecords(ProfileRecordArray &PRs, - ProfileRecord::PathAllocator &PA, - const FunctionCallTrie &Trie) { +static void +populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA, + const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT { using StackArray = Array<const FunctionCallTrie::Node *>; using StackAllocator = typename StackArray::AllocatorType; StackAllocator StackAlloc(profilingFlags()->stack_allocator_max); StackArray DFSStack(StackAlloc); - for (const auto R : Trie.getRoots()) { + for (const auto *R : Trie.getRoots()) { DFSStack.Append(R); while (!DFSStack.empty()) { - auto Node = DFSStack.back(); + auto *Node = DFSStack.back(); DFSStack.trim(1); - auto Record = PRs.AppendEmplace(PA, Node); + if (Node == nullptr) + continue; + auto Record = PRs.AppendEmplace(PathArray{PA}, Node); if (Record == nullptr) return; DCHECK_NE(Record, nullptr); @@ -167,8 +186,8 @@ static void populateRecords(ProfileRecordArray &PRs, // Traverse the Node's parents and as we're doing so, get the FIds in // the order they appear. for (auto N = Node; N != nullptr; N = N->Parent) - Record->Path->Append(N->FId); - DCHECK(!Record->Path->empty()); + Record->Path.Append(N->FId); + DCHECK(!Record->Path.empty()); for (const auto C : Node->Callees) DFSStack.Append(C.NodePtr); @@ -177,67 +196,89 @@ static void populateRecords(ProfileRecordArray &PRs, } static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header, - const ProfileRecordArray &ProfileRecords) { - auto NextPtr = static_cast<char *>( + const ProfileRecordArray &ProfileRecords) + XRAY_NEVER_INSTRUMENT { + auto NextPtr = static_cast<uint8_t *>( internal_memcpy(Buffer->Data, &Header, sizeof(Header))) + sizeof(Header); for (const auto &Record : ProfileRecords) { // List of IDs follow: - for (const auto FId : *Record.Path) + for (const auto FId : Record.Path) NextPtr = - static_cast<char *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) + + static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) + sizeof(FId); // Add the sentinel here. constexpr int32_t SentinelFId = 0; - NextPtr = static_cast<char *>( + NextPtr = static_cast<uint8_t *>( internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) + sizeof(SentinelFId); // Add the node data here. 
NextPtr = - static_cast<char *>(internal_memcpy(NextPtr, &Record.Node->CallCount, - sizeof(Record.Node->CallCount))) + + static_cast<uint8_t *>(internal_memcpy( + NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) + sizeof(Record.Node->CallCount); - NextPtr = static_cast<char *>( + NextPtr = static_cast<uint8_t *>( internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime, sizeof(Record.Node->CumulativeLocalTime))) + sizeof(Record.Node->CumulativeLocalTime); } - DCHECK_EQ(NextPtr - static_cast<char *>(Buffer->Data), Buffer->Size); + DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size); } } // namespace -void serialize() { +void serialize() XRAY_NEVER_INSTRUMENT { + if (!atomic_load(&CollectorInitialized, memory_order_acquire)) + return; + SpinMutexLock Lock(&GlobalMutex); - // Clear out the global ProfileBuffers. - for (uptr I = 0; I < ProfileBuffers->Size(); ++I) - InternalFree((*ProfileBuffers)[I].Data); - ProfileBuffers->Reset(); + // Clear out the global ProfileBuffers, if it's not empty. + for (auto &B : *ProfileBuffers) + deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size); + ProfileBuffers->trim(ProfileBuffers->size()); - if (ThreadTries->Size() == 0) + DCHECK_NE(TDArray, nullptr); + if (TDArray->empty()) return; // Then repopulate the global ProfileBuffers. - for (u32 I = 0; I < ThreadTries->Size(); ++I) { + u32 I = 0; + auto MaxSize = profilingFlags()->global_allocator_max; + auto ProfileArena = allocateBuffer(MaxSize); + if (ProfileArena == nullptr) + return; + + auto ProfileArenaCleanup = at_scope_exit( + [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); }); + + auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max); + if (PathArena == nullptr) + return; + + auto PathArenaCleanup = at_scope_exit( + [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); }); + + for (const auto &ThreadTrie : *TDArray) { using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType; - ProfileRecordAllocator PRAlloc(profilingFlags()->global_allocator_max); + ProfileRecordAllocator PRAlloc(ProfileArena, + profilingFlags()->global_allocator_max); ProfileRecord::PathAllocator PathAlloc( - profilingFlags()->global_allocator_max); + PathArena, profilingFlags()->global_allocator_max); ProfileRecordArray ProfileRecords(PRAlloc); // First, we want to compute the amount of space we're going to need. We'll // use a local allocator and an __xray::Array<...> to store the intermediary // data, then compute the size as we're going along. Then we'll allocate the // contiguous space to contain the thread buffer data. - const auto &Trie = *(*ThreadTries)[I].Trie; - if (Trie.getRoots().empty()) + if (ThreadTrie.FCT.getRoots().empty()) continue; - populateRecords(ProfileRecords, PathAlloc, Trie); - DCHECK(!Trie.getRoots().empty()); + + populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT); + DCHECK(!ThreadTrie.FCT.getRoots().empty()); DCHECK(!ProfileRecords.empty()); // Go through each record, to compute the sizes. 
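serializeRecords above fixes the on-wire shape of a single record: one 4-byte function id per path element, a 4-byte zero sentinel, then CallCount and CumulativeLocalTime (8 bytes each, per the 20 + 4 * path-length accounting that follows), with one 16-byte BlockHeader per block. A small sanity check of the per-record size:

    #include <cstdint>

    // Serialized size of one profile record, following the layout that
    // serializeRecords emits: 4 bytes per function id on the path, a 4-byte
    // sentinel, then two 8-byte counters.
    constexpr uint32_t recordBytes(uint32_t PathLen) {
      return 4 * PathLen + 4 + 8 + 8; // == 20 + 4 * PathLen
    }

    static_assert(recordBytes(1) == 24, "a root-only path");
    static_assert(recordBytes(3) == 32, "a three-deep call path");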
@@ -251,75 +292,103 @@ void serialize() { // + end of record (8 bytes) u32 CumulativeSizes = 0; for (const auto &Record : ProfileRecords) - CumulativeSizes += 20 + (4 * Record.Path->size()); - - BlockHeader Header{16 + CumulativeSizes, I, (*ThreadTries)[I].TId}; - auto Buffer = ProfileBuffers->PushBack(); - Buffer->Size = sizeof(Header) + CumulativeSizes; - Buffer->Data = InternalAlloc(Buffer->Size, nullptr, 64); - DCHECK_NE(Buffer->Data, nullptr); - serializeRecords(Buffer, Header, ProfileRecords); - - // Now clean up the ProfileRecords array, one at a time. - for (auto &Record : ProfileRecords) { - Record.Path->~PathArray(); - InternalFree(Record.Path); - } + CumulativeSizes += 20 + (4 * Record.Path.size()); + + BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId}; + auto B = ProfileBuffers->Append({}); + B->Size = sizeof(Header) + CumulativeSizes; + B->Data = allocateBuffer(B->Size); + DCHECK_NE(B->Data, nullptr); + serializeRecords(B, Header, ProfileRecords); } } -void reset() { +void reset() XRAY_NEVER_INSTRUMENT { + atomic_store(&CollectorInitialized, 0, memory_order_release); SpinMutexLock Lock(&GlobalMutex); + if (ProfileBuffers != nullptr) { // Clear out the profile buffers that have been serialized. - for (uptr I = 0; I < ProfileBuffers->Size(); ++I) - InternalFree((*ProfileBuffers)[I].Data); - ProfileBuffers->Reset(); - InternalFree(ProfileBuffers); + for (auto &B : *ProfileBuffers) + deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size); + ProfileBuffers->trim(ProfileBuffers->size()); ProfileBuffers = nullptr; } - if (ThreadTries != nullptr) { - // Clear out the function call tries per thread. - for (uptr I = 0; I < ThreadTries->Size(); ++I) { - auto &T = (*ThreadTries)[I]; - T.Trie->~FunctionCallTrie(); - InternalFree(T.Trie); + if (TDArray != nullptr) { + // Release the resources as required. + for (auto &TD : *TDArray) { + TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer); + TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer); + TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer); + TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer); } - ThreadTries->Reset(); - InternalFree(ThreadTries); - ThreadTries = nullptr; + // We don't bother destroying the array here because we've already + // potentially freed the backing store for the array. Instead we're going to + // reset the pointer to nullptr, and re-use the storage later instead + // (placement-new'ing into the storage as-is). + TDArray = nullptr; + } + + if (TDAllocator != nullptr) { + TDAllocator->~Allocator(); + TDAllocator = nullptr; + } + + if (Buffer.Data != nullptr) { + BQ->releaseBuffer(Buffer); } - // Reset the global allocators. 
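The new reset() continues below by re-creating every piece of collector state in place: each object lives in a static aligned_storage block, gets placement-new'ed during reset, and CollectorInitialized is only flipped back to 1 once everything is ready. A minimal sketch of that idiom, using std::atomic in place of the sanitizer atomics and hypothetical names (Registry, RegistryStorage, resetRegistry are illustrative only):

    #include <atomic>
    #include <cstdint>
    #include <new>
    #include <type_traits>

    struct Registry {
      int Count = 0;
    };

    // Static storage is zero-initialised at load time, so no dynamic
    // initialiser has to run; the object is built (and rebuilt) lazily.
    static typename std::aligned_storage<sizeof(Registry),
                                         alignof(Registry)>::type RegistryStorage;
    static Registry *RegistryPtr = nullptr;
    static std::atomic<uint8_t> RegistryReady{0};

    static void resetRegistry() {
      // Unpublish first so readers back off while we tear down and rebuild.
      RegistryReady.store(0, std::memory_order_release);
      if (RegistryPtr != nullptr)
        RegistryPtr->~Registry();
      RegistryPtr = new (&RegistryStorage) Registry();
      // Publish only after the object is fully constructed.
      RegistryReady.store(1, std::memory_order_release);
    }

    static Registry *getRegistry() {
      if (!RegistryReady.load(std::memory_order_acquire))
        return nullptr;
      return RegistryPtr;
    }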
- if (GlobalAllocators != nullptr) { - GlobalAllocators->~Allocators(); - InternalFree(GlobalAllocators); - GlobalAllocators = nullptr; + if (BQ == nullptr) { + bool Success = false; + new (&BufferQueueStorage) + BufferQueue(profilingFlags()->global_allocator_max, 1, Success); + if (!Success) + return; + BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage); + } else { + BQ->finalize(); + + if (BQ->init(profilingFlags()->global_allocator_max, 1) != + BufferQueue::ErrorCode::Ok) + return; } - GlobalAllocators = reinterpret_cast<FunctionCallTrie::Allocators *>( - InternalAlloc(sizeof(FunctionCallTrie::Allocators))); - new (GlobalAllocators) FunctionCallTrie::Allocators(); - *GlobalAllocators = FunctionCallTrie::InitAllocators(); - ThreadTries = reinterpret_cast<Vector<ThreadTrie> *>( - InternalAlloc(sizeof(Vector<ThreadTrie>))); - new (ThreadTries) Vector<ThreadTrie>(); - ProfileBuffers = reinterpret_cast<Vector<ProfileBuffer> *>( - InternalAlloc(sizeof(Vector<ProfileBuffer>))); - new (ProfileBuffers) Vector<ProfileBuffer>(); + + if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok) + return; + + new (&ProfileBufferArrayAllocatorStorage) + ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max); + ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>( + &ProfileBufferArrayAllocatorStorage); + + new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator); + ProfileBuffers = + reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage); + + new (&ThreadDataAllocatorStorage) + ThreadDataAllocator(Buffer.Data, Buffer.Size); + TDAllocator = + reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage); + new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator); + TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage); + + atomic_store(&CollectorInitialized, 1, memory_order_release); } -XRayBuffer nextBuffer(XRayBuffer B) { +XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT { SpinMutexLock Lock(&GlobalMutex); - if (ProfileBuffers == nullptr || ProfileBuffers->Size() == 0) + if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0) return {nullptr, 0}; static pthread_once_t Once = PTHREAD_ONCE_INIT; static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type FileHeaderStorage; - pthread_once(&Once, - +[] { new (&FileHeaderStorage) XRayProfilingFileHeader{}; }); + pthread_once( + &Once, +[]() XRAY_NEVER_INSTRUMENT { + new (&FileHeaderStorage) XRayProfilingFileHeader{}; + }); if (UNLIKELY(B.Data == nullptr)) { // The first buffer should always contain the file header information. @@ -336,7 +405,7 @@ XRayBuffer nextBuffer(XRayBuffer B) { BlockHeader Header; internal_memcpy(&Header, B.Data, sizeof(BlockHeader)); auto NextBlock = Header.BlockNum + 1; - if (NextBlock < ProfileBuffers->Size()) + if (NextBlock < ProfileBuffers->size()) return {(*ProfileBuffers)[NextBlock].Data, (*ProfileBuffers)[NextBlock].Size}; return {nullptr, 0}; diff --git a/contrib/compiler-rt/lib/xray/xray_profile_collector.h b/contrib/compiler-rt/lib/xray/xray_profile_collector.h index 335043db9526..86c4ce853797 100644 --- a/contrib/compiler-rt/lib/xray/xray_profile_collector.h +++ b/contrib/compiler-rt/lib/xray/xray_profile_collector.h @@ -33,27 +33,13 @@ namespace profileCollectorService { /// Posts the FunctionCallTrie associated with a specific Thread ID. This /// will: /// -/// - Make a copy of the FunctionCallTrie and store that against the Thread -/// ID. 
This will use the global allocator for the service-managed -/// FunctionCallTrie instances. -/// - Queue up a pointer to the FunctionCallTrie. -/// - If the queue is long enough (longer than some arbitrary threshold) we -/// then pre-calculate a single FunctionCallTrie for the whole process. +/// Moves the collection of FunctionCallTrie, Allocators, and Buffers associated +/// with a thread's data to the queue. This takes ownership of the memory +/// associated with a thread, and manages those exclusively. /// -/// -/// We are making a copy of the FunctionCallTrie because the intent is to have -/// this function be called at thread exit, or soon after the profiling -/// handler is finalized through the XRay APIs. By letting threads each -/// process their own thread-local FunctionCallTrie instances, we're removing -/// the need for synchronisation across threads while we're profiling. -/// However, once we're done profiling, we can then collect copies of these -/// FunctionCallTrie instances and pay the cost of the copy. -/// -/// NOTE: In the future, if this turns out to be more costly than "moving" the -/// FunctionCallTrie instances from the owning thread to the collector -/// service, then we can change the implementation to do it this way (moving) -/// instead. -void post(const FunctionCallTrie &T, tid_t TId); +void post(BufferQueue *Q, FunctionCallTrie &&T, + FunctionCallTrie::Allocators &&A, + FunctionCallTrie::Allocators::Buffers &&B, tid_t TId); /// The serialize will process all FunctionCallTrie instances in memory, and /// turn those into specifically formatted blocks, each describing the diff --git a/contrib/compiler-rt/lib/xray/xray_profiling.cc b/contrib/compiler-rt/lib/xray/xray_profiling.cc index d4b4345d764a..4323170cd1bb 100644 --- a/contrib/compiler-rt/lib/xray/xray_profiling.cc +++ b/contrib/compiler-rt/lib/xray/xray_profiling.cc @@ -19,7 +19,7 @@ #include "sanitizer_common/sanitizer_flags.h" #include "xray/xray_interface.h" #include "xray/xray_log_interface.h" - +#include "xray_buffer_queue.h" #include "xray_flags.h" #include "xray_profile_collector.h" #include "xray_profiling_flags.h" @@ -32,62 +32,167 @@ namespace __xray { namespace { -atomic_sint32_t ProfilerLogFlushStatus = { +static atomic_sint32_t ProfilerLogFlushStatus = { XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; -atomic_sint32_t ProfilerLogStatus = {XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; +static atomic_sint32_t ProfilerLogStatus = { + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; -SpinMutex ProfilerOptionsMutex; +static SpinMutex ProfilerOptionsMutex; -struct alignas(64) ProfilingData { - FunctionCallTrie::Allocators *Allocators = nullptr; - FunctionCallTrie *FCT = nullptr; +struct ProfilingData { + atomic_uintptr_t Allocators; + atomic_uintptr_t FCT; }; static pthread_key_t ProfilingKey; -thread_local std::aligned_storage<sizeof(ProfilingData)>::type ThreadStorage{}; -static ProfilingData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { - thread_local auto ThreadOnce = [] { - new (&ThreadStorage) ProfilingData{}; - pthread_setspecific(ProfilingKey, &ThreadStorage); +// We use a global buffer queue, which gets initialized once at initialisation +// time, and gets reset when profiling is "done". 
+static std::aligned_storage<sizeof(BufferQueue), alignof(BufferQueue)>::type + BufferQueueStorage; +static BufferQueue *BQ = nullptr; + +thread_local FunctionCallTrie::Allocators::Buffers ThreadBuffers; +thread_local std::aligned_storage<sizeof(FunctionCallTrie::Allocators), + alignof(FunctionCallTrie::Allocators)>::type + AllocatorsStorage; +thread_local std::aligned_storage<sizeof(FunctionCallTrie), + alignof(FunctionCallTrie)>::type + FunctionCallTrieStorage; +thread_local ProfilingData TLD{{0}, {0}}; +thread_local atomic_uint8_t ReentranceGuard{0}; + +// We use a separate guard for ensuring that for this thread, if we're already +// cleaning up, that any signal handlers don't attempt to cleanup nor +// initialise. +thread_local atomic_uint8_t TLDInitGuard{0}; + +// We also use a separate latch to signal that the thread is exiting, and +// non-essential work should be ignored (things like recording events, etc.). +thread_local atomic_uint8_t ThreadExitingLatch{0}; + +static ProfilingData *getThreadLocalData() XRAY_NEVER_INSTRUMENT { + thread_local auto ThreadOnce = []() XRAY_NEVER_INSTRUMENT { + pthread_setspecific(ProfilingKey, &TLD); return false; }(); (void)ThreadOnce; - auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage); + RecursionGuard TLDInit(TLDInitGuard); + if (!TLDInit) + return nullptr; - // We need to check whether the global flag to finalizing/finalized has been - // switched. If it is, then we ought to not actually initialise the data. - auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire); - if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || - Status == XRayLogInitStatus::XRAY_LOG_FINALIZED) - return TLD; - - // If we're live, then we re-initialize TLD if the pointers are not null. - if (UNLIKELY(TLD.Allocators == nullptr && TLD.FCT == nullptr)) { - TLD.Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>( - InternalAlloc(sizeof(FunctionCallTrie::Allocators))); - new (TLD.Allocators) FunctionCallTrie::Allocators(); - *TLD.Allocators = FunctionCallTrie::InitAllocators(); - TLD.FCT = reinterpret_cast<FunctionCallTrie *>( - InternalAlloc(sizeof(FunctionCallTrie))); - new (TLD.FCT) FunctionCallTrie(*TLD.Allocators); + if (atomic_load_relaxed(&ThreadExitingLatch)) + return nullptr; + + uptr Allocators = 0; + if (atomic_compare_exchange_strong(&TLD.Allocators, &Allocators, 1, + memory_order_acq_rel)) { + bool Success = false; + auto AllocatorsUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { + if (!Success) + atomic_store(&TLD.Allocators, 0, memory_order_release); + }); + + // Acquire a set of buffers for this thread. 
+ if (BQ == nullptr) + return nullptr; + + if (BQ->getBuffer(ThreadBuffers.NodeBuffer) != BufferQueue::ErrorCode::Ok) + return nullptr; + auto NodeBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { + if (!Success) + BQ->releaseBuffer(ThreadBuffers.NodeBuffer); + }); + + if (BQ->getBuffer(ThreadBuffers.RootsBuffer) != BufferQueue::ErrorCode::Ok) + return nullptr; + auto RootsBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { + if (!Success) + BQ->releaseBuffer(ThreadBuffers.RootsBuffer); + }); + + if (BQ->getBuffer(ThreadBuffers.ShadowStackBuffer) != + BufferQueue::ErrorCode::Ok) + return nullptr; + auto ShadowStackBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { + if (!Success) + BQ->releaseBuffer(ThreadBuffers.ShadowStackBuffer); + }); + + if (BQ->getBuffer(ThreadBuffers.NodeIdPairBuffer) != + BufferQueue::ErrorCode::Ok) + return nullptr; + + Success = true; + new (&AllocatorsStorage) FunctionCallTrie::Allocators( + FunctionCallTrie::InitAllocatorsFromBuffers(ThreadBuffers)); + Allocators = reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage)); + atomic_store(&TLD.Allocators, Allocators, memory_order_release); + } + + if (Allocators == 1) + return nullptr; + + uptr FCT = 0; + if (atomic_compare_exchange_strong(&TLD.FCT, &FCT, 1, memory_order_acq_rel)) { + new (&FunctionCallTrieStorage) + FunctionCallTrie(*reinterpret_cast<FunctionCallTrie::Allocators *>( + atomic_load_relaxed(&TLD.Allocators))); + FCT = reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage)); + atomic_store(&TLD.FCT, FCT, memory_order_release); } - return TLD; + if (FCT == 1) + return nullptr; + + return &TLD; } static void cleanupTLD() XRAY_NEVER_INSTRUMENT { - auto &TLD = *reinterpret_cast<ProfilingData *>(&ThreadStorage); - if (TLD.Allocators != nullptr && TLD.FCT != nullptr) { - TLD.FCT->~FunctionCallTrie(); - TLD.Allocators->~Allocators(); - InternalFree(TLD.FCT); - InternalFree(TLD.Allocators); - TLD.FCT = nullptr; - TLD.Allocators = nullptr; - } + auto FCT = atomic_exchange(&TLD.FCT, 0, memory_order_acq_rel); + if (FCT == reinterpret_cast<uptr>(reinterpret_cast<FunctionCallTrie *>( + &FunctionCallTrieStorage))) + reinterpret_cast<FunctionCallTrie *>(FCT)->~FunctionCallTrie(); + + auto Allocators = atomic_exchange(&TLD.Allocators, 0, memory_order_acq_rel); + if (Allocators == + reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage))) + reinterpret_cast<FunctionCallTrie::Allocators *>(Allocators)->~Allocators(); +} + +static void postCurrentThreadFCT(ProfilingData &T) XRAY_NEVER_INSTRUMENT { + RecursionGuard TLDInit(TLDInitGuard); + if (!TLDInit) + return; + + uptr P = atomic_exchange(&T.FCT, 0, memory_order_acq_rel); + if (P != reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage))) + return; + + auto FCT = reinterpret_cast<FunctionCallTrie *>(P); + DCHECK_NE(FCT, nullptr); + + uptr A = atomic_exchange(&T.Allocators, 0, memory_order_acq_rel); + if (A != + reinterpret_cast<uptr>( + reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage))) + return; + + auto Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>(A); + DCHECK_NE(Allocators, nullptr); + + // Always move the data into the profile collector. + profileCollectorService::post(BQ, std::move(*FCT), std::move(*Allocators), + std::move(ThreadBuffers), GetTid()); + + // Re-initialize the ThreadBuffers object to a known "default" state. 
+ ThreadBuffers = FunctionCallTrie::Allocators::Buffers{}; } } // namespace @@ -100,9 +205,6 @@ const char *profilingCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT { #endif } -atomic_sint32_t ProfileFlushStatus = { - XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; - XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { if (atomic_load(&ProfilerLogStatus, memory_order_acquire) != XRayLogInitStatus::XRAY_LOG_FINALIZED) { @@ -111,12 +213,23 @@ XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; } - s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; - if (!atomic_compare_exchange_strong(&ProfilerLogFlushStatus, &Result, - XRayLogFlushStatus::XRAY_LOG_FLUSHING, - memory_order_acq_rel)) { + RecursionGuard SignalGuard(ReentranceGuard); + if (!SignalGuard) { + if (Verbosity()) + Report("Cannot finalize properly inside a signal handler!\n"); + atomic_store(&ProfilerLogFlushStatus, + XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING, + memory_order_release); + return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; + } + + s32 Previous = atomic_exchange(&ProfilerLogFlushStatus, + XRayLogFlushStatus::XRAY_LOG_FLUSHING, + memory_order_acq_rel); + if (Previous == XRayLogFlushStatus::XRAY_LOG_FLUSHING) { if (Verbosity()) - Report("Not flushing profiles, implementation still finalizing.\n"); + Report("Not flushing profiles, implementation still flushing.\n"); + return XRayLogFlushStatus::XRAY_LOG_FLUSHING; } // At this point, we'll create the file that will contain the profile, but @@ -129,49 +242,33 @@ XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT { if (Verbosity()) Report("profiling: No data to flush.\n"); } else { - int Fd = getLogFD(); - if (Fd == -1) { + LogWriter *LW = LogWriter::Open(); + if (LW == nullptr) { if (Verbosity()) Report("profiling: Failed to flush to file, dropping data.\n"); } else { // Now for each of the buffers, write out the profile data as we would // see it in memory, verbatim. while (B.Data != nullptr && B.Size != 0) { - retryingWriteAll(Fd, reinterpret_cast<const char *>(B.Data), - reinterpret_cast<const char *>(B.Data) + B.Size); + LW->WriteAll(reinterpret_cast<const char *>(B.Data), + reinterpret_cast<const char *>(B.Data) + B.Size); B = profileCollectorService::nextBuffer(B); } - // Then we close out the file. - internal_close(Fd); } + LogWriter::Close(LW); } } profileCollectorService::reset(); - // Flush the current thread's local data structures as well. 
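Several of the paths in this file guard themselves with RecursionGuard over a thread_local atomic_uint8_t (ReentranceGuard, TLDInitGuard) so that a signal handler firing mid-operation bails out instead of re-entering. The guard class itself is defined elsewhere in the XRay sources; an equivalent, self-contained sketch (using std::atomic rather than the sanitizer atomics) would look roughly like this:

    #include <atomic>
    #include <cstdint>

    // Scoped reentrancy guard: the first construction on a given flag "wins";
    // nested constructions on the same flag convert to false so callers can
    // bail out instead of recursing (e.g. from a signal handler).
    class RecursionGuardSketch {
      std::atomic<uint8_t> &Flag;
      const bool Acquired;

    public:
      explicit RecursionGuardSketch(std::atomic<uint8_t> &F)
          : Flag(F), Acquired(F.exchange(1, std::memory_order_acq_rel) == 0) {}

      explicit operator bool() const { return Acquired; }

      ~RecursionGuardSketch() {
        if (Acquired)
          Flag.store(0, std::memory_order_release);
      }

      RecursionGuardSketch(const RecursionGuardSketch &) = delete;
      RecursionGuardSketch &operator=(const RecursionGuardSketch &) = delete;
    };

    // Usage mirrors the call sites above:
    //   thread_local std::atomic<uint8_t> Guard{0};
    //   RecursionGuardSketch G(Guard);
    //   if (!G)
    //     return;  // already inside this code path on this thread

Only the outermost guard that actually flipped the flag resets it on destruction, so nested scopes on the same thread simply observe a false guard and return early, which mirrors how profilingFlush and postCurrentThreadFCT above use it.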
- cleanupTLD(); - - atomic_store(&ProfilerLogStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, + atomic_store(&ProfilerLogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, + memory_order_release); + atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, memory_order_release); return XRayLogFlushStatus::XRAY_LOG_FLUSHED; } -namespace { - -thread_local atomic_uint8_t ReentranceGuard{0}; - -static void postCurrentThreadFCT(ProfilingData &TLD) { - if (TLD.Allocators == nullptr || TLD.FCT == nullptr) - return; - - profileCollectorService::post(*TLD.FCT, GetTid()); - cleanupTLD(); -} - -} // namespace - void profilingHandleArg0(int32_t FuncId, XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { unsigned char CPU; @@ -181,21 +278,29 @@ void profilingHandleArg0(int32_t FuncId, return; auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire); - auto &TLD = getThreadLocalData(); + if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_UNINITIALIZED || + Status == XRayLogInitStatus::XRAY_LOG_INITIALIZING)) + return; + if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED || Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) { postCurrentThreadFCT(TLD); return; } + auto T = getThreadLocalData(); + if (T == nullptr) + return; + + auto FCT = reinterpret_cast<FunctionCallTrie *>(atomic_load_relaxed(&T->FCT)); switch (Entry) { case XRayEntryType::ENTRY: case XRayEntryType::LOG_ARGS_ENTRY: - TLD.FCT->enterFunction(FuncId, TSC); + FCT->enterFunction(FuncId, TSC, CPU); break; case XRayEntryType::EXIT: case XRayEntryType::TAIL: - TLD.FCT->exitFunction(FuncId, TSC); + FCT->exitFunction(FuncId, TSC, CPU); break; default: // FIXME: Handle bugs. @@ -218,12 +323,22 @@ XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT { return static_cast<XRayLogInitStatus>(CurrentStatus); } + // Mark then finalize the current generation of buffers. This allows us to let + // the threads currently holding onto new buffers still use them, but let the + // last reference do the memory cleanup. + DCHECK_NE(BQ, nullptr); + BQ->finalize(); + // Wait a grace period to allow threads to see that we're finalizing. SleepForMillis(profilingFlags()->grace_period_ms); - // We also want to make sure that the current thread's data is cleaned up, - // if we have any. - auto &TLD = getThreadLocalData(); + // If we for some reason are entering this function from an instrumented + // handler, we bail out. + RecursionGuard G(ReentranceGuard); + if (!G) + return static_cast<XRayLogInitStatus>(CurrentStatus); + + // Post the current thread's data if we have any. postCurrentThreadFCT(TLD); // Then we force serialize the log data. @@ -235,19 +350,16 @@ XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT { } XRayLogInitStatus -profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options, +profilingLoggingInit(size_t, size_t, void *Options, size_t OptionsSize) XRAY_NEVER_INSTRUMENT { - if (BufferSize != 0 || BufferMax != 0) { - if (Verbosity()) - Report("__xray_log_init() being used, and is unsupported. Use " - "__xray_log_init_mode(...) instead. 
Bailing out."); + RecursionGuard G(ReentranceGuard); + if (!G) return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; - } s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZING, - memory_order_release)) { + memory_order_acq_rel)) { if (Verbosity()) Report("Cannot initialize already initialised profiling " "implementation.\n"); @@ -276,35 +388,88 @@ profilingLoggingInit(size_t BufferSize, size_t BufferMax, void *Options, // We need to reset the profile data collection implementation now. profileCollectorService::reset(); - // We need to set up the exit handlers. - static pthread_once_t Once = PTHREAD_ONCE_INIT; - pthread_once(&Once, +[] { - pthread_key_create(&ProfilingKey, +[](void *P) { - // This is the thread-exit handler. - auto &TLD = *reinterpret_cast<ProfilingData *>(P); - if (TLD.Allocators == nullptr && TLD.FCT == nullptr) - return; - - postCurrentThreadFCT(TLD); - }); + // Then also reset the buffer queue implementation. + if (BQ == nullptr) { + bool Success = false; + new (&BufferQueueStorage) + BufferQueue(profilingFlags()->per_thread_allocator_max, + profilingFlags()->buffers_max, Success); + if (!Success) { + if (Verbosity()) + Report("Failed to initialize preallocated memory buffers!"); + atomic_store(&ProfilerLogStatus, + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, + memory_order_release); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + } - // We also need to set up an exit handler, so that we can get the profile - // information at exit time. We use the C API to do this, to not rely on C++ - // ABI functions for registering exit handlers. - Atexit(+[] { - // Finalize and flush. - if (profilingFinalize() != XRAY_LOG_FINALIZED) { - cleanupTLD(); - return; - } - if (profilingFlush() != XRAY_LOG_FLUSHED) { - cleanupTLD(); - return; - } + // If we've succeded, set the global pointer to the initialised storage. + BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage); + } else { + BQ->finalize(); + auto InitStatus = BQ->init(profilingFlags()->per_thread_allocator_max, + profilingFlags()->buffers_max); + + if (InitStatus != BufferQueue::ErrorCode::Ok) { if (Verbosity()) - Report("XRay Profile flushed at exit."); - }); - }); + Report("Failed to initialize preallocated memory buffers; error: %s", + BufferQueue::getErrorString(InitStatus)); + atomic_store(&ProfilerLogStatus, + XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, + memory_order_release); + return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + } + + DCHECK(!BQ->finalizing()); + } + + // We need to set up the exit handlers. + static pthread_once_t Once = PTHREAD_ONCE_INIT; + pthread_once( + &Once, +[] { + pthread_key_create( + &ProfilingKey, +[](void *P) XRAY_NEVER_INSTRUMENT { + if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel)) + return; + + if (P == nullptr) + return; + + auto T = reinterpret_cast<ProfilingData *>(P); + if (atomic_load_relaxed(&T->Allocators) == 0) + return; + + { + // If we're somehow executing this while inside a + // non-reentrant-friendly context, we skip attempting to post + // the current thread's data. + RecursionGuard G(ReentranceGuard); + if (!G) + return; + + postCurrentThreadFCT(*T); + } + }); + + // We also need to set up an exit handler, so that we can get the + // profile information at exit time. We use the C API to do this, to not + // rely on C++ ABI functions for registering exit handlers. 
+ Atexit(+[]() XRAY_NEVER_INSTRUMENT { + if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel)) + return; + + auto Cleanup = + at_scope_exit([]() XRAY_NEVER_INSTRUMENT { cleanupTLD(); }); + + // Finalize and flush. + if (profilingFinalize() != XRAY_LOG_FINALIZED || + profilingFlush() != XRAY_LOG_FLUSHED) + return; + + if (Verbosity()) + Report("XRay Profile flushed at exit."); + }); + }); __xray_log_set_buffer_iterator(profileCollectorService::nextBuffer); __xray_set_handler(profilingHandleArg0); diff --git a/contrib/compiler-rt/lib/xray/xray_profiling_flags.inc b/contrib/compiler-rt/lib/xray/xray_profiling_flags.inc index e9230ae64187..ccd70860bf61 100644 --- a/contrib/compiler-rt/lib/xray/xray_profiling_flags.inc +++ b/contrib/compiler-rt/lib/xray/xray_profiling_flags.inc @@ -14,7 +14,7 @@ #error "Define XRAY_FLAG prior to including this file!" #endif -XRAY_FLAG(uptr, per_thread_allocator_max, 2 << 20, +XRAY_FLAG(uptr, per_thread_allocator_max, 16384, "Maximum size of any single per-thread allocator.") XRAY_FLAG(uptr, global_allocator_max, 2 << 24, "Maximum size of the global allocator for profile storage.") @@ -27,3 +27,6 @@ XRAY_FLAG(int, grace_period_ms, 1, XRAY_FLAG(bool, no_flush, false, "Set to true if we want the profiling implementation to not write " "out files.") +XRAY_FLAG(int, buffers_max, 128, + "The number of buffers to pre-allocate used by the profiling " + "implementation.") diff --git a/contrib/compiler-rt/lib/xray/xray_segmented_array.h b/contrib/compiler-rt/lib/xray/xray_segmented_array.h index 11dd794fa520..bc7e9379f63b 100644 --- a/contrib/compiler-rt/lib/xray/xray_segmented_array.h +++ b/contrib/compiler-rt/lib/xray/xray_segmented_array.h @@ -32,14 +32,9 @@ namespace __xray { /// is destroyed. When an Array is destroyed, it will destroy elements in the /// backing store but will not free the memory. template <class T> class Array { - struct SegmentBase { - SegmentBase *Prev; - SegmentBase *Next; - }; - - // We want each segment of the array to be cache-line aligned, and elements of - // the array be offset from the beginning of the segment. - struct Segment : SegmentBase { + struct Segment { + Segment *Prev; + Segment *Next; char Data[1]; }; @@ -62,98 +57,46 @@ public: // kCacheLineSize-multiple segments, minus the size of two pointers. // // - Request cacheline-multiple sized elements from the allocator. - static constexpr size_t AlignedElementStorageSize = + static constexpr uint64_t AlignedElementStorageSize = sizeof(typename std::aligned_storage<sizeof(T), alignof(T)>::type); - static constexpr size_t SegmentSize = - nearest_boundary(sizeof(Segment) + next_pow2(sizeof(T)), kCacheLineSize); + static constexpr uint64_t SegmentControlBlockSize = sizeof(Segment *) * 2; + + static constexpr uint64_t SegmentSize = nearest_boundary( + SegmentControlBlockSize + next_pow2(sizeof(T)), kCacheLineSize); using AllocatorType = Allocator<SegmentSize>; - static constexpr size_t ElementsPerSegment = - (SegmentSize - sizeof(Segment)) / next_pow2(sizeof(T)); + static constexpr uint64_t ElementsPerSegment = + (SegmentSize - SegmentControlBlockSize) / next_pow2(sizeof(T)); static_assert(ElementsPerSegment > 0, "Must have at least 1 element per segment."); - static SegmentBase SentinelSegment; - -private: - AllocatorType *Alloc; - SegmentBase *Head = &SentinelSegment; - SegmentBase *Tail = &SentinelSegment; - size_t Size = 0; - - // Here we keep track of segments in the freelist, to allow us to re-use - // segments when elements are trimmed off the end. 
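To make the sizing constants above concrete, here is the same arithmetic spelled out for one illustrative case: a 64-bit target, 64-byte cache lines, and a 16-byte element type (all three are assumptions for the example; nearest_boundary is taken to round up to the next multiple and next_pow2 to the next power of two):

    #include <cstdint>

    // Illustrative stand-ins for the helpers used above (assumed semantics:
    // round up to a power of two / to a multiple of Boundary).
    constexpr uint64_t nextPow2(uint64_t N, uint64_t P = 1) {
      return P >= N ? P : nextPow2(N, P * 2);
    }
    constexpr uint64_t roundUpTo(uint64_t N, uint64_t Boundary) {
      return ((N + Boundary - 1) / Boundary) * Boundary;
    }

    // Example element: 16 bytes (say, a pointer plus a 64-bit counter).
    constexpr uint64_t ElementSize = 16;
    constexpr uint64_t CacheLine = 64;                    // assumed
    constexpr uint64_t ControlBlock = 2 * sizeof(void *); // Prev + Next = 16
    constexpr uint64_t SegSize =
        roundUpTo(ControlBlock + nextPow2(ElementSize), CacheLine); // 64
    constexpr uint64_t PerSegment =
        (SegSize - ControlBlock) / nextPow2(ElementSize); // 3

    static_assert(SegSize == 64, "one cache line per segment in this example");
    static_assert(PerSegment == 3, "three 16-byte slots after the two pointers");

With these example numbers each segment is exactly one cache line: two list pointers followed by three element slots.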
- SegmentBase *Freelist = &SentinelSegment; - - Segment *NewSegment() { - // We need to handle the case in which enough elements have been trimmed to - // allow us to re-use segments we've allocated before. For this we look into - // the Freelist, to see whether we need to actually allocate new blocks or - // just re-use blocks we've already seen before. - if (Freelist != &SentinelSegment) { - auto *FreeSegment = Freelist; - Freelist = FreeSegment->Next; - FreeSegment->Next = &SentinelSegment; - Freelist->Prev = &SentinelSegment; - return static_cast<Segment *>(FreeSegment); - } - - auto SegmentBlock = Alloc->Allocate(); - if (SegmentBlock.Data == nullptr) - return nullptr; - - // Placement-new the Segment element at the beginning of the SegmentBlock. - auto S = reinterpret_cast<Segment *>(SegmentBlock.Data); - new (S) SegmentBase{&SentinelSegment, &SentinelSegment}; - return S; - } - - Segment *InitHeadAndTail() { - DCHECK_EQ(Head, &SentinelSegment); - DCHECK_EQ(Tail, &SentinelSegment); - auto Segment = NewSegment(); - if (Segment == nullptr) - return nullptr; - DCHECK_EQ(Segment->Next, &SentinelSegment); - DCHECK_EQ(Segment->Prev, &SentinelSegment); - Head = Tail = static_cast<SegmentBase *>(Segment); - return Segment; - } + static Segment SentinelSegment; - Segment *AppendNewSegment() { - auto S = NewSegment(); - if (S == nullptr) - return nullptr; - DCHECK_NE(Tail, &SentinelSegment); - DCHECK_EQ(Tail->Next, &SentinelSegment); - DCHECK_EQ(S->Prev, &SentinelSegment); - DCHECK_EQ(S->Next, &SentinelSegment); - Tail->Next = S; - S->Prev = Tail; - Tail = S; - return static_cast<Segment *>(Tail); - } + using size_type = uint64_t; +private: // This Iterator models a BidirectionalIterator. template <class U> class Iterator { - SegmentBase *S = &SentinelSegment; - size_t Offset = 0; - size_t Size = 0; + Segment *S = &SentinelSegment; + uint64_t Offset = 0; + uint64_t Size = 0; public: - Iterator(SegmentBase *IS, size_t Off, size_t S) - : S(IS), Offset(Off), Size(S) {} - Iterator(const Iterator &) noexcept = default; - Iterator() noexcept = default; - Iterator(Iterator &&) noexcept = default; - Iterator &operator=(const Iterator &) = default; - Iterator &operator=(Iterator &&) = default; - ~Iterator() = default; - - Iterator &operator++() { + Iterator(Segment *IS, uint64_t Off, uint64_t S) XRAY_NEVER_INSTRUMENT + : S(IS), + Offset(Off), + Size(S) {} + Iterator(const Iterator &) NOEXCEPT XRAY_NEVER_INSTRUMENT = default; + Iterator() NOEXCEPT XRAY_NEVER_INSTRUMENT = default; + Iterator(Iterator &&) NOEXCEPT XRAY_NEVER_INSTRUMENT = default; + Iterator &operator=(const Iterator &) XRAY_NEVER_INSTRUMENT = default; + Iterator &operator=(Iterator &&) XRAY_NEVER_INSTRUMENT = default; + ~Iterator() XRAY_NEVER_INSTRUMENT = default; + + Iterator &operator++() XRAY_NEVER_INSTRUMENT { if (++Offset % ElementsPerSegment || Offset == Size) return *this; @@ -168,7 +111,7 @@ private: return *this; } - Iterator &operator--() { + Iterator &operator--() XRAY_NEVER_INSTRUMENT { DCHECK_NE(S, &SentinelSegment); DCHECK_GT(Offset, 0); @@ -181,107 +124,295 @@ private: return *this; } - Iterator operator++(int) { + Iterator operator++(int) XRAY_NEVER_INSTRUMENT { Iterator Copy(*this); ++(*this); return Copy; } - Iterator operator--(int) { + Iterator operator--(int) XRAY_NEVER_INSTRUMENT { Iterator Copy(*this); --(*this); return Copy; } template <class V, class W> - friend bool operator==(const Iterator<V> &L, const Iterator<W> &R) { + friend bool operator==(const Iterator<V> &L, + const Iterator<W> &R) XRAY_NEVER_INSTRUMENT 
{ return L.S == R.S && L.Offset == R.Offset; } template <class V, class W> - friend bool operator!=(const Iterator<V> &L, const Iterator<W> &R) { + friend bool operator!=(const Iterator<V> &L, + const Iterator<W> &R) XRAY_NEVER_INSTRUMENT { return !(L == R); } - U &operator*() const { + U &operator*() const XRAY_NEVER_INSTRUMENT { DCHECK_NE(S, &SentinelSegment); auto RelOff = Offset % ElementsPerSegment; // We need to compute the character-aligned pointer, offset from the // segment's Data location to get the element in the position of Offset. - auto Base = static_cast<Segment *>(S)->Data; + auto Base = &S->Data; auto AlignedOffset = Base + (RelOff * AlignedElementStorageSize); return *reinterpret_cast<U *>(AlignedOffset); } - U *operator->() const { return &(**this); } + U *operator->() const XRAY_NEVER_INSTRUMENT { return &(**this); } }; + AllocatorType *Alloc; + Segment *Head; + Segment *Tail; + + // Here we keep track of segments in the freelist, to allow us to re-use + // segments when elements are trimmed off the end. + Segment *Freelist; + uint64_t Size; + + // =============================== + // In the following implementation, we work through the algorithms and the + // list operations using the following notation: + // + // - pred(s) is the predecessor (previous node accessor) and succ(s) is + // the successor (next node accessor). + // + // - S is a sentinel segment, which has the following property: + // + // pred(S) == succ(S) == S + // + // - @ is a loop operator, which can imply pred(s) == s if it appears on + // the left of s, or succ(s) == S if it appears on the right of s. + // + // - sL <-> sR : means a bidirectional relation between sL and sR, which + // means: + // + // succ(sL) == sR && pred(SR) == sL + // + // - sL -> sR : implies a unidirectional relation between sL and SR, + // with the following properties: + // + // succ(sL) == sR + // + // sL <- sR : implies a unidirectional relation between sR and sL, + // with the following properties: + // + // pred(sR) == sL + // + // =============================== + + Segment *NewSegment() XRAY_NEVER_INSTRUMENT { + // We need to handle the case in which enough elements have been trimmed to + // allow us to re-use segments we've allocated before. For this we look into + // the Freelist, to see whether we need to actually allocate new blocks or + // just re-use blocks we've already seen before. + if (Freelist != &SentinelSegment) { + // The current state of lists resemble something like this at this point: + // + // Freelist: @S@<-f0->...<->fN->@S@ + // ^ Freelist + // + // We want to perform a splice of `f0` from Freelist to a temporary list, + // which looks like: + // + // Templist: @S@<-f0->@S@ + // ^ FreeSegment + // + // Our algorithm preconditions are: + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + + // Then the algorithm we implement is: + // + // SFS = Freelist + // Freelist = succ(Freelist) + // if (Freelist != S) + // pred(Freelist) = S + // succ(SFS) = S + // pred(SFS) = S + // + auto *FreeSegment = Freelist; + Freelist = Freelist->Next; + + // Note that we need to handle the case where Freelist is now pointing to + // S, which we don't want to be overwriting. + // TODO: Determine whether the cost of the branch is higher than the cost + // of the blind assignment. 
+ if (Freelist != &SentinelSegment) + Freelist->Prev = &SentinelSegment; + + FreeSegment->Next = &SentinelSegment; + FreeSegment->Prev = &SentinelSegment; + + // Our postconditions are: + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + DCHECK_NE(FreeSegment, &SentinelSegment); + return FreeSegment; + } + + auto SegmentBlock = Alloc->Allocate(); + if (SegmentBlock.Data == nullptr) + return nullptr; + + // Placement-new the Segment element at the beginning of the SegmentBlock. + new (SegmentBlock.Data) Segment{&SentinelSegment, &SentinelSegment, {0}}; + auto SB = reinterpret_cast<Segment *>(SegmentBlock.Data); + return SB; + } + + Segment *InitHeadAndTail() XRAY_NEVER_INSTRUMENT { + DCHECK_EQ(Head, &SentinelSegment); + DCHECK_EQ(Tail, &SentinelSegment); + auto S = NewSegment(); + if (S == nullptr) + return nullptr; + DCHECK_EQ(S->Next, &SentinelSegment); + DCHECK_EQ(S->Prev, &SentinelSegment); + DCHECK_NE(S, &SentinelSegment); + Head = S; + Tail = S; + DCHECK_EQ(Head, Tail); + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(Tail->Prev, &SentinelSegment); + return S; + } + + Segment *AppendNewSegment() XRAY_NEVER_INSTRUMENT { + auto S = NewSegment(); + if (S == nullptr) + return nullptr; + DCHECK_NE(Tail, &SentinelSegment); + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(S->Prev, &SentinelSegment); + DCHECK_EQ(S->Next, &SentinelSegment); + S->Prev = Tail; + Tail->Next = S; + Tail = S; + DCHECK_EQ(S, S->Prev->Next); + DCHECK_EQ(Tail->Next, &SentinelSegment); + return S; + } + public: - explicit Array(AllocatorType &A) : Alloc(&A) {} + explicit Array(AllocatorType &A) XRAY_NEVER_INSTRUMENT + : Alloc(&A), + Head(&SentinelSegment), + Tail(&SentinelSegment), + Freelist(&SentinelSegment), + Size(0) {} + + Array() XRAY_NEVER_INSTRUMENT : Alloc(nullptr), + Head(&SentinelSegment), + Tail(&SentinelSegment), + Freelist(&SentinelSegment), + Size(0) {} Array(const Array &) = delete; - Array(Array &&O) NOEXCEPT : Alloc(O.Alloc), - Head(O.Head), - Tail(O.Tail), - Size(O.Size) { + Array &operator=(const Array &) = delete; + + Array(Array &&O) XRAY_NEVER_INSTRUMENT : Alloc(O.Alloc), + Head(O.Head), + Tail(O.Tail), + Freelist(O.Freelist), + Size(O.Size) { + O.Alloc = nullptr; O.Head = &SentinelSegment; O.Tail = &SentinelSegment; O.Size = 0; + O.Freelist = &SentinelSegment; } - bool empty() const { return Size == 0; } + Array &operator=(Array &&O) XRAY_NEVER_INSTRUMENT { + Alloc = O.Alloc; + O.Alloc = nullptr; + Head = O.Head; + O.Head = &SentinelSegment; + Tail = O.Tail; + O.Tail = &SentinelSegment; + Freelist = O.Freelist; + O.Freelist = &SentinelSegment; + Size = O.Size; + O.Size = 0; + return *this; + } + + ~Array() XRAY_NEVER_INSTRUMENT { + for (auto &E : *this) + (&E)->~T(); + } - AllocatorType &allocator() const { + bool empty() const XRAY_NEVER_INSTRUMENT { return Size == 0; } + + AllocatorType &allocator() const XRAY_NEVER_INSTRUMENT { DCHECK_NE(Alloc, nullptr); return *Alloc; } - size_t size() const { return Size; } + uint64_t size() const XRAY_NEVER_INSTRUMENT { return Size; } - T *Append(const T &E) { - if (UNLIKELY(Head == &SentinelSegment)) - if (InitHeadAndTail() == nullptr) + template <class... Args> + T *AppendEmplace(Args &&... 
args) XRAY_NEVER_INSTRUMENT { + DCHECK((Size == 0 && Head == &SentinelSegment && Head == Tail) || + (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment)); + if (UNLIKELY(Head == &SentinelSegment)) { + auto R = InitHeadAndTail(); + if (R == nullptr) return nullptr; + } + + DCHECK_NE(Head, &SentinelSegment); + DCHECK_NE(Tail, &SentinelSegment); auto Offset = Size % ElementsPerSegment; if (UNLIKELY(Size != 0 && Offset == 0)) if (AppendNewSegment() == nullptr) return nullptr; - auto Base = static_cast<Segment *>(Tail)->Data; + DCHECK_NE(Tail, &SentinelSegment); + auto Base = &Tail->Data; auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); - auto Position = reinterpret_cast<T *>(AlignedOffset); - *Position = E; + DCHECK_LE(AlignedOffset + sizeof(T), + reinterpret_cast<unsigned char *>(Base) + SegmentSize); + + // In-place construct at Position. + new (AlignedOffset) T{std::forward<Args>(args)...}; ++Size; - return Position; + return reinterpret_cast<T *>(AlignedOffset); } - template <class... Args> T *AppendEmplace(Args &&... args) { - if (UNLIKELY(Head == &SentinelSegment)) - if (InitHeadAndTail() == nullptr) + T *Append(const T &E) XRAY_NEVER_INSTRUMENT { + // FIXME: This is a duplication of AppenEmplace with the copy semantics + // explicitly used, as a work-around to GCC 4.8 not invoking the copy + // constructor with the placement new with braced-init syntax. + DCHECK((Size == 0 && Head == &SentinelSegment && Head == Tail) || + (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment)); + if (UNLIKELY(Head == &SentinelSegment)) { + auto R = InitHeadAndTail(); + if (R == nullptr) return nullptr; + } + + DCHECK_NE(Head, &SentinelSegment); + DCHECK_NE(Tail, &SentinelSegment); auto Offset = Size % ElementsPerSegment; - auto *LatestSegment = Tail; - if (UNLIKELY(Size != 0 && Offset == 0)) { - LatestSegment = AppendNewSegment(); - if (LatestSegment == nullptr) + if (UNLIKELY(Size != 0 && Offset == 0)) + if (AppendNewSegment() == nullptr) return nullptr; - } DCHECK_NE(Tail, &SentinelSegment); - auto Base = static_cast<Segment *>(LatestSegment)->Data; + auto Base = &Tail->Data; auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); - auto Position = reinterpret_cast<T *>(AlignedOffset); + DCHECK_LE(AlignedOffset + sizeof(T), + reinterpret_cast<unsigned char *>(Tail) + SegmentSize); // In-place construct at Position. - new (Position) T{std::forward<Args>(args)...}; + new (AlignedOffset) T(E); ++Size; - return reinterpret_cast<T *>(Position); + return reinterpret_cast<T *>(AlignedOffset); } - T &operator[](size_t Offset) const { + T &operator[](uint64_t Offset) const XRAY_NEVER_INSTRUMENT { DCHECK_LE(Offset, Size); // We need to traverse the array enough times to find the element at Offset. 
auto S = Head; @@ -290,19 +421,19 @@ public: Offset -= ElementsPerSegment; DCHECK_NE(S, &SentinelSegment); } - auto Base = static_cast<Segment *>(S)->Data; + auto Base = &S->Data; auto AlignedOffset = Base + (Offset * AlignedElementStorageSize); auto Position = reinterpret_cast<T *>(AlignedOffset); return *reinterpret_cast<T *>(Position); } - T &front() const { + T &front() const XRAY_NEVER_INSTRUMENT { DCHECK_NE(Head, &SentinelSegment); DCHECK_NE(Size, 0u); return *begin(); } - T &back() const { + T &back() const XRAY_NEVER_INSTRUMENT { DCHECK_NE(Tail, &SentinelSegment); DCHECK_NE(Size, 0u); auto It = end(); @@ -310,7 +441,8 @@ public: return *It; } - template <class Predicate> T *find_element(Predicate P) const { + template <class Predicate> + T *find_element(Predicate P) const XRAY_NEVER_INSTRUMENT { if (empty()) return nullptr; @@ -324,51 +456,195 @@ public: /// Remove N Elements from the end. This leaves the blocks behind, and not /// require allocation of new blocks for new elements added after trimming. - void trim(size_t Elements) { - DCHECK_LE(Elements, Size); - DCHECK_GT(Size, 0); + void trim(uint64_t Elements) XRAY_NEVER_INSTRUMENT { auto OldSize = Size; + Elements = Elements > Size ? Size : Elements; Size -= Elements; - DCHECK_NE(Head, &SentinelSegment); - DCHECK_NE(Tail, &SentinelSegment); - - for (auto SegmentsToTrim = (nearest_boundary(OldSize, ElementsPerSegment) - - nearest_boundary(Size, ElementsPerSegment)) / - ElementsPerSegment; - SegmentsToTrim > 0; --SegmentsToTrim) { + // We compute the number of segments we're going to return from the tail by + // counting how many elements have been trimmed. Given the following: + // + // - Each segment has N valid positions, where N > 0 + // - The previous size > current size + // + // To compute the number of segments to return, we need to perform the + // following calculations for the number of segments required given 'x' + // elements: + // + // f(x) = { + // x == 0 : 0 + // , 0 < x <= N : 1 + // , N < x <= max : x / N + (x % N ? 1 : 0) + // } + // + // We can simplify this down to: + // + // f(x) = { + // x == 0 : 0, + // , 0 < x <= max : x / N + (x < N || x % N ? 1 : 0) + // } + // + // And further down to: + // + // f(x) = x ? x / N + (x < N || x % N ? 1 : 0) : 0 + // + // We can then perform the following calculation `s` which counts the number + // of segments we need to remove from the end of the data structure: + // + // s(p, c) = f(p) - f(c) + // + // If we treat p = previous size, and c = current size, and given the + // properties above, the possible range for s(...) is [0..max(typeof(p))/N] + // given that typeof(p) == typeof(c). + auto F = [](uint64_t X) { + return X ? (X / ElementsPerSegment) + + (X < ElementsPerSegment || X % ElementsPerSegment ? 1 : 0) + : 0; + }; + auto PS = F(OldSize); + auto CS = F(Size); + DCHECK_GE(PS, CS); + auto SegmentsToTrim = PS - CS; + for (auto I = 0uL; I < SegmentsToTrim; ++I) { + // Here we place the current tail segment to the freelist. To do this + // appropriately, we need to perform a splice operation on two + // bidirectional linked-lists. In particular, we have the current state of + // the doubly-linked list of segments: + // + // @S@ <- s0 <-> s1 <-> ... <-> sT -> @S@ + // DCHECK_NE(Head, &SentinelSegment); DCHECK_NE(Tail, &SentinelSegment); - // Put the tail into the Freelist. 
- auto *FreeSegment = Tail; - Tail = Tail->Prev; - if (Tail == &SentinelSegment) - Head = Tail; - else - Tail->Next = &SentinelSegment; - DCHECK_EQ(Tail->Next, &SentinelSegment); - FreeSegment->Next = Freelist; - FreeSegment->Prev = &SentinelSegment; - if (Freelist != &SentinelSegment) - Freelist->Prev = FreeSegment; - Freelist = FreeSegment; + + if (Freelist == &SentinelSegment) { + // Our two lists at this point are in this configuration: + // + // Freelist: (potentially) @S@ + // Mainlist: @S@<-s0<->s1<->...<->sPT<->sT->@S@ + // ^ Head ^ Tail + // + // The end state for us will be this configuration: + // + // Freelist: @S@<-sT->@S@ + // Mainlist: @S@<-s0<->s1<->...<->sPT->@S@ + // ^ Head ^ Tail + // + // The first step for us is to hold a reference to the tail of Mainlist, + // which in our notation is represented by sT. We call this our "free + // segment" which is the segment we are placing on the Freelist. + // + // sF = sT + // + // Then, we also hold a reference to the "pre-tail" element, which we + // call sPT: + // + // sPT = pred(sT) + // + // We want to splice sT into the beginning of the Freelist, which in + // an empty Freelist means placing a segment whose predecessor and + // successor is the sentinel segment. + // + // The splice operation then can be performed in the following + // algorithm: + // + // succ(sPT) = S + // pred(sT) = S + // succ(sT) = Freelist + // Freelist = sT + // Tail = sPT + // + auto SPT = Tail->Prev; + SPT->Next = &SentinelSegment; + Tail->Prev = &SentinelSegment; + Tail->Next = Freelist; + Freelist = Tail; + Tail = SPT; + + // Our post-conditions here are: + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + } else { + // In the other case, where the Freelist is not empty, we perform the + // following transformation instead: + // + // This transforms the current state: + // + // Freelist: @S@<-f0->@S@ + // ^ Freelist + // Mainlist: @S@<-s0<->s1<->...<->sPT<->sT->@S@ + // ^ Head ^ Tail + // + // Into the following: + // + // Freelist: @S@<-sT<->f0->@S@ + // ^ Freelist + // Mainlist: @S@<-s0<->s1<->...<->sPT->@S@ + // ^ Head ^ Tail + // + // The algorithm is: + // + // sFH = Freelist + // sPT = pred(sT) + // pred(SFH) = sT + // succ(sT) = Freelist + // pred(sT) = S + // succ(sPT) = S + // Tail = sPT + // Freelist = sT + // + auto SFH = Freelist; + auto SPT = Tail->Prev; + auto ST = Tail; + SFH->Prev = ST; + ST->Next = Freelist; + ST->Prev = &SentinelSegment; + SPT->Next = &SentinelSegment; + Tail = SPT; + Freelist = ST; + + // Our post-conditions here are: + DCHECK_EQ(Tail->Next, &SentinelSegment); + DCHECK_EQ(Freelist->Prev, &SentinelSegment); + DCHECK_EQ(Freelist->Next->Prev, Freelist); + } } + + // Now in case we've spliced all the segments in the end, we ensure that the + // main list is "empty", or both the head and tail pointing to the sentinel + // segment. + if (Tail == &SentinelSegment) + Head = Tail; + + DCHECK( + (Size == 0 && Head == &SentinelSegment && Tail == &SentinelSegment) || + (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment)); + DCHECK( + (Freelist != &SentinelSegment && Freelist->Prev == &SentinelSegment) || + (Freelist == &SentinelSegment && Tail->Next == &SentinelSegment)); } // Provide iterators. 
- Iterator<T> begin() const { return Iterator<T>(Head, 0, Size); } - Iterator<T> end() const { return Iterator<T>(Tail, Size, Size); } - Iterator<const T> cbegin() const { return Iterator<const T>(Head, 0, Size); } - Iterator<const T> cend() const { return Iterator<const T>(Tail, Size, Size); } + Iterator<T> begin() const XRAY_NEVER_INSTRUMENT { + return Iterator<T>(Head, 0, Size); + } + Iterator<T> end() const XRAY_NEVER_INSTRUMENT { + return Iterator<T>(Tail, Size, Size); + } + Iterator<const T> cbegin() const XRAY_NEVER_INSTRUMENT { + return Iterator<const T>(Head, 0, Size); + } + Iterator<const T> cend() const XRAY_NEVER_INSTRUMENT { + return Iterator<const T>(Tail, Size, Size); + } }; // We need to have this storage definition out-of-line so that the compiler can // ensure that storage for the SentinelSegment is defined and has a single // address. template <class T> -typename Array<T>::SegmentBase Array<T>::SentinelSegment{ - &Array<T>::SentinelSegment, &Array<T>::SentinelSegment}; +typename Array<T>::Segment Array<T>::SentinelSegment{ + &Array<T>::SentinelSegment, &Array<T>::SentinelSegment, {'\0'}}; } // namespace __xray diff --git a/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S b/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S index 99ad3966ee3a..52985ffd19ab 100644 --- a/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S +++ b/contrib/compiler-rt/lib/xray/xray_trampoline_x86_64.S @@ -19,6 +19,7 @@ .macro SAVE_REGISTERS + pushfq subq $240, %rsp CFI_DEF_CFA_OFFSET(248) movq %rbp, 232(%rsp) @@ -69,6 +70,7 @@ movq 8(%rsp), %r14 movq 0(%rsp), %r15 addq $240, %rsp + popfq CFI_DEF_CFA_OFFSET(8) .endm @@ -89,10 +91,10 @@ .text #if !defined(__APPLE__) .section .text + .file "xray_trampoline_x86.S" #else .section __TEXT,__text #endif - .file "xray_trampoline_x86.S" //===----------------------------------------------------------------------===// diff --git a/contrib/compiler-rt/lib/xray/xray_tsc.h b/contrib/compiler-rt/lib/xray/xray_tsc.h index 4507564e7cd2..180d6df188c1 100644 --- a/contrib/compiler-rt/lib/xray/xray_tsc.h +++ b/contrib/compiler-rt/lib/xray/xray_tsc.h @@ -13,10 +13,32 @@ #ifndef XRAY_EMULATE_TSC_H #define XRAY_EMULATE_TSC_H +#include "sanitizer_common/sanitizer_common.h" + namespace __xray { static constexpr uint64_t NanosecondsPerSecond = 1000ULL * 1000 * 1000; } +#if SANITIZER_FUCHSIA +#include <zircon/syscalls.h> + +namespace __xray { + +inline bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } + +ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT { + CPU = 0; + return _zx_ticks_get(); +} + +inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { + return _zx_ticks_per_second(); +} + +} // namespace __xray + +#else // SANITIZER_FUCHSIA + #if defined(__x86_64__) #include "xray_x86_64.inc" #elif defined(__powerpc64__) @@ -64,5 +86,6 @@ inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { #else #error Target architecture is not supported. 
#endif // CPU architecture +#endif // SANITIZER_FUCHSIA #endif // XRAY_EMULATE_TSC_H diff --git a/contrib/compiler-rt/lib/xray/xray_utils.cc b/contrib/compiler-rt/lib/xray/xray_utils.cc index 68f4e8c1094c..59ba6c3082b2 100644 --- a/contrib/compiler-rt/lib/xray/xray_utils.cc +++ b/contrib/compiler-rt/lib/xray/xray_utils.cc @@ -12,7 +12,9 @@ //===----------------------------------------------------------------------===// #include "xray_utils.h" +#include "sanitizer_common/sanitizer_allocator_internal.h" #include "sanitizer_common/sanitizer_common.h" +#include "xray_allocator.h" #include "xray_defs.h" #include "xray_flags.h" #include <cstdio> @@ -25,13 +27,113 @@ #include <unistd.h> #include <utility> +#if SANITIZER_FUCHSIA +#include "sanitizer_common/sanitizer_symbolizer_fuchsia.h" + +#include <inttypes.h> +#include <zircon/process.h> +#include <zircon/sanitizer.h> +#include <zircon/status.h> +#include <zircon/syscalls.h> +#endif + namespace __xray { -void printToStdErr(const char *Buffer) XRAY_NEVER_INSTRUMENT { - fprintf(stderr, "%s", Buffer); +#if SANITIZER_FUCHSIA +constexpr const char* ProfileSinkName = "llvm-xray"; + +LogWriter::~LogWriter() { + _zx_handle_close(Vmo); +} + +void LogWriter::WriteAll(const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT { + if (Begin == End) + return; + auto TotalBytes = std::distance(Begin, End); + + const size_t PageSize = flags()->xray_page_size_override > 0 + ? flags()->xray_page_size_override + : GetPageSizeCached(); + if (RoundUpTo(Offset, PageSize) != RoundUpTo(Offset + TotalBytes, PageSize)) { + // Resize the VMO to ensure there's sufficient space for the data. + zx_status_t Status = _zx_vmo_set_size(Vmo, Offset + TotalBytes); + if (Status != ZX_OK) { + Report("Failed to resize VMO: %s\n", _zx_status_get_string(Status)); + return; + } + } + + // Write the data into VMO. + zx_status_t Status = _zx_vmo_write(Vmo, Begin, Offset, TotalBytes); + if (Status != ZX_OK) { + Report("Failed to write: %s\n", _zx_status_get_string(Status)); + return; + } + Offset += TotalBytes; +} + +void LogWriter::Flush() XRAY_NEVER_INSTRUMENT { + // Nothing to do here since WriteAll writes directly into the VMO. +} + +LogWriter *LogWriter::Open() XRAY_NEVER_INSTRUMENT { + // Create VMO to hold the profile data. + zx_handle_t Vmo; + zx_status_t Status = _zx_vmo_create(0, 0, &Vmo); + if (Status != ZX_OK) { + Report("XRay: cannot create VMO: %s\n", _zx_status_get_string(Status)); + return nullptr; + } + + // Get the KOID of the current process to use in the VMO name. + zx_info_handle_basic_t Info; + Status = _zx_object_get_info(_zx_process_self(), ZX_INFO_HANDLE_BASIC, &Info, + sizeof(Info), NULL, NULL); + if (Status != ZX_OK) { + Report("XRay: cannot get basic info about current process handle: %s\n", + _zx_status_get_string(Status)); + return nullptr; + } + + // Give the VMO a name including our process KOID so it's easy to spot. + char VmoName[ZX_MAX_NAME_LEN]; + internal_snprintf(VmoName, sizeof(VmoName), "%s.%zu", ProfileSinkName, + Info.koid); + _zx_object_set_property(Vmo, ZX_PROP_NAME, VmoName, strlen(VmoName)); + + // Duplicate the handle since __sanitizer_publish_data consumes it and + // LogWriter needs to hold onto it. + zx_handle_t Handle; + Status =_zx_handle_duplicate(Vmo, ZX_RIGHT_SAME_RIGHTS, &Handle); + if (Status != ZX_OK) { + Report("XRay: cannot duplicate VMO handle: %s\n", + _zx_status_get_string(Status)); + return nullptr; + } + + // Publish the VMO that receives the logging. Note the VMO's contents can + // grow and change after publication. 
The contents won't be read out until + // after the process exits. + __sanitizer_publish_data(ProfileSinkName, Handle); + + // Use the dumpfile symbolizer markup element to write the name of the VMO. + Report("XRay: " FORMAT_DUMPFILE "\n", ProfileSinkName, VmoName); + + LogWriter *LW = reinterpret_cast<LogWriter *>(InternalAlloc(sizeof(LogWriter))); + new (LW) LogWriter(Vmo); + return LW; +} + +void LogWriter::Close(LogWriter *LW) { + LW->~LogWriter(); + InternalFree(LW); +} +#else // SANITIZER_FUCHSIA +LogWriter::~LogWriter() { + internal_close(Fd); } -void retryingWriteAll(int Fd, const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT { +void LogWriter::WriteAll(const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT { if (Begin == End) return; auto TotalBytes = std::distance(Begin, End); @@ -49,50 +151,11 @@ void retryingWriteAll(int Fd, const char *Begin, const char *End) XRAY_NEVER_INS } } -std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin, - char *End) XRAY_NEVER_INSTRUMENT { - auto BytesToRead = std::distance(Begin, End); - ssize_t BytesRead; - ssize_t TotalBytesRead = 0; - while (BytesToRead && (BytesRead = read(Fd, Begin, BytesToRead))) { - if (BytesRead == -1) { - if (errno == EINTR) - continue; - Report("Read error; errno = %d\n", errno); - return std::make_pair(TotalBytesRead, false); - } - - TotalBytesRead += BytesRead; - BytesToRead -= BytesRead; - Begin += BytesRead; - } - return std::make_pair(TotalBytesRead, true); -} - -bool readValueFromFile(const char *Filename, - long long *Value) XRAY_NEVER_INSTRUMENT { - int Fd = open(Filename, O_RDONLY | O_CLOEXEC); - if (Fd == -1) - return false; - static constexpr size_t BufSize = 256; - char Line[BufSize] = {}; - ssize_t BytesRead; - bool Success; - std::tie(BytesRead, Success) = retryingReadSome(Fd, Line, Line + BufSize); - if (!Success) - return false; - close(Fd); - const char *End = nullptr; - long long Tmp = internal_simple_strtoll(Line, &End, 10); - bool Result = false; - if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) { - *Value = Tmp; - Result = true; - } - return Result; +void LogWriter::Flush() XRAY_NEVER_INSTRUMENT { + fsync(Fd); } -int getLogFD() XRAY_NEVER_INSTRUMENT { +LogWriter *LogWriter::Open() XRAY_NEVER_INSTRUMENT { // Open a temporary file once for the log. 
char TmpFilename[256] = {}; char TmpWildcardPattern[] = "XXXXXX"; @@ -103,24 +166,31 @@ int getLogFD() XRAY_NEVER_INSTRUMENT { if (LastSlash != nullptr) Progname = LastSlash + 1; - const int HalfLength = sizeof(TmpFilename) / 2 - sizeof(TmpWildcardPattern); int NeededLength = internal_snprintf( - TmpFilename, sizeof(TmpFilename), "%.*s%.*s.%s", HalfLength, - flags()->xray_logfile_base, HalfLength, Progname, TmpWildcardPattern); + TmpFilename, sizeof(TmpFilename), "%s%s.%s", + flags()->xray_logfile_base, Progname, TmpWildcardPattern); if (NeededLength > int(sizeof(TmpFilename))) { Report("XRay log file name too long (%d): %s\n", NeededLength, TmpFilename); - return -1; + return nullptr; } int Fd = mkstemp(TmpFilename); if (Fd == -1) { Report("XRay: Failed opening temporary file '%s'; not logging events.\n", TmpFilename); - return -1; + return nullptr; } if (Verbosity()) Report("XRay: Log file in '%s'\n", TmpFilename); - return Fd; + LogWriter *LW = allocate<LogWriter>(); + new (LW) LogWriter(Fd); + return LW; +} + +void LogWriter::Close(LogWriter *LW) { + LW->~LogWriter(); + deallocate(LW); } +#endif // SANITIZER_FUCHSIA } // namespace __xray diff --git a/contrib/compiler-rt/lib/xray/xray_utils.h b/contrib/compiler-rt/lib/xray/xray_utils.h index eafa16e1a9d5..60438973fbd0 100644 --- a/contrib/compiler-rt/lib/xray/xray_utils.h +++ b/contrib/compiler-rt/lib/xray/xray_utils.h @@ -20,23 +20,40 @@ #include <sys/types.h> #include <utility> -namespace __xray { - -// Default implementation of the reporting interface for sanitizer errors. -void printToStdErr(const char *Buffer); - -// EINTR-safe write routine, provided a file descriptor and a character range. -void retryingWriteAll(int Fd, const char *Begin, const char *End); +#include "sanitizer_common/sanitizer_common.h" +#if SANITIZER_FUCHSIA +#include <zircon/types.h> +#endif -// Reads a long long value from a provided file. -bool readValueFromFile(const char *Filename, long long *Value); - -// EINTR-safe read routine, providing a file descriptor and a character range. -std::pair<ssize_t, bool> retryingReadSome(int Fd, char *Begin, char *End); +namespace __xray { -// EINTR-safe open routine, uses flag-provided values for initialising a log -// file. -int getLogFD(); +class LogWriter { +public: +#if SANITIZER_FUCHSIA + LogWriter(zx_handle_t Vmo) : Vmo(Vmo) {} +#else + explicit LogWriter(int Fd) : Fd(Fd) {} +#endif + ~LogWriter(); + + // Write a character range into a log. + void WriteAll(const char *Begin, const char *End); + + void Flush(); + + // Returns a new log instance initialized using the flag-provided values. + static LogWriter *Open(); + // Closes and deallocates the log instance. + static void Close(LogWriter *LogWriter); + +private: +#if SANITIZER_FUCHSIA + zx_handle_t Vmo = ZX_HANDLE_INVALID; + uint64_t Offset = 0; +#else + int Fd = -1; +#endif +}; constexpr size_t gcd(size_t a, size_t b) { return (b == 0) ? 
a : gcd(b, a % b); diff --git a/contrib/compiler-rt/lib/xray/xray_x86_64.cc b/contrib/compiler-rt/lib/xray/xray_x86_64.cc index 51dc4ce43b1c..e63ee1b3bd02 100644 --- a/contrib/compiler-rt/lib/xray/xray_x86_64.cc +++ b/contrib/compiler-rt/lib/xray/xray_x86_64.cc @@ -1,15 +1,20 @@ #include "cpuid.h" #include "sanitizer_common/sanitizer_common.h" +#if !SANITIZER_FUCHSIA +#include "sanitizer_common/sanitizer_posix.h" +#endif #include "xray_defs.h" #include "xray_interface_internal.h" -#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD +#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC #include <sys/types.h> #if SANITIZER_OPENBSD #include <sys/time.h> #include <machine/cpu.h> #endif #include <sys/sysctl.h> +#elif SANITIZER_FUCHSIA +#include <zircon/syscalls.h> #endif #include <atomic> @@ -81,17 +86,20 @@ uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { } return TSCFrequency == -1 ? 0 : static_cast<uint64_t>(TSCFrequency); } -#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD +#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { long long TSCFrequency = -1; size_t tscfreqsz = sizeof(TSCFrequency); #if SANITIZER_OPENBSD int Mib[2] = { CTL_MACHDEP, CPU_TSCFREQ }; - if (sysctl(Mib, 2, &TSCFrequency, &tscfreqsz, NULL, 0) != -1) { + if (internal_sysctl(Mib, 2, &TSCFrequency, &tscfreqsz, NULL, 0) != -1) { +#elif SANITIZER_MAC + if (internal_sysctlbyname("machdep.tsc.frequency", &TSCFrequency, + &tscfreqsz, NULL, 0) != -1) { #else - if (sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz, - NULL, 0) != -1) { + if (internal_sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz, + NULL, 0) != -1) { #endif return static_cast<uint64_t>(TSCFrequency); } else { @@ -100,7 +108,7 @@ uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { return 0; } -#else +#elif !SANITIZER_FUCHSIA uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { /* Not supported */ return 0; @@ -317,6 +325,7 @@ bool patchTypedEvent(const bool Enable, const uint32_t FuncId, return false; } +#if !SANITIZER_FUCHSIA // We determine whether the CPU we're running on has the correct features we // need. In x86_64 this will be rdtscp support. bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { @@ -339,5 +348,6 @@ bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { } return true; } +#endif } // namespace __xray |
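Both Iterator::operator*() and Array::operator[] in the segmented-array changes above translate a logical index into a number of segment hops plus an aligned byte offset from Segment::Data. A small standalone sketch of that arithmetic with hypothetical constants (the real values are derived from the Array's element type, so the numbers below are illustrative only):

#include <cstdint>
#include <cstdio>

int main() {
  // Hypothetical stand-ins for the Array template's constants.
  const uint64_t ElementsPerSegment = 8;
  const uint64_t AlignedElementStorageSize = 24; // sizeof(T) rounded up for alignment

  const uint64_t Offset = 19;                               // logical element index
  const uint64_t SegmentHops = Offset / ElementsPerSegment; // segments to walk from Head
  const uint64_t RelOff = Offset % ElementsPerSegment;      // slot within that segment
  const uint64_t ByteOffset = RelOff * AlignedElementStorageSize; // offset from Data

  std::printf("element %llu: %llu segments from Head, Data + %llu bytes (slot %llu)\n",
              (unsigned long long)Offset, (unsigned long long)SegmentHops,
              (unsigned long long)ByteOffset, (unsigned long long)RelOff);
  return 0;
}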
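The list-splicing steps spelled out in the NewSegment() and trim() comments above (moving a segment between the main list and the freelist while preserving the sentinel invariant pred(S) == succ(S) == S) are easier to follow with a concrete, self-contained sketch. The Node type and function names below are illustrative only, and the sketch folds the patch's empty- and non-empty-freelist branches into a single guard; it is not the Segment/Array code itself.

#include <cassert>

struct Node {
  Node *Prev;
  Node *Next;
};

// The sentinel loops back onto itself, mirroring pred(S) == succ(S) == S.
static Node Sentinel{&Sentinel, &Sentinel};

// trim()-style splice of the tail onto the freelist:
//   succ(sPT) = S; pred(sT) = S; succ(sT) = Freelist; Freelist = sT; Tail = sPT
void spliceTailToFreelist(Node *&Tail, Node *&Freelist) {
  Node *SPT = Tail->Prev; // the "pre-tail" segment
  Node *ST = Tail;        // the segment being returned to the freelist
  SPT->Next = &Sentinel;
  ST->Prev = &Sentinel;
  ST->Next = Freelist;
  if (Freelist != &Sentinel)
    Freelist->Prev = ST;
  Freelist = ST;
  Tail = SPT;
}

// NewSegment()-style pop from the freelist:
//   SFS = Freelist; Freelist = succ(Freelist); if (Freelist != S) pred(Freelist) = S
Node *popFromFreelist(Node *&Freelist) {
  assert(Freelist != &Sentinel && "caller already checked for an empty freelist");
  Node *Free = Freelist;
  Freelist = Freelist->Next;
  if (Freelist != &Sentinel)
    Freelist->Prev = &Sentinel;
  Free->Next = &Sentinel;
  Free->Prev = &Sentinel;
  return Free;
}

int main() {
  // Main list: S <- A <-> B -> S, i.e. Head == A and Tail == B.
  Node A{&Sentinel, &Sentinel}, B{&Sentinel, &Sentinel};
  A.Next = &B;
  B.Prev = &A;
  Node *Tail = &B, *Freelist = &Sentinel;

  spliceTailToFreelist(Tail, Freelist); // B moves onto the freelist
  assert(Tail == &A && Freelist == &B && A.Next == &Sentinel);

  Node *Reused = popFromFreelist(Freelist); // and can be handed back out later
  assert(Reused == &B && Freelist == &Sentinel);
  return 0;
}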
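The segment-count formula in the trim() comment above, f(x) = x ? x / N + (x < N || x % N ? 1 : 0) : 0 with N = ElementsPerSegment, is equivalent to ceiling division ceil(x / N) for x > 0, and 0 for x == 0. A minimal standalone check of that identity (the names and the loop bound are illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

// N stands in for ElementsPerSegment; any positive value works.
constexpr uint64_t N = 8;

// f(x) exactly as written in the trim() comment.
constexpr uint64_t SegmentsFor(uint64_t X) {
  return X ? (X / N) + (X < N || X % N ? 1 : 0) : 0;
}

// The claimed simplification: ceiling division, with 0 segments for 0 elements.
constexpr uint64_t CeilDiv(uint64_t X) { return X ? (X + N - 1) / N : 0; }

int main() {
  for (uint64_t X = 0; X <= 4 * N + 3; ++X)
    assert(SegmentsFor(X) == CeilDiv(X));
  // SegmentsToTrim is then F(OldSize) - F(Size): trimming from 17 elements
  // down to 8 with N == 8 returns ceil(17/8) - ceil(8/8) == 3 - 1 == 2
  // segments to the freelist.
  assert(SegmentsFor(17) - SegmentsFor(8) == 2);
  return 0;
}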
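The xray_utils changes above replace the old free functions (getLogFD, retryingWriteAll, retryingReadSome, readValueFromFile) with the LogWriter interface, so log producers no longer care whether the backend is a POSIX temporary file or a Fuchsia VMO. A hedged usage sketch against the interface as declared in xray_utils.h; the payload and the wrapper function are made up for illustration:

#include "xray_utils.h"

namespace __xray {

// Illustrative only: push one small buffer through whichever backend
// LogWriter::Open() selected for this platform.
bool writeExampleRecord() {
  LogWriter *LW = LogWriter::Open();
  if (LW == nullptr) // e.g. mkstemp failed, or the VMO could not be created
    return false;

  static const char Payload[] = "example payload";
  LW->WriteAll(Payload, Payload + sizeof(Payload) - 1);
  LW->Flush();          // fsync(2) on POSIX; a no-op on Fuchsia, where WriteAll
                        // already writes straight into the VMO
  LogWriter::Close(LW); // runs the destructor and frees the instance
  return true;
}

} // namespace __xray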
