From 3a1720af1d7f43edc5b214cde0be11bfb94d077e Mon Sep 17 00:00:00 2001
From: Dimitry Andric <dim@FreeBSD.org>
Date: Wed, 23 Oct 2019 17:52:22 +0000
Subject: Vendor import of stripped compiler-rt trunk r375505, the last commit
 before the upstream Subversion repository was made read-only, and the LLVM
 project migrated to GitHub:

https://llvm.org/svn/llvm-project/compiler-rt/trunk@375505
---
 lib/xray/xray_AArch64.cc               | 127 ------
 lib/xray/xray_AArch64.cpp              | 127 ++++++
 lib/xray/xray_arm.cc                   | 164 -------
 lib/xray/xray_arm.cpp                  | 164 +++++++
 lib/xray/xray_basic_flags.cc           |  49 ---
 lib/xray/xray_basic_flags.cpp          |  49 +++
 lib/xray/xray_basic_logging.cc         | 515 ----------------------
 lib/xray/xray_basic_logging.cpp        | 515 ++++++++++++++++++++++
 lib/xray/xray_buffer_queue.cc          | 237 -----------
 lib/xray/xray_buffer_queue.cpp         | 237 +++++++++++
 lib/xray/xray_fdr_flags.cc             |  47 --
 lib/xray/xray_fdr_flags.cpp            |  47 ++
 lib/xray/xray_fdr_logging.cc           | 757 ---------------------------------
 lib/xray/xray_fdr_logging.cpp          | 757 +++++++++++++++++++++++++++++++++
 lib/xray/xray_flags.cc                 |  84 ----
 lib/xray/xray_flags.cpp                |  84 ++++
 lib/xray/xray_init.cc                  | 115 -----
 lib/xray/xray_init.cpp                 | 115 +++++
 lib/xray/xray_interface.cc             | 480 ---------------------
 lib/xray/xray_interface.cpp            | 480 +++++++++++++++++++++
 lib/xray/xray_log_interface.cc         | 209 ---------
 lib/xray/xray_log_interface.cpp        | 209 +++++++++
 lib/xray/xray_mips.cc                  | 170 --------
 lib/xray/xray_mips.cpp                 | 170 ++++++++
 lib/xray/xray_mips64.cc                | 178 --------
 lib/xray/xray_mips64.cpp               | 178 ++++++++
 lib/xray/xray_powerpc64.cc             | 111 -----
 lib/xray/xray_powerpc64.cpp            | 111 +++++
 lib/xray/xray_profile_collector.cc     | 414 ------------------
 lib/xray/xray_profile_collector.cpp    | 414 ++++++++++++++++++
 lib/xray/xray_profiling.cc             | 519 ----------------------
 lib/xray/xray_profiling.cpp            | 519 ++++++++++++++++++++++
 lib/xray/xray_profiling_flags.cc       |  39 --
 lib/xray/xray_profiling_flags.cpp      |  39 ++
 lib/xray/xray_trampoline_powerpc64.cc  |  15 -
 lib/xray/xray_trampoline_powerpc64.cpp |  15 +
 lib/xray/xray_utils.cc                 | 195 ---------
 lib/xray/xray_utils.cpp                | 195 +++++++++
 lib/xray/xray_x86_64.cc                | 353 ---------------
 lib/xray/xray_x86_64.cpp               | 353 +++++++++++++++
 40 files changed, 4778 insertions(+), 4778 deletions(-)
 delete mode 100644 lib/xray/xray_AArch64.cc
 create mode 100644 lib/xray/xray_AArch64.cpp
 delete mode 100644 lib/xray/xray_arm.cc
 create mode 100644 lib/xray/xray_arm.cpp
 delete mode 100644 lib/xray/xray_basic_flags.cc
 create mode 100644 lib/xray/xray_basic_flags.cpp
 delete mode 100644 lib/xray/xray_basic_logging.cc
 create mode 100644 lib/xray/xray_basic_logging.cpp
 delete mode 100644 lib/xray/xray_buffer_queue.cc
 create mode 100644 lib/xray/xray_buffer_queue.cpp
 delete mode 100644 lib/xray/xray_fdr_flags.cc
 create mode 100644 lib/xray/xray_fdr_flags.cpp
 delete mode 100644 lib/xray/xray_fdr_logging.cc
 create mode 100644 lib/xray/xray_fdr_logging.cpp
 delete mode 100644 lib/xray/xray_flags.cc
 create mode 100644 lib/xray/xray_flags.cpp
 delete mode 100644 lib/xray/xray_init.cc
 create mode 100644 lib/xray/xray_init.cpp
 delete mode 100644 lib/xray/xray_interface.cc
 create mode 100644 lib/xray/xray_interface.cpp
 delete mode 100644 lib/xray/xray_log_interface.cc
 create mode 100644 lib/xray/xray_log_interface.cpp
 delete mode 100644 lib/xray/xray_mips.cc
 create mode 100644 lib/xray/xray_mips.cpp
 delete mode 100644 lib/xray/xray_mips64.cc
 create mode 100644 lib/xray/xray_mips64.cpp
 delete mode 100644 lib/xray/xray_powerpc64.cc
 create mode 100644 lib/xray/xray_powerpc64.cpp
 delete mode 100644 lib/xray/xray_profile_collector.cc
 create mode 100644 lib/xray/xray_profile_collector.cpp
 delete mode 100644 lib/xray/xray_profiling.cc
 create mode 100644 lib/xray/xray_profiling.cpp
 delete mode 100644 lib/xray/xray_profiling_flags.cc
 create mode 100644 lib/xray/xray_profiling_flags.cpp
 delete mode 100644 lib/xray/xray_trampoline_powerpc64.cc
 create mode 100644 lib/xray/xray_trampoline_powerpc64.cpp
 delete mode 100644 lib/xray/xray_utils.cc
 create mode 100644 lib/xray/xray_utils.cpp
 delete mode 100644 lib/xray/xray_x86_64.cc
 create mode 100644 lib/xray/xray_x86_64.cpp

(limited to 'lib/xray')

diff --git a/lib/xray/xray_AArch64.cc b/lib/xray/xray_AArch64.cc
deleted file mode 100644
index 4c7805488ab8..000000000000
--- a/lib/xray/xray_AArch64.cc
+++ /dev/null
@@ -1,127 +0,0 @@
-//===-- xray_AArch64.cc -----------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// Implementation of AArch64-specific routines (64-bit).
-//
-//===----------------------------------------------------------------------===//
-#include "sanitizer_common/sanitizer_common.h"
-#include "xray_defs.h"
-#include "xray_interface_internal.h"
-#include <atomic>
-#include <cassert>
-
-extern "C" void __clear_cache(void *start, void *end);
-
-namespace __xray {
-
-// The machine codes for some instructions used in runtime patching.
-enum class PatchOpcodes : uint32_t {
-  PO_StpX0X30SP_m16e = 0xA9BF7BE0, // STP X0, X30, [SP, #-16]!
-  PO_LdrW0_12 = 0x18000060,        // LDR W0, #12
-  PO_LdrX16_12 = 0x58000070,       // LDR X16, #12
-  PO_BlrX16 = 0xD63F0200,          // BLR X16
-  PO_LdpX0X30SP_16 = 0xA8C17BE0,   // LDP X0, X30, [SP], #16
-  PO_B32 = 0x14000008              // B #32
-};
-
-inline static bool patchSled(const bool Enable, const uint32_t FuncId,
-                             const XRaySledEntry &Sled,
-                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT {
-  // When |Enable| == true,
-  // We replace the following compile-time stub (sled):
-  //
-  // xray_sled_n:
-  //   B #32
-  //   7 NOPs (24 bytes)
-  //
-  // With the following runtime patch:
-  //
-  // xray_sled_n:
-  //   STP X0, X30, [SP, #-16]! ; PUSH {r0, lr}
-  //   LDR W0, #12 ; W0 := function ID
-  //   LDR X16,#12 ; X16 := address of the trampoline
-  //   BLR X16
-  //   ;DATA: 32 bits of function ID
-  //   ;DATA: lower 32 bits of the address of the trampoline
-  //   ;DATA: higher 32 bits of the address of the trampoline
-  //   LDP X0, X30, [SP], #16 ; POP {r0, lr}
-  //
-  // Replacement of the first 4-byte instruction should be the last and atomic
-  // operation, so that the user code which reaches the sled concurrently
-  // either jumps over the whole sled, or executes the whole sled when the
-  // latter is ready.
-  //
-  // When |Enable|==false, we set back the first instruction in the sled to be
-  //   B #32
-
-  uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.Address);
-  uint32_t *CurAddress = FirstAddress + 1;
-  if (Enable) {
-    *CurAddress = uint32_t(PatchOpcodes::PO_LdrW0_12);
-    CurAddress++;
-    *CurAddress = uint32_t(PatchOpcodes::PO_LdrX16_12);
-    CurAddress++;
-    *CurAddress = uint32_t(PatchOpcodes::PO_BlrX16);
-    CurAddress++;
-    *CurAddress = FuncId;
-    CurAddress++;
-    *reinterpret_cast<void (**)()>(CurAddress) = TracingHook;
-    CurAddress += 2;
-    *CurAddress = uint32_t(PatchOpcodes::PO_LdpX0X30SP_16);
-    CurAddress++;
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
-        uint32_t(PatchOpcodes::PO_StpX0X30SP_m16e), std::memory_order_release);
-  } else {
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
-        uint32_t(PatchOpcodes::PO_B32), std::memory_order_release);
-  }
-  __clear_cache(reinterpret_cast<char *>(FirstAddress),
-                reinterpret_cast<char *>(CurAddress));
-  return true;
-}
-
-bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
-                        const XRaySledEntry &Sled,
-                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT {
-  return patchSled(Enable, FuncId, Sled, Trampoline);
-}
-
-bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
-                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
-}
-
-bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
-                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit);
-}
-
-bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
-                      const XRaySledEntry &Sled)
-    XRAY_NEVER_INSTRUMENT { // FIXME: Implement in aarch64?
-  return false;
-}
-
-bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
-                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // FIXME: Implement in aarch64?
-  return false;
-}
-
-// FIXME: Maybe implement this better?
-bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
-
-} // namespace __xray
-
-extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
-  // FIXME: this will have to be implemented in the trampoline assembly file
-}
diff --git a/lib/xray/xray_AArch64.cpp b/lib/xray/xray_AArch64.cpp
new file mode 100644
index 000000000000..081941b70375
--- /dev/null
+++ b/lib/xray/xray_AArch64.cpp
@@ -0,0 +1,127 @@
+//===-- xray_AArch64.cpp ----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Implementation of AArch64-specific routines (64-bit).
+//
+//===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray_defs.h"
+#include "xray_interface_internal.h"
+#include <atomic>
+#include <cassert>
+
+extern "C" void __clear_cache(void *start, void *end);
+
+namespace __xray {
+
+// The machine codes for some instructions used in runtime patching.
+enum class PatchOpcodes : uint32_t {
+  PO_StpX0X30SP_m16e = 0xA9BF7BE0, // STP X0, X30, [SP, #-16]!
+  PO_LdrW0_12 = 0x18000060,        // LDR W0, #12
+  PO_LdrX16_12 = 0x58000070,       // LDR X16, #12
+  PO_BlrX16 = 0xD63F0200,          // BLR X16
+  PO_LdpX0X30SP_16 = 0xA8C17BE0,   // LDP X0, X30, [SP], #16
+  PO_B32 = 0x14000008              // B #32
+};
+
+inline static bool patchSled(const bool Enable, const uint32_t FuncId,
+                             const XRaySledEntry &Sled,
+                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT {
+  // When |Enable| == true,
+  // We replace the following compile-time stub (sled):
+  //
+  // xray_sled_n:
+  //   B #32
+  //   7 NOPs (28 bytes)
+  //
+  // With the following runtime patch:
+  //
+  // xray_sled_n:
+  //   STP X0, X30, [SP, #-16]! ; PUSH {x0, x30}
+  //   LDR W0, #12 ; W0 := function ID
+  //   LDR X16,#12 ; X16 := address of the trampoline
+  //   BLR X16
+  //   ;DATA: 32 bits of function ID
+  //   ;DATA: lower 32 bits of the address of the trampoline
+  //   ;DATA: higher 32 bits of the address of the trampoline
+  //   LDP X0, X30, [SP], #16 ; POP {x0, x30}
+  //
+  // Replacement of the first 4-byte instruction should be the last and atomic
+  // operation, so that the user code which reaches the sled concurrently
+  // either jumps over the whole sled, or executes the whole sled when the
+  // latter is ready.
+  //
+  // When |Enable|==false, we set back the first instruction in the sled to be
+  //   B #32
+
+  uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.Address);
+  uint32_t *CurAddress = FirstAddress + 1;
+  if (Enable) {
+    *CurAddress = uint32_t(PatchOpcodes::PO_LdrW0_12);
+    CurAddress++;
+    *CurAddress = uint32_t(PatchOpcodes::PO_LdrX16_12);
+    CurAddress++;
+    *CurAddress = uint32_t(PatchOpcodes::PO_BlrX16);
+    CurAddress++;
+    *CurAddress = FuncId;
+    CurAddress++;
+    *reinterpret_cast<void (**)()>(CurAddress) = TracingHook;
+    CurAddress += 2;
+    *CurAddress = uint32_t(PatchOpcodes::PO_LdpX0X30SP_16);
+    CurAddress++;
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
+        uint32_t(PatchOpcodes::PO_StpX0X30SP_m16e), std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
+        uint32_t(PatchOpcodes::PO_B32), std::memory_order_release);
+  }
+  __clear_cache(reinterpret_cast<char *>(FirstAddress),
+                reinterpret_cast<char *>(CurAddress));
+  return true;
+}
+
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
+                        const XRaySledEntry &Sled,
+                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT {
+  return patchSled(Enable, FuncId, Sled, Trampoline);
+}
+
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
+                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
+}
+
+bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
+                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit);
+}
+
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled)
+    XRAY_NEVER_INSTRUMENT { // FIXME: Implement in aarch64?
+  return false;
+}
+
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in aarch64?
+  return false;
+}
+
+// FIXME: Maybe implement this better?
+bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
+
+} // namespace __xray
+
+extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
+  // FIXME: this will have to be implemented in the trampoline assembly file
+}
diff --git a/lib/xray/xray_arm.cc b/lib/xray/xray_arm.cc
deleted file mode 100644
index db26efaa782a..000000000000
--- a/lib/xray/xray_arm.cc
+++ /dev/null
@@ -1,164 +0,0 @@
-//===-- xray_arm.cc ---------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// Implementation of ARM-specific routines (32-bit).
-//
-//===----------------------------------------------------------------------===//
-#include "sanitizer_common/sanitizer_common.h"
-#include "xray_defs.h"
-#include "xray_interface_internal.h"
-#include <atomic>
-#include <cassert>
-
-extern "C" void __clear_cache(void *start, void *end);
-
-namespace __xray {
-
-// The machine codes for some instructions used in runtime patching.
-enum class PatchOpcodes : uint32_t {
-  PO_PushR0Lr = 0xE92D4001, // PUSH {r0, lr}
-  PO_BlxIp = 0xE12FFF3C,    // BLX ip
-  PO_PopR0Lr = 0xE8BD4001,  // POP {r0, lr}
-  PO_B20 = 0xEA000005       // B #20
-};
-
-// 0xUUUUWXYZ -> 0x000W0XYZ
-inline static uint32_t getMovwMask(const uint32_t Value) XRAY_NEVER_INSTRUMENT {
-  return (Value & 0xfff) | ((Value & 0xf000) << 4);
-}
-
-// 0xWXYZUUUU -> 0x000W0XYZ
-inline static uint32_t getMovtMask(const uint32_t Value) XRAY_NEVER_INSTRUMENT {
-  return getMovwMask(Value >> 16);
-}
-
-// Writes the following instructions:
-//   MOVW R<regNo>, #<lower 16 bits of the |Value|>
-//   MOVT R<regNo>, #<higher 16 bits of the |Value|>
-inline static uint32_t *
-write32bitLoadReg(uint8_t regNo, uint32_t *Address,
-                  const uint32_t Value) XRAY_NEVER_INSTRUMENT {
-  // This is a fatal error: we cannot just report it and continue execution.
-  assert(regNo <= 15 && "Register number must be 0 to 15.");
-  // MOVW R, #0xWXYZ in machine code is 0xE30WRXYZ
-  *Address = (0xE3000000 | (uint32_t(regNo) << 12) | getMovwMask(Value));
-  Address++;
-  // MOVT R, #0xWXYZ in machine code is 0xE34WRXYZ
-  *Address = (0xE3400000 | (uint32_t(regNo) << 12) | getMovtMask(Value));
-  return Address + 1;
-}
-
-// Writes the following instructions:
-//   MOVW r0, #<lower 16 bits of the |Value|>
-//   MOVT r0, #<higher 16 bits of the |Value|>
-inline static uint32_t *
-write32bitLoadR0(uint32_t *Address,
-                 const uint32_t Value) XRAY_NEVER_INSTRUMENT {
-  return write32bitLoadReg(0, Address, Value);
-}
-
-// Writes the following instructions:
-//   MOVW ip, #<lower 16 bits of the |Value|>
-//   MOVT ip, #<higher 16 bits of the |Value|>
-inline static uint32_t *
-write32bitLoadIP(uint32_t *Address,
-                 const uint32_t Value) XRAY_NEVER_INSTRUMENT {
-  return write32bitLoadReg(12, Address, Value);
-}
-
-inline static bool patchSled(const bool Enable, const uint32_t FuncId,
-                             const XRaySledEntry &Sled,
-                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT {
-  // When |Enable| == true,
-  // We replace the following compile-time stub (sled):
-  //
-  // xray_sled_n:
-  //   B #20
-  //   6 NOPs (24 bytes)
-  //
-  // With the following runtime patch:
-  //
-  // xray_sled_n:
-  //   PUSH {r0, lr}
-  //   MOVW r0, #<lower 16 bits of function ID>
-  //   MOVT r0, #<higher 16 bits of function ID>
-  //   MOVW ip, #<lower 16 bits of address of TracingHook>
-  //   MOVT ip, #<higher 16 bits of address of TracingHook>
-  //   BLX ip
-  //   POP {r0, lr}
-  //
-  // Replacement of the first 4-byte instruction should be the last and atomic
-  // operation, so that the user code which reaches the sled concurrently
-  // either jumps over the whole sled, or executes the whole sled when the
-  // latter is ready.
-  //
-  // When |Enable|==false, we set back the first instruction in the sled to be
-  //   B #20
-
-  uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.Address);
-  uint32_t *CurAddress = FirstAddress + 1;
-  if (Enable) {
-    CurAddress =
-        write32bitLoadR0(CurAddress, reinterpret_cast<uint32_t>(FuncId));
-    CurAddress =
-        write32bitLoadIP(CurAddress, reinterpret_cast<uint32_t>(TracingHook));
-    *CurAddress = uint32_t(PatchOpcodes::PO_BlxIp);
-    CurAddress++;
-    *CurAddress = uint32_t(PatchOpcodes::PO_PopR0Lr);
-    CurAddress++;
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
-        uint32_t(PatchOpcodes::PO_PushR0Lr), std::memory_order_release);
-  } else {
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
-        uint32_t(PatchOpcodes::PO_B20), std::memory_order_release);
-  }
-  __clear_cache(reinterpret_cast<char *>(FirstAddress),
-                reinterpret_cast<char *>(CurAddress));
-  return true;
-}
-
-bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
-                        const XRaySledEntry &Sled,
-                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT {
-  return patchSled(Enable, FuncId, Sled, Trampoline);
-}
-
-bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
-                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
-}
-
-bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
-                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit);
-}
-
-bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
-                      const XRaySledEntry &Sled)
-    XRAY_NEVER_INSTRUMENT { // FIXME: Implement in arm?
-  return false;
-}
-
-bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
-                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // FIXME: Implement in arm?
-  return false;
-}
-
-// FIXME: Maybe implement this better?
-bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
-
-} // namespace __xray
-
-extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
-  // FIXME: this will have to be implemented in the trampoline assembly file
-}
diff --git a/lib/xray/xray_arm.cpp b/lib/xray/xray_arm.cpp
new file mode 100644
index 000000000000..9ad8065eb886
--- /dev/null
+++ b/lib/xray/xray_arm.cpp
@@ -0,0 +1,164 @@
+//===-- xray_arm.cpp --------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Implementation of ARM-specific routines (32-bit).
+//
+//===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray_defs.h"
+#include "xray_interface_internal.h"
+#include <atomic>
+#include <cassert>
+
+extern "C" void __clear_cache(void *start, void *end);
+
+namespace __xray {
+
+// The machine codes for some instructions used in runtime patching.
+enum class PatchOpcodes : uint32_t {
+  PO_PushR0Lr = 0xE92D4001, // PUSH {r0, lr}
+  PO_BlxIp = 0xE12FFF3C,    // BLX ip
+  PO_PopR0Lr = 0xE8BD4001,  // POP {r0, lr}
+  PO_B20 = 0xEA000005       // B #20
+};
+
+// 0xUUUUWXYZ -> 0x000W0XYZ
+inline static uint32_t getMovwMask(const uint32_t Value) XRAY_NEVER_INSTRUMENT {
+  return (Value & 0xfff) | ((Value & 0xf000) << 4);
+}
+
+// 0xWXYZUUUU -> 0x000W0XYZ
+inline static uint32_t getMovtMask(const uint32_t Value) XRAY_NEVER_INSTRUMENT {
+  return getMovwMask(Value >> 16);
+}
+
+// Writes the following instructions:
+//   MOVW R<regNo>, #<lower 16 bits of the |Value|>
+//   MOVT R<regNo>, #<higher 16 bits of the |Value|>
+inline static uint32_t *
+write32bitLoadReg(uint8_t regNo, uint32_t *Address,
+                  const uint32_t Value) XRAY_NEVER_INSTRUMENT {
+  // This is a fatal error: we cannot just report it and continue execution.
+  assert(regNo <= 15 && "Register number must be 0 to 15.");
+  // MOVW R, #0xWXYZ in machine code is 0xE30WRXYZ
+  *Address = (0xE3000000 | (uint32_t(regNo) << 12) | getMovwMask(Value));
+  Address++;
+  // MOVT R, #0xWXYZ in machine code is 0xE34WRXYZ
+  *Address = (0xE3400000 | (uint32_t(regNo) << 12) | getMovtMask(Value));
+  return Address + 1;
+}
+
+// Writes the following instructions:
+//   MOVW r0, #<lower 16 bits of the |Value|>
+//   MOVT r0, #<higher 16 bits of the |Value|>
+inline static uint32_t *
+write32bitLoadR0(uint32_t *Address,
+                 const uint32_t Value) XRAY_NEVER_INSTRUMENT {
+  return write32bitLoadReg(0, Address, Value);
+}
+
+// Writes the following instructions:
+//   MOVW ip, #<lower 16 bits of the |Value|>
+//   MOVT ip, #<higher 16 bits of the |Value|>
+inline static uint32_t *
+write32bitLoadIP(uint32_t *Address,
+                 const uint32_t Value) XRAY_NEVER_INSTRUMENT {
+  return write32bitLoadReg(12, Address, Value);
+}
+
+inline static bool patchSled(const bool Enable, const uint32_t FuncId,
+                             const XRaySledEntry &Sled,
+                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT {
+  // When |Enable| == true,
+  // We replace the following compile-time stub (sled):
+  //
+  // xray_sled_n:
+  //   B #20
+  //   6 NOPs (24 bytes)
+  //
+  // With the following runtime patch:
+  //
+  // xray_sled_n:
+  //   PUSH {r0, lr}
+  //   MOVW r0, #<lower 16 bits of function ID>
+  //   MOVT r0, #<higher 16 bits of function ID>
+  //   MOVW ip, #<lower 16 bits of address of TracingHook>
+  //   MOVT ip, #<higher 16 bits of address of TracingHook>
+  //   BLX ip
+  //   POP {r0, lr}
+  //
+  // Replacement of the first 4-byte instruction should be the last and atomic
+  // operation, so that the user code which reaches the sled concurrently
+  // either jumps over the whole sled, or executes the whole sled when the
+  // latter is ready.
+  //
+  // When |Enable|==false, we set back the first instruction in the sled to be
+  //   B #20
+
+  uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.Address);
+  uint32_t *CurAddress = FirstAddress + 1;
+  if (Enable) {
+    CurAddress =
+        write32bitLoadR0(CurAddress, reinterpret_cast<uint32_t>(FuncId));
+    CurAddress =
+        write32bitLoadIP(CurAddress, reinterpret_cast<uint32_t>(TracingHook));
+    *CurAddress = uint32_t(PatchOpcodes::PO_BlxIp);
+    CurAddress++;
+    *CurAddress = uint32_t(PatchOpcodes::PO_PopR0Lr);
+    CurAddress++;
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
+        uint32_t(PatchOpcodes::PO_PushR0Lr), std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress),
+        uint32_t(PatchOpcodes::PO_B20), std::memory_order_release);
+  }
+  __clear_cache(reinterpret_cast<char *>(FirstAddress),
+                reinterpret_cast<char *>(CurAddress));
+  return true;
+}
+
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
+                        const XRaySledEntry &Sled,
+                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT {
+  return patchSled(Enable, FuncId, Sled, Trampoline);
+}
+
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
+                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
+}
+
+bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
+                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit);
+}
+
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled)
+    XRAY_NEVER_INSTRUMENT { // FIXME: Implement in arm?
+  return false;
+}
+
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in arm?
+  return false;
+}
+
+// FIXME: Maybe implement this better?
+bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
+
+} // namespace __xray
+
+extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
+  // FIXME: this will have to be implemented in the trampoline assembly file
+}
diff --git a/lib/xray/xray_basic_flags.cc b/lib/xray/xray_basic_flags.cc
deleted file mode 100644
index 75b674c85656..000000000000
--- a/lib/xray/xray_basic_flags.cc
+++ /dev/null
@@ -1,49 +0,0 @@
-//===-- xray_basic_flags.cc -------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// XRay Basic flag parsing logic.
-//===----------------------------------------------------------------------===//
-
-#include "xray_basic_flags.h"
-#include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_flag_parser.h"
-#include "sanitizer_common/sanitizer_libc.h"
-#include "xray_defs.h"
-
-using namespace __sanitizer;
-
-namespace __xray {
-
-/// Use via basicFlags().
-BasicFlags xray_basic_flags_dont_use_directly;
-
-void BasicFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
-#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
-#include "xray_basic_flags.inc"
-#undef XRAY_FLAG
-}
-
-void registerXRayBasicFlags(FlagParser *P,
-                            BasicFlags *F) XRAY_NEVER_INSTRUMENT {
-#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \
-  RegisterFlag(P, #Name, Description, &F->Name);
-#include "xray_basic_flags.inc"
-#undef XRAY_FLAG
-}
-
-const char *useCompilerDefinedBasicFlags() XRAY_NEVER_INSTRUMENT {
-#ifdef XRAY_BASIC_OPTIONS
-  return SANITIZER_STRINGIFY(XRAY_BASIC_OPTIONS);
-#else
-  return "";
-#endif
-}
-
-} // namespace __xray
diff --git a/lib/xray/xray_basic_flags.cpp b/lib/xray/xray_basic_flags.cpp
new file mode 100644
index 000000000000..e0a5e7bb29ee
--- /dev/null
+++ b/lib/xray/xray_basic_flags.cpp
@@ -0,0 +1,49 @@
+//===-- xray_basic_flags.cpp ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay Basic flag parsing logic.
+//===----------------------------------------------------------------------===//
+
+#include "xray_basic_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray_defs.h"
+
+using namespace __sanitizer;
+
+namespace __xray {
+
+/// Use via basicFlags().
+BasicFlags xray_basic_flags_dont_use_directly;
+
+void BasicFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "xray_basic_flags.inc"
+#undef XRAY_FLAG
+}
+
+void registerXRayBasicFlags(FlagParser *P,
+                            BasicFlags *F) XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \
+  RegisterFlag(P, #Name, Description, &F->Name);
+#include "xray_basic_flags.inc"
+#undef XRAY_FLAG
+}
+
+const char *useCompilerDefinedBasicFlags() XRAY_NEVER_INSTRUMENT {
+#ifdef XRAY_BASIC_OPTIONS
+  return SANITIZER_STRINGIFY(XRAY_BASIC_OPTIONS);
+#else
+  return "";
+#endif
+}
+
+} // namespace __xray
diff --git a/lib/xray/xray_basic_logging.cc b/lib/xray/xray_basic_logging.cc
deleted file mode 100644
index 553041ce0c31..000000000000
--- a/lib/xray/xray_basic_logging.cc
+++ /dev/null
@@ -1,515 +0,0 @@
-//===-- xray_basic_logging.cc -----------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// Implementation of a simple in-memory log of XRay events. This defines a
-// logging function that's compatible with the XRay handler interface, and
-// routines for exporting data to files.
-//
-//===----------------------------------------------------------------------===//
-
-#include <errno.h>
-#include <fcntl.h>
-#include <pthread.h>
-#include <sys/stat.h>
-#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC
-#include <sys/syscall.h>
-#endif
-#include <sys/types.h>
-#include <time.h>
-#include <unistd.h>
-
-#include "sanitizer_common/sanitizer_allocator_internal.h"
-#include "sanitizer_common/sanitizer_libc.h"
-#include "xray/xray_records.h"
-#include "xray_recursion_guard.h"
-#include "xray_basic_flags.h"
-#include "xray_basic_logging.h"
-#include "xray_defs.h"
-#include "xray_flags.h"
-#include "xray_interface_internal.h"
-#include "xray_tsc.h"
-#include "xray_utils.h"
-
-namespace __xray {
-
-static SpinMutex LogMutex;
-
-namespace {
-// We use elements of this type to record the entry TSC of every function ID we
-// see as we're tracing a particular thread's execution.
-struct alignas(16) StackEntry {
-  int32_t FuncId;
-  uint16_t Type;
-  uint8_t CPU;
-  uint8_t Padding;
-  uint64_t TSC;
-};
-
-static_assert(sizeof(StackEntry) == 16, "Wrong size for StackEntry");
-
-struct XRAY_TLS_ALIGNAS(64) ThreadLocalData {
-  void *InMemoryBuffer = nullptr;
-  size_t BufferSize = 0;
-  size_t BufferOffset = 0;
-  void *ShadowStack = nullptr;
-  size_t StackSize = 0;
-  size_t StackEntries = 0;
-  __xray::LogWriter *LogWriter = nullptr;
-};
-
-struct BasicLoggingOptions {
-  int DurationFilterMicros = 0;
-  size_t MaxStackDepth = 0;
-  size_t ThreadBufferSize = 0;
-};
-} // namespace
-
-static pthread_key_t PThreadKey;
-
-static atomic_uint8_t BasicInitialized{0};
-
-struct BasicLoggingOptions GlobalOptions;
-
-thread_local atomic_uint8_t Guard{0};
-
-static atomic_uint8_t UseRealTSC{0};
-static atomic_uint64_t ThresholdTicks{0};
-static atomic_uint64_t TicksPerSec{0};
-static atomic_uint64_t CycleFrequency{NanosecondsPerSecond};
-
-static LogWriter *getLog() XRAY_NEVER_INSTRUMENT {
-  LogWriter* LW = LogWriter::Open();
-  if (LW == nullptr)
-    return LW;
-
-  static pthread_once_t DetectOnce = PTHREAD_ONCE_INIT;
-  pthread_once(&DetectOnce, +[] {
-    if (atomic_load(&UseRealTSC, memory_order_acquire))
-      atomic_store(&CycleFrequency, getTSCFrequency(), memory_order_release);
-  });
-
-  // Since we're here, we get to write the header. We set it up so that the
-  // header will only be written once, at the start, and let the threads
-  // logging do writes which just append.
-  XRayFileHeader Header;
-  // Version 2 includes tail exit records.
-  // Version 3 includes pid inside records.
-  Header.Version = 3;
-  Header.Type = FileTypes::NAIVE_LOG;
-  Header.CycleFrequency = atomic_load(&CycleFrequency, memory_order_acquire);
-
-  // FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc'
-  // before setting the values in the header.
-  Header.ConstantTSC = 1;
-  Header.NonstopTSC = 1;
-  LW->WriteAll(reinterpret_cast<char *>(&Header),
-               reinterpret_cast<char *>(&Header) + sizeof(Header));
-  return LW;
-}
-
-static LogWriter *getGlobalLog() XRAY_NEVER_INSTRUMENT {
-  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
-  static LogWriter *LW = nullptr;
-  pthread_once(&OnceInit, +[] { LW = getLog(); });
-  return LW;
-}
-
-static ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
-  thread_local ThreadLocalData TLD;
-  thread_local bool UNUSED TOnce = [] {
-    if (GlobalOptions.ThreadBufferSize == 0) {
-      if (Verbosity())
-        Report("Not initializing TLD since ThreadBufferSize == 0.\n");
-      return false;
-    }
-    pthread_setspecific(PThreadKey, &TLD);
-    TLD.LogWriter = getGlobalLog();
-    TLD.InMemoryBuffer = reinterpret_cast<XRayRecord *>(
-        InternalAlloc(sizeof(XRayRecord) * GlobalOptions.ThreadBufferSize,
-                      nullptr, alignof(XRayRecord)));
-    TLD.BufferSize = GlobalOptions.ThreadBufferSize;
-    TLD.BufferOffset = 0;
-    if (GlobalOptions.MaxStackDepth == 0) {
-      if (Verbosity())
-        Report("Not initializing the ShadowStack since MaxStackDepth == 0.\n");
-      TLD.StackSize = 0;
-      TLD.StackEntries = 0;
-      TLD.ShadowStack = nullptr;
-      return false;
-    }
-    TLD.ShadowStack = reinterpret_cast<StackEntry *>(
-        InternalAlloc(sizeof(StackEntry) * GlobalOptions.MaxStackDepth, nullptr,
-                      alignof(StackEntry)));
-    TLD.StackSize = GlobalOptions.MaxStackDepth;
-    TLD.StackEntries = 0;
-    return false;
-  }();
-  return TLD;
-}
-
-template <class RDTSC>
-void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
-                    RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT {
-  auto &TLD = getThreadLocalData();
-  LogWriter *LW = getGlobalLog();
-  if (LW == nullptr)
-    return;
-
-  // Use a simple recursion guard, to handle cases where we're already logging
-  // and for one reason or another, this function gets called again in the same
-  // thread.
-  RecursionGuard G(Guard);
-  if (!G)
-    return;
-
-  uint8_t CPU = 0;
-  uint64_t TSC = ReadTSC(CPU);
-
-  switch (Type) {
-  case XRayEntryType::ENTRY:
-  case XRayEntryType::LOG_ARGS_ENTRY: {
-    // Short circuit if we've reached the maximum depth of the stack.
-    if (TLD.StackEntries++ >= TLD.StackSize)
-      return;
-
-    // When we encounter an entry event, we keep track of the TSC and the CPU,
-    // and put it in the stack.
-    StackEntry E;
-    E.FuncId = FuncId;
-    E.CPU = CPU;
-    E.Type = Type;
-    E.TSC = TSC;
-    auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) +
-                         (sizeof(StackEntry) * (TLD.StackEntries - 1));
-    internal_memcpy(StackEntryPtr, &E, sizeof(StackEntry));
-    break;
-  }
-  case XRayEntryType::EXIT:
-  case XRayEntryType::TAIL: {
-    if (TLD.StackEntries == 0)
-      break;
-
-    if (--TLD.StackEntries >= TLD.StackSize)
-      return;
-
-    // When we encounter an exit event, we check whether all the following are
-    // true:
-    //
-    // - The Function ID is the same as the most recent entry in the stack.
-    // - The CPU is the same as the most recent entry in the stack.
-    // - The Delta of the TSCs is less than the threshold amount of time we're
-    //   looking to record.
-    //
-    // If all of these conditions are true, we pop the stack and don't write a
-    // record and move the record offset back.
-    StackEntry StackTop;
-    auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) +
-                         (sizeof(StackEntry) * TLD.StackEntries);
-    internal_memcpy(&StackTop, StackEntryPtr, sizeof(StackEntry));
-    if (StackTop.FuncId == FuncId && StackTop.CPU == CPU &&
-        StackTop.TSC < TSC) {
-      auto Delta = TSC - StackTop.TSC;
-      if (Delta < atomic_load(&ThresholdTicks, memory_order_relaxed)) {
-        DCHECK(TLD.BufferOffset > 0);
-        TLD.BufferOffset -= StackTop.Type == XRayEntryType::ENTRY ? 1 : 2;
-        return;
-      }
-    }
-    break;
-  }
-  default:
-    // Should be unreachable.
-    DCHECK(false && "Unsupported XRayEntryType encountered.");
-    break;
-  }
-
-  // First determine whether the delta between the function's enter record and
-  // the exit record is higher than the threshold.
-  XRayRecord R;
-  R.RecordType = RecordTypes::NORMAL;
-  R.CPU = CPU;
-  R.TSC = TSC;
-  R.TId = GetTid(); 
-  R.PId = internal_getpid(); 
-  R.Type = Type;
-  R.FuncId = FuncId;
-  auto FirstEntry = reinterpret_cast<XRayRecord *>(TLD.InMemoryBuffer);
-  internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R));
-  if (++TLD.BufferOffset == TLD.BufferSize) {
-    SpinMutexLock Lock(&LogMutex);
-    LW->WriteAll(reinterpret_cast<char *>(FirstEntry),
-                 reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
-    TLD.BufferOffset = 0;
-    TLD.StackEntries = 0;
-  }
-}
-
-template <class RDTSC>
-void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1,
-                           RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT {
-  auto &TLD = getThreadLocalData();
-  auto FirstEntry =
-      reinterpret_cast<XRayArgPayload *>(TLD.InMemoryBuffer);
-  const auto &BuffLen = TLD.BufferSize;
-  LogWriter *LW = getGlobalLog();
-  if (LW == nullptr)
-    return;
-
-  // First we check whether there's enough space to write the data consecutively
-  // in the thread-local buffer. If not, we first flush the buffer before
-  // attempting to write the two records that must be consecutive.
-  if (TLD.BufferOffset + 2 > BuffLen) {
-    SpinMutexLock Lock(&LogMutex);
-    LW->WriteAll(reinterpret_cast<char *>(FirstEntry),
-                 reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
-    TLD.BufferOffset = 0;
-    TLD.StackEntries = 0;
-  }
-
-  // Then we write the "we have an argument" record.
-  InMemoryRawLog(FuncId, Type, ReadTSC);
-
-  RecursionGuard G(Guard);
-  if (!G)
-    return;
-
-  // And, from here on write the arg payload.
-  XRayArgPayload R;
-  R.RecordType = RecordTypes::ARG_PAYLOAD;
-  R.FuncId = FuncId;
-  R.TId = GetTid(); 
-  R.PId = internal_getpid(); 
-  R.Arg = Arg1;
-  internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R));
-  if (++TLD.BufferOffset == BuffLen) {
-    SpinMutexLock Lock(&LogMutex);
-    LW->WriteAll(reinterpret_cast<char *>(FirstEntry),
-                 reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
-    TLD.BufferOffset = 0;
-    TLD.StackEntries = 0;
-  }
-}
-
-void basicLoggingHandleArg0RealTSC(int32_t FuncId,
-                                   XRayEntryType Type) XRAY_NEVER_INSTRUMENT {
-  InMemoryRawLog(FuncId, Type, readTSC);
-}
-
-void basicLoggingHandleArg0EmulateTSC(int32_t FuncId, XRayEntryType Type)
-    XRAY_NEVER_INSTRUMENT {
-  InMemoryRawLog(FuncId, Type, [](uint8_t &CPU) XRAY_NEVER_INSTRUMENT {
-    timespec TS;
-    int result = clock_gettime(CLOCK_REALTIME, &TS);
-    if (result != 0) {
-      Report("clock_gettimg(2) return %d, errno=%d.", result, int(errno));
-      TS = {0, 0};
-    }
-    CPU = 0;
-    return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec;
-  });
-}
-
-void basicLoggingHandleArg1RealTSC(int32_t FuncId, XRayEntryType Type,
-                                   uint64_t Arg1) XRAY_NEVER_INSTRUMENT {
-  InMemoryRawLogWithArg(FuncId, Type, Arg1, readTSC);
-}
-
-void basicLoggingHandleArg1EmulateTSC(int32_t FuncId, XRayEntryType Type,
-                                      uint64_t Arg1) XRAY_NEVER_INSTRUMENT {
-  InMemoryRawLogWithArg(
-      FuncId, Type, Arg1, [](uint8_t &CPU) XRAY_NEVER_INSTRUMENT {
-        timespec TS;
-        int result = clock_gettime(CLOCK_REALTIME, &TS);
-        if (result != 0) {
-          Report("clock_gettimg(2) return %d, errno=%d.", result, int(errno));
-          TS = {0, 0};
-        }
-        CPU = 0;
-        return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec;
-      });
-}
-
-static void TLDDestructor(void *P) XRAY_NEVER_INSTRUMENT {
-  ThreadLocalData &TLD = *reinterpret_cast<ThreadLocalData *>(P);
-  auto ExitGuard = at_scope_exit([&TLD] {
-    // Clean up dynamic resources.
-    if (TLD.InMemoryBuffer)
-      InternalFree(TLD.InMemoryBuffer);
-    if (TLD.ShadowStack)
-      InternalFree(TLD.ShadowStack);
-    if (Verbosity())
-      Report("Cleaned up log for TID: %d\n", GetTid());
-  });
-
-  if (TLD.LogWriter == nullptr || TLD.BufferOffset == 0) {
-    if (Verbosity())
-      Report("Skipping buffer for TID: %d; Offset = %llu\n", GetTid(),
-             TLD.BufferOffset);
-    return;
-  }
-
-  {
-    SpinMutexLock L(&LogMutex);
-    TLD.LogWriter->WriteAll(reinterpret_cast<char *>(TLD.InMemoryBuffer),
-                            reinterpret_cast<char *>(TLD.InMemoryBuffer) +
-                            (sizeof(XRayRecord) * TLD.BufferOffset));
-  }
-
-  // Because this thread's exit could be the last one trying to write to
-  // the file and that we're not able to close out the file properly, we
-  // sync instead and hope that the pending writes are flushed as the
-  // thread exits.
-  TLD.LogWriter->Flush();
-}
-
-XRayLogInitStatus basicLoggingInit(UNUSED size_t BufferSize,
-                                   UNUSED size_t BufferMax, void *Options,
-                                   size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
-  uint8_t Expected = 0;
-  if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 1,
-                                      memory_order_acq_rel)) {
-    if (Verbosity())
-      Report("Basic logging already initialized.\n");
-    return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
-  }
-
-  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
-  pthread_once(&OnceInit, +[] {
-    pthread_key_create(&PThreadKey, TLDDestructor);
-    atomic_store(&UseRealTSC, probeRequiredCPUFeatures(), memory_order_release);
-    // Initialize the global TicksPerSec value.
-    atomic_store(&TicksPerSec,
-                 probeRequiredCPUFeatures() ? getTSCFrequency()
-                                            : NanosecondsPerSecond,
-                 memory_order_release);
-    if (!atomic_load(&UseRealTSC, memory_order_relaxed) && Verbosity())
-      Report("WARNING: Required CPU features missing for XRay instrumentation, "
-             "using emulation instead.\n");
-  });
-
-  FlagParser P;
-  BasicFlags F;
-  F.setDefaults();
-  registerXRayBasicFlags(&P, &F);
-  P.ParseString(useCompilerDefinedBasicFlags());
-  auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS");
-  if (EnvOpts == nullptr)
-    EnvOpts = "";
-
-  P.ParseString(EnvOpts);
-
-  // If XRAY_BASIC_OPTIONS was not defined, then we use the deprecated options
-  // set through XRAY_OPTIONS instead.
-  if (internal_strlen(EnvOpts) == 0) {
-    F.func_duration_threshold_us =
-        flags()->xray_naive_log_func_duration_threshold_us;
-    F.max_stack_depth = flags()->xray_naive_log_max_stack_depth;
-    F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size;
-  }
-
-  P.ParseString(static_cast<const char *>(Options));
-  GlobalOptions.ThreadBufferSize = F.thread_buffer_size;
-  GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us;
-  GlobalOptions.MaxStackDepth = F.max_stack_depth;
-  *basicFlags() = F;
-
-  atomic_store(&ThresholdTicks,
-               atomic_load(&TicksPerSec, memory_order_acquire) *
-                   GlobalOptions.DurationFilterMicros / 1000000,
-               memory_order_release);
-  __xray_set_handler_arg1(atomic_load(&UseRealTSC, memory_order_acquire)
-                              ? basicLoggingHandleArg1RealTSC
-                              : basicLoggingHandleArg1EmulateTSC);
-  __xray_set_handler(atomic_load(&UseRealTSC, memory_order_acquire)
-                         ? basicLoggingHandleArg0RealTSC
-                         : basicLoggingHandleArg0EmulateTSC);
-
-  // TODO: Implement custom event and typed event handling support in Basic
-  // Mode.
-  __xray_remove_customevent_handler();
-  __xray_remove_typedevent_handler();
-
-  return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
-}
-
-XRayLogInitStatus basicLoggingFinalize() XRAY_NEVER_INSTRUMENT {
-  uint8_t Expected = 0;
-  if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 0,
-                                      memory_order_acq_rel) &&
-      Verbosity())
-    Report("Basic logging already finalized.\n");
-
-  // Nothing really to do aside from marking state of the global to be
-  // uninitialized.
-
-  return XRayLogInitStatus::XRAY_LOG_FINALIZED;
-}
-
-XRayLogFlushStatus basicLoggingFlush() XRAY_NEVER_INSTRUMENT {
-  // This really does nothing, since flushing the logs happen at the end of a
-  // thread's lifetime, or when the buffers are full.
-  return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
-}
-
-// This is a handler that, effectively, does nothing.
-void basicLoggingHandleArg0Empty(int32_t, XRayEntryType) XRAY_NEVER_INSTRUMENT {
-}
-
-bool basicLogDynamicInitializer() XRAY_NEVER_INSTRUMENT {
-  XRayLogImpl Impl{
-      basicLoggingInit,
-      basicLoggingFinalize,
-      basicLoggingHandleArg0Empty,
-      basicLoggingFlush,
-  };
-  auto RegistrationResult = __xray_log_register_mode("xray-basic", Impl);
-  if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK &&
-      Verbosity())
-    Report("Cannot register XRay Basic Mode to 'xray-basic'; error = %d\n",
-           RegistrationResult);
-  if (flags()->xray_naive_log ||
-      !internal_strcmp(flags()->xray_mode, "xray-basic")) {
-    auto SelectResult = __xray_log_select_mode("xray-basic");
-    if (SelectResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) {
-      if (Verbosity())
-        Report("Failed selecting XRay Basic Mode; error = %d\n", SelectResult);
-      return false;
-    }
-
-    // We initialize the implementation using the data we get from the
-    // XRAY_BASIC_OPTIONS environment variable, at this point of the
-    // implementation.
-    auto *Env = GetEnv("XRAY_BASIC_OPTIONS");
-    auto InitResult =
-        __xray_log_init_mode("xray-basic", Env == nullptr ? "" : Env);
-    if (InitResult != XRayLogInitStatus::XRAY_LOG_INITIALIZED) {
-      if (Verbosity())
-        Report("Failed initializing XRay Basic Mode; error = %d\n", InitResult);
-      return false;
-    }
-
-    // At this point we know that we've successfully initialized Basic mode
-    // tracing, and the only chance we're going to get for the current thread to
-    // clean-up may be at thread/program exit. To ensure that we're going to get
-    // the cleanup even without calling the finalization routines, we're
-    // registering a program exit function that will do the cleanup.
-    static pthread_once_t DynamicOnce = PTHREAD_ONCE_INIT;
-    pthread_once(&DynamicOnce, +[] {
-      static void *FakeTLD = nullptr;
-      FakeTLD = &getThreadLocalData();
-      Atexit(+[] { TLDDestructor(FakeTLD); });
-    });
-  }
-  return true;
-}
-
-} // namespace __xray
-
-static auto UNUSED Unused = __xray::basicLogDynamicInitializer();
diff --git a/lib/xray/xray_basic_logging.cpp b/lib/xray/xray_basic_logging.cpp
new file mode 100644
index 000000000000..6e8e93131451
--- /dev/null
+++ b/lib/xray/xray_basic_logging.cpp
@@ -0,0 +1,515 @@
+//===-- xray_basic_logging.cpp ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Implementation of a simple in-memory log of XRay events. This defines a
+// logging function that's compatible with the XRay handler interface, and
+// routines for exporting data to files.
+//
+//===----------------------------------------------------------------------===//
+
+#include <errno.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sys/stat.h>
+#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC
+#include <sys/syscall.h>
+#endif
+#include <sys/types.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray/xray_records.h"
+#include "xray_recursion_guard.h"
+#include "xray_basic_flags.h"
+#include "xray_basic_logging.h"
+#include "xray_defs.h"
+#include "xray_flags.h"
+#include "xray_interface_internal.h"
+#include "xray_tsc.h"
+#include "xray_utils.h"
+
+namespace __xray {
+
+static SpinMutex LogMutex;
+
+namespace {
+// One shadow-stack slot: remembers the function ID, entry type, CPU and TSC of
+// a function entry so the matching exit can be compared against it.
+struct alignas(16) StackEntry {
+  int32_t FuncId;
+  uint16_t Type; // Holds an XRayEntryType value.
+  uint8_t CPU;
+  uint8_t Padding;
+  uint64_t TSC;
+};
+
+static_assert(sizeof(StackEntry) == 16, "Wrong size for StackEntry");
+// Per-thread logging state; its fields are set up by getThreadLocalData().
+struct XRAY_TLS_ALIGNAS(64) ThreadLocalData {
+  void *InMemoryBuffer = nullptr; // Array of BufferSize XRayRecord slots.
+  size_t BufferSize = 0;          // Capacity of InMemoryBuffer, in records.
+  size_t BufferOffset = 0;        // Index of the next free record slot.
+  void *ShadowStack = nullptr;    // Array of StackSize StackEntry slots.
+  size_t StackSize = 0;           // Capacity of ShadowStack, in entries.
+  size_t StackEntries = 0;        // Tracked entry depth; may exceed StackSize.
+  __xray::LogWriter *LogWriter = nullptr;
+};
+// Settings copied by basicLoggingInit() from the parsed BasicFlags.
+struct BasicLoggingOptions {
+  int DurationFilterMicros = 0; // From func_duration_threshold_us.
+  size_t MaxStackDepth = 0;     // From max_stack_depth.
+  size_t ThreadBufferSize = 0;  // From thread_buffer_size.
+};
+} // namespace
+
+static pthread_key_t PThreadKey; // Created with TLDDestructor as destructor.
+// Flipped from 0 to 1 by the first basicLoggingInit() call that wins the CAS.
+static atomic_uint8_t BasicInitialized{0};
+// Values chosen by basicLoggingInit(); read when each thread sets up its TLD.
+struct BasicLoggingOptions GlobalOptions;
+// Per-thread flag backing the RecursionGuard used by the logging handlers.
+thread_local atomic_uint8_t Guard{0};
+// TSC-related state below is stored by basicLoggingInit() and getLog().
+static atomic_uint8_t UseRealTSC{0};      // probeRequiredCPUFeatures() result.
+static atomic_uint64_t ThresholdTicks{0}; // Duration filter, in ticks.
+static atomic_uint64_t TicksPerSec{0};
+static atomic_uint64_t CycleFrequency{NanosecondsPerSecond};
+
+static LogWriter *getLog() XRAY_NEVER_INSTRUMENT {
+  LogWriter* LW = LogWriter::Open();
+  if (LW == nullptr)
+    return LW;
+  // Query the TSC frequency at most once, and only when the real TSC is used.
+  static pthread_once_t DetectOnce = PTHREAD_ONCE_INIT;
+  pthread_once(&DetectOnce, +[] {
+    if (atomic_load(&UseRealTSC, memory_order_acquire))
+      atomic_store(&CycleFrequency, getTSCFrequency(), memory_order_release);
+  });
+
+  // Emit the file header right after opening the log. getGlobalLog() invokes
+  // this function under pthread_once, so the header is written a single time
+  // and every later write from the logging threads simply appends records.
+  XRayFileHeader Header;
+  // Version 2 includes tail exit records.
+  // Version 3 includes pid inside records.
+  Header.Version = 3;
+  Header.Type = FileTypes::NAIVE_LOG;
+  Header.CycleFrequency = atomic_load(&CycleFrequency, memory_order_acquire);
+
+  // FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc'
+  // before setting the values in the header.
+  Header.ConstantTSC = 1;
+  Header.NonstopTSC = 1;
+  LW->WriteAll(reinterpret_cast<char *>(&Header),
+               reinterpret_cast<char *>(&Header) + sizeof(Header));
+  return LW;
+}
+
+static LogWriter *getGlobalLog() XRAY_NEVER_INSTRUMENT {
+  static LogWriter *SharedWriter = nullptr; // Opened on first use, once.
+  static pthread_once_t OpenOnce = PTHREAD_ONCE_INIT;
+  pthread_once(&OpenOnce, +[] { SharedWriter = getLog(); });
+  return SharedWriter;
+}
+
+static ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT {
+  thread_local ThreadLocalData TLD;
+  thread_local bool UNUSED TOnce = [] { // Runs once per thread, on first use.
+    if (GlobalOptions.ThreadBufferSize == 0) {
+      if (Verbosity())
+        Report("Not initializing TLD since ThreadBufferSize == 0.\n");
+      return false; // TLD keeps its null buffers and null LogWriter.
+    }
+    pthread_setspecific(PThreadKey, &TLD); // Lets TLDDestructor see this TLD.
+    TLD.LogWriter = getGlobalLog();
+    TLD.InMemoryBuffer = reinterpret_cast<XRayRecord *>(
+        InternalAlloc(sizeof(XRayRecord) * GlobalOptions.ThreadBufferSize,
+                      nullptr, alignof(XRayRecord)));
+    TLD.BufferSize = GlobalOptions.ThreadBufferSize;
+    TLD.BufferOffset = 0;
+    if (GlobalOptions.MaxStackDepth == 0) {
+      if (Verbosity())
+        Report("Not initializing the ShadowStack since MaxStackDepth == 0.\n");
+      TLD.StackSize = 0;
+      TLD.StackEntries = 0;
+      TLD.ShadowStack = nullptr;
+      return false; // Record buffer is ready; there is just no shadow stack.
+    }
+    TLD.ShadowStack = reinterpret_cast<StackEntry *>(
+        InternalAlloc(sizeof(StackEntry) * GlobalOptions.MaxStackDepth, nullptr,
+                      alignof(StackEntry)));
+    TLD.StackSize = GlobalOptions.MaxStackDepth;
+    TLD.StackEntries = 0;
+    return false; // Every path returns false; TOnce itself is never read.
+  }();
+  return TLD;
+}
+
+template <class RDTSC>
+void InMemoryRawLog(int32_t FuncId, XRayEntryType Type,
+                    RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT {
+  auto &TLD = getThreadLocalData();
+  LogWriter *LW = getGlobalLog();
+  if (LW == nullptr)
+    return;
+
+  // Use a simple recursion guard, to handle cases where we're already logging
+  // and for one reason or another, this function gets called again in the same
+  // thread.
+  RecursionGuard G(Guard);
+  if (!G)
+    return;
+  // ReadTSC yields the timestamp and reports the CPU through its argument.
+  uint8_t CPU = 0;
+  uint64_t TSC = ReadTSC(CPU);
+
+  switch (Type) {
+  case XRayEntryType::ENTRY:
+  case XRayEntryType::LOG_ARGS_ENTRY: {
+    // Beyond the maximum stack depth we only count the entry and log nothing.
+    if (TLD.StackEntries++ >= TLD.StackSize)
+      return;
+
+    // When we encounter an entry event, we keep track of the TSC and the CPU,
+    // and put it in the stack.
+    StackEntry E;
+    E.FuncId = FuncId;
+    E.CPU = CPU;
+    E.Type = Type;
+    E.TSC = TSC;
+    auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) +
+                         (sizeof(StackEntry) * (TLD.StackEntries - 1));
+    internal_memcpy(StackEntryPtr, &E, sizeof(StackEntry));
+    break;
+  }
+  case XRayEntryType::EXIT:
+  case XRayEntryType::TAIL: {
+    if (TLD.StackEntries == 0)
+      break;
+    // Exits matching entries that were beyond the maximum depth log nothing.
+    if (--TLD.StackEntries >= TLD.StackSize)
+      return;
+
+    // When we encounter an exit event, we check whether all the following are
+    // true:
+    //
+    // - The Function ID is the same as the most recent entry in the stack.
+    // - The CPU is the same as the most recent entry in the stack.
+    // - The Delta of the TSCs is less than the threshold amount of time we're
+    //   looking to record.
+    //
+    // If all of these conditions are true, we drop the exit record and rewind
+    // the buffer offset (1 slot for ENTRY, 2 otherwise) to drop the entry too.
+    StackEntry StackTop;
+    auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) +
+                         (sizeof(StackEntry) * TLD.StackEntries);
+    internal_memcpy(&StackTop, StackEntryPtr, sizeof(StackEntry));
+    if (StackTop.FuncId == FuncId && StackTop.CPU == CPU &&
+        StackTop.TSC < TSC) {
+      auto Delta = TSC - StackTop.TSC;
+      if (Delta < atomic_load(&ThresholdTicks, memory_order_relaxed)) {
+        DCHECK(TLD.BufferOffset > 0);
+        TLD.BufferOffset -= StackTop.Type == XRayEntryType::ENTRY ? 1 : 2;
+        return;
+      }
+    }
+    break;
+  }
+  default:
+    // Should be unreachable.
+    DCHECK(false && "Unsupported XRayEntryType encountered.");
+    break;
+  }
+
+  // The event was not filtered out above: append its record to the thread's
+  // buffer, and write the whole buffer to the log once it becomes full.
+  XRayRecord R;
+  R.RecordType = RecordTypes::NORMAL;
+  R.CPU = CPU;
+  R.TSC = TSC;
+  R.TId = GetTid(); 
+  R.PId = internal_getpid(); 
+  R.Type = Type;
+  R.FuncId = FuncId;
+  auto FirstEntry = reinterpret_cast<XRayRecord *>(TLD.InMemoryBuffer);
+  internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R));
+  if (++TLD.BufferOffset == TLD.BufferSize) {
+    SpinMutexLock Lock(&LogMutex);
+    LW->WriteAll(reinterpret_cast<char *>(FirstEntry),
+                 reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
+    TLD.BufferOffset = 0;
+    TLD.StackEntries = 0;
+  }
+}
+
+template <class RDTSC>
+void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1,
+                           RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT {
+  auto &TLD = getThreadLocalData();
+  auto FirstEntry =
+      reinterpret_cast<XRayArgPayload *>(TLD.InMemoryBuffer);
+  const auto &BuffLen = TLD.BufferSize;
+  LogWriter *LW = getGlobalLog();
+  if (LW == nullptr)
+    return;
+
+  // First we check whether there's enough space to write the data consecutively
+  // in the thread-local buffer. If not, we first flush the buffer before
+  // attempting to write the two records that must be consecutive.
+  if (TLD.BufferOffset + 2 > BuffLen) {
+    SpinMutexLock Lock(&LogMutex);
+    LW->WriteAll(reinterpret_cast<char *>(FirstEntry),
+                 reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
+    TLD.BufferOffset = 0;
+    TLD.StackEntries = 0;
+  }
+
+  // Then we write the "we have an argument" record.
+  InMemoryRawLog(FuncId, Type, ReadTSC);
+  // NOTE(review): the payload is appended even if the call above wrote none.
+  RecursionGuard G(Guard);
+  if (!G)
+    return;
+  // FirstEntry indexes the XRayRecord buffer in XRayArgPayload-sized steps.
+  // And, from here on write the arg payload.
+  XRayArgPayload R;
+  R.RecordType = RecordTypes::ARG_PAYLOAD;
+  R.FuncId = FuncId;
+  R.TId = GetTid(); 
+  R.PId = internal_getpid(); 
+  R.Arg = Arg1;
+  internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R));
+  if (++TLD.BufferOffset == BuffLen) {
+    SpinMutexLock Lock(&LogMutex);
+    LW->WriteAll(reinterpret_cast<char *>(FirstEntry),
+                 reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset));
+    TLD.BufferOffset = 0;
+    TLD.StackEntries = 0;
+  }
+}
+
+// TSC stand-in for the emulation handlers: CLOCK_REALTIME in nanoseconds on
+// "CPU 0"; a clock_gettime failure is reported and yields timestamp 0.
+static uint64_t readWallClockAsTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT {
+  CPU = 0;
+  timespec TS;
+  int result = clock_gettime(CLOCK_REALTIME, &TS);
+  if (result == 0)
+    return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec;
+  Report("clock_gettime(2) returned %d, errno=%d.\n", result, int(errno));
+  return 0;
+}
+
+// Event handlers installed by basicLoggingInit(). Arg1 variants also log one
+// argument; EmulateTSC variants take timestamps from the wall clock.
+void basicLoggingHandleArg0RealTSC(int32_t FuncId,
+                                   XRayEntryType Type) XRAY_NEVER_INSTRUMENT {
+  InMemoryRawLog(FuncId, Type, readTSC);
+}
+
+void basicLoggingHandleArg0EmulateTSC(int32_t FuncId, XRayEntryType Type)
+    XRAY_NEVER_INSTRUMENT {
+  InMemoryRawLog(FuncId, Type, [](uint8_t &CPU) XRAY_NEVER_INSTRUMENT {
+    return readWallClockAsTSC(CPU);
+  });
+}
+
+void basicLoggingHandleArg1RealTSC(int32_t FuncId, XRayEntryType Type,
+                                   uint64_t Arg1) XRAY_NEVER_INSTRUMENT {
+  InMemoryRawLogWithArg(FuncId, Type, Arg1, readTSC);
+}
+
+void basicLoggingHandleArg1EmulateTSC(int32_t FuncId, XRayEntryType Type,
+                                      uint64_t Arg1) XRAY_NEVER_INSTRUMENT {
+  InMemoryRawLogWithArg(
+      FuncId, Type, Arg1, [](uint8_t &CPU) XRAY_NEVER_INSTRUMENT {
+        return readWallClockAsTSC(CPU);
+      });
+}
+
+static void TLDDestructor(void *P) XRAY_NEVER_INSTRUMENT {
+  ThreadLocalData &TLD = *reinterpret_cast<ThreadLocalData *>(P);
+  auto ExitGuard = at_scope_exit([&TLD] {
+    // Clean up dynamic resources.
+    if (TLD.InMemoryBuffer)
+      InternalFree(TLD.InMemoryBuffer);
+    if (TLD.ShadowStack)
+      InternalFree(TLD.ShadowStack);
+    if (Verbosity())
+      Report("Cleaned up log for TID: %d\n", GetTid());
+  });
+  // Nothing to write when the thread never got a writer or has no records.
+  if (TLD.LogWriter == nullptr || TLD.BufferOffset == 0) {
+    if (Verbosity())
+      Report("Skipping buffer for TID: %d; Offset = %llu\n", GetTid(),
+             TLD.BufferOffset);
+    return;
+  }
+  // Write out the records still buffered, while holding the shared LogMutex.
+  {
+    SpinMutexLock L(&LogMutex);
+    TLD.LogWriter->WriteAll(reinterpret_cast<char *>(TLD.InMemoryBuffer),
+                            reinterpret_cast<char *>(TLD.InMemoryBuffer) +
+                            (sizeof(XRayRecord) * TLD.BufferOffset));
+  }
+
+  // This thread may be the last one writing to the file, and we have no way to
+  // close the file properly from here, so we flush the writer instead and hope
+  // that the pending writes reach the file before the thread finishes
+  // exiting.
+  TLD.LogWriter->Flush();
+}
+
+XRayLogInitStatus basicLoggingInit(UNUSED size_t BufferSize,
+                                   UNUSED size_t BufferMax, void *Options,
+                                   size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
+  uint8_t Expected = 0;
+  if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 1,
+                                      memory_order_acq_rel)) {
+    if (Verbosity())
+      Report("Basic logging already initialized.\n");
+    return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+  }
+  // One-time setup: the TLS key and the TSC capability/frequency globals.
+  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
+  pthread_once(&OnceInit, +[] {
+    pthread_key_create(&PThreadKey, TLDDestructor);
+    atomic_store(&UseRealTSC, probeRequiredCPUFeatures(), memory_order_release);
+    // TicksPerSec is the TSC frequency, or NanosecondsPerSecond without it.
+    atomic_store(&TicksPerSec,
+                 probeRequiredCPUFeatures() ? getTSCFrequency()
+                                            : NanosecondsPerSecond,
+                 memory_order_release);
+    if (!atomic_load(&UseRealTSC, memory_order_relaxed) && Verbosity())
+      Report("WARNING: Required CPU features missing for XRay instrumentation, "
+             "using emulation instead.\n");
+  });
+  // Build the flags; sources are parsed in order, so later ones override.
+  FlagParser P;
+  BasicFlags F;
+  F.setDefaults();
+  registerXRayBasicFlags(&P, &F);
+  P.ParseString(useCompilerDefinedBasicFlags());
+  auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS");
+  if (EnvOpts == nullptr)
+    EnvOpts = "";
+
+  P.ParseString(EnvOpts);
+
+  // If XRAY_BASIC_OPTIONS was not defined (or is empty), then we use the
+  // deprecated options set through XRAY_OPTIONS instead.
+  if (internal_strlen(EnvOpts) == 0) {
+    F.func_duration_threshold_us =
+        flags()->xray_naive_log_func_duration_threshold_us;
+    F.max_stack_depth = flags()->xray_naive_log_max_stack_depth;
+    F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size;
+  }
+  // Finally apply the caller-supplied Options string on top of the above.
+  P.ParseString(static_cast<const char *>(Options));
+  GlobalOptions.ThreadBufferSize = F.thread_buffer_size;
+  GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us;
+  GlobalOptions.MaxStackDepth = F.max_stack_depth;
+  *basicFlags() = F;
+  // Convert the microsecond duration threshold into ticks via TicksPerSec.
+  atomic_store(&ThresholdTicks,
+               atomic_load(&TicksPerSec, memory_order_acquire) *
+                   GlobalOptions.DurationFilterMicros / 1000000,
+               memory_order_release);
+  __xray_set_handler_arg1(atomic_load(&UseRealTSC, memory_order_acquire)
+                              ? basicLoggingHandleArg1RealTSC
+                              : basicLoggingHandleArg1EmulateTSC);
+  __xray_set_handler(atomic_load(&UseRealTSC, memory_order_acquire)
+                         ? basicLoggingHandleArg0RealTSC
+                         : basicLoggingHandleArg0EmulateTSC);
+
+  // TODO: Implement custom event and typed event handling support in Basic
+  // Mode. Until then, both kinds of handlers are removed.
+  __xray_remove_customevent_handler();
+  __xray_remove_typedevent_handler();
+
+  return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+}
+
+XRayLogInitStatus basicLoggingFinalize() XRAY_NEVER_INSTRUMENT {
+  // basicLoggingInit() flips BasicInitialized 0 -> 1, so finalization must
+  // flip it 1 -> 0; a failed exchange means there was nothing to finalize.
+  uint8_t Expected = 1;
+  if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 0,
+                                      memory_order_acq_rel) &&
+      Verbosity())
+    Report("Basic logging already finalized.\n");
+
+  // Nothing else to do aside from marking the global state as uninitialized.
+  return XRayLogInitStatus::XRAY_LOG_FINALIZED;
+}
+
+XRayLogFlushStatus basicLoggingFlush() XRAY_NEVER_INSTRUMENT {
+  // Intentionally a no-op: flushing the logs happens at the end of a
+  // thread's lifetime, or when the buffers are full.
+  return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
+}
+
+// Placeholder handler that ignores both of its arguments and does nothing.
+void basicLoggingHandleArg0Empty(int32_t, XRayEntryType) XRAY_NEVER_INSTRUMENT {
+}
+
+bool basicLogDynamicInitializer() XRAY_NEVER_INSTRUMENT {
+  XRayLogImpl Impl{
+      basicLoggingInit,
+      basicLoggingFinalize,
+      basicLoggingHandleArg0Empty,
+      basicLoggingFlush,
+  };
+  auto RegistrationResult = __xray_log_register_mode("xray-basic", Impl);
+  if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK &&
+      Verbosity())
+    Report("Cannot register XRay Basic Mode to 'xray-basic'; error = %d\n",
+           RegistrationResult);
+  if (flags()->xray_naive_log ||
+      !internal_strcmp(flags()->xray_mode, "xray-basic")) {
+    auto SelectResult = __xray_log_select_mode("xray-basic");
+    if (SelectResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) {
+      if (Verbosity())
+        Report("Failed selecting XRay Basic Mode; error = %d\n", SelectResult);
+      return false;
+    }
+
+    // Initialize the implementation with the contents of the
+    // XRAY_BASIC_OPTIONS environment variable, or with an empty string when
+    // the variable is not set.
+    auto *Env = GetEnv("XRAY_BASIC_OPTIONS");
+    auto InitResult =
+        __xray_log_init_mode("xray-basic", Env == nullptr ? "" : Env);
+    if (InitResult != XRayLogInitStatus::XRAY_LOG_INITIALIZED) {
+      if (Verbosity())
+        Report("Failed initializing XRay Basic Mode; error = %d\n", InitResult);
+      return false;
+    }
+
+    // At this point we know that we've successfully initialized Basic mode
+    // tracing, and the only chance we're going to get for the current thread to
+    // clean up may be at thread/program exit. To ensure that we're going to get
+    // the cleanup even without calling the finalization routines, we're
+    // registering a program exit function that will do the cleanup.
+    static pthread_once_t DynamicOnce = PTHREAD_ONCE_INIT;
+    pthread_once(&DynamicOnce, +[] {
+      static void *FakeTLD = nullptr;
+      FakeTLD = &getThreadLocalData();
+      Atexit(+[] { TLDDestructor(FakeTLD); }); // write and flush at exit
+    });
+  }
+  return true;
+}
+
+} // namespace __xray
+
+static auto UNUSED Unused = __xray::basicLogDynamicInitializer();
diff --git a/lib/xray/xray_buffer_queue.cc b/lib/xray/xray_buffer_queue.cc
deleted file mode 100644
index 4cfa717de208..000000000000
--- a/lib/xray/xray_buffer_queue.cc
+++ /dev/null
@@ -1,237 +0,0 @@
-//===-- xray_buffer_queue.cc -----------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instruementation system.
-//
-// Defines the interface for a buffer queue implementation.
-//
-//===----------------------------------------------------------------------===//
-#include "xray_buffer_queue.h"
-#include "sanitizer_common/sanitizer_atomic.h"
-#include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_libc.h"
-#if !SANITIZER_FUCHSIA
-#include "sanitizer_common/sanitizer_posix.h"
-#endif
-#include "xray_allocator.h"
-#include "xray_defs.h"
-#include <memory>
-#include <sys/mman.h>
-
-using namespace __xray;
-
-namespace {
-
-BufferQueue::ControlBlock *allocControlBlock(size_t Size, size_t Count) {
-  auto B =
-      allocateBuffer((sizeof(BufferQueue::ControlBlock) - 1) + (Size * Count));
-  return B == nullptr ? nullptr
-                      : reinterpret_cast<BufferQueue::ControlBlock *>(B);
-}
-
-void deallocControlBlock(BufferQueue::ControlBlock *C, size_t Size,
-                         size_t Count) {
-  deallocateBuffer(reinterpret_cast<unsigned char *>(C),
-                   (sizeof(BufferQueue::ControlBlock) - 1) + (Size * Count));
-}
-
-void decRefCount(BufferQueue::ControlBlock *C, size_t Size, size_t Count) {
-  if (C == nullptr)
-    return;
-  if (atomic_fetch_sub(&C->RefCount, 1, memory_order_acq_rel) == 1)
-    deallocControlBlock(C, Size, Count);
-}
-
-void incRefCount(BufferQueue::ControlBlock *C) {
-  if (C == nullptr)
-    return;
-  atomic_fetch_add(&C->RefCount, 1, memory_order_acq_rel);
-}
-
-// We use a struct to ensure that we are allocating one atomic_uint64_t per
-// cache line. This allows us to not worry about false-sharing among atomic
-// objects being updated (constantly) by different threads.
-struct ExtentsPadded {
-  union {
-    atomic_uint64_t Extents;
-    unsigned char Storage[kCacheLineSize];
-  };
-};
-
-constexpr size_t kExtentsSize = sizeof(ExtentsPadded);
-
-} // namespace
-
-BufferQueue::ErrorCode BufferQueue::init(size_t BS, size_t BC) {
-  SpinMutexLock Guard(&Mutex);
-
-  if (!finalizing())
-    return BufferQueue::ErrorCode::AlreadyInitialized;
-
-  cleanupBuffers();
-
-  bool Success = false;
-  BufferSize = BS;
-  BufferCount = BC;
-
-  BackingStore = allocControlBlock(BufferSize, BufferCount);
-  if (BackingStore == nullptr)
-    return BufferQueue::ErrorCode::NotEnoughMemory;
-
-  auto CleanupBackingStore = at_scope_exit([&, this] {
-    if (Success)
-      return;
-    deallocControlBlock(BackingStore, BufferSize, BufferCount);
-    BackingStore = nullptr;
-  });
-
-  // Initialize enough atomic_uint64_t instances, each
-  ExtentsBackingStore = allocControlBlock(kExtentsSize, BufferCount);
-  if (ExtentsBackingStore == nullptr)
-    return BufferQueue::ErrorCode::NotEnoughMemory;
-
-  auto CleanupExtentsBackingStore = at_scope_exit([&, this] {
-    if (Success)
-      return;
-    deallocControlBlock(ExtentsBackingStore, kExtentsSize, BufferCount);
-    ExtentsBackingStore = nullptr;
-  });
-
-  Buffers = initArray<BufferRep>(BufferCount);
-  if (Buffers == nullptr)
-    return BufferQueue::ErrorCode::NotEnoughMemory;
-
-  // At this point we increment the generation number to associate the buffers
-  // to the new generation.
-  atomic_fetch_add(&Generation, 1, memory_order_acq_rel);
-
-  // First, we initialize the refcount in the ControlBlock, which we treat as
-  // being at the start of the BackingStore pointer.
-  atomic_store(&BackingStore->RefCount, 1, memory_order_release);
-  atomic_store(&ExtentsBackingStore->RefCount, 1, memory_order_release);
-
-  // Then we initialise the individual buffers that sub-divide the whole backing
-  // store. Each buffer will start at the `Data` member of the ControlBlock, and
-  // will be offsets from these locations.
-  for (size_t i = 0; i < BufferCount; ++i) {
-    auto &T = Buffers[i];
-    auto &Buf = T.Buff;
-    auto *E = reinterpret_cast<ExtentsPadded *>(&ExtentsBackingStore->Data +
-                                                (kExtentsSize * i));
-    Buf.Extents = &E->Extents;
-    atomic_store(Buf.Extents, 0, memory_order_release);
-    Buf.Generation = generation();
-    Buf.Data = &BackingStore->Data + (BufferSize * i);
-    Buf.Size = BufferSize;
-    Buf.BackingStore = BackingStore;
-    Buf.ExtentsBackingStore = ExtentsBackingStore;
-    Buf.Count = BufferCount;
-    T.Used = false;
-  }
-
-  Next = Buffers;
-  First = Buffers;
-  LiveBuffers = 0;
-  atomic_store(&Finalizing, 0, memory_order_release);
-  Success = true;
-  return BufferQueue::ErrorCode::Ok;
-}
-
-BufferQueue::BufferQueue(size_t B, size_t N,
-                         bool &Success) XRAY_NEVER_INSTRUMENT
-    : BufferSize(B),
-      BufferCount(N),
-      Mutex(),
-      Finalizing{1},
-      BackingStore(nullptr),
-      ExtentsBackingStore(nullptr),
-      Buffers(nullptr),
-      Next(Buffers),
-      First(Buffers),
-      LiveBuffers(0),
-      Generation{0} {
-  Success = init(B, N) == BufferQueue::ErrorCode::Ok;
-}
-
-BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) {
-  if (atomic_load(&Finalizing, memory_order_acquire))
-    return ErrorCode::QueueFinalizing;
-
-  BufferRep *B = nullptr;
-  {
-    SpinMutexLock Guard(&Mutex);
-    if (LiveBuffers == BufferCount)
-      return ErrorCode::NotEnoughMemory;
-    B = Next++;
-    if (Next == (Buffers + BufferCount))
-      Next = Buffers;
-    ++LiveBuffers;
-  }
-
-  incRefCount(BackingStore);
-  incRefCount(ExtentsBackingStore);
-  Buf = B->Buff;
-  Buf.Generation = generation();
-  B->Used = true;
-  return ErrorCode::Ok;
-}
-
-BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
-  // Check whether the buffer being referred to is within the bounds of the
-  // backing store's range.
-  BufferRep *B = nullptr;
-  {
-    SpinMutexLock Guard(&Mutex);
-    if (Buf.Generation != generation() || LiveBuffers == 0) {
-      Buf = {};
-      decRefCount(Buf.BackingStore, Buf.Size, Buf.Count);
-      decRefCount(Buf.ExtentsBackingStore, kExtentsSize, Buf.Count);
-      return BufferQueue::ErrorCode::Ok;
-    }
-
-    if (Buf.Data < &BackingStore->Data ||
-        Buf.Data > &BackingStore->Data + (BufferCount * BufferSize))
-      return BufferQueue::ErrorCode::UnrecognizedBuffer;
-
-    --LiveBuffers;
-    B = First++;
-    if (First == (Buffers + BufferCount))
-      First = Buffers;
-  }
-
-  // Now that the buffer has been released, we mark it as "used".
-  B->Buff = Buf;
-  B->Used = true;
-  decRefCount(Buf.BackingStore, Buf.Size, Buf.Count);
-  decRefCount(Buf.ExtentsBackingStore, kExtentsSize, Buf.Count);
-  atomic_store(B->Buff.Extents, atomic_load(Buf.Extents, memory_order_acquire),
-               memory_order_release);
-  Buf = {};
-  return ErrorCode::Ok;
-}
-
-BufferQueue::ErrorCode BufferQueue::finalize() {
-  if (atomic_exchange(&Finalizing, 1, memory_order_acq_rel))
-    return ErrorCode::QueueFinalizing;
-  return ErrorCode::Ok;
-}
-
-void BufferQueue::cleanupBuffers() {
-  for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
-    B->~BufferRep();
-  deallocateBuffer(Buffers, BufferCount);
-  decRefCount(BackingStore, BufferSize, BufferCount);
-  decRefCount(ExtentsBackingStore, kExtentsSize, BufferCount);
-  BackingStore = nullptr;
-  ExtentsBackingStore = nullptr;
-  Buffers = nullptr;
-  BufferCount = 0;
-  BufferSize = 0;
-}
-
-BufferQueue::~BufferQueue() { cleanupBuffers(); }
diff --git a/lib/xray/xray_buffer_queue.cpp b/lib/xray/xray_buffer_queue.cpp
new file mode 100644
index 000000000000..bad91e036cef
--- /dev/null
+++ b/lib/xray/xray_buffer_queue.cpp
@@ -0,0 +1,237 @@
+//===-- xray_buffer_queue.cpp ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Defines the interface for a buffer queue implementation.
+//
+//===----------------------------------------------------------------------===//
+#include "xray_buffer_queue.h"
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#if !SANITIZER_FUCHSIA
+#include "sanitizer_common/sanitizer_posix.h"
+#endif
+#include "xray_allocator.h"
+#include "xray_defs.h"
+#include <memory>
+#include <sys/mman.h>
+
+using namespace __xray;
+
+namespace {
+
+BufferQueue::ControlBlock *allocControlBlock(size_t Size, size_t Count) {
+  // Header (minus one byte) plus Count payloads of Size bytes; null on failure.
+  const size_t Bytes = (sizeof(BufferQueue::ControlBlock) - 1) + (Size * Count);
+  auto Raw = allocateBuffer(Bytes);
+  return Raw != nullptr ? reinterpret_cast<BufferQueue::ControlBlock *>(Raw)
+                        : nullptr;
+}
+
+void deallocControlBlock(BufferQueue::ControlBlock *C, size_t Size,
+                         size_t Count) { // same byte count as the allocation
+  const size_t Bytes = (sizeof(BufferQueue::ControlBlock) - 1) + (Size * Count);
+  deallocateBuffer(reinterpret_cast<unsigned char *>(C), Bytes);
+}
+
+void decRefCount(BufferQueue::ControlBlock *C, size_t Size, size_t Count) {
+  // Null blocks are ignored; whoever drops the last reference frees the block.
+  if (C != nullptr &&
+      atomic_fetch_sub(&C->RefCount, 1, memory_order_acq_rel) == 1)
+    deallocControlBlock(C, Size, Count);
+}
+
+void incRefCount(BufferQueue::ControlBlock *C) {
+  // Null blocks are ignored; otherwise take one more reference on the block.
+  if (C != nullptr)
+    atomic_fetch_add(&C->RefCount, 1, memory_order_acq_rel);
+}
+
+// We use a struct to ensure that we are allocating one atomic_uint64_t per
+// cache line. This allows us to not worry about false sharing among atomic
+// objects being updated (constantly) by different threads.
+struct ExtentsPadded {
+  union {
+    atomic_uint64_t Extents;
+    unsigned char Storage[kCacheLineSize]; // makes the union span a cache line
+  };
+};
+
+constexpr size_t kExtentsSize = sizeof(ExtentsPadded);
+
+} // namespace
+
+BufferQueue::ErrorCode BufferQueue::init(size_t BS, size_t BC) {
+  SpinMutexLock Guard(&Mutex);
+
+  if (!finalizing())
+    return BufferQueue::ErrorCode::AlreadyInitialized;
+  // Release whatever the queue currently holds before re-initializing it.
+  cleanupBuffers();
+
+  bool Success = false;
+  BufferSize = BS;
+  BufferCount = BC;
+
+  BackingStore = allocControlBlock(BufferSize, BufferCount);
+  if (BackingStore == nullptr)
+    return BufferQueue::ErrorCode::NotEnoughMemory;
+  // Undo the allocation on any early return below (Success is still false).
+  auto CleanupBackingStore = at_scope_exit([&, this] {
+    if (Success)
+      return;
+    deallocControlBlock(BackingStore, BufferSize, BufferCount);
+    BackingStore = nullptr;
+  });
+
+  // Allocate one cache-line-padded atomic_uint64_t extents slot per buffer.
+  ExtentsBackingStore = allocControlBlock(kExtentsSize, BufferCount);
+  if (ExtentsBackingStore == nullptr)
+    return BufferQueue::ErrorCode::NotEnoughMemory;
+
+  auto CleanupExtentsBackingStore = at_scope_exit([&, this] {
+    if (Success)
+      return;
+    deallocControlBlock(ExtentsBackingStore, kExtentsSize, BufferCount);
+    ExtentsBackingStore = nullptr;
+  });
+
+  Buffers = initArray<BufferRep>(BufferCount);
+  if (Buffers == nullptr)
+    return BufferQueue::ErrorCode::NotEnoughMemory;
+
+  // At this point we increment the generation number to associate the buffers
+  // to the new generation.
+  atomic_fetch_add(&Generation, 1, memory_order_acq_rel);
+
+  // First, we initialize the refcount in each ControlBlock to 1; this is the
+  // reference that cleanupBuffers() later drops on behalf of the queue.
+  atomic_store(&BackingStore->RefCount, 1, memory_order_release);
+  atomic_store(&ExtentsBackingStore->RefCount, 1, memory_order_release);
+
+  // Then we initialise the individual buffers that sub-divide the whole backing
+  // store. Each buffer will start at the `Data` member of the ControlBlock, and
+  // will be offsets from these locations.
+  for (size_t i = 0; i < BufferCount; ++i) {
+    auto &T = Buffers[i];
+    auto &Buf = T.Buff;
+    auto *E = reinterpret_cast<ExtentsPadded *>(&ExtentsBackingStore->Data +
+                                                (kExtentsSize * i));
+    Buf.Extents = &E->Extents;
+    atomic_store(Buf.Extents, 0, memory_order_release);
+    Buf.Generation = generation();
+    Buf.Data = &BackingStore->Data + (BufferSize * i);
+    Buf.Size = BufferSize;
+    Buf.BackingStore = BackingStore;
+    Buf.ExtentsBackingStore = ExtentsBackingStore;
+    Buf.Count = BufferCount;
+    T.Used = false;
+  }
+  // Reset the ring cursors, then clear Finalizing so getBuffer() can proceed.
+  Next = Buffers;
+  First = Buffers;
+  LiveBuffers = 0;
+  atomic_store(&Finalizing, 0, memory_order_release);
+  Success = true;
+  return BufferQueue::ErrorCode::Ok;
+}
+
+BufferQueue::BufferQueue(size_t B, size_t N,
+                         bool &Success) XRAY_NEVER_INSTRUMENT
+    : BufferSize(B),
+      BufferCount(N),
+      Mutex(),
+      Finalizing{1}, // init() returns AlreadyInitialized unless finalizing()
+      BackingStore(nullptr),
+      ExtentsBackingStore(nullptr),
+      Buffers(nullptr),
+      Next(Buffers),
+      First(Buffers),
+      LiveBuffers(0),
+      Generation{0} {
+  Success = init(B, N) == BufferQueue::ErrorCode::Ok; // reported via out-param
+}
+
+BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) {
+  if (atomic_load(&Finalizing, memory_order_acquire))
+    return ErrorCode::QueueFinalizing;
+  // Pick the next slot in ring order; fail once BufferCount buffers are live.
+  BufferRep *B = nullptr;
+  {
+    SpinMutexLock Guard(&Mutex);
+    if (LiveBuffers == BufferCount)
+      return ErrorCode::NotEnoughMemory;
+    B = Next++;
+    if (Next == (Buffers + BufferCount))
+      Next = Buffers;
+    ++LiveBuffers;
+  }
+  // The buffer handed out carries its own reference on both backing stores.
+  incRefCount(BackingStore);
+  incRefCount(ExtentsBackingStore);
+  Buf = B->Buff;
+  Buf.Generation = generation();
+  B->Used = true;
+  return ErrorCode::Ok;
+}
+
+BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) {
+  // Buffers from another generation (or released when nothing is live) only
+  // drop their references; that must happen before Buf is cleared.
+  BufferRep *B = nullptr;
+  {
+    SpinMutexLock Guard(&Mutex);
+    if (Buf.Generation != generation() || LiveBuffers == 0) {
+      decRefCount(Buf.BackingStore, Buf.Size, Buf.Count);
+      decRefCount(Buf.ExtentsBackingStore, kExtentsSize, Buf.Count);
+      Buf = {};
+      return BufferQueue::ErrorCode::Ok;
+    }
+    // Reject buffers whose data lies outside the current backing store.
+    if (Buf.Data < &BackingStore->Data ||
+        Buf.Data > &BackingStore->Data + (BufferCount * BufferSize))
+      return BufferQueue::ErrorCode::UnrecognizedBuffer;
+
+    --LiveBuffers;
+    B = First++;
+    if (First == (Buffers + BufferCount))
+      First = Buffers;
+  }
+
+  // Store the released buffer in its slot, mark it "used", drop its references.
+  B->Buff = Buf;
+  B->Used = true;
+  decRefCount(Buf.BackingStore, Buf.Size, Buf.Count);
+  decRefCount(Buf.ExtentsBackingStore, kExtentsSize, Buf.Count);
+  atomic_store(B->Buff.Extents, atomic_load(Buf.Extents, memory_order_acquire),
+               memory_order_release);
+  Buf = {};
+  return ErrorCode::Ok;
+}
+
+BufferQueue::ErrorCode BufferQueue::finalize() {
+  // A non-zero previous value means the queue was already finalizing.
+  const auto Previous = atomic_exchange(&Finalizing, 1, memory_order_acq_rel);
+  return Previous ? ErrorCode::QueueFinalizing : ErrorCode::Ok;
+}
+
+void BufferQueue::cleanupBuffers() {
+  for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B)
+    B->~BufferRep(); // explicit destructor call; storage is released below
+  deallocateBuffer(Buffers, BufferCount);
+  decRefCount(BackingStore, BufferSize, BufferCount); // drop the queue's refs
+  decRefCount(ExtentsBackingStore, kExtentsSize, BufferCount);
+  BackingStore = nullptr;
+  ExtentsBackingStore = nullptr;
+  Buffers = nullptr;
+  BufferCount = 0; // leave the queue describing zero buffers
+  BufferSize = 0;
+}
+
+BufferQueue::~BufferQueue() { cleanupBuffers(); } // frees buffers, drops refs
diff --git a/lib/xray/xray_fdr_flags.cc b/lib/xray/xray_fdr_flags.cc
deleted file mode 100644
index 8d432d298d88..000000000000
--- a/lib/xray/xray_fdr_flags.cc
+++ /dev/null
@@ -1,47 +0,0 @@
-//===-- xray_fdr_flags.cc ---------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// XRay FDR flag parsing logic.
-//===----------------------------------------------------------------------===//
-
-#include "xray_fdr_flags.h"
-#include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_flag_parser.h"
-#include "sanitizer_common/sanitizer_libc.h"
-#include "xray_defs.h"
-
-using namespace __sanitizer;
-
-namespace __xray {
-
-FDRFlags xray_fdr_flags_dont_use_directly; // use via fdrFlags().
-
-void FDRFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
-#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
-#include "xray_fdr_flags.inc"
-#undef XRAY_FLAG
-}
-
-void registerXRayFDRFlags(FlagParser *P, FDRFlags *F) XRAY_NEVER_INSTRUMENT {
-#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \
-  RegisterFlag(P, #Name, Description, &F->Name);
-#include "xray_fdr_flags.inc"
-#undef XRAY_FLAG
-}
-
-const char *useCompilerDefinedFDRFlags() XRAY_NEVER_INSTRUMENT {
-#ifdef XRAY_FDR_OPTIONS
-  return SANITIZER_STRINGIFY(XRAY_FDR_OPTIONS);
-#else
-  return "";
-#endif
-}
-
-} // namespace __xray
diff --git a/lib/xray/xray_fdr_flags.cpp b/lib/xray/xray_fdr_flags.cpp
new file mode 100644
index 000000000000..272b0b7cb1f7
--- /dev/null
+++ b/lib/xray/xray_fdr_flags.cpp
@@ -0,0 +1,47 @@
+//===-- xray_fdr_flags.cpp --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay FDR flag parsing logic.
+//===----------------------------------------------------------------------===//
+
+#include "xray_fdr_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray_defs.h"
+
+using namespace __sanitizer;
+
+namespace __xray {
+
+FDRFlags xray_fdr_flags_dont_use_directly; // use via fdrFlags().
+
+void FDRFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "xray_fdr_flags.inc" // presumably one XRAY_FLAG(...) entry per flag
+#undef XRAY_FLAG
+}
+
+void registerXRayFDRFlags(FlagParser *P, FDRFlags *F) XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \
+  RegisterFlag(P, #Name, Description, &F->Name);
+#include "xray_fdr_flags.inc" // presumably one XRAY_FLAG(...) entry per flag
+#undef XRAY_FLAG
+}
+
+const char *useCompilerDefinedFDRFlags() XRAY_NEVER_INSTRUMENT {
+#ifdef XRAY_FDR_OPTIONS
+  return SANITIZER_STRINGIFY(XRAY_FDR_OPTIONS); // macro value as a string
+#else
+  return ""; // XRAY_FDR_OPTIONS was not defined at build time
+#endif
+}
+
+} // namespace __xray
diff --git a/lib/xray/xray_fdr_logging.cc b/lib/xray/xray_fdr_logging.cc
deleted file mode 100644
index abba06576da1..000000000000
--- a/lib/xray/xray_fdr_logging.cc
+++ /dev/null
@@ -1,757 +0,0 @@
-//===-- xray_fdr_logging.cc ------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// Here we implement the Flight Data Recorder mode for XRay, where we use
-// compact structures to store records in memory as well as when writing out the
-// data to files.
-//
-//===----------------------------------------------------------------------===//
-#include "xray_fdr_logging.h"
-#include <cassert>
-#include <errno.h>
-#include <limits>
-#include <memory>
-#include <pthread.h>
-#include <sys/time.h>
-#include <time.h>
-#include <unistd.h>
-
-#include "sanitizer_common/sanitizer_allocator_internal.h"
-#include "sanitizer_common/sanitizer_atomic.h"
-#include "sanitizer_common/sanitizer_common.h"
-#include "xray/xray_interface.h"
-#include "xray/xray_records.h"
-#include "xray_allocator.h"
-#include "xray_buffer_queue.h"
-#include "xray_defs.h"
-#include "xray_fdr_controller.h"
-#include "xray_fdr_flags.h"
-#include "xray_fdr_log_writer.h"
-#include "xray_flags.h"
-#include "xray_recursion_guard.h"
-#include "xray_tsc.h"
-#include "xray_utils.h"
-
-namespace __xray {
-
-static atomic_sint32_t LoggingStatus = {
-    XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
-
-namespace {
-
-// Group together thread-local-data in a struct, then hide it behind a function
-// call so that it can be initialized on first use instead of as a global. We
-// force the alignment to 64-bytes for x86 cache line alignment, as this
-// structure is used in the hot path of implementation.
-struct XRAY_TLS_ALIGNAS(64) ThreadLocalData {
-  BufferQueue::Buffer Buffer{};
-  BufferQueue *BQ = nullptr;
-
-  using LogWriterStorage =
-      typename std::aligned_storage<sizeof(FDRLogWriter),
-                                    alignof(FDRLogWriter)>::type;
-
-  LogWriterStorage LWStorage;
-  FDRLogWriter *Writer = nullptr;
-
-  using ControllerStorage =
-      typename std::aligned_storage<sizeof(FDRController<>),
-                                    alignof(FDRController<>)>::type;
-  ControllerStorage CStorage;
-  FDRController<> *Controller = nullptr;
-};
-
-} // namespace
-
-static_assert(std::is_trivially_destructible<ThreadLocalData>::value,
-              "ThreadLocalData must be trivially destructible");
-
-// Use a global pthread key to identify thread-local data for logging.
-static pthread_key_t Key;
-
-// Global BufferQueue.
-static std::aligned_storage<sizeof(BufferQueue)>::type BufferQueueStorage;
-static BufferQueue *BQ = nullptr;
-
-// Global thresholds for function durations.
-static atomic_uint64_t ThresholdTicks{0};
-
-// Global for ticks per second.
-static atomic_uint64_t TicksPerSec{0};
-
-static atomic_sint32_t LogFlushStatus = {
-    XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
-
-// This function will initialize the thread-local data structure used by the FDR
-// logging implementation and return a reference to it. The implementation
-// details require a bit of care to maintain.
-//
-// First, some requirements on the implementation in general:
-//
-//   - XRay handlers should not call any memory allocation routines that may
-//     delegate to an instrumented implementation. This means functions like
-//     malloc() and free() should not be called while instrumenting.
-//
-//   - We would like to use some thread-local data initialized on first-use of
-//     the XRay instrumentation. These allow us to implement unsynchronized
-//     routines that access resources associated with the thread.
-//
-// The implementation here uses a few mechanisms that allow us to provide both
-// the requirements listed above. We do this by:
-//
-//   1. Using a thread-local aligned storage buffer for representing the
-//      ThreadLocalData struct. This data will be uninitialized memory by
-//      design.
-//
-//   2. Not requiring a thread exit handler/implementation, keeping the
-//      thread-local as purely a collection of references/data that do not
-//      require cleanup.
-//
-// We're doing this to avoid using a `thread_local` object that has a
-// non-trivial destructor, because the C++ runtime might call std::malloc(...)
-// to register calls to destructors. Deadlocks may arise when, for example, an
-// externally provided malloc implementation is XRay instrumented, and
-// initializing the thread-locals involves calling into malloc. A malloc
-// implementation that does global synchronization might be holding a lock for a
-// critical section, calling a function that might be XRay instrumented (and
-// thus in turn calling into malloc by virtue of registration of the
-// thread_local's destructor).
-#if XRAY_HAS_TLS_ALIGNAS
-static_assert(alignof(ThreadLocalData) >= 64,
-              "ThreadLocalData must be cache line aligned.");
-#endif
-static ThreadLocalData &getThreadLocalData() {
-  thread_local typename std::aligned_storage<
-      sizeof(ThreadLocalData), alignof(ThreadLocalData)>::type TLDStorage{};
-
-  if (pthread_getspecific(Key) == NULL) {
-    new (reinterpret_cast<ThreadLocalData *>(&TLDStorage)) ThreadLocalData{};
-    pthread_setspecific(Key, &TLDStorage);
-  }
-
-  return *reinterpret_cast<ThreadLocalData *>(&TLDStorage);
-}
-
-static XRayFileHeader &fdrCommonHeaderInfo() {
-  static std::aligned_storage<sizeof(XRayFileHeader)>::type HStorage;
-  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
-  static bool TSCSupported = true;
-  static uint64_t CycleFrequency = NanosecondsPerSecond;
-  pthread_once(
-      &OnceInit, +[] {
-        XRayFileHeader &H = reinterpret_cast<XRayFileHeader &>(HStorage);
-        // Version 2 of the log writes the extents of the buffer, instead of
-        // relying on an end-of-buffer record.
-        // Version 3 includes PID metadata record.
-        // Version 4 includes CPU data in the custom event records.
-        // Version 5 uses relative deltas for custom and typed event records,
-        // and removes the CPU data in custom event records (similar to how
-        // function records use deltas instead of full TSCs and rely on other
-        // metadata records for TSC wraparound and CPU migration).
-        H.Version = 5;
-        H.Type = FileTypes::FDR_LOG;
-
-        // Test for required CPU features and cache the cycle frequency
-        TSCSupported = probeRequiredCPUFeatures();
-        if (TSCSupported)
-          CycleFrequency = getTSCFrequency();
-        H.CycleFrequency = CycleFrequency;
-
-        // FIXME: Actually check whether we have 'constant_tsc' and
-        // 'nonstop_tsc' before setting the values in the header.
-        H.ConstantTSC = 1;
-        H.NonstopTSC = 1;
-      });
-  return reinterpret_cast<XRayFileHeader &>(HStorage);
-}
-
-// This is the iterator implementation, which knows how to handle FDR-mode
-// specific buffers. This is used as an implementation of the iterator function
-// needed by __xray_set_buffer_iterator(...). It maintains a global state of the
-// buffer iteration for the currently installed FDR mode buffers. In particular:
-//
-//   - If the argument represents the initial state of XRayBuffer ({nullptr, 0})
-//     then the iterator returns the header information.
-//   - If the argument represents the header information ({address of header
-//     info, size of the header info}) then it returns the first FDR buffer's
-//     address and extents.
-//   - It will keep returning the next buffer and extents as there are more
-//     buffers to process. When the input represents the last buffer, it will
-//     return the initial state to signal completion ({nullptr, 0}).
-//
-// See xray/xray_log_interface.h for more details on the requirements for the
-// implementations of __xray_set_buffer_iterator(...) and
-// __xray_log_process_buffers(...).
-XRayBuffer fdrIterator(const XRayBuffer B) {
-  DCHECK(internal_strcmp(__xray_log_get_current_mode(), "xray-fdr") == 0);
-  DCHECK(BQ->finalizing());
-
-  if (BQ == nullptr || !BQ->finalizing()) {
-    if (Verbosity())
-      Report(
-          "XRay FDR: Failed global buffer queue is null or not finalizing!\n");
-    return {nullptr, 0};
-  }
-
-  // We use a global scratch-pad for the header information, which only gets
-  // initialized the first time this function is called. We'll update one part
-  // of this information with some relevant data (in particular the number of
-  // buffers to expect).
-  static std::aligned_storage<sizeof(XRayFileHeader)>::type HeaderStorage;
-  static pthread_once_t HeaderOnce = PTHREAD_ONCE_INIT;
-  pthread_once(
-      &HeaderOnce, +[] {
-        reinterpret_cast<XRayFileHeader &>(HeaderStorage) =
-            fdrCommonHeaderInfo();
-      });
-
-  // We use a convenience alias for code referring to Header from here on out.
-  auto &Header = reinterpret_cast<XRayFileHeader &>(HeaderStorage);
-  if (B.Data == nullptr && B.Size == 0) {
-    Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()};
-    return XRayBuffer{static_cast<void *>(&Header), sizeof(Header)};
-  }
-
-  static BufferQueue::const_iterator It{};
-  static BufferQueue::const_iterator End{};
-  static uint8_t *CurrentBuffer{nullptr};
-  static size_t SerializedBufferSize = 0;
-  if (B.Data == static_cast<void *>(&Header) && B.Size == sizeof(Header)) {
-    // From this point on, we provide raw access to the raw buffer we're getting
-    // from the BufferQueue. We're relying on the iterators from the current
-    // Buffer queue.
-    It = BQ->cbegin();
-    End = BQ->cend();
-  }
-
-  if (CurrentBuffer != nullptr) {
-    deallocateBuffer(CurrentBuffer, SerializedBufferSize);
-    CurrentBuffer = nullptr;
-  }
-
-  if (It == End)
-    return {nullptr, 0};
-
-  // Set up the current buffer to contain the extents like we would when writing
-  // out to disk. The difference here would be that we still write "empty"
-  // buffers, or at least go through the iterators faithfully to let the
-  // handlers see the empty buffers in the queue.
-  //
-  // We need this atomic fence here to ensure that writes happening to the
-  // buffer have been committed before we load the extents atomically. Because
-  // the buffer is not explicitly synchronised across threads, we rely on the
-  // fence ordering to ensure that writes we expect to have been completed
-  // before the fence are fully committed before we read the extents.
-  atomic_thread_fence(memory_order_acquire);
-  auto BufferSize = atomic_load(It->Extents, memory_order_acquire);
-  SerializedBufferSize = BufferSize + sizeof(MetadataRecord);
-  CurrentBuffer = allocateBuffer(SerializedBufferSize);
-  if (CurrentBuffer == nullptr)
-    return {nullptr, 0};
-
-  // Write out the extents as a Metadata Record into the CurrentBuffer.
-  MetadataRecord ExtentsRecord;
-  ExtentsRecord.Type = uint8_t(RecordType::Metadata);
-  ExtentsRecord.RecordKind =
-      uint8_t(MetadataRecord::RecordKinds::BufferExtents);
-  internal_memcpy(ExtentsRecord.Data, &BufferSize, sizeof(BufferSize));
-  auto AfterExtents =
-      static_cast<char *>(internal_memcpy(CurrentBuffer, &ExtentsRecord,
-                                          sizeof(MetadataRecord))) +
-      sizeof(MetadataRecord);
-  internal_memcpy(AfterExtents, It->Data, BufferSize);
-
-  XRayBuffer Result;
-  Result.Data = CurrentBuffer;
-  Result.Size = SerializedBufferSize;
-  ++It;
-  return Result;
-}
-
-// Must finalize before flushing.
-XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
-  if (atomic_load(&LoggingStatus, memory_order_acquire) !=
-      XRayLogInitStatus::XRAY_LOG_FINALIZED) {
-    if (Verbosity())
-      Report("Not flushing log, implementation is not finalized.\n");
-    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
-  }
-
-  s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
-  if (!atomic_compare_exchange_strong(&LogFlushStatus, &Result,
-                                      XRayLogFlushStatus::XRAY_LOG_FLUSHING,
-                                      memory_order_release)) {
-    if (Verbosity())
-      Report("Not flushing log, implementation is still finalizing.\n");
-    return static_cast<XRayLogFlushStatus>(Result);
-  }
-
-  if (BQ == nullptr) {
-    if (Verbosity())
-      Report("Cannot flush when global buffer queue is null.\n");
-    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
-  }
-
-  // We wait a number of milliseconds to allow threads to see that we've
-  // finalised before attempting to flush the log.
-  SleepForMillis(fdrFlags()->grace_period_ms);
-
-  // At this point, we're going to uninstall the iterator implementation, before
-  // we decide to do anything further with the global buffer queue.
-  __xray_log_remove_buffer_iterator();
-
-  // Once flushed, we should set the global status of the logging implementation
-  // to "uninitialized" to allow for FDR-logging multiple runs.
-  auto ResetToUnitialized = at_scope_exit([] {
-    atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
-                 memory_order_release);
-  });
-
-  auto CleanupBuffers = at_scope_exit([] {
-    auto &TLD = getThreadLocalData();
-    if (TLD.Controller != nullptr)
-      TLD.Controller->flush();
-  });
-
-  if (fdrFlags()->no_file_flush) {
-    if (Verbosity())
-      Report("XRay FDR: Not flushing to file, 'no_file_flush=true'.\n");
-
-    atomic_store(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
-                 memory_order_release);
-    return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
-  }
-
-  // We write out the file in the following format:
-  //
-  //   1) We write down the XRay file header with version 1, type FDR_LOG.
-  //   2) Then we use the 'apply' member of the BufferQueue that's live, to
-  //      ensure that at this point in time we write down the buffers that have
-  //      been released (and marked "used") -- we dump the full buffer for now
-  //      (fixed-sized) and let the tools reading the buffers deal with the data
-  //      afterwards.
-  //
-  LogWriter *LW = LogWriter::Open();
-  if (LW == nullptr) {
-    auto Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
-    atomic_store(&LogFlushStatus, Result, memory_order_release);
-    return Result;
-  }
-
-  XRayFileHeader Header = fdrCommonHeaderInfo();
-  Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()};
-  LW->WriteAll(reinterpret_cast<char *>(&Header),
-               reinterpret_cast<char *>(&Header) + sizeof(Header));
-
-  // Release the current thread's buffer before we attempt to write out all the
-  // buffers. This ensures that in case we had only a single thread going, that
-  // we are able to capture the data nonetheless.
-  auto &TLD = getThreadLocalData();
-  if (TLD.Controller != nullptr)
-    TLD.Controller->flush();
-
-  BQ->apply([&](const BufferQueue::Buffer &B) {
-    // Starting at version 2 of the FDR logging implementation, we only write
-    // the records identified by the extents of the buffer. We use the Extents
-    // from the Buffer and write that out as the first record in the buffer.  We
-    // still use a Metadata record, but fill in the extents instead for the
-    // data.
-    MetadataRecord ExtentsRecord;
-    auto BufferExtents = atomic_load(B.Extents, memory_order_acquire);
-    DCHECK(BufferExtents <= B.Size);
-    ExtentsRecord.Type = uint8_t(RecordType::Metadata);
-    ExtentsRecord.RecordKind =
-        uint8_t(MetadataRecord::RecordKinds::BufferExtents);
-    internal_memcpy(ExtentsRecord.Data, &BufferExtents, sizeof(BufferExtents));
-    if (BufferExtents > 0) {
-      LW->WriteAll(reinterpret_cast<char *>(&ExtentsRecord),
-                   reinterpret_cast<char *>(&ExtentsRecord) +
-                       sizeof(MetadataRecord));
-      LW->WriteAll(reinterpret_cast<char *>(B.Data),
-                   reinterpret_cast<char *>(B.Data) + BufferExtents);
-    }
-  });
-
-  atomic_store(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
-               memory_order_release);
-  return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
-}
-
-XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT {
-  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_INITIALIZED;
-  if (!atomic_compare_exchange_strong(&LoggingStatus, &CurrentStatus,
-                                      XRayLogInitStatus::XRAY_LOG_FINALIZING,
-                                      memory_order_release)) {
-    if (Verbosity())
-      Report("Cannot finalize log, implementation not initialized.\n");
-    return static_cast<XRayLogInitStatus>(CurrentStatus);
-  }
-
-  // Do special things to make the log finalize itself, and not allow any more
-  // operations to be performed until re-initialized.
-  if (BQ == nullptr) {
-    if (Verbosity())
-      Report("Attempting to finalize an uninitialized global buffer!\n");
-  } else {
-    BQ->finalize();
-  }
-
-  atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED,
-               memory_order_release);
-  return XRayLogInitStatus::XRAY_LOG_FINALIZED;
-}
-
-struct TSCAndCPU {
-  uint64_t TSC = 0;
-  unsigned char CPU = 0;
-};
-
-static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT {
-  // We want to get the TSC as early as possible, so that we can check whether
-  // we've seen this CPU before. We also do it before we load anything else,
-  // to allow for forward progress with the scheduling.
-  TSCAndCPU Result;
-
-  // Test once for required CPU features
-  static pthread_once_t OnceProbe = PTHREAD_ONCE_INIT;
-  static bool TSCSupported = true;
-  pthread_once(
-      &OnceProbe, +[] { TSCSupported = probeRequiredCPUFeatures(); });
-
-  if (TSCSupported) {
-    Result.TSC = __xray::readTSC(Result.CPU);
-  } else {
-    // FIXME: This code needs refactoring as it appears in multiple locations
-    timespec TS;
-    int result = clock_gettime(CLOCK_REALTIME, &TS);
-    if (result != 0) {
-      Report("clock_gettime(2) return %d, errno=%d", result, int(errno));
-      TS = {0, 0};
-    }
-    Result.CPU = 0;
-    Result.TSC = TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec;
-  }
-  return Result;
-}
-
-thread_local atomic_uint8_t Running{0};
-
-static bool setupTLD(ThreadLocalData &TLD) XRAY_NEVER_INSTRUMENT {
-  // Check if we're finalizing, before proceeding.
-  {
-    auto Status = atomic_load(&LoggingStatus, memory_order_acquire);
-    if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
-        Status == XRayLogInitStatus::XRAY_LOG_FINALIZED) {
-      if (TLD.Controller != nullptr) {
-        TLD.Controller->flush();
-        TLD.Controller = nullptr;
-      }
-      return false;
-    }
-  }
-
-  if (UNLIKELY(TLD.Controller == nullptr)) {
-    // Set up the TLD buffer queue.
-    if (UNLIKELY(BQ == nullptr))
-      return false;
-    TLD.BQ = BQ;
-
-    // Check that we have a valid buffer.
-    if (TLD.Buffer.Generation != BQ->generation() &&
-        TLD.BQ->releaseBuffer(TLD.Buffer) != BufferQueue::ErrorCode::Ok)
-      return false;
-
-    // Set up a buffer, before setting up the log writer. Bail out on failure.
-    if (TLD.BQ->getBuffer(TLD.Buffer) != BufferQueue::ErrorCode::Ok)
-      return false;
-
-    // Set up the Log Writer for this thread.
-    if (UNLIKELY(TLD.Writer == nullptr)) {
-      auto *LWStorage = reinterpret_cast<FDRLogWriter *>(&TLD.LWStorage);
-      new (LWStorage) FDRLogWriter(TLD.Buffer);
-      TLD.Writer = LWStorage;
-    } else {
-      TLD.Writer->resetRecord();
-    }
-
-    auto *CStorage = reinterpret_cast<FDRController<> *>(&TLD.CStorage);
-    new (CStorage)
-        FDRController<>(TLD.BQ, TLD.Buffer, *TLD.Writer, clock_gettime,
-                        atomic_load_relaxed(&ThresholdTicks));
-    TLD.Controller = CStorage;
-  }
-
-  DCHECK_NE(TLD.Controller, nullptr);
-  return true;
-}
-
-void fdrLoggingHandleArg0(int32_t FuncId,
-                          XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
-  auto TC = getTimestamp();
-  auto &TSC = TC.TSC;
-  auto &CPU = TC.CPU;
-  RecursionGuard Guard{Running};
-  if (!Guard)
-    return;
-
-  auto &TLD = getThreadLocalData();
-  if (!setupTLD(TLD))
-    return;
-
-  switch (Entry) {
-  case XRayEntryType::ENTRY:
-  case XRayEntryType::LOG_ARGS_ENTRY:
-    TLD.Controller->functionEnter(FuncId, TSC, CPU);
-    return;
-  case XRayEntryType::EXIT:
-    TLD.Controller->functionExit(FuncId, TSC, CPU);
-    return;
-  case XRayEntryType::TAIL:
-    TLD.Controller->functionTailExit(FuncId, TSC, CPU);
-    return;
-  case XRayEntryType::CUSTOM_EVENT:
-  case XRayEntryType::TYPED_EVENT:
-    break;
-  }
-}
-
-void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType Entry,
-                          uint64_t Arg) XRAY_NEVER_INSTRUMENT {
-  auto TC = getTimestamp();
-  auto &TSC = TC.TSC;
-  auto &CPU = TC.CPU;
-  RecursionGuard Guard{Running};
-  if (!Guard)
-    return;
-
-  auto &TLD = getThreadLocalData();
-  if (!setupTLD(TLD))
-    return;
-
-  switch (Entry) {
-  case XRayEntryType::ENTRY:
-  case XRayEntryType::LOG_ARGS_ENTRY:
-    TLD.Controller->functionEnterArg(FuncId, TSC, CPU, Arg);
-    return;
-  case XRayEntryType::EXIT:
-    TLD.Controller->functionExit(FuncId, TSC, CPU);
-    return;
-  case XRayEntryType::TAIL:
-    TLD.Controller->functionTailExit(FuncId, TSC, CPU);
-    return;
-  case XRayEntryType::CUSTOM_EVENT:
-  case XRayEntryType::TYPED_EVENT:
-    break;
-  }
-}
-
-void fdrLoggingHandleCustomEvent(void *Event,
-                                 std::size_t EventSize) XRAY_NEVER_INSTRUMENT {
-  auto TC = getTimestamp();
-  auto &TSC = TC.TSC;
-  auto &CPU = TC.CPU;
-  RecursionGuard Guard{Running};
-  if (!Guard)
-    return;
-
-  // Complain when we ever get at least one custom event that's larger than what
-  // we can possibly support.
-  if (EventSize >
-      static_cast<std::size_t>(std::numeric_limits<int32_t>::max())) {
-    static pthread_once_t Once = PTHREAD_ONCE_INIT;
-    pthread_once(
-        &Once, +[] {
-          Report("Custom event size too large; truncating to %d.\n",
-                 std::numeric_limits<int32_t>::max());
-        });
-  }
-
-  auto &TLD = getThreadLocalData();
-  if (!setupTLD(TLD))
-    return;
-
-  int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
-  TLD.Controller->customEvent(TSC, CPU, Event, ReducedEventSize);
-}
-
-void fdrLoggingHandleTypedEvent(
-    uint16_t EventType, const void *Event,
-    std::size_t EventSize) noexcept XRAY_NEVER_INSTRUMENT {
-  auto TC = getTimestamp();
-  auto &TSC = TC.TSC;
-  auto &CPU = TC.CPU;
-  RecursionGuard Guard{Running};
-  if (!Guard)
-    return;
-
-  // Complain when we ever get at least one typed event that's larger than what
-  // we can possibly support.
-  if (EventSize >
-      static_cast<std::size_t>(std::numeric_limits<int32_t>::max())) {
-    static pthread_once_t Once = PTHREAD_ONCE_INIT;
-    pthread_once(
-        &Once, +[] {
-          Report("Typed event size too large; truncating to %d.\n",
-                 std::numeric_limits<int32_t>::max());
-        });
-  }
-
-  auto &TLD = getThreadLocalData();
-  if (!setupTLD(TLD))
-    return;
-
-  int32_t ReducedEventSize = static_cast<int32_t>(EventSize);
-  TLD.Controller->typedEvent(TSC, CPU, EventType, Event, ReducedEventSize);
-}
-
-XRayLogInitStatus fdrLoggingInit(size_t, size_t, void *Options,
-                                 size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
-  if (Options == nullptr)
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-
-  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-  if (!atomic_compare_exchange_strong(&LoggingStatus, &CurrentStatus,
-                                      XRayLogInitStatus::XRAY_LOG_INITIALIZING,
-                                      memory_order_release)) {
-    if (Verbosity())
-      Report("Cannot initialize already initialized implementation.\n");
-    return static_cast<XRayLogInitStatus>(CurrentStatus);
-  }
-
-  if (Verbosity())
-    Report("Initializing FDR mode with options: %s\n",
-           static_cast<const char *>(Options));
-
-  // TODO: Factor out the flags specific to the FDR mode implementation. For
-  // now, use the global/single definition of the flags, since the FDR mode
-  // flags are already defined there.
-  FlagParser FDRParser;
-  FDRFlags FDRFlags;
-  registerXRayFDRFlags(&FDRParser, &FDRFlags);
-  FDRFlags.setDefaults();
-
-  // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided
-  // options until we migrate everyone to use the XRAY_FDR_OPTIONS
-  // compiler-provided options.
-  FDRParser.ParseString(useCompilerDefinedFlags());
-  FDRParser.ParseString(useCompilerDefinedFDRFlags());
-  auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS");
-  if (EnvOpts == nullptr)
-    EnvOpts = "";
-  FDRParser.ParseString(EnvOpts);
-
-  // FIXME: Remove this when we fully remove the deprecated flags.
-  if (internal_strlen(EnvOpts) == 0) {
-    FDRFlags.func_duration_threshold_us =
-        flags()->xray_fdr_log_func_duration_threshold_us;
-    FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms;
-  }
-
-  // The provided options should always override the compiler-provided and
-  // environment-variable defined options.
-  FDRParser.ParseString(static_cast<const char *>(Options));
-  *fdrFlags() = FDRFlags;
-  auto BufferSize = FDRFlags.buffer_size;
-  auto BufferMax = FDRFlags.buffer_max;
-
-  if (BQ == nullptr) {
-    bool Success = false;
-    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
-    new (BQ) BufferQueue(BufferSize, BufferMax, Success);
-    if (!Success) {
-      Report("BufferQueue init failed.\n");
-      return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-    }
-  } else {
-    if (BQ->init(BufferSize, BufferMax) != BufferQueue::ErrorCode::Ok) {
-      if (Verbosity())
-        Report("Failed to re-initialize global buffer queue. Init failed.\n");
-      return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-    }
-  }
-
-  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
-  pthread_once(
-      &OnceInit, +[] {
-        atomic_store(&TicksPerSec,
-                     probeRequiredCPUFeatures() ? getTSCFrequency()
-                                                : __xray::NanosecondsPerSecond,
-                     memory_order_release);
-        pthread_key_create(
-            &Key, +[](void *TLDPtr) {
-              if (TLDPtr == nullptr)
-                return;
-              auto &TLD = *reinterpret_cast<ThreadLocalData *>(TLDPtr);
-              if (TLD.BQ == nullptr)
-                return;
-              if (TLD.Buffer.Data == nullptr)
-                return;
-              auto EC = TLD.BQ->releaseBuffer(TLD.Buffer);
-              if (EC != BufferQueue::ErrorCode::Ok)
-                Report("At thread exit, failed to release buffer at %p; "
-                       "error=%s\n",
-                       TLD.Buffer.Data, BufferQueue::getErrorString(EC));
-            });
-      });
-
-  atomic_store(&ThresholdTicks,
-               atomic_load_relaxed(&TicksPerSec) *
-                   fdrFlags()->func_duration_threshold_us / 1000000,
-               memory_order_release);
-  // Arg1 handler should go in first to avoid concurrent code accidentally
-  // falling back to arg0 when it should have ran arg1.
-  __xray_set_handler_arg1(fdrLoggingHandleArg1);
-  // Install the actual handleArg0 handler after initialising the buffers.
-  __xray_set_handler(fdrLoggingHandleArg0);
-  __xray_set_customevent_handler(fdrLoggingHandleCustomEvent);
-  __xray_set_typedevent_handler(fdrLoggingHandleTypedEvent);
-
-  // Install the buffer iterator implementation.
-  __xray_log_set_buffer_iterator(fdrIterator);
-
-  atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZED,
-               memory_order_release);
-
-  if (Verbosity())
-    Report("XRay FDR init successful.\n");
-  return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
-}
-
-bool fdrLogDynamicInitializer() XRAY_NEVER_INSTRUMENT {
-  XRayLogImpl Impl{
-      fdrLoggingInit,
-      fdrLoggingFinalize,
-      fdrLoggingHandleArg0,
-      fdrLoggingFlush,
-  };
-  auto RegistrationResult = __xray_log_register_mode("xray-fdr", Impl);
-  if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK &&
-      Verbosity()) {
-    Report("Cannot register XRay FDR mode to 'xray-fdr'; error = %d\n",
-           RegistrationResult);
-    return false;
-  }
-
-  if (flags()->xray_fdr_log ||
-      !internal_strcmp(flags()->xray_mode, "xray-fdr")) {
-    auto SelectResult = __xray_log_select_mode("xray-fdr");
-    if (SelectResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK &&
-        Verbosity()) {
-      Report("Cannot select XRay FDR mode as 'xray-fdr'; error = %d\n",
-             SelectResult);
-      return false;
-    }
-  }
-  return true;
-}
-
-} // namespace __xray
-
-static auto UNUSED Unused = __xray::fdrLogDynamicInitializer();
diff --git a/lib/xray/xray_fdr_logging.cpp b/lib/xray/xray_fdr_logging.cpp
new file mode 100644
index 000000000000..16ce483502f0
--- /dev/null
+++ b/lib/xray/xray_fdr_logging.cpp
@@ -0,0 +1,757 @@
+//===-- xray_fdr_logging.cpp -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Here we implement the Flight Data Recorder mode for XRay, where we use
+// compact structures to store records in memory as well as when writing out the
+// data to files.
+//
+//===----------------------------------------------------------------------===//
+#include "xray_fdr_logging.h"
+#include <cassert>
+#include <errno.h>
+#include <limits>
+#include <memory>
+#include <pthread.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray/xray_interface.h"
+#include "xray/xray_records.h"
+#include "xray_allocator.h"
+#include "xray_buffer_queue.h"
+#include "xray_defs.h"
+#include "xray_fdr_controller.h"
+#include "xray_fdr_flags.h"
+#include "xray_fdr_log_writer.h"
+#include "xray_flags.h"
+#include "xray_recursion_guard.h"
+#include "xray_tsc.h"
+#include "xray_utils.h"
+
+namespace __xray {
+
+static atomic_sint32_t LoggingStatus = {
+    XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
+
+namespace {
+
+// Group together thread-local-data in a struct, then hide it behind a function
+// call so that it can be initialized on first use instead of as a global. We
+// force the alignment to 64-bytes for x86 cache line alignment, as this
+// structure is used in the hot path of implementation.
+struct XRAY_TLS_ALIGNAS(64) ThreadLocalData {
+  BufferQueue::Buffer Buffer{}; // Buffer obtained for this thread from BQ.
+  BufferQueue *BQ = nullptr;    // Queue that Buffer is obtained from.
+  // Raw storage in which setupTLD() placement-constructs the FDRLogWriter.
+  using LogWriterStorage =
+      typename std::aligned_storage<sizeof(FDRLogWriter),
+                                    alignof(FDRLogWriter)>::type;
+
+  LogWriterStorage LWStorage;
+  FDRLogWriter *Writer = nullptr; // Null until setupTLD() builds the writer.
+  // Raw storage in which setupTLD() placement-constructs the FDRController.
+  using ControllerStorage =
+      typename std::aligned_storage<sizeof(FDRController<>),
+                                    alignof(FDRController<>)>::type;
+  ControllerStorage CStorage;
+  FDRController<> *Controller = nullptr; // Null until built by setupTLD().
+};
+
+} // namespace
+
+static_assert(std::is_trivially_destructible<ThreadLocalData>::value,
+              "ThreadLocalData must be trivially destructible");
+
+// Use a global pthread key to identify thread-local data for logging.
+static pthread_key_t Key;
+
+// Global BufferQueue.
+static std::aligned_storage<sizeof(BufferQueue)>::type BufferQueueStorage;
+static BufferQueue *BQ = nullptr;
+
+// Global thresholds for function durations.
+static atomic_uint64_t ThresholdTicks{0};
+
+// Global for ticks per second.
+static atomic_uint64_t TicksPerSec{0};
+
+static atomic_sint32_t LogFlushStatus = {
+    XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
+
+// This function will initialize the thread-local data structure used by the FDR
+// logging implementation and return a reference to it. The implementation
+// details require a bit of care to maintain.
+//
+// First, some requirements on the implementation in general:
+//
+//   - XRay handlers should not call any memory allocation routines that may
+//     delegate to an instrumented implementation. This means functions like
+//     malloc() and free() should not be called while instrumenting.
+//
+//   - We would like to use some thread-local data initialized on first-use of
+//     the XRay instrumentation. These allow us to implement unsynchronized
+//     routines that access resources associated with the thread.
+//
+// The implementation here uses a few mechanisms that allow us to provide both
+// the requirements listed above. We do this by:
+//
+//   1. Using a thread-local aligned storage buffer for representing the
+//      ThreadLocalData struct. This data will be uninitialized memory by
+//      design.
+//
+//   2. Not requiring a thread exit handler/implementation, keeping the
+//      thread-local as purely a collection of references/data that do not
+//      require cleanup.
+//
+// We're doing this to avoid using a `thread_local` object that has a
+// non-trivial destructor, because the C++ runtime might call std::malloc(...)
+// to register calls to destructors. Deadlocks may arise when, for example, an
+// externally provided malloc implementation is XRay instrumented, and
+// initializing the thread-locals involves calling into malloc. A malloc
+// implementation that does global synchronization might be holding a lock for a
+// critical section, calling a function that might be XRay instrumented (and
+// thus in turn calling into malloc by virtue of registration of the
+// thread_local's destructor).
+#if XRAY_HAS_TLS_ALIGNAS
+static_assert(alignof(ThreadLocalData) >= 64,
+              "ThreadLocalData must be cache line aligned.");
+#endif
+static ThreadLocalData &getThreadLocalData() {
+  thread_local std::aligned_storage<sizeof(ThreadLocalData),
+                                    alignof(ThreadLocalData)>::type Storage{};
+  auto *TLD = reinterpret_cast<ThreadLocalData *>(&Storage);
+  // An unset pthread key means this thread has not built its data yet.
+  if (pthread_getspecific(Key) == NULL) {
+    new (TLD) ThreadLocalData{};
+    pthread_setspecific(Key, &Storage);
+  }
+  return *TLD;
+}
+
+// Returns the header fields shared by every FDR log; they are filled in once
+// on first use and the same object is handed back on every later call.
+static XRayFileHeader &fdrCommonHeaderInfo() {
+  static std::aligned_storage<sizeof(XRayFileHeader)>::type HStorage;
+  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
+  pthread_once(
+      &OnceInit, +[] {
+        auto &Hdr = reinterpret_cast<XRayFileHeader &>(HStorage);
+        // Version 2 of the log writes the extents of the buffer, instead of
+        // relying on an end-of-buffer record.
+        // Version 3 includes PID metadata record.
+        // Version 4 includes CPU data in the custom event records.
+        // Version 5 uses relative deltas for custom and typed event records,
+        // and removes the CPU data in custom event records (similar to how
+        // function records use deltas instead of full TSCs and rely on other
+        // metadata records for TSC wraparound and CPU migration).
+        Hdr.Version = 5;
+        Hdr.Type = FileTypes::FDR_LOG;
+
+        // Cache the cycle frequency: the TSC's when the required CPU features
+        // are present, otherwise NanosecondsPerSecond.
+        Hdr.CycleFrequency = probeRequiredCPUFeatures()
+                                 ? getTSCFrequency()
+                                 : NanosecondsPerSecond;
+
+        // FIXME: Actually check whether we have 'constant_tsc' and
+        // 'nonstop_tsc' before setting the values in the header.
+        Hdr.ConstantTSC = 1;
+        Hdr.NonstopTSC = 1;
+      });
+  return reinterpret_cast<XRayFileHeader &>(HStorage);
+}
+
+// This is the iterator implementation, which knows how to handle FDR-mode
+// specific buffers. It is installed through __xray_log_set_buffer_iterator(...)
+// and maintains global state of the buffer iteration for the currently
+// installed FDR mode buffers. In particular:
+//
+//   - If the argument represents the initial state of XRayBuffer ({nullptr, 0})
+//     then the iterator returns the header information.
+//   - If the argument represents the header information ({address of header
+//     info, size of the header info}) then it returns the first FDR buffer's
+//     address and extents.
+//   - It will keep returning the next buffer and extents as there are more
+//     buffers to process. When the input represents the last buffer, it will
+//     return the initial state to signal completion ({nullptr, 0}).
+//
+// See xray/xray_log_interface.h for more details on the requirements for the
+// implementations of __xray_log_set_buffer_iterator(...) and
+// __xray_log_process_buffers(...).
+XRayBuffer fdrIterator(const XRayBuffer B) {
+  DCHECK(internal_strcmp(__xray_log_get_current_mode(), "xray-fdr") == 0);
+  DCHECK(BQ->finalizing());
+  // NOTE(review): the DCHECK above dereferences BQ before it is null-checked.
+  if (BQ == nullptr || !BQ->finalizing()) {
+    if (Verbosity())
+      Report(
+          "XRay FDR: Failed global buffer queue is null or not finalizing!\n");
+    return {nullptr, 0};
+  }
+
+  // We use a global scratch-pad for the header information, which only gets
+  // initialized the first time this function is called. We'll update one part
+  // of this information with some relevant data (in particular the configured
+  // buffer size of the queue).
+  static std::aligned_storage<sizeof(XRayFileHeader)>::type HeaderStorage;
+  static pthread_once_t HeaderOnce = PTHREAD_ONCE_INIT;
+  pthread_once(
+      &HeaderOnce, +[] {
+        reinterpret_cast<XRayFileHeader &>(HeaderStorage) =
+            fdrCommonHeaderInfo();
+      });
+
+  // We use a convenience alias for code referring to Header from here on out.
+  auto &Header = reinterpret_cast<XRayFileHeader &>(HeaderStorage);
+  if (B.Data == nullptr && B.Size == 0) {
+    Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()};
+    return XRayBuffer{static_cast<void *>(&Header), sizeof(Header)};
+  }
+  // Iteration state is kept across calls in the function-local statics below.
+  static BufferQueue::const_iterator It{};
+  static BufferQueue::const_iterator End{};
+  static uint8_t *CurrentBuffer{nullptr};
+  static size_t SerializedBufferSize = 0;
+  if (B.Data == static_cast<void *>(&Header) && B.Size == sizeof(Header)) {
+    // From this point on, we provide raw access to the raw buffer we're getting
+    // from the BufferQueue. We're relying on the iterators from the current
+    // Buffer queue.
+    It = BQ->cbegin();
+    End = BQ->cend();
+  }
+  // Free the serialized copy handed out by the previous call, if any.
+  if (CurrentBuffer != nullptr) {
+    deallocateBuffer(CurrentBuffer, SerializedBufferSize);
+    CurrentBuffer = nullptr;
+  }
+  // Past the last buffer: return the initial state to signal completion.
+  if (It == End)
+    return {nullptr, 0};
+
+  // Set up the current buffer to contain the extents like we would when writing
+  // out to disk. The difference here would be that we still write "empty"
+  // buffers, or at least go through the iterators faithfully to let the
+  // handlers see the empty buffers in the queue.
+  //
+  // We need this atomic fence here to ensure that writes happening to the
+  // buffer have been committed before we load the extents atomically. Because
+  // the buffer is not explicitly synchronised across threads, we rely on the
+  // fence ordering to ensure that writes we expect to have been completed
+  // before the fence are fully committed before we read the extents.
+  atomic_thread_fence(memory_order_acquire);
+  auto BufferSize = atomic_load(It->Extents, memory_order_acquire);
+  SerializedBufferSize = BufferSize + sizeof(MetadataRecord);
+  CurrentBuffer = allocateBuffer(SerializedBufferSize);
+  if (CurrentBuffer == nullptr)
+    return {nullptr, 0};
+
+  // Write out the extents as a Metadata Record into the CurrentBuffer.
+  MetadataRecord ExtentsRecord;
+  ExtentsRecord.Type = uint8_t(RecordType::Metadata);
+  ExtentsRecord.RecordKind =
+      uint8_t(MetadataRecord::RecordKinds::BufferExtents);
+  internal_memcpy(ExtentsRecord.Data, &BufferSize, sizeof(BufferSize));
+  auto AfterExtents =
+      static_cast<char *>(internal_memcpy(CurrentBuffer, &ExtentsRecord,
+                                          sizeof(MetadataRecord))) +
+      sizeof(MetadataRecord);
+  internal_memcpy(AfterExtents, It->Data, BufferSize);
+  // Hand out the copy and advance, so the next call serves the next buffer.
+  XRayBuffer Result;
+  Result.Data = CurrentBuffer;
+  Result.Size = SerializedBufferSize;
+  ++It;
+  return Result;
+}
+
+// Writes the finalized buffers to the log file (unless 'no_file_flush' is set)
+// and resets the logging status; the implementation must be finalized first.
+XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT {
+  if (atomic_load(&LoggingStatus, memory_order_acquire) !=
+      XRayLogInitStatus::XRAY_LOG_FINALIZED) {
+    if (Verbosity())
+      Report("Not flushing log, implementation is not finalized.\n");
+    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+  }
+
+  // Check this before the CAS below, or LogFlushStatus would stay FLUSHING.
+  if (BQ == nullptr) {
+    if (Verbosity())
+      Report("Cannot flush when global buffer queue is null.\n");
+    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+  }
+
+  s32 Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+  if (!atomic_compare_exchange_strong(&LogFlushStatus, &Result,
+                                      XRayLogFlushStatus::XRAY_LOG_FLUSHING,
+                                      memory_order_release)) {
+    if (Verbosity())
+      Report("Not flushing log, implementation is still finalizing.\n");
+    return static_cast<XRayLogFlushStatus>(Result);
+  }
+
+  // We wait a number of milliseconds to allow threads to see that we've
+  // finalised before attempting to flush the log.
+  SleepForMillis(fdrFlags()->grace_period_ms);
+
+  // At this point, we're going to uninstall the iterator implementation, before
+  // we decide to do anything further with the global buffer queue.
+  __xray_log_remove_buffer_iterator();
+
+  // Once flushed, we should set the global status of the logging implementation
+  // to "uninitialized" to allow for FDR-logging multiple runs.
+  auto ResetToUnitialized = at_scope_exit([] {
+    atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+                 memory_order_release);
+  });
+
+  auto CleanupBuffers = at_scope_exit([] {
+    auto &TLD = getThreadLocalData();
+    if (TLD.Controller != nullptr)
+      TLD.Controller->flush();
+  });
+
+  if (fdrFlags()->no_file_flush) {
+    if (Verbosity())
+      Report("XRay FDR: Not flushing to file, 'no_file_flush=true'.\n");
+    atomic_store(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
+                 memory_order_release);
+    return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
+  }
+
+  // We write out the file in the following format:
+  //
+  //   1) We write down the XRay file header from fdrCommonHeaderInfo().
+  //   2) Then we use the 'apply' member of the BufferQueue that's live, to
+  //      ensure that at this point in time we write down the buffers that have
+  //      been released (and marked "used") -- we dump the full buffer for now
+  //      (fixed-sized) and let the tools reading the buffers deal with the data
+  //      afterwards.
+  LogWriter *LW = LogWriter::Open();
+  if (LW == nullptr) {
+    auto Status = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+    atomic_store(&LogFlushStatus, Status, memory_order_release);
+    return Status;
+  }
+
+  XRayFileHeader Header = fdrCommonHeaderInfo();
+  Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()};
+  LW->WriteAll(reinterpret_cast<char *>(&Header),
+               reinterpret_cast<char *>(&Header) + sizeof(Header));
+
+  // Release the current thread's buffer before we attempt to write out all the
+  // buffers. This ensures that in case we had only a single thread going, that
+  // we are able to capture the data nonetheless.
+  auto &TLD = getThreadLocalData();
+  if (TLD.Controller != nullptr)
+    TLD.Controller->flush();
+
+  BQ->apply([&](const BufferQueue::Buffer &B) {
+    // Starting at version 2 of the FDR logging implementation, we only write
+    // the records identified by the extents of the buffer. We use the Extents
+    // from the Buffer and write that out as the first record in the buffer.  We
+    // still use a Metadata record, but fill in the extents instead for the
+    // data.
+    MetadataRecord ExtentsRecord;
+    auto BufferExtents = atomic_load(B.Extents, memory_order_acquire);
+    DCHECK(BufferExtents <= B.Size);
+    ExtentsRecord.Type = uint8_t(RecordType::Metadata);
+    ExtentsRecord.RecordKind =
+        uint8_t(MetadataRecord::RecordKinds::BufferExtents);
+    internal_memcpy(ExtentsRecord.Data, &BufferExtents, sizeof(BufferExtents));
+    if (BufferExtents > 0) {
+      LW->WriteAll(reinterpret_cast<char *>(&ExtentsRecord),
+                   reinterpret_cast<char *>(&ExtentsRecord) +
+                       sizeof(MetadataRecord));
+      LW->WriteAll(reinterpret_cast<char *>(B.Data),
+                   reinterpret_cast<char *>(B.Data) + BufferExtents);
+    }
+  });
+
+  atomic_store(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
+               memory_order_release);
+  return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
+}
+
+XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT {
+  s32 Expected = XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+  const bool Claimed = atomic_compare_exchange_strong(
+      &LoggingStatus, &Expected, XRayLogInitStatus::XRAY_LOG_FINALIZING,
+      memory_order_release);
+  if (!Claimed) {
+    if (Verbosity())
+      Report("Cannot finalize log, implementation not initialized.\n");
+    return static_cast<XRayLogInitStatus>(Expected);
+  }
+
+  // Finalize the global queue so that no more operations can be performed
+  // until re-initialized; a missing queue is only worth a verbose report.
+  if (BQ != nullptr) {
+    BQ->finalize();
+  } else if (Verbosity()) {
+    Report("Attempting to finalize an uninitialized global buffer!\n");
+  }
+
+  atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED,
+               memory_order_release);
+  return XRayLogInitStatus::XRAY_LOG_FINALIZED;
+}
+
+struct TSCAndCPU {
+  uint64_t TSC = 0;      // Timestamp as produced by getTimestamp().
+  unsigned char CPU = 0; // CPU filled in by readTSC(); 0 on the fallback path.
+};
+
+static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT {
+  // We want to get the TSC as early as possible, so that we can check whether
+  // we've seen this CPU before. We also do it before we load anything else,
+  // to allow for forward progress with the scheduling.
+  TSCAndCPU Result;
+
+  // Test once for required CPU features
+  static pthread_once_t OnceProbe = PTHREAD_ONCE_INIT;
+  static bool TSCSupported = true;
+  pthread_once(
+      &OnceProbe, +[] { TSCSupported = probeRequiredCPUFeatures(); });
+  // Without a usable TSC, fall back to wall-clock nanoseconds and CPU 0.
+  if (TSCSupported) {
+    Result.TSC = __xray::readTSC(Result.CPU);
+  } else {
+    // FIXME: This code needs refactoring as it appears in multiple locations
+    timespec TS;
+    int result = clock_gettime(CLOCK_REALTIME, &TS);
+    if (result != 0) {
+      Report("clock_gettime(2) return %d, errno=%d\n", result, int(errno));
+      TS = {0, 0};
+    }
+    Result.CPU = 0;
+    Result.TSC = TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec;
+  }
+  return Result;
+}
+
+thread_local atomic_uint8_t Running{0};
+
+static bool setupTLD(ThreadLocalData &TLD) XRAY_NEVER_INSTRUMENT {
+  // When finalizing or finalized, flush and drop the controller, then bail out.
+  {
+    auto Status = atomic_load(&LoggingStatus, memory_order_acquire);
+    if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING ||
+        Status == XRayLogInitStatus::XRAY_LOG_FINALIZED) {
+      if (TLD.Controller != nullptr) {
+        TLD.Controller->flush();
+        TLD.Controller = nullptr;
+      }
+      return false;
+    }
+  }
+  // Lazily (re)build the per-thread state when there is no live controller.
+  if (UNLIKELY(TLD.Controller == nullptr)) {
+    // Set up the TLD buffer queue.
+    if (UNLIKELY(BQ == nullptr))
+      return false;
+    TLD.BQ = BQ;
+
+    // Check that we have a valid buffer.
+    if (TLD.Buffer.Generation != BQ->generation() &&
+        TLD.BQ->releaseBuffer(TLD.Buffer) != BufferQueue::ErrorCode::Ok)
+      return false;
+
+    // Set up a buffer, before setting up the log writer. Bail out on failure.
+    if (TLD.BQ->getBuffer(TLD.Buffer) != BufferQueue::ErrorCode::Ok)
+      return false;
+
+    // Set up the Log Writer for this thread.
+    if (UNLIKELY(TLD.Writer == nullptr)) {
+      auto *LWStorage = reinterpret_cast<FDRLogWriter *>(&TLD.LWStorage);
+      new (LWStorage) FDRLogWriter(TLD.Buffer);
+      TLD.Writer = LWStorage;
+    } else {
+      TLD.Writer->resetRecord();
+    }
+    // Placement-construct the controller in this thread's CStorage.
+    auto *CStorage = reinterpret_cast<FDRController<> *>(&TLD.CStorage);
+    new (CStorage)
+        FDRController<>(TLD.BQ, TLD.Buffer, *TLD.Writer, clock_gettime,
+                        atomic_load_relaxed(&ThresholdTicks));
+    TLD.Controller = CStorage;
+  }
+  // Reaching this point means TLD.Controller is set and ready for the caller.
+  DCHECK_NE(TLD.Controller, nullptr);
+  return true;
+}
+
+void fdrLoggingHandleArg0(int32_t FuncId,
+                          XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
+  // Take the timestamp before any bookkeeping, then guard against re-entry.
+  auto Stamp = getTimestamp();
+  RecursionGuard Guard{Running};
+  if (!Guard)
+    return;
+
+  auto &TLD = getThreadLocalData();
+  if (!setupTLD(TLD))
+    return;
+
+  auto *Ctl = TLD.Controller;
+  switch (Entry) {
+  case XRayEntryType::ENTRY:
+  case XRayEntryType::LOG_ARGS_ENTRY:
+    Ctl->functionEnter(FuncId, Stamp.TSC, Stamp.CPU);
+    break;
+  case XRayEntryType::EXIT:
+    Ctl->functionExit(FuncId, Stamp.TSC, Stamp.CPU);
+    break;
+  case XRayEntryType::TAIL:
+    Ctl->functionTailExit(FuncId, Stamp.TSC, Stamp.CPU);
+    break;
+  case XRayEntryType::CUSTOM_EVENT:
+  case XRayEntryType::TYPED_EVENT:
+    break; // Nothing is recorded for event entry types here.
+  }
+}
+
+void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType Entry,
+                          uint64_t Arg) XRAY_NEVER_INSTRUMENT {
+  // Take the timestamp before any bookkeeping, then guard against re-entry.
+  auto Stamp = getTimestamp();
+  RecursionGuard Guard{Running};
+  if (!Guard)
+    return;
+
+  auto &TLD = getThreadLocalData();
+  if (!setupTLD(TLD))
+    return;
+
+  auto *Ctl = TLD.Controller;
+  switch (Entry) {
+  case XRayEntryType::ENTRY:
+  case XRayEntryType::LOG_ARGS_ENTRY:
+    Ctl->functionEnterArg(FuncId, Stamp.TSC, Stamp.CPU, Arg);
+    break;
+  case XRayEntryType::EXIT:
+    Ctl->functionExit(FuncId, Stamp.TSC, Stamp.CPU);
+    break;
+  case XRayEntryType::TAIL:
+    Ctl->functionTailExit(FuncId, Stamp.TSC, Stamp.CPU);
+    break;
+  case XRayEntryType::CUSTOM_EVENT:
+  case XRayEntryType::TYPED_EVENT:
+    break; // Nothing is recorded for event entry types here.
+  }
+}
+
+// Records a custom event; oversized payloads are reported once and clamped.
+void fdrLoggingHandleCustomEvent(void *Event,
+                                 std::size_t EventSize) XRAY_NEVER_INSTRUMENT {
+  auto TC = getTimestamp();
+  RecursionGuard Guard{Running};
+  if (!Guard)
+    return;
+
+  // Clamp before narrowing: a plain cast of an oversized EventSize would wrap
+  // to a bogus (possibly negative) int32_t instead of truncating as reported.
+  constexpr std::size_t MaxSize = std::numeric_limits<int32_t>::max();
+  if (EventSize > MaxSize) {
+    static pthread_once_t Once = PTHREAD_ONCE_INIT;
+    pthread_once(
+        &Once, +[] {
+          Report("Custom event size too large; truncating to %d.\n",
+                 std::numeric_limits<int32_t>::max());
+        });
+    EventSize = MaxSize;
+  }
+
+  auto &TLD = getThreadLocalData();
+  if (!setupTLD(TLD))
+    return;
+
+  TLD.Controller->customEvent(TC.TSC, TC.CPU, Event,
+                              static_cast<int32_t>(EventSize));
+}
+
+// Records a typed event; oversized payloads are reported once and clamped.
+void fdrLoggingHandleTypedEvent(
+    uint16_t EventType, const void *Event,
+    std::size_t EventSize) noexcept XRAY_NEVER_INSTRUMENT {
+  auto TC = getTimestamp();
+  RecursionGuard Guard{Running};
+  if (!Guard)
+    return;
+
+  // Clamp before narrowing: a plain cast of an oversized EventSize would wrap
+  // to a bogus (possibly negative) int32_t instead of truncating as reported.
+  constexpr std::size_t MaxSize = std::numeric_limits<int32_t>::max();
+  if (EventSize > MaxSize) {
+    static pthread_once_t Once = PTHREAD_ONCE_INIT;
+    pthread_once(
+        &Once, +[] {
+          Report("Typed event size too large; truncating to %d.\n",
+                 std::numeric_limits<int32_t>::max());
+        });
+    EventSize = MaxSize;
+  }
+
+  auto &TLD = getThreadLocalData();
+  if (!setupTLD(TLD))
+    return;
+
+  TLD.Controller->typedEvent(TC.TSC, TC.CPU, EventType, Event,
+                             static_cast<int32_t>(EventSize));
+}
+
+// Configures FDR mode from the option strings and installs its handlers.
+XRayLogInitStatus fdrLoggingInit(size_t, size_t, void *Options,
+                                 size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
+  if (Options == nullptr)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  if (!atomic_compare_exchange_strong(&LoggingStatus, &CurrentStatus,
+                                      XRayLogInitStatus::XRAY_LOG_INITIALIZING,
+                                      memory_order_release)) {
+    if (Verbosity())
+      Report("Cannot initialize already initialized implementation.\n");
+    return static_cast<XRayLogInitStatus>(CurrentStatus);
+  }
+
+  if (Verbosity())
+    Report("Initializing FDR mode with options: %s\n",
+           static_cast<const char *>(Options));
+
+  // TODO: Factor out the flags specific to the FDR mode implementation. For
+  // now, use the global/single definition of the flags, since the FDR mode
+  // flags are already defined there.
+  FlagParser FDRParser;
+  FDRFlags FDRFlags;
+  registerXRayFDRFlags(&FDRParser, &FDRFlags);
+  FDRFlags.setDefaults();
+
+  // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided
+  // options until we migrate everyone to use the XRAY_FDR_OPTIONS
+  // compiler-provided options.
+  FDRParser.ParseString(useCompilerDefinedFlags());
+  FDRParser.ParseString(useCompilerDefinedFDRFlags());
+  auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS");
+  if (EnvOpts == nullptr)
+    EnvOpts = "";
+  FDRParser.ParseString(EnvOpts);
+
+  // FIXME: Remove this when we fully remove the deprecated flags.
+  if (internal_strlen(EnvOpts) == 0) {
+    FDRFlags.func_duration_threshold_us =
+        flags()->xray_fdr_log_func_duration_threshold_us;
+    FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms;
+  }
+
+  // The provided options should always override the compiler-provided and
+  // environment-variable defined options.
+  FDRParser.ParseString(static_cast<const char *>(Options));
+  *fdrFlags() = FDRFlags;
+
+  // Roll the status back on failure, or no later init could ever succeed.
+  bool Ready = false;
+  if (BQ == nullptr) {
+    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
+    new (BQ) BufferQueue(FDRFlags.buffer_size, FDRFlags.buffer_max, Ready);
+    if (!Ready)
+      Report("BufferQueue init failed.\n");
+  } else {
+    Ready = BQ->init(FDRFlags.buffer_size, FDRFlags.buffer_max) ==
+            BufferQueue::ErrorCode::Ok;
+    if (!Ready && Verbosity())
+      Report("Failed to re-initialize global buffer queue. Init failed.\n");
+  }
+  if (!Ready) {
+    atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+                 memory_order_release);
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  }
+
+  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
+  pthread_once(
+      &OnceInit, +[] {
+        atomic_store(&TicksPerSec,
+                     probeRequiredCPUFeatures() ? getTSCFrequency()
+                                                : __xray::NanosecondsPerSecond,
+                     memory_order_release);
+        pthread_key_create(
+            &Key, +[](void *TLDPtr) {
+              if (TLDPtr == nullptr)
+                return;
+              auto &TLD = *reinterpret_cast<ThreadLocalData *>(TLDPtr);
+              if (TLD.BQ == nullptr)
+                return;
+              if (TLD.Buffer.Data == nullptr)
+                return;
+              auto EC = TLD.BQ->releaseBuffer(TLD.Buffer);
+              if (EC != BufferQueue::ErrorCode::Ok)
+                Report("At thread exit, failed to release buffer at %p; "
+                       "error=%s\n",
+                       TLD.Buffer.Data, BufferQueue::getErrorString(EC));
+            });
+      });
+
+  atomic_store(&ThresholdTicks,
+               atomic_load_relaxed(&TicksPerSec) *
+                   fdrFlags()->func_duration_threshold_us / 1000000,
+               memory_order_release);
+  // Arg1 handler should go in first to avoid concurrent code accidentally
+  // falling back to arg0 when it should have run arg1.
+  __xray_set_handler_arg1(fdrLoggingHandleArg1);
+  // Install the actual handleArg0 handler after initialising the buffers.
+  __xray_set_handler(fdrLoggingHandleArg0);
+  __xray_set_customevent_handler(fdrLoggingHandleCustomEvent);
+  __xray_set_typedevent_handler(fdrLoggingHandleTypedEvent);
+  // Install the buffer iterator implementation.
+  __xray_log_set_buffer_iterator(fdrIterator);
+
+  atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZED,
+               memory_order_release);
+  if (Verbosity())
+    Report("XRay FDR init successful.\n");
+  return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+}
+
+// Registers FDR mode under the name "xray-fdr" and selects it when the
+// xray_fdr_log flag is set or xray_mode names it. Returns false when either
+// step fails, regardless of verbosity; verbosity only controls the report.
+bool fdrLogDynamicInitializer() XRAY_NEVER_INSTRUMENT {
+  XRayLogImpl Impl{fdrLoggingInit, fdrLoggingFinalize, fdrLoggingHandleArg0,
+                   fdrLoggingFlush};
+  auto RegistrationResult = __xray_log_register_mode("xray-fdr", Impl);
+  if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) {
+    if (Verbosity())
+      Report("Cannot register XRay FDR mode to 'xray-fdr'; error = %d\n",
+             RegistrationResult);
+    return false;
+  }
+
+  // Only make FDR the active mode when it was explicitly requested via flags.
+  if (flags()->xray_fdr_log ||
+      !internal_strcmp(flags()->xray_mode, "xray-fdr")) {
+    auto SelectResult = __xray_log_select_mode("xray-fdr");
+    if (SelectResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) {
+      if (Verbosity())
+        Report("Cannot select XRay FDR mode as 'xray-fdr'; error = %d\n",
+               SelectResult);
+      return false;
+    }
+  }
+  return true;
+}
+
+} // namespace __xray
+
+static auto UNUSED Unused = __xray::fdrLogDynamicInitializer();
diff --git a/lib/xray/xray_flags.cc b/lib/xray/xray_flags.cc
deleted file mode 100644
index b9e8324a7874..000000000000
--- a/lib/xray/xray_flags.cc
+++ /dev/null
@@ -1,84 +0,0 @@
-//===-- xray_flags.cc -------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// XRay flag parsing logic.
-//===----------------------------------------------------------------------===//
-
-#include "xray_flags.h"
-#include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_flag_parser.h"
-#include "sanitizer_common/sanitizer_libc.h"
-#include "xray_defs.h"
-
-using namespace __sanitizer;
-
-namespace __xray {
-
-Flags xray_flags_dont_use_directly; // use via flags().
-
-void Flags::setDefaults() XRAY_NEVER_INSTRUMENT {
-#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
-#include "xray_flags.inc"
-#undef XRAY_FLAG
-}
-
-void registerXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT {
-#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \
-  RegisterFlag(P, #Name, Description, &F->Name);
-#include "xray_flags.inc"
-#undef XRAY_FLAG
-}
-
-// This function, as defined with the help of a macro meant to be introduced at
-// build time of the XRay runtime, passes in a statically defined list of
-// options that control XRay. This means users/deployments can tweak the
-// defaults that override the hard-coded defaults in the xray_flags.inc at
-// compile-time using the XRAY_DEFAULT_OPTIONS macro.
-const char *useCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
-#ifdef XRAY_DEFAULT_OPTIONS
-  // Do the double-layered string conversion to prevent badly crafted strings
-  // provided through the XRAY_DEFAULT_OPTIONS from causing compilation issues
-  // (or changing the semantics of the implementation through the macro). This
-  // ensures that we convert whatever XRAY_DEFAULT_OPTIONS is defined as a
-  // string literal.
-  return SANITIZER_STRINGIFY(XRAY_DEFAULT_OPTIONS);
-#else
-  return "";
-#endif
-}
-
-void initializeFlags() XRAY_NEVER_INSTRUMENT {
-  SetCommonFlagsDefaults();
-  auto *F = flags();
-  F->setDefaults();
-
-  FlagParser XRayParser;
-  registerXRayFlags(&XRayParser, F);
-  RegisterCommonFlags(&XRayParser);
-
-  // Use options defaulted at compile-time for the runtime.
-  const char *XRayCompileFlags = useCompilerDefinedFlags();
-  XRayParser.ParseString(XRayCompileFlags);
-
-  // Override from environment variables.
-  XRayParser.ParseStringFromEnv("XRAY_OPTIONS");
-
-  // Override from command line.
-  InitializeCommonFlags();
-
-  if (Verbosity())
-    ReportUnrecognizedFlags();
-
-  if (common_flags()->help) {
-    XRayParser.PrintFlagDescriptions();
-  }
-}
-
-} // namespace __xray
diff --git a/lib/xray/xray_flags.cpp b/lib/xray/xray_flags.cpp
new file mode 100644
index 000000000000..e4c6906dc443
--- /dev/null
+++ b/lib/xray/xray_flags.cpp
@@ -0,0 +1,84 @@
+//===-- xray_flags.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay flag parsing logic.
+//===----------------------------------------------------------------------===//
+
+#include "xray_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray_defs.h"
+
+using namespace __sanitizer;
+
+namespace __xray {
+
+Flags xray_flags_dont_use_directly; // use via flags().
+
+void Flags::setDefaults() XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "xray_flags.inc"
+#undef XRAY_FLAG
+}
+
+void registerXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \
+  RegisterFlag(P, #Name, Description, &F->Name);
+#include "xray_flags.inc"
+#undef XRAY_FLAG
+}
+
+// Returns the XRay option string baked in when the runtime was built. If the
+// XRAY_DEFAULT_OPTIONS macro is defined at build time, its contents are
+// returned as a string literal; otherwise the empty string is returned. This
+// lets users/deployments override the hard-coded defaults from xray_flags.inc
+// at compile time of the runtime, without editing the .inc file.
+const char *useCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
+#ifdef XRAY_DEFAULT_OPTIONS
+  // Stringify the macro rather than using its expansion directly, so that a
+  // badly crafted XRAY_DEFAULT_OPTIONS value cannot cause compilation issues
+  // or change the semantics of this function. SANITIZER_STRINGIFY is defined
+  // outside this file; presumably it is the usual two-level stringification
+  // that expands the macro before turning it into a string literal.
+  return SANITIZER_STRINGIFY(XRAY_DEFAULT_OPTIONS);
+#else
+  return "";
+#endif
+}
+
+void initializeFlags() XRAY_NEVER_INSTRUMENT {
+  SetCommonFlagsDefaults();
+  auto *F = flags();
+  F->setDefaults();
+
+  FlagParser XRayParser;
+  registerXRayFlags(&XRayParser, F);
+  RegisterCommonFlags(&XRayParser);
+
+  // Lowest precedence after the defaults: options baked in at build time.
+  const char *XRayCompileFlags = useCompilerDefinedFlags();
+  XRayParser.ParseString(XRayCompileFlags);
+
+  // Parsed second, so XRAY_OPTIONS from the environment overrides the above.
+  XRayParser.ParseStringFromEnv("XRAY_OPTIONS");
+
+  // NOTE(review): originally "Override from command line"; no argv is passed.
+  InitializeCommonFlags();
+
+  if (Verbosity())
+    ReportUnrecognizedFlags();
+
+  if (common_flags()->help) {
+    XRayParser.PrintFlagDescriptions();
+  }
+}
+
+} // namespace __xray
diff --git a/lib/xray/xray_init.cc b/lib/xray/xray_init.cc
deleted file mode 100644
index b79bc08c5f4d..000000000000
--- a/lib/xray/xray_init.cc
+++ /dev/null
@@ -1,115 +0,0 @@
-//===-- xray_init.cc --------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// XRay initialisation logic.
-//===----------------------------------------------------------------------===//
-
-#include <fcntl.h>
-#include <strings.h>
-#include <unistd.h>
-
-#include "sanitizer_common/sanitizer_common.h"
-#include "xray_defs.h"
-#include "xray_flags.h"
-#include "xray_interface_internal.h"
-
-extern "C" {
-void __xray_init();
-extern const XRaySledEntry __start_xray_instr_map[] __attribute__((weak));
-extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak));
-extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak));
-extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak));
-
-#if SANITIZER_MAC
-// HACK: This is a temporary workaround to make XRay build on 
-// Darwin, but it will probably not work at runtime.
-const XRaySledEntry __start_xray_instr_map[] = {};
-extern const XRaySledEntry __stop_xray_instr_map[] = {};
-extern const XRayFunctionSledIndex __start_xray_fn_idx[] = {};
-extern const XRayFunctionSledIndex __stop_xray_fn_idx[] = {};
-#endif
-}
-
-using namespace __xray;
-
-// When set to 'true' this means the XRay runtime has been initialised. We use
-// the weak symbols defined above (__start_xray_inst_map and
-// __stop_xray_instr_map) to initialise the instrumentation map that XRay uses
-// for runtime patching/unpatching of instrumentation points.
-//
-// FIXME: Support DSO instrumentation maps too. The current solution only works
-// for statically linked executables.
-atomic_uint8_t XRayInitialized{0};
-
-// This should always be updated before XRayInitialized is updated.
-SpinMutex XRayInstrMapMutex;
-XRaySledMap XRayInstrMap;
-
-// Global flag to determine whether the flags have been initialized.
-atomic_uint8_t XRayFlagsInitialized{0};
-
-// A mutex to allow only one thread to initialize the XRay data structures.
-SpinMutex XRayInitMutex;
-
-// __xray_init() will do the actual loading of the current process' memory map
-// and then proceed to look for the .xray_instr_map section/segment.
-void __xray_init() XRAY_NEVER_INSTRUMENT {
-  SpinMutexLock Guard(&XRayInitMutex);
-  // Short-circuit if we've already initialized XRay before.
-  if (atomic_load(&XRayInitialized, memory_order_acquire))
-    return;
-
-  // XRAY is not compatible with PaX MPROTECT
-  CheckMPROTECT();
-
-  if (!atomic_load(&XRayFlagsInitialized, memory_order_acquire)) {
-    initializeFlags();
-    atomic_store(&XRayFlagsInitialized, true, memory_order_release);
-  }
-
-  if (__start_xray_instr_map == nullptr) {
-    if (Verbosity())
-      Report("XRay instrumentation map missing. Not initializing XRay.\n");
-    return;
-  }
-
-  {
-    SpinMutexLock Guard(&XRayInstrMapMutex);
-    XRayInstrMap.Sleds = __start_xray_instr_map;
-    XRayInstrMap.Entries = __stop_xray_instr_map - __start_xray_instr_map;
-    XRayInstrMap.SledsIndex = __start_xray_fn_idx;
-    XRayInstrMap.Functions = __stop_xray_fn_idx - __start_xray_fn_idx;
-  }
-  atomic_store(&XRayInitialized, true, memory_order_release);
-
-#ifndef XRAY_NO_PREINIT
-  if (flags()->patch_premain)
-    __xray_patch();
-#endif
-}
-
-// FIXME: Make check-xray tests work on FreeBSD without
-// SANITIZER_CAN_USE_PREINIT_ARRAY.
-// See sanitizer_internal_defs.h where the macro is defined.
-// Calling unresolved PLT functions in .preinit_array can lead to deadlock on
-// FreeBSD but here it seems benign.
-#if !defined(XRAY_NO_PREINIT) &&                                               \
-    (SANITIZER_CAN_USE_PREINIT_ARRAY || SANITIZER_FREEBSD)
-// Only add the preinit array initialization if the sanitizers can.
-__attribute__((section(".preinit_array"),
-               used)) void (*__local_xray_preinit)(void) = __xray_init;
-#else
-// If we cannot use the .preinit_array section, we should instead use dynamic
-// initialisation.
-__attribute__ ((constructor (0)))
-static void __local_xray_dyninit() {
-  __xray_init();
-}
-#endif
diff --git a/lib/xray/xray_init.cpp b/lib/xray/xray_init.cpp
new file mode 100644
index 000000000000..408396477975
--- /dev/null
+++ b/lib/xray/xray_init.cpp
@@ -0,0 +1,115 @@
+//===-- xray_init.cpp -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay initialisation logic.
+//===----------------------------------------------------------------------===//
+
+#include <fcntl.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray_defs.h"
+#include "xray_flags.h"
+#include "xray_interface_internal.h"
+
+extern "C" {
+void __xray_init();
+extern const XRaySledEntry __start_xray_instr_map[] __attribute__((weak));
+extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak));
+extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak));
+extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak));
+
+#if SANITIZER_MAC
+// HACK: This is a temporary workaround to make XRay build on Darwin, but it
+// will probably not work at runtime.
+const XRaySledEntry __start_xray_instr_map[] = {};
+extern const XRaySledEntry __stop_xray_instr_map[] = {};
+extern const XRayFunctionSledIndex __start_xray_fn_idx[] = {};
+extern const XRayFunctionSledIndex __stop_xray_fn_idx[] = {};
+#endif
+}
+
+using namespace __xray;
+
+// When set to 'true' this means the XRay runtime has been initialised. We use
+// the weak symbols defined above (__start_xray_instr_map and
+// __stop_xray_instr_map) to initialise the instrumentation map that XRay uses
+// for runtime patching/unpatching of instrumentation points.
+//
+// FIXME: Support DSO instrumentation maps too. The current solution only works
+// for statically linked executables.
+atomic_uint8_t XRayInitialized{0};
+
+// This should always be updated before XRayInitialized is updated.
+SpinMutex XRayInstrMapMutex;
+XRaySledMap XRayInstrMap;
+
+// Global flag to determine whether the flags have been initialized.
+atomic_uint8_t XRayFlagsInitialized{0};
+
+// A mutex to allow only one thread to initialize the XRay data structures.
+SpinMutex XRayInitMutex;
+
+// __xray_init() sets up the flags (once) and then records the sled map that
+// the weak __start_xray_* and __stop_xray_* symbols delimit, under a mutex.
+void __xray_init() XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayInitMutex);
+  // Nothing to do if an earlier call already set XRayInitialized.
+  if (atomic_load(&XRayInitialized, memory_order_acquire))
+    return;
+
+  // XRay is not compatible with PaX MPROTECT.
+  CheckMPROTECT();
+
+  if (!atomic_load(&XRayFlagsInitialized, memory_order_acquire)) {
+    initializeFlags();
+    atomic_store(&XRayFlagsInitialized, true, memory_order_release);
+  }
+
+  if (__start_xray_instr_map == nullptr) {
+    if (Verbosity())
+      Report("XRay instrumentation map missing. Not initializing XRay.\n");
+    return;
+  }
+
+  {
+    SpinMutexLock Guard(&XRayInstrMapMutex);
+    XRayInstrMap.Sleds = __start_xray_instr_map;
+    XRayInstrMap.Entries = __stop_xray_instr_map - __start_xray_instr_map;
+    XRayInstrMap.SledsIndex = __start_xray_fn_idx;
+    XRayInstrMap.Functions = __stop_xray_fn_idx - __start_xray_fn_idx;
+  }
+  atomic_store(&XRayInitialized, true, memory_order_release);
+
+#ifndef XRAY_NO_PREINIT
+  if (flags()->patch_premain)
+    __xray_patch();
+#endif
+}
+
+// FIXME: Make check-xray tests work on FreeBSD without
+// SANITIZER_CAN_USE_PREINIT_ARRAY.
+// See sanitizer_internal_defs.h where the macro is defined.
+// Calling unresolved PLT functions in .preinit_array can lead to deadlock on
+// FreeBSD but here it seems benign.
+#if !defined(XRAY_NO_PREINIT) &&                                               \
+    (SANITIZER_CAN_USE_PREINIT_ARRAY || SANITIZER_FREEBSD)
+// Only add the preinit array initialization if the sanitizers can.
+__attribute__((section(".preinit_array"),
+               used)) void (*__local_xray_preinit)(void) = __xray_init;
+#else
+// If we cannot use the .preinit_array section, we should instead use dynamic
+// initialisation.
+__attribute__ ((constructor (0)))
+static void __local_xray_dyninit() {
+  __xray_init();
+}
+#endif
diff --git a/lib/xray/xray_interface.cc b/lib/xray/xray_interface.cc
deleted file mode 100644
index 0d22893eb30f..000000000000
--- a/lib/xray/xray_interface.cc
+++ /dev/null
@@ -1,480 +0,0 @@
-//===-- xray_interface.cpp --------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// Implementation of the API functions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "xray_interface_internal.h"
-
-#include <cstdint>
-#include <cstdio>
-#include <errno.h>
-#include <limits>
-#include <string.h>
-#include <sys/mman.h>
-
-#if SANITIZER_FUCHSIA
-#include <zircon/process.h>
-#include <zircon/sanitizer.h>
-#include <zircon/status.h>
-#include <zircon/syscalls.h>
-#endif
-
-#include "sanitizer_common/sanitizer_addrhashmap.h"
-#include "sanitizer_common/sanitizer_common.h"
-
-#include "xray_defs.h"
-#include "xray_flags.h"
-
-extern __sanitizer::SpinMutex XRayInstrMapMutex;
-extern __sanitizer::atomic_uint8_t XRayInitialized;
-extern __xray::XRaySledMap XRayInstrMap;
-
-namespace __xray {
-
-#if defined(__x86_64__)
-static const int16_t cSledLength = 12;
-#elif defined(__aarch64__)
-static const int16_t cSledLength = 32;
-#elif defined(__arm__)
-static const int16_t cSledLength = 28;
-#elif SANITIZER_MIPS32
-static const int16_t cSledLength = 48;
-#elif SANITIZER_MIPS64
-static const int16_t cSledLength = 64;
-#elif defined(__powerpc64__)
-static const int16_t cSledLength = 8;
-#else
-#error "Unsupported CPU Architecture"
-#endif /* CPU architecture */
-
-// This is the function to call when we encounter the entry or exit sleds.
-atomic_uintptr_t XRayPatchedFunction{0};
-
-// This is the function to call from the arg1-enabled sleds/trampolines.
-atomic_uintptr_t XRayArgLogger{0};
-
-// This is the function to call when we encounter a custom event log call.
-atomic_uintptr_t XRayPatchedCustomEvent{0};
-
-// This is the function to call when we encounter a typed event log call.
-atomic_uintptr_t XRayPatchedTypedEvent{0};
-
-// This is the global status to determine whether we are currently
-// patching/unpatching.
-atomic_uint8_t XRayPatching{0};
-
-struct TypeDescription {
-  uint32_t type_id;
-  std::size_t description_string_length;
-};
-
-using TypeDescriptorMapType = AddrHashMap<TypeDescription, 11>;
-// An address map from immutable descriptors to type ids.
-TypeDescriptorMapType TypeDescriptorAddressMap{};
-
-atomic_uint32_t TypeEventDescriptorCounter{0};
-
-// MProtectHelper is an RAII wrapper for calls to mprotect(...) that will
-// undo any successful mprotect(...) changes. This is used to make a page
-// writeable and executable, and upon destruction if it was successful in
-// doing so returns the page into a read-only and executable page.
-//
-// This is only used specifically for runtime-patching of the XRay
-// instrumentation points. This assumes that the executable pages are
-// originally read-and-execute only.
-class MProtectHelper {
-  void *PageAlignedAddr;
-  std::size_t MProtectLen;
-  bool MustCleanup;
-
-public:
-  explicit MProtectHelper(void *PageAlignedAddr,
-                          std::size_t MProtectLen,
-                          std::size_t PageSize) XRAY_NEVER_INSTRUMENT
-      : PageAlignedAddr(PageAlignedAddr),
-        MProtectLen(MProtectLen),
-        MustCleanup(false) {
-#if SANITIZER_FUCHSIA
-    MProtectLen = RoundUpTo(MProtectLen, PageSize);
-#endif
-  }
-
-  int MakeWriteable() XRAY_NEVER_INSTRUMENT {
-#if SANITIZER_FUCHSIA
-    auto R = __sanitizer_change_code_protection(
-        reinterpret_cast<uintptr_t>(PageAlignedAddr), MProtectLen, true);
-    if (R != ZX_OK) {
-      Report("XRay: cannot change code protection: %s\n",
-             _zx_status_get_string(R));
-      return -1;
-    }
-    MustCleanup = true;
-    return 0;
-#else
-    auto R = mprotect(PageAlignedAddr, MProtectLen,
-                      PROT_READ | PROT_WRITE | PROT_EXEC);
-    if (R != -1)
-      MustCleanup = true;
-    return R;
-#endif
-  }
-
-  ~MProtectHelper() XRAY_NEVER_INSTRUMENT {
-    if (MustCleanup) {
-#if SANITIZER_FUCHSIA
-      auto R = __sanitizer_change_code_protection(
-          reinterpret_cast<uintptr_t>(PageAlignedAddr), MProtectLen, false);
-      if (R != ZX_OK) {
-        Report("XRay: cannot change code protection: %s\n",
-               _zx_status_get_string(R));
-      }
-#else
-      mprotect(PageAlignedAddr, MProtectLen, PROT_READ | PROT_EXEC);
-#endif
-    }
-  }
-};
-
-namespace {
-
-bool patchSled(const XRaySledEntry &Sled, bool Enable,
-               int32_t FuncId) XRAY_NEVER_INSTRUMENT {
-  bool Success = false;
-  switch (Sled.Kind) {
-  case XRayEntryType::ENTRY:
-    Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_FunctionEntry);
-    break;
-  case XRayEntryType::EXIT:
-    Success = patchFunctionExit(Enable, FuncId, Sled);
-    break;
-  case XRayEntryType::TAIL:
-    Success = patchFunctionTailExit(Enable, FuncId, Sled);
-    break;
-  case XRayEntryType::LOG_ARGS_ENTRY:
-    Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry);
-    break;
-  case XRayEntryType::CUSTOM_EVENT:
-    Success = patchCustomEvent(Enable, FuncId, Sled);
-    break;
-  case XRayEntryType::TYPED_EVENT:
-    Success = patchTypedEvent(Enable, FuncId, Sled);
-    break;
-  default:
-    Report("Unsupported sled kind '%d' @%04x\n", Sled.Address, int(Sled.Kind));
-    return false;
-  }
-  return Success;
-}
-
-XRayPatchingStatus patchFunction(int32_t FuncId,
-                                 bool Enable) XRAY_NEVER_INSTRUMENT {
-  if (!atomic_load(&XRayInitialized,
-                                memory_order_acquire))
-    return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized.
-
-  uint8_t NotPatching = false;
-  if (!atomic_compare_exchange_strong(
-          &XRayPatching, &NotPatching, true, memory_order_acq_rel))
-    return XRayPatchingStatus::ONGOING; // Already patching.
-
-  // Next, we look for the function index.
-  XRaySledMap InstrMap;
-  {
-    SpinMutexLock Guard(&XRayInstrMapMutex);
-    InstrMap = XRayInstrMap;
-  }
-
-  // If we don't have an index, we can't patch individual functions.
-  if (InstrMap.Functions == 0)
-    return XRayPatchingStatus::NOT_INITIALIZED;
-
-  // FuncId must be a positive number, less than the number of functions
-  // instrumented.
-  if (FuncId <= 0 || static_cast<size_t>(FuncId) > InstrMap.Functions) {
-    Report("Invalid function id provided: %d\n", FuncId);
-    return XRayPatchingStatus::FAILED;
-  }
-
-  // Now we patch ths sleds for this specific function.
-  auto SledRange = InstrMap.SledsIndex[FuncId - 1];
-  auto *f = SledRange.Begin;
-  auto *e = SledRange.End;
-
-  bool SucceedOnce = false;
-  while (f != e)
-    SucceedOnce |= patchSled(*f++, Enable, FuncId);
-
-  atomic_store(&XRayPatching, false,
-                            memory_order_release);
-
-  if (!SucceedOnce) {
-    Report("Failed patching any sled for function '%d'.", FuncId);
-    return XRayPatchingStatus::FAILED;
-  }
-
-  return XRayPatchingStatus::SUCCESS;
-}
-
-// controlPatching implements the common internals of the patching/unpatching
-// implementation. |Enable| defines whether we're enabling or disabling the
-// runtime XRay instrumentation.
-XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT {
-  if (!atomic_load(&XRayInitialized,
-                                memory_order_acquire))
-    return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized.
-
-  uint8_t NotPatching = false;
-  if (!atomic_compare_exchange_strong(
-          &XRayPatching, &NotPatching, true, memory_order_acq_rel))
-    return XRayPatchingStatus::ONGOING; // Already patching.
-
-  uint8_t PatchingSuccess = false;
-  auto XRayPatchingStatusResetter =
-      at_scope_exit([&PatchingSuccess] {
-        if (!PatchingSuccess)
-          atomic_store(&XRayPatching, false,
-                                    memory_order_release);
-      });
-
-  XRaySledMap InstrMap;
-  {
-    SpinMutexLock Guard(&XRayInstrMapMutex);
-    InstrMap = XRayInstrMap;
-  }
-  if (InstrMap.Entries == 0)
-    return XRayPatchingStatus::NOT_INITIALIZED;
-
-  uint32_t FuncId = 1;
-  uint64_t CurFun = 0;
-
-  // First we want to find the bounds for which we have instrumentation points,
-  // and try to get as few calls to mprotect(...) as possible. We're assuming
-  // that all the sleds for the instrumentation map are contiguous as a single
-  // set of pages. When we do support dynamic shared object instrumentation,
-  // we'll need to do this for each set of page load offsets per DSO loaded. For
-  // now we're assuming we can mprotect the whole section of text between the
-  // minimum sled address and the maximum sled address (+ the largest sled
-  // size).
-  auto MinSled = InstrMap.Sleds[0];
-  auto MaxSled = InstrMap.Sleds[InstrMap.Entries - 1];
-  for (std::size_t I = 0; I < InstrMap.Entries; I++) {
-    const auto &Sled = InstrMap.Sleds[I];
-    if (Sled.Address < MinSled.Address)
-      MinSled = Sled;
-    if (Sled.Address > MaxSled.Address)
-      MaxSled = Sled;
-  }
-
-  const size_t PageSize = flags()->xray_page_size_override > 0
-                              ? flags()->xray_page_size_override
-                              : GetPageSizeCached();
-  if ((PageSize == 0) || ((PageSize & (PageSize - 1)) != 0)) {
-    Report("System page size is not a power of two: %lld\n", PageSize);
-    return XRayPatchingStatus::FAILED;
-  }
-
-  void *PageAlignedAddr =
-      reinterpret_cast<void *>(MinSled.Address & ~(PageSize - 1));
-  size_t MProtectLen =
-      (MaxSled.Address - reinterpret_cast<uptr>(PageAlignedAddr)) + cSledLength;
-  MProtectHelper Protector(PageAlignedAddr, MProtectLen, PageSize);
-  if (Protector.MakeWriteable() == -1) {
-    Report("Failed mprotect: %d\n", errno);
-    return XRayPatchingStatus::FAILED;
-  }
-
-  for (std::size_t I = 0; I < InstrMap.Entries; ++I) {
-    auto &Sled = InstrMap.Sleds[I];
-    auto F = Sled.Function;
-    if (CurFun == 0)
-      CurFun = F;
-    if (F != CurFun) {
-      ++FuncId;
-      CurFun = F;
-    }
-    patchSled(Sled, Enable, FuncId);
-  }
-  atomic_store(&XRayPatching, false,
-                            memory_order_release);
-  PatchingSuccess = true;
-  return XRayPatchingStatus::SUCCESS;
-}
-
-XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId,
-                                            bool Enable) XRAY_NEVER_INSTRUMENT {
-  XRaySledMap InstrMap;
-  {
-    SpinMutexLock Guard(&XRayInstrMapMutex);
-    InstrMap = XRayInstrMap;
-  }
-
-  // FuncId must be a positive number, less than the number of functions
-  // instrumented.
-  if (FuncId <= 0 || static_cast<size_t>(FuncId) > InstrMap.Functions) {
-    Report("Invalid function id provided: %d\n", FuncId);
-    return XRayPatchingStatus::FAILED;
-  }
-
-  const size_t PageSize = flags()->xray_page_size_override > 0
-                              ? flags()->xray_page_size_override
-                              : GetPageSizeCached();
-  if ((PageSize == 0) || ((PageSize & (PageSize - 1)) != 0)) {
-    Report("Provided page size is not a power of two: %lld\n", PageSize);
-    return XRayPatchingStatus::FAILED;
-  }
-
-  // Here we compute the minumum sled and maximum sled associated with a
-  // particular function ID.
-  auto SledRange = InstrMap.SledsIndex[FuncId - 1];
-  auto *f = SledRange.Begin;
-  auto *e = SledRange.End;
-  auto MinSled = *f;
-  auto MaxSled = *(SledRange.End - 1);
-  while (f != e) {
-    if (f->Address < MinSled.Address)
-      MinSled = *f;
-    if (f->Address > MaxSled.Address)
-      MaxSled = *f;
-    ++f;
-  }
-
-  void *PageAlignedAddr =
-      reinterpret_cast<void *>(MinSled.Address & ~(PageSize - 1));
-  size_t MProtectLen =
-      (MaxSled.Address - reinterpret_cast<uptr>(PageAlignedAddr)) + cSledLength;
-  MProtectHelper Protector(PageAlignedAddr, MProtectLen, PageSize);
-  if (Protector.MakeWriteable() == -1) {
-    Report("Failed mprotect: %d\n", errno);
-    return XRayPatchingStatus::FAILED;
-  }
-  return patchFunction(FuncId, Enable);
-}
-
-} // namespace
-
-} // namespace __xray
-
-using namespace __xray;
-
-// The following functions are declared `extern "C" {...}` in the header, hence
-// they're defined in the global namespace.
-
-int __xray_set_handler(void (*entry)(int32_t,
-                                     XRayEntryType)) XRAY_NEVER_INSTRUMENT {
-  if (atomic_load(&XRayInitialized,
-                               memory_order_acquire)) {
-
-    atomic_store(&__xray::XRayPatchedFunction,
-                              reinterpret_cast<uintptr_t>(entry),
-                              memory_order_release);
-    return 1;
-  }
-  return 0;
-}
-
-int __xray_set_customevent_handler(void (*entry)(void *, size_t))
-    XRAY_NEVER_INSTRUMENT {
-  if (atomic_load(&XRayInitialized,
-                               memory_order_acquire)) {
-    atomic_store(&__xray::XRayPatchedCustomEvent,
-                              reinterpret_cast<uintptr_t>(entry),
-                              memory_order_release);
-    return 1;
-  }
-  return 0;
-}
-
-int __xray_set_typedevent_handler(void (*entry)(
-    uint16_t, const void *, size_t)) XRAY_NEVER_INSTRUMENT {
-  if (atomic_load(&XRayInitialized,
-                               memory_order_acquire)) {
-    atomic_store(&__xray::XRayPatchedTypedEvent,
-                              reinterpret_cast<uintptr_t>(entry),
-                              memory_order_release);
-    return 1;
-  }
-  return 0;
-}
-
-int __xray_remove_handler() XRAY_NEVER_INSTRUMENT {
-  return __xray_set_handler(nullptr);
-}
-
-int __xray_remove_customevent_handler() XRAY_NEVER_INSTRUMENT {
-  return __xray_set_customevent_handler(nullptr);
-}
-
-int __xray_remove_typedevent_handler() XRAY_NEVER_INSTRUMENT {
-  return __xray_set_typedevent_handler(nullptr);
-}
-
-uint16_t __xray_register_event_type(
-    const char *const event_type) XRAY_NEVER_INSTRUMENT {
-  TypeDescriptorMapType::Handle h(&TypeDescriptorAddressMap, (uptr)event_type);
-  if (h.created()) {
-    h->type_id = atomic_fetch_add(
-        &TypeEventDescriptorCounter, 1, memory_order_acq_rel);
-    h->description_string_length = strnlen(event_type, 1024);
-  }
-  return h->type_id;
-}
-
-XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT {
-  return controlPatching(true);
-}
-
-XRayPatchingStatus __xray_unpatch() XRAY_NEVER_INSTRUMENT {
-  return controlPatching(false);
-}
-
-XRayPatchingStatus __xray_patch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT {
-  return mprotectAndPatchFunction(FuncId, true);
-}
-
-XRayPatchingStatus
-__xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT {
-  return mprotectAndPatchFunction(FuncId, false);
-}
-
-int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType, uint64_t)) {
-  if (!atomic_load(&XRayInitialized,
-                                memory_order_acquire))
-    return 0;
-
-  // A relaxed write might not be visible even if the current thread gets
-  // scheduled on a different CPU/NUMA node.  We need to wait for everyone to
-  // have this handler installed for consistency of collected data across CPUs.
-  atomic_store(&XRayArgLogger, reinterpret_cast<uint64_t>(entry),
-                            memory_order_release);
-  return 1;
-}
-
-int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); }
-
-uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT {
-  SpinMutexLock Guard(&XRayInstrMapMutex);
-  if (FuncId <= 0 || static_cast<size_t>(FuncId) > XRayInstrMap.Functions)
-    return 0;
-  return XRayInstrMap.SledsIndex[FuncId - 1].Begin->Function
-// On PPC, function entries are always aligned to 16 bytes. The beginning of a
-// sled might be a local entry, which is always +8 based on the global entry.
-// Always return the global entry.
-#ifdef __PPC__
-         & ~0xf
-#endif
-      ;
-}
-
-size_t __xray_max_function_id() XRAY_NEVER_INSTRUMENT {
-  SpinMutexLock Guard(&XRayInstrMapMutex);
-  return XRayInstrMap.Functions;
-}
diff --git a/lib/xray/xray_interface.cpp b/lib/xray/xray_interface.cpp
new file mode 100644
index 000000000000..0d22893eb30f
--- /dev/null
+++ b/lib/xray/xray_interface.cpp
@@ -0,0 +1,480 @@
+//===-- xray_interface.cpp --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Implementation of the API functions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "xray_interface_internal.h"
+
+#include <cstdint>
+#include <cstdio>
+#include <errno.h>
+#include <limits>
+#include <string.h>
+#include <sys/mman.h>
+
+#if SANITIZER_FUCHSIA
+#include <zircon/process.h>
+#include <zircon/sanitizer.h>
+#include <zircon/status.h>
+#include <zircon/syscalls.h>
+#endif
+
+#include "sanitizer_common/sanitizer_addrhashmap.h"
+#include "sanitizer_common/sanitizer_common.h"
+
+#include "xray_defs.h"
+#include "xray_flags.h"
+
+extern __sanitizer::SpinMutex XRayInstrMapMutex;
+extern __sanitizer::atomic_uint8_t XRayInitialized;
+extern __xray::XRaySledMap XRayInstrMap;
+
+namespace __xray {
+// Bytes added past the highest sled address when sizing the mprotect range.
+#if defined(__x86_64__)
+static const int16_t cSledLength = 12;
+#elif defined(__aarch64__)
+static const int16_t cSledLength = 32;
+#elif defined(__arm__)
+static const int16_t cSledLength = 28;
+#elif SANITIZER_MIPS32
+static const int16_t cSledLength = 48;
+#elif SANITIZER_MIPS64
+static const int16_t cSledLength = 64;
+#elif defined(__powerpc64__)
+static const int16_t cSledLength = 8;
+#else
+#error "Unsupported CPU Architecture"
+#endif /* CPU architecture */
+
+// Handler for entry and exit sleds; written by __xray_set_handler (0 = none).
+atomic_uintptr_t XRayPatchedFunction{0};
+
+// Handler for arg1-enabled sleds/trampolines; see __xray_set_handler_arg1.
+atomic_uintptr_t XRayArgLogger{0};
+
+// Handler for custom event log calls; see __xray_set_customevent_handler.
+atomic_uintptr_t XRayPatchedCustomEvent{0};
+
+// Handler for typed event log calls; see __xray_set_typedevent_handler.
+atomic_uintptr_t XRayPatchedTypedEvent{0};
+
+// Nonzero while a patch/unpatch operation is in progress; claimed with a
+// compare-exchange by patchFunction and controlPatching.
+atomic_uint8_t XRayPatching{0};
+
+struct TypeDescription {
+  uint32_t type_id; // Taken from TypeEventDescriptorCounter on first use.
+  std::size_t description_string_length; // strnlen(name, 1024) result.
+};
+
+using TypeDescriptorMapType = AddrHashMap<TypeDescription, 11>;
+// Maps the address of an event-type descriptor string to its TypeDescription.
+TypeDescriptorMapType TypeDescriptorAddressMap{};
+
+atomic_uint32_t TypeEventDescriptorCounter{0}; // Next type id to hand out.
+
+// MProtectHelper is an RAII wrapper for calls to mprotect(...) that will
+// undo any successful mprotect(...) changes. This is used to make a page
+// writeable and executable, and upon destruction if it was successful in
+// doing so returns the page into a read-only and executable page.
+//
+// This is only used specifically for runtime-patching of the XRay
+// instrumentation points. This assumes that the executable pages are
+// originally read-and-execute only.
+class MProtectHelper {
+  void *PageAlignedAddr;
+  std::size_t MProtectLen;
+  bool MustCleanup;
+
+public:
+  // Records the range to re-protect; nothing is changed until MakeWriteable().
+  explicit MProtectHelper(void *PageAlignedAddr, std::size_t MProtectLen,
+                          std::size_t PageSize) XRAY_NEVER_INSTRUMENT
+      : PageAlignedAddr(PageAlignedAddr), MProtectLen(MProtectLen),
+        MustCleanup(false) {
+#if SANITIZER_FUCHSIA
+    // The parameter shadows the member, so the member must be named via this.
+    this->MProtectLen = RoundUpTo(MProtectLen, PageSize);
+#endif
+  }
+
+  int MakeWriteable() XRAY_NEVER_INSTRUMENT { // 0 on success, -1 on failure.
+#if SANITIZER_FUCHSIA
+    auto R = __sanitizer_change_code_protection(
+        reinterpret_cast<uintptr_t>(PageAlignedAddr), MProtectLen, true);
+    if (R != ZX_OK) {
+      Report("XRay: cannot change code protection: %s\n",
+             _zx_status_get_string(R));
+      return -1;
+    }
+    MustCleanup = true;
+    return 0;
+#else
+    auto R = mprotect(PageAlignedAddr, MProtectLen,
+                      PROT_READ | PROT_WRITE | PROT_EXEC);
+    if (R != -1)
+      MustCleanup = true; // Only undo a change that actually took effect.
+    return R;
+#endif
+  }
+
+  ~MProtectHelper() XRAY_NEVER_INSTRUMENT { // Restores read+execute if needed.
+    if (MustCleanup) {
+#if SANITIZER_FUCHSIA
+      auto R = __sanitizer_change_code_protection(
+          reinterpret_cast<uintptr_t>(PageAlignedAddr), MProtectLen, false);
+      if (R != ZX_OK) {
+        Report("XRay: cannot change code protection: %s\n",
+               _zx_status_get_string(R));
+      }
+#else
+      mprotect(PageAlignedAddr, MProtectLen, PROT_READ | PROT_EXEC);
+#endif
+    }
+  }
+};
+
+namespace {
+
+// Patches or unpatches one sled by dispatching on Sled.Kind to the matching
+// patch* routine.
+//   Sled:   the sled to rewrite.
+//   Enable: true to patch the sled, false to unpatch it.
+//   FuncId: the function id forwarded to the patch routine.
+// Returns the routine's result, or false after reporting a kind this switch
+// does not handle.
+bool patchSled(const XRaySledEntry &Sled, bool Enable,
+               int32_t FuncId) XRAY_NEVER_INSTRUMENT {
+  switch (Sled.Kind) {
+  case XRayEntryType::ENTRY:
+    return patchFunctionEntry(Enable, FuncId, Sled, __xray_FunctionEntry);
+  case XRayEntryType::EXIT:
+    return patchFunctionExit(Enable, FuncId, Sled);
+  case XRayEntryType::TAIL:
+    return patchFunctionTailExit(Enable, FuncId, Sled);
+  case XRayEntryType::LOG_ARGS_ENTRY:
+    return patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry);
+  case XRayEntryType::CUSTOM_EVENT:
+    return patchCustomEvent(Enable, FuncId, Sled);
+  case XRayEntryType::TYPED_EVENT:
+    return patchTypedEvent(Enable, FuncId, Sled);
+  default:
+    // Arguments follow the format string: the kind first, then the address.
+    Report("Unsupported sled kind '%d' @%04x\n", int(Sled.Kind), Sled.Address);
+    return false;
+  }
+}
+
+// Patches or unpatches every sled of |FuncId|. Page protections are not
+// touched here; mprotectAndPatchFunction makes the pages writeable first.
+// Returns ONGOING if another patching operation holds XRayPatching.
+XRayPatchingStatus patchFunction(int32_t FuncId,
+                                 bool Enable) XRAY_NEVER_INSTRUMENT {
+  if (!atomic_load(&XRayInitialized, memory_order_acquire))
+    return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized.
+
+  // Look up the function index before claiming XRayPatching, so that none of
+  // the validation failures below can return with the flag still set.
+  XRaySledMap InstrMap;
+  {
+    SpinMutexLock Guard(&XRayInstrMapMutex);
+    InstrMap = XRayInstrMap;
+  }
+
+  // If we don't have an index, we can't patch individual functions.
+  if (InstrMap.Functions == 0)
+    return XRayPatchingStatus::NOT_INITIALIZED;
+
+  // FuncId is 1-based and must not exceed the instrumented function count.
+  if (FuncId <= 0 || static_cast<size_t>(FuncId) > InstrMap.Functions) {
+    Report("Invalid function id provided: %d\n", FuncId);
+    return XRayPatchingStatus::FAILED;
+  }
+
+  uint8_t NotPatching = false;
+  if (!atomic_compare_exchange_strong(
+          &XRayPatching, &NotPatching, true, memory_order_acq_rel))
+    return XRayPatchingStatus::ONGOING; // Already patching.
+
+  // Now we patch the sleds for this specific function.
+  auto SledRange = InstrMap.SledsIndex[FuncId - 1];
+  auto *f = SledRange.Begin;
+  auto *e = SledRange.End;
+  bool SucceedOnce = false;
+  while (f != e)
+    SucceedOnce |= patchSled(*f++, Enable, FuncId);
+
+  atomic_store(&XRayPatching, false, memory_order_release);
+
+  if (!SucceedOnce) {
+    Report("Failed patching any sled for function '%d'.\n", FuncId);
+    return XRayPatchingStatus::FAILED;
+  }
+
+  return XRayPatchingStatus::SUCCESS;
+}
+
+// controlPatching implements the common internals of the patching/unpatching
+// implementation. |Enable| defines whether we're enabling or disabling the
+// runtime XRay instrumentation. Returns NOT_INITIALIZED when XRay is not
+// initialized or the sled map is empty, ONGOING when another patching
+// operation holds XRayPatching, FAILED when the page size is not a power of
+// two or the text pages cannot be made writeable, and SUCCESS otherwise.
+XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT {
+  if (!atomic_load(&XRayInitialized, memory_order_acquire))
+    return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized.
+
+  uint8_t NotPatching = false;
+  if (!atomic_compare_exchange_strong(
+          &XRayPatching, &NotPatching, true, memory_order_acq_rel))
+    return XRayPatchingStatus::ONGOING; // Already patching.
+
+  // Release XRayPatching on every exit path. This is declared before the
+  // MProtectHelper below so that it is destroyed after it: the pages go back
+  // to read-execute first, and only then may another thread start patching.
+  auto XRayPatchingStatusResetter = at_scope_exit(
+      [] { atomic_store(&XRayPatching, false, memory_order_release); });
+
+  XRaySledMap InstrMap;
+  {
+    SpinMutexLock Guard(&XRayInstrMapMutex);
+    InstrMap = XRayInstrMap;
+  }
+  if (InstrMap.Entries == 0)
+    return XRayPatchingStatus::NOT_INITIALIZED;
+
+  uint32_t FuncId = 1;
+  uint64_t CurFun = 0;
+
+  // First we want to find the bounds for which we have instrumentation points,
+  // and try to get as few calls to mprotect(...) as possible. We're assuming
+  // that all the sleds for the instrumentation map are contiguous as a single
+  // set of pages. When we do support dynamic shared object instrumentation,
+  // we'll need to do this for each set of page load offsets per DSO loaded. For
+  // now we're assuming we can mprotect the whole section of text between the
+  // minimum sled address and the maximum sled address (+ the largest sled
+  // size).
+  auto MinSled = InstrMap.Sleds[0];
+  auto MaxSled = InstrMap.Sleds[InstrMap.Entries - 1];
+  for (std::size_t I = 0; I < InstrMap.Entries; I++) {
+    const auto &Sled = InstrMap.Sleds[I];
+    if (Sled.Address < MinSled.Address)
+      MinSled = Sled;
+    if (Sled.Address > MaxSled.Address)
+      MaxSled = Sled;
+  }
+
+  const size_t PageSize = flags()->xray_page_size_override > 0
+                              ? flags()->xray_page_size_override
+                              : GetPageSizeCached();
+  if ((PageSize == 0) || ((PageSize & (PageSize - 1)) != 0)) {
+    Report("System page size is not a power of two: %lld\n",
+           static_cast<long long>(PageSize));
+    return XRayPatchingStatus::FAILED;
+  }
+
+  void *PageAlignedAddr =
+      reinterpret_cast<void *>(MinSled.Address & ~(PageSize - 1));
+  size_t MProtectLen =
+      (MaxSled.Address - reinterpret_cast<uptr>(PageAlignedAddr)) + cSledLength;
+  MProtectHelper Protector(PageAlignedAddr, MProtectLen, PageSize);
+  if (Protector.MakeWriteable() == -1) {
+    Report("Failed mprotect: %d\n", errno);
+    return XRayPatchingStatus::FAILED;
+  }
+
+  // Function ids are assigned in sled order: the id is bumped each time the
+  // sled's Function address differs from the previous sled's.
+  for (std::size_t I = 0; I < InstrMap.Entries; ++I) {
+    auto &Sled = InstrMap.Sleds[I];
+    auto F = Sled.Function;
+    if (CurFun == 0)
+      CurFun = F;
+    if (F != CurFun) {
+      ++FuncId;
+      CurFun = F;
+    }
+    patchSled(Sled, Enable, FuncId);
+  }
+  return XRayPatchingStatus::SUCCESS;
+}
+
+// Makes the pages spanning |FuncId|'s sleds writeable, patches or unpatches
+// those sleds via patchFunction, and restores the protection on return.
+XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId,
+                                            bool Enable) XRAY_NEVER_INSTRUMENT {
+  XRaySledMap InstrMap;
+  {
+    SpinMutexLock Guard(&XRayInstrMapMutex);
+    InstrMap = XRayInstrMap;
+  }
+
+  // FuncId is 1-based and must not exceed the number of functions
+  // instrumented.
+  if (FuncId <= 0 || static_cast<size_t>(FuncId) > InstrMap.Functions) {
+    Report("Invalid function id provided: %d\n", FuncId);
+    return XRayPatchingStatus::FAILED;
+  }
+
+  const size_t PageSize = flags()->xray_page_size_override > 0
+                              ? flags()->xray_page_size_override
+                              : GetPageSizeCached();
+  if ((PageSize == 0) || ((PageSize & (PageSize - 1)) != 0)) {
+    Report("Provided page size is not a power of two: %lld\n",
+           static_cast<long long>(PageSize));
+    return XRayPatchingStatus::FAILED;
+  }
+
+  // Here we compute the minimum sled and maximum sled associated with a
+  // particular function ID.
+  auto SledRange = InstrMap.SledsIndex[FuncId - 1];
+  auto MinSled = *SledRange.Begin;
+  auto MaxSled = *(SledRange.End - 1);
+  for (auto *S = SledRange.Begin; S != SledRange.End; ++S) {
+    if (S->Address < MinSled.Address)
+      MinSled = *S;
+    if (S->Address > MaxSled.Address)
+      MaxSled = *S;
+  }
+
+  void *PageAlignedAddr =
+      reinterpret_cast<void *>(MinSled.Address & ~(PageSize - 1));
+  size_t MProtectLen =
+      (MaxSled.Address - reinterpret_cast<uptr>(PageAlignedAddr)) + cSledLength;
+  MProtectHelper Protector(PageAlignedAddr, MProtectLen, PageSize);
+  if (Protector.MakeWriteable() == -1) {
+    Report("Failed mprotect: %d\n", errno);
+    return XRayPatchingStatus::FAILED;
+  }
+  return patchFunction(FuncId, Enable);
+}
+
+} // namespace
+
+} // namespace __xray
+
+using namespace __xray;
+
+// The following functions are declared `extern "C" {...}` in the header, hence
+// they're defined in the global namespace.
+
+// Installs |entry| as the handler for patched entry/exit sleds. Returns 1 on
+// success, or 0 without storing anything when XRay is not yet initialized.
+int __xray_set_handler(void (*entry)(int32_t,
+                                     XRayEntryType)) XRAY_NEVER_INSTRUMENT {
+  if (!atomic_load(&XRayInitialized, memory_order_acquire))
+    return 0;
+
+  const auto HandlerAddr = reinterpret_cast<uintptr_t>(entry);
+  atomic_store(&__xray::XRayPatchedFunction, HandlerAddr,
+               memory_order_release);
+  return 1;
+}
+
+// Installs |entry| as the custom event handler. Returns 1 on success, or 0
+// without storing anything when XRay is not yet initialized.
+int __xray_set_customevent_handler(void (*entry)(void *, size_t))
+    XRAY_NEVER_INSTRUMENT {
+  if (!atomic_load(&XRayInitialized, memory_order_acquire))
+    return 0;
+
+  atomic_store(&__xray::XRayPatchedCustomEvent,
+               reinterpret_cast<uintptr_t>(entry), memory_order_release);
+  return 1;
+}
+
+// Installs |entry| as the typed event handler. Returns 1 on success, or 0
+// without storing anything when XRay is not yet initialized.
+int __xray_set_typedevent_handler(void (*entry)(
+    uint16_t, const void *, size_t)) XRAY_NEVER_INSTRUMENT {
+  if (!atomic_load(&XRayInitialized, memory_order_acquire))
+    return 0;
+
+  atomic_store(&__xray::XRayPatchedTypedEvent,
+               reinterpret_cast<uintptr_t>(entry), memory_order_release);
+  return 1;
+}
+
+int __xray_remove_handler() XRAY_NEVER_INSTRUMENT {
+  return __xray_set_handler(nullptr); // Stores a null entry/exit handler.
+}
+
+int __xray_remove_customevent_handler() XRAY_NEVER_INSTRUMENT {
+  return __xray_set_customevent_handler(nullptr); // Stores a null handler.
+}
+
+int __xray_remove_typedevent_handler() XRAY_NEVER_INSTRUMENT {
+  return __xray_set_typedevent_handler(nullptr); // Stores a null handler.
+}
+
+uint16_t __xray_register_event_type(
+    const char *const event_type) XRAY_NEVER_INSTRUMENT { // Keyed by address.
+  TypeDescriptorMapType::Handle h(&TypeDescriptorAddressMap, (uptr)event_type);
+  if (h.created()) { // Presumably true only for a newly inserted address.
+    h->type_id = atomic_fetch_add(
+        &TypeEventDescriptorCounter, 1, memory_order_acq_rel);
+    h->description_string_length = strnlen(event_type, 1024);
+  }
+  return h->type_id; // uint32_t id, narrowed to the uint16_t return type.
+}
+
+XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT {
+  return controlPatching(true); // Enable: patch every sled in the map.
+}
+
+XRayPatchingStatus __xray_unpatch() XRAY_NEVER_INSTRUMENT {
+  return controlPatching(false); // Disable: unpatch every sled in the map.
+}
+
+XRayPatchingStatus __xray_patch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT {
+  return mprotectAndPatchFunction(FuncId, true); // Patch only FuncId's sleds.
+}
+
+XRayPatchingStatus
+__xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT {
+  return mprotectAndPatchFunction(FuncId, false); // Unpatch FuncId's sleds.
+}
+
+// Installs |entry| as the arg1 handler; returns 0 if XRay is uninitialized.
+int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType, uint64_t))
+    XRAY_NEVER_INSTRUMENT {
+  if (!atomic_load(&XRayInitialized, memory_order_acquire))
+    return 0;
+  // XRayArgLogger is an atomic_uintptr_t, so cast to uintptr_t, not uint64_t.
+  atomic_store(&XRayArgLogger, reinterpret_cast<uintptr_t>(entry),
+               memory_order_release);
+  return 1;
+}
+
+int __xray_remove_handler_arg1() XRAY_NEVER_INSTRUMENT {
+  return __xray_set_handler_arg1(nullptr);
+}
+
+// Returns the function address recorded for |FuncId|, or 0 if out of range.
+uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayInstrMapMutex);
+  if (FuncId <= 0 || static_cast<size_t>(FuncId) > XRayInstrMap.Functions)
+    return 0;
+  uintptr_t Entry = XRayInstrMap.SledsIndex[FuncId - 1].Begin->Function;
+#ifdef __PPC__
+  // On PPC, function entries are always aligned to 16 bytes, and a sled may
+  // begin at the local entry (global entry + 8). Always use the global entry.
+  Entry &= ~0xf;
+#endif
+  return Entry;
+}
+
+size_t __xray_max_function_id() XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayInstrMapMutex);
+  return XRayInstrMap.Functions; // Valid function ids are 1..Functions.
+}
diff --git a/lib/xray/xray_log_interface.cc b/lib/xray/xray_log_interface.cc
deleted file mode 100644
index 7916a9e2b8ad..000000000000
--- a/lib/xray/xray_log_interface.cc
+++ /dev/null
@@ -1,209 +0,0 @@
-//===-- xray_log_interface.cc ---------------------------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a function call tracing system.
-//
-//===----------------------------------------------------------------------===//
-#include "xray/xray_log_interface.h"
-
-#include "sanitizer_common/sanitizer_allocator_internal.h"
-#include "sanitizer_common/sanitizer_atomic.h"
-#include "sanitizer_common/sanitizer_mutex.h"
-#include "xray/xray_interface.h"
-#include "xray_defs.h"
-
-namespace __xray {
-static SpinMutex XRayImplMutex;
-static XRayLogImpl CurrentXRayImpl{nullptr, nullptr, nullptr, nullptr};
-static XRayLogImpl *GlobalXRayImpl = nullptr;
-
-// This is the default implementation of a buffer iterator, which always yields
-// a null buffer.
-XRayBuffer NullBufferIterator(XRayBuffer) XRAY_NEVER_INSTRUMENT {
-  return {nullptr, 0};
-}
-
-// This is the global function responsible for iterating through given buffers.
-atomic_uintptr_t XRayBufferIterator{
-    reinterpret_cast<uintptr_t>(&NullBufferIterator)};
-
-// We use a linked list of Mode to XRayLogImpl mappings. This is a linked list
-// when it should be a map because we're avoiding having to depend on C++
-// standard library data structures at this level of the implementation.
-struct ModeImpl {
-  ModeImpl *Next;
-  const char *Mode;
-  XRayLogImpl Impl;
-};
-
-static ModeImpl SentinelModeImpl{
-    nullptr, nullptr, {nullptr, nullptr, nullptr, nullptr}};
-static ModeImpl *ModeImpls = &SentinelModeImpl;
-static const ModeImpl *CurrentMode = nullptr;
-
-} // namespace __xray
-
-using namespace __xray;
-
-void __xray_log_set_buffer_iterator(XRayBuffer (*Iterator)(XRayBuffer))
-    XRAY_NEVER_INSTRUMENT {
-  atomic_store(&__xray::XRayBufferIterator,
-               reinterpret_cast<uintptr_t>(Iterator), memory_order_release);
-}
-
-void __xray_log_remove_buffer_iterator() XRAY_NEVER_INSTRUMENT {
-  __xray_log_set_buffer_iterator(&NullBufferIterator);
-}
-
-XRayLogRegisterStatus
-__xray_log_register_mode(const char *Mode,
-                         XRayLogImpl Impl) XRAY_NEVER_INSTRUMENT {
-  if (Impl.flush_log == nullptr || Impl.handle_arg0 == nullptr ||
-      Impl.log_finalize == nullptr || Impl.log_init == nullptr)
-    return XRayLogRegisterStatus::XRAY_INCOMPLETE_IMPL;
-
-  SpinMutexLock Guard(&XRayImplMutex);
-  // First, look for whether the mode already has a registered implementation.
-  for (ModeImpl *it = ModeImpls; it != &SentinelModeImpl; it = it->Next) {
-    if (!internal_strcmp(Mode, it->Mode))
-      return XRayLogRegisterStatus::XRAY_DUPLICATE_MODE;
-  }
-  auto *NewModeImpl = static_cast<ModeImpl *>(InternalAlloc(sizeof(ModeImpl)));
-  NewModeImpl->Next = ModeImpls;
-  NewModeImpl->Mode = internal_strdup(Mode);
-  NewModeImpl->Impl = Impl;
-  ModeImpls = NewModeImpl;
-  return XRayLogRegisterStatus::XRAY_REGISTRATION_OK;
-}
-
-XRayLogRegisterStatus
-__xray_log_select_mode(const char *Mode) XRAY_NEVER_INSTRUMENT {
-  SpinMutexLock Guard(&XRayImplMutex);
-  for (ModeImpl *it = ModeImpls; it != &SentinelModeImpl; it = it->Next) {
-    if (!internal_strcmp(Mode, it->Mode)) {
-      CurrentMode = it;
-      CurrentXRayImpl = it->Impl;
-      GlobalXRayImpl = &CurrentXRayImpl;
-      __xray_set_handler(it->Impl.handle_arg0);
-      return XRayLogRegisterStatus::XRAY_REGISTRATION_OK;
-    }
-  }
-  return XRayLogRegisterStatus::XRAY_MODE_NOT_FOUND;
-}
-
-const char *__xray_log_get_current_mode() XRAY_NEVER_INSTRUMENT {
-  SpinMutexLock Guard(&XRayImplMutex);
-  if (CurrentMode != nullptr)
-    return CurrentMode->Mode;
-  return nullptr;
-}
-
-void __xray_set_log_impl(XRayLogImpl Impl) XRAY_NEVER_INSTRUMENT {
-  if (Impl.log_init == nullptr || Impl.log_finalize == nullptr ||
-      Impl.handle_arg0 == nullptr || Impl.flush_log == nullptr) {
-    SpinMutexLock Guard(&XRayImplMutex);
-    GlobalXRayImpl = nullptr;
-    CurrentMode = nullptr;
-    __xray_remove_handler();
-    __xray_remove_handler_arg1();
-    return;
-  }
-
-  SpinMutexLock Guard(&XRayImplMutex);
-  CurrentXRayImpl = Impl;
-  GlobalXRayImpl = &CurrentXRayImpl;
-  __xray_set_handler(Impl.handle_arg0);
-}
-
-void __xray_remove_log_impl() XRAY_NEVER_INSTRUMENT {
-  SpinMutexLock Guard(&XRayImplMutex);
-  GlobalXRayImpl = nullptr;
-  __xray_remove_handler();
-  __xray_remove_handler_arg1();
-}
-
-XRayLogInitStatus __xray_log_init(size_t BufferSize, size_t MaxBuffers,
-                                  void *Args,
-                                  size_t ArgsSize) XRAY_NEVER_INSTRUMENT {
-  SpinMutexLock Guard(&XRayImplMutex);
-  if (!GlobalXRayImpl)
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-  return GlobalXRayImpl->log_init(BufferSize, MaxBuffers, Args, ArgsSize);
-}
-
-XRayLogInitStatus __xray_log_init_mode(const char *Mode, const char *Config)
-    XRAY_NEVER_INSTRUMENT {
-  SpinMutexLock Guard(&XRayImplMutex);
-  if (!GlobalXRayImpl)
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-
-  if (Config == nullptr)
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-
-  // Check first whether the current mode is the same as what we expect.
-  if (CurrentMode == nullptr || internal_strcmp(CurrentMode->Mode, Mode) != 0)
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-
-  // Here we do some work to coerce the pointer we're provided, so that
-  // the implementations that still take void* pointers can handle the
-  // data provided in the Config argument.
-  return GlobalXRayImpl->log_init(
-      0, 0, const_cast<void *>(static_cast<const void *>(Config)), 0);
-}
-
-XRayLogInitStatus
-__xray_log_init_mode_bin(const char *Mode, const char *Config,
-                         size_t ConfigSize) XRAY_NEVER_INSTRUMENT {
-  SpinMutexLock Guard(&XRayImplMutex);
-  if (!GlobalXRayImpl)
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-
-  if (Config == nullptr)
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-
-  // Check first whether the current mode is the same as what we expect.
-  if (CurrentMode == nullptr || internal_strcmp(CurrentMode->Mode, Mode) != 0)
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-
-  // Here we do some work to coerce the pointer we're provided, so that
-  // the implementations that still take void* pointers can handle the
-  // data provided in the Config argument.
-  return GlobalXRayImpl->log_init(
-      0, 0, const_cast<void *>(static_cast<const void *>(Config)), ConfigSize);
-}
-
-XRayLogInitStatus __xray_log_finalize() XRAY_NEVER_INSTRUMENT {
-  SpinMutexLock Guard(&XRayImplMutex);
-  if (!GlobalXRayImpl)
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-  return GlobalXRayImpl->log_finalize();
-}
-
-XRayLogFlushStatus __xray_log_flushLog() XRAY_NEVER_INSTRUMENT {
-  SpinMutexLock Guard(&XRayImplMutex);
-  if (!GlobalXRayImpl)
-    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
-  return GlobalXRayImpl->flush_log();
-}
-
-XRayLogFlushStatus __xray_log_process_buffers(
-    void (*Processor)(const char *, XRayBuffer)) XRAY_NEVER_INSTRUMENT {
-  // We want to make sure that there will be no changes to the global state for
-  // the log by synchronising on the XRayBufferIteratorMutex.
-  if (!GlobalXRayImpl)
-    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
-  auto Iterator = reinterpret_cast<XRayBuffer (*)(XRayBuffer)>(
-      atomic_load(&XRayBufferIterator, memory_order_acquire));
-  auto Buffer = (*Iterator)(XRayBuffer{nullptr, 0});
-  auto Mode = CurrentMode ? CurrentMode->Mode : nullptr;
-  while (Buffer.Data != nullptr) {
-    (*Processor)(Mode, Buffer);
-    Buffer = (*Iterator)(Buffer);
-  }
-  return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
-}
diff --git a/lib/xray/xray_log_interface.cpp b/lib/xray/xray_log_interface.cpp
new file mode 100644
index 000000000000..fc70373f9dac
--- /dev/null
+++ b/lib/xray/xray_log_interface.cpp
@@ -0,0 +1,209 @@
+//===-- xray_log_interface.cpp --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+#include "xray/xray_log_interface.h"
+
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_mutex.h"
+#include "xray/xray_interface.h"
+#include "xray_defs.h"
+
+namespace __xray {
+static SpinMutex XRayImplMutex; // Guards the impl and mode state below.
+static XRayLogImpl CurrentXRayImpl{nullptr, nullptr, nullptr, nullptr};
+static XRayLogImpl *GlobalXRayImpl = nullptr; // Null when no impl is active.
+
+// This is the default implementation of a buffer iterator, which always yields
+// a null buffer.
+XRayBuffer NullBufferIterator(XRayBuffer) XRAY_NEVER_INSTRUMENT {
+  return {nullptr, 0};
+}
+
+// The buffer iterator function, stored as an integer; initially the null one.
+atomic_uintptr_t XRayBufferIterator{
+    reinterpret_cast<uintptr_t>(&NullBufferIterator)};
+
+// We use a linked list of Mode to XRayLogImpl mappings. This is a linked list
+// when it should be a map because we're avoiding having to depend on C++
+// standard library data structures at this level of the implementation.
+struct ModeImpl {
+  ModeImpl *Next;   // Next entry; the list ends at SentinelModeImpl.
+  const char *Mode; // Name; an internal_strdup copy for registered modes.
+  XRayLogImpl Impl; // Callbacks registered for this mode.
+};
+
+static ModeImpl SentinelModeImpl{
+    nullptr, nullptr, {nullptr, nullptr, nullptr, nullptr}};
+static ModeImpl *ModeImpls = &SentinelModeImpl; // Head of the mode list.
+static const ModeImpl *CurrentMode = nullptr; // Set by select_mode.
+
+} // namespace __xray
+
+using namespace __xray;
+
+void __xray_log_set_buffer_iterator(XRayBuffer (*Iterator)(XRayBuffer))
+    XRAY_NEVER_INSTRUMENT { // Publishes |Iterator| for process_buffers.
+  const auto Addr = reinterpret_cast<uintptr_t>(Iterator);
+  atomic_store(&__xray::XRayBufferIterator, Addr, memory_order_release);
+}
+
+void __xray_log_remove_buffer_iterator() XRAY_NEVER_INSTRUMENT {
+  __xray_log_set_buffer_iterator(&NullBufferIterator); // Back to the default.
+}
+
+// Registers |Impl| under |Mode|, rejecting incomplete or duplicate entries.
+XRayLogRegisterStatus
+__xray_log_register_mode(const char *Mode,
+                         XRayLogImpl Impl) XRAY_NEVER_INSTRUMENT {
+  if (!Impl.flush_log || !Impl.handle_arg0 || !Impl.log_finalize ||
+      !Impl.log_init)
+    return XRayLogRegisterStatus::XRAY_INCOMPLETE_IMPL;
+
+  SpinMutexLock Guard(&XRayImplMutex);
+  for (ModeImpl *Cur = ModeImpls; Cur != &SentinelModeImpl; Cur = Cur->Next)
+    if (internal_strcmp(Mode, Cur->Mode) == 0)
+      return XRayLogRegisterStatus::XRAY_DUPLICATE_MODE;
+  // Name is unused: push a new entry onto the front of the list.
+  auto *Node = static_cast<ModeImpl *>(InternalAlloc(sizeof(ModeImpl)));
+  Node->Mode = internal_strdup(Mode);
+  Node->Impl = Impl;
+  Node->Next = ModeImpls;
+  ModeImpls = Node;
+  return XRayLogRegisterStatus::XRAY_REGISTRATION_OK;
+}
+
+XRayLogRegisterStatus
+__xray_log_select_mode(const char *Mode) XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex); // Held across the whole mode switch.
+  for (ModeImpl *Cur = ModeImpls; Cur != &SentinelModeImpl; Cur = Cur->Next) {
+    if (internal_strcmp(Mode, Cur->Mode) != 0)
+      continue; // Not the requested mode; keep scanning.
+    CurrentMode = Cur;
+    CurrentXRayImpl = Cur->Impl;
+    GlobalXRayImpl = &CurrentXRayImpl;
+    __xray_set_handler(Cur->Impl.handle_arg0); // Install its arg0 handler.
+    return XRayLogRegisterStatus::XRAY_REGISTRATION_OK;
+  }
+  return XRayLogRegisterStatus::XRAY_MODE_NOT_FOUND;
+}
+
+// Returns the name of the currently selected mode, or nullptr if none is
+// selected.
+const char *__xray_log_get_current_mode() XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex);
+  return CurrentMode != nullptr ? CurrentMode->Mode : nullptr;
+}
+
+// Installs |Impl| as the active logging implementation. An |Impl| with any
+// null callback instead clears the implementation, mode and both handlers.
+void __xray_set_log_impl(XRayLogImpl Impl) XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex);
+  if (!Impl.log_init || !Impl.log_finalize || !Impl.handle_arg0 ||
+      !Impl.flush_log) {
+    GlobalXRayImpl = nullptr;
+    CurrentMode = nullptr;
+    __xray_remove_handler();
+    __xray_remove_handler_arg1();
+    return;
+  }
+  CurrentXRayImpl = Impl;
+  GlobalXRayImpl = &CurrentXRayImpl;
+  __xray_set_handler(Impl.handle_arg0);
+}
+
+void __xray_remove_log_impl() XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex);
+  GlobalXRayImpl = nullptr; // NOTE(review): CurrentMode is not reset here.
+  __xray_remove_handler();      // Stores a null arg0 handler.
+  __xray_remove_handler_arg1(); // Stores a null arg1 handler.
+}
+
+XRayLogInitStatus __xray_log_init(size_t BufferSize, size_t MaxBuffers,
+                                  void *Args,
+                                  size_t ArgsSize) XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Lock(&XRayImplMutex); // Forward only if an impl is installed.
+  return GlobalXRayImpl == nullptr
+             ? XRayLogInitStatus::XRAY_LOG_UNINITIALIZED
+             : GlobalXRayImpl->log_init(BufferSize, MaxBuffers, Args, ArgsSize);
+}
+
+// Initializes the current mode's impl, passing |Config| with a size of 0.
+XRayLogInitStatus __xray_log_init_mode(const char *Mode, const char *Config)
+    XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex);
+  if (!GlobalXRayImpl)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // Mode is compared with internal_strcmp below, so it must not be null.
+  if (Mode == nullptr || Config == nullptr)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // Check first whether the current mode is the same as what we expect.
+  if (CurrentMode == nullptr || internal_strcmp(CurrentMode->Mode, Mode) != 0)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // log_init still takes a void*, so the const is cast away here.
+  return GlobalXRayImpl->log_init(
+      0, 0, const_cast<void *>(static_cast<const void *>(Config)), 0);
+}
+
+// Initializes the current mode's impl with |Config| and |ConfigSize|.
+XRayLogInitStatus
+__xray_log_init_mode_bin(const char *Mode, const char *Config,
+                         size_t ConfigSize) XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex);
+  if (!GlobalXRayImpl)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // Mode is compared with internal_strcmp below, so it must not be null.
+  if (Mode == nullptr || Config == nullptr)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // Check first whether the current mode is the same as what we expect.
+  if (CurrentMode == nullptr || internal_strcmp(CurrentMode->Mode, Mode) != 0)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // log_init still takes a void*, so the const is cast away here.
+  return GlobalXRayImpl->log_init(
+      0, 0, const_cast<void *>(static_cast<const void *>(Config)), ConfigSize);
+}
+
+// Forwards to the active implementation's log_finalize, if there is one.
+XRayLogInitStatus __xray_log_finalize() XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Lock(&XRayImplMutex);
+  return GlobalXRayImpl ? GlobalXRayImpl->log_finalize()
+                        : XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+}
+
+// Forwards to the active implementation's flush_log, if there is one.
+XRayLogFlushStatus __xray_log_flushLog() XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Lock(&XRayImplMutex);
+  return GlobalXRayImpl ? GlobalXRayImpl->flush_log()
+                        : XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+}
+
+XRayLogFlushStatus __xray_log_process_buffers(
+    void (*Processor)(const char *, XRayBuffer)) XRAY_NEVER_INSTRUMENT {
+  const char *Mode = nullptr; // Snapshot taken under XRayImplMutex below.
+  {
+    SpinMutexLock Guard(&XRayImplMutex);
+    if (!GlobalXRayImpl)
+      return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+    Mode = CurrentMode ? CurrentMode->Mode : nullptr;
+  } // Lock released here so |Processor| never runs with it held.
+  auto Iterator = reinterpret_cast<XRayBuffer (*)(XRayBuffer)>(
+      atomic_load(&XRayBufferIterator, memory_order_acquire));
+  auto Buffer = (*Iterator)(XRayBuffer{nullptr, 0});
+  for (; Buffer.Data != nullptr; Buffer = (*Iterator)(Buffer))
+    (*Processor)(Mode, Buffer);
+  return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
+}
diff --git a/lib/xray/xray_mips.cc b/lib/xray/xray_mips.cc
deleted file mode 100644
index 80990ab8d639..000000000000
--- a/lib/xray/xray_mips.cc
+++ /dev/null
@@ -1,170 +0,0 @@
-//===-- xray_mips.cc --------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// Implementation of MIPS-specific routines (32-bit).
-//
-//===----------------------------------------------------------------------===//
-#include "sanitizer_common/sanitizer_common.h"
-#include "xray_defs.h"
-#include "xray_interface_internal.h"
-#include <atomic>
-
-namespace __xray {
-
-// The machine codes for some instructions used in runtime patching.
-enum PatchOpcodes : uint32_t {
-  PO_ADDIU = 0x24000000, // addiu rt, rs, imm
-  PO_SW = 0xAC000000,    // sw rt, offset(sp)
-  PO_LUI = 0x3C000000,   // lui rs, %hi(address)
-  PO_ORI = 0x34000000,   // ori rt, rs, %lo(address)
-  PO_JALR = 0x0000F809,  // jalr rs
-  PO_LW = 0x8C000000,    // lw rt, offset(address)
-  PO_B44 = 0x1000000b,   // b #44
-  PO_NOP = 0x0,          // nop
-};
-
-enum RegNum : uint32_t {
-  RN_T0 = 0x8,
-  RN_T9 = 0x19,
-  RN_RA = 0x1F,
-  RN_SP = 0x1D,
-};
-
-inline static uint32_t encodeInstruction(uint32_t Opcode, uint32_t Rs,
-                                         uint32_t Rt,
-                                         uint32_t Imm) XRAY_NEVER_INSTRUMENT {
-  return (Opcode | Rs << 21 | Rt << 16 | Imm);
-}
-
-inline static uint32_t
-encodeSpecialInstruction(uint32_t Opcode, uint32_t Rs, uint32_t Rt, uint32_t Rd,
-                         uint32_t Imm) XRAY_NEVER_INSTRUMENT {
-  return (Rs << 21 | Rt << 16 | Rd << 11 | Imm << 6 | Opcode);
-}
-
-inline static bool patchSled(const bool Enable, const uint32_t FuncId,
-                             const XRaySledEntry &Sled,
-                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT {
-  // When |Enable| == true,
-  // We replace the following compile-time stub (sled):
-  //
-  // xray_sled_n:
-  //	B .tmpN
-  //	11 NOPs (44 bytes)
-  //	.tmpN
-  //	ADDIU T9, T9, 44
-  //
-  // With the following runtime patch:
-  //
-  // xray_sled_n (32-bit):
-  //    addiu sp, sp, -8                        ;create stack frame
-  //    nop
-  //    sw ra, 4(sp)                            ;save return address
-  //    sw t9, 0(sp)                            ;save register t9
-  //    lui t9, %hi(__xray_FunctionEntry/Exit)
-  //    ori t9, t9, %lo(__xray_FunctionEntry/Exit)
-  //    lui t0, %hi(function_id)
-  //    jalr t9                                 ;call Tracing hook
-  //    ori t0, t0, %lo(function_id)            ;pass function id (delay slot)
-  //    lw t9, 0(sp)                            ;restore register t9
-  //    lw ra, 4(sp)                            ;restore return address
-  //    addiu sp, sp, 8                         ;delete stack frame
-  //
-  // We add 44 bytes to t9 because we want to adjust the function pointer to
-  // the actual start of function i.e. the address just after the noop sled.
-  // We do this because gp displacement relocation is emitted at the start of
-  // of the function i.e after the nop sled and to correctly calculate the
-  // global offset table address, t9 must hold the address of the instruction
-  // containing the gp displacement relocation.
-  // FIXME: Is this correct for the static relocation model?
-  //
-  // Replacement of the first 4-byte instruction should be the last and atomic
-  // operation, so that the user code which reaches the sled concurrently
-  // either jumps over the whole sled, or executes the whole sled when the
-  // latter is ready.
-  //
-  // When |Enable|==false, we set back the first instruction in the sled to be
-  //   B #44
-
-  if (Enable) {
-    uint32_t LoTracingHookAddr =
-        reinterpret_cast<int32_t>(TracingHook) & 0xffff;
-    uint32_t HiTracingHookAddr =
-        (reinterpret_cast<int32_t>(TracingHook) >> 16) & 0xffff;
-    uint32_t LoFunctionID = FuncId & 0xffff;
-    uint32_t HiFunctionID = (FuncId >> 16) & 0xffff;
-    *reinterpret_cast<uint32_t *>(Sled.Address + 8) = encodeInstruction(
-        PatchOpcodes::PO_SW, RegNum::RN_SP, RegNum::RN_RA, 0x4);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 12) = encodeInstruction(
-        PatchOpcodes::PO_SW, RegNum::RN_SP, RegNum::RN_T9, 0x0);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 16) = encodeInstruction(
-        PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T9, HiTracingHookAddr);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 20) = encodeInstruction(
-        PatchOpcodes::PO_ORI, RegNum::RN_T9, RegNum::RN_T9, LoTracingHookAddr);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 24) = encodeInstruction(
-        PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T0, HiFunctionID);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 28) = encodeSpecialInstruction(
-        PatchOpcodes::PO_JALR, RegNum::RN_T9, 0x0, RegNum::RN_RA, 0X0);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 32) = encodeInstruction(
-        PatchOpcodes::PO_ORI, RegNum::RN_T0, RegNum::RN_T0, LoFunctionID);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 36) = encodeInstruction(
-        PatchOpcodes::PO_LW, RegNum::RN_SP, RegNum::RN_T9, 0x0);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 40) = encodeInstruction(
-        PatchOpcodes::PO_LW, RegNum::RN_SP, RegNum::RN_RA, 0x4);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 44) = encodeInstruction(
-        PatchOpcodes::PO_ADDIU, RegNum::RN_SP, RegNum::RN_SP, 0x8);
-    uint32_t CreateStackSpaceInstr = encodeInstruction(
-        PatchOpcodes::PO_ADDIU, RegNum::RN_SP, RegNum::RN_SP, 0xFFF8);
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint32_t> *>(Sled.Address),
-        uint32_t(CreateStackSpaceInstr), std::memory_order_release);
-  } else {
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint32_t> *>(Sled.Address),
-        uint32_t(PatchOpcodes::PO_B44), std::memory_order_release);
-  }
-  return true;
-}
-
-bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
-                        const XRaySledEntry &Sled,
-                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT {
-  return patchSled(Enable, FuncId, Sled, Trampoline);
-}
-
-bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
-                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
-}
-
-bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
-                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // FIXME: In the future we'd need to distinguish between non-tail exits and
-  // tail exits for better information preservation.
-  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
-}
-
-bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
-                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // FIXME: Implement in mips?
-  return false;
-}
-
-bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
-                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // FIXME: Implement in mips?
-  return false;
-}
-
-} // namespace __xray
-
-extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
-  // FIXME: this will have to be implemented in the trampoline assembly file
-}
diff --git a/lib/xray/xray_mips.cpp b/lib/xray/xray_mips.cpp
new file mode 100644
index 000000000000..26fc50374471
--- /dev/null
+++ b/lib/xray/xray_mips.cpp
@@ -0,0 +1,170 @@
+//===-- xray_mips.cpp -------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Implementation of MIPS-specific routines (32-bit).
+//
+//===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray_defs.h"
+#include "xray_interface_internal.h"
+#include <atomic>
+
+namespace __xray {
+
+// The machine codes for some instructions used in runtime patching.
+enum PatchOpcodes : uint32_t {
+  PO_ADDIU = 0x24000000, // addiu rt, rs, imm
+  PO_SW = 0xAC000000,    // sw rt, offset(rs)
+  PO_LUI = 0x3C000000,   // lui rt, %hi(address)
+  PO_ORI = 0x34000000,   // ori rt, rs, %lo(address)
+  PO_JALR = 0x0000F809,  // jalr rs (rd field already set to ra, 31 << 11)
+  PO_LW = 0x8C000000,    // lw rt, offset(rs)
+  PO_B44 = 0x1000000b,   // b #44 (offset field 0xb = 11 instructions)
+  PO_NOP = 0x0,          // nop
+};
+
+enum RegNum : uint32_t {
+  RN_T0 = 0x8,  // t0: loaded with the function id by the patched sled
+  RN_T9 = 0x19, // t9: loaded with the tracing hook address for jalr
+  RN_RA = 0x1F, // ra: return address, saved/restored around the call
+  RN_SP = 0x1D, // sp: stack pointer, base for the sw/lw slots
+};
+
+inline static uint32_t encodeInstruction(uint32_t Opcode, uint32_t Rs,
+                                         uint32_t Rt,
+                                         uint32_t Imm) XRAY_NEVER_INSTRUMENT {
+  return (Opcode | Rs << 21 | Rt << 16 | Imm); // Imm is OR-ed in unmasked
+}
+
+inline static uint32_t
+encodeSpecialInstruction(uint32_t Opcode, uint32_t Rs, uint32_t Rt, uint32_t Rd,
+                         uint32_t Imm) XRAY_NEVER_INSTRUMENT {
+  return (Rs << 21 | Rt << 16 | Rd << 11 | Imm << 6 | Opcode); // no masking
+}
+
+inline static bool patchSled(const bool Enable, const uint32_t FuncId,
+                             const XRaySledEntry &Sled,
+                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT {
+  // When |Enable| == true,
+  // We replace the following compile-time stub (sled):
+  //
+  // xray_sled_n:
+  //	B .tmpN
+  //	11 NOPs (44 bytes)
+  //	.tmpN
+  //	ADDIU T9, T9, 44
+  //
+  // With the following runtime patch:
+  //
+  // xray_sled_n (32-bit):
+  //    addiu sp, sp, -8                        ;create stack frame
+  //    nop                                     ;sled NOP, never rewritten
+  //    sw ra, 4(sp)                            ;save return address
+  //    sw t9, 0(sp)                            ;save register t9
+  //    lui t9, %hi(__xray_FunctionEntry/Exit)
+  //    ori t9, t9, %lo(__xray_FunctionEntry/Exit)
+  //    lui t0, %hi(function_id)
+  //    jalr t9                                 ;call Tracing hook
+  //    ori t0, t0, %lo(function_id)            ;pass function id (delay slot)
+  //    lw t9, 0(sp)                            ;restore register t9
+  //    lw ra, 4(sp)                            ;restore return address
+  //    addiu sp, sp, 8                         ;delete stack frame
+  //
+  // We add 44 bytes to t9 because we want to adjust the function pointer to
+  // the actual start of function i.e. the address just after the nop sled.
+  // We do this because gp displacement relocation is emitted at the start
+  // of the function i.e. after the nop sled and to correctly calculate the
+  // global offset table address, t9 must hold the address of the instruction
+  // containing the gp displacement relocation.
+  // FIXME: Is this correct for the static relocation model?
+  //
+  // Replacement of the first 4-byte instruction should be the last and atomic
+  // operation, so that the user code which reaches the sled concurrently
+  // either jumps over the whole sled, or executes the whole sled when the
+  // latter is ready.
+  //
+  // When |Enable|==false, we set back the first instruction in the sled to be
+  //   B #44
+
+  if (Enable) {
+    uint32_t LoTracingHookAddr =
+        reinterpret_cast<int32_t>(TracingHook) & 0xffff;
+    uint32_t HiTracingHookAddr =
+        (reinterpret_cast<int32_t>(TracingHook) >> 16) & 0xffff;
+    uint32_t LoFunctionID = FuncId & 0xffff;
+    uint32_t HiFunctionID = (FuncId >> 16) & 0xffff;
+    *reinterpret_cast<uint32_t *>(Sled.Address + 8) = encodeInstruction(
+        PatchOpcodes::PO_SW, RegNum::RN_SP, RegNum::RN_RA, 0x4);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 12) = encodeInstruction(
+        PatchOpcodes::PO_SW, RegNum::RN_SP, RegNum::RN_T9, 0x0);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 16) = encodeInstruction(
+        PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T9, HiTracingHookAddr);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 20) = encodeInstruction(
+        PatchOpcodes::PO_ORI, RegNum::RN_T9, RegNum::RN_T9, LoTracingHookAddr);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 24) = encodeInstruction(
+        PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T0, HiFunctionID);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 28) = encodeSpecialInstruction(
+        PatchOpcodes::PO_JALR, RegNum::RN_T9, 0x0, RegNum::RN_RA, 0X0);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 32) = encodeInstruction(
+        PatchOpcodes::PO_ORI, RegNum::RN_T0, RegNum::RN_T0, LoFunctionID);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 36) = encodeInstruction(
+        PatchOpcodes::PO_LW, RegNum::RN_SP, RegNum::RN_T9, 0x0);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 40) = encodeInstruction(
+        PatchOpcodes::PO_LW, RegNum::RN_SP, RegNum::RN_RA, 0x4);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 44) = encodeInstruction(
+        PatchOpcodes::PO_ADDIU, RegNum::RN_SP, RegNum::RN_SP, 0x8);
+    uint32_t CreateStackSpaceInstr = encodeInstruction(
+        PatchOpcodes::PO_ADDIU, RegNum::RN_SP, RegNum::RN_SP, 0xFFF8);
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(Sled.Address),
+        uint32_t(CreateStackSpaceInstr), std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(Sled.Address),
+        uint32_t(PatchOpcodes::PO_B44), std::memory_order_release);
+  }
+  return true; // no failure path in this routine
+}
+
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
+                        const XRaySledEntry &Sled,
+                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT {
+  return patchSled(Enable, FuncId, Sled, Trampoline); // caller picks the hook
+}
+
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
+                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); // hook is fixed
+}
+
+bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
+                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Tail exits currently share __xray_FunctionExit with normal exits;
+  // in the future we'd need to distinguish them to preserve information.
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
+}
+
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in mips? Until then the sled is left alone (false).
+  return false;
+}
+
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in mips? Until then the sled is left alone (false).
+  return false;
+}
+
+} // namespace __xray
+
+extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
+  // FIXME: empty stub; must be implemented in the trampoline assembly file
+}
diff --git a/lib/xray/xray_mips64.cc b/lib/xray/xray_mips64.cc
deleted file mode 100644
index 73c8924f9a0b..000000000000
--- a/lib/xray/xray_mips64.cc
+++ /dev/null
@@ -1,178 +0,0 @@
-//===-- xray_mips64.cc ------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// Implementation of MIPS64-specific routines.
-//
-//===----------------------------------------------------------------------===//
-#include "sanitizer_common/sanitizer_common.h"
-#include "xray_defs.h"
-#include "xray_interface_internal.h"
-#include <atomic>
-
-namespace __xray {
-
-// The machine codes for some instructions used in runtime patching.
-enum PatchOpcodes : uint32_t {
-  PO_DADDIU = 0x64000000, // daddiu rt, rs, imm
-  PO_SD = 0xFC000000,     // sd rt, base(offset)
-  PO_LUI = 0x3C000000,    // lui rt, imm
-  PO_ORI = 0x34000000,    // ori rt, rs, imm
-  PO_DSLL = 0x00000038,   // dsll rd, rt, sa
-  PO_JALR = 0x00000009,   // jalr rs
-  PO_LD = 0xDC000000,     // ld rt, base(offset)
-  PO_B60 = 0x1000000f,    // b #60
-  PO_NOP = 0x0,           // nop
-};
-
-enum RegNum : uint32_t {
-  RN_T0 = 0xC,
-  RN_T9 = 0x19,
-  RN_RA = 0x1F,
-  RN_SP = 0x1D,
-};
-
-inline static uint32_t encodeInstruction(uint32_t Opcode, uint32_t Rs,
-                                         uint32_t Rt,
-                                         uint32_t Imm) XRAY_NEVER_INSTRUMENT {
-  return (Opcode | Rs << 21 | Rt << 16 | Imm);
-}
-
-inline static uint32_t
-encodeSpecialInstruction(uint32_t Opcode, uint32_t Rs, uint32_t Rt, uint32_t Rd,
-                         uint32_t Imm) XRAY_NEVER_INSTRUMENT {
-  return (Rs << 21 | Rt << 16 | Rd << 11 | Imm << 6 | Opcode);
-}
-
-inline static bool patchSled(const bool Enable, const uint32_t FuncId,
-                             const XRaySledEntry &Sled,
-                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT {
-  // When |Enable| == true,
-  // We replace the following compile-time stub (sled):
-  //
-  // xray_sled_n:
-  //	B .tmpN
-  //	15 NOPs (60 bytes)
-  //	.tmpN
-  //
-  // With the following runtime patch:
-  //
-  // xray_sled_n (64-bit):
-  //    daddiu sp, sp, -16                      ;create stack frame
-  //    nop
-  //    sd ra, 8(sp)                            ;save return address
-  //    sd t9, 0(sp)                            ;save register t9
-  //    lui t9, %highest(__xray_FunctionEntry/Exit)
-  //    ori t9, t9, %higher(__xray_FunctionEntry/Exit)
-  //    dsll t9, t9, 16
-  //    ori t9, t9, %hi(__xray_FunctionEntry/Exit)
-  //    dsll t9, t9, 16
-  //    ori t9, t9, %lo(__xray_FunctionEntry/Exit)
-  //    lui t0, %hi(function_id)
-  //    jalr t9                                 ;call Tracing hook
-  //    ori t0, t0, %lo(function_id)            ;pass function id (delay slot)
-  //    ld t9, 0(sp)                            ;restore register t9
-  //    ld ra, 8(sp)                            ;restore return address
-  //    daddiu sp, sp, 16                       ;delete stack frame
-  //
-  // Replacement of the first 4-byte instruction should be the last and atomic
-  // operation, so that the user code which reaches the sled concurrently
-  // either jumps over the whole sled, or executes the whole sled when the
-  // latter is ready.
-  //
-  // When |Enable|==false, we set back the first instruction in the sled to be
-  //   B #60
-
-  if (Enable) {
-    uint32_t LoTracingHookAddr =
-        reinterpret_cast<int64_t>(TracingHook) & 0xffff;
-    uint32_t HiTracingHookAddr =
-        (reinterpret_cast<int64_t>(TracingHook) >> 16) & 0xffff;
-    uint32_t HigherTracingHookAddr =
-        (reinterpret_cast<int64_t>(TracingHook) >> 32) & 0xffff;
-    uint32_t HighestTracingHookAddr =
-        (reinterpret_cast<int64_t>(TracingHook) >> 48) & 0xffff;
-    uint32_t LoFunctionID = FuncId & 0xffff;
-    uint32_t HiFunctionID = (FuncId >> 16) & 0xffff;
-    *reinterpret_cast<uint32_t *>(Sled.Address + 8) = encodeInstruction(
-        PatchOpcodes::PO_SD, RegNum::RN_SP, RegNum::RN_RA, 0x8);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 12) = encodeInstruction(
-        PatchOpcodes::PO_SD, RegNum::RN_SP, RegNum::RN_T9, 0x0);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 16) = encodeInstruction(
-        PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T9, HighestTracingHookAddr);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 20) =
-        encodeInstruction(PatchOpcodes::PO_ORI, RegNum::RN_T9, RegNum::RN_T9,
-                          HigherTracingHookAddr);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 24) = encodeSpecialInstruction(
-        PatchOpcodes::PO_DSLL, 0x0, RegNum::RN_T9, RegNum::RN_T9, 0x10);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 28) = encodeInstruction(
-        PatchOpcodes::PO_ORI, RegNum::RN_T9, RegNum::RN_T9, HiTracingHookAddr);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 32) = encodeSpecialInstruction(
-        PatchOpcodes::PO_DSLL, 0x0, RegNum::RN_T9, RegNum::RN_T9, 0x10);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 36) = encodeInstruction(
-        PatchOpcodes::PO_ORI, RegNum::RN_T9, RegNum::RN_T9, LoTracingHookAddr);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 40) = encodeInstruction(
-        PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T0, HiFunctionID);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 44) = encodeSpecialInstruction(
-        PatchOpcodes::PO_JALR, RegNum::RN_T9, 0x0, RegNum::RN_RA, 0X0);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 48) = encodeInstruction(
-        PatchOpcodes::PO_ORI, RegNum::RN_T0, RegNum::RN_T0, LoFunctionID);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 52) = encodeInstruction(
-        PatchOpcodes::PO_LD, RegNum::RN_SP, RegNum::RN_T9, 0x0);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 56) = encodeInstruction(
-        PatchOpcodes::PO_LD, RegNum::RN_SP, RegNum::RN_RA, 0x8);
-    *reinterpret_cast<uint32_t *>(Sled.Address + 60) = encodeInstruction(
-        PatchOpcodes::PO_DADDIU, RegNum::RN_SP, RegNum::RN_SP, 0x10);
-    uint32_t CreateStackSpace = encodeInstruction(
-        PatchOpcodes::PO_DADDIU, RegNum::RN_SP, RegNum::RN_SP, 0xfff0);
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint32_t> *>(Sled.Address),
-        CreateStackSpace, std::memory_order_release);
-  } else {
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint32_t> *>(Sled.Address),
-        uint32_t(PatchOpcodes::PO_B60), std::memory_order_release);
-  }
-  return true;
-}
-
-bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
-                        const XRaySledEntry &Sled,
-                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT {
-  return patchSled(Enable, FuncId, Sled, Trampoline);
-}
-
-bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
-                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
-}
-
-bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
-                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // FIXME: In the future we'd need to distinguish between non-tail exits and
-  // tail exits for better information preservation.
-  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
-}
-
-bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
-                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // FIXME: Implement in mips64?
-  return false;
-}
-
-bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
-                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // FIXME: Implement in mips64?
-  return false;
-}
-} // namespace __xray
-
-extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
-  // FIXME: this will have to be implemented in the trampoline assembly file
-}
diff --git a/lib/xray/xray_mips64.cpp b/lib/xray/xray_mips64.cpp
new file mode 100644
index 000000000000..62c67ff7376d
--- /dev/null
+++ b/lib/xray/xray_mips64.cpp
@@ -0,0 +1,178 @@
+//===-- xray_mips64.cpp -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Implementation of MIPS64-specific routines.
+//
+//===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray_defs.h"
+#include "xray_interface_internal.h"
+#include <atomic>
+
+namespace __xray {
+
+// The machine codes for some instructions used in runtime patching.
+enum PatchOpcodes : uint32_t {
+  PO_DADDIU = 0x64000000, // daddiu rt, rs, imm
+  PO_SD = 0xFC000000,     // sd rt, offset(base)
+  PO_LUI = 0x3C000000,    // lui rt, imm
+  PO_ORI = 0x34000000,    // ori rt, rs, imm
+  PO_DSLL = 0x00000038,   // dsll rd, rt, sa
+  PO_JALR = 0x00000009,   // jalr rs (rd is supplied when encoding)
+  PO_LD = 0xDC000000,     // ld rt, offset(base)
+  PO_B60 = 0x1000000f,    // b #60 (offset field 0xf = 15 instructions)
+  PO_NOP = 0x0,           // nop
+};
+
+enum RegNum : uint32_t {
+  RN_T0 = 0xC,  // t0: loaded with the function id by the patched sled
+  RN_T9 = 0x19, // t9: loaded with the tracing hook address for jalr
+  RN_RA = 0x1F, // ra: return address, saved/restored around the call
+  RN_SP = 0x1D, // sp: stack pointer, base for the sd/ld slots
+};
+
+inline static uint32_t encodeInstruction(uint32_t Opcode, uint32_t Rs,
+                                         uint32_t Rt,
+                                         uint32_t Imm) XRAY_NEVER_INSTRUMENT {
+  return (Opcode | Rs << 21 | Rt << 16 | Imm); // Imm is OR-ed in unmasked
+}
+
+inline static uint32_t
+encodeSpecialInstruction(uint32_t Opcode, uint32_t Rs, uint32_t Rt, uint32_t Rd,
+                         uint32_t Imm) XRAY_NEVER_INSTRUMENT {
+  return (Rs << 21 | Rt << 16 | Rd << 11 | Imm << 6 | Opcode); // no masking
+}
+
+inline static bool patchSled(const bool Enable, const uint32_t FuncId,
+                             const XRaySledEntry &Sled,
+                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT {
+  // When |Enable| == true,
+  // We replace the following compile-time stub (sled):
+  //
+  // xray_sled_n:
+  //	B .tmpN
+  //	15 NOPs (60 bytes)
+  //	.tmpN
+  //
+  // With the following runtime patch:
+  //
+  // xray_sled_n (64-bit):
+  //    daddiu sp, sp, -16                      ;create stack frame
+  //    nop                                     ;sled NOP, never rewritten
+  //    sd ra, 8(sp)                            ;save return address
+  //    sd t9, 0(sp)                            ;save register t9
+  //    lui t9, %highest(__xray_FunctionEntry/Exit)
+  //    ori t9, t9, %higher(__xray_FunctionEntry/Exit)
+  //    dsll t9, t9, 16
+  //    ori t9, t9, %hi(__xray_FunctionEntry/Exit)
+  //    dsll t9, t9, 16
+  //    ori t9, t9, %lo(__xray_FunctionEntry/Exit)
+  //    lui t0, %hi(function_id)
+  //    jalr t9                                 ;call Tracing hook
+  //    ori t0, t0, %lo(function_id)            ;pass function id (delay slot)
+  //    ld t9, 0(sp)                            ;restore register t9
+  //    ld ra, 8(sp)                            ;restore return address
+  //    daddiu sp, sp, 16                       ;delete stack frame
+  //
+  // Replacement of the first 4-byte instruction should be the last and atomic
+  // operation, so that the user code which reaches the sled concurrently
+  // either jumps over the whole sled, or executes the whole sled when the
+  // latter is ready.
+  //
+  // When |Enable|==false, we set back the first instruction in the sled to be
+  //   B #60
+
+  if (Enable) {
+    uint32_t LoTracingHookAddr =
+        reinterpret_cast<int64_t>(TracingHook) & 0xffff;
+    uint32_t HiTracingHookAddr =
+        (reinterpret_cast<int64_t>(TracingHook) >> 16) & 0xffff;
+    uint32_t HigherTracingHookAddr =
+        (reinterpret_cast<int64_t>(TracingHook) >> 32) & 0xffff;
+    uint32_t HighestTracingHookAddr =
+        (reinterpret_cast<int64_t>(TracingHook) >> 48) & 0xffff;
+    uint32_t LoFunctionID = FuncId & 0xffff;
+    uint32_t HiFunctionID = (FuncId >> 16) & 0xffff;
+    *reinterpret_cast<uint32_t *>(Sled.Address + 8) = encodeInstruction(
+        PatchOpcodes::PO_SD, RegNum::RN_SP, RegNum::RN_RA, 0x8);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 12) = encodeInstruction(
+        PatchOpcodes::PO_SD, RegNum::RN_SP, RegNum::RN_T9, 0x0);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 16) = encodeInstruction(
+        PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T9, HighestTracingHookAddr);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 20) =
+        encodeInstruction(PatchOpcodes::PO_ORI, RegNum::RN_T9, RegNum::RN_T9,
+                          HigherTracingHookAddr);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 24) = encodeSpecialInstruction(
+        PatchOpcodes::PO_DSLL, 0x0, RegNum::RN_T9, RegNum::RN_T9, 0x10);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 28) = encodeInstruction(
+        PatchOpcodes::PO_ORI, RegNum::RN_T9, RegNum::RN_T9, HiTracingHookAddr);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 32) = encodeSpecialInstruction(
+        PatchOpcodes::PO_DSLL, 0x0, RegNum::RN_T9, RegNum::RN_T9, 0x10);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 36) = encodeInstruction(
+        PatchOpcodes::PO_ORI, RegNum::RN_T9, RegNum::RN_T9, LoTracingHookAddr);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 40) = encodeInstruction(
+        PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T0, HiFunctionID);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 44) = encodeSpecialInstruction(
+        PatchOpcodes::PO_JALR, RegNum::RN_T9, 0x0, RegNum::RN_RA, 0X0);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 48) = encodeInstruction(
+        PatchOpcodes::PO_ORI, RegNum::RN_T0, RegNum::RN_T0, LoFunctionID);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 52) = encodeInstruction(
+        PatchOpcodes::PO_LD, RegNum::RN_SP, RegNum::RN_T9, 0x0);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 56) = encodeInstruction(
+        PatchOpcodes::PO_LD, RegNum::RN_SP, RegNum::RN_RA, 0x8);
+    *reinterpret_cast<uint32_t *>(Sled.Address + 60) = encodeInstruction(
+        PatchOpcodes::PO_DADDIU, RegNum::RN_SP, RegNum::RN_SP, 0x10);
+    uint32_t CreateStackSpace = encodeInstruction(
+        PatchOpcodes::PO_DADDIU, RegNum::RN_SP, RegNum::RN_SP, 0xfff0);
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(Sled.Address),
+        CreateStackSpace, std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(Sled.Address),
+        uint32_t(PatchOpcodes::PO_B60), std::memory_order_release);
+  }
+  return true; // no failure path in this routine
+}
+
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
+                        const XRaySledEntry &Sled,
+                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT {
+  return patchSled(Enable, FuncId, Sled, Trampoline); // caller picks the hook
+}
+
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
+                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); // hook is fixed
+}
+
+bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
+                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Tail exits currently share __xray_FunctionExit with normal exits;
+  // in the future we'd need to distinguish them to preserve information.
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit);
+}
+
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in mips64? Until then the sled is left alone (false).
+  return false;
+}
+
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in mips64? Until then the sled is left alone (false).
+  return false;
+}
+} // namespace __xray
+
+extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
+  // FIXME: empty stub; must be implemented in the trampoline assembly file
+}
diff --git a/lib/xray/xray_powerpc64.cc b/lib/xray/xray_powerpc64.cc
deleted file mode 100644
index abc2becf5b4d..000000000000
--- a/lib/xray/xray_powerpc64.cc
+++ /dev/null
@@ -1,111 +0,0 @@
-//===-- xray_powerpc64.cc ---------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// Implementation of powerpc64 and powerpc64le routines.
-//
-//===----------------------------------------------------------------------===//
-#include "sanitizer_common/sanitizer_common.h"
-#include "xray_defs.h"
-#include "xray_interface_internal.h"
-#include "xray_utils.h"
-#include <atomic>
-#include <cassert>
-#include <cstring>
-
-#ifndef __LITTLE_ENDIAN__
-#error powerpc64 big endian is not supported for now.
-#endif
-
-namespace {
-
-constexpr unsigned long long JumpOverInstNum = 7;
-
-void clearCache(void *Addr, size_t Len) {
-  const size_t LineSize = 32;
-
-  const intptr_t Mask = ~(LineSize - 1);
-  const intptr_t StartLine = ((intptr_t)Addr) & Mask;
-  const intptr_t EndLine = ((intptr_t)Addr + Len + LineSize - 1) & Mask;
-
-  for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
-    asm volatile("dcbf 0, %0" : : "r"(Line));
-  asm volatile("sync");
-
-  for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
-    asm volatile("icbi 0, %0" : : "r"(Line));
-  asm volatile("isync");
-}
-
-} // namespace
-
-extern "C" void __clear_cache(void *start, void *end);
-
-namespace __xray {
-
-bool patchFunctionEntry(const bool Enable, uint32_t FuncId,
-                        const XRaySledEntry &Sled,
-                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT {
-  if (Enable) {
-    // lis 0, FuncId[16..32]
-    // li 0, FuncId[0..15]
-    *reinterpret_cast<uint64_t *>(Sled.Address) =
-        (0x3c000000ull + (FuncId >> 16)) +
-        ((0x60000000ull + (FuncId & 0xffff)) << 32);
-  } else {
-    // b +JumpOverInstNum instructions.
-    *reinterpret_cast<uint32_t *>(Sled.Address) =
-        0x48000000ull + (JumpOverInstNum << 2);
-  }
-  clearCache(reinterpret_cast<void *>(Sled.Address), 8);
-  return true;
-}
-
-bool patchFunctionExit(const bool Enable, uint32_t FuncId,
-                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  if (Enable) {
-    // lis 0, FuncId[16..32]
-    // li 0, FuncId[0..15]
-    *reinterpret_cast<uint64_t *>(Sled.Address) =
-        (0x3c000000ull + (FuncId >> 16)) +
-        ((0x60000000ull + (FuncId & 0xffff)) << 32);
-  } else {
-    // Copy the blr/b instruction after JumpOverInstNum instructions.
-    *reinterpret_cast<uint32_t *>(Sled.Address) =
-        *(reinterpret_cast<uint32_t *>(Sled.Address) + JumpOverInstNum);
-  }
-  clearCache(reinterpret_cast<void *>(Sled.Address), 8);
-  return true;
-}
-
-bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
-                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  return patchFunctionExit(Enable, FuncId, Sled);
-}
-
-// FIXME: Maybe implement this better?
-bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
-
-bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
-                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // FIXME: Implement in powerpc64?
-  return false;
-}
-
-bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
-                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // FIXME: Implement in powerpc64?
-  return false;
-}
-
-} // namespace __xray
-
-extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
-  // FIXME: this will have to be implemented in the trampoline assembly file
-}
diff --git a/lib/xray/xray_powerpc64.cpp b/lib/xray/xray_powerpc64.cpp
new file mode 100644
index 000000000000..b41f1bce6f21
--- /dev/null
+++ b/lib/xray/xray_powerpc64.cpp
@@ -0,0 +1,111 @@
+//===-- xray_powerpc64.cpp --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Implementation of powerpc64 and powerpc64le routines.
+//
+//===----------------------------------------------------------------------===//
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray_defs.h"
+#include "xray_interface_internal.h"
+#include "xray_utils.h"
+#include <atomic>
+#include <cassert>
+#include <cstring>
+
+#ifndef __LITTLE_ENDIAN__
+#error powerpc64 big endian is not supported for now.
+#endif
+
+namespace {
+
+constexpr unsigned long long JumpOverInstNum = 7; // counted in 4-byte words
+
+void clearCache(void *Addr, size_t Len) {
+  const size_t LineSize = 32; // stride, in bytes, of both loops below
+  // Compute the LineSize-aligned span [StartLine, EndLine) covering Len bytes.
+  const intptr_t Mask = ~(LineSize - 1);
+  const intptr_t StartLine = ((intptr_t)Addr) & Mask;
+  const intptr_t EndLine = ((intptr_t)Addr + Len + LineSize - 1) & Mask;
+
+  for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
+    asm volatile("dcbf 0, %0" : : "r"(Line)); // dcbf: data cache block flush
+  asm volatile("sync");
+  // Then invalidate the same blocks in the instruction cache (icbi) and isync.
+  for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize)
+    asm volatile("icbi 0, %0" : : "r"(Line));
+  asm volatile("isync");
+}
+
+} // namespace
+
+extern "C" void __clear_cache(void *start, void *end);
+
+namespace __xray {
+
+bool patchFunctionEntry(const bool Enable, uint32_t FuncId,
+                        const XRaySledEntry &Sled,
+                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT {
+  if (Enable) {
+    // lis 0, FuncId[16..31]    -- low word, i.e. the first instruction (LE)
+    // ori 0, 0, FuncId[0..15]  -- high word; 0x60000000 is the ori opcode
+    *reinterpret_cast<uint64_t *>(Sled.Address) =
+        (0x3c000000ull + (FuncId >> 16)) +
+        ((0x60000000ull + (FuncId & 0xffff)) << 32);
+  } else {
+    // b +JumpOverInstNum instructions; only the first word is rewritten.
+    *reinterpret_cast<uint32_t *>(Sled.Address) =
+        0x48000000ull + (JumpOverInstNum << 2);
+  }
+  clearCache(reinterpret_cast<void *>(Sled.Address), 8); // both patched words
+  return true; // unconditional; Trampoline is not referenced in this function
+}
+
+bool patchFunctionExit(const bool Enable, uint32_t FuncId,
+                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  if (Enable) {
+    // lis 0, FuncId[16..31]    -- low word, i.e. the first instruction (LE)
+    // ori 0, 0, FuncId[0..15]  -- high word; 0x60000000 is the ori opcode
+    *reinterpret_cast<uint64_t *>(Sled.Address) =
+        (0x3c000000ull + (FuncId >> 16)) +
+        ((0x60000000ull + (FuncId & 0xffff)) << 32);
+  } else {
+    // Copy the blr/b instruction found JumpOverInstNum words into the sled.
+    *reinterpret_cast<uint32_t *>(Sled.Address) =
+        *(reinterpret_cast<uint32_t *>(Sled.Address) + JumpOverInstNum);
+  }
+  clearCache(reinterpret_cast<void *>(Sled.Address), 8); // both patched words
+  return true; // unconditional
+}
+
+bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
+                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  return patchFunctionExit(Enable, FuncId, Sled); // same as a normal exit
+}
+
+// FIXME: Maybe implement this better? No probing is done; always returns true.
+bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; }
+
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in powerpc64? Nothing is patched; arguments are ignored.
+  return false; // unconditional: custom-event sleds are not handled here
+}
+
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // FIXME: Implement in powerpc64? Nothing is patched; arguments are ignored.
+  return false; // unconditional: typed-event sleds are not handled here
+}
+
+} // namespace __xray
+
+extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT {
+  // FIXME: empty stub; must be implemented in the trampoline assembly file
+}
diff --git a/lib/xray/xray_profile_collector.cc b/lib/xray/xray_profile_collector.cc
deleted file mode 100644
index 97b52e1d9a22..000000000000
--- a/lib/xray/xray_profile_collector.cc
+++ /dev/null
@@ -1,414 +0,0 @@
-//===-- xray_profile_collector.cc ------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// This implements the interface for the profileCollectorService.
-//
-//===----------------------------------------------------------------------===//
-#include "xray_profile_collector.h"
-#include "sanitizer_common/sanitizer_common.h"
-#include "xray_allocator.h"
-#include "xray_defs.h"
-#include "xray_profiling_flags.h"
-#include "xray_segmented_array.h"
-#include <memory>
-#include <pthread.h>
-#include <utility>
-
-namespace __xray {
-namespace profileCollectorService {
-
-namespace {
-
-SpinMutex GlobalMutex;
-struct ThreadTrie {
-  tid_t TId;
-  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
-};
-
-struct ProfileBuffer {
-  void *Data;
-  size_t Size;
-};
-
-// Current version of the profile format.
-constexpr u64 XRayProfilingVersion = 0x20180424;
-
-// Identifier for XRay profiling files 'xrayprof' in hex.
-constexpr u64 XRayMagicBytes = 0x7872617970726f66;
-
-struct XRayProfilingFileHeader {
-  const u64 MagicBytes = XRayMagicBytes;
-  const u64 Version = XRayProfilingVersion;
-  u64 Timestamp = 0; // System time in nanoseconds.
-  u64 PID = 0;       // Process ID.
-};
-
-struct BlockHeader {
-  u32 BlockSize;
-  u32 BlockNum;
-  u64 ThreadId;
-};
-
-struct ThreadData {
-  BufferQueue *BQ;
-  FunctionCallTrie::Allocators::Buffers Buffers;
-  FunctionCallTrie::Allocators Allocators;
-  FunctionCallTrie FCT;
-  tid_t TId;
-};
-
-using ThreadDataArray = Array<ThreadData>;
-using ThreadDataAllocator = ThreadDataArray::AllocatorType;
-
-// We use a separate buffer queue for the backing store for the allocator used
-// by the ThreadData array. This lets us host the buffers, allocators, and tries
-// associated with a thread by moving the data into the array instead of
-// attempting to copy the data to a separately backed set of tries.
-static typename std::aligned_storage<
-    sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
-static BufferQueue *BQ = nullptr;
-static BufferQueue::Buffer Buffer;
-static typename std::aligned_storage<sizeof(ThreadDataAllocator),
-                                     alignof(ThreadDataAllocator)>::type
-    ThreadDataAllocatorStorage;
-static typename std::aligned_storage<sizeof(ThreadDataArray),
-                                     alignof(ThreadDataArray)>::type
-    ThreadDataArrayStorage;
-
-static ThreadDataAllocator *TDAllocator = nullptr;
-static ThreadDataArray *TDArray = nullptr;
-
-using ProfileBufferArray = Array<ProfileBuffer>;
-using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
-
-// These need to be global aligned storage to avoid dynamic initialization. We
-// need these to be aligned to allow us to placement new objects into the
-// storage, and have pointers to those objects be appropriately aligned.
-static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
-    ProfileBuffersStorage;
-static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
-    ProfileBufferArrayAllocatorStorage;
-
-static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
-static ProfileBufferArray *ProfileBuffers = nullptr;
-
-// Use a global flag to determine whether the collector implementation has been
-// initialized.
-static atomic_uint8_t CollectorInitialized{0};
-
-} // namespace
-
-void post(BufferQueue *Q, FunctionCallTrie &&T,
-          FunctionCallTrie::Allocators &&A,
-          FunctionCallTrie::Allocators::Buffers &&B,
-          tid_t TId) XRAY_NEVER_INSTRUMENT {
-  DCHECK_NE(Q, nullptr);
-
-  // Bail out early if the collector has not been initialized.
-  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
-    T.~FunctionCallTrie();
-    A.~Allocators();
-    Q->releaseBuffer(B.NodeBuffer);
-    Q->releaseBuffer(B.RootsBuffer);
-    Q->releaseBuffer(B.ShadowStackBuffer);
-    Q->releaseBuffer(B.NodeIdPairBuffer);
-    B.~Buffers();
-    return;
-  }
-
-  {
-    SpinMutexLock Lock(&GlobalMutex);
-    DCHECK_NE(TDAllocator, nullptr);
-    DCHECK_NE(TDArray, nullptr);
-
-    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
-                               TId) == nullptr) {
-      // If we fail to add the data to the array, we should destroy the objects
-      // handed us.
-      T.~FunctionCallTrie();
-      A.~Allocators();
-      Q->releaseBuffer(B.NodeBuffer);
-      Q->releaseBuffer(B.RootsBuffer);
-      Q->releaseBuffer(B.ShadowStackBuffer);
-      Q->releaseBuffer(B.NodeIdPairBuffer);
-      B.~Buffers();
-    }
-  }
-}
-
-// A PathArray represents the function id's representing a stack trace. In this
-// context a path is almost always represented from the leaf function in a call
-// stack to a root of the call trie.
-using PathArray = Array<int32_t>;
-
-struct ProfileRecord {
-  using PathAllocator = typename PathArray::AllocatorType;
-
-  // The Path in this record is the function id's from the leaf to the root of
-  // the function call stack as represented from a FunctionCallTrie.
-  PathArray Path;
-  const FunctionCallTrie::Node *Node;
-};
-
-namespace {
-
-using ProfileRecordArray = Array<ProfileRecord>;
-
-// Walk a depth-first traversal of each root of the FunctionCallTrie to generate
-// the path(s) and the data associated with the path.
-static void
-populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
-                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
-  using StackArray = Array<const FunctionCallTrie::Node *>;
-  using StackAllocator = typename StackArray::AllocatorType;
-  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
-  StackArray DFSStack(StackAlloc);
-  for (const auto *R : Trie.getRoots()) {
-    DFSStack.Append(R);
-    while (!DFSStack.empty()) {
-      auto *Node = DFSStack.back();
-      DFSStack.trim(1);
-      if (Node == nullptr)
-        continue;
-      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
-      if (Record == nullptr)
-        return;
-      DCHECK_NE(Record, nullptr);
-
-      // Traverse the Node's parents and as we're doing so, get the FIds in
-      // the order they appear.
-      for (auto N = Node; N != nullptr; N = N->Parent)
-        Record->Path.Append(N->FId);
-      DCHECK(!Record->Path.empty());
-
-      for (const auto C : Node->Callees)
-        DFSStack.Append(C.NodePtr);
-    }
-  }
-}
-
-static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
-                             const ProfileRecordArray &ProfileRecords)
-    XRAY_NEVER_INSTRUMENT {
-  auto NextPtr = static_cast<uint8_t *>(
-                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
-                 sizeof(Header);
-  for (const auto &Record : ProfileRecords) {
-    // List of IDs follow:
-    for (const auto FId : Record.Path)
-      NextPtr =
-          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
-          sizeof(FId);
-
-    // Add the sentinel here.
-    constexpr int32_t SentinelFId = 0;
-    NextPtr = static_cast<uint8_t *>(
-                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
-              sizeof(SentinelFId);
-
-    // Add the node data here.
-    NextPtr =
-        static_cast<uint8_t *>(internal_memcpy(
-            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
-        sizeof(Record.Node->CallCount);
-    NextPtr = static_cast<uint8_t *>(
-                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
-                                  sizeof(Record.Node->CumulativeLocalTime))) +
-              sizeof(Record.Node->CumulativeLocalTime);
-  }
-
-  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
-}
-
-} // namespace
-
-void serialize() XRAY_NEVER_INSTRUMENT {
-  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
-    return;
-
-  SpinMutexLock Lock(&GlobalMutex);
-
-  // Clear out the global ProfileBuffers, if it's not empty.
-  for (auto &B : *ProfileBuffers)
-    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
-  ProfileBuffers->trim(ProfileBuffers->size());
-
-  DCHECK_NE(TDArray, nullptr);
-  if (TDArray->empty())
-    return;
-
-  // Then repopulate the global ProfileBuffers.
-  u32 I = 0;
-  auto MaxSize = profilingFlags()->global_allocator_max;
-  auto ProfileArena = allocateBuffer(MaxSize);
-  if (ProfileArena == nullptr)
-    return;
-
-  auto ProfileArenaCleanup = at_scope_exit(
-      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });
-
-  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
-  if (PathArena == nullptr)
-    return;
-
-  auto PathArenaCleanup = at_scope_exit(
-      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });
-
-  for (const auto &ThreadTrie : *TDArray) {
-    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
-    ProfileRecordAllocator PRAlloc(ProfileArena,
-                                   profilingFlags()->global_allocator_max);
-    ProfileRecord::PathAllocator PathAlloc(
-        PathArena, profilingFlags()->global_allocator_max);
-    ProfileRecordArray ProfileRecords(PRAlloc);
-
-    // First, we want to compute the amount of space we're going to need. We'll
-    // use a local allocator and an __xray::Array<...> to store the intermediary
-    // data, then compute the size as we're going along. Then we'll allocate the
-    // contiguous space to contain the thread buffer data.
-    if (ThreadTrie.FCT.getRoots().empty())
-      continue;
-
-    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
-    DCHECK(!ThreadTrie.FCT.getRoots().empty());
-    DCHECK(!ProfileRecords.empty());
-
-    // Go through each record, to compute the sizes.
-    //
-    // header size = block size (4 bytes)
-    //   + block number (4 bytes)
-    //   + thread id (8 bytes)
-    // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
-    //   + call count (8 bytes)
-    //   + local time (8 bytes)
-    //   + end of record (8 bytes)
-    u32 CumulativeSizes = 0;
-    for (const auto &Record : ProfileRecords)
-      CumulativeSizes += 20 + (4 * Record.Path.size());
-
-    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
-    auto B = ProfileBuffers->Append({});
-    B->Size = sizeof(Header) + CumulativeSizes;
-    B->Data = allocateBuffer(B->Size);
-    DCHECK_NE(B->Data, nullptr);
-    serializeRecords(B, Header, ProfileRecords);
-  }
-}
-
-void reset() XRAY_NEVER_INSTRUMENT {
-  atomic_store(&CollectorInitialized, 0, memory_order_release);
-  SpinMutexLock Lock(&GlobalMutex);
-
-  if (ProfileBuffers != nullptr) {
-    // Clear out the profile buffers that have been serialized.
-    for (auto &B : *ProfileBuffers)
-      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
-    ProfileBuffers->trim(ProfileBuffers->size());
-    ProfileBuffers = nullptr;
-  }
-
-  if (TDArray != nullptr) {
-    // Release the resources as required.
-    for (auto &TD : *TDArray) {
-      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
-      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
-      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
-      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
-    }
-    // We don't bother destroying the array here because we've already
-    // potentially freed the backing store for the array. Instead we're going to
-    // reset the pointer to nullptr, and re-use the storage later instead
-    // (placement-new'ing into the storage as-is).
-    TDArray = nullptr;
-  }
-
-  if (TDAllocator != nullptr) {
-    TDAllocator->~Allocator();
-    TDAllocator = nullptr;
-  }
-
-  if (Buffer.Data != nullptr) {
-    BQ->releaseBuffer(Buffer);
-  }
-
-  if (BQ == nullptr) {
-    bool Success = false;
-    new (&BufferQueueStorage)
-        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
-    if (!Success)
-      return;
-    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
-  } else {
-    BQ->finalize();
-
-    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
-        BufferQueue::ErrorCode::Ok)
-      return;
-  }
-
-  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
-    return;
-
-  new (&ProfileBufferArrayAllocatorStorage)
-      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
-  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
-      &ProfileBufferArrayAllocatorStorage);
-
-  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
-  ProfileBuffers =
-      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
-
-  new (&ThreadDataAllocatorStorage)
-      ThreadDataAllocator(Buffer.Data, Buffer.Size);
-  TDAllocator =
-      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
-  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
-  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);
-
-  atomic_store(&CollectorInitialized, 1, memory_order_release);
-}
-
-XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
-  SpinMutexLock Lock(&GlobalMutex);
-
-  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
-    return {nullptr, 0};
-
-  static pthread_once_t Once = PTHREAD_ONCE_INIT;
-  static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
-      FileHeaderStorage;
-  pthread_once(
-      &Once, +[]() XRAY_NEVER_INSTRUMENT {
-        new (&FileHeaderStorage) XRayProfilingFileHeader{};
-      });
-
-  if (UNLIKELY(B.Data == nullptr)) {
-    // The first buffer should always contain the file header information.
-    auto &FileHeader =
-        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
-    FileHeader.Timestamp = NanoTime();
-    FileHeader.PID = internal_getpid();
-    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
-  }
-
-  if (UNLIKELY(B.Data == &FileHeaderStorage))
-    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};
-
-  BlockHeader Header;
-  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
-  auto NextBlock = Header.BlockNum + 1;
-  if (NextBlock < ProfileBuffers->size())
-    return {(*ProfileBuffers)[NextBlock].Data,
-            (*ProfileBuffers)[NextBlock].Size};
-  return {nullptr, 0};
-}
-
-} // namespace profileCollectorService
-} // namespace __xray
diff --git a/lib/xray/xray_profile_collector.cpp b/lib/xray/xray_profile_collector.cpp
new file mode 100644
index 000000000000..bef2504f2a16
--- /dev/null
+++ b/lib/xray/xray_profile_collector.cpp
@@ -0,0 +1,414 @@
+//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This implements the interface for the profileCollectorService.
+//
+//===----------------------------------------------------------------------===//
+#include "xray_profile_collector.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray_allocator.h"
+#include "xray_defs.h"
+#include "xray_profiling_flags.h"
+#include "xray_segmented_array.h"
+#include <memory>
+#include <pthread.h>
+#include <utility>
+
+namespace __xray {
+namespace profileCollectorService {
+
+namespace {
+
+SpinMutex GlobalMutex; // taken by post(), serialize() and reset()
+struct ThreadTrie { // NOTE(review): no user of this type is visible nearby
+  tid_t TId;
+  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
+};
+
+struct ProfileBuffer {
+  void *Data;  // one serialized block; allocated in serialize()
+  size_t Size; // byte size of Data (block header + records)
+};
+
+// Current version of the profile format (digits presumably encode a date).
+constexpr u64 XRayProfilingVersion = 0x20180424;
+
+// Identifier for XRay profiling files: the ASCII bytes of 'xrayprof' as hex.
+constexpr u64 XRayMagicBytes = 0x7872617970726f66;
+
+struct XRayProfilingFileHeader {
+  const u64 MagicBytes = XRayMagicBytes;    // fixed file identifier
+  const u64 Version = XRayProfilingVersion; // fixed format version
+  u64 Timestamp = 0; // System time in nanoseconds.
+  u64 PID = 0;       // Process ID.
+};
+
+struct BlockHeader {
+  u32 BlockSize; // serialize() stores 16 + the total bytes of the records
+  u32 BlockNum;  // running index assigned by serialize()
+  u64 ThreadId;  // TId of the ThreadData the block was built from
+};
+
+struct ThreadData {
+  BufferQueue *BQ; // the queue post() received alongside the members below
+  FunctionCallTrie::Allocators::Buffers Buffers;
+  FunctionCallTrie::Allocators Allocators;
+  FunctionCallTrie FCT; // the trie serialize() walks for this thread
+  tid_t TId;
+};
+
+using ThreadDataArray = Array<ThreadData>;
+using ThreadDataAllocator = ThreadDataArray::AllocatorType;
+
+// We use a separate buffer queue for the backing store for the allocator used
+// by the ThreadData array. This lets us host the buffers, allocators, and tries
+// associated with a thread by moving the data into the array instead of
+// attempting to copy the data to a separately backed set of tries.
+static typename std::aligned_storage<
+    sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
+static BufferQueue *BQ = nullptr;
+static BufferQueue::Buffer Buffer;
+static typename std::aligned_storage<sizeof(ThreadDataAllocator),
+                                     alignof(ThreadDataAllocator)>::type
+    ThreadDataAllocatorStorage;
+static typename std::aligned_storage<sizeof(ThreadDataArray),
+                                     alignof(ThreadDataArray)>::type
+    ThreadDataArrayStorage;
+
+static ThreadDataAllocator *TDAllocator = nullptr; // DCHECKed non-null in post()
+static ThreadDataArray *TDArray = nullptr; // filled by post(), read by serialize()
+
+using ProfileBufferArray = Array<ProfileBuffer>;
+using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
+
+// These need to be global aligned storage to avoid dynamic initialization, so
+// that objects can be placement-new'ed into them. NOTE(review): no alignof is
+// given here, so aligned_storage's default alignment applies -- confirm OK.
+static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
+    ProfileBuffersStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
+    ProfileBufferArrayAllocatorStorage;
+
+static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
+static ProfileBufferArray *ProfileBuffers = nullptr; // blocks from serialize()
+
+// Global flag recording whether the collector has been initialized: post() and
+// serialize() return early while it is 0, and reset() clears it on entry.
+static atomic_uint8_t CollectorInitialized{0};
+
+} // namespace
+
+void post(BufferQueue *Q, FunctionCallTrie &&T,
+          FunctionCallTrie::Allocators &&A,
+          FunctionCallTrie::Allocators::Buffers &&B,
+          tid_t TId) XRAY_NEVER_INSTRUMENT {
+  DCHECK_NE(Q, nullptr);
+  // T, A and B are consumed: either moved into TDArray or torn down below.
+  // Bail out early if the collector has not been initialized.
+  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
+    T.~FunctionCallTrie();
+    A.~Allocators();
+    Q->releaseBuffer(B.NodeBuffer); // hand all four buffers back to Q
+    Q->releaseBuffer(B.RootsBuffer);
+    Q->releaseBuffer(B.ShadowStackBuffer);
+    Q->releaseBuffer(B.NodeIdPairBuffer);
+    B.~Buffers();
+    return;
+  }
+
+  {
+    SpinMutexLock Lock(&GlobalMutex);
+    DCHECK_NE(TDAllocator, nullptr);
+    DCHECK_NE(TDArray, nullptr);
+
+    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
+                               TId) == nullptr) {
+      // If we fail to add the data to the array, we should destroy the objects
+      // handed us. NOTE(review): assumes a failed AppendEmplace moved nothing.
+      T.~FunctionCallTrie();
+      A.~Allocators();
+      Q->releaseBuffer(B.NodeBuffer);
+      Q->releaseBuffer(B.RootsBuffer);
+      Q->releaseBuffer(B.ShadowStackBuffer);
+      Q->releaseBuffer(B.NodeIdPairBuffer);
+      B.~Buffers();
+    }
+  }
+}
+
+// A PathArray holds the function ids that make up one stack trace. In this
+// context a path is almost always represented from the leaf function in a call
+// stack to a root of the call trie.
+using PathArray = Array<int32_t>;
+
+struct ProfileRecord {
+  using PathAllocator = typename PathArray::AllocatorType;
+
+  // Path holds the function ids from Node up to the root of its call stack,
+  // gathered by populateRecords() while following the Parent links.
+  PathArray Path;
+  const FunctionCallTrie::Node *Node; // node whose counters get serialized
+};
+
+namespace {
+
+using ProfileRecordArray = Array<ProfileRecord>;
+
+// Depth-first walk from each root of Trie: appends one ProfileRecord per node
+// (the node plus its node-to-root FId path) to PRs; stops if PRs can't grow.
+static void
+populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
+                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
+  using StackArray = Array<const FunctionCallTrie::Node *>;
+  using StackAllocator = typename StackArray::AllocatorType;
+  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
+  StackArray DFSStack(StackAlloc);
+  for (const auto *R : Trie.getRoots()) {
+    DFSStack.Append(R); // NOTE(review): the result of Append is not checked
+    while (!DFSStack.empty()) {
+      auto *Node = DFSStack.back();
+      DFSStack.trim(1); // pop the entry that was just read
+      if (Node == nullptr)
+        continue;
+      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
+      if (Record == nullptr)
+        return; // no room for another record: keep what was gathered so far
+      DCHECK_NE(Record, nullptr);
+
+      // Traverse the Node's parents and as we're doing so, get the FIds in
+      // the order they appear.
+      for (auto N = Node; N != nullptr; N = N->Parent)
+        Record->Path.Append(N->FId);
+      DCHECK(!Record->Path.empty());
+      // Queue the callees so that they are visited on later iterations.
+      for (const auto C : Node->Callees)
+        DFSStack.Append(C.NodePtr);
+    }
+  }
+}
+
+static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
+                             const ProfileRecordArray &ProfileRecords)
+    XRAY_NEVER_INSTRUMENT { // writes Header, then every record, to Buffer->Data
+  auto NextPtr = static_cast<uint8_t *>(
+                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
+                 sizeof(Header);
+  for (const auto &Record : ProfileRecords) {
+    // List of IDs follow:
+    for (const auto FId : Record.Path)
+      NextPtr =
+          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
+          sizeof(FId);
+
+    // Add the sentinel here: sizeof(int32_t) zero bytes terminate the id list.
+    constexpr int32_t SentinelFId = 0;
+    NextPtr = static_cast<uint8_t *>(
+                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
+              sizeof(SentinelFId);
+
+    // Add the node data here: CallCount, then CumulativeLocalTime.
+    NextPtr =
+        static_cast<uint8_t *>(internal_memcpy(
+            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
+        sizeof(Record.Node->CallCount);
+    NextPtr = static_cast<uint8_t *>(
+                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
+                                  sizeof(Record.Node->CumulativeLocalTime))) +
+              sizeof(Record.Node->CumulativeLocalTime);
+  }
+  // No bounds checks above: the caller must have sized Buffer->Data exactly.
+  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
+}
+
+} // namespace
+
+void serialize() XRAY_NEVER_INSTRUMENT {
+  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
+    return; // collector not initialized: nothing to serialize
+
+  SpinMutexLock Lock(&GlobalMutex);
+
+  // Clear out the global ProfileBuffers, if it's not empty.
+  for (auto &B : *ProfileBuffers)
+    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
+  ProfileBuffers->trim(ProfileBuffers->size());
+
+  DCHECK_NE(TDArray, nullptr);
+  if (TDArray->empty())
+    return;
+
+  // Then repopulate the global ProfileBuffers.
+  u32 I = 0; // block number handed to the next BlockHeader
+  auto MaxSize = profilingFlags()->global_allocator_max;
+  auto ProfileArena = allocateBuffer(MaxSize);
+  if (ProfileArena == nullptr)
+    return;
+
+  auto ProfileArenaCleanup = at_scope_exit(
+      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });
+
+  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
+  if (PathArena == nullptr)
+    return;
+
+  auto PathArenaCleanup = at_scope_exit(
+      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });
+
+  for (const auto &ThreadTrie : *TDArray) { // each element is a ThreadData
+    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
+    ProfileRecordAllocator PRAlloc(ProfileArena,
+                                   profilingFlags()->global_allocator_max);
+    ProfileRecord::PathAllocator PathAlloc(
+        PathArena, profilingFlags()->global_allocator_max);
+    ProfileRecordArray ProfileRecords(PRAlloc);
+
+    // First, we want to compute the amount of space we're going to need. We'll
+    // use a local allocator and an __xray::Array<...> to store the intermediary
+    // data, then compute the size as we're going along. Then we'll allocate the
+    // contiguous space to contain the thread buffer data.
+    if (ThreadTrie.FCT.getRoots().empty())
+      continue;
+
+    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
+    DCHECK(!ThreadTrie.FCT.getRoots().empty());
+    DCHECK(!ProfileRecords.empty());
+
+    // Go through each record, to compute the sizes.
+    //
+    // header size = block size (4 bytes)
+    //   + block number (4 bytes)
+    //   + thread id (8 bytes)                       = 16 bytes
+    // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
+    //   + call count (8 bytes)
+    //   + local time (8 bytes)                      = 20 + 4 * ids bytes
+    // (No separate end-of-record marker is written by serializeRecords.)
+    u32 CumulativeSizes = 0;
+    for (const auto &Record : ProfileRecords)
+      CumulativeSizes += 20 + (4 * Record.Path.size());
+
+    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
+    auto B = ProfileBuffers->Append({}); // NOTE(review): not null-checked
+    B->Size = sizeof(Header) + CumulativeSizes;
+    B->Data = allocateBuffer(B->Size);
+    DCHECK_NE(B->Data, nullptr); // NOTE(review): only a DCHECK guards this
+    serializeRecords(B, Header, ProfileRecords);
+  }
+}
+
+void reset() XRAY_NEVER_INSTRUMENT {
+  atomic_store(&CollectorInitialized, 0, memory_order_release);
+  SpinMutexLock Lock(&GlobalMutex);
+
+  if (ProfileBuffers != nullptr) {
+    // Clear out the profile buffers that have been serialized.
+    for (auto &B : *ProfileBuffers)
+      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
+    ProfileBuffers->trim(ProfileBuffers->size());
+    ProfileBuffers = nullptr;
+  }
+
+  if (TDArray != nullptr) {
+    // Hand each posted thread's four buffers back to that thread's queue.
+    for (auto &TD : *TDArray) {
+      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
+      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
+      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
+      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
+    }
+    // We don't bother destroying the array here because we've already
+    // potentially freed the backing store for the array. Instead we're going to
+    // reset the pointer to nullptr, and re-use the storage later instead
+    // (placement-new'ing into the storage as-is).
+    TDArray = nullptr;
+  }
+
+  if (TDAllocator != nullptr) {
+    TDAllocator->~Allocator();
+    TDAllocator = nullptr;
+  }
+
+  if (Buffer.Data != nullptr) {
+    BQ->releaseBuffer(Buffer);
+  }
+  // (Re)create the queue; every early return leaves CollectorInitialized at 0.
+  if (BQ == nullptr) {
+    bool Success = false;
+    new (&BufferQueueStorage)
+        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
+    if (!Success)
+      return;
+    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
+  } else {
+    BQ->finalize();
+
+    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
+        BufferQueue::ErrorCode::Ok)
+      return;
+  }
+
+  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
+    return;
+  // Rebuild the global containers in place, inside their *Storage objects.
+  new (&ProfileBufferArrayAllocatorStorage)
+      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
+  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
+      &ProfileBufferArrayAllocatorStorage);
+
+  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
+  ProfileBuffers =
+      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
+  // The thread data array allocates out of the buffer obtained above.
+  new (&ThreadDataAllocatorStorage)
+      ThreadDataAllocator(Buffer.Data, Buffer.Size);
+  TDAllocator =
+      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
+  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
+  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);
+
+  atomic_store(&CollectorInitialized, 1, memory_order_release);
+}
+
+XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Lock(&GlobalMutex);
+
+  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
+    return {nullptr, 0};
+  // Iteration order: file header, block 0, block 1, ..., then {nullptr, 0}.
+  static pthread_once_t Once = PTHREAD_ONCE_INIT;
+  static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
+      FileHeaderStorage;
+  pthread_once(
+      &Once, +[]() XRAY_NEVER_INSTRUMENT {
+        new (&FileHeaderStorage) XRayProfilingFileHeader{};
+      });
+
+  if (UNLIKELY(B.Data == nullptr)) {
+    // The first buffer should always contain the file header information.
+    auto &FileHeader =
+        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
+    FileHeader.Timestamp = NanoTime();
+    FileHeader.PID = internal_getpid();
+    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
+  }
+  // B was the file header, so hand out the first serialized block.
+  if (UNLIKELY(B.Data == &FileHeaderStorage))
+    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};
+  // B was a block: read its BlockNum and return the following one, if any.
+  BlockHeader Header;
+  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
+  auto NextBlock = Header.BlockNum + 1;
+  if (NextBlock < ProfileBuffers->size())
+    return {(*ProfileBuffers)[NextBlock].Data,
+            (*ProfileBuffers)[NextBlock].Size};
+  return {nullptr, 0};
+}
+
+} // namespace profileCollectorService
+} // namespace __xray
diff --git a/lib/xray/xray_profiling.cc b/lib/xray/xray_profiling.cc
deleted file mode 100644
index 66def6cf2485..000000000000
--- a/lib/xray/xray_profiling.cc
+++ /dev/null
@@ -1,519 +0,0 @@
-//===-- xray_profiling.cc ---------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// This is the implementation of a profiling handler.
-//
-//===----------------------------------------------------------------------===//
-#include <memory>
-#include <time.h>
-
-#include "sanitizer_common/sanitizer_atomic.h"
-#include "sanitizer_common/sanitizer_flags.h"
-#include "xray/xray_interface.h"
-#include "xray/xray_log_interface.h"
-#include "xray_buffer_queue.h"
-#include "xray_flags.h"
-#include "xray_profile_collector.h"
-#include "xray_profiling_flags.h"
-#include "xray_recursion_guard.h"
-#include "xray_tsc.h"
-#include "xray_utils.h"
-#include <pthread.h>
-
-namespace __xray {
-
-namespace {
-
-static atomic_sint32_t ProfilerLogFlushStatus = {
-    XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
-
-static atomic_sint32_t ProfilerLogStatus = {
-    XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
-
-static SpinMutex ProfilerOptionsMutex;
-
-struct ProfilingData {
-  atomic_uintptr_t Allocators;
-  atomic_uintptr_t FCT;
-};
-
-static pthread_key_t ProfilingKey;
-
-// We use a global buffer queue, which gets initialized once at initialisation
-// time, and gets reset when profiling is "done".
-static std::aligned_storage<sizeof(BufferQueue), alignof(BufferQueue)>::type
-    BufferQueueStorage;
-static BufferQueue *BQ = nullptr;
-
-thread_local FunctionCallTrie::Allocators::Buffers ThreadBuffers;
-thread_local std::aligned_storage<sizeof(FunctionCallTrie::Allocators),
-                                  alignof(FunctionCallTrie::Allocators)>::type
-    AllocatorsStorage;
-thread_local std::aligned_storage<sizeof(FunctionCallTrie),
-                                  alignof(FunctionCallTrie)>::type
-    FunctionCallTrieStorage;
-thread_local ProfilingData TLD{{0}, {0}};
-thread_local atomic_uint8_t ReentranceGuard{0};
-
-// We use a separate guard for ensuring that for this thread, if we're already
-// cleaning up, that any signal handlers don't attempt to cleanup nor
-// initialise.
-thread_local atomic_uint8_t TLDInitGuard{0};
-
-// We also use a separate latch to signal that the thread is exiting, and
-// non-essential work should be ignored (things like recording events, etc.).
-thread_local atomic_uint8_t ThreadExitingLatch{0};
-
-static ProfilingData *getThreadLocalData() XRAY_NEVER_INSTRUMENT {
-  thread_local auto ThreadOnce = []() XRAY_NEVER_INSTRUMENT {
-    pthread_setspecific(ProfilingKey, &TLD);
-    return false;
-  }();
-  (void)ThreadOnce;
-
-  RecursionGuard TLDInit(TLDInitGuard);
-  if (!TLDInit)
-    return nullptr;
-
-  if (atomic_load_relaxed(&ThreadExitingLatch))
-    return nullptr;
-
-  uptr Allocators = 0;
-  if (atomic_compare_exchange_strong(&TLD.Allocators, &Allocators, 1,
-                                     memory_order_acq_rel)) {
-    bool Success = false;
-    auto AllocatorsUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
-      if (!Success)
-        atomic_store(&TLD.Allocators, 0, memory_order_release);
-    });
-
-    // Acquire a set of buffers for this thread.
-    if (BQ == nullptr)
-      return nullptr;
-
-    if (BQ->getBuffer(ThreadBuffers.NodeBuffer) != BufferQueue::ErrorCode::Ok)
-      return nullptr;
-    auto NodeBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
-      if (!Success)
-        BQ->releaseBuffer(ThreadBuffers.NodeBuffer);
-    });
-
-    if (BQ->getBuffer(ThreadBuffers.RootsBuffer) != BufferQueue::ErrorCode::Ok)
-      return nullptr;
-    auto RootsBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
-      if (!Success)
-        BQ->releaseBuffer(ThreadBuffers.RootsBuffer);
-    });
-
-    if (BQ->getBuffer(ThreadBuffers.ShadowStackBuffer) !=
-        BufferQueue::ErrorCode::Ok)
-      return nullptr;
-    auto ShadowStackBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
-      if (!Success)
-        BQ->releaseBuffer(ThreadBuffers.ShadowStackBuffer);
-    });
-
-    if (BQ->getBuffer(ThreadBuffers.NodeIdPairBuffer) !=
-        BufferQueue::ErrorCode::Ok)
-      return nullptr;
-
-    Success = true;
-    new (&AllocatorsStorage) FunctionCallTrie::Allocators(
-        FunctionCallTrie::InitAllocatorsFromBuffers(ThreadBuffers));
-    Allocators = reinterpret_cast<uptr>(
-        reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage));
-    atomic_store(&TLD.Allocators, Allocators, memory_order_release);
-  }
-
-  if (Allocators == 1)
-    return nullptr;
-
-  uptr FCT = 0;
-  if (atomic_compare_exchange_strong(&TLD.FCT, &FCT, 1, memory_order_acq_rel)) {
-    new (&FunctionCallTrieStorage)
-        FunctionCallTrie(*reinterpret_cast<FunctionCallTrie::Allocators *>(
-            atomic_load_relaxed(&TLD.Allocators)));
-    FCT = reinterpret_cast<uptr>(
-        reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage));
-    atomic_store(&TLD.FCT, FCT, memory_order_release);
-  }
-
-  if (FCT == 1)
-    return nullptr;
-
-  return &TLD;
-}
-
-static void cleanupTLD() XRAY_NEVER_INSTRUMENT {
-  auto FCT = atomic_exchange(&TLD.FCT, 0, memory_order_acq_rel);
-  if (FCT == reinterpret_cast<uptr>(reinterpret_cast<FunctionCallTrie *>(
-                 &FunctionCallTrieStorage)))
-    reinterpret_cast<FunctionCallTrie *>(FCT)->~FunctionCallTrie();
-
-  auto Allocators = atomic_exchange(&TLD.Allocators, 0, memory_order_acq_rel);
-  if (Allocators ==
-      reinterpret_cast<uptr>(
-          reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage)))
-    reinterpret_cast<FunctionCallTrie::Allocators *>(Allocators)->~Allocators();
-}
-
-static void postCurrentThreadFCT(ProfilingData &T) XRAY_NEVER_INSTRUMENT {
-  RecursionGuard TLDInit(TLDInitGuard);
-  if (!TLDInit)
-    return;
-
-  uptr P = atomic_exchange(&T.FCT, 0, memory_order_acq_rel);
-  if (P != reinterpret_cast<uptr>(
-               reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage)))
-    return;
-
-  auto FCT = reinterpret_cast<FunctionCallTrie *>(P);
-  DCHECK_NE(FCT, nullptr);
-
-  uptr A = atomic_exchange(&T.Allocators, 0, memory_order_acq_rel);
-  if (A !=
-      reinterpret_cast<uptr>(
-          reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage)))
-    return;
-
-  auto Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>(A);
-  DCHECK_NE(Allocators, nullptr);
-
-  // Always move the data into the profile collector.
-  profileCollectorService::post(BQ, std::move(*FCT), std::move(*Allocators),
-                                std::move(ThreadBuffers), GetTid());
-
-  // Re-initialize the ThreadBuffers object to a known "default" state.
-  ThreadBuffers = FunctionCallTrie::Allocators::Buffers{};
-}
-
-} // namespace
-
-const char *profilingCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
-#ifdef XRAY_PROFILER_DEFAULT_OPTIONS
-  return SANITIZER_STRINGIFY(XRAY_PROFILER_DEFAULT_OPTIONS);
-#else
-  return "";
-#endif
-}
-
-XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT {
-  if (atomic_load(&ProfilerLogStatus, memory_order_acquire) !=
-      XRayLogInitStatus::XRAY_LOG_FINALIZED) {
-    if (Verbosity())
-      Report("Not flushing profiles, profiling not been finalized.\n");
-    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
-  }
-
-  RecursionGuard SignalGuard(ReentranceGuard);
-  if (!SignalGuard) {
-    if (Verbosity())
-      Report("Cannot finalize properly inside a signal handler!\n");
-    atomic_store(&ProfilerLogFlushStatus,
-                 XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING,
-                 memory_order_release);
-    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
-  }
-
-  s32 Previous = atomic_exchange(&ProfilerLogFlushStatus,
-                                 XRayLogFlushStatus::XRAY_LOG_FLUSHING,
-                                 memory_order_acq_rel);
-  if (Previous == XRayLogFlushStatus::XRAY_LOG_FLUSHING) {
-    if (Verbosity())
-      Report("Not flushing profiles, implementation still flushing.\n");
-    return XRayLogFlushStatus::XRAY_LOG_FLUSHING;
-  }
-
-  // At this point, we'll create the file that will contain the profile, but
-  // only if the options say so.
-  if (!profilingFlags()->no_flush) {
-    // First check whether we have data in the profile collector service
-    // before we try and write anything down.
-    XRayBuffer B = profileCollectorService::nextBuffer({nullptr, 0});
-    if (B.Data == nullptr) {
-      if (Verbosity())
-        Report("profiling: No data to flush.\n");
-    } else {
-      LogWriter *LW = LogWriter::Open();
-      if (LW == nullptr) {
-        if (Verbosity())
-          Report("profiling: Failed to flush to file, dropping data.\n");
-      } else {
-        // Now for each of the buffers, write out the profile data as we would
-        // see it in memory, verbatim.
-        while (B.Data != nullptr && B.Size != 0) {
-          LW->WriteAll(reinterpret_cast<const char *>(B.Data),
-                       reinterpret_cast<const char *>(B.Data) + B.Size);
-          B = profileCollectorService::nextBuffer(B);
-        }
-      }
-      LogWriter::Close(LW);
-    }
-  }
-
-  profileCollectorService::reset();
-
-  atomic_store(&ProfilerLogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
-               memory_order_release);
-  atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
-               memory_order_release);
-
-  return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
-}
-
-void profilingHandleArg0(int32_t FuncId,
-                         XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
-  unsigned char CPU;
-  auto TSC = readTSC(CPU);
-  RecursionGuard G(ReentranceGuard);
-  if (!G)
-    return;
-
-  auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire);
-  if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_UNINITIALIZED ||
-               Status == XRayLogInitStatus::XRAY_LOG_INITIALIZING))
-    return;
-
-  if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED ||
-               Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) {
-    postCurrentThreadFCT(TLD);
-    return;
-  }
-
-  auto T = getThreadLocalData();
-  if (T == nullptr)
-    return;
-
-  auto FCT = reinterpret_cast<FunctionCallTrie *>(atomic_load_relaxed(&T->FCT));
-  switch (Entry) {
-  case XRayEntryType::ENTRY:
-  case XRayEntryType::LOG_ARGS_ENTRY:
-    FCT->enterFunction(FuncId, TSC, CPU);
-    break;
-  case XRayEntryType::EXIT:
-  case XRayEntryType::TAIL:
-    FCT->exitFunction(FuncId, TSC, CPU);
-    break;
-  default:
-    // FIXME: Handle bugs.
-    break;
-  }
-}
-
-void profilingHandleArg1(int32_t FuncId, XRayEntryType Entry,
-                         uint64_t) XRAY_NEVER_INSTRUMENT {
-  return profilingHandleArg0(FuncId, Entry);
-}
-
-XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT {
-  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_INITIALIZED;
-  if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus,
-                                      XRayLogInitStatus::XRAY_LOG_FINALIZING,
-                                      memory_order_release)) {
-    if (Verbosity())
-      Report("Cannot finalize profile, the profiling is not initialized.\n");
-    return static_cast<XRayLogInitStatus>(CurrentStatus);
-  }
-
-  // Mark then finalize the current generation of buffers. This allows us to let
-  // the threads currently holding onto new buffers still use them, but let the
-  // last reference do the memory cleanup.
-  DCHECK_NE(BQ, nullptr);
-  BQ->finalize();
-
-  // Wait a grace period to allow threads to see that we're finalizing.
-  SleepForMillis(profilingFlags()->grace_period_ms);
-
-  // If we for some reason are entering this function from an instrumented
-  // handler, we bail out.
-  RecursionGuard G(ReentranceGuard);
-  if (!G)
-    return static_cast<XRayLogInitStatus>(CurrentStatus);
-
-  // Post the current thread's data if we have any.
-  postCurrentThreadFCT(TLD);
-
-  // Then we force serialize the log data.
-  profileCollectorService::serialize();
-
-  atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED,
-               memory_order_release);
-  return XRayLogInitStatus::XRAY_LOG_FINALIZED;
-}
-
-XRayLogInitStatus
-profilingLoggingInit(size_t, size_t, void *Options,
-                     size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
-  RecursionGuard G(ReentranceGuard);
-  if (!G)
-    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-
-  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-  if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus,
-                                      XRayLogInitStatus::XRAY_LOG_INITIALIZING,
-                                      memory_order_acq_rel)) {
-    if (Verbosity())
-      Report("Cannot initialize already initialised profiling "
-             "implementation.\n");
-    return static_cast<XRayLogInitStatus>(CurrentStatus);
-  }
-
-  {
-    SpinMutexLock Lock(&ProfilerOptionsMutex);
-    FlagParser ConfigParser;
-    ProfilerFlags Flags;
-    Flags.setDefaults();
-    registerProfilerFlags(&ConfigParser, &Flags);
-    ConfigParser.ParseString(profilingCompilerDefinedFlags());
-    const char *Env = GetEnv("XRAY_PROFILING_OPTIONS");
-    if (Env == nullptr)
-      Env = "";
-    ConfigParser.ParseString(Env);
-
-    // Then parse the configuration string provided.
-    ConfigParser.ParseString(static_cast<const char *>(Options));
-    if (Verbosity())
-      ReportUnrecognizedFlags();
-    *profilingFlags() = Flags;
-  }
-
-  // We need to reset the profile data collection implementation now.
-  profileCollectorService::reset();
-
-  // Then also reset the buffer queue implementation.
-  if (BQ == nullptr) {
-    bool Success = false;
-    new (&BufferQueueStorage)
-        BufferQueue(profilingFlags()->per_thread_allocator_max,
-                    profilingFlags()->buffers_max, Success);
-    if (!Success) {
-      if (Verbosity())
-        Report("Failed to initialize preallocated memory buffers!");
-      atomic_store(&ProfilerLogStatus,
-                   XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
-                   memory_order_release);
-      return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-    }
-
-    // If we've succeded, set the global pointer to the initialised storage.
-    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
-  } else {
-    BQ->finalize();
-    auto InitStatus = BQ->init(profilingFlags()->per_thread_allocator_max,
-                               profilingFlags()->buffers_max);
-
-    if (InitStatus != BufferQueue::ErrorCode::Ok) {
-      if (Verbosity())
-        Report("Failed to initialize preallocated memory buffers; error: %s",
-               BufferQueue::getErrorString(InitStatus));
-      atomic_store(&ProfilerLogStatus,
-                   XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
-                   memory_order_release);
-      return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
-    }
-
-    DCHECK(!BQ->finalizing());
-  }
-
-  // We need to set up the exit handlers.
-  static pthread_once_t Once = PTHREAD_ONCE_INIT;
-  pthread_once(
-      &Once, +[] {
-        pthread_key_create(
-            &ProfilingKey, +[](void *P) XRAY_NEVER_INSTRUMENT {
-              if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel))
-                return;
-
-              if (P == nullptr)
-                return;
-
-              auto T = reinterpret_cast<ProfilingData *>(P);
-              if (atomic_load_relaxed(&T->Allocators) == 0)
-                return;
-
-              {
-                // If we're somehow executing this while inside a
-                // non-reentrant-friendly context, we skip attempting to post
-                // the current thread's data.
-                RecursionGuard G(ReentranceGuard);
-                if (!G)
-                  return;
-
-                postCurrentThreadFCT(*T);
-              }
-            });
-
-        // We also need to set up an exit handler, so that we can get the
-        // profile information at exit time. We use the C API to do this, to not
-        // rely on C++ ABI functions for registering exit handlers.
-        Atexit(+[]() XRAY_NEVER_INSTRUMENT {
-          if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel))
-            return;
-
-          auto Cleanup =
-              at_scope_exit([]() XRAY_NEVER_INSTRUMENT { cleanupTLD(); });
-
-          // Finalize and flush.
-          if (profilingFinalize() != XRAY_LOG_FINALIZED ||
-              profilingFlush() != XRAY_LOG_FLUSHED)
-            return;
-
-          if (Verbosity())
-            Report("XRay Profile flushed at exit.");
-        });
-      });
-
-  __xray_log_set_buffer_iterator(profileCollectorService::nextBuffer);
-  __xray_set_handler(profilingHandleArg0);
-  __xray_set_handler_arg1(profilingHandleArg1);
-
-  atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZED,
-               memory_order_release);
-  if (Verbosity())
-    Report("XRay Profiling init successful.\n");
-
-  return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
-}
-
-bool profilingDynamicInitializer() XRAY_NEVER_INSTRUMENT {
-  // Set up the flag defaults from the static defaults and the
-  // compiler-provided defaults.
-  {
-    SpinMutexLock Lock(&ProfilerOptionsMutex);
-    auto *F = profilingFlags();
-    F->setDefaults();
-    FlagParser ProfilingParser;
-    registerProfilerFlags(&ProfilingParser, F);
-    ProfilingParser.ParseString(profilingCompilerDefinedFlags());
-  }
-
-  XRayLogImpl Impl{
-      profilingLoggingInit,
-      profilingFinalize,
-      profilingHandleArg0,
-      profilingFlush,
-  };
-  auto RegistrationResult = __xray_log_register_mode("xray-profiling", Impl);
-  if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) {
-    if (Verbosity())
-      Report("Cannot register XRay Profiling mode to 'xray-profiling'; error = "
-             "%d\n",
-             RegistrationResult);
-    return false;
-  }
-
-  if (!internal_strcmp(flags()->xray_mode, "xray-profiling"))
-    __xray_log_select_mode("xray_profiling");
-  return true;
-}
-
-} // namespace __xray
-
-static auto UNUSED Unused = __xray::profilingDynamicInitializer();
diff --git a/lib/xray/xray_profiling.cpp b/lib/xray/xray_profiling.cpp
new file mode 100644
index 000000000000..ef16691562cc
--- /dev/null
+++ b/lib/xray/xray_profiling.cpp
@@ -0,0 +1,519 @@
+//===-- xray_profiling.cpp --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This is the implementation of a profiling handler.
+//
+//===----------------------------------------------------------------------===//
+#include <memory>
+#include <time.h>
+
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_flags.h"
+#include "xray/xray_interface.h"
+#include "xray/xray_log_interface.h"
+#include "xray_buffer_queue.h"
+#include "xray_flags.h"
+#include "xray_profile_collector.h"
+#include "xray_profiling_flags.h"
+#include "xray_recursion_guard.h"
+#include "xray_tsc.h"
+#include "xray_utils.h"
+#include <pthread.h>
+
+namespace __xray {
+
+namespace {
+
+static atomic_sint32_t ProfilerLogFlushStatus = {
+    XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING};
+
+static atomic_sint32_t ProfilerLogStatus = {
+    XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
+
+static SpinMutex ProfilerOptionsMutex;
+
+struct ProfilingData {
+  atomic_uintptr_t Allocators; // 0, 1 (being set up), or &AllocatorsStorage
+  atomic_uintptr_t FCT; // 0, 1 (being set up), or &FunctionCallTrieStorage
+};
+
+static pthread_key_t ProfilingKey; // per-thread value is &TLD
+
+// We use a global buffer queue, which gets initialized once at initialisation
+// time, and gets reset when profiling is "done".
+static std::aligned_storage<sizeof(BufferQueue), alignof(BufferQueue)>::type
+    BufferQueueStorage;
+static BufferQueue *BQ = nullptr;
+
+thread_local FunctionCallTrie::Allocators::Buffers ThreadBuffers;
+thread_local std::aligned_storage<sizeof(FunctionCallTrie::Allocators),
+                                  alignof(FunctionCallTrie::Allocators)>::type
+    AllocatorsStorage;
+thread_local std::aligned_storage<sizeof(FunctionCallTrie),
+                                  alignof(FunctionCallTrie)>::type
+    FunctionCallTrieStorage;
+thread_local ProfilingData TLD{{0}, {0}};
+thread_local atomic_uint8_t ReentranceGuard{0};
+
+// We use a separate guard for ensuring that for this thread, if we're already
+// cleaning up, that any signal handlers don't attempt to cleanup nor
+// initialise.
+thread_local atomic_uint8_t TLDInitGuard{0};
+
+// We also use a separate latch to signal that the thread is exiting, and
+// non-essential work should be ignored (things like recording events, etc.).
+thread_local atomic_uint8_t ThreadExitingLatch{0};
+
+static ProfilingData *getThreadLocalData() XRAY_NEVER_INSTRUMENT {
+  thread_local auto ThreadOnce = []() XRAY_NEVER_INSTRUMENT {
+    pthread_setspecific(ProfilingKey, &TLD);
+    return false;
+  }();
+  (void)ThreadOnce; // exists only to run the initializer once per thread
+
+  RecursionGuard TLDInit(TLDInitGuard);
+  if (!TLDInit)
+    return nullptr;
+
+  if (atomic_load_relaxed(&ThreadExitingLatch))
+    return nullptr;
+  // Claim allocator setup by swapping 0 -> 1; 1 marks "setup in progress".
+  uptr Allocators = 0;
+  if (atomic_compare_exchange_strong(&TLD.Allocators, &Allocators, 1,
+                                     memory_order_acq_rel)) {
+    bool Success = false;
+    auto AllocatorsUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+      if (!Success)
+        atomic_store(&TLD.Allocators, 0, memory_order_release);
+    });
+
+    // Acquire a set of buffers for this thread.
+    if (BQ == nullptr)
+      return nullptr;
+
+    if (BQ->getBuffer(ThreadBuffers.NodeBuffer) != BufferQueue::ErrorCode::Ok)
+      return nullptr;
+    auto NodeBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+      if (!Success)
+        BQ->releaseBuffer(ThreadBuffers.NodeBuffer);
+    });
+
+    if (BQ->getBuffer(ThreadBuffers.RootsBuffer) != BufferQueue::ErrorCode::Ok)
+      return nullptr;
+    auto RootsBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+      if (!Success)
+        BQ->releaseBuffer(ThreadBuffers.RootsBuffer);
+    });
+
+    if (BQ->getBuffer(ThreadBuffers.ShadowStackBuffer) !=
+        BufferQueue::ErrorCode::Ok)
+      return nullptr;
+    auto ShadowStackBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT {
+      if (!Success)
+        BQ->releaseBuffer(ThreadBuffers.ShadowStackBuffer);
+    });
+
+    if (BQ->getBuffer(ThreadBuffers.NodeIdPairBuffer) !=
+        BufferQueue::ErrorCode::Ok)
+      return nullptr;
+    // Got all four buffers: disarm the undo handlers, then build in place.
+    Success = true;
+    new (&AllocatorsStorage) FunctionCallTrie::Allocators(
+        FunctionCallTrie::InitAllocatorsFromBuffers(ThreadBuffers));
+    Allocators = reinterpret_cast<uptr>(
+        reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage));
+    atomic_store(&TLD.Allocators, Allocators, memory_order_release);
+  }
+  // Allocators == 1 here means the CAS failed on an in-progress setup.
+  if (Allocators == 1)
+    return nullptr;
+  // Same 0 -> 1 -> pointer protocol for this thread's FunctionCallTrie.
+  uptr FCT = 0;
+  if (atomic_compare_exchange_strong(&TLD.FCT, &FCT, 1, memory_order_acq_rel)) {
+    new (&FunctionCallTrieStorage)
+        FunctionCallTrie(*reinterpret_cast<FunctionCallTrie::Allocators *>(
+            atomic_load_relaxed(&TLD.Allocators)));
+    FCT = reinterpret_cast<uptr>(
+        reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage));
+    atomic_store(&TLD.FCT, FCT, memory_order_release);
+  }
+
+  if (FCT == 1)
+    return nullptr;
+
+  return &TLD;
+}
+
+static void cleanupTLD() XRAY_NEVER_INSTRUMENT {
+  auto FCT = atomic_exchange(&TLD.FCT, 0, memory_order_acq_rel);
+  if (FCT == reinterpret_cast<uptr>(reinterpret_cast<FunctionCallTrie *>(
+                 &FunctionCallTrieStorage)))
+    reinterpret_cast<FunctionCallTrie *>(FCT)->~FunctionCallTrie();
+  // Same for the allocators, torn down after the trie that was built on them.
+  auto Allocators = atomic_exchange(&TLD.Allocators, 0, memory_order_acq_rel);
+  if (Allocators ==
+      reinterpret_cast<uptr>(
+          reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage)))
+    reinterpret_cast<FunctionCallTrie::Allocators *>(Allocators)->~Allocators();
+}
+
+static void postCurrentThreadFCT(ProfilingData &T) XRAY_NEVER_INSTRUMENT {
+  RecursionGuard TLDInit(TLDInitGuard);
+  if (!TLDInit)
+    return;
+  // Only acts when T is this thread's data: P must equal our own storage.
+  uptr P = atomic_exchange(&T.FCT, 0, memory_order_acq_rel);
+  if (P != reinterpret_cast<uptr>(
+               reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage)))
+    return;
+
+  auto FCT = reinterpret_cast<FunctionCallTrie *>(P);
+  DCHECK_NE(FCT, nullptr);
+  // Likewise take the allocators, again only if they are fully constructed.
+  uptr A = atomic_exchange(&T.Allocators, 0, memory_order_acq_rel);
+  if (A !=
+      reinterpret_cast<uptr>(
+          reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage)))
+    return;
+
+  auto Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>(A);
+  DCHECK_NE(Allocators, nullptr);
+
+  // Always move the data into the profile collector.
+  profileCollectorService::post(BQ, std::move(*FCT), std::move(*Allocators),
+                                std::move(ThreadBuffers), GetTid());
+
+  // Re-initialize the ThreadBuffers object to a known "default" state.
+  ThreadBuffers = FunctionCallTrie::Allocators::Buffers{};
+}
+
+} // namespace
+
+const char *profilingCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
+#ifdef XRAY_PROFILER_DEFAULT_OPTIONS
+  return SANITIZER_STRINGIFY(XRAY_PROFILER_DEFAULT_OPTIONS);
+#else
+  return ""; // XRAY_PROFILER_DEFAULT_OPTIONS was not defined for this build.
+#endif // XRAY_PROFILER_DEFAULT_OPTIONS
+}
+
+XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT {
+  if (atomic_load(&ProfilerLogStatus, memory_order_acquire) !=
+      XRayLogInitStatus::XRAY_LOG_FINALIZED) {
+    if (Verbosity())
+      Report("Not flushing profiles, profiling not been finalized.\n");
+    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+  }
+
+  RecursionGuard SignalGuard(ReentranceGuard);
+  if (!SignalGuard) {
+    if (Verbosity())
+      Report("Cannot finalize properly inside a signal handler!\n");
+    atomic_store(&ProfilerLogFlushStatus,
+                 XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING,
+                 memory_order_release);
+    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+  }
+
+  s32 Previous = atomic_exchange(&ProfilerLogFlushStatus,
+                                 XRayLogFlushStatus::XRAY_LOG_FLUSHING,
+                                 memory_order_acq_rel);
+  if (Previous == XRayLogFlushStatus::XRAY_LOG_FLUSHING) {
+    if (Verbosity())
+      Report("Not flushing profiles, implementation still flushing.\n");
+    return XRayLogFlushStatus::XRAY_LOG_FLUSHING;
+  }
+
+  // At this point, we'll create the file that will contain the profile, but
+  // only if the options say so.
+  if (!profilingFlags()->no_flush) {
+    // First check whether we have data in the profile collector service
+    // before we try and write anything down.
+    XRayBuffer B = profileCollectorService::nextBuffer({nullptr, 0});
+    if (B.Data == nullptr) {
+      if (Verbosity())
+        Report("profiling: No data to flush.\n");
+    } else {
+      LogWriter *LW = LogWriter::Open();
+      if (LW == nullptr) {
+        if (Verbosity())
+          Report("profiling: Failed to flush to file, dropping data.\n");
+      } else {
+        // Write each buffer verbatim, as laid out in memory; LW is closed only
+        // on this path because Close() must never see a failed (null) Open().
+        while (B.Data != nullptr && B.Size != 0) {
+          LW->WriteAll(reinterpret_cast<const char *>(B.Data),
+                       reinterpret_cast<const char *>(B.Data) + B.Size);
+          B = profileCollectorService::nextBuffer(B);
+        }
+        LogWriter::Close(LW);
+      }
+    }
+  }
+
+  profileCollectorService::reset();
+
+  atomic_store(&ProfilerLogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED,
+               memory_order_release);
+  atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+               memory_order_release);
+
+  return XRayLogFlushStatus::XRAY_LOG_FLUSHED;
+}
+
+void profilingHandleArg0(int32_t FuncId,
+                         XRayEntryType Entry) XRAY_NEVER_INSTRUMENT {
+  unsigned char CPU;
+  auto TSC = readTSC(CPU);
+  RecursionGuard G(ReentranceGuard);
+  if (!G)
+    return;
+
+  auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire);
+  if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_UNINITIALIZED ||
+               Status == XRayLogInitStatus::XRAY_LOG_INITIALIZING))
+    return;
+
+  if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED ||
+               Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) {
+    postCurrentThreadFCT(TLD);
+    return;
+  }
+
+  auto T = getThreadLocalData();
+  if (T == nullptr)
+    return;
+
+  auto FCT = reinterpret_cast<FunctionCallTrie *>(atomic_load_relaxed(&T->FCT));
+  switch (Entry) {
+  case XRayEntryType::ENTRY:
+  case XRayEntryType::LOG_ARGS_ENTRY:
+    FCT->enterFunction(FuncId, TSC, CPU);
+    break;
+  case XRayEntryType::EXIT:
+  case XRayEntryType::TAIL:
+    FCT->exitFunction(FuncId, TSC, CPU);
+    break;
+  default:
+    // FIXME: Handle bugs.
+    break;
+  }
+}
+
+void profilingHandleArg1(int32_t FuncId, XRayEntryType Entry,
+                         uint64_t) XRAY_NEVER_INSTRUMENT {
+  return profilingHandleArg0(FuncId, Entry);
+}
+
+XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT {
+  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+  if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus,
+                                      XRayLogInitStatus::XRAY_LOG_FINALIZING,
+                                      memory_order_release)) {
+    if (Verbosity())
+      Report("Cannot finalize profile, the profiling is not initialized.\n");
+    return static_cast<XRayLogInitStatus>(CurrentStatus);
+  }
+
+  // Mark then finalize the current generation of buffers. This allows us to let
+  // the threads currently holding onto new buffers still use them, but let the
+  // last reference do the memory cleanup.
+  DCHECK_NE(BQ, nullptr);
+  BQ->finalize();
+
+  // Wait a grace period to allow threads to see that we're finalizing.
+  SleepForMillis(profilingFlags()->grace_period_ms);
+
+  // If we for some reason are entering this function from an instrumented
+  // handler, we bail out.
+  RecursionGuard G(ReentranceGuard);
+  if (!G)
+    return static_cast<XRayLogInitStatus>(CurrentStatus);
+
+  // Post the current thread's data if we have any.
+  postCurrentThreadFCT(TLD);
+
+  // Then we force serialize the log data.
+  profileCollectorService::serialize();
+
+  atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED,
+               memory_order_release);
+  return XRayLogInitStatus::XRAY_LOG_FINALIZED;
+}
+
+XRayLogInitStatus
+profilingLoggingInit(size_t, size_t, void *Options,
+                     size_t OptionsSize) XRAY_NEVER_INSTRUMENT {
+  RecursionGuard G(ReentranceGuard);
+  if (!G)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus,
+                                      XRayLogInitStatus::XRAY_LOG_INITIALIZING,
+                                      memory_order_acq_rel)) {
+    if (Verbosity())
+      Report("Cannot initialize already initialised profiling "
+             "implementation.\n");
+    return static_cast<XRayLogInitStatus>(CurrentStatus);
+  }
+
+  {
+    SpinMutexLock Lock(&ProfilerOptionsMutex);
+    FlagParser ConfigParser;
+    ProfilerFlags Flags;
+    Flags.setDefaults();
+    registerProfilerFlags(&ConfigParser, &Flags);
+    ConfigParser.ParseString(profilingCompilerDefinedFlags());
+    const char *Env = GetEnv("XRAY_PROFILING_OPTIONS");
+    if (Env == nullptr)
+      Env = "";
+    ConfigParser.ParseString(Env);
+
+    // Then parse the configuration string provided.
+    ConfigParser.ParseString(static_cast<const char *>(Options));
+    if (Verbosity())
+      ReportUnrecognizedFlags();
+    *profilingFlags() = Flags;
+  }
+
+  // We need to reset the profile data collection implementation now.
+  profileCollectorService::reset();
+
+  // Then also reset the buffer queue implementation.
+  if (BQ == nullptr) {
+    bool Success = false;
+    new (&BufferQueueStorage)
+        BufferQueue(profilingFlags()->per_thread_allocator_max,
+                    profilingFlags()->buffers_max, Success);
+    if (!Success) {
+      if (Verbosity())
+        Report("Failed to initialize preallocated memory buffers!\n");
+      atomic_store(&ProfilerLogStatus,
+                   XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+                   memory_order_release);
+      return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+    }
+
+    // If we've succeeded, set the global pointer to the initialised storage.
+    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
+  } else {
+    BQ->finalize();
+    auto InitStatus = BQ->init(profilingFlags()->per_thread_allocator_max,
+                               profilingFlags()->buffers_max);
+
+    if (InitStatus != BufferQueue::ErrorCode::Ok) {
+      if (Verbosity())
+        Report("Failed to initialize preallocated memory buffers; error: %s\n",
+               BufferQueue::getErrorString(InitStatus));
+      atomic_store(&ProfilerLogStatus,
+                   XRayLogInitStatus::XRAY_LOG_UNINITIALIZED,
+                   memory_order_release);
+      return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+    }
+
+    DCHECK(!BQ->finalizing());
+  }
+
+  // We need to set up the exit handlers.
+  static pthread_once_t Once = PTHREAD_ONCE_INIT;
+  pthread_once(
+      &Once, +[] {
+        pthread_key_create(
+            &ProfilingKey, +[](void *P) XRAY_NEVER_INSTRUMENT {
+              if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel))
+                return;
+
+              if (P == nullptr)
+                return;
+
+              auto T = reinterpret_cast<ProfilingData *>(P);
+              if (atomic_load_relaxed(&T->Allocators) == 0)
+                return;
+
+              {
+                // If we're somehow executing this while inside a
+                // non-reentrant-friendly context, we skip attempting to post
+                // the current thread's data.
+                RecursionGuard G(ReentranceGuard);
+                if (!G)
+                  return;
+
+                postCurrentThreadFCT(*T);
+              }
+            });
+
+        // We also need to set up an exit handler, so that we can get the
+        // profile information at exit time. We use the C API to do this, to not
+        // rely on C++ ABI functions for registering exit handlers.
+        Atexit(+[]() XRAY_NEVER_INSTRUMENT {
+          if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel))
+            return;
+
+          auto Cleanup =
+              at_scope_exit([]() XRAY_NEVER_INSTRUMENT { cleanupTLD(); });
+
+          // Finalize and flush.
+          if (profilingFinalize() != XRAY_LOG_FINALIZED ||
+              profilingFlush() != XRAY_LOG_FLUSHED)
+            return;
+
+          if (Verbosity())
+            Report("XRay Profile flushed at exit.\n");
+        });
+      });
+
+  __xray_log_set_buffer_iterator(profileCollectorService::nextBuffer);
+  __xray_set_handler(profilingHandleArg0);
+  __xray_set_handler_arg1(profilingHandleArg1);
+
+  atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZED,
+               memory_order_release);
+  if (Verbosity())
+    Report("XRay Profiling init successful.\n");
+
+  return XRayLogInitStatus::XRAY_LOG_INITIALIZED;
+}
+
+bool profilingDynamicInitializer() XRAY_NEVER_INSTRUMENT {
+  // Set up the flag defaults from the static defaults and the
+  // compiler-provided defaults.
+  {
+    SpinMutexLock Lock(&ProfilerOptionsMutex);
+    auto *F = profilingFlags();
+    F->setDefaults();
+    FlagParser ProfilingParser;
+    registerProfilerFlags(&ProfilingParser, F);
+    ProfilingParser.ParseString(profilingCompilerDefinedFlags());
+  }
+
+  XRayLogImpl Impl{
+      profilingLoggingInit,
+      profilingFinalize,
+      profilingHandleArg0,
+      profilingFlush,
+  };
+  auto RegistrationResult = __xray_log_register_mode("xray-profiling", Impl);
+  if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) {
+    if (Verbosity())
+      Report("Cannot register XRay Profiling mode to 'xray-profiling'; error = "
+             "%d\n",
+             RegistrationResult);
+    return false;
+  }
+
+  if (!internal_strcmp(flags()->xray_mode, "xray-profiling"))
+    __xray_log_select_mode("xray-profiling"); // must match registered name
+  return true;
+}
+
+} // namespace __xray
+
+static auto UNUSED Unused = __xray::profilingDynamicInitializer();
diff --git a/lib/xray/xray_profiling_flags.cc b/lib/xray/xray_profiling_flags.cc
deleted file mode 100644
index 0e89b7420f8c..000000000000
--- a/lib/xray/xray_profiling_flags.cc
+++ /dev/null
@@ -1,39 +0,0 @@
-//===-- xray_flags.h -------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-// XRay runtime flags.
-//===----------------------------------------------------------------------===//
-
-#include "xray_profiling_flags.h"
-#include "sanitizer_common/sanitizer_common.h"
-#include "sanitizer_common/sanitizer_flag_parser.h"
-#include "sanitizer_common/sanitizer_libc.h"
-#include "xray_defs.h"
-
-namespace __xray {
-
-// Storage for the profiling flags.
-ProfilerFlags xray_profiling_flags_dont_use_directly;
-
-void ProfilerFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
-#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
-#include "xray_profiling_flags.inc"
-#undef XRAY_FLAG
-}
-
-void registerProfilerFlags(FlagParser *P,
-                           ProfilerFlags *F) XRAY_NEVER_INSTRUMENT {
-#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \
-  RegisterFlag(P, #Name, Description, &F->Name);
-#include "xray_profiling_flags.inc"
-#undef XRAY_FLAG
-}
-
-} // namespace __xray
diff --git a/lib/xray/xray_profiling_flags.cpp b/lib/xray/xray_profiling_flags.cpp
new file mode 100644
index 000000000000..0e89b7420f8c
--- /dev/null
+++ b/lib/xray/xray_profiling_flags.cpp
@@ -0,0 +1,39 @@
+//===-- xray_profiling_flags.cpp --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay runtime flags.
+//===----------------------------------------------------------------------===//
+
+#include "xray_profiling_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray_defs.h"
+
+namespace __xray {
+
+// Storage for the profiling flags.
+ProfilerFlags xray_profiling_flags_dont_use_directly;
+
+void ProfilerFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "xray_profiling_flags.inc"
+#undef XRAY_FLAG
+}
+
+void registerProfilerFlags(FlagParser *P,
+                           ProfilerFlags *F) XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \
+  RegisterFlag(P, #Name, Description, &F->Name);
+#include "xray_profiling_flags.inc"
+#undef XRAY_FLAG
+}
+
+} // namespace __xray
diff --git a/lib/xray/xray_trampoline_powerpc64.cc b/lib/xray/xray_trampoline_powerpc64.cc
deleted file mode 100644
index 878c46930fee..000000000000
--- a/lib/xray/xray_trampoline_powerpc64.cc
+++ /dev/null
@@ -1,15 +0,0 @@
-#include <atomic>
-#include <xray/xray_interface.h>
-
-namespace __xray {
-
-extern std::atomic<void (*)(int32_t, XRayEntryType)> XRayPatchedFunction;
-
-// Implement this in C++ instead of assembly, to avoid dealing with ToC by hand.
-void CallXRayPatchedFunction(int32_t FuncId, XRayEntryType Type) {
-  auto fptr = __xray::XRayPatchedFunction.load();
-  if (fptr != nullptr)
-    (*fptr)(FuncId, Type);
-}
-
-} // namespace __xray
diff --git a/lib/xray/xray_trampoline_powerpc64.cpp b/lib/xray/xray_trampoline_powerpc64.cpp
new file mode 100644
index 000000000000..878c46930fee
--- /dev/null
+++ b/lib/xray/xray_trampoline_powerpc64.cpp
@@ -0,0 +1,15 @@
+#include <atomic>
+#include <xray/xray_interface.h>
+
+namespace __xray {
+
+extern std::atomic<void (*)(int32_t, XRayEntryType)> XRayPatchedFunction;
+
+// Implement this in C++ instead of assembly, to avoid dealing with ToC by hand.
+void CallXRayPatchedFunction(int32_t FuncId, XRayEntryType Type) {
+  auto fptr = __xray::XRayPatchedFunction.load();
+  if (fptr != nullptr)
+    (*fptr)(FuncId, Type);
+}
+
+} // namespace __xray
diff --git a/lib/xray/xray_utils.cc b/lib/xray/xray_utils.cc
deleted file mode 100644
index 82674baa5a0c..000000000000
--- a/lib/xray/xray_utils.cc
+++ /dev/null
@@ -1,195 +0,0 @@
-//===-- xray_utils.cc -------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file is a part of XRay, a dynamic runtime instrumentation system.
-//
-//===----------------------------------------------------------------------===//
-#include "xray_utils.h"
-
-#include "sanitizer_common/sanitizer_allocator_internal.h"
-#include "sanitizer_common/sanitizer_common.h"
-#include "xray_allocator.h"
-#include "xray_defs.h"
-#include "xray_flags.h"
-#include <cstdio>
-#include <errno.h>
-#include <fcntl.h>
-#include <iterator>
-#include <stdlib.h>
-#include <sys/types.h>
-#include <tuple>
-#include <unistd.h>
-#include <utility>
-
-#if SANITIZER_FUCHSIA
-#include "sanitizer_common/sanitizer_symbolizer_fuchsia.h"
-
-#include <inttypes.h>
-#include <zircon/process.h>
-#include <zircon/sanitizer.h>
-#include <zircon/status.h>
-#include <zircon/syscalls.h>
-#endif
-
-namespace __xray {
-
-#if SANITIZER_FUCHSIA
-constexpr const char* ProfileSinkName = "llvm-xray";
-
-LogWriter::~LogWriter() {
-  _zx_handle_close(Vmo);
-}
-
-void LogWriter::WriteAll(const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT {
-  if (Begin == End)
-    return;
-  auto TotalBytes = std::distance(Begin, End);
-
-  const size_t PageSize = flags()->xray_page_size_override > 0
-                              ? flags()->xray_page_size_override
-                              : GetPageSizeCached();
-  if (RoundUpTo(Offset, PageSize) != RoundUpTo(Offset + TotalBytes, PageSize)) {
-    // Resize the VMO to ensure there's sufficient space for the data.
-    zx_status_t Status = _zx_vmo_set_size(Vmo, Offset + TotalBytes);
-    if (Status != ZX_OK) {
-      Report("Failed to resize VMO: %s\n", _zx_status_get_string(Status));
-      return;
-    }
-  }
-
-  // Write the data into VMO.
-  zx_status_t Status = _zx_vmo_write(Vmo, Begin, Offset, TotalBytes);
-  if (Status != ZX_OK) {
-    Report("Failed to write: %s\n", _zx_status_get_string(Status));
-    return;
-  }
-  Offset += TotalBytes;
-}
-
-void LogWriter::Flush() XRAY_NEVER_INSTRUMENT {
-  // Nothing to do here since WriteAll writes directly into the VMO.
-}
-
-LogWriter *LogWriter::Open() XRAY_NEVER_INSTRUMENT {
-  // Create VMO to hold the profile data.
-  zx_handle_t Vmo;
-  zx_status_t Status = _zx_vmo_create(0, ZX_VMO_RESIZABLE, &Vmo);
-  if (Status != ZX_OK) {
-    Report("XRay: cannot create VMO: %s\n", _zx_status_get_string(Status));
-    return nullptr;
-  }
-
-  // Get the KOID of the current process to use in the VMO name.
-  zx_info_handle_basic_t Info;
-  Status = _zx_object_get_info(_zx_process_self(), ZX_INFO_HANDLE_BASIC, &Info,
-                               sizeof(Info), NULL, NULL);
-  if (Status != ZX_OK) {
-    Report("XRay: cannot get basic info about current process handle: %s\n",
-           _zx_status_get_string(Status));
-    return nullptr;
-  }
-
-  // Give the VMO a name including our process KOID so it's easy to spot.
-  char VmoName[ZX_MAX_NAME_LEN];
-  internal_snprintf(VmoName, sizeof(VmoName), "%s.%zu", ProfileSinkName,
-                    Info.koid);
-  _zx_object_set_property(Vmo, ZX_PROP_NAME, VmoName, strlen(VmoName));
-
-  // Duplicate the handle since __sanitizer_publish_data consumes it and
-  // LogWriter needs to hold onto it.
-  zx_handle_t Handle;
-  Status =_zx_handle_duplicate(Vmo, ZX_RIGHT_SAME_RIGHTS, &Handle);
-  if (Status != ZX_OK) {
-    Report("XRay: cannot duplicate VMO handle: %s\n",
-           _zx_status_get_string(Status));
-    return nullptr;
-  }
-
-  // Publish the VMO that receives the logging. Note the VMO's contents can
-  // grow and change after publication. The contents won't be read out until
-  // after the process exits.
-  __sanitizer_publish_data(ProfileSinkName, Handle);
-
-  // Use the dumpfile symbolizer markup element to write the name of the VMO.
-  Report("XRay: " FORMAT_DUMPFILE "\n", ProfileSinkName, VmoName);
-
-  LogWriter *LW = reinterpret_cast<LogWriter *>(InternalAlloc(sizeof(LogWriter)));
-  new (LW) LogWriter(Vmo);
-  return LW;
-}
-
-void LogWriter::Close(LogWriter *LW) {
-  LW->~LogWriter();
-  InternalFree(LW);
-}
-#else // SANITIZER_FUCHSIA
-LogWriter::~LogWriter() {
-  internal_close(Fd);
-}
-
-void LogWriter::WriteAll(const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT {
-  if (Begin == End)
-    return;
-  auto TotalBytes = std::distance(Begin, End);
-  while (auto Written = write(Fd, Begin, TotalBytes)) {
-    if (Written < 0) {
-      if (errno == EINTR)
-        continue; // Try again.
-      Report("Failed to write; errno = %d\n", errno);
-      return;
-    }
-    TotalBytes -= Written;
-    if (TotalBytes == 0)
-      break;
-    Begin += Written;
-  }
-}
-
-void LogWriter::Flush() XRAY_NEVER_INSTRUMENT {
-  fsync(Fd);
-}
-
-LogWriter *LogWriter::Open() XRAY_NEVER_INSTRUMENT {
-  // Open a temporary file once for the log.
-  char TmpFilename[256] = {};
-  char TmpWildcardPattern[] = "XXXXXX";
-  auto **Argv = GetArgv();
-  const char *Progname = !Argv ? "(unknown)" : Argv[0];
-  const char *LastSlash = internal_strrchr(Progname, '/');
-
-  if (LastSlash != nullptr)
-    Progname = LastSlash + 1;
-
-  int NeededLength = internal_snprintf(
-      TmpFilename, sizeof(TmpFilename), "%s%s.%s",
-      flags()->xray_logfile_base, Progname, TmpWildcardPattern);
-  if (NeededLength > int(sizeof(TmpFilename))) {
-    Report("XRay log file name too long (%d): %s\n", NeededLength, TmpFilename);
-    return nullptr;
-  }
-  int Fd = mkstemp(TmpFilename);
-  if (Fd == -1) {
-    Report("XRay: Failed opening temporary file '%s'; not logging events.\n",
-           TmpFilename);
-    return nullptr;
-  }
-  if (Verbosity())
-    Report("XRay: Log file in '%s'\n", TmpFilename);
-
-  LogWriter *LW = allocate<LogWriter>();
-  new (LW) LogWriter(Fd);
-  return LW;
-}
-
-void LogWriter::Close(LogWriter *LW) {
-  LW->~LogWriter();
-  deallocate(LW);
-}
-#endif // SANITIZER_FUCHSIA
-
-} // namespace __xray
diff --git a/lib/xray/xray_utils.cpp b/lib/xray/xray_utils.cpp
new file mode 100644
index 000000000000..1036d17a7725
--- /dev/null
+++ b/lib/xray/xray_utils.cpp
@@ -0,0 +1,195 @@
+//===-- xray_utils.cpp ------------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+//===----------------------------------------------------------------------===//
+#include "xray_utils.h"
+
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray_allocator.h"
+#include "xray_defs.h"
+#include "xray_flags.h"
+#include <cstdio>
+#include <errno.h>
+#include <fcntl.h>
+#include <iterator>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <tuple>
+#include <unistd.h>
+#include <utility>
+
+#if SANITIZER_FUCHSIA
+#include "sanitizer_common/sanitizer_symbolizer_fuchsia.h"
+
+#include <inttypes.h>
+#include <zircon/process.h>
+#include <zircon/sanitizer.h>
+#include <zircon/status.h>
+#include <zircon/syscalls.h>
+#endif
+
+namespace __xray {
+
+#if SANITIZER_FUCHSIA
+constexpr const char* ProfileSinkName = "llvm-xray";
+
+LogWriter::~LogWriter() {
+  _zx_handle_close(Vmo);
+}
+
+void LogWriter::WriteAll(const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT {
+  if (Begin == End)
+    return;
+  auto TotalBytes = std::distance(Begin, End);
+
+  const size_t PageSize = flags()->xray_page_size_override > 0
+                              ? flags()->xray_page_size_override
+                              : GetPageSizeCached();
+  if (RoundUpTo(Offset, PageSize) != RoundUpTo(Offset + TotalBytes, PageSize)) {
+    // Resize the VMO to ensure there's sufficient space for the data.
+    zx_status_t Status = _zx_vmo_set_size(Vmo, Offset + TotalBytes);
+    if (Status != ZX_OK) {
+      Report("Failed to resize VMO: %s\n", _zx_status_get_string(Status));
+      return;
+    }
+  }
+
+  // Write the data into VMO.
+  zx_status_t Status = _zx_vmo_write(Vmo, Begin, Offset, TotalBytes);
+  if (Status != ZX_OK) {
+    Report("Failed to write: %s\n", _zx_status_get_string(Status));
+    return;
+  }
+  Offset += TotalBytes;
+}
+
+void LogWriter::Flush() XRAY_NEVER_INSTRUMENT {
+  // Nothing to do here since WriteAll writes directly into the VMO.
+}
+
+LogWriter *LogWriter::Open() XRAY_NEVER_INSTRUMENT {
+  // Create VMO to hold the profile data.
+  zx_handle_t Vmo;
+  zx_status_t Status = _zx_vmo_create(0, ZX_VMO_RESIZABLE, &Vmo);
+  if (Status != ZX_OK) {
+    Report("XRay: cannot create VMO: %s\n", _zx_status_get_string(Status));
+    return nullptr;
+  }
+
+  // Get the KOID of the current process to use in the VMO name.
+  zx_info_handle_basic_t Info;
+  Status = _zx_object_get_info(_zx_process_self(), ZX_INFO_HANDLE_BASIC, &Info,
+                               sizeof(Info), NULL, NULL);
+  if (Status != ZX_OK) {
+    Report("XRay: cannot get basic info about current process handle: %s\n",
+           _zx_status_get_string(Status));
+    return nullptr;
+  }
+
+  // Give the VMO a name including our process KOID so it's easy to spot.
+  char VmoName[ZX_MAX_NAME_LEN];
+  internal_snprintf(VmoName, sizeof(VmoName), "%s.%zu", ProfileSinkName,
+                    Info.koid);
+  _zx_object_set_property(Vmo, ZX_PROP_NAME, VmoName, strlen(VmoName));
+
+  // Duplicate the handle since __sanitizer_publish_data consumes it and
+  // LogWriter needs to hold onto it.
+  zx_handle_t Handle;
+  Status =_zx_handle_duplicate(Vmo, ZX_RIGHT_SAME_RIGHTS, &Handle);
+  if (Status != ZX_OK) {
+    Report("XRay: cannot duplicate VMO handle: %s\n",
+           _zx_status_get_string(Status));
+    return nullptr;
+  }
+
+  // Publish the VMO that receives the logging. Note the VMO's contents can
+  // grow and change after publication. The contents won't be read out until
+  // after the process exits.
+  __sanitizer_publish_data(ProfileSinkName, Handle);
+
+  // Use the dumpfile symbolizer markup element to write the name of the VMO.
+  Report("XRay: " FORMAT_DUMPFILE "\n", ProfileSinkName, VmoName);
+
+  LogWriter *LW = reinterpret_cast<LogWriter *>(InternalAlloc(sizeof(LogWriter)));
+  new (LW) LogWriter(Vmo);
+  return LW;
+}
+
+void LogWriter::Close(LogWriter *LW) {
+  LW->~LogWriter();
+  InternalFree(LW);
+}
+#else // SANITIZER_FUCHSIA
+LogWriter::~LogWriter() {
+  internal_close(Fd);
+}
+
+void LogWriter::WriteAll(const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT {
+  if (Begin == End)
+    return;
+  auto TotalBytes = std::distance(Begin, End);
+  while (auto Written = write(Fd, Begin, TotalBytes)) {
+    if (Written < 0) {
+      if (errno == EINTR)
+        continue; // Try again.
+      Report("Failed to write; errno = %d\n", errno);
+      return;
+    }
+    TotalBytes -= Written;
+    if (TotalBytes == 0)
+      break;
+    Begin += Written;
+  }
+}
+
+void LogWriter::Flush() XRAY_NEVER_INSTRUMENT {
+  fsync(Fd);
+}
+
+LogWriter *LogWriter::Open() XRAY_NEVER_INSTRUMENT {
+  // Open a temporary file once for the log.
+  char TmpFilename[256] = {};
+  char TmpWildcardPattern[] = "XXXXXX";
+  auto **Argv = GetArgv();
+  const char *Progname = (!Argv || !Argv[0]) ? "(unknown)" : Argv[0];
+  const char *LastSlash = internal_strrchr(Progname, '/');
+
+  if (LastSlash != nullptr)
+    Progname = LastSlash + 1;
+
+  int NeededLength = internal_snprintf(
+      TmpFilename, sizeof(TmpFilename), "%s%s.%s",
+      flags()->xray_logfile_base, Progname, TmpWildcardPattern);
+  if (NeededLength >= int(sizeof(TmpFilename))) { // == leaves no room for NUL
+    Report("XRay log file name too long (%d): %s\n", NeededLength, TmpFilename);
+    return nullptr;
+  }
+  int Fd = mkstemp(TmpFilename);
+  if (Fd == -1) {
+    Report("XRay: Failed opening temporary file '%s'; not logging events.\n",
+           TmpFilename);
+    return nullptr;
+  }
+  if (Verbosity())
+    Report("XRay: Log file in '%s'\n", TmpFilename);
+
+  LogWriter *LW = allocate<LogWriter>();
+  new (LW) LogWriter(Fd);
+  return LW;
+}
+
+void LogWriter::Close(LogWriter *LW) {
+  LW->~LogWriter();
+  deallocate(LW);
+}
+#endif // SANITIZER_FUCHSIA
+
+} // namespace __xray
diff --git a/lib/xray/xray_x86_64.cc b/lib/xray/xray_x86_64.cc
deleted file mode 100644
index e63ee1b3bd02..000000000000
--- a/lib/xray/xray_x86_64.cc
+++ /dev/null
@@ -1,353 +0,0 @@
-#include "cpuid.h"
-#include "sanitizer_common/sanitizer_common.h"
-#if !SANITIZER_FUCHSIA
-#include "sanitizer_common/sanitizer_posix.h"
-#endif
-#include "xray_defs.h"
-#include "xray_interface_internal.h"
-
-#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC
-#include <sys/types.h>
-#if SANITIZER_OPENBSD
-#include <sys/time.h>
-#include <machine/cpu.h>
-#endif
-#include <sys/sysctl.h>
-#elif SANITIZER_FUCHSIA
-#include <zircon/syscalls.h>
-#endif
-
-#include <atomic>
-#include <cstdint>
-#include <errno.h>
-#include <fcntl.h>
-#include <iterator>
-#include <limits>
-#include <tuple>
-#include <unistd.h>
-
-namespace __xray {
-
-#if SANITIZER_LINUX
-static std::pair<ssize_t, bool>
-retryingReadSome(int Fd, char *Begin, char *End) XRAY_NEVER_INSTRUMENT {
-  auto BytesToRead = std::distance(Begin, End);
-  ssize_t BytesRead;
-  ssize_t TotalBytesRead = 0;
-  while (BytesToRead && (BytesRead = read(Fd, Begin, BytesToRead))) {
-    if (BytesRead == -1) {
-      if (errno == EINTR)
-        continue;
-      Report("Read error; errno = %d\n", errno);
-      return std::make_pair(TotalBytesRead, false);
-    }
-
-    TotalBytesRead += BytesRead;
-    BytesToRead -= BytesRead;
-    Begin += BytesRead;
-  }
-  return std::make_pair(TotalBytesRead, true);
-}
-
-static bool readValueFromFile(const char *Filename,
-                              long long *Value) XRAY_NEVER_INSTRUMENT {
-  int Fd = open(Filename, O_RDONLY | O_CLOEXEC);
-  if (Fd == -1)
-    return false;
-  static constexpr size_t BufSize = 256;
-  char Line[BufSize] = {};
-  ssize_t BytesRead;
-  bool Success;
-  std::tie(BytesRead, Success) = retryingReadSome(Fd, Line, Line + BufSize);
-  close(Fd);
-  if (!Success)
-    return false;
-  const char *End = nullptr;
-  long long Tmp = internal_simple_strtoll(Line, &End, 10);
-  bool Result = false;
-  if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) {
-    *Value = Tmp;
-    Result = true;
-  }
-  return Result;
-}
-
-uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
-  long long TSCFrequency = -1;
-  if (readValueFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz",
-                        &TSCFrequency)) {
-    TSCFrequency *= 1000;
-  } else if (readValueFromFile(
-                 "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
-                 &TSCFrequency)) {
-    TSCFrequency *= 1000;
-  } else {
-    Report("Unable to determine CPU frequency for TSC accounting.\n");
-  }
-  return TSCFrequency == -1 ? 0 : static_cast<uint64_t>(TSCFrequency);
-}
-#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC
-uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
-    long long TSCFrequency = -1;
-    size_t tscfreqsz = sizeof(TSCFrequency);
-#if SANITIZER_OPENBSD
-    int Mib[2] = { CTL_MACHDEP, CPU_TSCFREQ };
-    if (internal_sysctl(Mib, 2, &TSCFrequency, &tscfreqsz, NULL, 0) != -1) {
-#elif SANITIZER_MAC
-    if (internal_sysctlbyname("machdep.tsc.frequency", &TSCFrequency,
-                              &tscfreqsz, NULL, 0) != -1) {
-
-#else
-    if (internal_sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz,
-                              NULL, 0) != -1) {
-#endif
-        return static_cast<uint64_t>(TSCFrequency);
-    } else {
-      Report("Unable to determine CPU frequency for TSC accounting.\n");
-    }
-
-    return 0;
-}
-#elif !SANITIZER_FUCHSIA
-uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
-    /* Not supported */
-    return 0;
-}
-#endif
-
-static constexpr uint8_t CallOpCode = 0xe8;
-static constexpr uint16_t MovR10Seq = 0xba41;
-static constexpr uint16_t Jmp9Seq = 0x09eb;
-static constexpr uint16_t Jmp20Seq = 0x14eb;
-static constexpr uint16_t Jmp15Seq = 0x0feb;
-static constexpr uint8_t JmpOpCode = 0xe9;
-static constexpr uint8_t RetOpCode = 0xc3;
-static constexpr uint16_t NopwSeq = 0x9066;
-
-static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()};
-static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()};
-
-bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
-                        const XRaySledEntry &Sled,
-                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT {
-  // Here we do the dance of replacing the following sled:
-  //
-  // xray_sled_n:
-  //   jmp +9
-  //   <9 byte nop>
-  //
-  // With the following:
-  //
-  //   mov r10d, <function id>
-  //   call <relative 32bit offset to entry trampoline>
-  //
-  // We need to do this in the following order:
-  //
-  // 1. Put the function id first, 2 bytes from the start of the sled (just
-  // after the 2-byte jmp instruction).
-  // 2. Put the call opcode 6 bytes from the start of the sled.
-  // 3. Put the relative offset 7 bytes from the start of the sled.
-  // 4. Do an atomic write over the jmp instruction for the "mov r10d"
-  // opcode and first operand.
-  //
-  // Prerequisite is to compute the relative offset to the trampoline's address.
-  int64_t TrampolineOffset = reinterpret_cast<int64_t>(Trampoline) -
-                             (static_cast<int64_t>(Sled.Address) + 11);
-  if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
-    Report("XRay Entry trampoline (%p) too far from sled (%p)\n",
-           Trampoline, reinterpret_cast<void *>(Sled.Address));
-    return false;
-  }
-  if (Enable) {
-    *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
-    *reinterpret_cast<uint8_t *>(Sled.Address + 6) = CallOpCode;
-    *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), MovR10Seq,
-        std::memory_order_release);
-  } else {
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp9Seq,
-        std::memory_order_release);
-    // FIXME: Write out the nops still?
-  }
-  return true;
-}
-
-bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
-                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // Here we do the dance of replacing the following sled:
-  //
-  // xray_sled_n:
-  //   ret
-  //   <10 byte nop>
-  //
-  // With the following:
-  //
-  //   mov r10d, <function id>
-  //   jmp <relative 32bit offset to exit trampoline>
-  //
-  // 1. Put the function id first, 2 bytes from the start of the sled (just
-  // after the 1-byte ret instruction).
-  // 2. Put the jmp opcode 6 bytes from the start of the sled.
-  // 3. Put the relative offset 7 bytes from the start of the sled.
-  // 4. Do an atomic write over the jmp instruction for the "mov r10d"
-  // opcode and first operand.
-  //
-  // Prerequisite is to compute the relative offset fo the
-  // __xray_FunctionExit function's address.
-  int64_t TrampolineOffset = reinterpret_cast<int64_t>(__xray_FunctionExit) -
-                             (static_cast<int64_t>(Sled.Address) + 11);
-  if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
-    Report("XRay Exit trampoline (%p) too far from sled (%p)\n",
-           __xray_FunctionExit, reinterpret_cast<void *>(Sled.Address));
-    return false;
-  }
-  if (Enable) {
-    *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
-    *reinterpret_cast<uint8_t *>(Sled.Address + 6) = JmpOpCode;
-    *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), MovR10Seq,
-        std::memory_order_release);
-  } else {
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint8_t> *>(Sled.Address), RetOpCode,
-        std::memory_order_release);
-    // FIXME: Write out the nops still?
-  }
-  return true;
-}
-
-bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
-                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // Here we do the dance of replacing the tail call sled with a similar
-  // sequence as the entry sled, but calls the tail exit sled instead.
-  int64_t TrampolineOffset =
-      reinterpret_cast<int64_t>(__xray_FunctionTailExit) -
-      (static_cast<int64_t>(Sled.Address) + 11);
-  if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
-    Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n",
-           __xray_FunctionTailExit, reinterpret_cast<void *>(Sled.Address));
-    return false;
-  }
-  if (Enable) {
-    *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
-    *reinterpret_cast<uint8_t *>(Sled.Address + 6) = CallOpCode;
-    *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), MovR10Seq,
-        std::memory_order_release);
-  } else {
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp9Seq,
-        std::memory_order_release);
-    // FIXME: Write out the nops still?
-  }
-  return true;
-}
-
-bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
-                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // Here we do the dance of replacing the following sled:
-  //
-  // In Version 0:
-  //
-  // xray_sled_n:
-  //   jmp +20          // 2 bytes
-  //   ...
-  //
-  // With the following:
-  //
-  //   nopw             // 2 bytes*
-  //   ...
-  //
-  //
-  // The "unpatch" should just turn the 'nopw' back to a 'jmp +20'.
-  //
-  // ---
-  //
-  // In Version 1:
-  //
-  //   The jump offset is now 15 bytes (0x0f), so when restoring the nopw back
-  //   to a jmp, use 15 bytes instead.
-  //
-  if (Enable) {
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), NopwSeq,
-        std::memory_order_release);
-  } else {
-    switch (Sled.Version) {
-    case 1:
-      std::atomic_store_explicit(
-          reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp15Seq,
-          std::memory_order_release);
-      break;
-    case 0:
-    default:
-      std::atomic_store_explicit(
-          reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp20Seq,
-          std::memory_order_release);
-      break;
-    }
-    }
-  return false;
-}
-
-bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
-                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
-  // Here we do the dance of replacing the following sled:
-  //
-  // xray_sled_n:
-  //   jmp +20          // 2 byte instruction
-  //   ...
-  //
-  // With the following:
-  //
-  //   nopw             // 2 bytes
-  //   ...
-  //
-  //
-  // The "unpatch" should just turn the 'nopw' back to a 'jmp +20'.
-  // The 20 byte sled stashes three argument registers, calls the trampoline,
-  // unstashes the registers and returns. If the arguments are already in
-  // the correct registers, the stashing and unstashing become equivalently
-  // sized nops.
-  if (Enable) {
-    std::atomic_store_explicit(
-        reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), NopwSeq,
-        std::memory_order_release);
-  } else {
-      std::atomic_store_explicit(
-          reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), Jmp20Seq,
-          std::memory_order_release);
-  }
-  return false;
-}
-
-#if !SANITIZER_FUCHSIA
-// We determine whether the CPU we're running on has the correct features we
-// need. In x86_64 this will be rdtscp support.
-bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {
-  unsigned int EAX, EBX, ECX, EDX;
-
-  // We check whether rdtscp support is enabled. According to the x86_64 manual,
-  // level should be set at 0x80000001, and we should have a look at bit 27 in
-  // EDX. That's 0x8000000 (or 1u << 27).
-  __asm__ __volatile__("cpuid" : "=a"(EAX), "=b"(EBX), "=c"(ECX), "=d"(EDX)
-    : "0"(0x80000001));
-  if (!(EDX & (1u << 27))) {
-    Report("Missing rdtscp support.\n");
-    return false;
-  }
-  // Also check whether we can determine the CPU frequency, since if we cannot,
-  // we should use the emulated TSC instead.
-  if (!getTSCFrequency()) {
-    Report("Unable to determine CPU frequency.\n");
-    return false;
-  }
-  return true;
-}
-#endif
-
-} // namespace __xray
diff --git a/lib/xray/xray_x86_64.cpp b/lib/xray/xray_x86_64.cpp
new file mode 100644
index 000000000000..e63ee1b3bd02
--- /dev/null
+++ b/lib/xray/xray_x86_64.cpp
@@ -0,0 +1,353 @@
+#include "cpuid.h"
+#include "sanitizer_common/sanitizer_common.h"
+#if !SANITIZER_FUCHSIA
+#include "sanitizer_common/sanitizer_posix.h"
+#endif
+#include "xray_defs.h"
+#include "xray_interface_internal.h"
+
+#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC
+#include <sys/types.h>
+#if SANITIZER_OPENBSD
+#include <sys/time.h>
+#include <machine/cpu.h>
+#endif
+#include <sys/sysctl.h>
+#elif SANITIZER_FUCHSIA
+#include <zircon/syscalls.h>
+#endif
+
+#include <atomic>
+#include <cstdint>
+#include <errno.h>
+#include <fcntl.h>
+#include <iterator>
+#include <limits>
+#include <tuple>
+#include <unistd.h>
+
+namespace __xray {
+
+#if SANITIZER_LINUX
+// Fills [Begin, End) from Fd until full or EOF; .second is false on read error.
+static std::pair<ssize_t, bool>
+retryingReadSome(int Fd, char *Begin, char *End) XRAY_NEVER_INSTRUMENT {
+  ssize_t Total = 0;
+  for (auto Left = std::distance(Begin, End); Left != 0;) {
+    const ssize_t Got = read(Fd, Begin + Total, Left);
+    if (Got == 0)
+      break; // End of file.
+    if (Got == -1 && errno == EINTR)
+      continue; // Interrupted; retry with the same arguments.
+    if (Got == -1) {
+      Report("Read error; errno = %d\n", errno);
+      return std::make_pair(Total, false);
+    }
+    Total += Got;
+    Left -= Got;
+  }
+  return std::make_pair(Total, true);
+}
+
+// Parses the decimal integer at the start of Filename into *Value. Returns
+// false, leaving *Value alone, if the file is unreadable, empty or malformed.
+static bool readValueFromFile(const char *Filename,
+                              long long *Value) XRAY_NEVER_INSTRUMENT {
+  int Fd = open(Filename, O_RDONLY | O_CLOEXEC);
+  if (Fd == -1)
+    return false;
+  static constexpr size_t BufSize = 256;
+  char Line[BufSize] = {};
+  // Read at most BufSize - 1 bytes so that the zero-initialised buffer is
+  // always NUL-terminated when it is parsed below.
+  const bool Success = retryingReadSome(Fd, Line, Line + BufSize - 1).second;
+  close(Fd);
+  if (!Success || Line[0] == '\0')
+    return false;
+  const char *End = nullptr;
+  const long long Tmp = internal_simple_strtoll(Line, &End, 10);
+  if (*End != '\n' && *End != '\0')
+    return false;
+  *Value = Tmp;
+  return true;
+}
+
+// Reads the CPU frequency from sysfs and returns it multiplied by 1000 (both
+// files are treated as kHz); returns 0, after a Report, if neither is usable.
+uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
+  long long KHz = -1;
+  const bool Found =
+      readValueFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", &KHz) ||
+      readValueFromFile("/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
+                        &KHz);
+  if (!Found) {
+    Report("Unable to determine CPU frequency for TSC accounting.\n");
+    return 0;
+  }
+  return static_cast<uint64_t>(KHz * 1000);
+}
+#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_OPENBSD || SANITIZER_MAC
+uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { // sysctl-based variant
+    long long TSCFrequency = -1; // Output buffer handed to the sysctl call.
+    size_t tscfreqsz = sizeof(TSCFrequency); // Size of that output buffer.
+#if SANITIZER_OPENBSD // OpenBSD: query by numeric MIB.
+    int Mib[2] = { CTL_MACHDEP, CPU_TSCFREQ };
+    if (internal_sysctl(Mib, 2, &TSCFrequency, &tscfreqsz, NULL, 0) != -1) {
+#elif SANITIZER_MAC // macOS: query by name.
+    if (internal_sysctlbyname("machdep.tsc.frequency", &TSCFrequency,
+                              &tscfreqsz, NULL, 0) != -1) {
+
+#else // Remaining platforms of the enclosing #elif: query by name.
+    if (internal_sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz,
+                              NULL, 0) != -1) {
+#endif // Every branch above opens the same `if`; its body follows.
+        return static_cast<uint64_t>(TSCFrequency);
+    } else {
+      Report("Unable to determine CPU frequency for TSC accounting.\n");
+    }
+    // Only reached when the query failed; probeRequiredCPUFeatures rejects 0.
+    return 0;
+}
+#elif !SANITIZER_FUCHSIA
+uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
+    /* Not supported on this platform; probeRequiredCPUFeatures rejects 0. */
+    return 0;
+}
+#endif
+
+static constexpr uint8_t CallOpCode = 0xe8; // call rel32
+static constexpr uint16_t MovR10Seq = 0xba41; // bytes 41 ba: mov r10d, imm32
+static constexpr uint16_t Jmp9Seq = 0x09eb; // bytes eb 09: jmp +9
+static constexpr uint16_t Jmp20Seq = 0x14eb; // bytes eb 14: jmp +20
+static constexpr uint16_t Jmp15Seq = 0x0feb; // bytes eb 0f: jmp +15
+static constexpr uint8_t JmpOpCode = 0xe9; // jmp rel32
+static constexpr uint8_t RetOpCode = 0xc3; // ret
+static constexpr uint16_t NopwSeq = 0x9066; // bytes 66 90: nopw
+// Bounds of a signed 32-bit value; trampoline offsets are checked against them.
+static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()};
+static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()};
+
+// Patches or unpatches the 11-byte function-entry sled at Sled.Address.
+//
+// Unpatched, the sled is:
+//
+//   jmp +9
+//   <9 byte nop>
+//
+// Patched, it becomes:
+//
+//   mov r10d, <function id>
+//   call <relative 32bit offset to entry trampoline>
+//
+// Returns false, touching nothing, when Trampoline is not reachable with a
+// signed 32-bit displacement from the sled (this is checked when unpatching
+// too); returns true otherwise.
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
+                        const XRaySledEntry &Sled,
+                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT {
+  // The call's displacement is measured from the end of the sled (start + 11).
+  const int64_t SledEnd = static_cast<int64_t>(Sled.Address) + 11;
+  const int64_t TrampolineOffset =
+      reinterpret_cast<int64_t>(Trampoline) - SledEnd;
+  if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
+    Report("XRay Entry trampoline (%p) too far from sled (%p)\n",
+           Trampoline, reinterpret_cast<void *>(Sled.Address));
+    return false;
+  }
+
+  auto *const SledHead =
+      reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address);
+  if (!Enable) {
+    // Unpatch: only the leading 2-byte jmp is restored.
+    // FIXME: Write out the nops still?
+    std::atomic_store_explicit(SledHead, Jmp9Seq, std::memory_order_release);
+    return true;
+  }
+  // Patch: first fill in everything behind the 2-byte jmp (function id at +2,
+  // call opcode at +6, relative offset at +7) ...
+  *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
+  *reinterpret_cast<uint8_t *>(Sled.Address + 6) = CallOpCode;
+  *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
+  // ... and only then atomically replace the jmp with the 2-byte "mov r10d"
+  // opcode, so the operands are in place before the opcode is.
+  std::atomic_store_explicit(SledHead, MovR10Seq, std::memory_order_release);
+  return true;
+}
+
+// Patches or unpatches the 11-byte function-exit sled at Sled.Address.
+//
+// Unpatched, the sled is:
+//
+//   ret
+//   <10 byte nop>
+//
+// Patched, it becomes:
+//
+//   mov r10d, <function id>
+//   jmp <relative 32bit offset to exit trampoline>
+//
+// Returns false, touching nothing, when __xray_FunctionExit is not reachable
+// with a signed 32-bit displacement from the sled; returns true otherwise.
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
+                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // The jmp's displacement is measured from the end of the sled (start + 11).
+  const int64_t SledEnd = static_cast<int64_t>(Sled.Address) + 11;
+  const int64_t TrampolineOffset =
+      reinterpret_cast<int64_t>(__xray_FunctionExit) - SledEnd;
+  if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
+    Report("XRay Exit trampoline (%p) too far from sled (%p)\n",
+           __xray_FunctionExit, reinterpret_cast<void *>(Sled.Address));
+    return false;
+  }
+  if (!Enable) {
+    // Unpatch: only the first byte is rewritten, back to the 1-byte ret.
+    // FIXME: Write out the nops still?
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint8_t> *>(Sled.Address), RetOpCode,
+        std::memory_order_release);
+    return true;
+  }
+  // Patch: first fill in the function id at +2, the jmp opcode at +6 and the
+  // relative offset at +7 ...
+  *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
+  *reinterpret_cast<uint8_t *>(Sled.Address + 6) = JmpOpCode;
+  *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
+  // ... then atomically replace the first two bytes with the "mov r10d" opcode.
+  std::atomic_store_explicit(
+      reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), MovR10Seq,
+      std::memory_order_release);
+  return true;
+}
+
+// Patches or unpatches a tail-call sled: same bytes as a function-entry sled,
+// but the call targets __xray_FunctionTailExit. Returns false, touching
+// nothing, when that trampoline is outside signed 32-bit displacement range.
+bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
+                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  const int64_t SledEnd = static_cast<int64_t>(Sled.Address) + 11;
+  const int64_t TrampolineOffset =
+      reinterpret_cast<int64_t>(__xray_FunctionTailExit) - SledEnd;
+  if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
+    Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n",
+           __xray_FunctionTailExit, reinterpret_cast<void *>(Sled.Address));
+    return false;
+  }
+  auto *const SledHead =
+      reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address);
+  if (!Enable) {
+    // FIXME: Write out the nops still?
+    std::atomic_store_explicit(SledHead, Jmp9Seq, std::memory_order_release);
+    return true;
+  }
+  // Operands and call opcode first; the atomic 2-byte opcode store goes last.
+  *reinterpret_cast<uint32_t *>(Sled.Address + 2) = FuncId;
+  *reinterpret_cast<uint8_t *>(Sled.Address + 6) = CallOpCode;
+  *reinterpret_cast<uint32_t *>(Sled.Address + 7) = TrampolineOffset;
+  std::atomic_store_explicit(SledHead, MovR10Seq, std::memory_order_release);
+  return true;
+}
+
+// Patches or unpatches a custom-event sled by rewriting its first two bytes.
+//
+// Version 0 sleds
+// ---------------
+//
+// Unpatched:
+//
+//   jmp +20          // 2 bytes
+//   ...
+//
+// Patched:
+//
+//   nopw             // 2 bytes
+//   ...
+//
+// Version 1 sleds
+// ---------------
+//
+// Identical, except that the unpatched jump is 'jmp +15' (0x0f).
+//
+// Any Sled.Version other than 1 is unpatched the same way as version 0.
+//
+// FuncId is not used here.
+//
+// NOTE(review): the function returns false on every path, even after the
+// sled has been rewritten; confirm that callers expect this.
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  auto *const SledHead =
+      reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address);
+  if (Enable) {
+    std::atomic_store_explicit(SledHead, NopwSeq, std::memory_order_release);
+    return false;
+  }
+  // Unpatch: restore the jmp that matches the sled version.
+  switch (Sled.Version) {
+  case 1:
+    std::atomic_store_explicit(SledHead, Jmp15Seq, std::memory_order_release);
+    break;
+  case 0:
+  default:
+    std::atomic_store_explicit(SledHead, Jmp20Seq, std::memory_order_release);
+    break;
+  }
+  return false;
+}
+
+// Patches or unpatches a typed-event sled by rewriting its first two bytes.
+//
+// Unpatched:
+//
+//   jmp +20          // 2 byte instruction
+//   ...
+//
+// Patched:
+//
+//   nopw             // 2 bytes
+//   ...
+//
+// Behind those two bytes, the 20 byte sled stashes three argument registers,
+// calls the trampoline, unstashes the registers and returns. If the
+// arguments are already in the correct registers, the stashing and
+// unstashing become equivalently sized nops.
+//
+// FuncId is not used here.
+//
+// NOTE(review): the function returns false on every path, even after the
+// sled has been rewritten; confirm that callers expect this.
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // The head of the sled is nopw when patched and jmp +20 when unpatched.
+  const uint16_t HeadSeq = Enable ? NopwSeq : Jmp20Seq;
+  std::atomic_store_explicit(
+      reinterpret_cast<std::atomic<uint16_t> *>(Sled.Address), HeadSeq,
+      std::memory_order_release);
+  return false;
+}
+
+#if !SANITIZER_FUCHSIA
+// Checks the CPU features needed on x86_64: rdtscp must be supported and
+// getTSCFrequency() must return a non-zero value. Reports the reason and
+// returns false when either check fails.
+bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {
+  // rdtscp support is bit 27 of EDX for cpuid level 0x80000001.
+  static constexpr unsigned int RdtscpBit = 1u << 27;
+  unsigned int EAX, EBX, ECX, EDX;
+  __asm__ __volatile__("cpuid" : "=a"(EAX), "=b"(EBX), "=c"(ECX), "=d"(EDX)
+    : "0"(0x80000001));
+  const bool HasRdtscp = (EDX & RdtscpBit) != 0;
+  if (!HasRdtscp) {
+    Report("Missing rdtscp support.\n");
+    return false;
+  }
+  // If the frequency is unknown, the emulated TSC should be used instead.
+  const bool HasFrequency = getTSCFrequency() != 0;
+  if (!HasFrequency) {
+    Report("Unable to determine CPU frequency.\n");
+    return false;
+  }
+  return true;
+}
+#endif
+
+} // namespace __xray
-- 
cgit v1.2.3