Diffstat (limited to 'contrib/llvm-project/compiler-rt/lib/xray')
58 files changed, 9848 insertions, 0 deletions
| diff --git a/contrib/llvm-project/compiler-rt/lib/xray/weak_symbols.txt b/contrib/llvm-project/compiler-rt/lib/xray/weak_symbols.txt new file mode 100644 index 000000000000..963fff2d697e --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/weak_symbols.txt @@ -0,0 +1,4 @@ +___start_xray_fn_idx +___start_xray_instr_map +___stop_xray_fn_idx +___stop_xray_instr_map diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_AArch64.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_AArch64.cpp new file mode 100644 index 000000000000..c1d77758946e --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_AArch64.cpp @@ -0,0 +1,143 @@ +//===-- xray_AArch64.cpp ----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of AArch64-specific routines (64-bit). +// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_interface_internal.h" +#include <atomic> +#include <cassert> + +extern "C" void __clear_cache(void *start, void *end); + +namespace __xray { + +// The machine codes for some instructions used in runtime patching. +enum class PatchOpcodes : uint32_t { +  PO_StpX0X30SP_m16e = 0xA9BF7BE0, // STP X0, X30, [SP, #-16]! +  PO_LdrX16_12 = 0x58000070,       // LDR X16, #12 +  PO_BlrX16 = 0xD63F0200,          // BLR X16 +  PO_LdpX0X30SP_16 = 0xA8C17BE0,   // LDP X0, X30, [SP], #16 +  PO_B32 = 0x14000008              // B #32 +}; + +inline static bool patchSled(const bool Enable, const uint32_t FuncId, +                             const XRaySledEntry &Sled, +                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT { +  // When |Enable| == true, +  // We replace the following compile-time stub (sled): +  // +  // xray_sled_n: +  //   B #32 +  //   7 NOPs (24 bytes) +  // +  // With the following runtime patch: +  // +  // xray_sled_n: +  //   STP X0, X30, [SP, #-16]! ; PUSH {r0, lr} +  //   LDR W17, #12 ; W17 := function ID +  //   LDR X16,#12 ; X16 := address of the trampoline +  //   BLR X16 +  //   ;DATA: 32 bits of function ID +  //   ;DATA: lower 32 bits of the address of the trampoline +  //   ;DATA: higher 32 bits of the address of the trampoline +  //   LDP X0, X30, [SP], #16 ; POP {r0, lr} +  // +  // Replacement of the first 4-byte instruction should be the last and atomic +  // operation, so that the user code which reaches the sled concurrently +  // either jumps over the whole sled, or executes the whole sled when the +  // latter is ready. 
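+  //
+  // (A naturally aligned 4-byte store is single-copy atomic on AArch64,
+  // which is why the first word is written last with release semantics;
+  // __clear_cache then brings the instruction cache back in sync with the
+  // patched data cache lines.)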
+  // +  // When |Enable|==false, we set back the first instruction in the sled to be +  //   B #32 + +  uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.address()); +  uint32_t *CurAddress = FirstAddress + 1; +  if (Enable) { +    *CurAddress++ = 0x18000071; // ldr w17, #12 +    *CurAddress = uint32_t(PatchOpcodes::PO_LdrX16_12); +    CurAddress++; +    *CurAddress = uint32_t(PatchOpcodes::PO_BlrX16); +    CurAddress++; +    *CurAddress = FuncId; +    CurAddress++; +    *reinterpret_cast<void (**)()>(CurAddress) = TracingHook; +    CurAddress += 2; +    *CurAddress = uint32_t(PatchOpcodes::PO_LdpX0X30SP_16); +    CurAddress++; +    std::atomic_store_explicit( +        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress), +        uint32_t(PatchOpcodes::PO_StpX0X30SP_m16e), std::memory_order_release); +  } else { +    std::atomic_store_explicit( +        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress), +        uint32_t(PatchOpcodes::PO_B32), std::memory_order_release); +  } +  __clear_cache(reinterpret_cast<char *>(FirstAddress), +                reinterpret_cast<char *>(CurAddress)); +  return true; +} + +bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, +                        const XRaySledEntry &Sled, +                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, Trampoline); +} + +bool patchFunctionExit(const bool Enable, const uint32_t FuncId, +                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, +                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit); +} + +// AArch64AsmPrinter::LowerPATCHABLE_EVENT_CALL generates this code sequence: +// +// .Lxray_event_sled_N: +//   b 1f +//   save x0 and x1 (and also x2 for TYPED_EVENT_CALL) +//   set up x0 and x1 (and also x2 for TYPED_EVENT_CALL) +//   bl __xray_CustomEvent or __xray_TypedEvent +//   restore x0 and x1 (and also x2 for TYPED_EVENT_CALL) +// 1f +// +// There are 6 instructions for EVENT_CALL and 9 for TYPED_EVENT_CALL. +// +// Enable: b .+24 => nop +// Disable: nop => b .+24 +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, +                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  uint32_t Inst = Enable ? 0xd503201f : 0x14000006; +  std::atomic_store_explicit( +      reinterpret_cast<std::atomic<uint32_t> *>(Sled.address()), Inst, +      std::memory_order_release); +  return false; +} + +// Enable: b +36 => nop +// Disable: nop => b +36 +bool patchTypedEvent(const bool Enable, const uint32_t FuncId, +                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  uint32_t Inst = Enable ? 0xd503201f : 0x14000009; +  std::atomic_store_explicit( +      reinterpret_cast<std::atomic<uint32_t> *>(Sled.address()), Inst, +      std::memory_order_release); +  return false; +} + +// FIXME: Maybe implement this better? 
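+// (AArch64 mandates the generic timer, so there is no optional CPU feature
+// this runtime strictly needs before reading the counter for timestamps;
+// returning true unconditionally is coarse but safe.)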
+bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } + +} // namespace __xray diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_allocator.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_allocator.h new file mode 100644 index 000000000000..0284f4299fb1 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_allocator.h @@ -0,0 +1,288 @@ +//===-- xray_allocator.h ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Defines the allocator interface for an arena allocator, used primarily for +// the profiling runtime. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_ALLOCATOR_H +#define XRAY_ALLOCATOR_H + +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_internal_defs.h" +#include "sanitizer_common/sanitizer_mutex.h" +#if SANITIZER_FUCHSIA +#include <zircon/process.h> +#include <zircon/status.h> +#include <zircon/syscalls.h> +#else +#include "sanitizer_common/sanitizer_posix.h" +#endif +#include "xray_defs.h" +#include "xray_utils.h" +#include <cstddef> +#include <cstdint> +#include <sys/mman.h> + +namespace __xray { + +// We implement our own memory allocation routine which will bypass the +// internal allocator. This allows us to manage the memory directly, using +// mmap'ed memory to back the allocators. +template <class T> T *allocate() XRAY_NEVER_INSTRUMENT { +  uptr RoundedSize = RoundUpTo(sizeof(T), GetPageSizeCached()); +#if SANITIZER_FUCHSIA +  zx_handle_t Vmo; +  zx_status_t Status = _zx_vmo_create(RoundedSize, 0, &Vmo); +  if (Status != ZX_OK) { +    if (Verbosity()) +      Report("XRay Profiling: Failed to create VMO of size %zu: %s\n", +             sizeof(T), _zx_status_get_string(Status)); +    return nullptr; +  } +  uintptr_t B; +  Status = +      _zx_vmar_map(_zx_vmar_root_self(), ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0, +                   Vmo, 0, sizeof(T), &B); +  _zx_handle_close(Vmo); +  if (Status != ZX_OK) { +    if (Verbosity()) +      Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", sizeof(T), +             _zx_status_get_string(Status)); +    return nullptr; +  } +  return reinterpret_cast<T *>(B); +#else +  uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE, +                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +  int ErrNo = 0; +  if (UNLIKELY(internal_iserror(B, &ErrNo))) { +    if (Verbosity()) +      Report("XRay Profiling: Failed to allocate memory of size %zu; Error = " +             "%zu\n", +             RoundedSize, B); +    return nullptr; +  } +#endif +  return reinterpret_cast<T *>(B); +} + +template <class T> void deallocate(T *B) XRAY_NEVER_INSTRUMENT { +  if (B == nullptr) +    return; +  uptr RoundedSize = RoundUpTo(sizeof(T), GetPageSizeCached()); +#if SANITIZER_FUCHSIA +  _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B), +                 RoundedSize); +#else +  internal_munmap(B, RoundedSize); +#endif +} + +template <class T = unsigned char> +T *allocateBuffer(size_t S) XRAY_NEVER_INSTRUMENT { +  uptr RoundedSize = RoundUpTo(S * sizeof(T), GetPageSizeCached()); +#if SANITIZER_FUCHSIA +  
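+  // (On Fuchsia the buffer is backed by a VMO: create one of the requested
+  // size, map it read/write into the root VMAR, and close the now-unneeded
+  // handle.)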
zx_handle_t Vmo; +  zx_status_t Status = _zx_vmo_create(RoundedSize, 0, &Vmo); +  if (Status != ZX_OK) { +    if (Verbosity()) +      Report("XRay Profiling: Failed to create VMO of size %zu: %s\n", S, +             _zx_status_get_string(Status)); +    return nullptr; +  } +  uintptr_t B; +  Status = _zx_vmar_map(_zx_vmar_root_self(), +                        ZX_VM_PERM_READ | ZX_VM_PERM_WRITE, 0, Vmo, 0, S, &B); +  _zx_handle_close(Vmo); +  if (Status != ZX_OK) { +    if (Verbosity()) +      Report("XRay Profiling: Failed to map VMAR of size %zu: %s\n", S, +             _zx_status_get_string(Status)); +    return nullptr; +  } +#else +  uptr B = internal_mmap(NULL, RoundedSize, PROT_READ | PROT_WRITE, +                         MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); +  int ErrNo = 0; +  if (UNLIKELY(internal_iserror(B, &ErrNo))) { +    if (Verbosity()) +      Report("XRay Profiling: Failed to allocate memory of size %zu; Error = " +             "%zu\n", +             RoundedSize, B); +    return nullptr; +  } +#endif +  return reinterpret_cast<T *>(B); +} + +template <class T> void deallocateBuffer(T *B, size_t S) XRAY_NEVER_INSTRUMENT { +  if (B == nullptr) +    return; +  uptr RoundedSize = RoundUpTo(S * sizeof(T), GetPageSizeCached()); +#if SANITIZER_FUCHSIA +  _zx_vmar_unmap(_zx_vmar_root_self(), reinterpret_cast<uintptr_t>(B), +                 RoundedSize); +#else +  internal_munmap(B, RoundedSize); +#endif +} + +template <class T, class... U> +T *initArray(size_t N, U &&... Us) XRAY_NEVER_INSTRUMENT { +  auto A = allocateBuffer<T>(N); +  if (A != nullptr) +    while (N > 0) +      new (A + (--N)) T(std::forward<U>(Us)...); +  return A; +} + +/// The Allocator type hands out fixed-sized chunks of memory that are +/// cache-line aligned and sized. This is useful for placement of +/// performance-sensitive data in memory that's frequently accessed. The +/// allocator also self-limits the peak memory usage to a dynamically defined +/// maximum. +/// +/// N is the lower-bound size of the block of memory to return from the +/// allocation function. N is used to compute the size of a block, which is +/// cache-line-size multiples worth of memory. We compute the size of a block by +/// determining how many cache lines worth of memory is required to subsume N. +/// +/// The Allocator instance will manage its own memory acquired through mmap. +/// This severely constrains the platforms on which this can be used to POSIX +/// systems where mmap semantics are well-defined. +/// +/// FIXME: Isolate the lower-level memory management to a different abstraction +/// that can be platform-specific. +template <size_t N> struct Allocator { +  // The Allocator returns memory as Block instances. +  struct Block { +    /// Compute the minimum cache-line size multiple that is >= N. +    static constexpr auto Size = nearest_boundary(N, kCacheLineSize); +    void *Data; +  }; + +private: +  size_t MaxMemory{0}; +  unsigned char *BackingStore = nullptr; +  unsigned char *AlignedNextBlock = nullptr; +  size_t AllocatedBlocks = 0; +  bool Owned; +  SpinMutex Mutex{}; + +  void *Alloc() XRAY_NEVER_INSTRUMENT { +    SpinMutexLock Lock(&Mutex); +    if (UNLIKELY(BackingStore == nullptr)) { +      BackingStore = allocateBuffer(MaxMemory); +      if (BackingStore == nullptr) { +        if (Verbosity()) +          Report("XRay Profiling: Failed to allocate memory for allocator\n"); +        return nullptr; +      } + +      AlignedNextBlock = BackingStore; + +      // Ensure that NextBlock is aligned appropriately. 
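+      // (nearest_boundary rounds up to the next multiple of its second
+      // argument; with 64-byte cache lines, nearest_boundary(70, 64) == 128.)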
+      auto BackingStoreNum = reinterpret_cast<uintptr_t>(BackingStore); +      auto AlignedNextBlockNum = nearest_boundary( +          reinterpret_cast<uintptr_t>(AlignedNextBlock), kCacheLineSize); +      if (diff(AlignedNextBlockNum, BackingStoreNum) > ptrdiff_t(MaxMemory)) { +        deallocateBuffer(BackingStore, MaxMemory); +        AlignedNextBlock = BackingStore = nullptr; +        if (Verbosity()) +          Report("XRay Profiling: Cannot obtain enough memory from " +                 "preallocated region\n"); +        return nullptr; +      } + +      AlignedNextBlock = reinterpret_cast<unsigned char *>(AlignedNextBlockNum); + +      // Assert that AlignedNextBlock is cache-line aligned. +      DCHECK_EQ(reinterpret_cast<uintptr_t>(AlignedNextBlock) % kCacheLineSize, +                0); +    } + +    if (((AllocatedBlocks + 1) * Block::Size) > MaxMemory) +      return nullptr; + +    // Align the pointer we'd like to return to an appropriate alignment, then +    // advance the pointer from where to start allocations. +    void *Result = AlignedNextBlock; +    AlignedNextBlock = +        reinterpret_cast<unsigned char *>(AlignedNextBlock) + Block::Size; +    ++AllocatedBlocks; +    return Result; +  } + +public: +  explicit Allocator(size_t M) XRAY_NEVER_INSTRUMENT +      : MaxMemory(RoundUpTo(M, kCacheLineSize)), +        BackingStore(nullptr), +        AlignedNextBlock(nullptr), +        AllocatedBlocks(0), +        Owned(true), +        Mutex() {} + +  explicit Allocator(void *P, size_t M) XRAY_NEVER_INSTRUMENT +      : MaxMemory(M), +        BackingStore(reinterpret_cast<unsigned char *>(P)), +        AlignedNextBlock(reinterpret_cast<unsigned char *>(P)), +        AllocatedBlocks(0), +        Owned(false), +        Mutex() {} + +  Allocator(const Allocator &) = delete; +  Allocator &operator=(const Allocator &) = delete; + +  Allocator(Allocator &&O) XRAY_NEVER_INSTRUMENT { +    SpinMutexLock L0(&Mutex); +    SpinMutexLock L1(&O.Mutex); +    MaxMemory = O.MaxMemory; +    O.MaxMemory = 0; +    BackingStore = O.BackingStore; +    O.BackingStore = nullptr; +    AlignedNextBlock = O.AlignedNextBlock; +    O.AlignedNextBlock = nullptr; +    AllocatedBlocks = O.AllocatedBlocks; +    O.AllocatedBlocks = 0; +    Owned = O.Owned; +    O.Owned = false; +  } + +  Allocator &operator=(Allocator &&O) XRAY_NEVER_INSTRUMENT { +    SpinMutexLock L0(&Mutex); +    SpinMutexLock L1(&O.Mutex); +    MaxMemory = O.MaxMemory; +    O.MaxMemory = 0; +    if (BackingStore != nullptr) +      deallocateBuffer(BackingStore, MaxMemory); +    BackingStore = O.BackingStore; +    O.BackingStore = nullptr; +    AlignedNextBlock = O.AlignedNextBlock; +    O.AlignedNextBlock = nullptr; +    AllocatedBlocks = O.AllocatedBlocks; +    O.AllocatedBlocks = 0; +    Owned = O.Owned; +    O.Owned = false; +    return *this; +  } + +  Block Allocate() XRAY_NEVER_INSTRUMENT { return {Alloc()}; } + +  ~Allocator() NOEXCEPT XRAY_NEVER_INSTRUMENT { +    if (Owned && BackingStore != nullptr) { +      deallocateBuffer(BackingStore, MaxMemory); +    } +  } +}; + +} // namespace __xray + +#endif // XRAY_ALLOCATOR_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_always_instrument.txt b/contrib/llvm-project/compiler-rt/lib/xray/xray_always_instrument.txt new file mode 100644 index 000000000000..151ed703dd56 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_always_instrument.txt @@ -0,0 +1,6 @@ +# List of function matchers common to C/C++ applications that make sense to +# always instrument. 
You can use this as an argument to +# -fxray-always-instrument=<path> along with your project-specific lists. + +# Always instrument the main function. +fun:main diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_arm.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_arm.cpp new file mode 100644 index 000000000000..e1818555906c --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_arm.cpp @@ -0,0 +1,164 @@ +//===-- xray_arm.cpp --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of ARM-specific routines (32-bit). +// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_interface_internal.h" +#include <atomic> +#include <cassert> + +extern "C" void __clear_cache(void *start, void *end); + +namespace __xray { + +// The machine codes for some instructions used in runtime patching. +enum class PatchOpcodes : uint32_t { +  PO_PushR0Lr = 0xE92D4001, // PUSH {r0, lr} +  PO_BlxIp = 0xE12FFF3C,    // BLX ip +  PO_PopR0Lr = 0xE8BD4001,  // POP {r0, lr} +  PO_B20 = 0xEA000005       // B #20 +}; + +// 0xUUUUWXYZ -> 0x000W0XYZ +inline static uint32_t getMovwMask(const uint32_t Value) XRAY_NEVER_INSTRUMENT { +  return (Value & 0xfff) | ((Value & 0xf000) << 4); +} + +// 0xWXYZUUUU -> 0x000W0XYZ +inline static uint32_t getMovtMask(const uint32_t Value) XRAY_NEVER_INSTRUMENT { +  return getMovwMask(Value >> 16); +} + +// Writes the following instructions: +//   MOVW R<regNo>, #<lower 16 bits of the |Value|> +//   MOVT R<regNo>, #<higher 16 bits of the |Value|> +inline static uint32_t * +write32bitLoadReg(uint8_t regNo, uint32_t *Address, +                  const uint32_t Value) XRAY_NEVER_INSTRUMENT { +  // This is a fatal error: we cannot just report it and continue execution. 
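+  // (Illustrative: write32bitLoadReg(0, Address, 0x12345678) emits
+  // 0xE3050678, i.e. MOVW r0, #0x5678, then 0xE3410234, i.e. MOVT r0,
+  // #0x1234.)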
+  assert(regNo <= 15 && "Register number must be 0 to 15."); +  // MOVW R, #0xWXYZ in machine code is 0xE30WRXYZ +  *Address = (0xE3000000 | (uint32_t(regNo) << 12) | getMovwMask(Value)); +  Address++; +  // MOVT R, #0xWXYZ in machine code is 0xE34WRXYZ +  *Address = (0xE3400000 | (uint32_t(regNo) << 12) | getMovtMask(Value)); +  return Address + 1; +} + +// Writes the following instructions: +//   MOVW r0, #<lower 16 bits of the |Value|> +//   MOVT r0, #<higher 16 bits of the |Value|> +inline static uint32_t * +write32bitLoadR0(uint32_t *Address, +                 const uint32_t Value) XRAY_NEVER_INSTRUMENT { +  return write32bitLoadReg(0, Address, Value); +} + +// Writes the following instructions: +//   MOVW ip, #<lower 16 bits of the |Value|> +//   MOVT ip, #<higher 16 bits of the |Value|> +inline static uint32_t * +write32bitLoadIP(uint32_t *Address, +                 const uint32_t Value) XRAY_NEVER_INSTRUMENT { +  return write32bitLoadReg(12, Address, Value); +} + +inline static bool patchSled(const bool Enable, const uint32_t FuncId, +                             const XRaySledEntry &Sled, +                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT { +  // When |Enable| == true, +  // We replace the following compile-time stub (sled): +  // +  // xray_sled_n: +  //   B #20 +  //   6 NOPs (24 bytes) +  // +  // With the following runtime patch: +  // +  // xray_sled_n: +  //   PUSH {r0, lr} +  //   MOVW r0, #<lower 16 bits of function ID> +  //   MOVT r0, #<higher 16 bits of function ID> +  //   MOVW ip, #<lower 16 bits of address of TracingHook> +  //   MOVT ip, #<higher 16 bits of address of TracingHook> +  //   BLX ip +  //   POP {r0, lr} +  // +  // Replacement of the first 4-byte instruction should be the last and atomic +  // operation, so that the user code which reaches the sled concurrently +  // either jumps over the whole sled, or executes the whole sled when the +  // latter is ready. 
+  // +  // When |Enable|==false, we set back the first instruction in the sled to be +  //   B #20 + +  uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.address()); +  uint32_t *CurAddress = FirstAddress + 1; +  if (Enable) { +    CurAddress = +        write32bitLoadR0(CurAddress, reinterpret_cast<uint32_t>(FuncId)); +    CurAddress = +        write32bitLoadIP(CurAddress, reinterpret_cast<uint32_t>(TracingHook)); +    *CurAddress = uint32_t(PatchOpcodes::PO_BlxIp); +    CurAddress++; +    *CurAddress = uint32_t(PatchOpcodes::PO_PopR0Lr); +    CurAddress++; +    std::atomic_store_explicit( +        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress), +        uint32_t(PatchOpcodes::PO_PushR0Lr), std::memory_order_release); +  } else { +    std::atomic_store_explicit( +        reinterpret_cast<std::atomic<uint32_t> *>(FirstAddress), +        uint32_t(PatchOpcodes::PO_B20), std::memory_order_release); +  } +  __clear_cache(reinterpret_cast<char *>(FirstAddress), +                reinterpret_cast<char *>(CurAddress)); +  return true; +} + +bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, +                        const XRaySledEntry &Sled, +                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, Trampoline); +} + +bool patchFunctionExit(const bool Enable, const uint32_t FuncId, +                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, +                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, __xray_FunctionTailExit); +} + +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, +                      const XRaySledEntry &Sled) +    XRAY_NEVER_INSTRUMENT { // FIXME: Implement in arm? +  return false; +} + +bool patchTypedEvent(const bool Enable, const uint32_t FuncId, +                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // FIXME: Implement in arm? +  return false; +} + +// FIXME: Maybe implement this better? +bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } + +} // namespace __xray + +extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { +  // FIXME: this will have to be implemented in the trampoline assembly file +} diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_flags.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_flags.cpp new file mode 100644 index 000000000000..e0a5e7bb29ee --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_flags.cpp @@ -0,0 +1,49 @@ +//===-- xray_basic_flags.cpp ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay Basic flag parsing logic. 
+//===----------------------------------------------------------------------===// + +#include "xray_basic_flags.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_libc.h" +#include "xray_defs.h" + +using namespace __sanitizer; + +namespace __xray { + +/// Use via basicFlags(). +BasicFlags xray_basic_flags_dont_use_directly; + +void BasicFlags::setDefaults() XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue; +#include "xray_basic_flags.inc" +#undef XRAY_FLAG +} + +void registerXRayBasicFlags(FlagParser *P, +                            BasicFlags *F) XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \ +  RegisterFlag(P, #Name, Description, &F->Name); +#include "xray_basic_flags.inc" +#undef XRAY_FLAG +} + +const char *useCompilerDefinedBasicFlags() XRAY_NEVER_INSTRUMENT { +#ifdef XRAY_BASIC_OPTIONS +  return SANITIZER_STRINGIFY(XRAY_BASIC_OPTIONS); +#else +  return ""; +#endif +} + +} // namespace __xray diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_flags.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_flags.h new file mode 100644 index 000000000000..b846c1233e8a --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_flags.h @@ -0,0 +1,37 @@ +//===-- xray_basic_flags.h -------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay Basic Mode runtime flags. +//===----------------------------------------------------------------------===// + +#ifndef XRAY_BASIC_FLAGS_H +#define XRAY_BASIC_FLAGS_H + +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_internal_defs.h" + +namespace __xray { + +struct BasicFlags { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name; +#include "xray_basic_flags.inc" +#undef XRAY_FLAG + +  void setDefaults(); +}; + +extern BasicFlags xray_basic_flags_dont_use_directly; +extern void registerXRayBasicFlags(FlagParser *P, BasicFlags *F); +const char *useCompilerDefinedBasicFlags(); +inline BasicFlags *basicFlags() { return &xray_basic_flags_dont_use_directly; } + +} // namespace __xray + +#endif // XRAY_BASIC_FLAGS_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_flags.inc b/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_flags.inc new file mode 100644 index 000000000000..fb38c540d356 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_flags.inc @@ -0,0 +1,23 @@ +//===-- xray_basic_flags.inc ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// XRay runtime flags. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FLAG +#error "Define XRAY_FLAG prior to including this file!" 
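+// (XRAY_FLAG is an X-macro: each includer defines it to expand the entries
+// below for its own purpose. xray_basic_flags.h declares the fields via
+//   #define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name;
+// while xray_basic_flags.cpp redefines it to set defaults and to register
+// each flag with the parser.)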
+#endif + +XRAY_FLAG(int, func_duration_threshold_us, 5, +          "Basic logging will try to skip functions that execute for fewer " +          "microseconds than this threshold.") +XRAY_FLAG(int, max_stack_depth, 64, +          "Basic logging will keep track of at most this deep a call stack, " +          "any more and the recordings will be dropped.") +XRAY_FLAG(int, thread_buffer_size, 1024, +          "The number of entries to keep on a per-thread buffer.") diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_logging.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_logging.cpp new file mode 100644 index 000000000000..6ac5417bef75 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_logging.cpp @@ -0,0 +1,515 @@ +//===-- xray_basic_logging.cpp ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of a simple in-memory log of XRay events. This defines a +// logging function that's compatible with the XRay handler interface, and +// routines for exporting data to files. +// +//===----------------------------------------------------------------------===// + +#include <errno.h> +#include <fcntl.h> +#include <pthread.h> +#include <sys/stat.h> +#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_APPLE +#include <sys/syscall.h> +#endif +#include <sys/types.h> +#include <time.h> +#include <unistd.h> + +#include "sanitizer_common/sanitizer_allocator_internal.h" +#include "sanitizer_common/sanitizer_libc.h" +#include "xray/xray_records.h" +#include "xray_recursion_guard.h" +#include "xray_basic_flags.h" +#include "xray_basic_logging.h" +#include "xray_defs.h" +#include "xray_flags.h" +#include "xray_interface_internal.h" +#include "xray_tsc.h" +#include "xray_utils.h" + +namespace __xray { + +static SpinMutex LogMutex; + +namespace { +// We use elements of this type to record the entry TSC of every function ID we +// see as we're tracing a particular thread's execution. 
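+// The members pack to exactly 16 bytes (4 + 2 + 1 + 1 + 8); the
+// static_assert below guards that layout.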
+struct alignas(16) StackEntry { +  int32_t FuncId; +  uint16_t Type; +  uint8_t CPU; +  uint8_t Padding; +  uint64_t TSC; +}; + +static_assert(sizeof(StackEntry) == 16, "Wrong size for StackEntry"); + +struct XRAY_TLS_ALIGNAS(64) ThreadLocalData { +  void *InMemoryBuffer = nullptr; +  size_t BufferSize = 0; +  size_t BufferOffset = 0; +  void *ShadowStack = nullptr; +  size_t StackSize = 0; +  size_t StackEntries = 0; +  __xray::LogWriter *LogWriter = nullptr; +}; + +struct BasicLoggingOptions { +  int DurationFilterMicros = 0; +  size_t MaxStackDepth = 0; +  size_t ThreadBufferSize = 0; +}; +} // namespace + +static pthread_key_t PThreadKey; + +static atomic_uint8_t BasicInitialized{0}; + +struct BasicLoggingOptions GlobalOptions; + +thread_local atomic_uint8_t Guard{0}; + +static atomic_uint8_t UseRealTSC{0}; +static atomic_uint64_t ThresholdTicks{0}; +static atomic_uint64_t TicksPerSec{0}; +static atomic_uint64_t CycleFrequency{NanosecondsPerSecond}; + +static LogWriter *getLog() XRAY_NEVER_INSTRUMENT { +  LogWriter* LW = LogWriter::Open(); +  if (LW == nullptr) +    return LW; + +  static pthread_once_t DetectOnce = PTHREAD_ONCE_INIT; +  pthread_once(&DetectOnce, +[] { +    if (atomic_load(&UseRealTSC, memory_order_acquire)) +      atomic_store(&CycleFrequency, getTSCFrequency(), memory_order_release); +  }); + +  // Since we're here, we get to write the header. We set it up so that the +  // header will only be written once, at the start, and let the threads +  // logging do writes which just append. +  XRayFileHeader Header; +  // Version 2 includes tail exit records. +  // Version 3 includes pid inside records. +  Header.Version = 3; +  Header.Type = FileTypes::NAIVE_LOG; +  Header.CycleFrequency = atomic_load(&CycleFrequency, memory_order_acquire); + +  // FIXME: Actually check whether we have 'constant_tsc' and 'nonstop_tsc' +  // before setting the values in the header. 
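+  // (On x86 that information comes from CPUID leaf 0x80000007, EDX bit 8,
+  // the "invariant TSC" bit, or from the constant_tsc/nonstop_tsc flags in
+  // /proc/cpuinfo on Linux; both are assumed here rather than verified.)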
+  Header.ConstantTSC = 1; +  Header.NonstopTSC = 1; +  LW->WriteAll(reinterpret_cast<char *>(&Header), +               reinterpret_cast<char *>(&Header) + sizeof(Header)); +  return LW; +} + +static LogWriter *getGlobalLog() XRAY_NEVER_INSTRUMENT { +  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT; +  static LogWriter *LW = nullptr; +  pthread_once(&OnceInit, +[] { LW = getLog(); }); +  return LW; +} + +static ThreadLocalData &getThreadLocalData() XRAY_NEVER_INSTRUMENT { +  thread_local ThreadLocalData TLD; +  thread_local bool UNUSED TOnce = [] { +    if (GlobalOptions.ThreadBufferSize == 0) { +      if (Verbosity()) +        Report("Not initializing TLD since ThreadBufferSize == 0.\n"); +      return false; +    } +    pthread_setspecific(PThreadKey, &TLD); +    TLD.LogWriter = getGlobalLog(); +    TLD.InMemoryBuffer = reinterpret_cast<XRayRecord *>( +        InternalAlloc(sizeof(XRayRecord) * GlobalOptions.ThreadBufferSize, +                      nullptr, alignof(XRayRecord))); +    TLD.BufferSize = GlobalOptions.ThreadBufferSize; +    TLD.BufferOffset = 0; +    if (GlobalOptions.MaxStackDepth == 0) { +      if (Verbosity()) +        Report("Not initializing the ShadowStack since MaxStackDepth == 0.\n"); +      TLD.StackSize = 0; +      TLD.StackEntries = 0; +      TLD.ShadowStack = nullptr; +      return false; +    } +    TLD.ShadowStack = reinterpret_cast<StackEntry *>( +        InternalAlloc(sizeof(StackEntry) * GlobalOptions.MaxStackDepth, nullptr, +                      alignof(StackEntry))); +    TLD.StackSize = GlobalOptions.MaxStackDepth; +    TLD.StackEntries = 0; +    return false; +  }(); +  return TLD; +} + +template <class RDTSC> +void InMemoryRawLog(int32_t FuncId, XRayEntryType Type, +                    RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT { +  auto &TLD = getThreadLocalData(); +  LogWriter *LW = getGlobalLog(); +  if (LW == nullptr) +    return; + +  // Use a simple recursion guard, to handle cases where we're already logging +  // and for one reason or another, this function gets called again in the same +  // thread. +  RecursionGuard G(Guard); +  if (!G) +    return; + +  uint8_t CPU = 0; +  uint64_t TSC = ReadTSC(CPU); + +  switch (Type) { +  case XRayEntryType::ENTRY: +  case XRayEntryType::LOG_ARGS_ENTRY: { +    // Short circuit if we've reached the maximum depth of the stack. +    if (TLD.StackEntries++ >= TLD.StackSize) +      return; + +    // When we encounter an entry event, we keep track of the TSC and the CPU, +    // and put it in the stack. +    StackEntry E; +    E.FuncId = FuncId; +    E.CPU = CPU; +    E.Type = Type; +    E.TSC = TSC; +    auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) + +                         (sizeof(StackEntry) * (TLD.StackEntries - 1)); +    internal_memcpy(StackEntryPtr, &E, sizeof(StackEntry)); +    break; +  } +  case XRayEntryType::EXIT: +  case XRayEntryType::TAIL: { +    if (TLD.StackEntries == 0) +      break; + +    if (--TLD.StackEntries >= TLD.StackSize) +      return; + +    // When we encounter an exit event, we check whether all the following are +    // true: +    // +    // - The Function ID is the same as the most recent entry in the stack. +    // - The CPU is the same as the most recent entry in the stack. +    // - The Delta of the TSCs is less than the threshold amount of time we're +    //   looking to record. +    // +    // If all of these conditions are true, we pop the stack and don't write a +    // record and move the record offset back. 
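+    //
+    // (Illustrative: with func_duration_threshold_us = 5 on a 2 GHz TSC,
+    // ThresholdTicks is 10,000, so an entry/exit pair only 4,000 ticks apart
+    // is elided by rewinding BufferOffset.)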
+    StackEntry StackTop; +    auto StackEntryPtr = static_cast<char *>(TLD.ShadowStack) + +                         (sizeof(StackEntry) * TLD.StackEntries); +    internal_memcpy(&StackTop, StackEntryPtr, sizeof(StackEntry)); +    if (StackTop.FuncId == FuncId && StackTop.CPU == CPU && +        StackTop.TSC < TSC) { +      auto Delta = TSC - StackTop.TSC; +      if (Delta < atomic_load(&ThresholdTicks, memory_order_relaxed)) { +        DCHECK(TLD.BufferOffset > 0); +        TLD.BufferOffset -= StackTop.Type == XRayEntryType::ENTRY ? 1 : 2; +        return; +      } +    } +    break; +  } +  default: +    // Should be unreachable. +    DCHECK(false && "Unsupported XRayEntryType encountered."); +    break; +  } + +  // First determine whether the delta between the function's enter record and +  // the exit record is higher than the threshold. +  XRayRecord R; +  R.RecordType = RecordTypes::NORMAL; +  R.CPU = CPU; +  R.TSC = TSC; +  R.TId = GetTid();  +  R.PId = internal_getpid();  +  R.Type = Type; +  R.FuncId = FuncId; +  auto FirstEntry = reinterpret_cast<XRayRecord *>(TLD.InMemoryBuffer); +  internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R)); +  if (++TLD.BufferOffset == TLD.BufferSize) { +    SpinMutexLock Lock(&LogMutex); +    LW->WriteAll(reinterpret_cast<char *>(FirstEntry), +                 reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); +    TLD.BufferOffset = 0; +    TLD.StackEntries = 0; +  } +} + +template <class RDTSC> +void InMemoryRawLogWithArg(int32_t FuncId, XRayEntryType Type, uint64_t Arg1, +                           RDTSC ReadTSC) XRAY_NEVER_INSTRUMENT { +  auto &TLD = getThreadLocalData(); +  auto FirstEntry = +      reinterpret_cast<XRayArgPayload *>(TLD.InMemoryBuffer); +  const auto &BuffLen = TLD.BufferSize; +  LogWriter *LW = getGlobalLog(); +  if (LW == nullptr) +    return; + +  // First we check whether there's enough space to write the data consecutively +  // in the thread-local buffer. If not, we first flush the buffer before +  // attempting to write the two records that must be consecutive. +  if (TLD.BufferOffset + 2 > BuffLen) { +    SpinMutexLock Lock(&LogMutex); +    LW->WriteAll(reinterpret_cast<char *>(FirstEntry), +                 reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); +    TLD.BufferOffset = 0; +    TLD.StackEntries = 0; +  } + +  // Then we write the "we have an argument" record. +  InMemoryRawLog(FuncId, Type, ReadTSC); + +  RecursionGuard G(Guard); +  if (!G) +    return; + +  // And, from here on write the arg payload. 
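+  // (The ARG_PAYLOAD record below must land directly after the entry record
+  // emitted by InMemoryRawLog above; the flush at the top of this function
+  // guarantees that both fit consecutively in the buffer.)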
+  XRayArgPayload R; +  R.RecordType = RecordTypes::ARG_PAYLOAD; +  R.FuncId = FuncId; +  R.TId = GetTid();  +  R.PId = internal_getpid();  +  R.Arg = Arg1; +  internal_memcpy(FirstEntry + TLD.BufferOffset, &R, sizeof(R)); +  if (++TLD.BufferOffset == BuffLen) { +    SpinMutexLock Lock(&LogMutex); +    LW->WriteAll(reinterpret_cast<char *>(FirstEntry), +                 reinterpret_cast<char *>(FirstEntry + TLD.BufferOffset)); +    TLD.BufferOffset = 0; +    TLD.StackEntries = 0; +  } +} + +void basicLoggingHandleArg0RealTSC(int32_t FuncId, +                                   XRayEntryType Type) XRAY_NEVER_INSTRUMENT { +  InMemoryRawLog(FuncId, Type, readTSC); +} + +void basicLoggingHandleArg0EmulateTSC(int32_t FuncId, XRayEntryType Type) +    XRAY_NEVER_INSTRUMENT { +  InMemoryRawLog(FuncId, Type, [](uint8_t &CPU) XRAY_NEVER_INSTRUMENT { +    timespec TS; +    int result = clock_gettime(CLOCK_REALTIME, &TS); +    if (result != 0) { +      Report("clock_gettimg(2) return %d, errno=%d.", result, int(errno)); +      TS = {0, 0}; +    } +    CPU = 0; +    return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec; +  }); +} + +void basicLoggingHandleArg1RealTSC(int32_t FuncId, XRayEntryType Type, +                                   uint64_t Arg1) XRAY_NEVER_INSTRUMENT { +  InMemoryRawLogWithArg(FuncId, Type, Arg1, readTSC); +} + +void basicLoggingHandleArg1EmulateTSC(int32_t FuncId, XRayEntryType Type, +                                      uint64_t Arg1) XRAY_NEVER_INSTRUMENT { +  InMemoryRawLogWithArg( +      FuncId, Type, Arg1, [](uint8_t &CPU) XRAY_NEVER_INSTRUMENT { +        timespec TS; +        int result = clock_gettime(CLOCK_REALTIME, &TS); +        if (result != 0) { +          Report("clock_gettimg(2) return %d, errno=%d.", result, int(errno)); +          TS = {0, 0}; +        } +        CPU = 0; +        return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec; +      }); +} + +static void TLDDestructor(void *P) XRAY_NEVER_INSTRUMENT { +  ThreadLocalData &TLD = *reinterpret_cast<ThreadLocalData *>(P); +  auto ExitGuard = at_scope_exit([&TLD] { +    // Clean up dynamic resources. +    if (TLD.InMemoryBuffer) +      InternalFree(TLD.InMemoryBuffer); +    if (TLD.ShadowStack) +      InternalFree(TLD.ShadowStack); +    if (Verbosity()) +      Report("Cleaned up log for TID: %llu\n", GetTid()); +  }); + +  if (TLD.LogWriter == nullptr || TLD.BufferOffset == 0) { +    if (Verbosity()) +      Report("Skipping buffer for TID: %llu; Offset = %zu\n", GetTid(), +             TLD.BufferOffset); +    return; +  } + +  { +    SpinMutexLock L(&LogMutex); +    TLD.LogWriter->WriteAll(reinterpret_cast<char *>(TLD.InMemoryBuffer), +                            reinterpret_cast<char *>(TLD.InMemoryBuffer) + +                            (sizeof(XRayRecord) * TLD.BufferOffset)); +  } + +  // Because this thread's exit could be the last one trying to write to +  // the file and that we're not able to close out the file properly, we +  // sync instead and hope that the pending writes are flushed as the +  // thread exits. 
+  TLD.LogWriter->Flush(); +} + +XRayLogInitStatus basicLoggingInit(UNUSED size_t BufferSize, +                                   UNUSED size_t BufferMax, void *Options, +                                   size_t OptionsSize) XRAY_NEVER_INSTRUMENT { +  uint8_t Expected = 0; +  if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 1, +                                      memory_order_acq_rel)) { +    if (Verbosity()) +      Report("Basic logging already initialized.\n"); +    return XRayLogInitStatus::XRAY_LOG_INITIALIZED; +  } + +  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT; +  pthread_once(&OnceInit, +[] { +    pthread_key_create(&PThreadKey, TLDDestructor); +    atomic_store(&UseRealTSC, probeRequiredCPUFeatures(), memory_order_release); +    // Initialize the global TicksPerSec value. +    atomic_store(&TicksPerSec, +                 probeRequiredCPUFeatures() ? getTSCFrequency() +                                            : NanosecondsPerSecond, +                 memory_order_release); +    if (!atomic_load(&UseRealTSC, memory_order_relaxed) && Verbosity()) +      Report("WARNING: Required CPU features missing for XRay instrumentation, " +             "using emulation instead.\n"); +  }); + +  FlagParser P; +  BasicFlags F; +  F.setDefaults(); +  registerXRayBasicFlags(&P, &F); +  P.ParseString(useCompilerDefinedBasicFlags()); +  auto *EnvOpts = GetEnv("XRAY_BASIC_OPTIONS"); +  if (EnvOpts == nullptr) +    EnvOpts = ""; + +  P.ParseString(EnvOpts); + +  // If XRAY_BASIC_OPTIONS was not defined, then we use the deprecated options +  // set through XRAY_OPTIONS instead. +  if (internal_strlen(EnvOpts) == 0) { +    F.func_duration_threshold_us = +        flags()->xray_naive_log_func_duration_threshold_us; +    F.max_stack_depth = flags()->xray_naive_log_max_stack_depth; +    F.thread_buffer_size = flags()->xray_naive_log_thread_buffer_size; +  } + +  P.ParseString(static_cast<const char *>(Options)); +  GlobalOptions.ThreadBufferSize = F.thread_buffer_size; +  GlobalOptions.DurationFilterMicros = F.func_duration_threshold_us; +  GlobalOptions.MaxStackDepth = F.max_stack_depth; +  *basicFlags() = F; + +  atomic_store(&ThresholdTicks, +               atomic_load(&TicksPerSec, memory_order_acquire) * +                   GlobalOptions.DurationFilterMicros / 1000000, +               memory_order_release); +  __xray_set_handler_arg1(atomic_load(&UseRealTSC, memory_order_acquire) +                              ? basicLoggingHandleArg1RealTSC +                              : basicLoggingHandleArg1EmulateTSC); +  __xray_set_handler(atomic_load(&UseRealTSC, memory_order_acquire) +                         ? basicLoggingHandleArg0RealTSC +                         : basicLoggingHandleArg0EmulateTSC); + +  // TODO: Implement custom event and typed event handling support in Basic +  // Mode. +  __xray_remove_customevent_handler(); +  __xray_remove_typedevent_handler(); + +  return XRayLogInitStatus::XRAY_LOG_INITIALIZED; +} + +XRayLogInitStatus basicLoggingFinalize() XRAY_NEVER_INSTRUMENT { +  uint8_t Expected = 0; +  if (!atomic_compare_exchange_strong(&BasicInitialized, &Expected, 0, +                                      memory_order_acq_rel) && +      Verbosity()) +    Report("Basic logging already finalized.\n"); + +  // Nothing really to do aside from marking state of the global to be +  // uninitialized. 
+ +  return XRayLogInitStatus::XRAY_LOG_FINALIZED; +} + +XRayLogFlushStatus basicLoggingFlush() XRAY_NEVER_INSTRUMENT { +  // This really does nothing, since flushing the logs happen at the end of a +  // thread's lifetime, or when the buffers are full. +  return XRayLogFlushStatus::XRAY_LOG_FLUSHED; +} + +// This is a handler that, effectively, does nothing. +void basicLoggingHandleArg0Empty(int32_t, XRayEntryType) XRAY_NEVER_INSTRUMENT { +} + +bool basicLogDynamicInitializer() XRAY_NEVER_INSTRUMENT { +  XRayLogImpl Impl{ +      basicLoggingInit, +      basicLoggingFinalize, +      basicLoggingHandleArg0Empty, +      basicLoggingFlush, +  }; +  auto RegistrationResult = __xray_log_register_mode("xray-basic", Impl); +  if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && +      Verbosity()) +    Report("Cannot register XRay Basic Mode to 'xray-basic'; error = %d\n", +           RegistrationResult); +  if (flags()->xray_naive_log || +      !internal_strcmp(flags()->xray_mode, "xray-basic")) { +    auto SelectResult = __xray_log_select_mode("xray-basic"); +    if (SelectResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) { +      if (Verbosity()) +        Report("Failed selecting XRay Basic Mode; error = %d\n", SelectResult); +      return false; +    } + +    // We initialize the implementation using the data we get from the +    // XRAY_BASIC_OPTIONS environment variable, at this point of the +    // implementation. +    auto *Env = GetEnv("XRAY_BASIC_OPTIONS"); +    auto InitResult = +        __xray_log_init_mode("xray-basic", Env == nullptr ? "" : Env); +    if (InitResult != XRayLogInitStatus::XRAY_LOG_INITIALIZED) { +      if (Verbosity()) +        Report("Failed initializing XRay Basic Mode; error = %d\n", InitResult); +      return false; +    } + +    // At this point we know that we've successfully initialized Basic mode +    // tracing, and the only chance we're going to get for the current thread to +    // clean-up may be at thread/program exit. To ensure that we're going to get +    // the cleanup even without calling the finalization routines, we're +    // registering a program exit function that will do the cleanup. +    static pthread_once_t DynamicOnce = PTHREAD_ONCE_INIT; +    pthread_once(&DynamicOnce, +[] { +      static void *FakeTLD = nullptr; +      FakeTLD = &getThreadLocalData(); +      Atexit(+[] { TLDDestructor(FakeTLD); }); +    }); +  } +  return true; +} + +} // namespace __xray + +static auto UNUSED Unused = __xray::basicLogDynamicInitializer(); diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_logging.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_logging.h new file mode 100644 index 000000000000..89caca66b585 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_basic_logging.h @@ -0,0 +1,42 @@ +//===-- xray_basic_logging.h ----------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. 
+// +//===----------------------------------------------------------------------===// +#ifndef XRAY_XRAY_INMEMORY_LOG_H +#define XRAY_XRAY_INMEMORY_LOG_H + +#include "xray/xray_log_interface.h" + +/// Basic (Naive) Mode +/// ================== +/// +/// This implementation hooks in through the XRay logging implementation +/// framework. The Basic Mode implementation will keep appending to a file as +/// soon as the thread-local buffers are full. It keeps minimal in-memory state +/// and does the minimum filtering required to keep log files smaller. + +namespace __xray { + +XRayLogInitStatus basicLoggingInit(size_t BufferSize, size_t BufferMax, +                                   void *Options, size_t OptionsSize); +XRayLogInitStatus basicLoggingFinalize(); + +void basicLoggingHandleArg0RealTSC(int32_t FuncId, XRayEntryType Entry); +void basicLoggingHandleArg0EmulateTSC(int32_t FuncId, XRayEntryType Entry); +void basicLoggingHandleArg1RealTSC(int32_t FuncId, XRayEntryType Entry, +                                   uint64_t Arg1); +void basicLoggingHandleArg1EmulateTSC(int32_t FuncId, XRayEntryType Entry, +                                      uint64_t Arg1); +XRayLogFlushStatus basicLoggingFlush(); +XRayLogInitStatus basicLoggingReset(); + +} // namespace __xray + +#endif // XRAY_XRAY_INMEMORY_LOG_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_buffer_queue.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_buffer_queue.cpp new file mode 100644 index 000000000000..748708ccd0f4 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_buffer_queue.cpp @@ -0,0 +1,237 @@ +//===-- xray_buffer_queue.cpp ----------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Defines the interface for a buffer queue implementation. +// +//===----------------------------------------------------------------------===// +#include "xray_buffer_queue.h" +#include "sanitizer_common/sanitizer_atomic.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_libc.h" +#if !SANITIZER_FUCHSIA +#include "sanitizer_common/sanitizer_posix.h" +#endif +#include "xray_allocator.h" +#include "xray_defs.h" +#include <memory> +#include <sys/mman.h> + +using namespace __xray; + +namespace { + +BufferQueue::ControlBlock *allocControlBlock(size_t Size, size_t Count) { +  auto B = +      allocateBuffer((sizeof(BufferQueue::ControlBlock) - 1) + (Size * Count)); +  return B == nullptr ? 
nullptr +                      : reinterpret_cast<BufferQueue::ControlBlock *>(B); +} + +void deallocControlBlock(BufferQueue::ControlBlock *C, size_t Size, +                         size_t Count) { +  deallocateBuffer(reinterpret_cast<unsigned char *>(C), +                   (sizeof(BufferQueue::ControlBlock) - 1) + (Size * Count)); +} + +void decRefCount(BufferQueue::ControlBlock *C, size_t Size, size_t Count) { +  if (C == nullptr) +    return; +  if (atomic_fetch_sub(&C->RefCount, 1, memory_order_acq_rel) == 1) +    deallocControlBlock(C, Size, Count); +} + +void incRefCount(BufferQueue::ControlBlock *C) { +  if (C == nullptr) +    return; +  atomic_fetch_add(&C->RefCount, 1, memory_order_acq_rel); +} + +// We use a struct to ensure that we are allocating one atomic_uint64_t per +// cache line. This allows us to not worry about false-sharing among atomic +// objects being updated (constantly) by different threads. +struct ExtentsPadded { +  union { +    atomic_uint64_t Extents; +    unsigned char Storage[kCacheLineSize]; +  }; +}; + +constexpr size_t kExtentsSize = sizeof(ExtentsPadded); + +} // namespace + +BufferQueue::ErrorCode BufferQueue::init(size_t BS, size_t BC) { +  SpinMutexLock Guard(&Mutex); + +  if (!finalizing()) +    return BufferQueue::ErrorCode::AlreadyInitialized; + +  cleanupBuffers(); + +  bool Success = false; +  BufferSize = BS; +  BufferCount = BC; + +  BackingStore = allocControlBlock(BufferSize, BufferCount); +  if (BackingStore == nullptr) +    return BufferQueue::ErrorCode::NotEnoughMemory; + +  auto CleanupBackingStore = at_scope_exit([&, this] { +    if (Success) +      return; +    deallocControlBlock(BackingStore, BufferSize, BufferCount); +    BackingStore = nullptr; +  }); + +  // Initialize enough atomic_uint64_t instances, each +  ExtentsBackingStore = allocControlBlock(kExtentsSize, BufferCount); +  if (ExtentsBackingStore == nullptr) +    return BufferQueue::ErrorCode::NotEnoughMemory; + +  auto CleanupExtentsBackingStore = at_scope_exit([&, this] { +    if (Success) +      return; +    deallocControlBlock(ExtentsBackingStore, kExtentsSize, BufferCount); +    ExtentsBackingStore = nullptr; +  }); + +  Buffers = initArray<BufferRep>(BufferCount); +  if (Buffers == nullptr) +    return BufferQueue::ErrorCode::NotEnoughMemory; + +  // At this point we increment the generation number to associate the buffers +  // to the new generation. +  atomic_fetch_add(&Generation, 1, memory_order_acq_rel); + +  // First, we initialize the refcount in the ControlBlock, which we treat as +  // being at the start of the BackingStore pointer. +  atomic_store(&BackingStore->RefCount, 1, memory_order_release); +  atomic_store(&ExtentsBackingStore->RefCount, 1, memory_order_release); + +  // Then we initialise the individual buffers that sub-divide the whole backing +  // store. Each buffer will start at the `Data` member of the ControlBlock, and +  // will be offsets from these locations. 
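+  //
+  // (Illustrative: with BufferSize = 4096 and BufferCount = 2, Buffers[1]
+  // gets Data == &BackingStore->Data + 4096 and its extents word at
+  // &ExtentsBackingStore->Data + kExtentsSize.)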
+  for (size_t i = 0; i < BufferCount; ++i) { +    auto &T = Buffers[i]; +    auto &Buf = T.Buff; +    auto *E = reinterpret_cast<ExtentsPadded *>(&ExtentsBackingStore->Data + +                                                (kExtentsSize * i)); +    Buf.Extents = &E->Extents; +    atomic_store(Buf.Extents, 0, memory_order_release); +    Buf.Generation = generation(); +    Buf.Data = &BackingStore->Data + (BufferSize * i); +    Buf.Size = BufferSize; +    Buf.BackingStore = BackingStore; +    Buf.ExtentsBackingStore = ExtentsBackingStore; +    Buf.Count = BufferCount; +    T.Used = false; +  } + +  Next = Buffers; +  First = Buffers; +  LiveBuffers = 0; +  atomic_store(&Finalizing, 0, memory_order_release); +  Success = true; +  return BufferQueue::ErrorCode::Ok; +} + +BufferQueue::BufferQueue(size_t B, size_t N, +                         bool &Success) XRAY_NEVER_INSTRUMENT +    : BufferSize(B), +      BufferCount(N), +      Mutex(), +      Finalizing{1}, +      BackingStore(nullptr), +      ExtentsBackingStore(nullptr), +      Buffers(nullptr), +      Next(Buffers), +      First(Buffers), +      LiveBuffers(0), +      Generation{0} { +  Success = init(B, N) == BufferQueue::ErrorCode::Ok; +} + +BufferQueue::ErrorCode BufferQueue::getBuffer(Buffer &Buf) { +  if (atomic_load(&Finalizing, memory_order_acquire)) +    return ErrorCode::QueueFinalizing; + +  BufferRep *B = nullptr; +  { +    SpinMutexLock Guard(&Mutex); +    if (LiveBuffers == BufferCount) +      return ErrorCode::NotEnoughMemory; +    B = Next++; +    if (Next == (Buffers + BufferCount)) +      Next = Buffers; +    ++LiveBuffers; +  } + +  incRefCount(BackingStore); +  incRefCount(ExtentsBackingStore); +  Buf = B->Buff; +  Buf.Generation = generation(); +  B->Used = true; +  return ErrorCode::Ok; +} + +BufferQueue::ErrorCode BufferQueue::releaseBuffer(Buffer &Buf) { +  // Check whether the buffer being referred to is within the bounds of the +  // backing store's range. +  BufferRep *B = nullptr; +  { +    SpinMutexLock Guard(&Mutex); +    if (Buf.Generation != generation() || LiveBuffers == 0) { +      Buf = {}; +      decRefCount(Buf.BackingStore, Buf.Size, Buf.Count); +      decRefCount(Buf.ExtentsBackingStore, kExtentsSize, Buf.Count); +      return BufferQueue::ErrorCode::Ok; +    } + +    if (Buf.Data < &BackingStore->Data || +        Buf.Data > &BackingStore->Data + (BufferCount * BufferSize)) +      return BufferQueue::ErrorCode::UnrecognizedBuffer; + +    --LiveBuffers; +    B = First++; +    if (First == (Buffers + BufferCount)) +      First = Buffers; +  } + +  // Now that the buffer has been released, we mark it as "used". 
+  B->Buff = Buf; +  B->Used = true; +  decRefCount(Buf.BackingStore, Buf.Size, Buf.Count); +  decRefCount(Buf.ExtentsBackingStore, kExtentsSize, Buf.Count); +  atomic_store(B->Buff.Extents, atomic_load(Buf.Extents, memory_order_acquire), +               memory_order_release); +  Buf = {}; +  return ErrorCode::Ok; +} + +BufferQueue::ErrorCode BufferQueue::finalize() { +  if (atomic_exchange(&Finalizing, 1, memory_order_acq_rel)) +    return ErrorCode::QueueFinalizing; +  return ErrorCode::Ok; +} + +void BufferQueue::cleanupBuffers() { +  for (auto B = Buffers, E = Buffers + BufferCount; B != E; ++B) +    B->~BufferRep(); +  deallocateBuffer(Buffers, BufferCount); +  decRefCount(BackingStore, BufferSize, BufferCount); +  decRefCount(ExtentsBackingStore, kExtentsSize, BufferCount); +  BackingStore = nullptr; +  ExtentsBackingStore = nullptr; +  Buffers = nullptr; +  BufferCount = 0; +  BufferSize = 0; +} + +BufferQueue::~BufferQueue() { cleanupBuffers(); } diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_buffer_queue.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_buffer_queue.h new file mode 100644 index 000000000000..8d33f73576b5 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_buffer_queue.h @@ -0,0 +1,280 @@ +//===-- xray_buffer_queue.h ------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Defines the interface for a buffer queue implementation. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_BUFFER_QUEUE_H +#define XRAY_BUFFER_QUEUE_H + +#include "sanitizer_common/sanitizer_atomic.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_mutex.h" +#include "xray_defs.h" +#include <cstddef> +#include <cstdint> + +namespace __xray { + +/// BufferQueue implements a circular queue of fixed sized buffers (much like a +/// freelist) but is concerned with making it quick to initialise, finalise, and +/// get from or return buffers to the queue. This is one key component of the +/// "flight data recorder" (FDR) mode to support ongoing XRay function call +/// trace collection. +class BufferQueue { +public: +  /// ControlBlock represents the memory layout of how we interpret the backing +  /// store for all buffers and extents managed by a BufferQueue instance. The +  /// ControlBlock has the reference count as the first member, sized according +  /// to platform-specific cache-line size. We never use the Buffer member of +  /// the union, which is only there for compiler-supported alignment and +  /// sizing. +  /// +  /// This ensures that the `Data` member will be placed at least kCacheLineSize +  /// bytes from the beginning of the structure. +  struct ControlBlock { +    union { +      atomic_uint64_t RefCount; +      char Buffer[kCacheLineSize]; +    }; + +    /// We need to make this size 1, to conform to the C++ rules for array data +    /// members. Typically, we want to subtract this 1 byte for sizing +    /// information. 
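+    /// (Illustrative: allocControlBlock(Size, Count) in xray_buffer_queue.cpp
+    /// allocates (sizeof(ControlBlock) - 1) + Size * Count bytes, so Data
+    /// addresses the whole co-allocated region.)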
+    char Data[1]; +  }; + +  struct Buffer { +    atomic_uint64_t *Extents = nullptr; +    uint64_t Generation{0}; +    void *Data = nullptr; +    size_t Size = 0; + +  private: +    friend class BufferQueue; +    ControlBlock *BackingStore = nullptr; +    ControlBlock *ExtentsBackingStore = nullptr; +    size_t Count = 0; +  }; + +  struct BufferRep { +    // The managed buffer. +    Buffer Buff; + +    // This is true if the buffer has been returned to the available queue, and +    // is considered "used" by another thread. +    bool Used = false; +  }; + +private: +  // This models a ForwardIterator. |T| Must be either a `Buffer` or `const +  // Buffer`. Note that we only advance to the "used" buffers, when +  // incrementing, so that at dereference we're always at a valid point. +  template <class T> class Iterator { +  public: +    BufferRep *Buffers = nullptr; +    size_t Offset = 0; +    size_t Max = 0; + +    Iterator &operator++() { +      DCHECK_NE(Offset, Max); +      do { +        ++Offset; +      } while (Offset != Max && !Buffers[Offset].Used); +      return *this; +    } + +    Iterator operator++(int) { +      Iterator C = *this; +      ++(*this); +      return C; +    } + +    T &operator*() const { return Buffers[Offset].Buff; } + +    T *operator->() const { return &(Buffers[Offset].Buff); } + +    Iterator(BufferRep *Root, size_t O, size_t M) XRAY_NEVER_INSTRUMENT +        : Buffers(Root), +          Offset(O), +          Max(M) { +      // We want to advance to the first Offset where the 'Used' property is +      // true, or to the end of the list/queue. +      while (Offset != Max && !Buffers[Offset].Used) { +        ++Offset; +      } +    } + +    Iterator() = default; +    Iterator(const Iterator &) = default; +    Iterator(Iterator &&) = default; +    Iterator &operator=(const Iterator &) = default; +    Iterator &operator=(Iterator &&) = default; +    ~Iterator() = default; + +    template <class V> +    friend bool operator==(const Iterator &L, const Iterator<V> &R) { +      DCHECK_EQ(L.Max, R.Max); +      return L.Buffers == R.Buffers && L.Offset == R.Offset; +    } + +    template <class V> +    friend bool operator!=(const Iterator &L, const Iterator<V> &R) { +      return !(L == R); +    } +  }; + +  // Size of each individual Buffer. +  size_t BufferSize; + +  // Amount of pre-allocated buffers. +  size_t BufferCount; + +  SpinMutex Mutex; +  atomic_uint8_t Finalizing; + +  // The collocated ControlBlock and buffer storage. +  ControlBlock *BackingStore; + +  // The collocated ControlBlock and extents storage. +  ControlBlock *ExtentsBackingStore; + +  // A dynamically allocated array of BufferRep instances. +  BufferRep *Buffers; + +  // Pointer to the next buffer to be handed out. +  BufferRep *Next; + +  // Pointer to the entry in the array where the next released buffer will be +  // placed. +  BufferRep *First; + +  // Count of buffers that have been handed out through 'getBuffer'. +  size_t LiveBuffers; + +  // We use a generation number to identify buffers and which generation they're +  // associated with. +  atomic_uint64_t Generation; + +  /// Releases references to the buffers backed by the current buffer queue. 
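The release path described above, together with the incRefCount/decRefCount calls in getBuffer/releaseBuffer earlier, is a plain atomic reference count over the control blocks; a self-contained sketch of that idiom, with hypothetical names:

    #include <atomic>
    #include <cstdio>
    #include <cstdlib>
    #include <new>

    struct Counted {
      std::atomic<unsigned long> RefCount{0};
    };

    // Hypothetical analogues of the queue's incRefCount/decRefCount helpers:
    // whoever drops the last reference frees the block.
    void incRef(Counted *C) {
      if (C != nullptr)
        C->RefCount.fetch_add(1, std::memory_order_acq_rel);
    }

    void decRef(Counted *C) {
      if (C == nullptr)
        return;
      if (C->RefCount.fetch_sub(1, std::memory_order_acq_rel) == 1) {
        std::puts("last reference dropped; freeing backing store");
        C->~Counted();
        std::free(C);
      }
    }

    int main() {
      auto *C = new (std::malloc(sizeof(Counted))) Counted;
      incRef(C);  // the queue's own reference
      incRef(C);  // a handed-out buffer's reference
      decRef(C);  // buffer released
      decRef(C);  // queue torn down: storage is freed here
    }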
+  void cleanupBuffers(); + +public: +  enum class ErrorCode : unsigned { +    Ok, +    NotEnoughMemory, +    QueueFinalizing, +    UnrecognizedBuffer, +    AlreadyFinalized, +    AlreadyInitialized, +  }; + +  static const char *getErrorString(ErrorCode E) { +    switch (E) { +    case ErrorCode::Ok: +      return "(none)"; +    case ErrorCode::NotEnoughMemory: +      return "no available buffers in the queue"; +    case ErrorCode::QueueFinalizing: +      return "queue already finalizing"; +    case ErrorCode::UnrecognizedBuffer: +      return "buffer being returned not owned by buffer queue"; +    case ErrorCode::AlreadyFinalized: +      return "queue already finalized"; +    case ErrorCode::AlreadyInitialized: +      return "queue already initialized"; +    } +    return "unknown error"; +  } + +  /// Initialise a queue of size |N| with buffers of size |B|. We report success +  /// through |Success|. +  BufferQueue(size_t B, size_t N, bool &Success); + +  /// Updates |Buf| to contain the pointer to an appropriate buffer. Returns an +  /// error in case there are no available buffers to return when we will run +  /// over the upper bound for the total buffers. +  /// +  /// Requirements: +  ///   - BufferQueue is not finalising. +  /// +  /// Returns: +  ///   - ErrorCode::NotEnoughMemory on exceeding MaxSize. +  ///   - ErrorCode::Ok when we find a Buffer. +  ///   - ErrorCode::QueueFinalizing or ErrorCode::AlreadyFinalized on +  ///     a finalizing/finalized BufferQueue. +  ErrorCode getBuffer(Buffer &Buf); + +  /// Updates |Buf| to point to nullptr, with size 0. +  /// +  /// Returns: +  ///   - ErrorCode::Ok when we successfully release the buffer. +  ///   - ErrorCode::UnrecognizedBuffer for when this BufferQueue does not own +  ///     the buffer being released. +  ErrorCode releaseBuffer(Buffer &Buf); + +  /// Initializes the buffer queue, starting a new generation. We can re-set the +  /// size of buffers with |BS| along with the buffer count with |BC|. +  /// +  /// Returns: +  ///   - ErrorCode::Ok when we successfully initialize the buffer. This +  ///   requires that the buffer queue is previously finalized. +  ///   - ErrorCode::AlreadyInitialized when the buffer queue is not finalized. +  ErrorCode init(size_t BS, size_t BC); + +  bool finalizing() const { +    return atomic_load(&Finalizing, memory_order_acquire); +  } + +  uint64_t generation() const { +    return atomic_load(&Generation, memory_order_acquire); +  } + +  /// Returns the configured size of the buffers in the buffer queue. +  size_t ConfiguredBufferSize() const { return BufferSize; } + +  /// Sets the state of the BufferQueue to finalizing, which ensures that: +  /// +  ///   - All subsequent attempts to retrieve a Buffer will fail. +  ///   - All releaseBuffer operations will not fail. +  /// +  /// After a call to finalize succeeds, all subsequent calls to finalize will +  /// fail with ErrorCode::QueueFinalizing. +  ErrorCode finalize(); + +  /// Applies the provided function F to each Buffer in the queue, only if the +  /// Buffer is marked 'used' (i.e. has been the result of getBuffer(...) and a +  /// releaseBuffer(...) operation). +  template <class F> void apply(F Fn) XRAY_NEVER_INSTRUMENT { +    SpinMutexLock G(&Mutex); +    for (auto I = begin(), E = end(); I != E; ++I) +      Fn(*I); +  } + +  using const_iterator = Iterator<const Buffer>; +  using iterator = Iterator<Buffer>; + +  /// Provides iterator access to the raw Buffer instances. 
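For example, apply(...) above is how the flush path later walks only the "used" slots while holding the queue lock; a cut-down, standalone analogue of that traversal (a hypothetical MiniQueue, not the real class):

    #include <cstdio>
    #include <mutex>
    #include <vector>

    // Cut-down analogue of BufferQueue::apply: run a callback over every slot
    // marked 'used', holding the queue lock for the whole traversal.
    struct MiniQueue {
      struct Rep {
        int Payload = 0;
        bool Used = false;
      };
      std::vector<Rep> Slots;
      std::mutex Mu;

      template <class F> void apply(F Fn) {
        std::lock_guard<std::mutex> G(Mu);
        for (auto &R : Slots)
          if (R.Used)
            Fn(R.Payload);
      }
    };

    int main() {
      MiniQueue Q;
      Q.Slots.resize(4);
      Q.Slots[1] = {42, true};
      Q.Slots[3] = {7, true};
      Q.apply([](int P) { std::printf("flushing payload %d\n", P); });
    }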
+  iterator begin() const { return iterator(Buffers, 0, BufferCount); } +  const_iterator cbegin() const { +    return const_iterator(Buffers, 0, BufferCount); +  } +  iterator end() const { return iterator(Buffers, BufferCount, BufferCount); } +  const_iterator cend() const { +    return const_iterator(Buffers, BufferCount, BufferCount); +  } + +  // Cleans up allocated buffers. +  ~BufferQueue(); +}; + +} // namespace __xray + +#endif // XRAY_BUFFER_QUEUE_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_defs.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_defs.h new file mode 100644 index 000000000000..2da03c3c3451 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_defs.h @@ -0,0 +1,31 @@ +//===-- xray_defs.h ---------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// Common definitions useful for XRay sources. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_XRAY_DEFS_H +#define XRAY_XRAY_DEFS_H + +#if XRAY_SUPPORTED +#define XRAY_NEVER_INSTRUMENT __attribute__((xray_never_instrument)) +#else +#define XRAY_NEVER_INSTRUMENT +#endif + +#if SANITIZER_NETBSD +// NetBSD: thread_local is not aligned properly, and the code relying +// on it segfaults +#define XRAY_TLS_ALIGNAS(x) +#define XRAY_HAS_TLS_ALIGNAS 0 +#else +#define XRAY_TLS_ALIGNAS(x) alignas(x) +#define XRAY_HAS_TLS_ALIGNAS 1 +#endif + +#endif  // XRAY_XRAY_DEFS_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_controller.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_controller.h new file mode 100644 index 000000000000..28a3546caa7b --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_controller.h @@ -0,0 +1,372 @@ +//===-- xray_fdr_controller.h ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. 
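A quick aside on XRAY_NEVER_INSTRUMENT from xray_defs.h above: it expands to the Clang xray_never_instrument attribute, which keeps the runtime's own functions out of the instrumentation map. A minimal user-level sketch of the two XRay attributes (assuming a Clang build; without -fxray-instrument they are inert):

    // Minimal sketch; compile with: clang++ -fxray-instrument -c example.cpp
    // [[clang::xray_always_instrument]] forces sleds even for tiny functions;
    // [[clang::xray_never_instrument]] is what XRAY_NEVER_INSTRUMENT expands to.
    [[clang::xray_always_instrument]] int traced(int x) { return x + 1; }
    [[clang::xray_never_instrument]] int untraced(int x) { return x * 2; }

    int main() { return traced(1) + untraced(2); }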
+// +//===----------------------------------------------------------------------===// +#ifndef COMPILER_RT_LIB_XRAY_XRAY_FDR_CONTROLLER_H_ +#define COMPILER_RT_LIB_XRAY_XRAY_FDR_CONTROLLER_H_ + +#include <limits> +#include <time.h> + +#include "xray/xray_interface.h" +#include "xray/xray_records.h" +#include "xray_buffer_queue.h" +#include "xray_fdr_log_writer.h" + +namespace __xray { + +template <size_t Version = 5> class FDRController { +  BufferQueue *BQ; +  BufferQueue::Buffer &B; +  FDRLogWriter &W; +  int (*WallClockReader)(clockid_t, struct timespec *) = 0; +  uint64_t CycleThreshold = 0; + +  uint64_t LastFunctionEntryTSC = 0; +  uint64_t LatestTSC = 0; +  uint16_t LatestCPU = 0; +  tid_t TId = 0; +  pid_t PId = 0; +  bool First = true; + +  uint32_t UndoableFunctionEnters = 0; +  uint32_t UndoableTailExits = 0; + +  bool finalized() const XRAY_NEVER_INSTRUMENT { +    return BQ == nullptr || BQ->finalizing(); +  } + +  bool hasSpace(size_t S) XRAY_NEVER_INSTRUMENT { +    return B.Data != nullptr && B.Generation == BQ->generation() && +           W.getNextRecord() + S <= reinterpret_cast<char *>(B.Data) + B.Size; +  } + +  constexpr int32_t mask(int32_t FuncId) const XRAY_NEVER_INSTRUMENT { +    return FuncId & ((1 << 29) - 1); +  } + +  bool getNewBuffer() XRAY_NEVER_INSTRUMENT { +    if (BQ->getBuffer(B) != BufferQueue::ErrorCode::Ok) +      return false; + +    W.resetRecord(); +    DCHECK_EQ(W.getNextRecord(), B.Data); +    LatestTSC = 0; +    LatestCPU = 0; +    First = true; +    UndoableFunctionEnters = 0; +    UndoableTailExits = 0; +    atomic_store(B.Extents, 0, memory_order_release); +    return true; +  } + +  bool setupNewBuffer() XRAY_NEVER_INSTRUMENT { +    if (finalized()) +      return false; + +    DCHECK(hasSpace(sizeof(MetadataRecord) * 3)); +    TId = GetTid(); +    PId = internal_getpid(); +    struct timespec TS { +      0, 0 +    }; +    WallClockReader(CLOCK_MONOTONIC, &TS); + +    MetadataRecord Metadata[] = { +        // Write out a MetadataRecord to signify that this is the start of a new +        // buffer, associated with a particular thread, with a new CPU. For the +        // data, we have 15 bytes to squeeze as much information as we can. At +        // this point we only write down the following bytes: +        //   - Thread ID (tid_t, cast to 4 bytes type due to Darwin being 8 +        //   bytes) +        createMetadataRecord<MetadataRecord::RecordKinds::NewBuffer>( +            static_cast<int32_t>(TId)), + +        // Also write the WalltimeMarker record. We only really need microsecond +        // precision here, and enforce across platforms that we need 64-bit +        // seconds and 32-bit microseconds encoded in the Metadata record. +        createMetadataRecord<MetadataRecord::RecordKinds::WalltimeMarker>( +            static_cast<int64_t>(TS.tv_sec), +            static_cast<int32_t>(TS.tv_nsec / 1000)), + +        // Also write the Pid record. 
+        createMetadataRecord<MetadataRecord::RecordKinds::Pid>( +            static_cast<int32_t>(PId)), +    }; + +    if (finalized()) +      return false; +    return W.writeMetadataRecords(Metadata); +  } + +  bool prepareBuffer(size_t S) XRAY_NEVER_INSTRUMENT { +    if (finalized()) +      return returnBuffer(); + +    if (UNLIKELY(!hasSpace(S))) { +      if (!returnBuffer()) +        return false; +      if (!getNewBuffer()) +        return false; +      if (!setupNewBuffer()) +        return false; +    } + +    if (First) { +      First = false; +      W.resetRecord(); +      atomic_store(B.Extents, 0, memory_order_release); +      return setupNewBuffer(); +    } + +    return true; +  } + +  bool returnBuffer() XRAY_NEVER_INSTRUMENT { +    if (BQ == nullptr) +      return false; + +    First = true; +    if (finalized()) { +      BQ->releaseBuffer(B); // ignore result. +      return false; +    } + +    return BQ->releaseBuffer(B) == BufferQueue::ErrorCode::Ok; +  } + +  enum class PreambleResult { NoChange, WroteMetadata, InvalidBuffer }; +  PreambleResult recordPreamble(uint64_t TSC, +                                uint16_t CPU) XRAY_NEVER_INSTRUMENT { +    if (UNLIKELY(LatestCPU != CPU || LatestTSC == 0)) { +      // We update our internal tracking state for the Latest TSC and CPU we've +      // seen, then write out the appropriate metadata and function records. +      LatestTSC = TSC; +      LatestCPU = CPU; + +      if (B.Generation != BQ->generation()) +        return PreambleResult::InvalidBuffer; + +      W.writeMetadata<MetadataRecord::RecordKinds::NewCPUId>(CPU, TSC); +      return PreambleResult::WroteMetadata; +    } + +    DCHECK_EQ(LatestCPU, CPU); + +    if (UNLIKELY(LatestTSC > TSC || +                 TSC - LatestTSC > +                     uint64_t{std::numeric_limits<int32_t>::max()})) { +      // Either the TSC has wrapped around from the last TSC we've seen or the +      // delta is too large to fit in a 32-bit signed integer, so we write a +      // wrap-around record. +      LatestTSC = TSC; + +      if (B.Generation != BQ->generation()) +        return PreambleResult::InvalidBuffer; + +      W.writeMetadata<MetadataRecord::RecordKinds::TSCWrap>(TSC); +      return PreambleResult::WroteMetadata; +    } + +    return PreambleResult::NoChange; +  } + +  bool rewindRecords(int32_t FuncId, uint64_t TSC, +                     uint16_t CPU) XRAY_NEVER_INSTRUMENT { +    // Undo one enter record, because at this point we are either at the state +    // of: +    // - We are exiting a function that we recently entered. +    // - We are exiting a function that was the result of a sequence of tail +    //   exits, and we can check whether the tail exits can be re-wound. 
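The recordPreamble logic above reduces to a three-way classification of the incoming (TSC, CPU) pair; a standalone sketch of just that decision, with hypothetical names:

    #include <cstdint>
    #include <cstdio>
    #include <limits>

    enum class Preamble { NoChange, NewCPU, TSCWrap };

    // Mirrors the decision in recordPreamble: a NewCPUId record on first use
    // or CPU migration, a TSCWrap record when the delta is negative or wider
    // than 32 bits, and otherwise no preamble at all.
    Preamble classify(uint64_t LatestTSC, uint16_t LatestCPU, uint64_t TSC,
                      uint16_t CPU) {
      if (LatestCPU != CPU || LatestTSC == 0)
        return Preamble::NewCPU;
      if (LatestTSC > TSC ||
          TSC - LatestTSC > uint64_t{std::numeric_limits<int32_t>::max()})
        return Preamble::TSCWrap;
      return Preamble::NoChange;
    }

    int main() {
      std::printf("%d\n", int(classify(100, 0, 150, 0)));                 // 0: NoChange
      std::printf("%d\n", int(classify(100, 0, 150, 1)));                 // 1: NewCPU
      std::printf("%d\n", int(classify(100, 0, 100 + (1ull << 40), 0)));  // 2: TSCWrap
    }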
+    // +    FunctionRecord F; +    W.undoWrites(sizeof(FunctionRecord)); +    if (B.Generation != BQ->generation()) +      return false; +    internal_memcpy(&F, W.getNextRecord(), sizeof(FunctionRecord)); + +    DCHECK(F.RecordKind == +               uint8_t(FunctionRecord::RecordKinds::FunctionEnter) && +           "Expected to find function entry recording when rewinding."); +    DCHECK_EQ(F.FuncId, FuncId & ~(0x0F << 28)); + +    LatestTSC -= F.TSCDelta; +    if (--UndoableFunctionEnters != 0) { +      LastFunctionEntryTSC -= F.TSCDelta; +      return true; +    } + +    LastFunctionEntryTSC = 0; +    auto RewindingTSC = LatestTSC; +    auto RewindingRecordPtr = W.getNextRecord() - sizeof(FunctionRecord); +    while (UndoableTailExits) { +      if (B.Generation != BQ->generation()) +        return false; +      internal_memcpy(&F, RewindingRecordPtr, sizeof(FunctionRecord)); +      DCHECK_EQ(F.RecordKind, +                uint8_t(FunctionRecord::RecordKinds::FunctionTailExit)); +      RewindingTSC -= F.TSCDelta; +      RewindingRecordPtr -= sizeof(FunctionRecord); +      if (B.Generation != BQ->generation()) +        return false; +      internal_memcpy(&F, RewindingRecordPtr, sizeof(FunctionRecord)); + +      // This tail call exceeded the threshold duration. It will not be erased. +      if ((TSC - RewindingTSC) >= CycleThreshold) { +        UndoableTailExits = 0; +        return true; +      } + +      --UndoableTailExits; +      W.undoWrites(sizeof(FunctionRecord) * 2); +      LatestTSC = RewindingTSC; +    } +    return true; +  } + +public: +  template <class WallClockFunc> +  FDRController(BufferQueue *BQ, BufferQueue::Buffer &B, FDRLogWriter &W, +                WallClockFunc R, uint64_t C) XRAY_NEVER_INSTRUMENT +      : BQ(BQ), +        B(B), +        W(W), +        WallClockReader(R), +        CycleThreshold(C) {} + +  bool functionEnter(int32_t FuncId, uint64_t TSC, +                     uint16_t CPU) XRAY_NEVER_INSTRUMENT { +    if (finalized() || +        !prepareBuffer(sizeof(MetadataRecord) + sizeof(FunctionRecord))) +      return returnBuffer(); + +    auto PreambleStatus = recordPreamble(TSC, CPU); +    if (PreambleStatus == PreambleResult::InvalidBuffer) +      return returnBuffer(); + +    if (PreambleStatus == PreambleResult::WroteMetadata) { +      UndoableFunctionEnters = 1; +      UndoableTailExits = 0; +    } else { +      ++UndoableFunctionEnters; +    } + +    auto Delta = TSC - LatestTSC; +    LastFunctionEntryTSC = TSC; +    LatestTSC = TSC; +    return W.writeFunction(FDRLogWriter::FunctionRecordKind::Enter, +                           mask(FuncId), Delta); +  } + +  bool functionTailExit(int32_t FuncId, uint64_t TSC, +                        uint16_t CPU) XRAY_NEVER_INSTRUMENT { +    if (finalized()) +      return returnBuffer(); + +    if (!prepareBuffer(sizeof(MetadataRecord) + sizeof(FunctionRecord))) +      return returnBuffer(); + +    auto PreambleStatus = recordPreamble(TSC, CPU); +    if (PreambleStatus == PreambleResult::InvalidBuffer) +      return returnBuffer(); + +    if (PreambleStatus == PreambleResult::NoChange && +        UndoableFunctionEnters != 0 && +        TSC - LastFunctionEntryTSC < CycleThreshold) +      return rewindRecords(FuncId, TSC, CPU); + +    UndoableTailExits = UndoableFunctionEnters ? 
UndoableTailExits + 1 : 0; +    UndoableFunctionEnters = 0; +    auto Delta = TSC - LatestTSC; +    LatestTSC = TSC; +    return W.writeFunction(FDRLogWriter::FunctionRecordKind::TailExit, +                           mask(FuncId), Delta); +  } + +  bool functionEnterArg(int32_t FuncId, uint64_t TSC, uint16_t CPU, +                        uint64_t Arg) XRAY_NEVER_INSTRUMENT { +    if (finalized() || +        !prepareBuffer((2 * sizeof(MetadataRecord)) + sizeof(FunctionRecord)) || +        recordPreamble(TSC, CPU) == PreambleResult::InvalidBuffer) +      return returnBuffer(); + +    auto Delta = TSC - LatestTSC; +    LatestTSC = TSC; +    LastFunctionEntryTSC = 0; +    UndoableFunctionEnters = 0; +    UndoableTailExits = 0; + +    return W.writeFunctionWithArg(FDRLogWriter::FunctionRecordKind::EnterArg, +                                  mask(FuncId), Delta, Arg); +  } + +  bool functionExit(int32_t FuncId, uint64_t TSC, +                    uint16_t CPU) XRAY_NEVER_INSTRUMENT { +    if (finalized() || +        !prepareBuffer(sizeof(MetadataRecord) + sizeof(FunctionRecord))) +      return returnBuffer(); + +    auto PreambleStatus = recordPreamble(TSC, CPU); +    if (PreambleStatus == PreambleResult::InvalidBuffer) +      return returnBuffer(); + +    if (PreambleStatus == PreambleResult::NoChange && +        UndoableFunctionEnters != 0 && +        TSC - LastFunctionEntryTSC < CycleThreshold) +      return rewindRecords(FuncId, TSC, CPU); + +    auto Delta = TSC - LatestTSC; +    LatestTSC = TSC; +    UndoableFunctionEnters = 0; +    UndoableTailExits = 0; +    return W.writeFunction(FDRLogWriter::FunctionRecordKind::Exit, mask(FuncId), +                           Delta); +  } + +  bool customEvent(uint64_t TSC, uint16_t CPU, const void *Event, +                   int32_t EventSize) XRAY_NEVER_INSTRUMENT { +    if (finalized() || +        !prepareBuffer((2 * sizeof(MetadataRecord)) + EventSize) || +        recordPreamble(TSC, CPU) == PreambleResult::InvalidBuffer) +      return returnBuffer(); + +    auto Delta = TSC - LatestTSC; +    LatestTSC = TSC; +    UndoableFunctionEnters = 0; +    UndoableTailExits = 0; +    return W.writeCustomEvent(Delta, Event, EventSize); +  } + +  bool typedEvent(uint64_t TSC, uint16_t CPU, uint16_t EventType, +                  const void *Event, int32_t EventSize) XRAY_NEVER_INSTRUMENT { +    if (finalized() || +        !prepareBuffer((2 * sizeof(MetadataRecord)) + EventSize) || +        recordPreamble(TSC, CPU) == PreambleResult::InvalidBuffer) +      return returnBuffer(); + +    auto Delta = TSC - LatestTSC; +    LatestTSC = TSC; +    UndoableFunctionEnters = 0; +    UndoableTailExits = 0; +    return W.writeTypedEvent(Delta, EventType, Event, EventSize); +  } + +  bool flush() XRAY_NEVER_INSTRUMENT { +    if (finalized()) { +      returnBuffer(); // ignore result. +      return true; +    } +    return returnBuffer(); +  } +}; + +} // namespace __xray + +#endif // COMPILER-RT_LIB_XRAY_XRAY_FDR_CONTROLLER_H_ diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_flags.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_flags.cpp new file mode 100644 index 000000000000..272b0b7cb1f7 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_flags.cpp @@ -0,0 +1,47 @@ +//===-- xray_fdr_flags.cpp --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
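Before turning to the FDR flag definitions, note what the controller's rewind counters buy: calls shorter than the configured threshold are erased rather than logged. A toy model of that compaction (a simple record stack, not the real buffer format):

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Rec { int32_t FuncId; uint64_t TSC; };

    // Toy model of the controller's rewind: if the matching entry was written
    // recently enough (duration below the threshold), erase it instead of
    // appending an exit record, so short calls leave no trace at all.
    void onExit(std::vector<Rec> &Log, int32_t FuncId, uint64_t TSC,
                uint64_t Threshold) {
      if (!Log.empty() && Log.back().FuncId == FuncId &&
          TSC - Log.back().TSC < Threshold) {
        Log.pop_back();  // rewind: drop the enter record
        return;
      }
      Log.push_back({-FuncId, TSC});  // negative id marks an exit, toy-style
    }

    int main() {
      std::vector<Rec> Log;
      Log.push_back({1, 100});   // enter f1
      onExit(Log, 1, 140, 100);  // 40 cycles < 100: both records vanish
      Log.push_back({2, 200});   // enter f2
      onExit(Log, 2, 900, 100);  // 700 cycles >= 100: exit is recorded
      std::printf("%zu records survive\n", Log.size());  // prints 2
    }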
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay FDR flag parsing logic. +//===----------------------------------------------------------------------===// + +#include "xray_fdr_flags.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_libc.h" +#include "xray_defs.h" + +using namespace __sanitizer; + +namespace __xray { + +FDRFlags xray_fdr_flags_dont_use_directly; // use via fdrFlags(). + +void FDRFlags::setDefaults() XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue; +#include "xray_fdr_flags.inc" +#undef XRAY_FLAG +} + +void registerXRayFDRFlags(FlagParser *P, FDRFlags *F) XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \ +  RegisterFlag(P, #Name, Description, &F->Name); +#include "xray_fdr_flags.inc" +#undef XRAY_FLAG +} + +const char *useCompilerDefinedFDRFlags() XRAY_NEVER_INSTRUMENT { +#ifdef XRAY_FDR_OPTIONS +  return SANITIZER_STRINGIFY(XRAY_FDR_OPTIONS); +#else +  return ""; +#endif +} + +} // namespace __xray diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_flags.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_flags.h new file mode 100644 index 000000000000..d6f00dc48006 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_flags.h @@ -0,0 +1,37 @@ +//===-- xray_fdr_flags.h ---------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This file defines the flags for the flight-data-recorder mode implementation. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FDR_FLAGS_H +#define XRAY_FDR_FLAGS_H + +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_internal_defs.h" + +namespace __xray { + +struct FDRFlags { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name; +#include "xray_fdr_flags.inc" +#undef XRAY_FLAG + +  void setDefaults(); +}; + +extern FDRFlags xray_fdr_flags_dont_use_directly; +extern void registerXRayFDRFlags(FlagParser *P, FDRFlags *F); +const char *useCompilerDefinedFDRFlags(); +inline FDRFlags *fdrFlags() { return &xray_fdr_flags_dont_use_directly; } + +} // namespace __xray + +#endif // XRAY_FDR_FLAGS_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_flags.inc b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_flags.inc new file mode 100644 index 000000000000..6082b7e78521 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_flags.inc @@ -0,0 +1,28 @@ +//===-- xray_fdr_flags.inc --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
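The FDRFlags struct above and the registerXRayFDRFlags helper are both generated by expanding the same flag list twice with different XRAY_FLAG definitions. A self-contained sketch of that X-macro technique (using a list macro in place of re-including the .inc file):

    #include <cstdio>

    // Stand-in for xray_fdr_flags.inc: each entry describes one flag.
    #define MY_FLAGS(X)                                        \
      X(int, buffer_size, 16384, "Size of each buffer.")       \
      X(bool, no_file_flush, false, "Skip writing log files.")

    // Expand once to declare the members...
    struct Flags {
    #define DECLARE(Type, Name, Default, Desc) Type Name;
      MY_FLAGS(DECLARE)
    #undef DECLARE

      // ...and once more to set the defaults.
      void setDefaults() {
    #define SET(Type, Name, Default, Desc) Name = Default;
        MY_FLAGS(SET)
    #undef SET
      }
    };

    int main() {
      Flags F;
      F.setDefaults();
      std::printf("buffer_size=%d no_file_flush=%d\n", F.buffer_size,
                  F.no_file_flush);
    }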
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// XRay FDR Mode runtime flags.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_FLAG
+#error "Define XRAY_FLAG prior to including this file!"
+#endif
+
+// FDR (Flight Data Recorder) Mode logging options.
+XRAY_FLAG(int, func_duration_threshold_us, 5,
+          "FDR logging will try to skip functions that execute for fewer "
+          "microseconds than this threshold.")
+XRAY_FLAG(int, grace_period_ms, 100,
+          "FDR logging will wait this much time in milliseconds before "
+          "actually flushing the log; this gives a chance for threads to "
+          "notice that the log has been finalized and clean up.")
+XRAY_FLAG(int, buffer_size, 16384,
+          "Size of buffers in the circular buffer queue.")
+XRAY_FLAG(int, buffer_max, 100, "Maximum number of buffers in the queue.")
+XRAY_FLAG(bool, no_file_flush, false,
+          "Set to true to not write log files by default.")
diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_log_records.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_log_records.h
new file mode 100644
index 000000000000..7a5d438314af
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_log_records.h
@@ -0,0 +1,75 @@
+//===-- xray_fdr_log_records.h  -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a function call tracing system.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_XRAY_FDR_LOG_RECORDS_H
+#define XRAY_XRAY_FDR_LOG_RECORDS_H
+#include <cstdint>
+
+namespace __xray {
+
+enum class RecordType : uint8_t { Function, Metadata };
+
+// A MetadataRecord encodes the kind of record in its first byte, and has 15
+// additional bytes at the end to hold free-form data.
+struct alignas(16) MetadataRecord {
+  // A MetadataRecord must always have a type of 1.
+  /* RecordType */ uint8_t Type : 1;
+
+  // Each kind of record is represented as a 7-bit value (even though we use an
+  // unsigned 8-bit enum class to do so).
+  enum class RecordKinds : uint8_t {
+    NewBuffer,
+    EndOfBuffer,
+    NewCPUId,
+    TSCWrap,
+    WalltimeMarker,
+    CustomEventMarker,
+    CallArgument,
+    BufferExtents,
+    TypedEventMarker,
+    Pid,
+  };
+
+  // Use 7 bits to identify this record type.
+  /* RecordKinds */ uint8_t RecordKind : 7;
+  char Data[15];
+} __attribute__((packed));
+
+static_assert(sizeof(MetadataRecord) == 16, "Wrong size for MetadataRecord.");
+
+struct alignas(8) FunctionRecord {
+  // A FunctionRecord must always have a type of 0.
+  /* RecordType */ uint8_t Type : 1;
+  enum class RecordKinds {
+    FunctionEnter = 0x00,
+    FunctionExit = 0x01,
+    FunctionTailExit = 0x02,
+  };
+  /* RecordKinds */ uint8_t RecordKind : 3;
+
+  // We only use 28 bits of the function ID, so that we can use as few bytes as
+  // possible. This means we only support 2^28 (268,435,456) unique function ids
+  // in a single binary.
+  int FuncId : 28;
+
+  // We use another 4 bytes to hold the delta from the previous entry's TSC.
+  // In case we've found that the distance is greater than the allowable 32 bits +  // (either because we are running in a different CPU and the TSC might be +  // different then), we should use a MetadataRecord before this FunctionRecord +  // that will contain the full TSC for that CPU, and keep this to 0. +  uint32_t TSCDelta; +} __attribute__((packed)); + +static_assert(sizeof(FunctionRecord) == 8, "Wrong size for FunctionRecord."); + +} // namespace __xray + +#endif // XRAY_XRAY_FDR_LOG_RECORDS_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_log_writer.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_log_writer.h new file mode 100644 index 000000000000..0378663c3907 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_log_writer.h @@ -0,0 +1,231 @@ +//===-- xray_fdr_log_writer.h ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// +#ifndef COMPILER_RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_ +#define COMPILER_RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_ + +#include "xray_buffer_queue.h" +#include "xray_fdr_log_records.h" +#include <functional> +#include <tuple> +#include <type_traits> +#include <utility> + +namespace __xray { + +template <size_t Index> struct SerializerImpl { +  template <class Tuple, +            typename std::enable_if< +                Index<std::tuple_size< +                          typename std::remove_reference<Tuple>::type>::value, +                      int>::type = 0> static void serializeTo(char *Buffer, +                                                              Tuple &&T) { +    auto P = reinterpret_cast<const char *>(&std::get<Index>(T)); +    constexpr auto Size = sizeof(std::get<Index>(T)); +    internal_memcpy(Buffer, P, Size); +    SerializerImpl<Index + 1>::serializeTo(Buffer + Size, +                                           std::forward<Tuple>(T)); +  } + +  template <class Tuple, +            typename std::enable_if< +                Index >= std::tuple_size<typename std::remove_reference< +                             Tuple>::type>::value, +                int>::type = 0> +  static void serializeTo(char *, Tuple &&) {} +}; + +using Serializer = SerializerImpl<0>; + +template <class Tuple, size_t Index> struct AggregateSizesImpl { +  static constexpr size_t value = +      sizeof(typename std::tuple_element<Index, Tuple>::type) + +      AggregateSizesImpl<Tuple, Index - 1>::value; +}; + +template <class Tuple> struct AggregateSizesImpl<Tuple, 0> { +  static constexpr size_t value = +      sizeof(typename std::tuple_element<0, Tuple>::type); +}; + +template <class Tuple> struct AggregateSizes { +  static constexpr size_t value = +      AggregateSizesImpl<Tuple, std::tuple_size<Tuple>::value - 1>::value; +}; + +template <MetadataRecord::RecordKinds Kind, class... DataTypes> +MetadataRecord createMetadataRecord(DataTypes &&... 
Ds) { +  static_assert(AggregateSizes<std::tuple<DataTypes...>>::value <= +                    sizeof(MetadataRecord) - 1, +                "Metadata payload longer than metadata buffer!"); +  MetadataRecord R; +  R.Type = 1; +  R.RecordKind = static_cast<uint8_t>(Kind); +  Serializer::serializeTo(R.Data, +                          std::make_tuple(std::forward<DataTypes>(Ds)...)); +  return R; +} + +class FDRLogWriter { +  BufferQueue::Buffer &Buffer; +  char *NextRecord = nullptr; + +  template <class T> void writeRecord(const T &R) { +    internal_memcpy(NextRecord, reinterpret_cast<const char *>(&R), sizeof(T)); +    NextRecord += sizeof(T); +    // We need this atomic fence here to ensure that other threads attempting to +    // read the bytes in the buffer will see the writes committed before the +    // extents are updated. +    atomic_thread_fence(memory_order_release); +    atomic_fetch_add(Buffer.Extents, sizeof(T), memory_order_acq_rel); +  } + +public: +  explicit FDRLogWriter(BufferQueue::Buffer &B, char *P) +      : Buffer(B), NextRecord(P) { +    DCHECK_NE(Buffer.Data, nullptr); +    DCHECK_NE(NextRecord, nullptr); +  } + +  explicit FDRLogWriter(BufferQueue::Buffer &B) +      : FDRLogWriter(B, static_cast<char *>(B.Data)) {} + +  template <MetadataRecord::RecordKinds Kind, class... Data> +  bool writeMetadata(Data &&... Ds) { +    // TODO: Check boundary conditions: +    // 1) Buffer is full, and cannot handle one metadata record. +    // 2) Buffer queue is finalising. +    writeRecord(createMetadataRecord<Kind>(std::forward<Data>(Ds)...)); +    return true; +  } + +  template <size_t N> size_t writeMetadataRecords(MetadataRecord (&Recs)[N]) { +    constexpr auto Size = sizeof(MetadataRecord) * N; +    internal_memcpy(NextRecord, reinterpret_cast<const char *>(Recs), Size); +    NextRecord += Size; +    // We need this atomic fence here to ensure that other threads attempting to +    // read the bytes in the buffer will see the writes committed before the +    // extents are updated. +    atomic_thread_fence(memory_order_release); +    atomic_fetch_add(Buffer.Extents, Size, memory_order_acq_rel); +    return Size; +  } + +  enum class FunctionRecordKind : uint8_t { +    Enter = 0x00, +    Exit = 0x01, +    TailExit = 0x02, +    EnterArg = 0x03, +  }; + +  bool writeFunction(FunctionRecordKind Kind, int32_t FuncId, int32_t Delta) { +    FunctionRecord R; +    R.Type = 0; +    R.RecordKind = uint8_t(Kind); +    R.FuncId = FuncId; +    R.TSCDelta = Delta; +    writeRecord(R); +    return true; +  } + +  bool writeFunctionWithArg(FunctionRecordKind Kind, int32_t FuncId, +                            int32_t Delta, uint64_t Arg) { +    // We need to write the function with arg into the buffer, and then +    // atomically update the buffer extents. This ensures that any reads +    // synchronised on the buffer extents record will always see the writes +    // that happen before the atomic update. 
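The comment above describes the buffer's publication protocol: payload bytes first, a release fence, then the atomic extents update, with readers pairing an acquire load. A standalone sketch of that pairing (hypothetical names, one writer and one reader thread):

    #include <atomic>
    #include <cstdio>
    #include <cstring>
    #include <thread>

    char Buffer[64];
    std::atomic<unsigned> Extents{0};

    void writer() {
      const char Msg[] = "record";
      std::memcpy(Buffer, Msg, sizeof(Msg));
      // Publish: the release fence plus the RMW order the payload bytes
      // before the new extents value, mirroring writeRecord() above.
      std::atomic_thread_fence(std::memory_order_release);
      Extents.fetch_add(sizeof(Msg), std::memory_order_acq_rel);
    }

    void reader() {
      unsigned N;
      while ((N = Extents.load(std::memory_order_acquire)) == 0) {
      }
      // Every byte in [0, N) is now guaranteed to be visible.
      std::printf("saw %u bytes: %s\n", N, Buffer);
    }

    int main() {
      std::thread W(writer), R(reader);
      W.join();
      R.join();
    }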
+    FunctionRecord R; +    R.Type = 0; +    R.RecordKind = uint8_t(Kind); +    R.FuncId = FuncId; +    R.TSCDelta = Delta; +    MetadataRecord A = +        createMetadataRecord<MetadataRecord::RecordKinds::CallArgument>(Arg); +    NextRecord = reinterpret_cast<char *>(internal_memcpy( +                     NextRecord, reinterpret_cast<char *>(&R), sizeof(R))) + +                 sizeof(R); +    NextRecord = reinterpret_cast<char *>(internal_memcpy( +                     NextRecord, reinterpret_cast<char *>(&A), sizeof(A))) + +                 sizeof(A); +    // We need this atomic fence here to ensure that other threads attempting to +    // read the bytes in the buffer will see the writes committed before the +    // extents are updated. +    atomic_thread_fence(memory_order_release); +    atomic_fetch_add(Buffer.Extents, sizeof(R) + sizeof(A), +                     memory_order_acq_rel); +    return true; +  } + +  bool writeCustomEvent(int32_t Delta, const void *Event, int32_t EventSize) { +    // We write the metadata record and the custom event data into the buffer +    // first, before we atomically update the extents for the buffer. This +    // allows us to ensure that any threads reading the extents of the buffer +    // will only ever see the full metadata and custom event payload accounted +    // (no partial writes accounted). +    MetadataRecord R = +        createMetadataRecord<MetadataRecord::RecordKinds::CustomEventMarker>( +            EventSize, Delta); +    NextRecord = reinterpret_cast<char *>(internal_memcpy( +                     NextRecord, reinterpret_cast<char *>(&R), sizeof(R))) + +                 sizeof(R); +    NextRecord = reinterpret_cast<char *>( +                     internal_memcpy(NextRecord, Event, EventSize)) + +                 EventSize; + +    // We need this atomic fence here to ensure that other threads attempting to +    // read the bytes in the buffer will see the writes committed before the +    // extents are updated. +    atomic_thread_fence(memory_order_release); +    atomic_fetch_add(Buffer.Extents, sizeof(R) + EventSize, +                     memory_order_acq_rel); +    return true; +  } + +  bool writeTypedEvent(int32_t Delta, uint16_t EventType, const void *Event, +                       int32_t EventSize) { +    // We do something similar when writing out typed events, see +    // writeCustomEvent(...) above for details. +    MetadataRecord R = +        createMetadataRecord<MetadataRecord::RecordKinds::TypedEventMarker>( +            EventSize, Delta, EventType); +    NextRecord = reinterpret_cast<char *>(internal_memcpy( +                     NextRecord, reinterpret_cast<char *>(&R), sizeof(R))) + +                 sizeof(R); +    NextRecord = reinterpret_cast<char *>( +                     internal_memcpy(NextRecord, Event, EventSize)) + +                 EventSize; + +    // We need this atomic fence here to ensure that other threads attempting to +    // read the bytes in the buffer will see the writes committed before the +    // extents are updated. 
+    atomic_thread_fence(memory_order_release);
+    atomic_fetch_add(Buffer.Extents, sizeof(R) + EventSize,
+                     memory_order_acq_rel);
+    return true;
+  }
+
+  char *getNextRecord() const { return NextRecord; }
+
+  void resetRecord() {
+    NextRecord = reinterpret_cast<char *>(Buffer.Data);
+    atomic_store(Buffer.Extents, 0, memory_order_release);
+  }
+
+  void undoWrites(size_t B) {
+    DCHECK_GE(NextRecord - B, reinterpret_cast<char *>(Buffer.Data));
+    NextRecord -= B;
+    atomic_fetch_sub(Buffer.Extents, B, memory_order_acq_rel);
+  }
+
+}; // class FDRLogWriter
+
+} // namespace __xray
+
+#endif // COMPILER_RT_LIB_XRAY_XRAY_FDR_LOG_WRITER_H_
diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_logging.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_logging.cpp
new file mode 100644
index 000000000000..7def3565d56a
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_logging.cpp
@@ -0,0 +1,754 @@
+//===-- xray_fdr_logging.cpp -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// Here we implement the Flight Data Recorder mode for XRay, where we use
+// compact structures to store records in memory as well as when writing out the
+// data to files.
+//
+//===----------------------------------------------------------------------===//
+#include "xray_fdr_logging.h"
+#include <cassert>
+#include <cstddef>
+#include <errno.h>
+#include <limits>
+#include <memory>
+#include <pthread.h>
+#include <sys/time.h>
+#include <time.h>
+#include <unistd.h>
+
+#include "sanitizer_common/sanitizer_allocator_internal.h"
+#include "sanitizer_common/sanitizer_atomic.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray/xray_interface.h"
+#include "xray/xray_records.h"
+#include "xray_allocator.h"
+#include "xray_buffer_queue.h"
+#include "xray_defs.h"
+#include "xray_fdr_controller.h"
+#include "xray_fdr_flags.h"
+#include "xray_fdr_log_writer.h"
+#include "xray_flags.h"
+#include "xray_recursion_guard.h"
+#include "xray_tsc.h"
+#include "xray_utils.h"
+
+namespace __xray {
+
+static atomic_sint32_t LoggingStatus = {
+    XRayLogInitStatus::XRAY_LOG_UNINITIALIZED};
+
+namespace {
+
+// Group together thread-local-data in a struct, then hide it behind a function
+// call so that it can be initialized on first use instead of as a global. We
+// force the alignment to 64-bytes for x86 cache line alignment, as this
+// structure is used in the hot path of implementation.
+struct XRAY_TLS_ALIGNAS(64) ThreadLocalData {
+  BufferQueue::Buffer Buffer{};
+  BufferQueue *BQ = nullptr;
+
+  using LogWriterStorage = std::byte[sizeof(FDRLogWriter)];
+  alignas(FDRLogWriter) LogWriterStorage LWStorage;
+  FDRLogWriter *Writer = nullptr;
+
+  using ControllerStorage = std::byte[sizeof(FDRController<>)];
+  alignas(FDRController<>) ControllerStorage CStorage;
+  FDRController<> *Controller = nullptr;
+};
+
+} // namespace
+
+static_assert(std::is_trivially_destructible<ThreadLocalData>::value,
+              "ThreadLocalData must be trivially destructible");
+
+// Use a global pthread key to identify thread-local data for logging.
+static pthread_key_t Key;
+
+// Global BufferQueue.
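The declaration that follows, like the scratch-pads used later in this file, keeps the object in raw static storage and constructs it lazily with placement new, so no constructor or destructor runs at program start or exit. A minimal sketch of the idiom (single-threaded for brevity; the runtime guards initialization differently):

    #include <cstddef>
    #include <cstdio>
    #include <new>

    struct Service {
      int State;
      explicit Service(int S) : State(S) { std::puts("constructed on demand"); }
    };

    // Raw bytes: unlike a global Service, nothing runs at load time or exit.
    alignas(Service) static std::byte Storage[sizeof(Service)];
    static Service *Instance = nullptr;

    Service &get() {
      if (Instance == nullptr)
        Instance = new (&Storage) Service(42);
      return *Instance;
    }

    int main() { std::printf("state=%d\n", get().State); }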
+static std::byte BufferQueueStorage[sizeof(BufferQueue)]; +static BufferQueue *BQ = nullptr; + +// Global thresholds for function durations. +static atomic_uint64_t ThresholdTicks{0}; + +// Global for ticks per second. +static atomic_uint64_t TicksPerSec{0}; + +static atomic_sint32_t LogFlushStatus = { +    XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; + +// This function will initialize the thread-local data structure used by the FDR +// logging implementation and return a reference to it. The implementation +// details require a bit of care to maintain. +// +// First, some requirements on the implementation in general: +// +//   - XRay handlers should not call any memory allocation routines that may +//     delegate to an instrumented implementation. This means functions like +//     malloc() and free() should not be called while instrumenting. +// +//   - We would like to use some thread-local data initialized on first-use of +//     the XRay instrumentation. These allow us to implement unsynchronized +//     routines that access resources associated with the thread. +// +// The implementation here uses a few mechanisms that allow us to provide both +// the requirements listed above. We do this by: +// +//   1. Using a thread-local aligned storage buffer for representing the +//      ThreadLocalData struct. This data will be uninitialized memory by +//      design. +// +//   2. Not requiring a thread exit handler/implementation, keeping the +//      thread-local as purely a collection of references/data that do not +//      require cleanup. +// +// We're doing this to avoid using a `thread_local` object that has a +// non-trivial destructor, because the C++ runtime might call std::malloc(...) +// to register calls to destructors. Deadlocks may arise when, for example, an +// externally provided malloc implementation is XRay instrumented, and +// initializing the thread-locals involves calling into malloc. A malloc +// implementation that does global synchronization might be holding a lock for a +// critical section, calling a function that might be XRay instrumented (and +// thus in turn calling into malloc by virtue of registration of the +// thread_local's destructor). +#if XRAY_HAS_TLS_ALIGNAS +static_assert(alignof(ThreadLocalData) >= 64, +              "ThreadLocalData must be cache line aligned."); +#endif +static ThreadLocalData &getThreadLocalData() { +  alignas(ThreadLocalData) thread_local std::byte +      TLDStorage[sizeof(ThreadLocalData)]; + +  if (pthread_getspecific(Key) == NULL) { +    new (reinterpret_cast<ThreadLocalData *>(&TLDStorage)) ThreadLocalData{}; +    pthread_setspecific(Key, &TLDStorage); +  } + +  return *reinterpret_cast<ThreadLocalData *>(&TLDStorage); +} + +static XRayFileHeader &fdrCommonHeaderInfo() { +  alignas(XRayFileHeader) static std::byte HStorage[sizeof(XRayFileHeader)]; +  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT; +  static bool TSCSupported = true; +  static uint64_t CycleFrequency = NanosecondsPerSecond; +  pthread_once( +      &OnceInit, +[] { +        XRayFileHeader &H = reinterpret_cast<XRayFileHeader &>(HStorage); +        // Version 2 of the log writes the extents of the buffer, instead of +        // relying on an end-of-buffer record. +        // Version 3 includes PID metadata record. +        // Version 4 includes CPU data in the custom event records. 
+        // Version 5 uses relative deltas for custom and typed event records, +        // and removes the CPU data in custom event records (similar to how +        // function records use deltas instead of full TSCs and rely on other +        // metadata records for TSC wraparound and CPU migration). +        H.Version = 5; +        H.Type = FileTypes::FDR_LOG; + +        // Test for required CPU features and cache the cycle frequency +        TSCSupported = probeRequiredCPUFeatures(); +        if (TSCSupported) +          CycleFrequency = getTSCFrequency(); +        H.CycleFrequency = CycleFrequency; + +        // FIXME: Actually check whether we have 'constant_tsc' and +        // 'nonstop_tsc' before setting the values in the header. +        H.ConstantTSC = 1; +        H.NonstopTSC = 1; +      }); +  return reinterpret_cast<XRayFileHeader &>(HStorage); +} + +// This is the iterator implementation, which knows how to handle FDR-mode +// specific buffers. This is used as an implementation of the iterator function +// needed by __xray_set_buffer_iterator(...). It maintains a global state of the +// buffer iteration for the currently installed FDR mode buffers. In particular: +// +//   - If the argument represents the initial state of XRayBuffer ({nullptr, 0}) +//     then the iterator returns the header information. +//   - If the argument represents the header information ({address of header +//     info, size of the header info}) then it returns the first FDR buffer's +//     address and extents. +//   - It will keep returning the next buffer and extents as there are more +//     buffers to process. When the input represents the last buffer, it will +//     return the initial state to signal completion ({nullptr, 0}). +// +// See xray/xray_log_interface.h for more details on the requirements for the +// implementations of __xray_set_buffer_iterator(...) and +// __xray_log_process_buffers(...). +XRayBuffer fdrIterator(const XRayBuffer B) { +  DCHECK(internal_strcmp(__xray_log_get_current_mode(), "xray-fdr") == 0); +  DCHECK(BQ->finalizing()); + +  if (BQ == nullptr || !BQ->finalizing()) { +    if (Verbosity()) +      Report( +          "XRay FDR: Failed global buffer queue is null or not finalizing!\n"); +    return {nullptr, 0}; +  } + +  // We use a global scratch-pad for the header information, which only gets +  // initialized the first time this function is called. We'll update one part +  // of this information with some relevant data (in particular the number of +  // buffers to expect). +  alignas( +      XRayFileHeader) static std::byte HeaderStorage[sizeof(XRayFileHeader)]; +  static pthread_once_t HeaderOnce = PTHREAD_ONCE_INIT; +  pthread_once( +      &HeaderOnce, +[] { +        reinterpret_cast<XRayFileHeader &>(HeaderStorage) = +            fdrCommonHeaderInfo(); +      }); + +  // We use a convenience alias for code referring to Header from here on out. +  auto &Header = reinterpret_cast<XRayFileHeader &>(HeaderStorage); +  if (B.Data == nullptr && B.Size == 0) { +    Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()}; +    return XRayBuffer{static_cast<void *>(&Header), sizeof(Header)}; +  } + +  static BufferQueue::const_iterator It{}; +  static BufferQueue::const_iterator End{}; +  static uint8_t *CurrentBuffer{nullptr}; +  static size_t SerializedBufferSize = 0; +  if (B.Data == static_cast<void *>(&Header) && B.Size == sizeof(Header)) { +    // From this point on, we provide raw access to the raw buffer we're getting +    // from the BufferQueue. 
We're relying on the iterators from the current +    // Buffer queue. +    It = BQ->cbegin(); +    End = BQ->cend(); +  } + +  if (CurrentBuffer != nullptr) { +    deallocateBuffer(CurrentBuffer, SerializedBufferSize); +    CurrentBuffer = nullptr; +  } + +  if (It == End) +    return {nullptr, 0}; + +  // Set up the current buffer to contain the extents like we would when writing +  // out to disk. The difference here would be that we still write "empty" +  // buffers, or at least go through the iterators faithfully to let the +  // handlers see the empty buffers in the queue. +  // +  // We need this atomic fence here to ensure that writes happening to the +  // buffer have been committed before we load the extents atomically. Because +  // the buffer is not explicitly synchronised across threads, we rely on the +  // fence ordering to ensure that writes we expect to have been completed +  // before the fence are fully committed before we read the extents. +  atomic_thread_fence(memory_order_acquire); +  auto BufferSize = atomic_load(It->Extents, memory_order_acquire); +  SerializedBufferSize = BufferSize + sizeof(MetadataRecord); +  CurrentBuffer = allocateBuffer(SerializedBufferSize); +  if (CurrentBuffer == nullptr) +    return {nullptr, 0}; + +  // Write out the extents as a Metadata Record into the CurrentBuffer. +  MetadataRecord ExtentsRecord; +  ExtentsRecord.Type = uint8_t(RecordType::Metadata); +  ExtentsRecord.RecordKind = +      uint8_t(MetadataRecord::RecordKinds::BufferExtents); +  internal_memcpy(ExtentsRecord.Data, &BufferSize, sizeof(BufferSize)); +  auto AfterExtents = +      static_cast<char *>(internal_memcpy(CurrentBuffer, &ExtentsRecord, +                                          sizeof(MetadataRecord))) + +      sizeof(MetadataRecord); +  internal_memcpy(AfterExtents, It->Data, BufferSize); + +  XRayBuffer Result; +  Result.Data = CurrentBuffer; +  Result.Size = SerializedBufferSize; +  ++It; +  return Result; +} + +// Must finalize before flushing. +XRayLogFlushStatus fdrLoggingFlush() XRAY_NEVER_INSTRUMENT { +  if (atomic_load(&LoggingStatus, memory_order_acquire) != +      XRayLogInitStatus::XRAY_LOG_FINALIZED) { +    if (Verbosity()) +      Report("Not flushing log, implementation is not finalized.\n"); +    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; +  } + +  if (atomic_exchange(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHING, +                      memory_order_release) == +      XRayLogFlushStatus::XRAY_LOG_FLUSHING) { +    if (Verbosity()) +      Report("Not flushing log, implementation is still flushing.\n"); +    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; +  } + +  if (BQ == nullptr) { +    if (Verbosity()) +      Report("Cannot flush when global buffer queue is null.\n"); +    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; +  } + +  // We wait a number of milliseconds to allow threads to see that we've +  // finalised before attempting to flush the log. +  SleepForMillis(fdrFlags()->grace_period_ms); + +  // At this point, we're going to uninstall the iterator implementation, before +  // we decide to do anything further with the global buffer queue. +  __xray_log_remove_buffer_iterator(); + +  // Once flushed, we should set the global status of the logging implementation +  // to "uninitialized" to allow for FDR-logging multiple runs. 
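at_scope_exit, used immediately below, is the usual RAII scope guard from the XRay utilities; a minimal sketch of such a helper (assuming C++17, so the returned guard is constructed in place and fires exactly once):

    #include <cstdio>
    #include <utility>

    // Minimal scope guard: the callback runs when the guard goes out of
    // scope, on every return path.
    template <class Fn> struct ScopeExit {
      Fn F;
      ~ScopeExit() { F(); }
    };

    template <class Fn> ScopeExit<Fn> atScopeExit(Fn &&F) {
      return ScopeExit<Fn>{std::forward<Fn>(F)};
    }

    int main() {
      auto Guard = atScopeExit([] { std::puts("reset status here"); });
      std::puts("flush work here");
      return 0;  // prints the flush line, then the reset line
    }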
+  auto ResetToUnitialized = at_scope_exit([] { +    atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, +                 memory_order_release); +  }); + +  auto CleanupBuffers = at_scope_exit([] { +    auto &TLD = getThreadLocalData(); +    if (TLD.Controller != nullptr) +      TLD.Controller->flush(); +  }); + +  if (fdrFlags()->no_file_flush) { +    if (Verbosity()) +      Report("XRay FDR: Not flushing to file, 'no_file_flush=true'.\n"); + +    atomic_store(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, +                 memory_order_release); +    return XRayLogFlushStatus::XRAY_LOG_FLUSHED; +  } + +  // We write out the file in the following format: +  // +  //   1) We write down the XRay file header with version 1, type FDR_LOG. +  //   2) Then we use the 'apply' member of the BufferQueue that's live, to +  //      ensure that at this point in time we write down the buffers that have +  //      been released (and marked "used") -- we dump the full buffer for now +  //      (fixed-sized) and let the tools reading the buffers deal with the data +  //      afterwards. +  // +  LogWriter *LW = LogWriter::Open(); +  if (LW == nullptr) { +    auto Result = XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING; +    atomic_store(&LogFlushStatus, Result, memory_order_release); +    return Result; +  } + +  XRayFileHeader Header = fdrCommonHeaderInfo(); +  Header.FdrData = FdrAdditionalHeaderData{BQ->ConfiguredBufferSize()}; +  LW->WriteAll(reinterpret_cast<char *>(&Header), +               reinterpret_cast<char *>(&Header) + sizeof(Header)); + +  // Release the current thread's buffer before we attempt to write out all the +  // buffers. This ensures that in case we had only a single thread going, that +  // we are able to capture the data nonetheless. +  auto &TLD = getThreadLocalData(); +  if (TLD.Controller != nullptr) +    TLD.Controller->flush(); + +  BQ->apply([&](const BufferQueue::Buffer &B) { +    // Starting at version 2 of the FDR logging implementation, we only write +    // the records identified by the extents of the buffer. We use the Extents +    // from the Buffer and write that out as the first record in the buffer.  We +    // still use a Metadata record, but fill in the extents instead for the +    // data. 
+    MetadataRecord ExtentsRecord; +    auto BufferExtents = atomic_load(B.Extents, memory_order_acquire); +    DCHECK(BufferExtents <= B.Size); +    ExtentsRecord.Type = uint8_t(RecordType::Metadata); +    ExtentsRecord.RecordKind = +        uint8_t(MetadataRecord::RecordKinds::BufferExtents); +    internal_memcpy(ExtentsRecord.Data, &BufferExtents, sizeof(BufferExtents)); +    if (BufferExtents > 0) { +      LW->WriteAll(reinterpret_cast<char *>(&ExtentsRecord), +                   reinterpret_cast<char *>(&ExtentsRecord) + +                       sizeof(MetadataRecord)); +      LW->WriteAll(reinterpret_cast<char *>(B.Data), +                   reinterpret_cast<char *>(B.Data) + BufferExtents); +    } +  }); + +  atomic_store(&LogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, +               memory_order_release); +  return XRayLogFlushStatus::XRAY_LOG_FLUSHED; +} + +XRayLogInitStatus fdrLoggingFinalize() XRAY_NEVER_INSTRUMENT { +  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_INITIALIZED; +  if (!atomic_compare_exchange_strong(&LoggingStatus, &CurrentStatus, +                                      XRayLogInitStatus::XRAY_LOG_FINALIZING, +                                      memory_order_release)) { +    if (Verbosity()) +      Report("Cannot finalize log, implementation not initialized.\n"); +    return static_cast<XRayLogInitStatus>(CurrentStatus); +  } + +  // Do special things to make the log finalize itself, and not allow any more +  // operations to be performed until re-initialized. +  if (BQ == nullptr) { +    if (Verbosity()) +      Report("Attempting to finalize an uninitialized global buffer!\n"); +  } else { +    BQ->finalize(); +  } + +  atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED, +               memory_order_release); +  return XRayLogInitStatus::XRAY_LOG_FINALIZED; +} + +struct TSCAndCPU { +  uint64_t TSC = 0; +  unsigned char CPU = 0; +}; + +static TSCAndCPU getTimestamp() XRAY_NEVER_INSTRUMENT { +  // We want to get the TSC as early as possible, so that we can check whether +  // we've seen this CPU before. We also do it before we load anything else, +  // to allow for forward progress with the scheduling. +  TSCAndCPU Result; + +  // Test once for required CPU features +  static pthread_once_t OnceProbe = PTHREAD_ONCE_INIT; +  static bool TSCSupported = true; +  pthread_once( +      &OnceProbe, +[] { TSCSupported = probeRequiredCPUFeatures(); }); + +  if (TSCSupported) { +    Result.TSC = __xray::readTSC(Result.CPU); +  } else { +    // FIXME: This code needs refactoring as it appears in multiple locations +    timespec TS; +    int result = clock_gettime(CLOCK_REALTIME, &TS); +    if (result != 0) { +      Report("clock_gettime(2) return %d, errno=%d", result, int(errno)); +      TS = {0, 0}; +    } +    Result.CPU = 0; +    Result.TSC = TS.tv_sec * __xray::NanosecondsPerSecond + TS.tv_nsec; +  } +  return Result; +} + +thread_local atomic_uint8_t Running{0}; + +static bool setupTLD(ThreadLocalData &TLD) XRAY_NEVER_INSTRUMENT { +  // Check if we're finalizing, before proceeding. +  { +    auto Status = atomic_load(&LoggingStatus, memory_order_acquire); +    if (Status == XRayLogInitStatus::XRAY_LOG_FINALIZING || +        Status == XRayLogInitStatus::XRAY_LOG_FINALIZED) { +      if (TLD.Controller != nullptr) { +        TLD.Controller->flush(); +        TLD.Controller = nullptr; +      } +      return false; +    } +  } + +  if (UNLIKELY(TLD.Controller == nullptr)) { +    // Set up the TLD buffer queue. 
+    if (UNLIKELY(BQ == nullptr)) +      return false; +    TLD.BQ = BQ; + +    // Check that we have a valid buffer. +    if (TLD.Buffer.Generation != BQ->generation() && +        TLD.BQ->releaseBuffer(TLD.Buffer) != BufferQueue::ErrorCode::Ok) +      return false; + +    // Set up a buffer, before setting up the log writer. Bail out on failure. +    if (TLD.BQ->getBuffer(TLD.Buffer) != BufferQueue::ErrorCode::Ok) +      return false; + +    // Set up the Log Writer for this thread. +    if (UNLIKELY(TLD.Writer == nullptr)) { +      auto *LWStorage = reinterpret_cast<FDRLogWriter *>(&TLD.LWStorage); +      new (LWStorage) FDRLogWriter(TLD.Buffer); +      TLD.Writer = LWStorage; +    } else { +      TLD.Writer->resetRecord(); +    } + +    auto *CStorage = reinterpret_cast<FDRController<> *>(&TLD.CStorage); +    new (CStorage) +        FDRController<>(TLD.BQ, TLD.Buffer, *TLD.Writer, clock_gettime, +                        atomic_load_relaxed(&ThresholdTicks)); +    TLD.Controller = CStorage; +  } + +  DCHECK_NE(TLD.Controller, nullptr); +  return true; +} + +void fdrLoggingHandleArg0(int32_t FuncId, +                          XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { +  auto TC = getTimestamp(); +  auto &TSC = TC.TSC; +  auto &CPU = TC.CPU; +  RecursionGuard Guard{Running}; +  if (!Guard) +    return; + +  auto &TLD = getThreadLocalData(); +  if (!setupTLD(TLD)) +    return; + +  switch (Entry) { +  case XRayEntryType::ENTRY: +  case XRayEntryType::LOG_ARGS_ENTRY: +    TLD.Controller->functionEnter(FuncId, TSC, CPU); +    return; +  case XRayEntryType::EXIT: +    TLD.Controller->functionExit(FuncId, TSC, CPU); +    return; +  case XRayEntryType::TAIL: +    TLD.Controller->functionTailExit(FuncId, TSC, CPU); +    return; +  case XRayEntryType::CUSTOM_EVENT: +  case XRayEntryType::TYPED_EVENT: +    break; +  } +} + +void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType Entry, +                          uint64_t Arg) XRAY_NEVER_INSTRUMENT { +  auto TC = getTimestamp(); +  auto &TSC = TC.TSC; +  auto &CPU = TC.CPU; +  RecursionGuard Guard{Running}; +  if (!Guard) +    return; + +  auto &TLD = getThreadLocalData(); +  if (!setupTLD(TLD)) +    return; + +  switch (Entry) { +  case XRayEntryType::ENTRY: +  case XRayEntryType::LOG_ARGS_ENTRY: +    TLD.Controller->functionEnterArg(FuncId, TSC, CPU, Arg); +    return; +  case XRayEntryType::EXIT: +    TLD.Controller->functionExit(FuncId, TSC, CPU); +    return; +  case XRayEntryType::TAIL: +    TLD.Controller->functionTailExit(FuncId, TSC, CPU); +    return; +  case XRayEntryType::CUSTOM_EVENT: +  case XRayEntryType::TYPED_EVENT: +    break; +  } +} + +void fdrLoggingHandleCustomEvent(void *Event, +                                 std::size_t EventSize) XRAY_NEVER_INSTRUMENT { +  auto TC = getTimestamp(); +  auto &TSC = TC.TSC; +  auto &CPU = TC.CPU; +  RecursionGuard Guard{Running}; +  if (!Guard) +    return; + +  // Complain when we ever get at least one custom event that's larger than what +  // we can possibly support. 
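+  // (The warning below fires at most once per process via pthread_once; the
+  // event size is then narrowed to int32_t by the static_cast further down.)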
+  if (EventSize > +      static_cast<std::size_t>(std::numeric_limits<int32_t>::max())) { +    static pthread_once_t Once = PTHREAD_ONCE_INIT; +    pthread_once( +        &Once, +[] { +          Report("Custom event size too large; truncating to %d.\n", +                 std::numeric_limits<int32_t>::max()); +        }); +  } + +  auto &TLD = getThreadLocalData(); +  if (!setupTLD(TLD)) +    return; + +  int32_t ReducedEventSize = static_cast<int32_t>(EventSize); +  TLD.Controller->customEvent(TSC, CPU, Event, ReducedEventSize); +} + +void fdrLoggingHandleTypedEvent(size_t EventType, const void *Event, +                                size_t EventSize) noexcept +    XRAY_NEVER_INSTRUMENT { +  auto TC = getTimestamp(); +  auto &TSC = TC.TSC; +  auto &CPU = TC.CPU; +  RecursionGuard Guard{Running}; +  if (!Guard) +    return; + +  // Complain when we ever get at least one typed event that's larger than what +  // we can possibly support. +  if (EventSize > +      static_cast<std::size_t>(std::numeric_limits<int32_t>::max())) { +    static pthread_once_t Once = PTHREAD_ONCE_INIT; +    pthread_once( +        &Once, +[] { +          Report("Typed event size too large; truncating to %d.\n", +                 std::numeric_limits<int32_t>::max()); +        }); +  } + +  auto &TLD = getThreadLocalData(); +  if (!setupTLD(TLD)) +    return; + +  int32_t ReducedEventSize = static_cast<int32_t>(EventSize); +  TLD.Controller->typedEvent(TSC, CPU, static_cast<uint16_t>(EventType), Event, +                             ReducedEventSize); +} + +XRayLogInitStatus fdrLoggingInit(size_t, size_t, void *Options, +                                 size_t OptionsSize) XRAY_NEVER_INSTRUMENT { +  if (Options == nullptr) +    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + +  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; +  if (!atomic_compare_exchange_strong(&LoggingStatus, &CurrentStatus, +                                      XRayLogInitStatus::XRAY_LOG_INITIALIZING, +                                      memory_order_release)) { +    if (Verbosity()) +      Report("Cannot initialize already initialized implementation.\n"); +    return static_cast<XRayLogInitStatus>(CurrentStatus); +  } + +  if (Verbosity()) +    Report("Initializing FDR mode with options: %s\n", +           static_cast<const char *>(Options)); + +  // TODO: Factor out the flags specific to the FDR mode implementation. For +  // now, use the global/single definition of the flags, since the FDR mode +  // flags are already defined there. +  FlagParser FDRParser; +  FDRFlags FDRFlags; +  registerXRayFDRFlags(&FDRParser, &FDRFlags); +  FDRFlags.setDefaults(); + +  // Override first from the general XRAY_DEFAULT_OPTIONS compiler-provided +  // options until we migrate everyone to use the XRAY_FDR_OPTIONS +  // compiler-provided options. +  FDRParser.ParseString(useCompilerDefinedFlags()); +  FDRParser.ParseString(useCompilerDefinedFDRFlags()); +  auto *EnvOpts = GetEnv("XRAY_FDR_OPTIONS"); +  if (EnvOpts == nullptr) +    EnvOpts = ""; +  FDRParser.ParseString(EnvOpts); + +  // FIXME: Remove this when we fully remove the deprecated flags. +  if (internal_strlen(EnvOpts) == 0) { +    FDRFlags.func_duration_threshold_us = +        flags()->xray_fdr_log_func_duration_threshold_us; +    FDRFlags.grace_period_ms = flags()->xray_fdr_log_grace_period_ms; +  } + +  // The provided options should always override the compiler-provided and +  // environment-variable defined options. 
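+  //
+  // Illustrative precedence, lowest to highest (the values are made up):
+  //
+  //   XRAY_DEFAULT_OPTIONS        "func_duration_threshold_us=5"
+  //   XRAY_FDR_OPTIONS (env)      "func_duration_threshold_us=10"
+  //   Options (this argument)     "func_duration_threshold_us=20"  // wins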
+  FDRParser.ParseString(static_cast<const char *>(Options));
+  *fdrFlags() = FDRFlags;
+  auto BufferSize = FDRFlags.buffer_size;
+  auto BufferMax = FDRFlags.buffer_max;
+
+  if (BQ == nullptr) {
+    bool Success = false;
+    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
+    new (BQ) BufferQueue(BufferSize, BufferMax, Success);
+    if (!Success) {
+      Report("BufferQueue init failed.\n");
+      return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+    }
+  } else {
+    if (BQ->init(BufferSize, BufferMax) != BufferQueue::ErrorCode::Ok) {
+      if (Verbosity())
+        Report("Failed to re-initialize the global buffer queue.\n");
+      return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+    }
+  }
+
+  static pthread_once_t OnceInit = PTHREAD_ONCE_INIT;
+  pthread_once(
+      &OnceInit, +[] {
+        atomic_store(&TicksPerSec,
+                     probeRequiredCPUFeatures() ? getTSCFrequency()
+                                                : __xray::NanosecondsPerSecond,
+                     memory_order_release);
+        pthread_key_create(
+            &Key, +[](void *TLDPtr) {
+              if (TLDPtr == nullptr)
+                return;
+              auto &TLD = *reinterpret_cast<ThreadLocalData *>(TLDPtr);
+              if (TLD.BQ == nullptr)
+                return;
+              if (TLD.Buffer.Data == nullptr)
+                return;
+              auto EC = TLD.BQ->releaseBuffer(TLD.Buffer);
+              if (EC != BufferQueue::ErrorCode::Ok)
+                Report("At thread exit, failed to release buffer at %p; "
+                       "error=%s\n",
+                       TLD.Buffer.Data, BufferQueue::getErrorString(EC));
+            });
+      });
+
+  atomic_store(&ThresholdTicks,
+               atomic_load_relaxed(&TicksPerSec) *
+                   fdrFlags()->func_duration_threshold_us / 1000000,
+               memory_order_release);
+  // The arg1 handler should go in first, to avoid concurrent code accidentally
+  // falling back to arg0 when it should have run the arg1 handler.
+  __xray_set_handler_arg1(fdrLoggingHandleArg1);
+  // Install the actual handleArg0 handler after initialising the buffers.
+  __xray_set_handler(fdrLoggingHandleArg0);
+  __xray_set_customevent_handler(fdrLoggingHandleCustomEvent);
+  __xray_set_typedevent_handler(fdrLoggingHandleTypedEvent);
+
+  // Install the buffer iterator implementation.
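+  // This is what allows in-process consumers to walk the FDR buffers through
+  // __xray_log_process_buffers() without first flushing them to a file.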
+  __xray_log_set_buffer_iterator(fdrIterator); + +  atomic_store(&LoggingStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZED, +               memory_order_release); + +  if (Verbosity()) +    Report("XRay FDR init successful.\n"); +  return XRayLogInitStatus::XRAY_LOG_INITIALIZED; +} + +bool fdrLogDynamicInitializer() XRAY_NEVER_INSTRUMENT { +  XRayLogImpl Impl{ +      fdrLoggingInit, +      fdrLoggingFinalize, +      fdrLoggingHandleArg0, +      fdrLoggingFlush, +  }; +  auto RegistrationResult = __xray_log_register_mode("xray-fdr", Impl); +  if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && +      Verbosity()) { +    Report("Cannot register XRay FDR mode to 'xray-fdr'; error = %d\n", +           RegistrationResult); +    return false; +  } + +  if (flags()->xray_fdr_log || +      !internal_strcmp(flags()->xray_mode, "xray-fdr")) { +    auto SelectResult = __xray_log_select_mode("xray-fdr"); +    if (SelectResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK && +        Verbosity()) { +      Report("Cannot select XRay FDR mode as 'xray-fdr'; error = %d\n", +             SelectResult); +      return false; +    } +  } +  return true; +} + +} // namespace __xray + +static auto UNUSED Unused = __xray::fdrLogDynamicInitializer(); diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_logging.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_logging.h new file mode 100644 index 000000000000..6df0057c4965 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_fdr_logging.h @@ -0,0 +1,38 @@ +//===-- xray_fdr_logging.h ------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_XRAY_FDR_LOGGING_H +#define XRAY_XRAY_FDR_LOGGING_H + +#include "xray/xray_log_interface.h" +#include "xray_fdr_log_records.h" + +// FDR (Flight Data Recorder) Mode +// =============================== +// +// The XRay whitepaper describes a mode of operation for function call trace +// logging that involves writing small records into an in-memory circular +// buffer, that then gets logged to disk on demand. To do this efficiently and +// capture as much data as we can, we use smaller records compared to the +// default mode of always writing fixed-size records. 
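+//
+// A minimal way to exercise this mode from an XRay-instrumented binary is
+// sketched below (environment variables only; not part of this header):
+//
+//   XRAY_OPTIONS="patch_premain=true xray_mode=xray-fdr" \
+//   XRAY_FDR_OPTIONS="func_duration_threshold_us=5" ./a.out
+//
+// The resulting log file can then be processed with tools such as llvm-xray.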
+ +namespace __xray { +XRayLogInitStatus fdrLoggingInit(size_t BufferSize, size_t BufferMax, +                                 void *Options, size_t OptionsSize); +XRayLogInitStatus fdrLoggingFinalize(); +void fdrLoggingHandleArg0(int32_t FuncId, XRayEntryType Entry); +void fdrLoggingHandleArg1(int32_t FuncId, XRayEntryType Entry, uint64_t Arg1); +XRayLogFlushStatus fdrLoggingFlush(); +XRayLogInitStatus fdrLoggingReset(); + +} // namespace __xray + +#endif // XRAY_XRAY_FDR_LOGGING_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_flags.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_flags.cpp new file mode 100644 index 000000000000..e4c6906dc443 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_flags.cpp @@ -0,0 +1,84 @@ +//===-- xray_flags.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay flag parsing logic. +//===----------------------------------------------------------------------===// + +#include "xray_flags.h" +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_libc.h" +#include "xray_defs.h" + +using namespace __sanitizer; + +namespace __xray { + +Flags xray_flags_dont_use_directly; // use via flags(). + +void Flags::setDefaults() XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue; +#include "xray_flags.inc" +#undef XRAY_FLAG +} + +void registerXRayFlags(FlagParser *P, Flags *F) XRAY_NEVER_INSTRUMENT { +#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \ +  RegisterFlag(P, #Name, Description, &F->Name); +#include "xray_flags.inc" +#undef XRAY_FLAG +} + +// This function, as defined with the help of a macro meant to be introduced at +// build time of the XRay runtime, passes in a statically defined list of +// options that control XRay. This means users/deployments can tweak the +// defaults that override the hard-coded defaults in the xray_flags.inc at +// compile-time using the XRAY_DEFAULT_OPTIONS macro. +const char *useCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT { +#ifdef XRAY_DEFAULT_OPTIONS +  // Do the double-layered string conversion to prevent badly crafted strings +  // provided through the XRAY_DEFAULT_OPTIONS from causing compilation issues +  // (or changing the semantics of the implementation through the macro). This +  // ensures that we convert whatever XRAY_DEFAULT_OPTIONS is defined as a +  // string literal. +  return SANITIZER_STRINGIFY(XRAY_DEFAULT_OPTIONS); +#else +  return ""; +#endif +} + +void initializeFlags() XRAY_NEVER_INSTRUMENT { +  SetCommonFlagsDefaults(); +  auto *F = flags(); +  F->setDefaults(); + +  FlagParser XRayParser; +  registerXRayFlags(&XRayParser, F); +  RegisterCommonFlags(&XRayParser); + +  // Use options defaulted at compile-time for the runtime. +  const char *XRayCompileFlags = useCompilerDefinedFlags(); +  XRayParser.ParseString(XRayCompileFlags); + +  // Override from environment variables. +  XRayParser.ParseStringFromEnv("XRAY_OPTIONS"); + +  // Override from command line. 
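+  //
+  // At this point the effective value of each flag is, in increasing order of
+  // precedence: the xray_flags.inc default, the compile-time
+  // XRAY_DEFAULT_OPTIONS string, and finally the XRAY_OPTIONS environment
+  // variable.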
+  InitializeCommonFlags(); + +  if (Verbosity()) +    ReportUnrecognizedFlags(); + +  if (common_flags()->help) { +    XRayParser.PrintFlagDescriptions(); +  } +} + +} // namespace __xray diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_flags.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_flags.h new file mode 100644 index 000000000000..cce6fe9d62f9 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_flags.h @@ -0,0 +1,39 @@ +//===-- xray_flags.h -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay runtime flags. +//===----------------------------------------------------------------------===// + +#ifndef XRAY_FLAGS_H +#define XRAY_FLAGS_H + +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_internal_defs.h" + +namespace __xray { + +struct Flags { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name; +#include "xray_flags.inc" +#undef XRAY_FLAG + +  void setDefaults(); +}; + +extern Flags xray_flags_dont_use_directly; +extern void registerXRayFlags(FlagParser *P, Flags *F); +const char *useCompilerDefinedFlags(); +inline Flags *flags() { return &xray_flags_dont_use_directly; } + +void initializeFlags(); + +} // namespace __xray + +#endif // XRAY_FLAGS_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_flags.inc b/contrib/llvm-project/compiler-rt/lib/xray/xray_flags.inc new file mode 100644 index 000000000000..b7dc5a08f242 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_flags.inc @@ -0,0 +1,49 @@ +//===-- xray_flags.inc ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// XRay runtime flags. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FLAG +#error "Define XRAY_FLAG prior to including this file!" +#endif + +XRAY_FLAG(bool, patch_premain, false, +          "Whether to patch instrumentation points before main.") +XRAY_FLAG(const char *, xray_logfile_base, "xray-log.", +          "Filename base for the xray logfile.") +XRAY_FLAG(const char *, xray_mode, "", "Mode to install by default.") +XRAY_FLAG(uptr, xray_page_size_override, 0, +          "Override the default page size for the system, in bytes. The size " +          "should be a power-of-two.") + +// Basic (Naive) Mode logging options. 
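+// (For reference: each XRAY_FLAG(Type, Name, Default, Desc) entry below
+// expands, via xray_flags.h, into a `Type Name;` member of __xray::Flags,
+// and, via xray_flags.cpp, into the default-setting and parser-registration
+// code shown earlier.)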
+XRAY_FLAG(bool, xray_naive_log, false, +          "DEPRECATED: Use xray_mode=xray-basic instead.") +XRAY_FLAG(int, xray_naive_log_func_duration_threshold_us, 5, +          "DEPRECATED: use the environment variable XRAY_BASIC_OPTIONS and set " +          "func_duration_threshold_us instead.") +XRAY_FLAG(int, xray_naive_log_max_stack_depth, 64, +          "DEPRECATED: use the environment variable XRAY_BASIC_OPTIONS and set " +          "max_stack_depth instead.") +XRAY_FLAG(int, xray_naive_log_thread_buffer_size, 1024, +          "DEPRECATED: use the environment variable XRAY_BASIC_OPTIONS and set " +          "thread_buffer_size instead.") + +// FDR (Flight Data Recorder) Mode logging options. +XRAY_FLAG(bool, xray_fdr_log, false, +          "DEPRECATED: Use xray_mode=xray-fdr instead.") +XRAY_FLAG(int, xray_fdr_log_func_duration_threshold_us, 5, +          "DEPRECATED: use the environment variable XRAY_FDR_OPTIONS and set " +          "func_duration_threshold_us instead.") +XRAY_FLAG(int, xray_fdr_log_grace_period_us, 0, +          "DEPRECATED: use the environment variable XRAY_FDR_OPTIONS and set " +          "grace_period_ms instead.") +XRAY_FLAG(int, xray_fdr_log_grace_period_ms, 100, +          "DEPRECATED: use the environment variable XRAY_FDR_OPTIONS and set " +          "grace_period_ms instead.") diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_function_call_trie.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_function_call_trie.h new file mode 100644 index 000000000000..7536f39b8081 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_function_call_trie.h @@ -0,0 +1,599 @@ +//===-- xray_function_call_trie.h ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This file defines the interface for a function call trie. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FUNCTION_CALL_TRIE_H +#define XRAY_FUNCTION_CALL_TRIE_H + +#include "xray_buffer_queue.h" +#include "xray_defs.h" +#include "xray_profiling_flags.h" +#include "xray_segmented_array.h" +#include <limits> +#include <memory> // For placement new. +#include <utility> + +namespace __xray { + +/// A FunctionCallTrie represents the stack traces of XRay instrumented +/// functions that we've encountered, where a node corresponds to a function and +/// the path from the root to the node its stack trace. Each node in the trie +/// will contain some useful values, including: +/// +///   * The cumulative amount of time spent in this particular node/stack. +///   * The number of times this stack has appeared. +///   * A histogram of latencies for that particular node. +/// +/// Each node in the trie will also contain a list of callees, represented using +/// a Array<NodeIdPair> -- each NodeIdPair instance will contain the function +/// ID of the callee, and a pointer to the node. 
+/// +/// If we visualise this data structure, we'll find the following potential +/// representation: +/// +///   [function id node] -> [callees] [cumulative time] +///                         [call counter] [latency histogram] +/// +/// As an example, when we have a function in this pseudocode: +/// +///   func f(N) { +///     g() +///     h() +///     for i := 1..N { j() } +///   } +/// +/// We may end up with a trie of the following form: +/// +///   f -> [ g, h, j ] [...] [1] [...] +///   g -> [ ... ] [...] [1] [...] +///   h -> [ ... ] [...] [1] [...] +///   j -> [ ... ] [...] [N] [...] +/// +/// If for instance the function g() called j() like so: +/// +///   func g() { +///     for i := 1..10 { j() } +///   } +/// +/// We'll find the following updated trie: +/// +///   f -> [ g, h, j ] [...] [1] [...] +///   g -> [ j' ] [...] [1] [...] +///   h -> [ ... ] [...] [1] [...] +///   j -> [ ... ] [...] [N] [...] +///   j' -> [ ... ] [...] [10] [...] +/// +/// Note that we'll have a new node representing the path `f -> g -> j'` with +/// isolated data. This isolation gives us a means of representing the stack +/// traces as a path, as opposed to a key in a table. The alternative +/// implementation here would be to use a separate table for the path, and use +/// hashes of the path as an identifier to accumulate the information. We've +/// moved away from this approach as it takes a lot of time to compute the hash +/// every time we need to update a function's call information as we're handling +/// the entry and exit events. +/// +/// This approach allows us to maintain a shadow stack, which represents the +/// currently executing path, and on function exits quickly compute the amount +/// of time elapsed from the entry, then update the counters for the node +/// already represented in the trie. This necessitates an efficient +/// representation of the various data structures (the list of callees must be +/// cache-aware and efficient to look up, and the histogram must be compact and +/// quick to update) to enable us to keep the overheads of this implementation +/// to the minimum. +class FunctionCallTrie { +public: +  struct Node; + +  // We use a NodeIdPair type instead of a std::pair<...> to not rely on the +  // standard library types in this header. +  struct NodeIdPair { +    Node *NodePtr; +    int32_t FId; +  }; + +  using NodeIdPairArray = Array<NodeIdPair>; +  using NodeIdPairAllocatorType = NodeIdPairArray::AllocatorType; + +  // A Node in the FunctionCallTrie gives us a list of callees, the cumulative +  // number of times this node actually appeared, the cumulative amount of time +  // for this particular node including its children call times, and just the +  // local time spent on this node. Each Node will have the ID of the XRay +  // instrumented function that it is associated to. +  struct Node { +    Node *Parent; +    NodeIdPairArray Callees; +    uint64_t CallCount; +    uint64_t CumulativeLocalTime; // Typically in TSC deltas, not wall-time. +    int32_t FId; + +    // TODO: Include the compact histogram. +  }; + +private: +  struct ShadowStackEntry { +    uint64_t EntryTSC; +    Node *NodePtr; +    uint16_t EntryCPU; +  }; + +  using NodeArray = Array<Node>; +  using RootArray = Array<Node *>; +  using ShadowStackArray = Array<ShadowStackEntry>; + +public: +  // We collate the allocators we need into a single struct, as a convenience to +  // allow us to initialize these as a group. 
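+  //
+  // A typical setup is sketched as:
+  //
+  //   auto A = FunctionCallTrie::InitAllocators(); // flag-derived budget
+  //   FunctionCallTrie Trie(A);                    // A must outlive the trie
+  //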
+  struct Allocators { +    using NodeAllocatorType = NodeArray::AllocatorType; +    using RootAllocatorType = RootArray::AllocatorType; +    using ShadowStackAllocatorType = ShadowStackArray::AllocatorType; + +    // Use hosted aligned storage members to allow for trivial move and init. +    // This also allows us to sidestep the potential-failing allocation issue. +    alignas(NodeAllocatorType) std::byte +        NodeAllocatorStorage[sizeof(NodeAllocatorType)]; +    alignas(RootAllocatorType) std::byte +        RootAllocatorStorage[sizeof(RootAllocatorType)]; +    alignas(ShadowStackAllocatorType) std::byte +        ShadowStackAllocatorStorage[sizeof(ShadowStackAllocatorType)]; +    alignas(NodeIdPairAllocatorType) std::byte +        NodeIdPairAllocatorStorage[sizeof(NodeIdPairAllocatorType)]; + +    NodeAllocatorType *NodeAllocator = nullptr; +    RootAllocatorType *RootAllocator = nullptr; +    ShadowStackAllocatorType *ShadowStackAllocator = nullptr; +    NodeIdPairAllocatorType *NodeIdPairAllocator = nullptr; + +    Allocators() = default; +    Allocators(const Allocators &) = delete; +    Allocators &operator=(const Allocators &) = delete; + +    struct Buffers { +      BufferQueue::Buffer NodeBuffer; +      BufferQueue::Buffer RootsBuffer; +      BufferQueue::Buffer ShadowStackBuffer; +      BufferQueue::Buffer NodeIdPairBuffer; +    }; + +    explicit Allocators(Buffers &B) XRAY_NEVER_INSTRUMENT { +      new (&NodeAllocatorStorage) +          NodeAllocatorType(B.NodeBuffer.Data, B.NodeBuffer.Size); +      NodeAllocator = +          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + +      new (&RootAllocatorStorage) +          RootAllocatorType(B.RootsBuffer.Data, B.RootsBuffer.Size); +      RootAllocator = +          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + +      new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType( +          B.ShadowStackBuffer.Data, B.ShadowStackBuffer.Size); +      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( +          &ShadowStackAllocatorStorage); + +      new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType( +          B.NodeIdPairBuffer.Data, B.NodeIdPairBuffer.Size); +      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( +          &NodeIdPairAllocatorStorage); +    } + +    explicit Allocators(uptr Max) XRAY_NEVER_INSTRUMENT { +      new (&NodeAllocatorStorage) NodeAllocatorType(Max); +      NodeAllocator = +          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); + +      new (&RootAllocatorStorage) RootAllocatorType(Max); +      RootAllocator = +          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); + +      new (&ShadowStackAllocatorStorage) ShadowStackAllocatorType(Max); +      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( +          &ShadowStackAllocatorStorage); + +      new (&NodeIdPairAllocatorStorage) NodeIdPairAllocatorType(Max); +      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( +          &NodeIdPairAllocatorStorage); +    } + +    Allocators(Allocators &&O) XRAY_NEVER_INSTRUMENT { +      // Here we rely on the safety of memcpy'ing contents of the storage +      // members, and then pointing the source pointers to nullptr. 
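+      // (This is safe because the allocator objects live in the aligned byte
+      // arrays above; the *Allocator pointers merely alias that storage, so a
+      // byte-wise copy plus nulling the source pointers transfers ownership.)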
+      internal_memcpy(&NodeAllocatorStorage, &O.NodeAllocatorStorage, +                      sizeof(NodeAllocatorType)); +      internal_memcpy(&RootAllocatorStorage, &O.RootAllocatorStorage, +                      sizeof(RootAllocatorType)); +      internal_memcpy(&ShadowStackAllocatorStorage, +                      &O.ShadowStackAllocatorStorage, +                      sizeof(ShadowStackAllocatorType)); +      internal_memcpy(&NodeIdPairAllocatorStorage, +                      &O.NodeIdPairAllocatorStorage, +                      sizeof(NodeIdPairAllocatorType)); + +      NodeAllocator = +          reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); +      RootAllocator = +          reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); +      ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( +          &ShadowStackAllocatorStorage); +      NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( +          &NodeIdPairAllocatorStorage); + +      O.NodeAllocator = nullptr; +      O.RootAllocator = nullptr; +      O.ShadowStackAllocator = nullptr; +      O.NodeIdPairAllocator = nullptr; +    } + +    Allocators &operator=(Allocators &&O) XRAY_NEVER_INSTRUMENT { +      // When moving into an existing instance, we ensure that we clean up the +      // current allocators. +      if (NodeAllocator) +        NodeAllocator->~NodeAllocatorType(); +      if (O.NodeAllocator) { +        new (&NodeAllocatorStorage) +            NodeAllocatorType(std::move(*O.NodeAllocator)); +        NodeAllocator = +            reinterpret_cast<NodeAllocatorType *>(&NodeAllocatorStorage); +        O.NodeAllocator = nullptr; +      } else { +        NodeAllocator = nullptr; +      } + +      if (RootAllocator) +        RootAllocator->~RootAllocatorType(); +      if (O.RootAllocator) { +        new (&RootAllocatorStorage) +            RootAllocatorType(std::move(*O.RootAllocator)); +        RootAllocator = +            reinterpret_cast<RootAllocatorType *>(&RootAllocatorStorage); +        O.RootAllocator = nullptr; +      } else { +        RootAllocator = nullptr; +      } + +      if (ShadowStackAllocator) +        ShadowStackAllocator->~ShadowStackAllocatorType(); +      if (O.ShadowStackAllocator) { +        new (&ShadowStackAllocatorStorage) +            ShadowStackAllocatorType(std::move(*O.ShadowStackAllocator)); +        ShadowStackAllocator = reinterpret_cast<ShadowStackAllocatorType *>( +            &ShadowStackAllocatorStorage); +        O.ShadowStackAllocator = nullptr; +      } else { +        ShadowStackAllocator = nullptr; +      } + +      if (NodeIdPairAllocator) +        NodeIdPairAllocator->~NodeIdPairAllocatorType(); +      if (O.NodeIdPairAllocator) { +        new (&NodeIdPairAllocatorStorage) +            NodeIdPairAllocatorType(std::move(*O.NodeIdPairAllocator)); +        NodeIdPairAllocator = reinterpret_cast<NodeIdPairAllocatorType *>( +            &NodeIdPairAllocatorStorage); +        O.NodeIdPairAllocator = nullptr; +      } else { +        NodeIdPairAllocator = nullptr; +      } + +      return *this; +    } + +    ~Allocators() XRAY_NEVER_INSTRUMENT { +      if (NodeAllocator != nullptr) +        NodeAllocator->~NodeAllocatorType(); +      if (RootAllocator != nullptr) +        RootAllocator->~RootAllocatorType(); +      if (ShadowStackAllocator != nullptr) +        ShadowStackAllocator->~ShadowStackAllocatorType(); +      if (NodeIdPairAllocator != nullptr) +        NodeIdPairAllocator->~NodeIdPairAllocatorType(); +    } +  }; + +  static 
Allocators InitAllocators() XRAY_NEVER_INSTRUMENT { +    return InitAllocatorsCustom(profilingFlags()->per_thread_allocator_max); +  } + +  static Allocators InitAllocatorsCustom(uptr Max) XRAY_NEVER_INSTRUMENT { +    Allocators A(Max); +    return A; +  } + +  static Allocators +  InitAllocatorsFromBuffers(Allocators::Buffers &Bufs) XRAY_NEVER_INSTRUMENT { +    Allocators A(Bufs); +    return A; +  } + +private: +  NodeArray Nodes; +  RootArray Roots; +  ShadowStackArray ShadowStack; +  NodeIdPairAllocatorType *NodeIdPairAllocator; +  uint32_t OverflowedFunctions; + +public: +  explicit FunctionCallTrie(const Allocators &A) XRAY_NEVER_INSTRUMENT +      : Nodes(*A.NodeAllocator), +        Roots(*A.RootAllocator), +        ShadowStack(*A.ShadowStackAllocator), +        NodeIdPairAllocator(A.NodeIdPairAllocator), +        OverflowedFunctions(0) {} + +  FunctionCallTrie() = delete; +  FunctionCallTrie(const FunctionCallTrie &) = delete; +  FunctionCallTrie &operator=(const FunctionCallTrie &) = delete; + +  FunctionCallTrie(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT +      : Nodes(std::move(O.Nodes)), +        Roots(std::move(O.Roots)), +        ShadowStack(std::move(O.ShadowStack)), +        NodeIdPairAllocator(O.NodeIdPairAllocator), +        OverflowedFunctions(O.OverflowedFunctions) {} + +  FunctionCallTrie &operator=(FunctionCallTrie &&O) XRAY_NEVER_INSTRUMENT { +    Nodes = std::move(O.Nodes); +    Roots = std::move(O.Roots); +    ShadowStack = std::move(O.ShadowStack); +    NodeIdPairAllocator = O.NodeIdPairAllocator; +    OverflowedFunctions = O.OverflowedFunctions; +    return *this; +  } + +  ~FunctionCallTrie() XRAY_NEVER_INSTRUMENT {} + +  void enterFunction(const int32_t FId, uint64_t TSC, +                     uint16_t CPU) XRAY_NEVER_INSTRUMENT { +    DCHECK_NE(FId, 0); + +    // If we're already overflowed the function call stack, do not bother +    // attempting to record any more function entries. +    if (UNLIKELY(OverflowedFunctions)) { +      ++OverflowedFunctions; +      return; +    } + +    // If this is the first function we've encountered, we want to set up the +    // node(s) and treat it as a root. +    if (UNLIKELY(ShadowStack.empty())) { +      auto *NewRoot = Nodes.AppendEmplace( +          nullptr, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId); +      if (UNLIKELY(NewRoot == nullptr)) +        return; +      if (Roots.AppendEmplace(NewRoot) == nullptr) { +        Nodes.trim(1); +        return; +      } +      if (ShadowStack.AppendEmplace(TSC, NewRoot, CPU) == nullptr) { +        Nodes.trim(1); +        Roots.trim(1); +        ++OverflowedFunctions; +        return; +      } +      return; +    } + +    // From this point on, we require that the stack is not empty. +    DCHECK(!ShadowStack.empty()); +    auto TopNode = ShadowStack.back().NodePtr; +    DCHECK_NE(TopNode, nullptr); + +    // If we've seen this callee before, then we access that node and place that +    // on the top of the stack. +    auto* Callee = TopNode->Callees.find_element( +        [FId](const NodeIdPair &NR) { return NR.FId == FId; }); +    if (Callee != nullptr) { +      CHECK_NE(Callee->NodePtr, nullptr); +      if (ShadowStack.AppendEmplace(TSC, Callee->NodePtr, CPU) == nullptr) +        ++OverflowedFunctions; +      return; +    } + +    // This means we've never seen this stack before, create a new node here. 
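+    // (For example: entering f() and then g() leaves the shadow stack as
+    // [f, g]; the matching exitFunction(g, ...) pops one entry and charges g
+    // the TSC delta since its entry.)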
+    auto* NewNode = Nodes.AppendEmplace( +        TopNode, NodeIdPairArray(*NodeIdPairAllocator), 0u, 0u, FId); +    if (UNLIKELY(NewNode == nullptr)) +      return; +    DCHECK_NE(NewNode, nullptr); +    TopNode->Callees.AppendEmplace(NewNode, FId); +    if (ShadowStack.AppendEmplace(TSC, NewNode, CPU) == nullptr) +      ++OverflowedFunctions; +    return; +  } + +  void exitFunction(int32_t FId, uint64_t TSC, +                    uint16_t CPU) XRAY_NEVER_INSTRUMENT { +    // If we're exiting functions that have "overflowed" or don't fit into the +    // stack due to allocator constraints, we then decrement that count first. +    if (OverflowedFunctions) { +      --OverflowedFunctions; +      return; +    } + +    // When we exit a function, we look up the ShadowStack to see whether we've +    // entered this function before. We do as little processing here as we can, +    // since most of the hard work would have already been done at function +    // entry. +    uint64_t CumulativeTreeTime = 0; + +    while (!ShadowStack.empty()) { +      const auto &Top = ShadowStack.back(); +      auto TopNode = Top.NodePtr; +      DCHECK_NE(TopNode, nullptr); + +      // We may encounter overflow on the TSC we're provided, which may end up +      // being less than the TSC when we first entered the function. +      // +      // To get the accurate measurement of cycles, we need to check whether +      // we've overflowed (TSC < Top.EntryTSC) and then account the difference +      // between the entry TSC and the max for the TSC counter (max of uint64_t) +      // then add the value of TSC. We can prove that the maximum delta we will +      // get is at most the 64-bit unsigned value, since the difference between +      // a TSC of 0 and a Top.EntryTSC of 1 is (numeric_limits<uint64_t>::max() +      // - 1) + 1. +      // +      // NOTE: This assumes that TSCs are synchronised across CPUs. +      // TODO: Count the number of times we've seen CPU migrations. +      uint64_t LocalTime = +          Top.EntryTSC > TSC +              ? (std::numeric_limits<uint64_t>::max() - Top.EntryTSC) + TSC +              : TSC - Top.EntryTSC; +      TopNode->CallCount++; +      TopNode->CumulativeLocalTime += LocalTime - CumulativeTreeTime; +      CumulativeTreeTime += LocalTime; +      ShadowStack.trim(1); + +      // TODO: Update the histogram for the node. +      if (TopNode->FId == FId) +        break; +    } +  } + +  const RootArray &getRoots() const XRAY_NEVER_INSTRUMENT { return Roots; } + +  // The deepCopyInto operation will update the provided FunctionCallTrie by +  // re-creating the contents of this particular FunctionCallTrie in the other +  // FunctionCallTrie. It will do this using a Depth First Traversal from the +  // roots, and while doing so recreating the traversal in the provided +  // FunctionCallTrie. +  // +  // This operation will *not* destroy the state in `O`, and thus may cause some +  // duplicate entries in `O` if it is not empty. +  // +  // This function is *not* thread-safe, and may require external +  // synchronisation of both "this" and |O|. +  // +  // This function must *not* be called with a non-empty FunctionCallTrie |O|. +  void deepCopyInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT { +    DCHECK(O.getRoots().empty()); + +    // We then push the root into a stack, to use as the parent marker for new +    // nodes we push in as we're traversing depth-first down the call tree. 
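+    // (An explicit worklist keeps the copy iterative; recursing per callee
+    // could overflow the real call stack on very deep tries.)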
+    struct NodeAndParent { +      FunctionCallTrie::Node *Node; +      FunctionCallTrie::Node *NewNode; +    }; +    using Stack = Array<NodeAndParent>; + +    typename Stack::AllocatorType StackAllocator( +        profilingFlags()->stack_allocator_max); +    Stack DFSStack(StackAllocator); + +    for (const auto Root : getRoots()) { +      // Add a node in O for this root. +      auto NewRoot = O.Nodes.AppendEmplace( +          nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), Root->CallCount, +          Root->CumulativeLocalTime, Root->FId); + +      // Because we cannot allocate more memory we should bail out right away. +      if (UNLIKELY(NewRoot == nullptr)) +        return; + +      if (UNLIKELY(O.Roots.Append(NewRoot) == nullptr)) +        return; + +      // TODO: Figure out what to do if we fail to allocate any more stack +      // space. Maybe warn or report once? +      if (DFSStack.AppendEmplace(Root, NewRoot) == nullptr) +        return; +      while (!DFSStack.empty()) { +        NodeAndParent NP = DFSStack.back(); +        DCHECK_NE(NP.Node, nullptr); +        DCHECK_NE(NP.NewNode, nullptr); +        DFSStack.trim(1); +        for (const auto Callee : NP.Node->Callees) { +          auto NewNode = O.Nodes.AppendEmplace( +              NP.NewNode, NodeIdPairArray(*O.NodeIdPairAllocator), +              Callee.NodePtr->CallCount, Callee.NodePtr->CumulativeLocalTime, +              Callee.FId); +          if (UNLIKELY(NewNode == nullptr)) +            return; +          if (UNLIKELY(NP.NewNode->Callees.AppendEmplace(NewNode, Callee.FId) == +                       nullptr)) +            return; +          if (UNLIKELY(DFSStack.AppendEmplace(Callee.NodePtr, NewNode) == +                       nullptr)) +            return; +        } +      } +    } +  } + +  // The mergeInto operation will update the provided FunctionCallTrie by +  // traversing the current trie's roots and updating (i.e. merging) the data in +  // the nodes with the data in the target's nodes. If the node doesn't exist in +  // the provided trie, we add a new one in the right position, and inherit the +  // data from the original (current) trie, along with all its callees. +  // +  // This function is *not* thread-safe, and may require external +  // synchronisation of both "this" and |O|. +  void mergeInto(FunctionCallTrie &O) const XRAY_NEVER_INSTRUMENT { +    struct NodeAndTarget { +      FunctionCallTrie::Node *OrigNode; +      FunctionCallTrie::Node *TargetNode; +    }; +    using Stack = Array<NodeAndTarget>; +    typename Stack::AllocatorType StackAllocator( +        profilingFlags()->stack_allocator_max); +    Stack DFSStack(StackAllocator); + +    for (const auto Root : getRoots()) { +      Node *TargetRoot = nullptr; +      auto R = O.Roots.find_element( +          [&](const Node *Node) { return Node->FId == Root->FId; }); +      if (R == nullptr) { +        TargetRoot = O.Nodes.AppendEmplace( +            nullptr, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u, +            Root->FId); +        if (UNLIKELY(TargetRoot == nullptr)) +          return; + +        O.Roots.Append(TargetRoot); +      } else { +        TargetRoot = *R; +      } + +      DFSStack.AppendEmplace(Root, TargetRoot); +      while (!DFSStack.empty()) { +        NodeAndTarget NT = DFSStack.back(); +        DCHECK_NE(NT.OrigNode, nullptr); +        DCHECK_NE(NT.TargetNode, nullptr); +        DFSStack.trim(1); +        // TODO: Update the histogram as well when we have it ready. 
+        NT.TargetNode->CallCount += NT.OrigNode->CallCount; +        NT.TargetNode->CumulativeLocalTime += NT.OrigNode->CumulativeLocalTime; +        for (const auto Callee : NT.OrigNode->Callees) { +          auto TargetCallee = NT.TargetNode->Callees.find_element( +              [&](const FunctionCallTrie::NodeIdPair &C) { +                return C.FId == Callee.FId; +              }); +          if (TargetCallee == nullptr) { +            auto NewTargetNode = O.Nodes.AppendEmplace( +                NT.TargetNode, NodeIdPairArray(*O.NodeIdPairAllocator), 0u, 0u, +                Callee.FId); + +            if (UNLIKELY(NewTargetNode == nullptr)) +              return; + +            TargetCallee = +                NT.TargetNode->Callees.AppendEmplace(NewTargetNode, Callee.FId); +          } +          DFSStack.AppendEmplace(Callee.NodePtr, TargetCallee->NodePtr); +        } +      } +    } +  } +}; + +} // namespace __xray + +#endif // XRAY_FUNCTION_CALL_TRIE_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_hexagon.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_hexagon.cpp new file mode 100644 index 000000000000..7f127b2b499c --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_hexagon.cpp @@ -0,0 +1,168 @@ +//===-- xray_hexagon.cpp --------------------------------------*- C++ ---*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of hexagon-specific routines (32-bit). +// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_interface_internal.h" +#include <assert.h> +#include <atomic> + +namespace __xray { + +// The machine codes for some instructions used in runtime patching. +enum PatchOpcodes : uint32_t { +  PO_JUMPI_14 = 0x5800c00a, // jump #0x014 (PC + 0x014) +  PO_CALLR_R6 = 0x50a6c000, // indirect call: callr r6 +  PO_TFR_IMM = 0x78000000,  // transfer immed +                            // ICLASS 0x7 - S2-type A-type +  PO_IMMEXT = 0x00000000, // constant extender +}; + +enum PacketWordParseBits : uint32_t { +  PP_DUPLEX = 0x00 << 14, +  PP_NOT_END = 0x01 << 14, +  PP_PACKET_END = 0x03 << 14, +}; + +enum RegNum : uint32_t { +  RN_R6 = 0x6, +  RN_R7 = 0x7, +}; + +inline static uint32_t +encodeExtendedTransferImmediate(uint32_t Imm, RegNum DestReg, +                                bool PacketEnd = false) XRAY_NEVER_INSTRUMENT { +  static const uint32_t REG_MASK = 0x1f; +  assert((DestReg & (~REG_MASK)) == 0); +  // The constant-extended register transfer encodes the 6 least +  // significant bits of the effective constant: +  Imm = Imm & 0x03f; +  const PacketWordParseBits ParseBits = PacketEnd ? 
PP_PACKET_END : PP_NOT_END; + +  return PO_TFR_IMM | ParseBits | (Imm << 5) | (DestReg & REG_MASK); +} + +inline static uint32_t +encodeConstantExtender(uint32_t Imm) XRAY_NEVER_INSTRUMENT { +  // Bits   Name      Description +  // -----  -------   ------------------------------------------ +  // 31:28  ICLASS    Instruction class = 0000 +  // 27:16  high      High 12 bits of 26-bit constant extension +  // 15:14  Parse     Parse bits +  // 13:0   low       Low 14 bits of 26-bit constant extension +  static const uint32_t IMM_MASK_LOW = 0x03fff; +  static const uint32_t IMM_MASK_HIGH = 0x00fff << 14; + +  // The extender encodes the 26 most significant bits of the effective +  // constant: +  Imm = Imm >> 6; + +  const uint32_t high = (Imm & IMM_MASK_HIGH) << 16; +  const uint32_t low = Imm & IMM_MASK_LOW; + +  return PO_IMMEXT | high | PP_NOT_END | low; +} + +static void WriteInstFlushCache(void *Addr, uint32_t NewInstruction) { +  asm volatile("icinva(%[inst_addr])\n\t" +               "isync\n\t" +               "memw(%[inst_addr]) = %[new_inst]\n\t" +               "dccleaninva(%[inst_addr])\n\t" +               "syncht\n\t" +               : +               : [ inst_addr ] "r"(Addr), [ new_inst ] "r"(NewInstruction) +               : "memory"); +} + +inline static bool patchSled(const bool Enable, const uint32_t FuncId, +                             const XRaySledEntry &Sled, +                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT { +  // When |Enable| == true, +  // We replace the following compile-time stub (sled): +  // +  // .L_xray_sled_N: +  // <xray_sled_base>: +  // {  jump .Ltmp0 } +  // {  nop +  //    nop +  //    nop +  //    nop } +  // .Ltmp0: + +  // With the following runtime patch: +  // +  // xray_sled_n (32-bit): +  // +  // <xray_sled_n>: +  // {  immext(#...) // upper 26-bits of func id +  //    r7 = ##...   // lower  6-bits of func id +  //    immext(#...) // upper 26-bits of trampoline +  //    r6 = ##... 
}  // lower 6 bits of trampoline +  // {  callr r6 } +  // +  // When |Enable|==false, we set back the first instruction in the sled to be +  // {  jump .Ltmp0 } + +  uint32_t *FirstAddress = reinterpret_cast<uint32_t *>(Sled.address()); +  if (Enable) { +    uint32_t *CurAddress = FirstAddress + 1; +    *CurAddress = encodeExtendedTransferImmediate(FuncId, RN_R7); +    CurAddress++; +    *CurAddress = encodeConstantExtender(reinterpret_cast<uint32_t>(TracingHook)); +    CurAddress++; +    *CurAddress = +        encodeExtendedTransferImmediate(reinterpret_cast<uint32_t>(TracingHook), RN_R6, true); +    CurAddress++; + +    *CurAddress = uint32_t(PO_CALLR_R6); + +    WriteInstFlushCache(FirstAddress, uint32_t(encodeConstantExtender(FuncId))); +  } else { +    WriteInstFlushCache(FirstAddress, uint32_t(PatchOpcodes::PO_JUMPI_14)); +  } +  return true; +} + +bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, +                        const XRaySledEntry &Sled, +                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, Trampoline); +} + +bool patchFunctionExit(const bool Enable, const uint32_t FuncId, +                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, +                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, +                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // FIXME: Implement in hexagon? +  return false; +} + +bool patchTypedEvent(const bool Enable, const uint32_t FuncId, +                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // FIXME: Implement in hexagon? +  return false; +} + +} // namespace __xray + +extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { +  // FIXME: this will have to be implemented in the trampoline assembly file +} diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_init.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_init.cpp new file mode 100644 index 000000000000..f22a31b95686 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_init.cpp @@ -0,0 +1,131 @@ +//===-- xray_init.cpp -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// XRay initialisation logic. 
+//===----------------------------------------------------------------------===//
+
+#include <fcntl.h>
+#include <strings.h>
+#include <unistd.h>
+
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray_defs.h"
+#include "xray_flags.h"
+#include "xray_interface_internal.h"
+
+extern "C" {
+void __xray_init();
+extern const XRaySledEntry __start_xray_instr_map[] __attribute__((weak));
+extern const XRaySledEntry __stop_xray_instr_map[] __attribute__((weak));
+extern const XRayFunctionSledIndex __start_xray_fn_idx[] __attribute__((weak));
+extern const XRayFunctionSledIndex __stop_xray_fn_idx[] __attribute__((weak));
+
+#if SANITIZER_APPLE
+// HACK: This is a temporary workaround to make XRay build on Darwin, but it
+// will probably not work at runtime.
+const XRaySledEntry __start_xray_instr_map[] = {};
+extern const XRaySledEntry __stop_xray_instr_map[] = {};
+extern const XRayFunctionSledIndex __start_xray_fn_idx[] = {};
+extern const XRayFunctionSledIndex __stop_xray_fn_idx[] = {};
+#endif
+}
+
+using namespace __xray;
+
+// When set to 'true', this means the XRay runtime has been initialised. We use
+// the weak symbols defined above (__start_xray_instr_map and
+// __stop_xray_instr_map) to initialise the instrumentation map that XRay uses
+// for runtime patching/unpatching of instrumentation points.
+//
+// FIXME: Support DSO instrumentation maps too. The current solution only works
+// for statically linked executables.
+atomic_uint8_t XRayInitialized{0};
+
+// This should always be updated before XRayInitialized is updated.
+SpinMutex XRayInstrMapMutex;
+XRaySledMap XRayInstrMap;
+
+// Global flag to determine whether the flags have been initialized.
+atomic_uint8_t XRayFlagsInitialized{0};
+
+// A mutex to allow only one thread to initialize the XRay data structures.
+SpinMutex XRayInitMutex;
+
+// __xray_init() will do the actual loading of the current process' memory map
+// and then proceed to look for the .xray_instr_map section/segment.
+void __xray_init() XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayInitMutex);
+  // Short-circuit if we've already initialized XRay before.
+  if (atomic_load(&XRayInitialized, memory_order_acquire))
+    return;
+
+  // XRay is not compatible with PaX MPROTECT.
+  CheckMPROTECT();
+
+  if (!atomic_load(&XRayFlagsInitialized, memory_order_acquire)) {
+    initializeFlags();
+    atomic_store(&XRayFlagsInitialized, true, memory_order_release);
+  }
+
+  if (__start_xray_instr_map == nullptr) {
+    if (Verbosity())
+      Report("XRay instrumentation map missing. 
Not initializing XRay.\n"); +    return; +  } + +  { +    SpinMutexLock Guard(&XRayInstrMapMutex); +    XRayInstrMap.Sleds = __start_xray_instr_map; +    XRayInstrMap.Entries = __stop_xray_instr_map - __start_xray_instr_map; +    if (__start_xray_fn_idx != nullptr) { +      XRayInstrMap.SledsIndex = __start_xray_fn_idx; +      XRayInstrMap.Functions = __stop_xray_fn_idx - __start_xray_fn_idx; +    } else { +      size_t CountFunctions = 0; +      uint64_t LastFnAddr = 0; + +      for (std::size_t I = 0; I < XRayInstrMap.Entries; I++) { +        const auto &Sled = XRayInstrMap.Sleds[I]; +        const auto Function = Sled.function(); +        if (Function != LastFnAddr) { +          CountFunctions++; +          LastFnAddr = Function; +        } +      } + +      XRayInstrMap.Functions = CountFunctions; +    } +  } +  atomic_store(&XRayInitialized, true, memory_order_release); + +#ifndef XRAY_NO_PREINIT +  if (flags()->patch_premain) +    __xray_patch(); +#endif +} + +// FIXME: Make check-xray tests work on FreeBSD without +// SANITIZER_CAN_USE_PREINIT_ARRAY. +// See sanitizer_internal_defs.h where the macro is defined. +// Calling unresolved PLT functions in .preinit_array can lead to deadlock on +// FreeBSD but here it seems benign. +#if !defined(XRAY_NO_PREINIT) &&                                               \ +    (SANITIZER_CAN_USE_PREINIT_ARRAY || SANITIZER_FREEBSD) +// Only add the preinit array initialization if the sanitizers can. +__attribute__((section(".preinit_array"), +               used)) void (*__local_xray_preinit)(void) = __xray_init; +#else +// If we cannot use the .preinit_array section, we should instead use dynamic +// initialisation. +__attribute__ ((constructor (0))) +static void __local_xray_dyninit() { +  __xray_init(); +} +#endif diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_interface.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_interface.cpp new file mode 100644 index 000000000000..5839043fcb93 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_interface.cpp @@ -0,0 +1,530 @@ +//===-- xray_interface.cpp --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of the API functions. 
+// +//===----------------------------------------------------------------------===// + +#include "xray_interface_internal.h" + +#include <cinttypes> +#include <cstdio> +#include <errno.h> +#include <limits> +#include <string.h> +#include <sys/mman.h> + +#if SANITIZER_FUCHSIA +#include <zircon/process.h> +#include <zircon/sanitizer.h> +#include <zircon/status.h> +#include <zircon/syscalls.h> +#endif + +#include "sanitizer_common/sanitizer_addrhashmap.h" +#include "sanitizer_common/sanitizer_common.h" + +#include "xray_defs.h" +#include "xray_flags.h" + +extern __sanitizer::SpinMutex XRayInstrMapMutex; +extern __sanitizer::atomic_uint8_t XRayInitialized; +extern __xray::XRaySledMap XRayInstrMap; + +namespace __xray { + +#if defined(__x86_64__) +static const int16_t cSledLength = 12; +#elif defined(__aarch64__) +static const int16_t cSledLength = 32; +#elif defined(__arm__) +static const int16_t cSledLength = 28; +#elif SANITIZER_LOONGARCH64 +static const int16_t cSledLength = 48; +#elif SANITIZER_MIPS32 +static const int16_t cSledLength = 48; +#elif SANITIZER_MIPS64 +static const int16_t cSledLength = 64; +#elif defined(__powerpc64__) +static const int16_t cSledLength = 8; +#elif defined(__hexagon__) +static const int16_t cSledLength = 20; +#else +#error "Unsupported CPU Architecture" +#endif /* CPU architecture */ + +// This is the function to call when we encounter the entry or exit sleds. +atomic_uintptr_t XRayPatchedFunction{0}; + +// This is the function to call from the arg1-enabled sleds/trampolines. +atomic_uintptr_t XRayArgLogger{0}; + +// This is the function to call when we encounter a custom event log call. +atomic_uintptr_t XRayPatchedCustomEvent{0}; + +// This is the function to call when we encounter a typed event log call. +atomic_uintptr_t XRayPatchedTypedEvent{0}; + +// This is the global status to determine whether we are currently +// patching/unpatching. +atomic_uint8_t XRayPatching{0}; + +struct TypeDescription { +  uint32_t type_id; +  std::size_t description_string_length; +}; + +using TypeDescriptorMapType = AddrHashMap<TypeDescription, 11>; +// An address map from immutable descriptors to type ids. +TypeDescriptorMapType TypeDescriptorAddressMap{}; + +atomic_uint32_t TypeEventDescriptorCounter{0}; + +// MProtectHelper is an RAII wrapper for calls to mprotect(...) that will +// undo any successful mprotect(...) changes. This is used to make a page +// writeable and executable, and upon destruction if it was successful in +// doing so returns the page into a read-only and executable page. +// +// This is only used specifically for runtime-patching of the XRay +// instrumentation points. This assumes that the executable pages are +// originally read-and-execute only. 
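+//
+// Intended use is scoped, along these lines (a sketch, with hypothetical
+// values):
+//
+//   MProtectHelper Protector(PageAlignedAddr, MProtectLen, PageSize);
+//   if (Protector.MakeWriteable() == -1)
+//     return false;            // protections unchanged on failure
+//   ...patch the sleds...      // restored to read+execute by the destructor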
+class MProtectHelper {
+  void *PageAlignedAddr;
+  std::size_t MProtectLen;
+  bool MustCleanup;
+
+public:
+  explicit MProtectHelper(void *PageAlignedAddr,
+                          std::size_t MProtectLen,
+                          std::size_t PageSize) XRAY_NEVER_INSTRUMENT
+      : PageAlignedAddr(PageAlignedAddr),
+        MProtectLen(MProtectLen),
+        MustCleanup(false) {
+#if SANITIZER_FUCHSIA
+    // Round up the stored length to a page multiple; the explicit this-> is
+    // needed so the member, not the shadowing parameter, is updated.
+    this->MProtectLen = RoundUpTo(MProtectLen, PageSize);
+#endif
+  }
+
+  int MakeWriteable() XRAY_NEVER_INSTRUMENT {
+#if SANITIZER_FUCHSIA
+    auto R = __sanitizer_change_code_protection(
+        reinterpret_cast<uintptr_t>(PageAlignedAddr), MProtectLen, true);
+    if (R != ZX_OK) {
+      Report("XRay: cannot change code protection: %s\n",
+             _zx_status_get_string(R));
+      return -1;
+    }
+    MustCleanup = true;
+    return 0;
+#else
+    auto R = mprotect(PageAlignedAddr, MProtectLen,
+                      PROT_READ | PROT_WRITE | PROT_EXEC);
+    if (R != -1)
+      MustCleanup = true;
+    return R;
+#endif
+  }
+
+  ~MProtectHelper() XRAY_NEVER_INSTRUMENT {
+    if (MustCleanup) {
+#if SANITIZER_FUCHSIA
+      auto R = __sanitizer_change_code_protection(
+          reinterpret_cast<uintptr_t>(PageAlignedAddr), MProtectLen, false);
+      if (R != ZX_OK) {
+        Report("XRay: cannot change code protection: %s\n",
+               _zx_status_get_string(R));
+      }
+#else
+      mprotect(PageAlignedAddr, MProtectLen, PROT_READ | PROT_EXEC);
+#endif
+    }
+  }
+};
+
+namespace {
+
+bool patchSled(const XRaySledEntry &Sled, bool Enable,
+               int32_t FuncId) XRAY_NEVER_INSTRUMENT {
+  bool Success = false;
+  switch (Sled.Kind) {
+  case XRayEntryType::ENTRY:
+    Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_FunctionEntry);
+    break;
+  case XRayEntryType::EXIT:
+    Success = patchFunctionExit(Enable, FuncId, Sled);
+    break;
+  case XRayEntryType::TAIL:
+    Success = patchFunctionTailExit(Enable, FuncId, Sled);
+    break;
+  case XRayEntryType::LOG_ARGS_ENTRY:
+    Success = patchFunctionEntry(Enable, FuncId, Sled, __xray_ArgLoggerEntry);
+    break;
+  case XRayEntryType::CUSTOM_EVENT:
+    Success = patchCustomEvent(Enable, FuncId, Sled);
+    break;
+  case XRayEntryType::TYPED_EVENT:
+    Success = patchTypedEvent(Enable, FuncId, Sled);
+    break;
+  default:
+    // The sled kind is a small integer; the address is a 64-bit value.
+    Report("Unsupported sled kind '%d' at address %" PRIu64 "\n",
+           int(Sled.Kind), uint64_t(Sled.Address));
+    return false;
+  }
+  return Success;
+}
+
+const XRayFunctionSledIndex
+findFunctionSleds(int32_t FuncId,
+                  const XRaySledMap &InstrMap) XRAY_NEVER_INSTRUMENT {
+  int32_t CurFn = 0;
+  uint64_t LastFnAddr = 0;
+  XRayFunctionSledIndex Index = {nullptr, 0};
+
+  for (std::size_t I = 0; I < InstrMap.Entries && CurFn <= FuncId; I++) {
+    const auto &Sled = InstrMap.Sleds[I];
+    const auto Function = Sled.function();
+    if (Function != LastFnAddr) {
+      CurFn++;
+      LastFnAddr = Function;
+    }
+
+    if (CurFn == FuncId) {
+      if (Index.Begin == nullptr)
+        Index.Begin = &Sled;
+      Index.Size = &Sled - Index.Begin + 1;
+    }
+  }
+
+  return Index;
+}
+
+XRayPatchingStatus patchFunction(int32_t FuncId,
+                                 bool Enable) XRAY_NEVER_INSTRUMENT {
+  if (!atomic_load(&XRayInitialized, memory_order_acquire))
+    return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized.
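+
+  // XRayPatching is a simple test-and-set guard: exactly one thread may be
+  // patching or unpatching at any time, and concurrent callers observe
+  // ONGOING instead of blocking.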
+  uint8_t NotPatching = false;
+  if (!atomic_compare_exchange_strong(
+          &XRayPatching, &NotPatching, true, memory_order_acq_rel))
+    return XRayPatchingStatus::ONGOING; // Already patching.
+
+  // Next, we look for the function index.
+  XRaySledMap InstrMap;
+  {
+    SpinMutexLock Guard(&XRayInstrMapMutex);
+    InstrMap = XRayInstrMap;
+  }
+
+  // If we don't have an index, we can't patch individual functions. Release
+  // the patching guard before bailing out.
+  if (InstrMap.Functions == 0) {
+    atomic_store(&XRayPatching, false, memory_order_release);
+    return XRayPatchingStatus::NOT_INITIALIZED;
+  }
+
+  // FuncId must be a positive number, no greater than the number of functions
+  // instrumented.
+  if (FuncId <= 0 || static_cast<size_t>(FuncId) > InstrMap.Functions) {
+    Report("Invalid function id provided: %d\n", FuncId);
+    atomic_store(&XRayPatching, false, memory_order_release);
+    return XRayPatchingStatus::FAILED;
+  }
+
+  // Now we patch the sleds for this specific function.
+  XRayFunctionSledIndex SledRange;
+  if (InstrMap.SledsIndex) {
+    SledRange = {InstrMap.SledsIndex[FuncId - 1].fromPCRelative(),
+                 InstrMap.SledsIndex[FuncId - 1].Size};
+  } else {
+    SledRange = findFunctionSleds(FuncId, InstrMap);
+  }
+  auto *f = SledRange.Begin;
+  bool SucceedOnce = false;
+  for (size_t i = 0; i != SledRange.Size; ++i)
+    SucceedOnce |= patchSled(f[i], Enable, FuncId);
+
+  atomic_store(&XRayPatching, false, memory_order_release);
+
+  if (!SucceedOnce) {
+    Report("Failed patching any sled for function '%d'.\n", FuncId);
+    return XRayPatchingStatus::FAILED;
+  }
+
+  return XRayPatchingStatus::SUCCESS;
+}
+
+// controlPatching implements the common internals of the patching/unpatching
+// implementation. |Enable| defines whether we're enabling or disabling the
+// runtime XRay instrumentation.
+XRayPatchingStatus controlPatching(bool Enable) XRAY_NEVER_INSTRUMENT {
+  if (!atomic_load(&XRayInitialized, memory_order_acquire))
+    return XRayPatchingStatus::NOT_INITIALIZED; // Not initialized.
+
+  uint8_t NotPatching = false;
+  if (!atomic_compare_exchange_strong(
+          &XRayPatching, &NotPatching, true, memory_order_acq_rel))
+    return XRayPatchingStatus::ONGOING; // Already patching.
+
+  uint8_t PatchingSuccess = false;
+  auto XRayPatchingStatusResetter = at_scope_exit([&PatchingSuccess] {
+    if (!PatchingSuccess)
+      atomic_store(&XRayPatching, false, memory_order_release);
+  });
+
+  XRaySledMap InstrMap;
+  {
+    SpinMutexLock Guard(&XRayInstrMapMutex);
+    InstrMap = XRayInstrMap;
+  }
+  if (InstrMap.Entries == 0)
+    return XRayPatchingStatus::NOT_INITIALIZED;
+
+  uint32_t FuncId = 1;
+  uint64_t CurFun = 0;
+
+  // First we want to find the bounds for which we have instrumentation points,
+  // and try to get as few calls to mprotect(...) as possible. We're assuming
+  // that all the sleds for the instrumentation map are contiguous as a single
+  // set of pages. When we do support dynamic shared object instrumentation,
+  // we'll need to do this for each set of page load offsets per DSO loaded. For
+  // now we're assuming we can mprotect the whole section of text between the
+  // minimum sled address and the maximum sled address (+ the largest sled
+  // size).
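+  //
+  // For example, with a 4096-byte page size and sleds spanning 0x40123450
+  // through 0x40125678, the range is aligned down to 0x40123000 and covers
+  // (0x40125678 - 0x40123000) + cSledLength bytes, so a single mprotect(...)
+  // call suffices. (Addresses are illustrative.)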
+  auto *MinSled = &InstrMap.Sleds[0]; +  auto *MaxSled = &InstrMap.Sleds[InstrMap.Entries - 1]; +  for (std::size_t I = 0; I < InstrMap.Entries; I++) { +    const auto &Sled = InstrMap.Sleds[I]; +    if (Sled.address() < MinSled->address()) +      MinSled = &Sled; +    if (Sled.address() > MaxSled->address()) +      MaxSled = &Sled; +  } + +  const size_t PageSize = flags()->xray_page_size_override > 0 +                              ? flags()->xray_page_size_override +                              : GetPageSizeCached(); +  if ((PageSize == 0) || ((PageSize & (PageSize - 1)) != 0)) { +    Report("System page size is not a power of two: %zu\n", PageSize); +    return XRayPatchingStatus::FAILED; +  } + +  void *PageAlignedAddr = +      reinterpret_cast<void *>(MinSled->address() & ~(PageSize - 1)); +  size_t MProtectLen = +      (MaxSled->address() - reinterpret_cast<uptr>(PageAlignedAddr)) + +      cSledLength; +  MProtectHelper Protector(PageAlignedAddr, MProtectLen, PageSize); +  if (Protector.MakeWriteable() == -1) { +    Report("Failed mprotect: %d\n", errno); +    return XRayPatchingStatus::FAILED; +  } + +  for (std::size_t I = 0; I < InstrMap.Entries; ++I) { +    auto &Sled = InstrMap.Sleds[I]; +    auto F = Sled.function(); +    if (CurFun == 0) +      CurFun = F; +    if (F != CurFun) { +      ++FuncId; +      CurFun = F; +    } +    patchSled(Sled, Enable, FuncId); +  } +  atomic_store(&XRayPatching, false, +                            memory_order_release); +  PatchingSuccess = true; +  return XRayPatchingStatus::SUCCESS; +} + +XRayPatchingStatus mprotectAndPatchFunction(int32_t FuncId, +                                            bool Enable) XRAY_NEVER_INSTRUMENT { +  XRaySledMap InstrMap; +  { +    SpinMutexLock Guard(&XRayInstrMapMutex); +    InstrMap = XRayInstrMap; +  } + +  // FuncId must be a positive number, less than the number of functions +  // instrumented. +  if (FuncId <= 0 || static_cast<size_t>(FuncId) > InstrMap.Functions) { +    Report("Invalid function id provided: %d\n", FuncId); +    return XRayPatchingStatus::FAILED; +  } + +  const size_t PageSize = flags()->xray_page_size_override > 0 +                              ? flags()->xray_page_size_override +                              : GetPageSizeCached(); +  if ((PageSize == 0) || ((PageSize & (PageSize - 1)) != 0)) { +    Report("Provided page size is not a power of two: %zu\n", PageSize); +    return XRayPatchingStatus::FAILED; +  } + +  // Here we compute the minimum sled and maximum sled associated with a +  // particular function ID. 
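+  // Unlike controlPatching() above, which spans every sled in the map, the
+  // range computed here covers only this one function's sleds, so the
+  // mprotect(...) window is typically just a page or two.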
+  XRayFunctionSledIndex SledRange; +  if (InstrMap.SledsIndex) { +    SledRange = {InstrMap.SledsIndex[FuncId - 1].fromPCRelative(), +                 InstrMap.SledsIndex[FuncId - 1].Size}; +  } else { +    SledRange = findFunctionSleds(FuncId, InstrMap); +  } +  auto *f = SledRange.Begin; +  auto *e = SledRange.Begin + SledRange.Size; +  auto *MinSled = f; +  auto *MaxSled = e - 1; +  while (f != e) { +    if (f->address() < MinSled->address()) +      MinSled = f; +    if (f->address() > MaxSled->address()) +      MaxSled = f; +    ++f; +  } + +  void *PageAlignedAddr = +      reinterpret_cast<void *>(MinSled->address() & ~(PageSize - 1)); +  size_t MProtectLen = +      (MaxSled->address() - reinterpret_cast<uptr>(PageAlignedAddr)) + +      cSledLength; +  MProtectHelper Protector(PageAlignedAddr, MProtectLen, PageSize); +  if (Protector.MakeWriteable() == -1) { +    Report("Failed mprotect: %d\n", errno); +    return XRayPatchingStatus::FAILED; +  } +  return patchFunction(FuncId, Enable); +} + +} // namespace + +} // namespace __xray + +using namespace __xray; + +// The following functions are declared `extern "C" {...}` in the header, hence +// they're defined in the global namespace. + +int __xray_set_handler(void (*entry)(int32_t, +                                     XRayEntryType)) XRAY_NEVER_INSTRUMENT { +  if (atomic_load(&XRayInitialized, +                               memory_order_acquire)) { + +    atomic_store(&__xray::XRayPatchedFunction, +                              reinterpret_cast<uintptr_t>(entry), +                              memory_order_release); +    return 1; +  } +  return 0; +} + +int __xray_set_customevent_handler(void (*entry)(void *, size_t)) +    XRAY_NEVER_INSTRUMENT { +  if (atomic_load(&XRayInitialized, +                               memory_order_acquire)) { +    atomic_store(&__xray::XRayPatchedCustomEvent, +                              reinterpret_cast<uintptr_t>(entry), +                              memory_order_release); +    return 1; +  } +  return 0; +} + +int __xray_set_typedevent_handler(void (*entry)(size_t, const void *, +                                                size_t)) XRAY_NEVER_INSTRUMENT { +  if (atomic_load(&XRayInitialized, +                               memory_order_acquire)) { +    atomic_store(&__xray::XRayPatchedTypedEvent, +                              reinterpret_cast<uintptr_t>(entry), +                              memory_order_release); +    return 1; +  } +  return 0; +} + +int __xray_remove_handler() XRAY_NEVER_INSTRUMENT { +  return __xray_set_handler(nullptr); +} + +int __xray_remove_customevent_handler() XRAY_NEVER_INSTRUMENT { +  return __xray_set_customevent_handler(nullptr); +} + +int __xray_remove_typedevent_handler() XRAY_NEVER_INSTRUMENT { +  return __xray_set_typedevent_handler(nullptr); +} + +uint16_t __xray_register_event_type( +    const char *const event_type) XRAY_NEVER_INSTRUMENT { +  TypeDescriptorMapType::Handle h(&TypeDescriptorAddressMap, (uptr)event_type); +  if (h.created()) { +    h->type_id = atomic_fetch_add( +        &TypeEventDescriptorCounter, 1, memory_order_acq_rel); +    h->description_string_length = strnlen(event_type, 1024); +  } +  return h->type_id; +} + +XRayPatchingStatus __xray_patch() XRAY_NEVER_INSTRUMENT { +  return controlPatching(true); +} + +XRayPatchingStatus __xray_unpatch() XRAY_NEVER_INSTRUMENT { +  return controlPatching(false); +} + +XRayPatchingStatus __xray_patch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { +  return mprotectAndPatchFunction(FuncId, 
true); +} + +XRayPatchingStatus +__xray_unpatch_function(int32_t FuncId) XRAY_NEVER_INSTRUMENT { +  return mprotectAndPatchFunction(FuncId, false); +} + +int __xray_set_handler_arg1(void (*entry)(int32_t, XRayEntryType, uint64_t)) { +  if (!atomic_load(&XRayInitialized, +                                memory_order_acquire)) +    return 0; + +  // A relaxed write might not be visible even if the current thread gets +  // scheduled on a different CPU/NUMA node.  We need to wait for everyone to +  // have this handler installed for consistency of collected data across CPUs. +  atomic_store(&XRayArgLogger, reinterpret_cast<uint64_t>(entry), +                            memory_order_release); +  return 1; +} + +int __xray_remove_handler_arg1() { return __xray_set_handler_arg1(nullptr); } + +uintptr_t __xray_function_address(int32_t FuncId) XRAY_NEVER_INSTRUMENT { +  XRaySledMap InstrMap; +  { +    SpinMutexLock Guard(&XRayInstrMapMutex); +    InstrMap = XRayInstrMap; +  } + +  if (FuncId <= 0 || static_cast<size_t>(FuncId) > InstrMap.Functions) +    return 0; +  const XRaySledEntry *Sled = +      InstrMap.SledsIndex ? InstrMap.SledsIndex[FuncId - 1].fromPCRelative() +                          : findFunctionSleds(FuncId, InstrMap).Begin; +  return Sled->function() +// On PPC, function entries are always aligned to 16 bytes. The beginning of a +// sled might be a local entry, which is always +8 based on the global entry. +// Always return the global entry. +#ifdef __PPC__ +         & ~0xf +#endif +      ; +} + +size_t __xray_max_function_id() XRAY_NEVER_INSTRUMENT { +  SpinMutexLock Guard(&XRayInstrMapMutex); +  return XRayInstrMap.Functions; +} diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_interface_internal.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_interface_internal.h new file mode 100644 index 000000000000..80c07c167f64 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_interface_internal.h @@ -0,0 +1,102 @@ +//===-- xray_interface_internal.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of the API functions. See also include/xray/xray_interface.h. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_INTERFACE_INTERNAL_H +#define XRAY_INTERFACE_INTERNAL_H + +#include "sanitizer_common/sanitizer_platform.h" +#include "xray/xray_interface.h" +#include <cstddef> +#include <cstdint> + +extern "C" { + +struct XRaySledEntry { +#if SANITIZER_WORDSIZE == 64 +  uint64_t Address; +  uint64_t Function; +  unsigned char Kind; +  unsigned char AlwaysInstrument; +  unsigned char Version; +  unsigned char Padding[13]; // Need 32 bytes +  uint64_t function() const { +    // The target address is relative to the location of the Function variable. +    return reinterpret_cast<uint64_t>(&Function) + Function; +  } +  uint64_t address() const { +    // The target address is relative to the location of the Address variable. 
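+    // For example, if &Address is 0x1000 and the stored value is 0x80, the
+    // sled lives at 0x1080; keeping offsets rather than absolute addresses
+    // makes the xray_instr_map section position-independent.
+    // (Addresses are illustrative.)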
+    return reinterpret_cast<uint64_t>(&Address) + Address; +  } +#elif SANITIZER_WORDSIZE == 32 +  uint32_t Address; +  uint32_t Function; +  unsigned char Kind; +  unsigned char AlwaysInstrument; +  unsigned char Version; +  unsigned char Padding[5]; // Need 16 bytes +  uint32_t function() const { +    // The target address is relative to the location of the Function variable. +    return reinterpret_cast<uint32_t>(&Function) + Function; +  } +  uint32_t address() const { +    // The target address is relative to the location of the Address variable. +    return reinterpret_cast<uint32_t>(&Address) + Address; +  } +#else +#error "Unsupported word size." +#endif +}; + +struct XRayFunctionSledIndex { +  const XRaySledEntry *Begin; +  size_t Size; +  // For an entry in the xray_fn_idx section, the address is relative to the +  // location of the Begin variable. +  const XRaySledEntry *fromPCRelative() const { +    return reinterpret_cast<const XRaySledEntry *>(uintptr_t(&Begin) + +                                                   uintptr_t(Begin)); +  } +}; +} + +namespace __xray { + +struct XRaySledMap { +  const XRaySledEntry *Sleds; +  size_t Entries; +  const XRayFunctionSledIndex *SledsIndex; +  size_t Functions; +}; + +bool patchFunctionEntry(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled, +                        void (*Trampoline)()); +bool patchFunctionExit(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); +bool patchFunctionTailExit(bool Enable, uint32_t FuncId, +                           const XRaySledEntry &Sled); +bool patchCustomEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); +bool patchTypedEvent(bool Enable, uint32_t FuncId, const XRaySledEntry &Sled); + +} // namespace __xray + +extern "C" { +// The following functions have to be defined in assembler, on a per-platform +// basis. See xray_trampoline_*.S files for implementations. +extern void __xray_FunctionEntry(); +extern void __xray_FunctionExit(); +extern void __xray_FunctionTailExit(); +extern void __xray_ArgLoggerEntry(); +extern void __xray_CustomEvent(); +extern void __xray_TypedEvent(); +} + +#endif diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_log_interface.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_log_interface.cpp new file mode 100644 index 000000000000..fc70373f9dac --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_log_interface.cpp @@ -0,0 +1,209 @@ +//===-- xray_log_interface.cpp --------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a function call tracing system. +// +//===----------------------------------------------------------------------===// +#include "xray/xray_log_interface.h" + +#include "sanitizer_common/sanitizer_allocator_internal.h" +#include "sanitizer_common/sanitizer_atomic.h" +#include "sanitizer_common/sanitizer_mutex.h" +#include "xray/xray_interface.h" +#include "xray_defs.h" + +namespace __xray { +static SpinMutex XRayImplMutex; +static XRayLogImpl CurrentXRayImpl{nullptr, nullptr, nullptr, nullptr}; +static XRayLogImpl *GlobalXRayImpl = nullptr; + +// This is the default implementation of a buffer iterator, which always yields +// a null buffer. 
+XRayBuffer NullBufferIterator(XRayBuffer) XRAY_NEVER_INSTRUMENT { +  return {nullptr, 0}; +} + +// This is the global function responsible for iterating through given buffers. +atomic_uintptr_t XRayBufferIterator{ +    reinterpret_cast<uintptr_t>(&NullBufferIterator)}; + +// We use a linked list of Mode to XRayLogImpl mappings. This is a linked list +// when it should be a map because we're avoiding having to depend on C++ +// standard library data structures at this level of the implementation. +struct ModeImpl { +  ModeImpl *Next; +  const char *Mode; +  XRayLogImpl Impl; +}; + +static ModeImpl SentinelModeImpl{ +    nullptr, nullptr, {nullptr, nullptr, nullptr, nullptr}}; +static ModeImpl *ModeImpls = &SentinelModeImpl; +static const ModeImpl *CurrentMode = nullptr; + +} // namespace __xray + +using namespace __xray; + +void __xray_log_set_buffer_iterator(XRayBuffer (*Iterator)(XRayBuffer)) +    XRAY_NEVER_INSTRUMENT { +  atomic_store(&__xray::XRayBufferIterator, +               reinterpret_cast<uintptr_t>(Iterator), memory_order_release); +} + +void __xray_log_remove_buffer_iterator() XRAY_NEVER_INSTRUMENT { +  __xray_log_set_buffer_iterator(&NullBufferIterator); +} + +XRayLogRegisterStatus +__xray_log_register_mode(const char *Mode, +                         XRayLogImpl Impl) XRAY_NEVER_INSTRUMENT { +  if (Impl.flush_log == nullptr || Impl.handle_arg0 == nullptr || +      Impl.log_finalize == nullptr || Impl.log_init == nullptr) +    return XRayLogRegisterStatus::XRAY_INCOMPLETE_IMPL; + +  SpinMutexLock Guard(&XRayImplMutex); +  // First, look for whether the mode already has a registered implementation. +  for (ModeImpl *it = ModeImpls; it != &SentinelModeImpl; it = it->Next) { +    if (!internal_strcmp(Mode, it->Mode)) +      return XRayLogRegisterStatus::XRAY_DUPLICATE_MODE; +  } +  auto *NewModeImpl = static_cast<ModeImpl *>(InternalAlloc(sizeof(ModeImpl))); +  NewModeImpl->Next = ModeImpls; +  NewModeImpl->Mode = internal_strdup(Mode); +  NewModeImpl->Impl = Impl; +  ModeImpls = NewModeImpl; +  return XRayLogRegisterStatus::XRAY_REGISTRATION_OK; +} + +XRayLogRegisterStatus +__xray_log_select_mode(const char *Mode) XRAY_NEVER_INSTRUMENT { +  SpinMutexLock Guard(&XRayImplMutex); +  for (ModeImpl *it = ModeImpls; it != &SentinelModeImpl; it = it->Next) { +    if (!internal_strcmp(Mode, it->Mode)) { +      CurrentMode = it; +      CurrentXRayImpl = it->Impl; +      GlobalXRayImpl = &CurrentXRayImpl; +      __xray_set_handler(it->Impl.handle_arg0); +      return XRayLogRegisterStatus::XRAY_REGISTRATION_OK; +    } +  } +  return XRayLogRegisterStatus::XRAY_MODE_NOT_FOUND; +} + +const char *__xray_log_get_current_mode() XRAY_NEVER_INSTRUMENT { +  SpinMutexLock Guard(&XRayImplMutex); +  if (CurrentMode != nullptr) +    return CurrentMode->Mode; +  return nullptr; +} + +void __xray_set_log_impl(XRayLogImpl Impl) XRAY_NEVER_INSTRUMENT { +  if (Impl.log_init == nullptr || Impl.log_finalize == nullptr || +      Impl.handle_arg0 == nullptr || Impl.flush_log == nullptr) { +    SpinMutexLock Guard(&XRayImplMutex); +    GlobalXRayImpl = nullptr; +    CurrentMode = nullptr; +    __xray_remove_handler(); +    __xray_remove_handler_arg1(); +    return; +  } + +  SpinMutexLock Guard(&XRayImplMutex); +  CurrentXRayImpl = Impl; +  GlobalXRayImpl = &CurrentXRayImpl; +  __xray_set_handler(Impl.handle_arg0); +} + +void __xray_remove_log_impl() XRAY_NEVER_INSTRUMENT { +  SpinMutexLock Guard(&XRayImplMutex); +  GlobalXRayImpl = nullptr; +  __xray_remove_handler(); +  __xray_remove_handler_arg1(); +} + 
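+// Typical usage of the mode registry (hypothetical consumer code; the
+// myLogInit/myLogFinalize/myHandleArg0/myFlushLog names are illustrative):
+//
+//   XRayLogImpl Impl{myLogInit, myLogFinalize, myHandleArg0, myFlushLog};
+//   __xray_log_register_mode("example-mode", Impl);
+//   __xray_log_select_mode("example-mode");
+//   __xray_log_init_mode("example-mode", "some_flag=value");
+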
+XRayLogInitStatus __xray_log_init(size_t BufferSize, size_t MaxBuffers,
+                                  void *Args,
+                                  size_t ArgsSize) XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex);
+  if (!GlobalXRayImpl)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  return GlobalXRayImpl->log_init(BufferSize, MaxBuffers, Args, ArgsSize);
+}
+
+XRayLogInitStatus __xray_log_init_mode(const char *Mode, const char *Config)
+    XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex);
+  if (!GlobalXRayImpl)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  if (Config == nullptr)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // Check first whether the current mode is the same as what we expect.
+  if (CurrentMode == nullptr || internal_strcmp(CurrentMode->Mode, Mode) != 0)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // Here we do some work to coerce the pointer we're provided, so that
+  // the implementations that still take void* pointers can handle the
+  // data provided in the Config argument.
+  return GlobalXRayImpl->log_init(
+      0, 0, const_cast<void *>(static_cast<const void *>(Config)), 0);
+}
+
+XRayLogInitStatus
+__xray_log_init_mode_bin(const char *Mode, const char *Config,
+                         size_t ConfigSize) XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex);
+  if (!GlobalXRayImpl)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  if (Config == nullptr)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // Check first whether the current mode is the same as what we expect.
+  if (CurrentMode == nullptr || internal_strcmp(CurrentMode->Mode, Mode) != 0)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+
+  // Here we do some work to coerce the pointer we're provided, so that
+  // the implementations that still take void* pointers can handle the
+  // data provided in the Config argument.
+  return GlobalXRayImpl->log_init(
+      0, 0, const_cast<void *>(static_cast<const void *>(Config)), ConfigSize);
+}
+
+XRayLogInitStatus __xray_log_finalize() XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex);
+  if (!GlobalXRayImpl)
+    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED;
+  return GlobalXRayImpl->log_finalize();
+}
+
+XRayLogFlushStatus __xray_log_flushLog() XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Guard(&XRayImplMutex);
+  if (!GlobalXRayImpl)
+    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+  return GlobalXRayImpl->flush_log();
+}
+
+XRayLogFlushStatus __xray_log_process_buffers(
+    void (*Processor)(const char *, XRayBuffer)) XRAY_NEVER_INSTRUMENT {
+  // Load the buffer iterator with acquire semantics (rather than taking a
+  // lock) so that we observe the most recently installed iterator before
+  // walking the buffers it yields.
+  if (!GlobalXRayImpl)
+    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+  auto Iterator = reinterpret_cast<XRayBuffer (*)(XRayBuffer)>(
+      atomic_load(&XRayBufferIterator, memory_order_acquire));
+  auto Buffer = (*Iterator)(XRayBuffer{nullptr, 0});
+  auto Mode = CurrentMode ?
CurrentMode->Mode : nullptr; +  while (Buffer.Data != nullptr) { +    (*Processor)(Mode, Buffer); +    Buffer = (*Iterator)(Buffer); +  } +  return XRayLogFlushStatus::XRAY_LOG_FLUSHED; +} diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_loongarch64.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_loongarch64.cpp new file mode 100644 index 000000000000..b839adba00d2 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_loongarch64.cpp @@ -0,0 +1,160 @@ +//===-------- xray_loongarch64.cpp ------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of loongarch-specific routines. +// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_interface_internal.h" +#include <atomic> + +namespace __xray { + +enum RegNum : uint32_t { +  RN_RA = 1, +  RN_SP = 3, +  RN_T0 = 12, +  RN_T1 = 13, +}; + +// Encode instructions in the 2RIx format, where the primary formats here +// are 2RI12-type and 2RI16-type. +static inline uint32_t +encodeInstruction2RIx(uint32_t Opcode, uint32_t Rd, uint32_t Rj, +                      uint32_t Imm) XRAY_NEVER_INSTRUMENT { +  return Opcode | (Imm << 10) | (Rj << 5) | Rd; +} + +// Encode instructions in 1RI20 format, e.g. lu12i.w/lu32i.d. +static inline uint32_t +encodeInstruction1RI20(uint32_t Opcode, uint32_t Rd, +                       uint32_t Imm) XRAY_NEVER_INSTRUMENT { +  return Opcode | (Imm << 5) | Rd; +} + +static inline bool patchSled(const bool Enable, const uint32_t FuncId, +                             const XRaySledEntry &Sled, +                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT { +  // When |Enable| == true, +  // We replace the following compile-time stub (sled): +  // +  // .Lxray_sled_beginN: +  //	B .Lxray_sled_endN +  //	11 NOPs (44 bytes) +  // .Lxray_sled_endN: +  // +  // With the following runtime patch: +  // +  // xray_sled_n: +  //   addi.d  sp, sp, -16                       ; create the stack frame +  //   st.d    ra, sp, 8                         ; save the return address +  //   lu12i.w t0, %abs_hi20(__xray_FunctionEntry/Exit) +  //   ori     t0, t0, %abs_lo12(__xray_FunctionEntry/Exit) +  //   lu32i.d t0, %abs64_lo20(__xray_FunctionEntry/Exit) +  //   lu52i.d t0, t0, %abs64_hi12(__xray_FunctionEntry/Exit) +  //   lu12i.w t1, %abs_hi20(function_id) +  //   ori     t1, t1, %abs_lo12(function_id)    ; pass the function id +  //   jirl    ra, t0, 0                         ; call the tracing hook +  //   ld.d    ra, sp, 8                         ; restore the return address +  //   addi.d  sp, sp, 16                        ; de-allocate the stack frame +  // +  // Replacement of the first 4-byte instruction should be the last and atomic +  // operation, so that the user code which reaches the sled concurrently +  // either jumps over the whole sled, or executes the whole sled when the +  // latter is ready. 
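+  //
+  // For example, with the tracing hook at 0x0000'1234'5678'9abc, the pieces
+  // are lo12 = 0xabc, hi20 = 0x56789, higher20 = 0x01234 and highest12 =
+  // 0x000, reassembled into t0 by the lu12i.w/ori/lu32i.d/lu52i.d sequence
+  // above. (The address is illustrative.)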
+  // +  // When |Enable|==false, we set the first instruction in the sled back to +  //   B #48 + +  uint32_t *Address = reinterpret_cast<uint32_t *>(Sled.address()); +  if (Enable) { +    uint32_t LoTracingHookAddr = reinterpret_cast<int64_t>(TracingHook) & 0xfff; +    uint32_t HiTracingHookAddr = +        (reinterpret_cast<int64_t>(TracingHook) >> 12) & 0xfffff; +    uint32_t HigherTracingHookAddr = +        (reinterpret_cast<int64_t>(TracingHook) >> 32) & 0xfffff; +    uint32_t HighestTracingHookAddr = +        (reinterpret_cast<int64_t>(TracingHook) >> 52) & 0xfff; +    uint32_t LoFunctionID = FuncId & 0xfff; +    uint32_t HiFunctionID = (FuncId >> 12) & 0xfffff; +    Address[1] = encodeInstruction2RIx(0x29c00000, RegNum::RN_RA, RegNum::RN_SP, +                                       0x8); // st.d ra, sp, 8 +    Address[2] = encodeInstruction1RI20( +        0x14000000, RegNum::RN_T0, +        HiTracingHookAddr); // lu12i.w t0, HiTracingHookAddr +    Address[3] = encodeInstruction2RIx( +        0x03800000, RegNum::RN_T0, RegNum::RN_T0, +        LoTracingHookAddr); // ori t0, t0, LoTracingHookAddr +    Address[4] = encodeInstruction1RI20( +        0x16000000, RegNum::RN_T0, +        HigherTracingHookAddr); // lu32i.d t0, HigherTracingHookAddr +    Address[5] = encodeInstruction2RIx( +        0x03000000, RegNum::RN_T0, RegNum::RN_T0, +        HighestTracingHookAddr); // lu52i.d t0, t0, HighestTracingHookAddr +    Address[6] = +        encodeInstruction1RI20(0x14000000, RegNum::RN_T1, +                               HiFunctionID); // lu12i.w t1, HiFunctionID +    Address[7] = +        encodeInstruction2RIx(0x03800000, RegNum::RN_T1, RegNum::RN_T1, +                              LoFunctionID); // ori t1, t1, LoFunctionID +    Address[8] = encodeInstruction2RIx(0x4c000000, RegNum::RN_RA, RegNum::RN_T0, +                                       0); // jirl ra, t0, 0 +    Address[9] = encodeInstruction2RIx(0x28c00000, RegNum::RN_RA, RegNum::RN_SP, +                                       0x8); // ld.d ra, sp, 8 +    Address[10] = encodeInstruction2RIx( +        0x02c00000, RegNum::RN_SP, RegNum::RN_SP, 0x10); // addi.d sp, sp, 16 +    uint32_t CreateStackSpace = encodeInstruction2RIx( +        0x02c00000, RegNum::RN_SP, RegNum::RN_SP, 0xff0); // addi.d sp, sp, -16 +    std::atomic_store_explicit( +        reinterpret_cast<std::atomic<uint32_t> *>(Address), CreateStackSpace, +        std::memory_order_release); +  } else { +    std::atomic_store_explicit( +        reinterpret_cast<std::atomic<uint32_t> *>(Address), +        uint32_t(0x50003000), std::memory_order_release); // b #48 +  } +  return true; +} + +bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, +                        const XRaySledEntry &Sled, +                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, Trampoline); +} + +bool patchFunctionExit(const bool Enable, const uint32_t FuncId, +                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, +                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // TODO: In the future we'd need to distinguish between non-tail exits and +  // tail exits for better information preservation. 
+  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, +                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // FIXME: Implement in loongarch? +  return false; +} + +bool patchTypedEvent(const bool Enable, const uint32_t FuncId, +                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // FIXME: Implement in loongarch? +  return false; +} +} // namespace __xray + +extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { +  // TODO: This will have to be implemented in the trampoline assembly file. +} diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_mips.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_mips.cpp new file mode 100644 index 000000000000..dc9e837a555d --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_mips.cpp @@ -0,0 +1,171 @@ +//===-- xray_mips.cpp -------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of MIPS-specific routines (32-bit). +// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_interface_internal.h" +#include <atomic> + +namespace __xray { + +// The machine codes for some instructions used in runtime patching. +enum PatchOpcodes : uint32_t { +  PO_ADDIU = 0x24000000, // addiu rt, rs, imm +  PO_SW = 0xAC000000,    // sw rt, offset(sp) +  PO_LUI = 0x3C000000,   // lui rs, %hi(address) +  PO_ORI = 0x34000000,   // ori rt, rs, %lo(address) +  PO_JALR = 0x0000F809,  // jalr rs +  PO_LW = 0x8C000000,    // lw rt, offset(address) +  PO_B44 = 0x1000000b,   // b #44 +  PO_NOP = 0x0,          // nop +}; + +enum RegNum : uint32_t { +  RN_T0 = 0x8, +  RN_T9 = 0x19, +  RN_RA = 0x1F, +  RN_SP = 0x1D, +}; + +inline static uint32_t encodeInstruction(uint32_t Opcode, uint32_t Rs, +                                         uint32_t Rt, +                                         uint32_t Imm) XRAY_NEVER_INSTRUMENT { +  return (Opcode | Rs << 21 | Rt << 16 | Imm); +} + +inline static uint32_t +encodeSpecialInstruction(uint32_t Opcode, uint32_t Rs, uint32_t Rt, uint32_t Rd, +                         uint32_t Imm) XRAY_NEVER_INSTRUMENT { +  return (Rs << 21 | Rt << 16 | Rd << 11 | Imm << 6 | Opcode); +} + +inline static bool patchSled(const bool Enable, const uint32_t FuncId, +                             const XRaySledEntry &Sled, +                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT { +  // When |Enable| == true, +  // We replace the following compile-time stub (sled): +  // +  // xray_sled_n: +  //	B .tmpN +  //	11 NOPs (44 bytes) +  //	.tmpN +  //	ADDIU T9, T9, 44 +  // +  // With the following runtime patch: +  // +  // xray_sled_n (32-bit): +  //    addiu sp, sp, -8                        ;create stack frame +  //    nop +  //    sw ra, 4(sp)                            ;save return address +  //    sw t9, 0(sp)                            ;save register t9 +  //    lui t9, %hi(__xray_FunctionEntry/Exit) +  //    ori t9, t9, %lo(__xray_FunctionEntry/Exit) +  //    lui t0, %hi(function_id) +  //    
jalr t9                                 ;call Tracing hook
+  //    ori t0, t0, %lo(function_id)            ;pass function id (delay slot)
+  //    lw t9, 0(sp)                            ;restore register t9
+  //    lw ra, 4(sp)                            ;restore return address
+  //    addiu sp, sp, 8                         ;delete stack frame
+  //
+  // We add 44 bytes to t9 because we want to adjust the function pointer to
+  // the actual start of the function, i.e. the address just after the noop
+  // sled. We do this because the gp displacement relocation is emitted at the
+  // start of the function, i.e. after the nop sled, and to correctly calculate
+  // the global offset table address, t9 must hold the address of the
+  // instruction containing the gp displacement relocation.
+  // FIXME: Is this correct for the static relocation model?
+  //
+  // Replacement of the first 4-byte instruction should be the last and atomic
+  // operation, so that the user code which reaches the sled concurrently
+  // either jumps over the whole sled, or executes the whole sled when the
+  // latter is ready.
+  //
+  // When |Enable|==false, we set back the first instruction in the sled to be
+  //   B #44
+
+  uint32_t *Address = reinterpret_cast<uint32_t *>(Sled.address());
+  if (Enable) {
+    uint32_t LoTracingHookAddr =
+        reinterpret_cast<int32_t>(TracingHook) & 0xffff;
+    uint32_t HiTracingHookAddr =
+        (reinterpret_cast<int32_t>(TracingHook) >> 16) & 0xffff;
+    uint32_t LoFunctionID = FuncId & 0xffff;
+    uint32_t HiFunctionID = (FuncId >> 16) & 0xffff;
+    Address[2] = encodeInstruction(PatchOpcodes::PO_SW, RegNum::RN_SP,
+                                   RegNum::RN_RA, 0x4);
+    Address[3] = encodeInstruction(PatchOpcodes::PO_SW, RegNum::RN_SP,
+                                   RegNum::RN_T9, 0x0);
+    Address[4] = encodeInstruction(PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T9,
+                                   HiTracingHookAddr);
+    Address[5] = encodeInstruction(PatchOpcodes::PO_ORI, RegNum::RN_T9,
+                                   RegNum::RN_T9, LoTracingHookAddr);
+    Address[6] = encodeInstruction(PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T0,
+                                   HiFunctionID);
+    Address[7] = encodeSpecialInstruction(PatchOpcodes::PO_JALR, RegNum::RN_T9,
+                                          0x0, RegNum::RN_RA, 0x0);
+    Address[8] = encodeInstruction(PatchOpcodes::PO_ORI, RegNum::RN_T0,
+                                   RegNum::RN_T0, LoFunctionID);
+    Address[9] = encodeInstruction(PatchOpcodes::PO_LW, RegNum::RN_SP,
+                                   RegNum::RN_T9, 0x0);
+    Address[10] = encodeInstruction(PatchOpcodes::PO_LW, RegNum::RN_SP,
+                                    RegNum::RN_RA, 0x4);
+    Address[11] = encodeInstruction(PatchOpcodes::PO_ADDIU, RegNum::RN_SP,
+                                    RegNum::RN_SP, 0x8);
+    uint32_t CreateStackSpaceInstr = encodeInstruction(
+        PatchOpcodes::PO_ADDIU, RegNum::RN_SP, RegNum::RN_SP, 0xFFF8);
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(Address),
+        uint32_t(CreateStackSpaceInstr), std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint32_t> *>(Address),
+        uint32_t(PatchOpcodes::PO_B44), std::memory_order_release);
+  }
+  return true;
+}
+
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
+                        const XRaySledEntry
&Sled, +                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, Trampoline); +} + +bool patchFunctionExit(const bool Enable, const uint32_t FuncId, +                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, +                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // FIXME: In the future we'd need to distinguish between non-tail exits and +  // tail exits for better information preservation. +  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, +                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // FIXME: Implement in mips? +  return false; +} + +bool patchTypedEvent(const bool Enable, const uint32_t FuncId, +                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // FIXME: Implement in mips? +  return false; +} + +} // namespace __xray + +extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { +  // FIXME: this will have to be implemented in the trampoline assembly file +} diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_mips64.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_mips64.cpp new file mode 100644 index 000000000000..5b221bb6ddc0 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_mips64.cpp @@ -0,0 +1,178 @@ +//===-- xray_mips64.cpp -----------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of MIPS64-specific routines. +// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_interface_internal.h" +#include <atomic> + +namespace __xray { + +// The machine codes for some instructions used in runtime patching. 
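+// For example, encodeInstruction(PO_SD, RN_SP, RN_RA, 0x8) below evaluates to
+// 0xFFBF0008, i.e. "sd ra, 8(sp)": opcode 0xFC000000 | base 29 << 21 |
+// rt 31 << 16 | offset 8.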
+enum PatchOpcodes : uint32_t {
+  PO_DADDIU = 0x64000000, // daddiu rt, rs, imm
+  PO_SD = 0xFC000000,     // sd rt, offset(base)
+  PO_LUI = 0x3C000000,    // lui rt, imm
+  PO_ORI = 0x34000000,    // ori rt, rs, imm
+  PO_DSLL = 0x00000038,   // dsll rd, rt, sa
+  PO_JALR = 0x00000009,   // jalr rs
+  PO_LD = 0xDC000000,     // ld rt, offset(base)
+  PO_B60 = 0x1000000f,    // b #60
+  PO_NOP = 0x0,           // nop
+};
+
+enum RegNum : uint32_t {
+  RN_T0 = 0xC,
+  RN_T9 = 0x19,
+  RN_RA = 0x1F,
+  RN_SP = 0x1D,
+};
+
+inline static uint32_t encodeInstruction(uint32_t Opcode, uint32_t Rs,
+                                         uint32_t Rt,
+                                         uint32_t Imm) XRAY_NEVER_INSTRUMENT {
+  return (Opcode | Rs << 21 | Rt << 16 | Imm);
+}
+
+inline static uint32_t
+encodeSpecialInstruction(uint32_t Opcode, uint32_t Rs, uint32_t Rt, uint32_t Rd,
+                         uint32_t Imm) XRAY_NEVER_INSTRUMENT {
+  return (Rs << 21 | Rt << 16 | Rd << 11 | Imm << 6 | Opcode);
+}
+
+inline static bool patchSled(const bool Enable, const uint32_t FuncId,
+                             const XRaySledEntry &Sled,
+                             void (*TracingHook)()) XRAY_NEVER_INSTRUMENT {
+  // When |Enable| == true,
+  // We replace the following compile-time stub (sled):
+  //
+  // xray_sled_n:
+  //	B .tmpN
+  //	15 NOPs (60 bytes)
+  //	.tmpN
+  //
+  // With the following runtime patch:
+  //
+  // xray_sled_n (64-bit):
+  //    daddiu sp, sp, -16                      ;create stack frame
+  //    nop
+  //    sd ra, 8(sp)                            ;save return address
+  //    sd t9, 0(sp)                            ;save register t9
+  //    lui t9, %highest(__xray_FunctionEntry/Exit)
+  //    ori t9, t9, %higher(__xray_FunctionEntry/Exit)
+  //    dsll t9, t9, 16
+  //    ori t9, t9, %hi(__xray_FunctionEntry/Exit)
+  //    dsll t9, t9, 16
+  //    ori t9, t9, %lo(__xray_FunctionEntry/Exit)
+  //    lui t0, %hi(function_id)
+  //    jalr t9                                 ;call Tracing hook
+  //    ori t0, t0, %lo(function_id)            ;pass function id (delay slot)
+  //    ld t9, 0(sp)                            ;restore register t9
+  //    ld ra, 8(sp)                            ;restore return address
+  //    daddiu sp, sp, 16                       ;delete stack frame
+  //
+  // Replacement of the first 4-byte instruction should be the last and atomic
+  // operation, so that the user code which reaches the sled concurrently
+  // either jumps over the whole sled, or executes the whole sled when the
+  // latter is ready.
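+  //
+  // For example, with the tracing hook at 0x0000'1234'5678'9abc, the four
+  // 16-bit pieces are highest = 0x0000, higher = 0x1234, hi = 0x5678 and
+  // lo = 0x9abc; the lui/ori/dsll chain above reassembles them in t9.
+  // (The address is illustrative.)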
+  // +  // When |Enable|==false, we set back the first instruction in the sled to be +  //   B #60 + +  uint32_t *Address = reinterpret_cast<uint32_t *>(Sled.address()); +  if (Enable) { +    uint32_t LoTracingHookAddr = +        reinterpret_cast<int64_t>(TracingHook) & 0xffff; +    uint32_t HiTracingHookAddr = +        (reinterpret_cast<int64_t>(TracingHook) >> 16) & 0xffff; +    uint32_t HigherTracingHookAddr = +        (reinterpret_cast<int64_t>(TracingHook) >> 32) & 0xffff; +    uint32_t HighestTracingHookAddr = +        (reinterpret_cast<int64_t>(TracingHook) >> 48) & 0xffff; +    uint32_t LoFunctionID = FuncId & 0xffff; +    uint32_t HiFunctionID = (FuncId >> 16) & 0xffff; +    Address[2] = encodeInstruction(PatchOpcodes::PO_SD, RegNum::RN_SP, +                                   RegNum::RN_RA, 0x8); +    Address[3] = encodeInstruction(PatchOpcodes::PO_SD, RegNum::RN_SP, +                                   RegNum::RN_T9, 0x0); +    Address[4] = encodeInstruction(PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T9, +                                   HighestTracingHookAddr); +    Address[5] = encodeInstruction(PatchOpcodes::PO_ORI, RegNum::RN_T9, +                                   RegNum::RN_T9, HigherTracingHookAddr); +    Address[6] = encodeSpecialInstruction(PatchOpcodes::PO_DSLL, 0x0, +                                          RegNum::RN_T9, RegNum::RN_T9, 0x10); +    Address[7] = encodeInstruction(PatchOpcodes::PO_ORI, RegNum::RN_T9, +                                   RegNum::RN_T9, HiTracingHookAddr); +    Address[8] = encodeSpecialInstruction(PatchOpcodes::PO_DSLL, 0x0, +                                          RegNum::RN_T9, RegNum::RN_T9, 0x10); +    Address[9] = encodeInstruction(PatchOpcodes::PO_ORI, RegNum::RN_T9, +                                   RegNum::RN_T9, LoTracingHookAddr); +    Address[10] = encodeInstruction(PatchOpcodes::PO_LUI, 0x0, RegNum::RN_T0, +                                    HiFunctionID); +    Address[11] = encodeSpecialInstruction(PatchOpcodes::PO_JALR, RegNum::RN_T9, +                                           0x0, RegNum::RN_RA, 0X0); +    Address[12] = encodeInstruction(PatchOpcodes::PO_ORI, RegNum::RN_T0, +                                    RegNum::RN_T0, LoFunctionID); +    Address[13] = encodeInstruction(PatchOpcodes::PO_LD, RegNum::RN_SP, +                                    RegNum::RN_T9, 0x0); +    Address[14] = encodeInstruction(PatchOpcodes::PO_LD, RegNum::RN_SP, +                                    RegNum::RN_RA, 0x8); +    Address[15] = encodeInstruction(PatchOpcodes::PO_DADDIU, RegNum::RN_SP, +                                    RegNum::RN_SP, 0x10); +    uint32_t CreateStackSpace = encodeInstruction( +        PatchOpcodes::PO_DADDIU, RegNum::RN_SP, RegNum::RN_SP, 0xfff0); +    std::atomic_store_explicit( +        reinterpret_cast<std::atomic<uint32_t> *>(Address), CreateStackSpace, +        std::memory_order_release); +  } else { +    std::atomic_store_explicit( +        reinterpret_cast<std::atomic<uint32_t> *>(Address), +        uint32_t(PatchOpcodes::PO_B60), std::memory_order_release); +  } +  return true; +} + +bool patchFunctionEntry(const bool Enable, const uint32_t FuncId, +                        const XRaySledEntry &Sled, +                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, Trampoline); +} + +bool patchFunctionExit(const bool Enable, const uint32_t FuncId, +                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  return patchSled(Enable, FuncId, Sled, 
__xray_FunctionExit); +} + +bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, +                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // FIXME: In the future we'd need to distinguish between non-tail exits and +  // tail exits for better information preservation. +  return patchSled(Enable, FuncId, Sled, __xray_FunctionExit); +} + +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, +                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // FIXME: Implement in mips64? +  return false; +} + +bool patchTypedEvent(const bool Enable, const uint32_t FuncId, +                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // FIXME: Implement in mips64? +  return false; +} +} // namespace __xray + +extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { +  // FIXME: this will have to be implemented in the trampoline assembly file +} diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_never_instrument.txt b/contrib/llvm-project/compiler-rt/lib/xray/xray_never_instrument.txt new file mode 100644 index 000000000000..7fa48dda7e16 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_never_instrument.txt @@ -0,0 +1,6 @@ +# List of function matchers common to C/C++ applications that make sense to +# never instrument. You can use this as an argument to +# -fxray-never-instrument=<path> along with your project-specific lists. + +# Never instrument any function whose symbol starts with __xray. +fun:__xray* diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_powerpc64.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_powerpc64.cpp new file mode 100644 index 000000000000..c3553d848313 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_powerpc64.cpp @@ -0,0 +1,113 @@ +//===-- xray_powerpc64.cpp --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Implementation of powerpc64 and powerpc64le routines. +// +//===----------------------------------------------------------------------===// +#include "sanitizer_common/sanitizer_common.h" +#include "xray_defs.h" +#include "xray_interface_internal.h" +#include "xray_utils.h" +#include <atomic> +#include <cassert> +#include <cstring> + +#ifndef __LITTLE_ENDIAN__ +#error powerpc64 big endian is not supported for now. 
+#endif + +namespace { + +constexpr unsigned long long JumpOverInstNum = 7; + +void clearCache(void *Addr, size_t Len) { +  const size_t LineSize = 32; + +  const intptr_t Mask = ~(LineSize - 1); +  const intptr_t StartLine = ((intptr_t)Addr) & Mask; +  const intptr_t EndLine = ((intptr_t)Addr + Len + LineSize - 1) & Mask; + +  for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize) +    asm volatile("dcbf 0, %0" : : "r"(Line)); +  asm volatile("sync"); + +  for (intptr_t Line = StartLine; Line < EndLine; Line += LineSize) +    asm volatile("icbi 0, %0" : : "r"(Line)); +  asm volatile("isync"); +} + +} // namespace + +extern "C" void __clear_cache(void *start, void *end); + +namespace __xray { + +bool patchFunctionEntry(const bool Enable, uint32_t FuncId, +                        const XRaySledEntry &Sled, +                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT { +  const uint64_t Address = Sled.address(); +  if (Enable) { +    // lis 0, FuncId[16..32] +    // li 0, FuncId[0..15] +    *reinterpret_cast<uint64_t *>(Address) = +        (0x3c000000ull + (FuncId >> 16)) + +        ((0x60000000ull + (FuncId & 0xffff)) << 32); +  } else { +    // b +JumpOverInstNum instructions. +    *reinterpret_cast<uint32_t *>(Address) = +        0x48000000ull + (JumpOverInstNum << 2); +  } +  clearCache(reinterpret_cast<void *>(Address), 8); +  return true; +} + +bool patchFunctionExit(const bool Enable, uint32_t FuncId, +                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  const uint64_t Address = Sled.address(); +  if (Enable) { +    // lis 0, FuncId[16..32] +    // li 0, FuncId[0..15] +    *reinterpret_cast<uint64_t *>(Address) = +        (0x3c000000ull + (FuncId >> 16)) + +        ((0x60000000ull + (FuncId & 0xffff)) << 32); +  } else { +    // Copy the blr/b instruction after JumpOverInstNum instructions. +    *reinterpret_cast<uint32_t *>(Address) = +        *(reinterpret_cast<uint32_t *>(Address) + JumpOverInstNum); +  } +  clearCache(reinterpret_cast<void *>(Address), 8); +  return true; +} + +bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId, +                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  return patchFunctionExit(Enable, FuncId, Sled); +} + +// FIXME: Maybe implement this better? +bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } + +bool patchCustomEvent(const bool Enable, const uint32_t FuncId, +                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // FIXME: Implement in powerpc64? +  return false; +} + +bool patchTypedEvent(const bool Enable, const uint32_t FuncId, +                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT { +  // FIXME: Implement in powerpc64? +  return false; +} + +} // namespace __xray + +extern "C" void __xray_ArgLoggerEntry() XRAY_NEVER_INSTRUMENT { +  // FIXME: this will have to be implemented in the trampoline assembly file +} diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_powerpc64.inc b/contrib/llvm-project/compiler-rt/lib/xray/xray_powerpc64.inc new file mode 100644 index 000000000000..7e872b5b42e6 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_powerpc64.inc @@ -0,0 +1,51 @@ +//===-- xray_powerpc64.inc --------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +//===----------------------------------------------------------------------===// + +#include <cstdint> +#include <mutex> +#ifdef __linux__ +#include <sys/platform/ppc.h> +#elif defined(__FreeBSD__) +#include <sys/types.h> +#include <sys/sysctl.h> + +#define __ppc_get_timebase __builtin_ppc_get_timebase + +uint64_t __ppc_get_timebase_freq (void) +{ +  uint64_t tb_freq = 0; +  size_t length = sizeof(tb_freq); +  sysctlbyname("kern.timecounter.tc.timebase.frequency", &tb_freq, &length, nullptr, 0); +  return tb_freq; +} +#endif + +#include "xray_defs.h" + +namespace __xray { + +ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT { +  CPU = 0; +  return __ppc_get_timebase(); +} + +inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { +  static std::mutex M; +  std::lock_guard<std::mutex> Guard(M); +  return __ppc_get_timebase_freq(); +} + +inline bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { +  return true; +} + +} // namespace __xray diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_profile_collector.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_profile_collector.cpp new file mode 100644 index 000000000000..3a28240e603c --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_profile_collector.cpp @@ -0,0 +1,411 @@ +//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This implements the interface for the profileCollectorService. +// +//===----------------------------------------------------------------------===// +#include "xray_profile_collector.h" +#include "sanitizer_common/sanitizer_common.h" +#include "xray_allocator.h" +#include "xray_defs.h" +#include "xray_profiling_flags.h" +#include "xray_segmented_array.h" +#include <memory> +#include <pthread.h> +#include <utility> + +namespace __xray { +namespace profileCollectorService { + +namespace { + +SpinMutex GlobalMutex; +struct ThreadTrie { +  tid_t TId; +  alignas(FunctionCallTrie) std::byte TrieStorage[sizeof(FunctionCallTrie)]; +}; + +struct ProfileBuffer { +  void *Data; +  size_t Size; +}; + +// Current version of the profile format. +constexpr u64 XRayProfilingVersion = 0x20180424; + +// Identifier for XRay profiling files 'xrayprof' in hex. +constexpr u64 XRayMagicBytes = 0x7872617970726f66; + +struct XRayProfilingFileHeader { +  const u64 MagicBytes = XRayMagicBytes; +  const u64 Version = XRayProfilingVersion; +  u64 Timestamp = 0; // System time in nanoseconds. +  u64 PID = 0;       // Process ID. 
+}; + +struct BlockHeader { +  u32 BlockSize; +  u32 BlockNum; +  u64 ThreadId; +}; + +struct ThreadData { +  BufferQueue *BQ; +  FunctionCallTrie::Allocators::Buffers Buffers; +  FunctionCallTrie::Allocators Allocators; +  FunctionCallTrie FCT; +  tid_t TId; +}; + +using ThreadDataArray = Array<ThreadData>; +using ThreadDataAllocator = ThreadDataArray::AllocatorType; + +// We use a separate buffer queue for the backing store for the allocator used +// by the ThreadData array. This lets us host the buffers, allocators, and tries +// associated with a thread by moving the data into the array instead of +// attempting to copy the data to a separately backed set of tries. +alignas(BufferQueue) static std::byte BufferQueueStorage[sizeof(BufferQueue)]; +static BufferQueue *BQ = nullptr; +static BufferQueue::Buffer Buffer; +alignas(ThreadDataAllocator) static std::byte +    ThreadDataAllocatorStorage[sizeof(ThreadDataAllocator)]; +alignas(ThreadDataArray) static std::byte +    ThreadDataArrayStorage[sizeof(ThreadDataArray)]; + +static ThreadDataAllocator *TDAllocator = nullptr; +static ThreadDataArray *TDArray = nullptr; + +using ProfileBufferArray = Array<ProfileBuffer>; +using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType; + +// These need to be global aligned storage to avoid dynamic initialization. We +// need these to be aligned to allow us to placement new objects into the +// storage, and have pointers to those objects be appropriately aligned. +alignas(ProfileBufferArray) static std::byte +    ProfileBuffersStorage[sizeof(ProfileBufferArray)]; +alignas(ProfileBufferArrayAllocator) static std::byte +    ProfileBufferArrayAllocatorStorage[sizeof(ProfileBufferArrayAllocator)]; + +static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr; +static ProfileBufferArray *ProfileBuffers = nullptr; + +// Use a global flag to determine whether the collector implementation has been +// initialized. +static atomic_uint8_t CollectorInitialized{0}; + +} // namespace + +void post(BufferQueue *Q, FunctionCallTrie &&T, +          FunctionCallTrie::Allocators &&A, +          FunctionCallTrie::Allocators::Buffers &&B, +          tid_t TId) XRAY_NEVER_INSTRUMENT { +  DCHECK_NE(Q, nullptr); + +  // Bail out early if the collector has not been initialized. +  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) { +    T.~FunctionCallTrie(); +    A.~Allocators(); +    Q->releaseBuffer(B.NodeBuffer); +    Q->releaseBuffer(B.RootsBuffer); +    Q->releaseBuffer(B.ShadowStackBuffer); +    Q->releaseBuffer(B.NodeIdPairBuffer); +    B.~Buffers(); +    return; +  } + +  { +    SpinMutexLock Lock(&GlobalMutex); +    DCHECK_NE(TDAllocator, nullptr); +    DCHECK_NE(TDArray, nullptr); + +    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T), +                               TId) == nullptr) { +      // If we fail to add the data to the array, we should destroy the objects +      // handed us. +      T.~FunctionCallTrie(); +      A.~Allocators(); +      Q->releaseBuffer(B.NodeBuffer); +      Q->releaseBuffer(B.RootsBuffer); +      Q->releaseBuffer(B.ShadowStackBuffer); +      Q->releaseBuffer(B.NodeIdPairBuffer); +      B.~Buffers(); +    } +  } +} + +// A PathArray represents the function id's representing a stack trace. In this +// context a path is almost always represented from the leaf function in a call +// stack to a root of the call trie. 
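+//
+// For example (illustrative ids only): a call chain f1() -> f2() -> f3()
+// with function ids 1, 2 and 3 yields, for the node of f3, the path
+// [3, 2, 1], since populateRecords() walks Parent links from the leaf up.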
+using PathArray = Array<int32_t>; + +struct ProfileRecord { +  using PathAllocator = typename PathArray::AllocatorType; + +  // The Path in this record is the function id's from the leaf to the root of +  // the function call stack as represented from a FunctionCallTrie. +  PathArray Path; +  const FunctionCallTrie::Node *Node; +}; + +namespace { + +using ProfileRecordArray = Array<ProfileRecord>; + +// Walk a depth-first traversal of each root of the FunctionCallTrie to generate +// the path(s) and the data associated with the path. +static void +populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA, +                const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT { +  using StackArray = Array<const FunctionCallTrie::Node *>; +  using StackAllocator = typename StackArray::AllocatorType; +  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max); +  StackArray DFSStack(StackAlloc); +  for (const auto *R : Trie.getRoots()) { +    DFSStack.Append(R); +    while (!DFSStack.empty()) { +      auto *Node = DFSStack.back(); +      DFSStack.trim(1); +      if (Node == nullptr) +        continue; +      auto Record = PRs.AppendEmplace(PathArray{PA}, Node); +      if (Record == nullptr) +        return; +      DCHECK_NE(Record, nullptr); + +      // Traverse the Node's parents and as we're doing so, get the FIds in +      // the order they appear. +      for (auto N = Node; N != nullptr; N = N->Parent) +        Record->Path.Append(N->FId); +      DCHECK(!Record->Path.empty()); + +      for (const auto C : Node->Callees) +        DFSStack.Append(C.NodePtr); +    } +  } +} + +static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header, +                             const ProfileRecordArray &ProfileRecords) +    XRAY_NEVER_INSTRUMENT { +  auto NextPtr = static_cast<uint8_t *>( +                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) + +                 sizeof(Header); +  for (const auto &Record : ProfileRecords) { +    // List of IDs follow: +    for (const auto FId : Record.Path) +      NextPtr = +          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) + +          sizeof(FId); + +    // Add the sentinel here. +    constexpr int32_t SentinelFId = 0; +    NextPtr = static_cast<uint8_t *>( +                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) + +              sizeof(SentinelFId); + +    // Add the node data here. +    NextPtr = +        static_cast<uint8_t *>(internal_memcpy( +            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) + +        sizeof(Record.Node->CallCount); +    NextPtr = static_cast<uint8_t *>( +                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime, +                                  sizeof(Record.Node->CumulativeLocalTime))) + +              sizeof(Record.Node->CumulativeLocalTime); +  } + +  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size); +} + +} // namespace + +void serialize() XRAY_NEVER_INSTRUMENT { +  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) +    return; + +  SpinMutexLock Lock(&GlobalMutex); + +  // Clear out the global ProfileBuffers, if it's not empty. +  for (auto &B : *ProfileBuffers) +    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size); +  ProfileBuffers->trim(ProfileBuffers->size()); + +  DCHECK_NE(TDArray, nullptr); +  if (TDArray->empty()) +    return; + +  // Then repopulate the global ProfileBuffers. 
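+  // Per the size accounting below, a record whose path holds three function
+  // ids occupies 3 * 4 + 4 (sentinel) + 8 (call count) + 8 (local time) = 32
+  // bytes, so a thread with one such record serializes into a block of
+  // sizeof(BlockHeader) + 32 = 48 bytes.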
+  u32 I = 0;
+  auto MaxSize = profilingFlags()->global_allocator_max;
+  auto ProfileArena = allocateBuffer(MaxSize);
+  if (ProfileArena == nullptr)
+    return;
+
+  auto ProfileArenaCleanup = at_scope_exit(
+      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });
+
+  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
+  if (PathArena == nullptr)
+    return;
+
+  auto PathArenaCleanup = at_scope_exit(
+      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });
+
+  for (const auto &ThreadTrie : *TDArray) {
+    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
+    ProfileRecordAllocator PRAlloc(ProfileArena,
+                                   profilingFlags()->global_allocator_max);
+    ProfileRecord::PathAllocator PathAlloc(
+        PathArena, profilingFlags()->global_allocator_max);
+    ProfileRecordArray ProfileRecords(PRAlloc);
+
+    // First, we want to compute the amount of space we're going to need. We'll
+    // use a local allocator and an __xray::Array<...> to store the intermediary
+    // data, then compute the size as we're going along. Then we'll allocate the
+    // contiguous space to contain the thread buffer data.
+    if (ThreadTrie.FCT.getRoots().empty())
+      continue;
+
+    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
+    DCHECK(!ThreadTrie.FCT.getRoots().empty());
+    DCHECK(!ProfileRecords.empty());
+
+    // Go through each record, to compute the sizes.
+    //
+    // header size = block size (4 bytes)
+    //   + block number (4 bytes)
+    //   + thread id (8 bytes)
+    // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
+    //   + call count (8 bytes)
+    //   + local time (8 bytes)
+    //   (= 20 bytes + 4 bytes per path id; no end-of-record marker is written)
+    u32 CumulativeSizes = 0;
+    for (const auto &Record : ProfileRecords)
+      CumulativeSizes += 20 + (4 * Record.Path.size());
+
+    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
+    auto B = ProfileBuffers->Append({});
+    B->Size = sizeof(Header) + CumulativeSizes;
+    B->Data = allocateBuffer(B->Size);
+    DCHECK_NE(B->Data, nullptr);
+    serializeRecords(B, Header, ProfileRecords);
+  }
+}
+
+void reset() XRAY_NEVER_INSTRUMENT {
+  atomic_store(&CollectorInitialized, 0, memory_order_release);
+  SpinMutexLock Lock(&GlobalMutex);
+
+  if (ProfileBuffers != nullptr) {
+    // Clear out the profile buffers that have been serialized.
+    for (auto &B : *ProfileBuffers)
+      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
+    ProfileBuffers->trim(ProfileBuffers->size());
+    ProfileBuffers = nullptr;
+  }
+
+  if (TDArray != nullptr) {
+    // Release the resources as required.
+    for (auto &TD : *TDArray) {
+      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
+      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
+      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
+      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
+    }
+    // We don't bother destroying the array here because we've already
+    // potentially freed the backing store for the array. Instead we're going to
+    // reset the pointer to nullptr, and re-use the storage later instead
+    // (placement-new'ing into the storage as-is).
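+    // The storage idiom used throughout this file is a static aligned byte
+    // array that objects are placement-new'ed into on demand, roughly:
+    //
+    //   alignas(T) static std::byte Storage[sizeof(T)];
+    //   T *Ptr = new (&Storage) T(...);
+    //
+    // so re-initialization needs no dynamic allocation at all.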
+    TDArray = nullptr; +  } + +  if (TDAllocator != nullptr) { +    TDAllocator->~Allocator(); +    TDAllocator = nullptr; +  } + +  if (Buffer.Data != nullptr) { +    BQ->releaseBuffer(Buffer); +  } + +  if (BQ == nullptr) { +    bool Success = false; +    new (&BufferQueueStorage) +        BufferQueue(profilingFlags()->global_allocator_max, 1, Success); +    if (!Success) +      return; +    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage); +  } else { +    BQ->finalize(); + +    if (BQ->init(profilingFlags()->global_allocator_max, 1) != +        BufferQueue::ErrorCode::Ok) +      return; +  } + +  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok) +    return; + +  new (&ProfileBufferArrayAllocatorStorage) +      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max); +  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>( +      &ProfileBufferArrayAllocatorStorage); + +  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator); +  ProfileBuffers = +      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage); + +  new (&ThreadDataAllocatorStorage) +      ThreadDataAllocator(Buffer.Data, Buffer.Size); +  TDAllocator = +      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage); +  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator); +  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage); + +  atomic_store(&CollectorInitialized, 1, memory_order_release); +} + +XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT { +  SpinMutexLock Lock(&GlobalMutex); + +  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0) +    return {nullptr, 0}; + +  static pthread_once_t Once = PTHREAD_ONCE_INIT; +  alignas(XRayProfilingFileHeader) static std::byte +      FileHeaderStorage[sizeof(XRayProfilingFileHeader)]; +  pthread_once( +      &Once, +[]() XRAY_NEVER_INSTRUMENT { +        new (&FileHeaderStorage) XRayProfilingFileHeader{}; +      }); + +  if (UNLIKELY(B.Data == nullptr)) { +    // The first buffer should always contain the file header information. +    auto &FileHeader = +        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage); +    FileHeader.Timestamp = NanoTime(); +    FileHeader.PID = internal_getpid(); +    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)}; +  } + +  if (UNLIKELY(B.Data == &FileHeaderStorage)) +    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size}; + +  BlockHeader Header; +  internal_memcpy(&Header, B.Data, sizeof(BlockHeader)); +  auto NextBlock = Header.BlockNum + 1; +  if (NextBlock < ProfileBuffers->size()) +    return {(*ProfileBuffers)[NextBlock].Data, +            (*ProfileBuffers)[NextBlock].Size}; +  return {nullptr, 0}; +} + +} // namespace profileCollectorService +} // namespace __xray diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_profile_collector.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_profile_collector.h new file mode 100644 index 000000000000..6e0f252714ba --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_profile_collector.h @@ -0,0 +1,73 @@ +//===-- xray_profile_collector.h -------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This file defines the interface for a data collection service, for XRay
+// profiling. What we implement here is an in-process service where
+// FunctionCallTrie instances can be handed off by threads, to be
+// consolidated/collected.
+//
+//===----------------------------------------------------------------------===//
+#ifndef XRAY_XRAY_PROFILE_COLLECTOR_H
+#define XRAY_XRAY_PROFILE_COLLECTOR_H
+
+#include "xray_function_call_trie.h"
+
+#include "xray/xray_log_interface.h"
+
+namespace __xray {
+
+/// The ProfileCollectorService implements a centralised mechanism for
+/// collecting FunctionCallTrie instances, indexed by thread ID. On demand, the
+/// ProfileCollectorService can be queried for the most recent state of the
+/// data, in a form that allows traversal.
+namespace profileCollectorService {
+
+/// Posts the FunctionCallTrie associated with a specific Thread ID. This
+/// will:
+///
+/// - Move the collection of FunctionCallTrie, Allocators, and Buffers
+///   associated with a thread's data into the queue, taking ownership of
+///   the memory associated with that thread and managing it exclusively.
+///
+void post(BufferQueue *Q, FunctionCallTrie &&T,
+          FunctionCallTrie::Allocators &&A,
+          FunctionCallTrie::Allocators::Buffers &&B, tid_t TId);
+
+/// The serialize function will process all FunctionCallTrie instances in
+/// memory, and turn those into specifically formatted blocks, each describing
+/// the function call trie's contents in a compact form. In memory, this looks
+/// like the following layout:
+///
+///   - block size (32 bits)
+///   - block number (32 bits)
+///   - thread id (64 bits)
+///   - list of records:
+///     - function ids in leaf to root order, terminated by
+///       0 (32 bits per function id)
+///     - call count (64 bit)
+///     - cumulative local time (64 bit)
+///     - (records are packed back to back, with no extra delimiter)
+///
+void serialize();
+
+/// The reset function will clear out any internal memory held by the
+/// service. The intent is for the resetting to be done in calls to the
+/// initialization routine, or explicitly through the flush log API.
+void reset();
+
+/// The nextBuffer function implements the iterator functionality provided
+/// in the XRay API.
+XRayBuffer nextBuffer(XRayBuffer B);
+
+} // namespace profileCollectorService
+
+} // namespace __xray
+
+#endif // XRAY_XRAY_PROFILE_COLLECTOR_H
diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_profiling.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_profiling.cpp
new file mode 100644
index 000000000000..e9ac2fdd8aad
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_profiling.cpp
@@ -0,0 +1,516 @@
+//===-- xray_profiling.cpp --------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This is the implementation of a profiling handler.
+// +//===----------------------------------------------------------------------===// +#include <memory> +#include <time.h> + +#include "sanitizer_common/sanitizer_atomic.h" +#include "sanitizer_common/sanitizer_flags.h" +#include "xray/xray_interface.h" +#include "xray/xray_log_interface.h" +#include "xray_buffer_queue.h" +#include "xray_flags.h" +#include "xray_profile_collector.h" +#include "xray_profiling_flags.h" +#include "xray_recursion_guard.h" +#include "xray_tsc.h" +#include "xray_utils.h" +#include <pthread.h> + +namespace __xray { + +namespace { + +static atomic_sint32_t ProfilerLogFlushStatus = { +    XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING}; + +static atomic_sint32_t ProfilerLogStatus = { +    XRayLogInitStatus::XRAY_LOG_UNINITIALIZED}; + +static SpinMutex ProfilerOptionsMutex; + +struct ProfilingData { +  atomic_uintptr_t Allocators; +  atomic_uintptr_t FCT; +}; + +static pthread_key_t ProfilingKey; + +// We use a global buffer queue, which gets initialized once at initialisation +// time, and gets reset when profiling is "done". +alignas(BufferQueue) static std::byte BufferQueueStorage[sizeof(BufferQueue)]; +static BufferQueue *BQ = nullptr; + +thread_local FunctionCallTrie::Allocators::Buffers ThreadBuffers; +alignas(FunctionCallTrie::Allocators) thread_local std::byte +    AllocatorsStorage[sizeof(FunctionCallTrie::Allocators)]; +alignas(FunctionCallTrie) thread_local std::byte +    FunctionCallTrieStorage[sizeof(FunctionCallTrie)]; +thread_local ProfilingData TLD{{0}, {0}}; +thread_local atomic_uint8_t ReentranceGuard{0}; + +// We use a separate guard for ensuring that for this thread, if we're already +// cleaning up, that any signal handlers don't attempt to cleanup nor +// initialise. +thread_local atomic_uint8_t TLDInitGuard{0}; + +// We also use a separate latch to signal that the thread is exiting, and +// non-essential work should be ignored (things like recording events, etc.). +thread_local atomic_uint8_t ThreadExitingLatch{0}; + +static ProfilingData *getThreadLocalData() XRAY_NEVER_INSTRUMENT { +  thread_local auto ThreadOnce = []() XRAY_NEVER_INSTRUMENT { +    pthread_setspecific(ProfilingKey, &TLD); +    return false; +  }(); +  (void)ThreadOnce; + +  RecursionGuard TLDInit(TLDInitGuard); +  if (!TLDInit) +    return nullptr; + +  if (atomic_load_relaxed(&ThreadExitingLatch)) +    return nullptr; + +  uptr Allocators = 0; +  if (atomic_compare_exchange_strong(&TLD.Allocators, &Allocators, 1, +                                     memory_order_acq_rel)) { +    bool Success = false; +    auto AllocatorsUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { +      if (!Success) +        atomic_store(&TLD.Allocators, 0, memory_order_release); +    }); + +    // Acquire a set of buffers for this thread. 
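+    // Each acquisition below is paired with an at_scope_exit undo that only
+    // fires while Success is still false, so any failed getBuffer() call
+    // releases whatever was acquired before it. The recurring shape is
+    // roughly:
+    //
+    //   if (BQ->getBuffer(Buf) != BufferQueue::ErrorCode::Ok)
+    //     return nullptr;
+    //   auto Undo = at_scope_exit([&] { if (!Success) BQ->releaseBuffer(Buf); });
+    //
+    // with Success flipped to true only once all four buffers are held.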
+    if (BQ == nullptr) +      return nullptr; + +    if (BQ->getBuffer(ThreadBuffers.NodeBuffer) != BufferQueue::ErrorCode::Ok) +      return nullptr; +    auto NodeBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { +      if (!Success) +        BQ->releaseBuffer(ThreadBuffers.NodeBuffer); +    }); + +    if (BQ->getBuffer(ThreadBuffers.RootsBuffer) != BufferQueue::ErrorCode::Ok) +      return nullptr; +    auto RootsBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { +      if (!Success) +        BQ->releaseBuffer(ThreadBuffers.RootsBuffer); +    }); + +    if (BQ->getBuffer(ThreadBuffers.ShadowStackBuffer) != +        BufferQueue::ErrorCode::Ok) +      return nullptr; +    auto ShadowStackBufferUndo = at_scope_exit([&]() XRAY_NEVER_INSTRUMENT { +      if (!Success) +        BQ->releaseBuffer(ThreadBuffers.ShadowStackBuffer); +    }); + +    if (BQ->getBuffer(ThreadBuffers.NodeIdPairBuffer) != +        BufferQueue::ErrorCode::Ok) +      return nullptr; + +    Success = true; +    new (&AllocatorsStorage) FunctionCallTrie::Allocators( +        FunctionCallTrie::InitAllocatorsFromBuffers(ThreadBuffers)); +    Allocators = reinterpret_cast<uptr>( +        reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage)); +    atomic_store(&TLD.Allocators, Allocators, memory_order_release); +  } + +  if (Allocators == 1) +    return nullptr; + +  uptr FCT = 0; +  if (atomic_compare_exchange_strong(&TLD.FCT, &FCT, 1, memory_order_acq_rel)) { +    new (&FunctionCallTrieStorage) +        FunctionCallTrie(*reinterpret_cast<FunctionCallTrie::Allocators *>( +            atomic_load_relaxed(&TLD.Allocators))); +    FCT = reinterpret_cast<uptr>( +        reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage)); +    atomic_store(&TLD.FCT, FCT, memory_order_release); +  } + +  if (FCT == 1) +    return nullptr; + +  return &TLD; +} + +static void cleanupTLD() XRAY_NEVER_INSTRUMENT { +  auto FCT = atomic_exchange(&TLD.FCT, 0, memory_order_acq_rel); +  if (FCT == reinterpret_cast<uptr>(reinterpret_cast<FunctionCallTrie *>( +                 &FunctionCallTrieStorage))) +    reinterpret_cast<FunctionCallTrie *>(FCT)->~FunctionCallTrie(); + +  auto Allocators = atomic_exchange(&TLD.Allocators, 0, memory_order_acq_rel); +  if (Allocators == +      reinterpret_cast<uptr>( +          reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage))) +    reinterpret_cast<FunctionCallTrie::Allocators *>(Allocators)->~Allocators(); +} + +static void postCurrentThreadFCT(ProfilingData &T) XRAY_NEVER_INSTRUMENT { +  RecursionGuard TLDInit(TLDInitGuard); +  if (!TLDInit) +    return; + +  uptr P = atomic_exchange(&T.FCT, 0, memory_order_acq_rel); +  if (P != reinterpret_cast<uptr>( +               reinterpret_cast<FunctionCallTrie *>(&FunctionCallTrieStorage))) +    return; + +  auto FCT = reinterpret_cast<FunctionCallTrie *>(P); +  DCHECK_NE(FCT, nullptr); + +  uptr A = atomic_exchange(&T.Allocators, 0, memory_order_acq_rel); +  if (A != +      reinterpret_cast<uptr>( +          reinterpret_cast<FunctionCallTrie::Allocators *>(&AllocatorsStorage))) +    return; + +  auto Allocators = reinterpret_cast<FunctionCallTrie::Allocators *>(A); +  DCHECK_NE(Allocators, nullptr); + +  // Always move the data into the profile collector. +  profileCollectorService::post(BQ, std::move(*FCT), std::move(*Allocators), +                                std::move(ThreadBuffers), GetTid()); + +  // Re-initialize the ThreadBuffers object to a known "default" state. 
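+  // (The buffers were moved into the collector through post() above, so the
+  // reassignment leaves no stale handles behind for later exit paths to
+  // observe.)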
+  ThreadBuffers = FunctionCallTrie::Allocators::Buffers{};
+}
+
+} // namespace
+
+const char *profilingCompilerDefinedFlags() XRAY_NEVER_INSTRUMENT {
+#ifdef XRAY_PROFILER_DEFAULT_OPTIONS
+  return SANITIZER_STRINGIFY(XRAY_PROFILER_DEFAULT_OPTIONS);
+#else
+  return "";
+#endif
+}
+
+XRayLogFlushStatus profilingFlush() XRAY_NEVER_INSTRUMENT {
+  if (atomic_load(&ProfilerLogStatus, memory_order_acquire) !=
+      XRayLogInitStatus::XRAY_LOG_FINALIZED) {
+    if (Verbosity())
+      Report("Not flushing profiles, profiling has not been finalized.\n");
+    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+  }
+
+  RecursionGuard SignalGuard(ReentranceGuard);
+  if (!SignalGuard) {
+    if (Verbosity())
+      Report("Cannot finalize properly inside a signal handler!\n");
+    atomic_store(&ProfilerLogFlushStatus,
+                 XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING,
+                 memory_order_release);
+    return XRayLogFlushStatus::XRAY_LOG_NOT_FLUSHING;
+  }
+
+  s32 Previous = atomic_exchange(&ProfilerLogFlushStatus,
+                                 XRayLogFlushStatus::XRAY_LOG_FLUSHING,
+                                 memory_order_acq_rel);
+  if (Previous == XRayLogFlushStatus::XRAY_LOG_FLUSHING) {
+    if (Verbosity())
+      Report("Not flushing profiles, implementation still flushing.\n");
+    return XRayLogFlushStatus::XRAY_LOG_FLUSHING;
+  }
+
+  // At this point, we'll create the file that will contain the profile, but
+  // only if the options say so.
+  if (!profilingFlags()->no_flush) {
+    // First check whether we have data in the profile collector service
+    // before we try and write anything down.
+    XRayBuffer B = profileCollectorService::nextBuffer({nullptr, 0});
+    if (B.Data == nullptr) {
+      if (Verbosity())
+        Report("profiling: No data to flush.\n");
+    } else {
+      LogWriter *LW = LogWriter::Open();
+      if (LW == nullptr) {
+        if (Verbosity())
+          Report("profiling: Failed to flush to file, dropping data.\n");
+      } else {
+        // Now for each of the buffers, write out the profile data as we would
+        // see it in memory, verbatim.
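+        // The iterator protocol here: seeding nextBuffer() with {nullptr, 0}
+        // yielded the file header above; each subsequent call yields the next
+        // per-thread block, and a {nullptr, 0} result marks the end. A
+        // minimal consumer is just the loop below.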
+        while (B.Data != nullptr && B.Size != 0) { +          LW->WriteAll(reinterpret_cast<const char *>(B.Data), +                       reinterpret_cast<const char *>(B.Data) + B.Size); +          B = profileCollectorService::nextBuffer(B); +        } +        LogWriter::Close(LW); +      } +    } +  } + +  profileCollectorService::reset(); + +  atomic_store(&ProfilerLogFlushStatus, XRayLogFlushStatus::XRAY_LOG_FLUSHED, +               memory_order_release); +  atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, +               memory_order_release); + +  return XRayLogFlushStatus::XRAY_LOG_FLUSHED; +} + +void profilingHandleArg0(int32_t FuncId, +                         XRayEntryType Entry) XRAY_NEVER_INSTRUMENT { +  unsigned char CPU; +  auto TSC = readTSC(CPU); +  RecursionGuard G(ReentranceGuard); +  if (!G) +    return; + +  auto Status = atomic_load(&ProfilerLogStatus, memory_order_acquire); +  if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_UNINITIALIZED || +               Status == XRayLogInitStatus::XRAY_LOG_INITIALIZING)) +    return; + +  if (UNLIKELY(Status == XRayLogInitStatus::XRAY_LOG_FINALIZED || +               Status == XRayLogInitStatus::XRAY_LOG_FINALIZING)) { +    postCurrentThreadFCT(TLD); +    return; +  } + +  auto T = getThreadLocalData(); +  if (T == nullptr) +    return; + +  auto FCT = reinterpret_cast<FunctionCallTrie *>(atomic_load_relaxed(&T->FCT)); +  switch (Entry) { +  case XRayEntryType::ENTRY: +  case XRayEntryType::LOG_ARGS_ENTRY: +    FCT->enterFunction(FuncId, TSC, CPU); +    break; +  case XRayEntryType::EXIT: +  case XRayEntryType::TAIL: +    FCT->exitFunction(FuncId, TSC, CPU); +    break; +  default: +    // FIXME: Handle bugs. +    break; +  } +} + +void profilingHandleArg1(int32_t FuncId, XRayEntryType Entry, +                         uint64_t) XRAY_NEVER_INSTRUMENT { +  return profilingHandleArg0(FuncId, Entry); +} + +XRayLogInitStatus profilingFinalize() XRAY_NEVER_INSTRUMENT { +  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_INITIALIZED; +  if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus, +                                      XRayLogInitStatus::XRAY_LOG_FINALIZING, +                                      memory_order_release)) { +    if (Verbosity()) +      Report("Cannot finalize profile, the profiling is not initialized.\n"); +    return static_cast<XRayLogInitStatus>(CurrentStatus); +  } + +  // Mark then finalize the current generation of buffers. This allows us to let +  // the threads currently holding onto new buffers still use them, but let the +  // last reference do the memory cleanup. +  DCHECK_NE(BQ, nullptr); +  BQ->finalize(); + +  // Wait a grace period to allow threads to see that we're finalizing. +  SleepForMillis(profilingFlags()->grace_period_ms); + +  // If we for some reason are entering this function from an instrumented +  // handler, we bail out. +  RecursionGuard G(ReentranceGuard); +  if (!G) +    return static_cast<XRayLogInitStatus>(CurrentStatus); + +  // Post the current thread's data if we have any. +  postCurrentThreadFCT(TLD); + +  // Then we force serialize the log data. 
+  profileCollectorService::serialize(); + +  atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_FINALIZED, +               memory_order_release); +  return XRayLogInitStatus::XRAY_LOG_FINALIZED; +} + +XRayLogInitStatus +profilingLoggingInit(size_t, size_t, void *Options, +                     size_t OptionsSize) XRAY_NEVER_INSTRUMENT { +  RecursionGuard G(ReentranceGuard); +  if (!G) +    return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; + +  s32 CurrentStatus = XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; +  if (!atomic_compare_exchange_strong(&ProfilerLogStatus, &CurrentStatus, +                                      XRayLogInitStatus::XRAY_LOG_INITIALIZING, +                                      memory_order_acq_rel)) { +    if (Verbosity()) +      Report("Cannot initialize already initialised profiling " +             "implementation.\n"); +    return static_cast<XRayLogInitStatus>(CurrentStatus); +  } + +  { +    SpinMutexLock Lock(&ProfilerOptionsMutex); +    FlagParser ConfigParser; +    ProfilerFlags Flags; +    Flags.setDefaults(); +    registerProfilerFlags(&ConfigParser, &Flags); +    ConfigParser.ParseString(profilingCompilerDefinedFlags()); +    const char *Env = GetEnv("XRAY_PROFILING_OPTIONS"); +    if (Env == nullptr) +      Env = ""; +    ConfigParser.ParseString(Env); + +    // Then parse the configuration string provided. +    ConfigParser.ParseString(static_cast<const char *>(Options)); +    if (Verbosity()) +      ReportUnrecognizedFlags(); +    *profilingFlags() = Flags; +  } + +  // We need to reset the profile data collection implementation now. +  profileCollectorService::reset(); + +  // Then also reset the buffer queue implementation. +  if (BQ == nullptr) { +    bool Success = false; +    new (&BufferQueueStorage) +        BufferQueue(profilingFlags()->per_thread_allocator_max, +                    profilingFlags()->buffers_max, Success); +    if (!Success) { +      if (Verbosity()) +        Report("Failed to initialize preallocated memory buffers!"); +      atomic_store(&ProfilerLogStatus, +                   XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, +                   memory_order_release); +      return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; +    } + +    // If we've succeeded, set the global pointer to the initialised storage. +    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage); +  } else { +    BQ->finalize(); +    auto InitStatus = BQ->init(profilingFlags()->per_thread_allocator_max, +                               profilingFlags()->buffers_max); + +    if (InitStatus != BufferQueue::ErrorCode::Ok) { +      if (Verbosity()) +        Report("Failed to initialize preallocated memory buffers; error: %s", +               BufferQueue::getErrorString(InitStatus)); +      atomic_store(&ProfilerLogStatus, +                   XRayLogInitStatus::XRAY_LOG_UNINITIALIZED, +                   memory_order_release); +      return XRayLogInitStatus::XRAY_LOG_UNINITIALIZED; +    } + +    DCHECK(!BQ->finalizing()); +  } + +  // We need to set up the exit handlers. 
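+  // Two handlers are installed exactly once below: a pthread_key destructor
+  // that posts the exiting thread's trie to the collector, and an Atexit
+  // hook that finalizes and flushes the profile when the process exits.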
+  static pthread_once_t Once = PTHREAD_ONCE_INIT; +  pthread_once( +      &Once, +[] { +        pthread_key_create( +            &ProfilingKey, +[](void *P) XRAY_NEVER_INSTRUMENT { +              if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel)) +                return; + +              if (P == nullptr) +                return; + +              auto T = reinterpret_cast<ProfilingData *>(P); +              if (atomic_load_relaxed(&T->Allocators) == 0) +                return; + +              { +                // If we're somehow executing this while inside a +                // non-reentrant-friendly context, we skip attempting to post +                // the current thread's data. +                RecursionGuard G(ReentranceGuard); +                if (!G) +                  return; + +                postCurrentThreadFCT(*T); +              } +            }); + +        // We also need to set up an exit handler, so that we can get the +        // profile information at exit time. We use the C API to do this, to not +        // rely on C++ ABI functions for registering exit handlers. +        Atexit(+[]() XRAY_NEVER_INSTRUMENT { +          if (atomic_exchange(&ThreadExitingLatch, 1, memory_order_acq_rel)) +            return; + +          auto Cleanup = +              at_scope_exit([]() XRAY_NEVER_INSTRUMENT { cleanupTLD(); }); + +          // Finalize and flush. +          if (profilingFinalize() != XRAY_LOG_FINALIZED || +              profilingFlush() != XRAY_LOG_FLUSHED) +            return; + +          if (Verbosity()) +            Report("XRay Profile flushed at exit."); +        }); +      }); + +  __xray_log_set_buffer_iterator(profileCollectorService::nextBuffer); +  __xray_set_handler(profilingHandleArg0); +  __xray_set_handler_arg1(profilingHandleArg1); + +  atomic_store(&ProfilerLogStatus, XRayLogInitStatus::XRAY_LOG_INITIALIZED, +               memory_order_release); +  if (Verbosity()) +    Report("XRay Profiling init successful.\n"); + +  return XRayLogInitStatus::XRAY_LOG_INITIALIZED; +} + +bool profilingDynamicInitializer() XRAY_NEVER_INSTRUMENT { +  // Set up the flag defaults from the static defaults and the +  // compiler-provided defaults. 
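+  // Note that profilingLoggingInit() later layers configuration in order:
+  // these same defaults, the compiler-provided XRAY_PROFILER_DEFAULT_OPTIONS,
+  // the XRAY_PROFILING_OPTIONS environment variable, and finally the Options
+  // string passed by the caller. Assuming the usual sanitizer flag syntax,
+  // an invocation would look something like:
+  //
+  //   XRAY_PROFILING_OPTIONS="no_flush=true:buffers_max=64" ./a.out
+  //
+  // which pre-allocates 64 buffers and skips writing profile files out.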
+  {
+    SpinMutexLock Lock(&ProfilerOptionsMutex);
+    auto *F = profilingFlags();
+    F->setDefaults();
+    FlagParser ProfilingParser;
+    registerProfilerFlags(&ProfilingParser, F);
+    ProfilingParser.ParseString(profilingCompilerDefinedFlags());
+  }
+
+  XRayLogImpl Impl{
+      profilingLoggingInit,
+      profilingFinalize,
+      profilingHandleArg0,
+      profilingFlush,
+  };
+  auto RegistrationResult = __xray_log_register_mode("xray-profiling", Impl);
+  if (RegistrationResult != XRayLogRegisterStatus::XRAY_REGISTRATION_OK) {
+    if (Verbosity())
+      Report("Cannot register XRay Profiling mode to 'xray-profiling'; error = "
+             "%d\n",
+             RegistrationResult);
+    return false;
+  }
+
+  if (!internal_strcmp(flags()->xray_mode, "xray-profiling"))
+    __xray_log_select_mode("xray-profiling");
+  return true;
+}
+
+} // namespace __xray
+
+static auto UNUSED Unused = __xray::profilingDynamicInitializer();
diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_profiling_flags.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_profiling_flags.cpp
new file mode 100644
index 000000000000..0e89b7420f8c
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_profiling_flags.cpp
@@ -0,0 +1,39 @@
+//===-- xray_profiling_flags.cpp -------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay profiling runtime flags.
+//===----------------------------------------------------------------------===//
+
+#include "xray_profiling_flags.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "sanitizer_common/sanitizer_flag_parser.h"
+#include "sanitizer_common/sanitizer_libc.h"
+#include "xray_defs.h"
+
+namespace __xray {
+
+// Storage for the profiling flags.
+ProfilerFlags xray_profiling_flags_dont_use_directly;
+
+void ProfilerFlags::setDefaults() XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description) Name = DefaultValue;
+#include "xray_profiling_flags.inc"
+#undef XRAY_FLAG
+}
+
+void registerProfilerFlags(FlagParser *P,
+                           ProfilerFlags *F) XRAY_NEVER_INSTRUMENT {
+#define XRAY_FLAG(Type, Name, DefaultValue, Description)                       \
+  RegisterFlag(P, #Name, Description, &F->Name);
+#include "xray_profiling_flags.inc"
+#undef XRAY_FLAG
+}
+
+} // namespace __xray
diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_profiling_flags.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_profiling_flags.h
new file mode 100644
index 000000000000..d67f240adc88
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_profiling_flags.h
@@ -0,0 +1,38 @@
+//===-- xray_profiling_flags.h ----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// XRay profiling runtime flags.
+//===----------------------------------------------------------------------===// + +#ifndef XRAY_PROFILER_FLAGS_H +#define XRAY_PROFILER_FLAGS_H + +#include "sanitizer_common/sanitizer_flag_parser.h" +#include "sanitizer_common/sanitizer_internal_defs.h" + +namespace __xray { + +struct ProfilerFlags { +#define XRAY_FLAG(Type, Name, DefaultValue, Description) Type Name; +#include "xray_profiling_flags.inc" +#undef XRAY_FLAG + +  void setDefaults(); +}; + +extern ProfilerFlags xray_profiling_flags_dont_use_directly; +inline ProfilerFlags *profilingFlags() { +  return &xray_profiling_flags_dont_use_directly; +} +void registerProfilerFlags(FlagParser *P, ProfilerFlags *F); + +} // namespace __xray + +#endif // XRAY_PROFILER_FLAGS_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_profiling_flags.inc b/contrib/llvm-project/compiler-rt/lib/xray/xray_profiling_flags.inc new file mode 100644 index 000000000000..4f6138872af7 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_profiling_flags.inc @@ -0,0 +1,31 @@ +//===-- xray_profiling_flags.inc --------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// XRay profiling runtime flags. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_FLAG +#error "Define XRAY_FLAG prior to including this file!" +#endif + +XRAY_FLAG(uptr, per_thread_allocator_max, 16384, +          "Maximum size of any single per-thread allocator.") +XRAY_FLAG(uptr, global_allocator_max, 2 << 24, +          "Maximum size of the global allocator for profile storage.") +XRAY_FLAG(uptr, stack_allocator_max, 2 << 20, +          "Maximum size of the traversal stack allocator.") +XRAY_FLAG(int, grace_period_ms, 1, +          "Profile collection will wait this much time in milliseconds before " +          "resetting the global state. This gives a chance to threads to " +          "notice that the profiler has been finalized and clean up.") +XRAY_FLAG(bool, no_flush, false, +          "Set to true if we want the profiling implementation to not write " +          "out files.") +XRAY_FLAG(int, buffers_max, 128, +          "The number of buffers to pre-allocate used by the profiling " +          "implementation.") diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_recursion_guard.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_recursion_guard.h new file mode 100644 index 000000000000..3b6158a2d36c --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_recursion_guard.h @@ -0,0 +1,56 @@ +//===-- xray_recursion_guard.h ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. 
+// +//===----------------------------------------------------------------------===// +#ifndef XRAY_XRAY_RECURSION_GUARD_H +#define XRAY_XRAY_RECURSION_GUARD_H + +#include "sanitizer_common/sanitizer_atomic.h" + +namespace __xray { + +/// The RecursionGuard is useful for guarding against signal handlers which are +/// also potentially calling XRay-instrumented functions. To use the +/// RecursionGuard, you'll typically need a thread_local atomic_uint8_t: +/// +///   thread_local atomic_uint8_t Guard{0}; +/// +///   // In a handler function: +///   void handleArg0(int32_t F, XRayEntryType T) { +///     RecursionGuard G(Guard); +///     if (!G) +///       return;  // Failed to acquire the guard. +///     ... +///   } +/// +class RecursionGuard { +  atomic_uint8_t &Running; +  const bool Valid; + +public: +  explicit inline RecursionGuard(atomic_uint8_t &R) +      : Running(R), Valid(!atomic_exchange(&R, 1, memory_order_acq_rel)) {} + +  inline RecursionGuard(const RecursionGuard &) = delete; +  inline RecursionGuard(RecursionGuard &&) = delete; +  inline RecursionGuard &operator=(const RecursionGuard &) = delete; +  inline RecursionGuard &operator=(RecursionGuard &&) = delete; + +  explicit inline operator bool() const { return Valid; } + +  inline ~RecursionGuard() noexcept { +    if (Valid) +      atomic_store(&Running, 0, memory_order_release); +  } +}; + +} // namespace __xray + +#endif // XRAY_XRAY_RECURSION_GUARD_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_segmented_array.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_segmented_array.h new file mode 100644 index 000000000000..3ab174bcbe18 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_segmented_array.h @@ -0,0 +1,649 @@ +//===-- xray_segmented_array.h ---------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Defines the implementation of a segmented array, with fixed-size segments +// backing the segments. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_SEGMENTED_ARRAY_H +#define XRAY_SEGMENTED_ARRAY_H + +#include "sanitizer_common/sanitizer_allocator.h" +#include "xray_allocator.h" +#include "xray_utils.h" +#include <cassert> +#include <type_traits> +#include <utility> + +namespace __xray { + +/// The Array type provides an interface similar to std::vector<...> but does +/// not shrink in size. Once constructed, elements can be appended but cannot be +/// removed. The implementation is heavily dependent on the contract provided by +/// the Allocator type, in that all memory will be released when the Allocator +/// is destroyed. When an Array is destroyed, it will destroy elements in the +/// backing store but will not free the memory. +template <class T> class Array { +  struct Segment { +    Segment *Prev; +    Segment *Next; +    char Data[1]; +  }; + +public: +  // Each segment of the array will be laid out with the following assumptions: +  // +  //   - Each segment will be on a cache-line address boundary (kCacheLineSize +  //     aligned). +  // +  //   - The elements will be accessed through an aligned pointer, dependent on +  //     the alignment of T. 
+  // +  //   - Each element is at least two-pointers worth from the beginning of the +  //     Segment, aligned properly, and the rest of the elements are accessed +  //     through appropriate alignment. +  // +  // We then compute the size of the segment to follow this logic: +  // +  //   - Compute the number of elements that can fit within +  //     kCacheLineSize-multiple segments, minus the size of two pointers. +  // +  //   - Request cacheline-multiple sized elements from the allocator. +  static constexpr uint64_t AlignedElementStorageSize = sizeof(T); + +  static constexpr uint64_t SegmentControlBlockSize = sizeof(Segment *) * 2; + +  static constexpr uint64_t SegmentSize = nearest_boundary( +      SegmentControlBlockSize + next_pow2(sizeof(T)), kCacheLineSize); + +  using AllocatorType = Allocator<SegmentSize>; + +  static constexpr uint64_t ElementsPerSegment = +      (SegmentSize - SegmentControlBlockSize) / next_pow2(sizeof(T)); + +  static_assert(ElementsPerSegment > 0, +                "Must have at least 1 element per segment."); + +  static Segment SentinelSegment; + +  using size_type = uint64_t; + +private: +  // This Iterator models a BidirectionalIterator. +  template <class U> class Iterator { +    Segment *S = &SentinelSegment; +    uint64_t Offset = 0; +    uint64_t Size = 0; + +  public: +    Iterator(Segment *IS, uint64_t Off, uint64_t S) XRAY_NEVER_INSTRUMENT +        : S(IS), +          Offset(Off), +          Size(S) {} +    Iterator(const Iterator &) NOEXCEPT XRAY_NEVER_INSTRUMENT = default; +    Iterator() NOEXCEPT XRAY_NEVER_INSTRUMENT = default; +    Iterator(Iterator &&) NOEXCEPT XRAY_NEVER_INSTRUMENT = default; +    Iterator &operator=(const Iterator &) XRAY_NEVER_INSTRUMENT = default; +    Iterator &operator=(Iterator &&) XRAY_NEVER_INSTRUMENT = default; +    ~Iterator() XRAY_NEVER_INSTRUMENT = default; + +    Iterator &operator++() XRAY_NEVER_INSTRUMENT { +      if (++Offset % ElementsPerSegment || Offset == Size) +        return *this; + +      // At this point, we know that Offset % N == 0, so we must advance the +      // segment pointer. 
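+      // (For instance, with ElementsPerSegment == 4 and Size == 8, stepping
+      // from Offset 3 to 4 moves to the next segment, while stepping from 7
+      // to 8 hits Offset == Size and stays put, keeping the end iterator
+      // anchored on the tail segment.)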
+      DCHECK_EQ(Offset % ElementsPerSegment, 0); +      DCHECK_NE(Offset, Size); +      DCHECK_NE(S, &SentinelSegment); +      DCHECK_NE(S->Next, &SentinelSegment); +      S = S->Next; +      DCHECK_NE(S, &SentinelSegment); +      return *this; +    } + +    Iterator &operator--() XRAY_NEVER_INSTRUMENT { +      DCHECK_NE(S, &SentinelSegment); +      DCHECK_GT(Offset, 0); + +      auto PreviousOffset = Offset--; +      if (PreviousOffset != Size && PreviousOffset % ElementsPerSegment == 0) { +        DCHECK_NE(S->Prev, &SentinelSegment); +        S = S->Prev; +      } + +      return *this; +    } + +    Iterator operator++(int) XRAY_NEVER_INSTRUMENT { +      Iterator Copy(*this); +      ++(*this); +      return Copy; +    } + +    Iterator operator--(int) XRAY_NEVER_INSTRUMENT { +      Iterator Copy(*this); +      --(*this); +      return Copy; +    } + +    template <class V, class W> +    friend bool operator==(const Iterator<V> &L, +                           const Iterator<W> &R) XRAY_NEVER_INSTRUMENT { +      return L.S == R.S && L.Offset == R.Offset; +    } + +    template <class V, class W> +    friend bool operator!=(const Iterator<V> &L, +                           const Iterator<W> &R) XRAY_NEVER_INSTRUMENT { +      return !(L == R); +    } + +    U &operator*() const XRAY_NEVER_INSTRUMENT { +      DCHECK_NE(S, &SentinelSegment); +      auto RelOff = Offset % ElementsPerSegment; + +      // We need to compute the character-aligned pointer, offset from the +      // segment's Data location to get the element in the position of Offset. +      auto Base = &S->Data; +      auto AlignedOffset = Base + (RelOff * AlignedElementStorageSize); +      return *reinterpret_cast<U *>(AlignedOffset); +    } + +    U *operator->() const XRAY_NEVER_INSTRUMENT { return &(**this); } +  }; + +  AllocatorType *Alloc; +  Segment *Head; +  Segment *Tail; + +  // Here we keep track of segments in the freelist, to allow us to re-use +  // segments when elements are trimmed off the end. +  Segment *Freelist; +  uint64_t Size; + +  // =============================== +  // In the following implementation, we work through the algorithms and the +  // list operations using the following notation: +  // +  //   - pred(s) is the predecessor (previous node accessor) and succ(s) is +  //     the successor (next node accessor). +  // +  //   - S is a sentinel segment, which has the following property: +  // +  //         pred(S) == succ(S) == S +  // +  //   - @ is a loop operator, which can imply pred(s) == s if it appears on +  //     the left of s, or succ(s) == S if it appears on the right of s. +  // +  //   - sL <-> sR : means a bidirectional relation between sL and sR, which +  //     means: +  // +  //         succ(sL) == sR && pred(SR) == sL +  // +  //   - sL -> sR : implies a unidirectional relation between sL and SR, +  //     with the following properties: +  // +  //         succ(sL) == sR +  // +  //     sL <- sR : implies a unidirectional relation between sR and sL, +  //     with the following properties: +  // +  //         pred(sR) == sL +  // +  // =============================== + +  Segment *NewSegment() XRAY_NEVER_INSTRUMENT { +    // We need to handle the case in which enough elements have been trimmed to +    // allow us to re-use segments we've allocated before. For this we look into +    // the Freelist, to see whether we need to actually allocate new blocks or +    // just re-use blocks we've already seen before. 
+    if (Freelist != &SentinelSegment) { +      // The current state of lists resemble something like this at this point: +      // +      //   Freelist: @S@<-f0->...<->fN->@S@ +      //                  ^ Freelist +      // +      // We want to perform a splice of `f0` from Freelist to a temporary list, +      // which looks like: +      // +      //   Templist: @S@<-f0->@S@ +      //                  ^ FreeSegment +      // +      // Our algorithm preconditions are: +      DCHECK_EQ(Freelist->Prev, &SentinelSegment); + +      // Then the algorithm we implement is: +      // +      //   SFS = Freelist +      //   Freelist = succ(Freelist) +      //   if (Freelist != S) +      //     pred(Freelist) = S +      //   succ(SFS) = S +      //   pred(SFS) = S +      // +      auto *FreeSegment = Freelist; +      Freelist = Freelist->Next; + +      // Note that we need to handle the case where Freelist is now pointing to +      // S, which we don't want to be overwriting. +      // TODO: Determine whether the cost of the branch is higher than the cost +      // of the blind assignment. +      if (Freelist != &SentinelSegment) +        Freelist->Prev = &SentinelSegment; + +      FreeSegment->Next = &SentinelSegment; +      FreeSegment->Prev = &SentinelSegment; + +      // Our postconditions are: +      DCHECK_EQ(Freelist->Prev, &SentinelSegment); +      DCHECK_NE(FreeSegment, &SentinelSegment); +      return FreeSegment; +    } + +    auto SegmentBlock = Alloc->Allocate(); +    if (SegmentBlock.Data == nullptr) +      return nullptr; + +    // Placement-new the Segment element at the beginning of the SegmentBlock. +    new (SegmentBlock.Data) Segment{&SentinelSegment, &SentinelSegment, {0}}; +    auto SB = reinterpret_cast<Segment *>(SegmentBlock.Data); +    return SB; +  } + +  Segment *InitHeadAndTail() XRAY_NEVER_INSTRUMENT { +    DCHECK_EQ(Head, &SentinelSegment); +    DCHECK_EQ(Tail, &SentinelSegment); +    auto S = NewSegment(); +    if (S == nullptr) +      return nullptr; +    DCHECK_EQ(S->Next, &SentinelSegment); +    DCHECK_EQ(S->Prev, &SentinelSegment); +    DCHECK_NE(S, &SentinelSegment); +    Head = S; +    Tail = S; +    DCHECK_EQ(Head, Tail); +    DCHECK_EQ(Tail->Next, &SentinelSegment); +    DCHECK_EQ(Tail->Prev, &SentinelSegment); +    return S; +  } + +  Segment *AppendNewSegment() XRAY_NEVER_INSTRUMENT { +    auto S = NewSegment(); +    if (S == nullptr) +      return nullptr; +    DCHECK_NE(Tail, &SentinelSegment); +    DCHECK_EQ(Tail->Next, &SentinelSegment); +    DCHECK_EQ(S->Prev, &SentinelSegment); +    DCHECK_EQ(S->Next, &SentinelSegment); +    S->Prev = Tail; +    Tail->Next = S; +    Tail = S; +    DCHECK_EQ(S, S->Prev->Next); +    DCHECK_EQ(Tail->Next, &SentinelSegment); +    return S; +  } + +public: +  explicit Array(AllocatorType &A) XRAY_NEVER_INSTRUMENT +      : Alloc(&A), +        Head(&SentinelSegment), +        Tail(&SentinelSegment), +        Freelist(&SentinelSegment), +        Size(0) {} + +  Array() XRAY_NEVER_INSTRUMENT : Alloc(nullptr), +                                  Head(&SentinelSegment), +                                  Tail(&SentinelSegment), +                                  Freelist(&SentinelSegment), +                                  Size(0) {} + +  Array(const Array &) = delete; +  Array &operator=(const Array &) = delete; + +  Array(Array &&O) XRAY_NEVER_INSTRUMENT : Alloc(O.Alloc), +                                           Head(O.Head), +                                           Tail(O.Tail), +                                           
Freelist(O.Freelist),
+                                           Size(O.Size) {
+    O.Alloc = nullptr;
+    O.Head = &SentinelSegment;
+    O.Tail = &SentinelSegment;
+    O.Size = 0;
+    O.Freelist = &SentinelSegment;
+  }
+
+  Array &operator=(Array &&O) XRAY_NEVER_INSTRUMENT {
+    Alloc = O.Alloc;
+    O.Alloc = nullptr;
+    Head = O.Head;
+    O.Head = &SentinelSegment;
+    Tail = O.Tail;
+    O.Tail = &SentinelSegment;
+    Freelist = O.Freelist;
+    O.Freelist = &SentinelSegment;
+    Size = O.Size;
+    O.Size = 0;
+    return *this;
+  }
+
+  ~Array() XRAY_NEVER_INSTRUMENT {
+    for (auto &E : *this)
+      (&E)->~T();
+  }
+
+  bool empty() const XRAY_NEVER_INSTRUMENT { return Size == 0; }
+
+  AllocatorType &allocator() const XRAY_NEVER_INSTRUMENT {
+    DCHECK_NE(Alloc, nullptr);
+    return *Alloc;
+  }
+
+  uint64_t size() const XRAY_NEVER_INSTRUMENT { return Size; }
+
+  template <class... Args>
+  T *AppendEmplace(Args &&... args) XRAY_NEVER_INSTRUMENT {
+    DCHECK((Size == 0 && Head == &SentinelSegment && Head == Tail) ||
+           (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment));
+    if (UNLIKELY(Head == &SentinelSegment)) {
+      auto R = InitHeadAndTail();
+      if (R == nullptr)
+        return nullptr;
+    }
+
+    DCHECK_NE(Head, &SentinelSegment);
+    DCHECK_NE(Tail, &SentinelSegment);
+
+    auto Offset = Size % ElementsPerSegment;
+    if (UNLIKELY(Size != 0 && Offset == 0))
+      if (AppendNewSegment() == nullptr)
+        return nullptr;
+
+    DCHECK_NE(Tail, &SentinelSegment);
+    auto Base = &Tail->Data;
+    auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+    DCHECK_LE(AlignedOffset + sizeof(T),
+              reinterpret_cast<unsigned char *>(Tail) + SegmentSize);
+
+    // In-place construct at Position.
+    new (AlignedOffset) T{std::forward<Args>(args)...};
+    ++Size;
+    return reinterpret_cast<T *>(AlignedOffset);
+  }
+
+  T *Append(const T &E) XRAY_NEVER_INSTRUMENT {
+    // FIXME: This is a duplication of AppendEmplace with the copy semantics
+    // explicitly used, as a work-around to GCC 4.8 not invoking the copy
+    // constructor with the placement new with braced-init syntax.
+    DCHECK((Size == 0 && Head == &SentinelSegment && Head == Tail) ||
+           (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment));
+    if (UNLIKELY(Head == &SentinelSegment)) {
+      auto R = InitHeadAndTail();
+      if (R == nullptr)
+        return nullptr;
+    }
+
+    DCHECK_NE(Head, &SentinelSegment);
+    DCHECK_NE(Tail, &SentinelSegment);
+
+    auto Offset = Size % ElementsPerSegment;
+    if (UNLIKELY(Size != 0 && Offset == 0))
+      if (AppendNewSegment() == nullptr)
+        return nullptr;
+
+    DCHECK_NE(Tail, &SentinelSegment);
+    auto Base = &Tail->Data;
+    auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+    DCHECK_LE(AlignedOffset + sizeof(T),
+              reinterpret_cast<unsigned char *>(Tail) + SegmentSize);
+
+    // In-place construct at Position.
+    new (AlignedOffset) T(E);
+    ++Size;
+    return reinterpret_cast<T *>(AlignedOffset);
+  }
+
+  T &operator[](uint64_t Offset) const XRAY_NEVER_INSTRUMENT {
+    DCHECK_LE(Offset, Size);
+    // We need to traverse the array enough times to find the element at Offset.
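+    // (Indexing therefore costs O(Offset / ElementsPerSegment) pointer hops;
+    // e.g. with 4 elements per segment, Offset 10 walks past two segments and
+    // reads slot 2 of the third.)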
+    auto S = Head;
+    while (Offset >= ElementsPerSegment) {
+      S = S->Next;
+      Offset -= ElementsPerSegment;
+      DCHECK_NE(S, &SentinelSegment);
+    }
+    auto Base = &S->Data;
+    auto AlignedOffset = Base + (Offset * AlignedElementStorageSize);
+    auto Position = reinterpret_cast<T *>(AlignedOffset);
+    return *Position;
+  }
+
+  T &front() const XRAY_NEVER_INSTRUMENT {
+    DCHECK_NE(Head, &SentinelSegment);
+    DCHECK_NE(Size, 0u);
+    return *begin();
+  }
+
+  T &back() const XRAY_NEVER_INSTRUMENT {
+    DCHECK_NE(Tail, &SentinelSegment);
+    DCHECK_NE(Size, 0u);
+    auto It = end();
+    --It;
+    return *It;
+  }
+
+  template <class Predicate>
+  T *find_element(Predicate P) const XRAY_NEVER_INSTRUMENT {
+    if (empty())
+      return nullptr;
+
+    auto E = end();
+    for (auto I = begin(); I != E; ++I)
+      if (P(*I))
+        return &(*I);
+
+    return nullptr;
+  }
+
+  /// Remove N elements from the end. This leaves the blocks behind, and does
+  /// not require allocation of new blocks for elements added after trimming.
+  void trim(uint64_t Elements) XRAY_NEVER_INSTRUMENT {
+    auto OldSize = Size;
+    Elements = Elements > Size ? Size : Elements;
+    Size -= Elements;
+
+    // We compute the number of segments we're going to return from the tail by
+    // counting how many elements have been trimmed. Given the following:
+    //
+    // - Each segment has N valid positions, where N > 0
+    // - The previous size > current size
+    //
+    // To compute the number of segments to return, we need to perform the
+    // following calculations for the number of segments required given 'x'
+    // elements:
+    //
+    //   f(x) = {
+    //            x == 0          : 0
+    //          , 0 < x <= N      : 1
+    //          , N < x <= max    : x / N + (x % N ? 1 : 0)
+    //          }
+    //
+    // We can simplify this down to:
+    //
+    //   f(x) = {
+    //            x == 0          : 0
+    //          , 0 < x <= max    : x / N + (x < N || x % N ? 1 : 0)
+    //          }
+    //
+    // And further down to:
+    //
+    //   f(x) = x ? x / N + (x < N || x % N ? 1 : 0) : 0
+    //
+    // We can then perform the following calculation `s` which counts the number
+    // of segments we need to remove from the end of the data structure:
+    //
+    //   s(p, c) = f(p) - f(c)
+    //
+    // If we treat p = previous size, and c = current size, and given the
+    // properties above, the possible range for s(...) is [0..max(typeof(p))/N]
+    // given that typeof(p) == typeof(c).
+    auto F = [](uint64_t X) {
+      return X ? (X / ElementsPerSegment) +
+                     (X < ElementsPerSegment || X % ElementsPerSegment ? 1 : 0)
+               : 0;
+    };
+    auto PS = F(OldSize);
+    auto CS = F(Size);
+    DCHECK_GE(PS, CS);
+    auto SegmentsToTrim = PS - CS;
+    for (auto I = 0uL; I < SegmentsToTrim; ++I) {
+      // Here we place the current tail segment on the freelist. To do this
+      // appropriately, we need to perform a splice operation on two
+      // bidirectional linked-lists. In particular, we have the current state of
+      // the doubly-linked list of segments:
+      //
+      //   @S@ <- s0 <-> s1 <-> ...
<-> sT -> @S@ +      // +      DCHECK_NE(Head, &SentinelSegment); +      DCHECK_NE(Tail, &SentinelSegment); +      DCHECK_EQ(Tail->Next, &SentinelSegment); + +      if (Freelist == &SentinelSegment) { +        // Our two lists at this point are in this configuration: +        // +        //   Freelist: (potentially) @S@ +        //   Mainlist: @S@<-s0<->s1<->...<->sPT<->sT->@S@ +        //                  ^ Head                ^ Tail +        // +        // The end state for us will be this configuration: +        // +        //   Freelist: @S@<-sT->@S@ +        //   Mainlist: @S@<-s0<->s1<->...<->sPT->@S@ +        //                  ^ Head          ^ Tail +        // +        // The first step for us is to hold a reference to the tail of Mainlist, +        // which in our notation is represented by sT. We call this our "free +        // segment" which is the segment we are placing on the Freelist. +        // +        //   sF = sT +        // +        // Then, we also hold a reference to the "pre-tail" element, which we +        // call sPT: +        // +        //   sPT = pred(sT) +        // +        // We want to splice sT into the beginning of the Freelist, which in +        // an empty Freelist means placing a segment whose predecessor and +        // successor is the sentinel segment. +        // +        // The splice operation then can be performed in the following +        // algorithm: +        // +        //   succ(sPT) = S +        //   pred(sT) = S +        //   succ(sT) = Freelist +        //   Freelist = sT +        //   Tail = sPT +        // +        auto SPT = Tail->Prev; +        SPT->Next = &SentinelSegment; +        Tail->Prev = &SentinelSegment; +        Tail->Next = Freelist; +        Freelist = Tail; +        Tail = SPT; + +        // Our post-conditions here are: +        DCHECK_EQ(Tail->Next, &SentinelSegment); +        DCHECK_EQ(Freelist->Prev, &SentinelSegment); +      } else { +        // In the other case, where the Freelist is not empty, we perform the +        // following transformation instead: +        // +        // This transforms the current state: +        // +        //   Freelist: @S@<-f0->@S@ +        //                  ^ Freelist +        //   Mainlist: @S@<-s0<->s1<->...<->sPT<->sT->@S@ +        //                  ^ Head                ^ Tail +        // +        // Into the following: +        // +        //   Freelist: @S@<-sT<->f0->@S@ +        //                  ^ Freelist +        //   Mainlist: @S@<-s0<->s1<->...<->sPT->@S@ +        //                  ^ Head          ^ Tail +        // +        // The algorithm is: +        // +        //   sFH = Freelist +        //   sPT = pred(sT) +        //   pred(SFH) = sT +        //   succ(sT) = Freelist +        //   pred(sT) = S +        //   succ(sPT) = S +        //   Tail = sPT +        //   Freelist = sT +        // +        auto SFH = Freelist; +        auto SPT = Tail->Prev; +        auto ST = Tail; +        SFH->Prev = ST; +        ST->Next = Freelist; +        ST->Prev = &SentinelSegment; +        SPT->Next = &SentinelSegment; +        Tail = SPT; +        Freelist = ST; + +        // Our post-conditions here are: +        DCHECK_EQ(Tail->Next, &SentinelSegment); +        DCHECK_EQ(Freelist->Prev, &SentinelSegment); +        DCHECK_EQ(Freelist->Next->Prev, Freelist); +      } +    } + +    // Now in case we've spliced all the segments in the end, we ensure that the +    // main list is "empty", or both the head and tail pointing to the sentinel +    // segment. 
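+    // (The DCHECKs below re-assert the container invariants: a non-empty
+    // freelist always has the sentinel as its first segment's predecessor,
+    // while an empty freelist implies the tail's successor is the sentinel.)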
+    if (Tail == &SentinelSegment) +      Head = Tail; + +    DCHECK( +        (Size == 0 && Head == &SentinelSegment && Tail == &SentinelSegment) || +        (Size != 0 && Head != &SentinelSegment && Tail != &SentinelSegment)); +    DCHECK( +        (Freelist != &SentinelSegment && Freelist->Prev == &SentinelSegment) || +        (Freelist == &SentinelSegment && Tail->Next == &SentinelSegment)); +  } + +  // Provide iterators. +  Iterator<T> begin() const XRAY_NEVER_INSTRUMENT { +    return Iterator<T>(Head, 0, Size); +  } +  Iterator<T> end() const XRAY_NEVER_INSTRUMENT { +    return Iterator<T>(Tail, Size, Size); +  } +  Iterator<const T> cbegin() const XRAY_NEVER_INSTRUMENT { +    return Iterator<const T>(Head, 0, Size); +  } +  Iterator<const T> cend() const XRAY_NEVER_INSTRUMENT { +    return Iterator<const T>(Tail, Size, Size); +  } +}; + +// We need to have this storage definition out-of-line so that the compiler can +// ensure that storage for the SentinelSegment is defined and has a single +// address. +template <class T> +typename Array<T>::Segment Array<T>::SentinelSegment{ +    &Array<T>::SentinelSegment, &Array<T>::SentinelSegment, {'\0'}}; + +} // namespace __xray + +#endif // XRAY_SEGMENTED_ARRAY_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_AArch64.S b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_AArch64.S new file mode 100644 index 000000000000..536a79e0d150 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_AArch64.S @@ -0,0 +1,169 @@ +#include "../builtins/assembly.h" +#include "../sanitizer_common/sanitizer_asm.h" + +.macro SAVE_REGISTERS +  stp x1, x2, [sp, #-16]! +  stp x3, x4, [sp, #-16]! +  stp x5, x6, [sp, #-16]! +  stp x7, x30, [sp, #-16]! +  stp q0, q1, [sp, #-32]! +  stp q2, q3, [sp, #-32]! +  stp q4, q5, [sp, #-32]! +  stp q6, q7, [sp, #-32]! +  // x8 is the indirect result register and needs to be preserved for the body of the function to use. +  stp x8, x0, [sp, #-16]! +.endm + +.macro RESTORE_REGISTERS +  ldp x8, x0, [sp], #16 +  ldp q6, q7, [sp], #32 +  ldp q4, q5, [sp], #32 +  ldp q2, q3, [sp], #32 +  ldp q0, q1, [sp], #32 +  ldp x7, x30, [sp], #16 +  ldp x5, x6, [sp], #16 +  ldp x3, x4, [sp], #16 +  ldp x1, x2, [sp], #16 +.endm + +.text +.p2align 2 +.global ASM_SYMBOL(__xray_FunctionEntry) +ASM_HIDDEN(__xray_FunctionEntry) +ASM_TYPE_FUNCTION(__xray_FunctionEntry) +ASM_SYMBOL(__xray_FunctionEntry): +    /* Move the return address beyond the end of sled data. The 12 bytes of +         data are inserted in the code of the runtime patch, between the call +         instruction and the instruction returned into. The data contains 32 +         bits of instrumented function ID and 64 bits of the address of +         the current trampoline. */ +  add x30, x30, #12 +  // Push the registers which may be modified by the handler function. +  SAVE_REGISTERS + +  // Load the handler function pointer. +  adrp x2, ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE) +  ldr x2, [x2, #:lo12:ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)] +  cbz x2, 1f +  // Set w0 to the function ID (w17). Set x1 to XRayEntryType::ENTRY = 0. +  mov w0, w17 +  mov x1, #0 +  // Call the handler with 2 parameters. +  blr x2 +1: +  RESTORE_REGISTERS +  ret +ASM_SIZE(__xray_FunctionEntry) + +.p2align 2 +.global ASM_SYMBOL(__xray_FunctionExit) +ASM_HIDDEN(__xray_FunctionExit) +ASM_TYPE_FUNCTION(__xray_FunctionExit) +ASM_SYMBOL(__xray_FunctionExit): +    /* Move the return address beyond the end of sled data. 
The 12 bytes of +         data are inserted in the code of the runtime patch, between the call +         instruction and the instruction returned into. The data contains 32 +         bits of instrumented function ID and 64 bits of the address of +         the current trampoline. */ +  add x30, x30, #12 +  SAVE_REGISTERS + +  // Load the handler function pointer into x2. +  adrp x2, ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE) +  ldr x2, [x2, #:lo12:ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)] +  cbz x2, 1f +  // Set w0 to the function ID (w17). Set x1 to XRayEntryType::EXIT = 1. +  mov w0, w17 +  mov x1, #1 +  // Call the handler with 2 parameters. +  blr x2 +1: +  RESTORE_REGISTERS +  ret +ASM_SIZE(__xray_FunctionExit) + +.p2align 2 +.global ASM_SYMBOL(__xray_FunctionTailExit) +ASM_HIDDEN(__xray_FunctionTailExit) +ASM_TYPE_FUNCTION(__xray_FunctionTailExit) +ASM_SYMBOL(__xray_FunctionTailExit): +    /* Move the return address beyond the end of sled data. The 12 bytes of +         data are inserted in the code of the runtime patch, between the call +         instruction and the instruction returned into. The data contains 32 +         bits of instrumented function ID and 64 bits of the address of +         the current trampoline. */ +  add x30, x30, #12 +  // Save the registers which may be modified by the handler function. +  SAVE_REGISTERS +  // Load the handler function pointer into x2. +  adrp x2, ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE) +  ldr x2, [x2, #:lo12:ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)] +  cbz x2, 1f +  // Set w0 to the function ID (w17). Set x1 to XRayEntryType::TAIL = 2. +  mov w0, w17 +  mov x1, #2 +  // Call the handler with 2 parameters. +  blr x2 +1: +  RESTORE_REGISTERS +  ret +ASM_SIZE(__xray_FunctionTailExit) + +.p2align 2 +.global ASM_SYMBOL(__xray_ArgLoggerEntry) +ASM_HIDDEN(__xray_ArgLoggerEntry) +ASM_TYPE_FUNCTION(__xray_ArgLoggerEntry) +ASM_SYMBOL(__xray_ArgLoggerEntry): +  add x30, x30, #12 +  // Push the registers which may be modified by the handler function. +  SAVE_REGISTERS + +  adrp x8, ASM_SYMBOL(_ZN6__xray13XRayArgLoggerE) +  ldr x8, [x8, #:lo12:ASM_SYMBOL(_ZN6__xray13XRayArgLoggerE)] +  cbnz x8, 2f + +  // Load the handler function pointer. +  adrp x8, ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE) +  ldr x8, [x8, #:lo12:ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)] +  cbz x8, 1f + +2: +  mov x2, x0 +  mov x1, #3  // XRayEntryType::LOG_ARGS_ENTRY +  mov w0, w17 +  blr x8 + +1: +  RESTORE_REGISTERS +  ret +ASM_SIZE(__xray_ArgLoggerEntry) + +// __xray_*Event have default visibility so that they can be referenced by user +// DSOs that do not link against the runtime. 
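+// A minimal sketch of how user code reaches these (assuming the declarations
+// in <xray/xray_interface.h>): install a handler, then patch the sleds, e.g.
+//   __xray_set_customevent_handler(+[](void *Event, size_t Size) { /*...*/ });
+//   __xray_patch();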
+.global ASM_SYMBOL(__xray_CustomEvent)
+ASM_TYPE_FUNCTION(__xray_CustomEvent)
+ASM_SYMBOL(__xray_CustomEvent):
+  SAVE_REGISTERS
+  adrp x8, ASM_SYMBOL(_ZN6__xray22XRayPatchedCustomEventE)
+  ldr x8, [x8, #:lo12:ASM_SYMBOL(_ZN6__xray22XRayPatchedCustomEventE)]
+  cbz x8, 1f
+  blr x8
+1:
+  RESTORE_REGISTERS
+  ret
+ASM_SIZE(__xray_CustomEvent)
+
+.global ASM_SYMBOL(__xray_TypedEvent)
+ASM_TYPE_FUNCTION(__xray_TypedEvent)
+ASM_SYMBOL(__xray_TypedEvent):
+  SAVE_REGISTERS
+  adrp x8, ASM_SYMBOL(_ZN6__xray21XRayPatchedTypedEventE)
+  ldr x8, [x8, #:lo12:ASM_SYMBOL(_ZN6__xray21XRayPatchedTypedEventE)]
+  cbz x8, 1f
+  blr x8
+1:
+  RESTORE_REGISTERS
+  ret
+ASM_SIZE(__xray_TypedEvent)
+
+NO_EXEC_STACK_DIRECTIVE
diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_arm.S b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_arm.S
new file mode 100644
index 000000000000..3ffc1e443761
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_arm.S
@@ -0,0 +1,105 @@
+#include "../builtins/assembly.h"
+
+    .syntax unified
+    .arch armv6t2
+    .fpu vfpv2
+    .code 32
+    .global _ZN6__xray19XRayPatchedFunctionE
+
+    @ Word-aligned function entry point
+    .p2align 2
+    @ Let C/C++ see the symbol
+    .global __xray_FunctionEntry
+    .hidden __xray_FunctionEntry
+    @ It preserves all registers except r0, r12(ip), r14(lr) and r15(pc)
+    @ Assume that "q" part of the floating-point registers is not used
+    @   for passing parameters to C/C++ functions.
+    .type __xray_FunctionEntry, %function
+    @ In C++ it is extern "C" void __xray_FunctionEntry(uint32_t FuncId) with
+    @   FuncId passed in the r0 register.
+__xray_FunctionEntry:
+    PUSH {r1-r3,lr}
+    @ Save floating-point parameters of the instrumented function
+    VPUSH {d0-d7}
+    MOVW r1, #:lower16:_ZN6__xray19XRayPatchedFunctionE - (. + 16)
+    MOVT r1, #:upper16:_ZN6__xray19XRayPatchedFunctionE - (. + 12)
+    LDR r2, [pc, r1]
+    @ Handler address is nullptr if handler is not set
+    CMP r2, #0
+    BEQ FunctionEntry_restore
+    @ Function ID is already in r0 (the first parameter).
+    @ r1=0 means that we are tracing an entry event
+    MOV r1, #0
+    @ Call the handler with 2 parameters in r0 and r1
+    BLX r2
+FunctionEntry_restore:
+    @ Restore floating-point parameters of the instrumented function
+    VPOP {d0-d7}
+    POP {r1-r3,pc}
+
+    @ Word-aligned function entry point
+    .p2align 2
+    @ Let C/C++ see the symbol
+    .global __xray_FunctionExit
+    .hidden __xray_FunctionExit
+    @ Assume that d1-d7 are not used for the return value.
+    @ Assume that "q" part of the floating-point registers is not used for the
+    @   return value in C/C++.
+    .type __xray_FunctionExit, %function
+    @ In C++ it is extern "C" void __xray_FunctionExit(uint32_t FuncId) with
+    @   FuncId passed in the r0 register.
+__xray_FunctionExit:
+    PUSH {r1-r3,lr}
+    @ Save the floating-point return value of the instrumented function
+    VPUSH {d0}
+    @ Load the handler address
+    MOVW r1, #:lower16:_ZN6__xray19XRayPatchedFunctionE - (. + 16)
+    MOVT r1, #:upper16:_ZN6__xray19XRayPatchedFunctionE - (. + 12)
+    LDR r2, [pc, r1]
+    @ Handler address is nullptr if handler is not set
+    CMP r2, #0
+    BEQ FunctionExit_restore
+    @ Function ID is already in r0 (the first parameter).
+    @ r1=1 means that we are tracing an exit event
+    MOV r1, #1
+    @ Call the handler with 2 parameters in r0 and r1
+    BLX r2
+FunctionExit_restore:
+    @ Restore the floating-point return value of the instrumented function
+    VPOP {d0}
+    POP {r1-r3,pc}
+
+    @ Word-aligned function entry point
+    .p2align 2
+    @ Let C/C++ see the symbol
+    .global __xray_FunctionTailExit
+    .hidden __xray_FunctionTailExit
+    @ It preserves all registers except r0, r12(ip), r14(lr) and r15(pc)
+    @ Assume that "q" part of the floating-point registers is not used
+    @   for passing parameters to C/C++ functions.
+    .type __xray_FunctionTailExit, %function
+    @ In C++ it is extern "C" void __xray_FunctionTailExit(uint32_t FuncId)
+    @   with FuncId passed in the r0 register.
+__xray_FunctionTailExit:
+    PUSH {r1-r3,lr}
+    @ Save floating-point parameters of the instrumented function
+    VPUSH {d0-d7}
+    MOVW r1, #:lower16:_ZN6__xray19XRayPatchedFunctionE - (. + 16)
+    MOVT r1, #:upper16:_ZN6__xray19XRayPatchedFunctionE - (. + 12)
+    LDR r2, [pc, r1]
+    @ Handler address is nullptr if handler is not set
+    CMP r2, #0
+    BEQ FunctionTailExit_restore
+    @ Function ID is already in r0 (the first parameter).
+    @ r1=2 would mean that we are tracing a tail exit event, but until the
+    @   logging side of XRay can represent tail exits distinctly, we pretend
+    @   that a normal function exit happens here, so we give the handler code 1
+    MOV r1, #1
+    @ Call the handler with 2 parameters in r0 and r1
+    BLX r2
+FunctionTailExit_restore:
+    @ Restore floating-point parameters of the instrumented function
+    VPOP {d0-d7}
+    POP {r1-r3,pc}
+
+NO_EXEC_STACK_DIRECTIVE
diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_hexagon.S b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_hexagon.S
new file mode 100644
index 000000000000..c87ec4bed1f9
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_hexagon.S
@@ -0,0 +1,99 @@
+//===-- xray_trampoline_hexagon.s -------------------------------*- ASM -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This implements the hexagon-specific assembler for the trampolines.
+//
+//===----------------------------------------------------------------------===//
+
+#include "../builtins/assembly.h"
+#include "../sanitizer_common/sanitizer_asm.h"
+
+.macro SAVE_REGISTERS
+memw(sp+#0)=r0
+memw(sp+#4)=r1
+memw(sp+#8)=r2
+memw(sp+#12)=r3
+memw(sp+#16)=r4
+.endm
+.macro RESTORE_REGISTERS
+r0=memw(sp+#0)
+r1=memw(sp+#4)
+r2=memw(sp+#8)
+r3=memw(sp+#12)
+r4=memw(sp+#16)
+.endm
+
+.macro CALL_PATCHED_FUNC entry_type
+	// if (__xray::XRayPatchedFunction != NULL)
+	//     __xray::XRayPatchedFunction(FuncId, \entry_type);
+
+	r8 = #ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)
+
+	// The patched sled puts the function ID
+	// into r6.  Move it into r0 to pass it to
+	// the handler.
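+	// The braces below bundle the four operations into one VLIW packet;
+	// the callr is predicated on p0, so the handler is only called when
+	// r8 is non-null.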
+	{ r0 = r6 +          r1 = \entry_type +          p0 = !cmp.eq(r8, #0) +	  if (p0) callr r8 } +.endm + +	.text +	.globl ASM_SYMBOL(__xray_FunctionEntry) +	ASM_HIDDEN(__xray_FunctionEntry) +	ASM_TYPE_FUNCTION(__xray_FunctionEntry) +# LLVM-MCA-BEGIN __xray_FunctionEntry +ASM_SYMBOL(__xray_FunctionEntry): +	CFI_STARTPROC +	SAVE_REGISTERS + +	CALL_PATCHED_FUNC #0  // XRayEntryType::ENTRY +.Ltmp0: +	RESTORE_REGISTERS +	// return +# LLVM-MCA-END +	ASM_SIZE(__xray_FunctionEntry) +	CFI_ENDPROC + + +	.globl ASM_SYMBOL(__xray_FunctionExit) +	ASM_HIDDEN(__xray_FunctionExit) +	ASM_TYPE_FUNCTION(__xray_FunctionExit) +# LLVM-MCA-BEGIN __xray_FunctionExit +ASM_SYMBOL(__xray_FunctionExit): +	CFI_STARTPROC +	SAVE_REGISTERS + +	CALL_PATCHED_FUNC #1  // XRayEntryType::EXIT +.Ltmp1: +	RESTORE_REGISTERS +	// return +	jumpr r31 +# LLVM-MCA-END +	ASM_SIZE(__xray_FunctionExit) +	CFI_ENDPROC + + +	.globl ASM_SYMBOL(__xray_FunctionTailExit) +	ASM_HIDDEN(__xray_FunctionTailExit) +	ASM_TYPE_FUNCTION(__xray_FunctionTailExit) +# LLVM-MCA-BEGIN __xray_FunctionTailExit +ASM_SYMBOL(__xray_FunctionTailExit): +	CFI_STARTPROC +	SAVE_REGISTERS + +	CALL_PATCHED_FUNC #2  // XRayEntryType::TAIL +.Ltmp2: +	RESTORE_REGISTERS +	// return +	jumpr r31 +# LLVM-MCA-END +	ASM_SIZE(__xray_FunctionTailExit) +	CFI_ENDPROC diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_loongarch64.S b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_loongarch64.S new file mode 100644 index 000000000000..fcbefcc5f7a2 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_loongarch64.S @@ -0,0 +1,124 @@ +//===-- xray_trampoline_loongarch64.s ---------------------------*- ASM -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This implements the loongarch-specific assembler for the trampolines. +// +//===----------------------------------------------------------------------===// + +#include "../sanitizer_common/sanitizer_asm.h" + +#define FROM_0_TO_7 0,1,2,3,4,5,6,7 +#define FROM_7_TO_0 7,6,5,4,3,2,1,0 + +.macro SAVE_ARG_REGISTERS +  .irp i,FROM_7_TO_0 +    st.d  $a\i, $sp, (8 * 8 + 8 * \i) +  .endr +  .irp i,FROM_7_TO_0 +    fst.d $f\i, $sp, (8 * \i) +  .endr +.endm + +.macro RESTORE_ARG_REGISTERS +  .irp i,FROM_0_TO_7 +    fld.d $f\i, $sp, (8 * \i) +  .endr +  .irp i,FROM_0_TO_7 +    ld.d  $a\i, $sp, (8 * 8 + 8 * \i) +  .endr +.endm + +.macro SAVE_RET_REGISTERS +  st.d    $a1, $sp, 24 +  st.d    $a0, $sp, 16 +  fst.d   $f1, $sp, 8 +  fst.d   $f0, $sp, 0 +.endm + +.macro RESTORE_RET_REGISTERS +  fld.d   $f0, $sp, 0 +  fld.d   $f1, $sp, 8 +  ld.d    $a0, $sp, 16 +  ld.d    $a1, $sp, 24 +.endm + +  .text +  .file "xray_trampoline_loongarch64.S" +  .globl ASM_SYMBOL(__xray_FunctionEntry) +  ASM_HIDDEN(__xray_FunctionEntry) +  .p2align 2 +  ASM_TYPE_FUNCTION(__xray_FunctionEntry) +ASM_SYMBOL(__xray_FunctionEntry): +  .cfi_startproc +  // Save argument registers before doing any actual work. 
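+  // Frame layout: $f0-$f7 are spilled to sp+0..56, $a0-$a7 to sp+64..120, and
+  // $ra to sp+128, which accounts for the 136 bytes reserved below.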
+  .cfi_def_cfa_offset 136 +  addi.d  $sp, $sp, -136 +  st.d    $ra, $sp, 128 +  .cfi_offset 1, -8 +  SAVE_ARG_REGISTERS + +  la.got  $t2, ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE) +  ld.d    $t2, $t2, 0 + +  beqz    $t2, FunctionEntry_restore + +  // a1=0 means that we are tracing an entry event. +  move    $a1, $zero +  // Function ID is in t1 (the first parameter). +  move    $a0, $t1 +  jirl    $ra, $t2, 0 + +FunctionEntry_restore: +  // Restore argument registers. +  RESTORE_ARG_REGISTERS +  ld.d    $ra, $sp, 128 +  addi.d  $sp, $sp, 136 +  ret +FunctionEntry_end: +  ASM_SIZE(__xray_FunctionEntry) +  .cfi_endproc + +  .text +  .globl ASM_SYMBOL(__xray_FunctionExit) +  ASM_HIDDEN(__xray_FunctionExit) +  .p2align 2 +  ASM_TYPE_FUNCTION(__xray_FunctionExit) +ASM_SYMBOL(__xray_FunctionExit): +  .cfi_startproc +  // Save return registers before doing any actual work. +  .cfi_def_cfa_offset 48 +  addi.d  $sp, $sp, -48 +  st.d    $ra, $sp, 40 +  .cfi_offset 1, -8 +  st.d    $fp, $sp, 32 +  SAVE_RET_REGISTERS + +  la.got  $t2, ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE) +  ld.d    $t2, $t2, 0 + +  beqz    $t2, FunctionExit_restore + +  // a1=1 means that we are tracing an exit event. +  li.w    $a1, 1 +  // Function ID is in t1 (the first parameter). +  move    $a0, $t1 +  jirl    $ra, $t2, 0 + +FunctionExit_restore: +  // Restore return registers. +  RESTORE_RET_REGISTERS +  ld.d    $fp, $sp, 32 +  ld.d    $ra, $sp, 40 +  addi.d  $sp, $sp, 48 +  ret + +FunctionExit_end: +  ASM_SIZE(__xray_FunctionExit) +  .cfi_endproc diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_mips.S b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_mips.S new file mode 100644 index 000000000000..499c350d2a24 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_mips.S @@ -0,0 +1,109 @@ +//===-- xray_trampoline_mips.s ----------------------------------*- ASM -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This implements the MIPS-specific assembler for the trampolines. +// +//===----------------------------------------------------------------------===// + +  .text +  .file "xray_trampoline_mips.S" +  .globl __xray_FunctionEntry +  .p2align 2 +  .type __xray_FunctionEntry,@function +__xray_FunctionEntry: +  .cfi_startproc +  .set noreorder +  .cpload $t9 +  .set reorder +  // Save argument registers before doing any actual work +  .cfi_def_cfa_offset 36 +  addiu  $sp, $sp, -36 +  sw     $ra, 32($sp) +  .cfi_offset 31, -4 +  sw     $a3, 28($sp) +  sw     $a2, 24($sp) +  sw     $a1, 20($sp) +  sw     $a0, 16($sp) +  sdc1	 $f14, 8($sp) +  sdc1	 $f12, 0($sp) + +  la     $t9, _ZN6__xray19XRayPatchedFunctionE +  lw     $t9, 0($t9) + +  beqz   $t9, FunctionEntry_restore + +  // a1=0 means that we are tracing an entry event +  move   $a1, $zero +  // Function ID is in t0 (the first parameter). 
+  move   $a0, $t0 +  jalr   $t9 + +FunctionEntry_restore: +  // Restore argument registers +  ldc1   $f12, 0($sp) +  ldc1   $f14, 8($sp) +  lw     $a0, 16($sp) +  lw     $a1, 20($sp) +  lw     $a2, 24($sp) +  lw     $a3, 28($sp) +  lw     $ra, 32($sp) +  addiu	 $sp, $sp, 36 +  jr     $ra +FunctionEntry_end: +  .size __xray_FunctionEntry, FunctionEntry_end-__xray_FunctionEntry +  .cfi_endproc + +  .text +  .globl __xray_FunctionExit +  .p2align 2 +  .type __xray_FunctionExit,@function +__xray_FunctionExit: +  .cfi_startproc +  .set noreorder +  .cpload $t9 +  .set reorder +  // Save return registers before doing any actual work. +  .cfi_def_cfa_offset 36 +  addiu  $sp, $sp, -36 +  sw     $ra, 32($sp) +  .cfi_offset 31, -4 +  sw     $a1, 28($sp) +  sw     $a0, 24($sp) +  sw     $v1, 20($sp) +  sw     $v0, 16($sp) +  sdc1   $f2, 8($sp) +  sdc1   $f0, 0($sp) + +  la     $t9, _ZN6__xray19XRayPatchedFunctionE +  lw     $t9, 0($t9) + +  beqz	 $t9, FunctionExit_restore + +  // a1=1 means that we are tracing an exit event +  li     $a1, 1 +  // Function ID is in t0 (the first parameter). +  move   $a0, $t0 +  jalr   $t9 + +FunctionExit_restore: +  // Restore return registers +  ldc1   $f0, 0($sp) +  ldc1   $f2, 8($sp) +  lw     $v0, 16($sp) +  lw     $v1, 20($sp) +  lw     $a0, 24($sp) +  lw     $a1, 28($sp) +  lw     $ra, 32($sp) +  addiu  $sp, $sp, 36 +  jr     $ra + +FunctionExit_end: +  .size __xray_FunctionExit, FunctionExit_end-__xray_FunctionExit +  .cfi_endproc diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_mips64.S b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_mips64.S new file mode 100644 index 000000000000..d65bec1fc687 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_mips64.S @@ -0,0 +1,135 @@ +//===-- xray_trampoline_mips64.s --------------------------------*- ASM -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This implements the MIPS64-specific assembler for the trampolines. +// +//===----------------------------------------------------------------------===// + +  .text +  .file "xray_trampoline_mips64.S" +  .globl __xray_FunctionEntry +  .p2align 2 +  .type __xray_FunctionEntry,@function +__xray_FunctionEntry: +  .cfi_startproc +  // Save argument registers before doing any actual work. +  .cfi_def_cfa_offset 144 +  daddiu  $sp, $sp, -144 +  sd      $ra, 136($sp) +  .cfi_offset 31, -8 +  sd      $gp, 128($sp) +  sd      $a7, 120($sp) +  sd      $a6, 112($sp) +  sd      $a5, 104($sp) +  sd      $a4, 96($sp) +  sd      $a3, 88($sp) +  sd      $a2, 80($sp) +  sd      $a1, 72($sp) +  sd      $a0, 64($sp) +  sdc1    $f19, 56($sp) +  sdc1    $f18, 48($sp) +  sdc1    $f17, 40($sp) +  sdc1    $f16, 32($sp) +  sdc1    $f15, 24($sp) +  sdc1    $f14, 16($sp) +  sdc1    $f13, 8($sp) +  sdc1    $f12, 0($sp) + +  lui     $gp, %hi(%neg(%gp_rel(__xray_FunctionEntry))) +  daddu   $gp, $gp, $t9 +  daddiu  $gp ,$gp, %lo(%neg(%gp_rel(__xray_FunctionEntry))) + +  dla     $t9, _ZN6__xray19XRayPatchedFunctionE +  ld      $t9, 0($t9) + +  beqz    $t9, FunctionEntry_restore + +  // a1=0 means that we are tracing an entry event +  move    $a1, $zero +  // Function ID is in t0 (the first parameter). 
+  move    $a0, $t0 +  jalr    $t9 + +FunctionEntry_restore: +  // Restore argument registers +  ldc1    $f12, 0($sp) +  ldc1    $f13, 8($sp) +  ldc1    $f14, 16($sp) +  ldc1    $f15, 24($sp) +  ldc1    $f16, 32($sp) +  ldc1    $f17, 40($sp) +  ldc1    $f18, 48($sp) +  ldc1    $f19, 56($sp) +  ld      $a0, 64($sp) +  ld      $a1, 72($sp) +  ld      $a2, 80($sp) +  ld      $a3, 88($sp) +  ld      $a4, 96($sp) +  ld      $a5, 104($sp) +  ld      $a6, 112($sp) +  ld      $a7, 120($sp) +  ld      $gp, 128($sp) +  ld      $ra, 136($sp) +  daddiu  $sp, $sp, 144 +  jr      $ra +FunctionEntry_end: +  .size __xray_FunctionEntry, FunctionEntry_end-__xray_FunctionEntry +  .cfi_endproc + +  .text +  .globl __xray_FunctionExit +  .p2align 2 +  .type __xray_FunctionExit,@function +__xray_FunctionExit: +  .cfi_startproc +  // Save return registers before doing any actual work. +  .cfi_def_cfa_offset 64 +  daddiu  $sp, $sp, -64 +  sd      $ra, 56($sp) +  .cfi_offset 31, -8 +  sd      $gp, 48($sp) +  sd      $a0, 40($sp) +  sd      $v1, 32($sp) +  sd      $v0, 24($sp) +  sdc1    $f2, 16($sp) +  sdc1    $f1, 8($sp) +  sdc1    $f0, 0($sp) + +  lui     $gp, %hi(%neg(%gp_rel(__xray_FunctionExit))) +  daddu   $gp, $gp, $t9 +  daddiu  $gp ,$gp, %lo(%neg(%gp_rel(__xray_FunctionExit))) + +  dla     $t9, _ZN6__xray19XRayPatchedFunctionE +  ld      $t9, 0($t9) + +  beqz    $t9, FunctionExit_restore + +  // a1=1 means that we are tracing an exit event +  li      $a1, 1 +  // Function ID is in t0 (the first parameter). +  move    $a0, $t0 +  jalr    $t9 + +FunctionExit_restore: +  // Restore return registers +  ldc1    $f0, 0($sp) +  ldc1    $f1, 8($sp) +  ldc1    $f2, 16($sp) +  ld      $v0, 24($sp) +  ld      $v1, 32($sp) +  ld      $a0, 40($sp) +  ld      $gp, 48($sp) +  ld      $ra, 56($sp) +  daddiu  $sp, $sp, 64 +  jr      $ra + +FunctionExit_end: +  .size __xray_FunctionExit, FunctionExit_end-__xray_FunctionExit +  .cfi_endproc diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_powerpc64.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_powerpc64.cpp new file mode 100644 index 000000000000..878c46930fee --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_powerpc64.cpp @@ -0,0 +1,15 @@ +#include <atomic> +#include <xray/xray_interface.h> + +namespace __xray { + +extern std::atomic<void (*)(int32_t, XRayEntryType)> XRayPatchedFunction; + +// Implement this in C++ instead of assembly, to avoid dealing with ToC by hand. +void CallXRayPatchedFunction(int32_t FuncId, XRayEntryType Type) { +  auto fptr = __xray::XRayPatchedFunction.load(); +  if (fptr != nullptr) +    (*fptr)(FuncId, Type); +} + +} // namespace __xray diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_powerpc64_asm.S b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_powerpc64_asm.S new file mode 100644 index 000000000000..250e2e5be67a --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_powerpc64_asm.S @@ -0,0 +1,235 @@ +	.text +	.abiversion 2 +	.globl	__xray_FunctionEntry +	.p2align	4 +__xray_FunctionEntry: +	std 0, 16(1) +	stdu 1, -408(1) +# Spill r3-r10, f1-f13, and vsr34-vsr45, which are parameter registers. +# If this appears to be slow, the caller needs to pass in number of generic, +# floating point, and vector parameters, so that we only spill those live ones. 
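+# Frame layout (408 bytes): r3-r10 at offsets 32..88, f1-f13 at 96..192,
+# vs34-vs45 at 200..376, the TOC pointer at 392, and the saved LR at 400,
+# overwriting the slot from which the function ID is read into r3 below.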
+	std 3, 32(1) +	ld 3, 400(1) # FuncId +	std 4, 40(1) +	std 5, 48(1) +	std 6, 56(1) +	std 7, 64(1) +	std 8, 72(1) +	std 9, 80(1) +	std 10, 88(1) +	addi 4, 1, 96 +	stxsdx 1, 0, 4 +	addi 4, 1, 104 +	stxsdx 2, 0, 4 +	addi 4, 1, 112 +	stxsdx 3, 0, 4 +	addi 4, 1, 120 +	stxsdx 4, 0, 4 +	addi 4, 1, 128 +	stxsdx 5, 0, 4 +	addi 4, 1, 136 +	stxsdx 6, 0, 4 +	addi 4, 1, 144 +	stxsdx 7, 0, 4 +	addi 4, 1, 152 +	stxsdx 8, 0, 4 +	addi 4, 1, 160 +	stxsdx 9, 0, 4 +	addi 4, 1, 168 +	stxsdx 10, 0, 4 +	addi 4, 1, 176 +	stxsdx 11, 0, 4 +	addi 4, 1, 184 +	stxsdx 12, 0, 4 +	addi 4, 1, 192 +	stxsdx 13, 0, 4 +	addi 4, 1, 200 +	stxvd2x 34, 0, 4 +	addi 4, 1, 216 +	stxvd2x 35, 0, 4 +	addi 4, 1, 232 +	stxvd2x 36, 0, 4 +	addi 4, 1, 248 +	stxvd2x 37, 0, 4 +	addi 4, 1, 264 +	stxvd2x 38, 0, 4 +	addi 4, 1, 280 +	stxvd2x 39, 0, 4 +	addi 4, 1, 296 +	stxvd2x 40, 0, 4 +	addi 4, 1, 312 +	stxvd2x 41, 0, 4 +	addi 4, 1, 328 +	stxvd2x 42, 0, 4 +	addi 4, 1, 344 +	stxvd2x 43, 0, 4 +	addi 4, 1, 360 +	stxvd2x 44, 0, 4 +	addi 4, 1, 376 +	stxvd2x 45, 0, 4 +	std 2, 392(1) +	mflr 0 +	std 0, 400(1) + +	li 4, 0 +	bl _ZN6__xray23CallXRayPatchedFunctionEi13XRayEntryType +	nop + +	addi 4, 1, 96 +	lxsdx 1, 0, 4 +	addi 4, 1, 104 +	lxsdx 2, 0, 4 +	addi 4, 1, 112 +	lxsdx 3, 0, 4 +	addi 4, 1, 120 +	lxsdx 4, 0, 4 +	addi 4, 1, 128 +	lxsdx 5, 0, 4 +	addi 4, 1, 136 +	lxsdx 6, 0, 4 +	addi 4, 1, 144 +	lxsdx 7, 0, 4 +	addi 4, 1, 152 +	lxsdx 8, 0, 4 +	addi 4, 1, 160 +	lxsdx 9, 0, 4 +	addi 4, 1, 168 +	lxsdx 10, 0, 4 +	addi 4, 1, 176 +	lxsdx 11, 0, 4 +	addi 4, 1, 184 +	lxsdx 12, 0, 4 +	addi 4, 1, 192 +	lxsdx 13, 0, 4 +	addi 4, 1, 200 +	lxvd2x 34, 0, 4 +	addi 4, 1, 216 +	lxvd2x 35, 0, 4 +	addi 4, 1, 232 +	lxvd2x 36, 0, 4 +	addi 4, 1, 248 +	lxvd2x 37, 0, 4 +	addi 4, 1, 264 +	lxvd2x 38, 0, 4 +	addi 4, 1, 280 +	lxvd2x 39, 0, 4 +	addi 4, 1, 296 +	lxvd2x 40, 0, 4 +	addi 4, 1, 312 +	lxvd2x 41, 0, 4 +	addi 4, 1, 328 +	lxvd2x 42, 0, 4 +	addi 4, 1, 344 +	lxvd2x 43, 0, 4 +	addi 4, 1, 360 +	lxvd2x 44, 0, 4 +	addi 4, 1, 376 +	lxvd2x 45, 0, 4 +	ld 0, 400(1) +	mtlr 0 +	ld 2, 392(1) +	ld 3, 32(1) +	ld 4, 40(1) +	ld 5, 48(1) +	ld 6, 56(1) +	ld 7, 64(1) +	ld 8, 72(1) +	ld 9, 80(1) +	ld 10, 88(1) + +	addi 1, 1, 408 +	ld 0, 16(1) +	blr + +	.globl	__xray_FunctionExit +	.p2align	4 +__xray_FunctionExit: +	std 0, 16(1) +	stdu 1, -256(1) +# Spill r3-r4, f1-f8, and vsr34-vsr41, which are return registers. +# If this appears to be slow, the caller needs to pass in number of generic, +# floating point, and vector parameters, so that we only spill those live ones. 
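+# Frame layout (256 bytes): r3-r4 at offsets 32..40, f1-f8 at 48..104,
+# vs34-vs41 at 112..224, the TOC pointer at 240, and the saved LR at 248,
+# again overwriting the slot from which the function ID is read below.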
+	std 3, 32(1) +	ld 3, 248(1) # FuncId +	std 4, 40(1) +	addi 4, 1, 48 +	stxsdx 1, 0, 4 +	addi 4, 1, 56 +	stxsdx 2, 0, 4 +	addi 4, 1, 64 +	stxsdx 3, 0, 4 +	addi 4, 1, 72 +	stxsdx 4, 0, 4 +	addi 4, 1, 80 +	stxsdx 5, 0, 4 +	addi 4, 1, 88 +	stxsdx 6, 0, 4 +	addi 4, 1, 96 +	stxsdx 7, 0, 4 +	addi 4, 1, 104 +	stxsdx 8, 0, 4 +	addi 4, 1, 112 +	stxvd2x 34, 0, 4 +	addi 4, 1, 128 +	stxvd2x 35, 0, 4 +	addi 4, 1, 144 +	stxvd2x 36, 0, 4 +	addi 4, 1, 160 +	stxvd2x 37, 0, 4 +	addi 4, 1, 176 +	stxvd2x 38, 0, 4 +	addi 4, 1, 192 +	stxvd2x 39, 0, 4 +	addi 4, 1, 208 +	stxvd2x 40, 0, 4 +	addi 4, 1, 224 +	stxvd2x 41, 0, 4 +	std 2, 240(1) +	mflr 0 +	std 0, 248(1) + +	li 4, 1 +	bl _ZN6__xray23CallXRayPatchedFunctionEi13XRayEntryType +	nop + +	addi 4, 1, 48 +	lxsdx 1, 0, 4 +	addi 4, 1, 56 +	lxsdx 2, 0, 4 +	addi 4, 1, 64 +	lxsdx 3, 0, 4 +	addi 4, 1, 72 +	lxsdx 4, 0, 4 +	addi 4, 1, 80 +	lxsdx 5, 0, 4 +	addi 4, 1, 88 +	lxsdx 6, 0, 4 +	addi 4, 1, 96 +	lxsdx 7, 0, 4 +	addi 4, 1, 104 +	lxsdx 8, 0, 4 +	addi 4, 1, 112 +	lxvd2x 34, 0, 4 +	addi 4, 1, 128 +	lxvd2x 35, 0, 4 +	addi 4, 1, 144 +	lxvd2x 36, 0, 4 +	addi 4, 1, 160 +	lxvd2x 37, 0, 4 +	addi 4, 1, 176 +	lxvd2x 38, 0, 4 +	addi 4, 1, 192 +	lxvd2x 39, 0, 4 +	addi 4, 1, 208 +	lxvd2x 40, 0, 4 +	addi 4, 1, 224 +	lxvd2x 41, 0, 4 +	ld 0, 248(1) +	mtlr 0 +	ld 2, 240(1) +	ld 3, 32(1) +	ld 4, 40(1) + +	addi 1, 1, 256 +	ld 0, 16(1) +	blr diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_x86_64.S b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_x86_64.S new file mode 100644 index 000000000000..01098f60eeab --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_trampoline_x86_64.S @@ -0,0 +1,311 @@ +//===-- xray_trampoline_x86.s -----------------------------------*- ASM -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// This implements the X86-specific assembler for the trampolines. +// +//===----------------------------------------------------------------------===// + +#include "../builtins/assembly.h" +#include "../sanitizer_common/sanitizer_asm.h" + +// XRay trampolines which are not produced by intrinsics are not System V AMD64 +// ABI compliant because they are called with a stack that is always misaligned +// by 8 bytes with respect to a 16 bytes alignment. This is because they are +// called immediately after the call to, or immediately before returning from, +// the function being instrumented. This saves space in the patch point, but +// misaligns the stack by 8 bytes. + +.macro ALIGN_STACK_16B +#if defined(__APPLE__) +	subq	$$8, %rsp +#else +	subq	$8, %rsp +#endif +	CFI_ADJUST_CFA_OFFSET(8) +.endm + +.macro RESTORE_STACK_ALIGNMENT +#if defined(__APPLE__) +	addq	$$8, %rsp +#else +	addq	$8, %rsp +#endif +	CFI_ADJUST_CFA_OFFSET(-8) +.endm + +// This macro should lower the stack pointer by an odd multiple of 8. 
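+// pushfq (8 bytes) plus the 240-byte spill area below comes to 248, an odd
+// multiple of 8: the trampolines are entered with rsp on a 16-byte boundary
+// (8 bytes off from a normal function entry), so after these saves rsp is
+// again 8 bytes off, and the ALIGN_STACK_16B adjustment defined above restores
+// the 16-byte alignment the ABI requires at the handler call sites.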
+.macro SAVE_REGISTERS +	pushfq +	CFI_ADJUST_CFA_OFFSET(8) +	subq $240, %rsp +	CFI_ADJUST_CFA_OFFSET(240) +	movq %rbp, 232(%rsp) +	movupd	%xmm0, 216(%rsp) +	movupd	%xmm1, 200(%rsp) +	movupd	%xmm2, 184(%rsp) +	movupd	%xmm3, 168(%rsp) +	movupd	%xmm4, 152(%rsp) +	movupd	%xmm5, 136(%rsp) +	movupd	%xmm6, 120(%rsp) +	movupd	%xmm7, 104(%rsp) +	movq	%rdi, 96(%rsp) +	movq	%rax, 88(%rsp) +	movq	%rdx, 80(%rsp) +	movq	%rsi, 72(%rsp) +	movq	%rcx, 64(%rsp) +	movq	%r8, 56(%rsp) +	movq	%r9, 48(%rsp) +	movq  %r10, 40(%rsp) +	movq  %r11, 32(%rsp) +	movq  %r12, 24(%rsp) +	movq  %r13, 16(%rsp) +	movq  %r14, 8(%rsp) +	movq  %r15, 0(%rsp) +.endm + +.macro RESTORE_REGISTERS +	movq  232(%rsp), %rbp +	movupd	216(%rsp), %xmm0 +	movupd	200(%rsp), %xmm1 +	movupd	184(%rsp), %xmm2 +	movupd	168(%rsp), %xmm3 +	movupd	152(%rsp), %xmm4 +	movupd	136(%rsp), %xmm5 +	movupd	120(%rsp) , %xmm6 +	movupd	104(%rsp) , %xmm7 +	movq	96(%rsp), %rdi +	movq	88(%rsp), %rax +	movq	80(%rsp), %rdx +	movq	72(%rsp), %rsi +	movq	64(%rsp), %rcx +	movq	56(%rsp), %r8 +	movq	48(%rsp), %r9 +	movq  40(%rsp), %r10 +	movq  32(%rsp), %r11 +	movq  24(%rsp), %r12 +	movq  16(%rsp), %r13 +	movq  8(%rsp), %r14 +	movq  0(%rsp), %r15 +	addq	$240, %rsp +	CFI_ADJUST_CFA_OFFSET(-240) +	popfq +	CFI_ADJUST_CFA_OFFSET(-8) +.endm + +	.text +#if !defined(__APPLE__) +	.section .text +	.file "xray_trampoline_x86.S" +#else +	.section __TEXT,__text +#endif + +//===----------------------------------------------------------------------===// + +	.globl ASM_SYMBOL(__xray_FunctionEntry) +	ASM_HIDDEN(__xray_FunctionEntry) +	.align 16, 0x90 +	ASM_TYPE_FUNCTION(__xray_FunctionEntry) +# LLVM-MCA-BEGIN __xray_FunctionEntry +ASM_SYMBOL(__xray_FunctionEntry): +	CFI_STARTPROC +	SAVE_REGISTERS +	ALIGN_STACK_16B + +	// This load has to be atomic, it's concurrent with __xray_patch(). +	// On x86/amd64, a simple (type-aligned) MOV instruction is enough. +	movq	ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax +	testq	%rax, %rax +	je	LOCAL_LABEL(tmp0) + +	// The patched function prologue puts its xray_instr_map index into %r10d. +	movl	%r10d, %edi +	xor	%esi,%esi +	callq	*%rax + +LOCAL_LABEL(tmp0): +	RESTORE_STACK_ALIGNMENT +	RESTORE_REGISTERS +	retq +# LLVM-MCA-END +	ASM_SIZE(__xray_FunctionEntry) +	CFI_ENDPROC + +//===----------------------------------------------------------------------===// + +	.globl ASM_SYMBOL(__xray_FunctionExit) +	ASM_HIDDEN(__xray_FunctionExit) +	.align 16, 0x90 +	ASM_TYPE_FUNCTION(__xray_FunctionExit) +# LLVM-MCA-BEGIN __xray_FunctionExit +ASM_SYMBOL(__xray_FunctionExit): +	CFI_STARTPROC +	ALIGN_STACK_16B + +	// Save the important registers first. Since we're assuming that this +	// function is only jumped into, we only preserve the registers for +	// returning. +	subq	$64, %rsp +	CFI_ADJUST_CFA_OFFSET(64) +	movq  %rbp, 48(%rsp) +	movupd	%xmm0, 32(%rsp) +	movupd	%xmm1, 16(%rsp) +	movq	%rax, 8(%rsp) +	movq	%rdx, 0(%rsp) +	movq	ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax +	testq %rax,%rax +	je	LOCAL_LABEL(tmp2) + +	movl	%r10d, %edi +	movl	$1, %esi +	callq	*%rax + +LOCAL_LABEL(tmp2): +	// Restore the important registers. 
+	movq  48(%rsp), %rbp +	movupd	32(%rsp), %xmm0 +	movupd	16(%rsp), %xmm1 +	movq	8(%rsp), %rax +	movq	0(%rsp), %rdx +	addq	$64, %rsp +	CFI_ADJUST_CFA_OFFSET(-64) + +	RESTORE_STACK_ALIGNMENT +	retq +# LLVM-MCA-END +	ASM_SIZE(__xray_FunctionExit) +	CFI_ENDPROC + +//===----------------------------------------------------------------------===// + +	.globl ASM_SYMBOL(__xray_FunctionTailExit) +	ASM_HIDDEN(__xray_FunctionTailExit) +	.align 16, 0x90 +	ASM_TYPE_FUNCTION(__xray_FunctionTailExit) +# LLVM-MCA-BEGIN __xray_FunctionTailExit +ASM_SYMBOL(__xray_FunctionTailExit): +	CFI_STARTPROC +	SAVE_REGISTERS +	ALIGN_STACK_16B + +	movq	ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax +	testq %rax,%rax +	je	LOCAL_LABEL(tmp4) + +	movl	%r10d, %edi +	movl	$2, %esi +	callq	*%rax + +LOCAL_LABEL(tmp4): +	RESTORE_STACK_ALIGNMENT +	RESTORE_REGISTERS +	retq +# LLVM-MCA-END +	ASM_SIZE(__xray_FunctionTailExit) +	CFI_ENDPROC + +//===----------------------------------------------------------------------===// + +	.globl ASM_SYMBOL(__xray_ArgLoggerEntry) +	ASM_HIDDEN(__xray_ArgLoggerEntry) +	.align 16, 0x90 +	ASM_TYPE_FUNCTION(__xray_ArgLoggerEntry) +# LLVM-MCA-BEGIN __xray_ArgLoggerEntry +ASM_SYMBOL(__xray_ArgLoggerEntry): +	CFI_STARTPROC +	SAVE_REGISTERS +	ALIGN_STACK_16B + +	// Again, these function pointer loads must be atomic; MOV is fine. +	movq	ASM_SYMBOL(_ZN6__xray13XRayArgLoggerE)(%rip), %rax +	testq	%rax, %rax +	jne	LOCAL_LABEL(arg1entryLog) + +	// If [arg1 logging handler] not set, defer to no-arg logging. +	movq	ASM_SYMBOL(_ZN6__xray19XRayPatchedFunctionE)(%rip), %rax +	testq	%rax, %rax +	je	LOCAL_LABEL(arg1entryFail) + +LOCAL_LABEL(arg1entryLog): + +	// First argument will become the third +	movq	%rdi, %rdx + +	// XRayEntryType::LOG_ARGS_ENTRY into the second +	mov	$0x3, %esi + +	// 32-bit function ID becomes the first +	movl	%r10d, %edi + +	callq	*%rax + +LOCAL_LABEL(arg1entryFail): +	RESTORE_STACK_ALIGNMENT +	RESTORE_REGISTERS +	retq +# LLVM-MCA-END +	ASM_SIZE(__xray_ArgLoggerEntry) +	CFI_ENDPROC + +//===----------------------------------------------------------------------===// + +// __xray_*Event have default visibility so that they can be referenced by user +// DSOs that do not link against the runtime. +	.global ASM_SYMBOL(__xray_CustomEvent) +	.align 16, 0x90 +	ASM_TYPE_FUNCTION(__xray_CustomEvent) +# LLVM-MCA-BEGIN __xray_CustomEvent +ASM_SYMBOL(__xray_CustomEvent): +	CFI_STARTPROC +	SAVE_REGISTERS + +	// We take two arguments to this trampoline, which should be in rdi	and rsi +	// already. +	movq ASM_SYMBOL(_ZN6__xray22XRayPatchedCustomEventE)(%rip), %rax +	testq %rax,%rax +	je LOCAL_LABEL(customEventCleanup) + +	callq	*%rax + +LOCAL_LABEL(customEventCleanup): +	RESTORE_REGISTERS +	retq +# LLVM-MCA-END +	ASM_SIZE(__xray_CustomEvent) +	CFI_ENDPROC + +//===----------------------------------------------------------------------===// + +	.global ASM_SYMBOL(__xray_TypedEvent) +	.align 16, 0x90 +	ASM_TYPE_FUNCTION(__xray_TypedEvent) +# LLVM-MCA-BEGIN __xray_TypedEvent +ASM_SYMBOL(__xray_TypedEvent): +	CFI_STARTPROC +	SAVE_REGISTERS + +	// We pass three arguments to this trampoline, which should be in rdi, rsi +	// and rdx without our intervention. 
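+	// (rdi carries the event type, rsi the payload pointer, and rdx the
+	// payload size; the handler is invoked with that same triple.)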
+	movq ASM_SYMBOL(_ZN6__xray21XRayPatchedTypedEventE)(%rip), %rax +	testq %rax,%rax +	je LOCAL_LABEL(typedEventCleanup) + +	callq	*%rax + +LOCAL_LABEL(typedEventCleanup): +	RESTORE_REGISTERS +	retq +# LLVM-MCA-END +	ASM_SIZE(__xray_TypedEvent) +	CFI_ENDPROC + +//===----------------------------------------------------------------------===// + +NO_EXEC_STACK_DIRECTIVE diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_tsc.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_tsc.h new file mode 100644 index 000000000000..e1cafe1bf11d --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_tsc.h @@ -0,0 +1,91 @@ +//===-- xray_tsc.h ----------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_EMULATE_TSC_H +#define XRAY_EMULATE_TSC_H + +#include "sanitizer_common/sanitizer_common.h" + +namespace __xray { +static constexpr uint64_t NanosecondsPerSecond = 1000ULL * 1000 * 1000; +} + +#if SANITIZER_FUCHSIA +#include <zircon/syscalls.h> + +namespace __xray { + +inline bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } + +ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT { +  CPU = 0; +  return _zx_ticks_get(); +} + +inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { +  return _zx_ticks_per_second(); +} + +} // namespace __xray + +#else // SANITIZER_FUCHSIA + +#if defined(__x86_64__) +#include "xray_x86_64.inc" +#elif defined(__powerpc64__) +#include "xray_powerpc64.inc" +#elif defined(__arm__) || defined(__aarch64__) || defined(__mips__) ||         \ +    defined(__hexagon__) || defined(__loongarch_lp64) +// Emulated TSC. +// There is no instruction like RDTSCP in user mode on ARM. ARM's CP15 does +//   not have a constant frequency like TSC on x86(_64), it may go faster +//   or slower depending on CPU turbo or power saving mode. Furthermore, +//   to read from CP15 on ARM a kernel modification or a driver is needed. +//   We can not require this from users of compiler-rt. +// So on ARM we use clock_gettime() which gives the result in nanoseconds. +//   To get the measurements per second, we scale this by the number of +//   nanoseconds per second, pretending that the TSC frequency is 1GHz and +//   one TSC tick is 1 nanosecond. +#include "sanitizer_common/sanitizer_common.h" +#include "sanitizer_common/sanitizer_internal_defs.h" +#include "xray_defs.h" +#include <cerrno> +#include <cstdint> +#include <time.h> + +namespace __xray { + +inline bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT { return true; } + +ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT { +  timespec TS; +  int result = clock_gettime(CLOCK_REALTIME, &TS); +  if (result != 0) { +    Report("clock_gettime(2) returned %d, errno=%d.", result, int(errno)); +    TS.tv_sec = 0; +    TS.tv_nsec = 0; +  } +  CPU = 0; +  return TS.tv_sec * NanosecondsPerSecond + TS.tv_nsec; +} + +inline uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { +  return NanosecondsPerSecond; +} + +} // namespace __xray + +#else +#error Target architecture is not supported. 
+#endif // CPU architecture +#endif // SANITIZER_FUCHSIA + +#endif // XRAY_EMULATE_TSC_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_utils.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_utils.cpp new file mode 100644 index 000000000000..5d51df9937c2 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_utils.cpp @@ -0,0 +1,200 @@ +//===-- xray_utils.cpp ------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +//===----------------------------------------------------------------------===// +#include "xray_utils.h" + +#include "sanitizer_common/sanitizer_allocator_internal.h" +#include "sanitizer_common/sanitizer_common.h" +#include "xray_allocator.h" +#include "xray_defs.h" +#include "xray_flags.h" +#include <cstdio> +#include <errno.h> +#include <fcntl.h> +#include <iterator> +#include <new> +#include <stdlib.h> +#include <sys/types.h> +#include <tuple> +#include <unistd.h> +#include <utility> + +#if SANITIZER_FUCHSIA +#include "sanitizer_common/sanitizer_symbolizer_markup_constants.h" + +#include <inttypes.h> +#include <zircon/process.h> +#include <zircon/sanitizer.h> +#include <zircon/status.h> +#include <zircon/syscalls.h> +#endif + +namespace __xray { + +#if SANITIZER_FUCHSIA +constexpr const char* ProfileSinkName = "llvm-xray"; + +LogWriter::~LogWriter() { +  _zx_handle_close(Vmo); +} + +void LogWriter::WriteAll(const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT { +  if (Begin == End) +    return; +  auto TotalBytes = std::distance(Begin, End); + +  const size_t PageSize = flags()->xray_page_size_override > 0 +                              ? flags()->xray_page_size_override +                              : GetPageSizeCached(); +  if (RoundUpTo(Offset, PageSize) != RoundUpTo(Offset + TotalBytes, PageSize)) { +    // Resize the VMO to ensure there's sufficient space for the data. +    zx_status_t Status = _zx_vmo_set_size(Vmo, Offset + TotalBytes); +    if (Status != ZX_OK) { +      Report("Failed to resize VMO: %s\n", _zx_status_get_string(Status)); +      return; +    } +  } + +  // Write the data into VMO. +  zx_status_t Status = _zx_vmo_write(Vmo, Begin, Offset, TotalBytes); +  if (Status != ZX_OK) { +    Report("Failed to write: %s\n", _zx_status_get_string(Status)); +    return; +  } +  Offset += TotalBytes; + +  // Record the data size as a property of the VMO. +  _zx_object_set_property(Vmo, ZX_PROP_VMO_CONTENT_SIZE, +                          &Offset, sizeof(Offset)); +} + +void LogWriter::Flush() XRAY_NEVER_INSTRUMENT { +  // Nothing to do here since WriteAll writes directly into the VMO. +} + +LogWriter *LogWriter::Open() XRAY_NEVER_INSTRUMENT { +  // Create VMO to hold the profile data. +  zx_handle_t Vmo; +  zx_status_t Status = _zx_vmo_create(0, ZX_VMO_RESIZABLE, &Vmo); +  if (Status != ZX_OK) { +    Report("XRay: cannot create VMO: %s\n", _zx_status_get_string(Status)); +    return nullptr; +  } + +  // Get the KOID of the current process to use in the VMO name. 
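+  // (ZX_INFO_HANDLE_BASIC on the process self-handle fills the
+  // zx_info_handle_basic_t below; its koid uniquely identifies this process
+  // kernel-wide.)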
+  zx_info_handle_basic_t Info; +  Status = _zx_object_get_info(_zx_process_self(), ZX_INFO_HANDLE_BASIC, &Info, +                               sizeof(Info), NULL, NULL); +  if (Status != ZX_OK) { +    Report("XRay: cannot get basic info about current process handle: %s\n", +           _zx_status_get_string(Status)); +    return nullptr; +  } + +  // Give the VMO a name including our process KOID so it's easy to spot. +  char VmoName[ZX_MAX_NAME_LEN]; +  internal_snprintf(VmoName, sizeof(VmoName), "%s.%zu", ProfileSinkName, +                    Info.koid); +  _zx_object_set_property(Vmo, ZX_PROP_NAME, VmoName, strlen(VmoName)); + +  // Duplicate the handle since __sanitizer_publish_data consumes it and +  // LogWriter needs to hold onto it. +  zx_handle_t Handle; +  Status =_zx_handle_duplicate(Vmo, ZX_RIGHT_SAME_RIGHTS, &Handle); +  if (Status != ZX_OK) { +    Report("XRay: cannot duplicate VMO handle: %s\n", +           _zx_status_get_string(Status)); +    return nullptr; +  } + +  // Publish the VMO that receives the logging. Note the VMO's contents can +  // grow and change after publication. The contents won't be read out until +  // after the process exits. +  __sanitizer_publish_data(ProfileSinkName, Handle); + +  // Use the dumpfile symbolizer markup element to write the name of the VMO. +  Report("XRay: " FORMAT_DUMPFILE "\n", ProfileSinkName, VmoName); + +  LogWriter *LW = reinterpret_cast<LogWriter *>(InternalAlloc(sizeof(LogWriter))); +  new (LW) LogWriter(Vmo); +  return LW; +} + +void LogWriter::Close(LogWriter *LW) { +  LW->~LogWriter(); +  InternalFree(LW); +} +#else // SANITIZER_FUCHSIA +LogWriter::~LogWriter() { +  internal_close(Fd); +} + +void LogWriter::WriteAll(const char *Begin, const char *End) XRAY_NEVER_INSTRUMENT { +  if (Begin == End) +    return; +  auto TotalBytes = std::distance(Begin, End); +  while (auto Written = write(Fd, Begin, TotalBytes)) { +    if (Written < 0) { +      if (errno == EINTR) +        continue; // Try again. +      Report("Failed to write; errno = %d\n", errno); +      return; +    } +    TotalBytes -= Written; +    if (TotalBytes == 0) +      break; +    Begin += Written; +  } +} + +void LogWriter::Flush() XRAY_NEVER_INSTRUMENT { +  fsync(Fd); +} + +LogWriter *LogWriter::Open() XRAY_NEVER_INSTRUMENT { +  // Open a temporary file once for the log. +  char TmpFilename[256] = {}; +  char TmpWildcardPattern[] = "XXXXXX"; +  auto **Argv = GetArgv(); +  const char *Progname = !Argv ? 
"(unknown)" : Argv[0]; +  const char *LastSlash = internal_strrchr(Progname, '/'); + +  if (LastSlash != nullptr) +    Progname = LastSlash + 1; + +  int NeededLength = internal_snprintf( +      TmpFilename, sizeof(TmpFilename), "%s%s.%s", +      flags()->xray_logfile_base, Progname, TmpWildcardPattern); +  if (NeededLength > int(sizeof(TmpFilename))) { +    Report("XRay log file name too long (%d): %s\n", NeededLength, TmpFilename); +    return nullptr; +  } +  int Fd = mkstemp(TmpFilename); +  if (Fd == -1) { +    Report("XRay: Failed opening temporary file '%s'; not logging events.\n", +           TmpFilename); +    return nullptr; +  } +  if (Verbosity()) +    Report("XRay: Log file in '%s'\n", TmpFilename); + +  LogWriter *LW = allocate<LogWriter>(); +  new (LW) LogWriter(Fd); +  return LW; +} + +void LogWriter::Close(LogWriter *LW) { +  LW->~LogWriter(); +  deallocate(LW); +} +#endif // SANITIZER_FUCHSIA + +} // namespace __xray diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_utils.h b/contrib/llvm-project/compiler-rt/lib/xray/xray_utils.h new file mode 100644 index 000000000000..5dc73d7fa8cd --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_utils.h @@ -0,0 +1,85 @@ +//===-- xray_utils.h --------------------------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file is a part of XRay, a dynamic runtime instrumentation system. +// +// Some shared utilities for the XRay runtime implementation. +// +//===----------------------------------------------------------------------===// +#ifndef XRAY_UTILS_H +#define XRAY_UTILS_H + +#include <cstddef> +#include <cstdint> +#include <sys/types.h> +#include <utility> + +#include "sanitizer_common/sanitizer_common.h" +#if SANITIZER_FUCHSIA +#include <zircon/types.h> +#endif + +namespace __xray { + +class LogWriter { +public: +#if SANITIZER_FUCHSIA + LogWriter(zx_handle_t Vmo) : Vmo(Vmo) {} +#else +  explicit LogWriter(int Fd) : Fd(Fd) {} +#endif + ~LogWriter(); + + // Write a character range into a log. + void WriteAll(const char *Begin, const char *End); + + void Flush(); + + // Returns a new log instance initialized using the flag-provided values. + static LogWriter *Open(); + // Closes and deallocates the log instance. + static void Close(LogWriter *LogWriter); + +private: +#if SANITIZER_FUCHSIA + zx_handle_t Vmo = ZX_HANDLE_INVALID; + uint64_t Offset = 0; +#else + int Fd = -1; +#endif +}; + +constexpr size_t gcd(size_t a, size_t b) { +  return (b == 0) ? a : gcd(b, a % b); +} + +constexpr size_t lcm(size_t a, size_t b) { return a * b / gcd(a, b); } + +constexpr size_t nearest_boundary(size_t number, size_t multiple) { +  return multiple * ((number / multiple) + ((number % multiple) ? 1 : 0)); +} + +constexpr size_t next_pow2_helper(size_t num, size_t acc) { +  return (1u << acc) >= num ? (1u << acc) : next_pow2_helper(num, acc + 1); +} + +constexpr size_t next_pow2(size_t number) { +  return next_pow2_helper(number, 1); +} + +template <class T> constexpr T &max(T &A, T &B) { return A > B ? A : B; } + +template <class T> constexpr T &min(T &A, T &B) { return A <= B ? 
A : B; } + +constexpr ptrdiff_t diff(uintptr_t A, uintptr_t B) { +  return max(A, B) - min(A, B); +} + +} // namespace __xray + +#endif // XRAY_UTILS_H diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_x86_64.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_x86_64.cpp new file mode 100644 index 000000000000..b9666a40861d --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_x86_64.cpp @@ -0,0 +1,334 @@ +#include "cpuid.h" +#include "sanitizer_common/sanitizer_common.h" +#if !SANITIZER_FUCHSIA +#include "sanitizer_common/sanitizer_posix.h" +#endif +#include "xray_defs.h" +#include "xray_interface_internal.h" + +#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_APPLE +#include <sys/types.h> +#include <sys/sysctl.h> +#elif SANITIZER_FUCHSIA +#include <zircon/syscalls.h> +#endif + +#include <atomic> +#include <cstdint> +#include <errno.h> +#include <fcntl.h> +#include <iterator> +#include <limits> +#include <tuple> +#include <unistd.h> + +namespace __xray { + +#if SANITIZER_LINUX +static std::pair<ssize_t, bool> +retryingReadSome(int Fd, char *Begin, char *End) XRAY_NEVER_INSTRUMENT { +  auto BytesToRead = std::distance(Begin, End); +  ssize_t BytesRead; +  ssize_t TotalBytesRead = 0; +  while (BytesToRead && (BytesRead = read(Fd, Begin, BytesToRead))) { +    if (BytesRead == -1) { +      if (errno == EINTR) +        continue; +      Report("Read error; errno = %d\n", errno); +      return std::make_pair(TotalBytesRead, false); +    } + +    TotalBytesRead += BytesRead; +    BytesToRead -= BytesRead; +    Begin += BytesRead; +  } +  return std::make_pair(TotalBytesRead, true); +} + +static bool readValueFromFile(const char *Filename, +                              long long *Value) XRAY_NEVER_INSTRUMENT { +  int Fd = open(Filename, O_RDONLY | O_CLOEXEC); +  if (Fd == -1) +    return false; +  static constexpr size_t BufSize = 256; +  char Line[BufSize] = {}; +  ssize_t BytesRead; +  bool Success; +  std::tie(BytesRead, Success) = retryingReadSome(Fd, Line, Line + BufSize); +  close(Fd); +  if (!Success) +    return false; +  const char *End = nullptr; +  long long Tmp = internal_simple_strtoll(Line, &End, 10); +  bool Result = false; +  if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) { +    *Value = Tmp; +    Result = true; +  } +  return Result; +} + +uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT { +  long long TSCFrequency = -1; +  if (readValueFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz", +                        &TSCFrequency)) { +    TSCFrequency *= 1000; +  } else if (readValueFromFile( +                 "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq", +                 &TSCFrequency)) { +    TSCFrequency *= 1000; +  } else { +    Report("Unable to determine CPU frequency for TSC accounting.\n"); +  } +  return TSCFrequency == -1 ? 
diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_x86_64.cpp b/contrib/llvm-project/compiler-rt/lib/xray/xray_x86_64.cpp
new file mode 100644
index 000000000000..b9666a40861d
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_x86_64.cpp
@@ -0,0 +1,334 @@
+#include "cpuid.h"
+#include "sanitizer_common/sanitizer_common.h"
+#if !SANITIZER_FUCHSIA
+#include "sanitizer_common/sanitizer_posix.h"
+#endif
+#include "xray_defs.h"
+#include "xray_interface_internal.h"
+
+#if SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_APPLE
+#include <sys/types.h>
+#include <sys/sysctl.h>
+#elif SANITIZER_FUCHSIA
+#include <zircon/syscalls.h>
+#endif
+
+#include <atomic>
+#include <cstdint>
+#include <errno.h>
+#include <fcntl.h>
+#include <iterator>
+#include <limits>
+#include <tuple>
+#include <unistd.h>
+
+namespace __xray {
+
+#if SANITIZER_LINUX
+static std::pair<ssize_t, bool>
+retryingReadSome(int Fd, char *Begin, char *End) XRAY_NEVER_INSTRUMENT {
+  auto BytesToRead = std::distance(Begin, End);
+  ssize_t BytesRead;
+  ssize_t TotalBytesRead = 0;
+  while (BytesToRead && (BytesRead = read(Fd, Begin, BytesToRead))) {
+    if (BytesRead == -1) {
+      if (errno == EINTR)
+        continue;
+      Report("Read error; errno = %d\n", errno);
+      return std::make_pair(TotalBytesRead, false);
+    }
+
+    TotalBytesRead += BytesRead;
+    BytesToRead -= BytesRead;
+    Begin += BytesRead;
+  }
+  return std::make_pair(TotalBytesRead, true);
+}
+
+static bool readValueFromFile(const char *Filename,
+                              long long *Value) XRAY_NEVER_INSTRUMENT {
+  int Fd = open(Filename, O_RDONLY | O_CLOEXEC);
+  if (Fd == -1)
+    return false;
+  static constexpr size_t BufSize = 256;
+  char Line[BufSize] = {};
+  ssize_t BytesRead;
+  bool Success;
+  std::tie(BytesRead, Success) = retryingReadSome(Fd, Line, Line + BufSize);
+  close(Fd);
+  if (!Success)
+    return false;
+  const char *End = nullptr;
+  long long Tmp = internal_simple_strtoll(Line, &End, 10);
+  bool Result = false;
+  if (Line[0] != '\0' && (*End == '\n' || *End == '\0')) {
+    *Value = Tmp;
+    Result = true;
+  }
+  return Result;
+}
+
+uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
+  long long TSCFrequency = -1;
+  if (readValueFromFile("/sys/devices/system/cpu/cpu0/tsc_freq_khz",
+                        &TSCFrequency)) {
+    TSCFrequency *= 1000;
+  } else if (readValueFromFile(
+                 "/sys/devices/system/cpu/cpu0/cpufreq/cpuinfo_max_freq",
+                 &TSCFrequency)) {
+    TSCFrequency *= 1000;
+  } else {
+    Report("Unable to determine CPU frequency for TSC accounting.\n");
+  }
+  return TSCFrequency == -1 ? 0 : static_cast<uint64_t>(TSCFrequency);
+}
+#elif SANITIZER_FREEBSD || SANITIZER_NETBSD || SANITIZER_APPLE
+uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
+  long long TSCFrequency = -1;
+  size_t tscfreqsz = sizeof(TSCFrequency);
+#if SANITIZER_APPLE
+  if (internal_sysctlbyname("machdep.tsc.frequency", &TSCFrequency,
+                            &tscfreqsz, NULL, 0) != -1) {
+#else
+  if (internal_sysctlbyname("machdep.tsc_freq", &TSCFrequency, &tscfreqsz,
+                            NULL, 0) != -1) {
+#endif
+    return static_cast<uint64_t>(TSCFrequency);
+  } else {
+    Report("Unable to determine CPU frequency for TSC accounting.\n");
+  }
+  return 0;
+}
+#elif !SANITIZER_FUCHSIA
+uint64_t getTSCFrequency() XRAY_NEVER_INSTRUMENT {
+  /* Not supported */
+  return 0;
+}
+#endif
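On every path getTSCFrequency reports the TSC rate in ticks per second (the Linux sysfs values are in kHz, hence the multiplication by 1000) and returns 0 when the rate cannot be determined. A caller turning raw tick deltas into wall-clock time could do so as sketched below; ticksToNs is a hypothetical helper, not part of the runtime:

    #include <cstdint>

    // Convert a TSC tick delta to nanoseconds. A zero frequency is the
    // "unknown" sentinel from getTSCFrequency, so report 0 in that case.
    inline uint64_t ticksToNs(uint64_t TickDelta, uint64_t TicksPerSec) {
      if (TicksPerSec == 0)
        return 0;
      // Split into whole seconds and a remainder so the multiplication by
      // 1e9 cannot overflow for realistic frequencies.
      uint64_t Seconds = TickDelta / TicksPerSec;
      uint64_t Rest = TickDelta % TicksPerSec;
      return Seconds * 1000000000ull + Rest * 1000000000ull / TicksPerSec;
    }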
+
+static constexpr uint8_t CallOpCode = 0xe8;
+static constexpr uint16_t MovR10Seq = 0xba41;
+static constexpr uint16_t Jmp9Seq = 0x09eb;
+static constexpr uint16_t Jmp20Seq = 0x14eb;
+static constexpr uint16_t Jmp15Seq = 0x0feb;
+static constexpr uint8_t JmpOpCode = 0xe9;
+static constexpr uint8_t RetOpCode = 0xc3;
+static constexpr uint16_t NopwSeq = 0x9066;
+
+static constexpr int64_t MinOffset{std::numeric_limits<int32_t>::min()};
+static constexpr int64_t MaxOffset{std::numeric_limits<int32_t>::max()};
+
+bool patchFunctionEntry(const bool Enable, const uint32_t FuncId,
+                        const XRaySledEntry &Sled,
+                        void (*Trampoline)()) XRAY_NEVER_INSTRUMENT {
+  // Here we do the dance of replacing the following sled:
+  //
+  // xray_sled_n:
+  //   jmp +9
+  //   <9 byte nop>
+  //
+  // With the following:
+  //
+  //   mov r10d, <function id>
+  //   call <relative 32-bit offset to entry trampoline>
+  //
+  // We need to do this in the following order:
+  //
+  // 1. Put the function id first, 2 bytes from the start of the sled (just
+  // after the 2-byte jmp instruction).
+  // 2. Put the call opcode 6 bytes from the start of the sled.
+  // 3. Put the relative offset 7 bytes from the start of the sled.
+  // 4. Do an atomic write over the jmp instruction for the "mov r10d"
+  // opcode and first operand.
+  //
+  // The prerequisite is to compute the relative offset to the trampoline's
+  // address.
+  const uint64_t Address = Sled.address();
+  int64_t TrampolineOffset = reinterpret_cast<int64_t>(Trampoline) -
+                             (static_cast<int64_t>(Address) + 11);
+  if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
+    Report("XRay Entry trampoline (%p) too far from sled (%p)\n",
+           reinterpret_cast<void *>(Trampoline),
+           reinterpret_cast<void *>(Address));
+    return false;
+  }
+  if (Enable) {
+    *reinterpret_cast<uint32_t *>(Address + 2) = FuncId;
+    *reinterpret_cast<uint8_t *>(Address + 6) = CallOpCode;
+    *reinterpret_cast<uint32_t *>(Address + 7) = TrampolineOffset;
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Address), MovR10Seq,
+        std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Address), Jmp9Seq,
+        std::memory_order_release);
+    // FIXME: Write out the nops still?
+  }
+  return true;
+}
+
+bool patchFunctionExit(const bool Enable, const uint32_t FuncId,
+                       const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // Here we do the dance of replacing the following sled:
+  //
+  // xray_sled_n:
+  //   ret
+  //   <10 byte nop>
+  //
+  // With the following:
+  //
+  //   mov r10d, <function id>
+  //   jmp <relative 32-bit offset to exit trampoline>
+  //
+  // 1. Put the function id first, 2 bytes from the start of the sled (past
+  // the 1-byte ret instruction).
+  // 2. Put the jmp opcode 6 bytes from the start of the sled.
+  // 3. Put the relative offset 7 bytes from the start of the sled.
+  // 4. Do an atomic write over the ret instruction for the "mov r10d"
+  // opcode and first operand.
+  //
+  // The prerequisite is to compute the relative offset to the
+  // __xray_FunctionExit function's address.
+  const uint64_t Address = Sled.address();
+  int64_t TrampolineOffset = reinterpret_cast<int64_t>(__xray_FunctionExit) -
+                             (static_cast<int64_t>(Address) + 11);
+  if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
+    Report("XRay Exit trampoline (%p) too far from sled (%p)\n",
+           reinterpret_cast<void *>(__xray_FunctionExit),
+           reinterpret_cast<void *>(Address));
+    return false;
+  }
+  if (Enable) {
+    *reinterpret_cast<uint32_t *>(Address + 2) = FuncId;
+    *reinterpret_cast<uint8_t *>(Address + 6) = JmpOpCode;
+    *reinterpret_cast<uint32_t *>(Address + 7) = TrampolineOffset;
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Address), MovR10Seq,
+        std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint8_t> *>(Address), RetOpCode,
+        std::memory_order_release);
+    // FIXME: Write out the nops still?
+  }
+  return true;
+}
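The patched entry sequence occupies exactly 11 bytes: two for the mov r10d opcode bytes (41 ba, stored via the 16-bit MovR10Seq), four for the function id, one for the call opcode (e8), and four for its rel32 operand. x86 counts a rel32 displacement from the end of the call instruction, which is why the offset is computed against Address + 11 and must fit in a signed 32-bit range. A hypothetical standalone check mirroring the guard above:

    #include <cstdint>
    #include <limits>

    // Would a call at the end of the 11-byte patched sled reach Target?
    bool fitsInRel32(uint64_t SledAddress, uint64_t Target) {
      int64_t Disp =
          static_cast<int64_t>(Target) - (static_cast<int64_t>(SledAddress) + 11);
      return Disp >= std::numeric_limits<int32_t>::min() &&
             Disp <= std::numeric_limits<int32_t>::max();
    }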
+
+bool patchFunctionTailExit(const bool Enable, const uint32_t FuncId,
+                           const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // Here we do the dance of replacing the tail call sled with a similar
+  // sequence as the entry sled, but calling the tail exit trampoline instead.
+  const uint64_t Address = Sled.address();
+  int64_t TrampolineOffset =
+      reinterpret_cast<int64_t>(__xray_FunctionTailExit) -
+      (static_cast<int64_t>(Address) + 11);
+  if (TrampolineOffset < MinOffset || TrampolineOffset > MaxOffset) {
+    Report("XRay Tail Exit trampoline (%p) too far from sled (%p)\n",
+           reinterpret_cast<void *>(__xray_FunctionTailExit),
+           reinterpret_cast<void *>(Address));
+    return false;
+  }
+  if (Enable) {
+    *reinterpret_cast<uint32_t *>(Address + 2) = FuncId;
+    *reinterpret_cast<uint8_t *>(Address + 6) = CallOpCode;
+    *reinterpret_cast<uint32_t *>(Address + 7) = TrampolineOffset;
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Address), MovR10Seq,
+        std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Address), Jmp9Seq,
+        std::memory_order_release);
+    // FIXME: Write out the nops still?
+  }
+  return true;
+}
+
+bool patchCustomEvent(const bool Enable, const uint32_t FuncId,
+                      const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // Here we do the dance of replacing the following sled:
+  //
+  // xray_sled_n:
+  //   jmp +15          // 2 bytes
+  //   ...
+  //
+  // With the following:
+  //
+  //   nopw             // 2 bytes
+  //   ...
+  //
+  // The "unpatch" should just turn the 'nopw' back to a 'jmp +15'.
+  const uint64_t Address = Sled.address();
+  if (Enable) {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Address), NopwSeq,
+        std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Address), Jmp15Seq,
+        std::memory_order_release);
+  }
+  return false;
+}
+
+bool patchTypedEvent(const bool Enable, const uint32_t FuncId,
+                     const XRaySledEntry &Sled) XRAY_NEVER_INSTRUMENT {
+  // Here we do the dance of replacing the following sled:
+  //
+  // xray_sled_n:
+  //   jmp +20          // 2-byte instruction
+  //   ...
+  //
+  // With the following:
+  //
+  //   nopw             // 2 bytes
+  //   ...
+  //
+  // The "unpatch" should just turn the 'nopw' back to a 'jmp +20'.
+  // The 20-byte sled stashes three argument registers, calls the trampoline,
+  // unstashes the registers and returns. If the arguments are already in
+  // the correct registers, the stashing and unstashing become equivalently
+  // sized nops.
+  const uint64_t Address = Sled.address();
+  if (Enable) {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Address), NopwSeq,
+        std::memory_order_release);
+  } else {
+    std::atomic_store_explicit(
+        reinterpret_cast<std::atomic<uint16_t> *>(Address), Jmp20Seq,
+        std::memory_order_release);
+  }
+  return false;
+}
+
+#if !SANITIZER_FUCHSIA
+// We determine whether the CPU we're running on has the correct features we
+// need. On x86_64 this means rdtscp support.
+bool probeRequiredCPUFeatures() XRAY_NEVER_INSTRUMENT {
+  unsigned int EAX, EBX, ECX, EDX;
+
+  // We check whether rdtscp support is enabled. According to the x86_64
+  // manual, the level should be set at 0x80000001, and we should check bit 27
+  // in EDX. That's 0x8000000 (or 1u << 27).
+  __asm__ __volatile__("cpuid" : "=a"(EAX), "=b"(EBX), "=c"(ECX), "=d"(EDX)
+                       : "0"(0x80000001));
+  if (!(EDX & (1u << 27))) {
+    Report("Missing rdtscp support.\n");
+    return false;
+  }
+  // Also check whether we can determine the CPU frequency, since if we
+  // cannot, we should use the emulated TSC instead.
+  if (!getTSCFrequency()) {
+    Report("Unable to determine CPU frequency.\n");
+    return false;
+  }
+  return true;
+}
+#endif
+
+} // namespace __xray
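probeRequiredCPUFeatures drives the CPUID instruction by hand through inline assembly. Both GCC and Clang also ship a <cpuid.h> wrapper, __get_cpuid, which performs the same query and additionally verifies that the requested extended leaf exists; a sketch of the equivalent probe (hasRdtscp is illustrative, not part of the runtime):

    #include <cpuid.h>

    bool hasRdtscp() {
      unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
      // __get_cpuid returns 0 if leaf 0x80000001 is not supported at all.
      if (!__get_cpuid(0x80000001, &EAX, &EBX, &ECX, &EDX))
        return false;
      return (EDX & (1u << 27)) != 0; // EDX bit 27: RDTSCP
    }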
diff --git a/contrib/llvm-project/compiler-rt/lib/xray/xray_x86_64.inc b/contrib/llvm-project/compiler-rt/lib/xray/xray_x86_64.inc
new file mode 100644
index 000000000000..dc71fb87f63d
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/xray/xray_x86_64.inc
@@ -0,0 +1,32 @@
+//===-- xray_x86_64.inc -----------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+//===----------------------------------------------------------------------===//
+
+#include <cstdint>
+
+#include "sanitizer_common/sanitizer_internal_defs.h"
+#include "xray_defs.h"
+
+namespace __xray {
+
+ALWAYS_INLINE uint64_t readTSC(uint8_t &CPU) XRAY_NEVER_INSTRUMENT {
+  unsigned LongCPU;
+  unsigned long Rax, Rdx;
+  __asm__ __volatile__("rdtscp\n" : "=a"(Rax), "=d"(Rdx), "=c"(LongCPU) ::);
+  CPU = LongCPU;
+  return (Rdx << 32) + Rax;
+}
+
+uint64_t getTSCFrequency();
+
+bool probeRequiredCPUFeatures();
+
+} // namespace __xray
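readTSC issues rdtscp, which loads the 64-bit timestamp into EDX:EAX and the current value of IA32_TSC_AUX (which the OS initializes to identify the executing processor) into ECX; the shift-and-add reassembles the two 32-bit halves. A minimal usage sketch follows; read_tsc restates the same sequence so the example compiles on its own, and note that tick deltas are only directly meaningful when both reads land on the same CPU:

    #include <cstdint>
    #include <cstdio>

    static inline uint64_t read_tsc(uint8_t &CPU) {
      unsigned LongCPU;
      unsigned long Rax, Rdx;
      __asm__ __volatile__("rdtscp\n" : "=a"(Rax), "=d"(Rdx), "=c"(LongCPU) ::);
      CPU = LongCPU;
      return (Rdx << 32) + Rax;
    }

    int main() {
      uint8_t StartCPU = 0, EndCPU = 0;
      uint64_t Start = read_tsc(StartCPU);
      // ... region being timed ...
      uint64_t End = read_tsc(EndCPU);
      if (StartCPU == EndCPU)
        std::printf("elapsed: %llu ticks on CPU %u\n",
                    static_cast<unsigned long long>(End - Start), StartCPU);
      return 0;
    }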
