path: root/lib/xray/xray_profile_collector.cpp
Diffstat (limited to 'lib/xray/xray_profile_collector.cpp')
-rw-r--r--  lib/xray/xray_profile_collector.cpp  414
1 file changed, 414 insertions(+), 0 deletions(-)
diff --git a/lib/xray/xray_profile_collector.cpp b/lib/xray/xray_profile_collector.cpp
new file mode 100644
index 000000000000..bef2504f2a16
--- /dev/null
+++ b/lib/xray/xray_profile_collector.cpp
@@ -0,0 +1,414 @@
+//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This implements the interface for the profileCollectorService.
+//
+//===----------------------------------------------------------------------===//
+#include "xray_profile_collector.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray_allocator.h"
+#include "xray_defs.h"
+#include "xray_profiling_flags.h"
+#include "xray_segmented_array.h"
+#include <memory>
+#include <pthread.h>
+#include <utility>
+
+namespace __xray {
+namespace profileCollectorService {
+
+namespace {
+
+SpinMutex GlobalMutex;
+struct ThreadTrie {
+ tid_t TId;
+ typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
+};
+
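+// A contiguous, heap-allocated region holding one thread's serialized block (a
+// BlockHeader followed by that thread's profile records).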
+struct ProfileBuffer {
+ void *Data;
+ size_t Size;
+};
+
+// Current version of the profile format.
+constexpr u64 XRayProfilingVersion = 0x20180424;
+
+// Identifier for XRay profiling files 'xrayprof' in hex.
+constexpr u64 XRayMagicBytes = 0x7872617970726f66;
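+// Read byte-for-byte, 0x7872617970726f66 spells out the ASCII string
+// "xrayprof": 0x78 'x', 0x72 'r', 0x61 'a', 0x79 'y', 0x70 'p', 0x72 'r',
+// 0x6f 'o', 0x66 'f'.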
+
+struct XRayProfilingFileHeader {
+ const u64 MagicBytes = XRayMagicBytes;
+ const u64 Version = XRayProfilingVersion;
+ u64 Timestamp = 0; // System time in nanoseconds.
+ u64 PID = 0; // Process ID.
+};
+
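+// Prefixed to every per-thread block produced by serialize(). BlockSize counts
+// the header itself plus the serialized records that follow, and BlockNum is
+// the zero-based position of the block among the serialized buffers.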
+struct BlockHeader {
+ u32 BlockSize;
+ u32 BlockNum;
+ u64 ThreadId;
+};
+
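+// Holds everything post() moves in from a profiling thread: the trie itself,
+// the allocators it uses, and the buffers (from the thread's BufferQueue) that
+// back those allocators.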
+struct ThreadData {
+ BufferQueue *BQ;
+ FunctionCallTrie::Allocators::Buffers Buffers;
+ FunctionCallTrie::Allocators Allocators;
+ FunctionCallTrie FCT;
+ tid_t TId;
+};
+
+using ThreadDataArray = Array<ThreadData>;
+using ThreadDataAllocator = ThreadDataArray::AllocatorType;
+
+// We use a separate buffer queue as the backing store for the allocator used
+// by the ThreadData array. This lets us take ownership of the buffers,
+// allocators, and tries associated with a thread by moving the data into the
+// array instead of copying it into a separately backed set of tries.
+static typename std::aligned_storage<
+ sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
+static BufferQueue *BQ = nullptr;
+static BufferQueue::Buffer Buffer;
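+// `Buffer` is the single buffer obtained from `BQ` in reset(); its memory backs
+// the ThreadDataAllocator that manages the ThreadData array.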
+static typename std::aligned_storage<sizeof(ThreadDataAllocator),
+ alignof(ThreadDataAllocator)>::type
+ ThreadDataAllocatorStorage;
+static typename std::aligned_storage<sizeof(ThreadDataArray),
+ alignof(ThreadDataArray)>::type
+ ThreadDataArrayStorage;
+
+static ThreadDataAllocator *TDAllocator = nullptr;
+static ThreadDataArray *TDArray = nullptr;
+
+using ProfileBufferArray = Array<ProfileBuffer>;
+using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
+
+// These need to be global aligned storage to avoid dynamic initialization. The
+// storage must be suitably aligned so that we can placement-new objects into
+// it and have pointers to those objects be appropriately aligned.
+static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
+ ProfileBuffersStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
+ ProfileBufferArrayAllocatorStorage;
+
+static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
+static ProfileBufferArray *ProfileBuffers = nullptr;
+
+// Use a global flag to determine whether the collector implementation has been
+// initialized.
+static atomic_uint8_t CollectorInitialized{0};
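+// It is set (with release ordering) only at the very end of reset(), and read
+// with acquire ordering in post() and serialize(), so a reader that observes it
+// as set also observes fully constructed globals.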
+
+} // namespace
+
+void post(BufferQueue *Q, FunctionCallTrie &&T,
+ FunctionCallTrie::Allocators &&A,
+ FunctionCallTrie::Allocators::Buffers &&B,
+ tid_t TId) XRAY_NEVER_INSTRUMENT {
+ DCHECK_NE(Q, nullptr);
+
+ // Bail out early if the collector has not been initialized.
+ if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
+ T.~FunctionCallTrie();
+ A.~Allocators();
+ Q->releaseBuffer(B.NodeBuffer);
+ Q->releaseBuffer(B.RootsBuffer);
+ Q->releaseBuffer(B.ShadowStackBuffer);
+ Q->releaseBuffer(B.NodeIdPairBuffer);
+ B.~Buffers();
+ return;
+ }
+
+ {
+ SpinMutexLock Lock(&GlobalMutex);
+ DCHECK_NE(TDAllocator, nullptr);
+ DCHECK_NE(TDArray, nullptr);
+
+ if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
+ TId) == nullptr) {
+ // If we fail to add the data to the array, we should destroy the objects
+ // handed to us.
+ T.~FunctionCallTrie();
+ A.~Allocators();
+ Q->releaseBuffer(B.NodeBuffer);
+ Q->releaseBuffer(B.RootsBuffer);
+ Q->releaseBuffer(B.ShadowStackBuffer);
+ Q->releaseBuffer(B.NodeIdPairBuffer);
+ B.~Buffers();
+ }
+ }
+}
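+
+// A minimal sketch of a caller, assuming a thread-local state object named TLD
+// and the sanitizer_common GetTid() helper (the names here are illustrative,
+// not the actual profiling-mode integration):
+//
+//   profileCollectorService::post(TLD.BQ, std::move(*TLD.FCT),
+//                                 std::move(*TLD.Allocators),
+//                                 std::move(*TLD.Buffers), GetTid());
+//
+// After this call the moved-from trie, allocators, and buffers must not be
+// touched by the calling thread.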
+
+// A PathArray represents the function ids that make up a stack trace. In this
+// context a path is almost always represented from the leaf function in a call
+// stack to a root of the call trie.
+using PathArray = Array<int32_t>;
+
+struct ProfileRecord {
+ using PathAllocator = typename PathArray::AllocatorType;
+
+ // The Path in this record is the function ids from the leaf to the root of
+ // the function call stack, as represented in a FunctionCallTrie.
+ PathArray Path;
+ const FunctionCallTrie::Node *Node;
+};
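+
+// For illustration: given a call chain main() -> f() -> g(), the record built
+// for g's node has Path = {FId(g), FId(f), FId(main)}, i.e. leaf first and root
+// last, which populateRecords() below produces by walking parent pointers.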
+
+namespace {
+
+using ProfileRecordArray = Array<ProfileRecord>;
+
+// Performs a depth-first traversal from each root of the FunctionCallTrie to
+// generate the path(s) and the data associated with each path.
+static void
+populateRecords(ProfileRecordArray &PRs, ProfileRecord::PathAllocator &PA,
+ const FunctionCallTrie &Trie) XRAY_NEVER_INSTRUMENT {
+ using StackArray = Array<const FunctionCallTrie::Node *>;
+ using StackAllocator = typename StackArray::AllocatorType;
+ StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
+ StackArray DFSStack(StackAlloc);
+ for (const auto *R : Trie.getRoots()) {
+ DFSStack.Append(R);
+ while (!DFSStack.empty()) {
+ auto *Node = DFSStack.back();
+ DFSStack.trim(1);
+ if (Node == nullptr)
+ continue;
+ auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
+ if (Record == nullptr)
+ return;
+ DCHECK_NE(Record, nullptr);
+
+ // Walk from the Node up through its parents, appending each FId so that the
+ // path is recorded from the leaf up to the root.
+ for (auto N = Node; N != nullptr; N = N->Parent)
+ Record->Path.Append(N->FId);
+ DCHECK(!Record->Path.empty());
+
+ for (const auto C : Node->Callees)
+ DFSStack.Append(C.NodePtr);
+ }
+ }
+}
+
+static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
+ const ProfileRecordArray &ProfileRecords)
+ XRAY_NEVER_INSTRUMENT {
+ auto NextPtr = static_cast<uint8_t *>(
+ internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
+ sizeof(Header);
+ for (const auto &Record : ProfileRecords) {
+ // The list of function ids follows:
+ for (const auto FId : Record.Path)
+ NextPtr =
+ static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
+ sizeof(FId);
+
+ // Add the sentinel here.
+ constexpr int32_t SentinelFId = 0;
+ NextPtr = static_cast<uint8_t *>(
+ internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
+ sizeof(SentinelFId);
+
+ // Add the node data here.
+ NextPtr =
+ static_cast<uint8_t *>(internal_memcpy(
+ NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
+ sizeof(Record.Node->CallCount);
+ NextPtr = static_cast<uint8_t *>(
+ internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
+ sizeof(Record.Node->CumulativeLocalTime))) +
+ sizeof(Record.Node->CumulativeLocalTime);
+ }
+
+ DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
+}
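+
+// The resulting layout of one serialized block, as written above:
+//
+//   [u32 BlockSize][u32 BlockNum][u64 ThreadId]                 <- BlockHeader
+//   then, for each record:
+//     [i32 FId]...[i32 0 sentinel][u64 CallCount][u64 CumulativeLocalTime]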
+
+} // namespace
+
+void serialize() XRAY_NEVER_INSTRUMENT {
+ if (!atomic_load(&CollectorInitialized, memory_order_acquire))
+ return;
+
+ SpinMutexLock Lock(&GlobalMutex);
+
+ // Clear out the global ProfileBuffers, if it's not empty.
+ for (auto &B : *ProfileBuffers)
+ deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
+ ProfileBuffers->trim(ProfileBuffers->size());
+
+ DCHECK_NE(TDArray, nullptr);
+ if (TDArray->empty())
+ return;
+
+ // Then repopulate the global ProfileBuffers.
+ u32 I = 0;
+ auto MaxSize = profilingFlags()->global_allocator_max;
+ auto ProfileArena = allocateBuffer(MaxSize);
+ if (ProfileArena == nullptr)
+ return;
+
+ auto ProfileArenaCleanup = at_scope_exit(
+ [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });
+
+ auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
+ if (PathArena == nullptr)
+ return;
+
+ auto PathArenaCleanup = at_scope_exit(
+ [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });
+
+ for (const auto &ThreadTrie : *TDArray) {
+ using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
+ ProfileRecordAllocator PRAlloc(ProfileArena,
+ profilingFlags()->global_allocator_max);
+ ProfileRecord::PathAllocator PathAlloc(
+ PathArena, profilingFlags()->global_allocator_max);
+ ProfileRecordArray ProfileRecords(PRAlloc);
+
+ // First, we want to compute the amount of space we're going to need. We use
+ // a local allocator and an __xray::Array<...> to store the intermediate
+ // data, computing the size as we go. Then we allocate the contiguous space
+ // to hold the thread's buffer data.
+ if (ThreadTrie.FCT.getRoots().empty())
+ continue;
+
+ populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
+ DCHECK(!ThreadTrie.FCT.getRoots().empty());
+ DCHECK(!ProfileRecords.empty());
+
+ // Go through each record, to compute the sizes.
+ //
+ // header size = block size (4 bytes)
+ // + block number (4 bytes)
+ // + thread id (8 bytes)
+ // record size = path ids (4 bytes * number of ids + sentinel 4 bytes)
+ // + call count (8 bytes)
+ // + local time (8 bytes)
+ u32 CumulativeSizes = 0;
+ for (const auto &Record : ProfileRecords)
+ CumulativeSizes += 20 + (4 * Record.Path.size());
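+ // For example, a record whose path holds three function ids contributes
+ // 3 * 4 + 4 + 8 + 8 = 32 bytes; the 16-byte BlockHeader is counted once per
+ // block just below.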
+
+ BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
+ auto B = ProfileBuffers->Append({});
+ B->Size = sizeof(Header) + CumulativeSizes;
+ B->Data = allocateBuffer(B->Size);
+ DCHECK_NE(B->Data, nullptr);
+ serializeRecords(B, Header, ProfileRecords);
+ }
+}
+
+void reset() XRAY_NEVER_INSTRUMENT {
+ atomic_store(&CollectorInitialized, 0, memory_order_release);
+ SpinMutexLock Lock(&GlobalMutex);
+
+ if (ProfileBuffers != nullptr) {
+ // Clear out the profile buffers that have been serialized.
+ for (auto &B : *ProfileBuffers)
+ deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
+ ProfileBuffers->trim(ProfileBuffers->size());
+ ProfileBuffers = nullptr;
+ }
+
+ if (TDArray != nullptr) {
+ // Release the resources as required.
+ for (auto &TD : *TDArray) {
+ TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
+ TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
+ TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
+ TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
+ }
+ // We don't bother destroying the array here because we may have already
+ // freed its backing store. Instead we reset the pointer to nullptr and
+ // re-use the storage later (placement-new'ing into it as-is).
+ TDArray = nullptr;
+ }
+
+ if (TDAllocator != nullptr) {
+ TDAllocator->~Allocator();
+ TDAllocator = nullptr;
+ }
+
+ if (Buffer.Data != nullptr) {
+ BQ->releaseBuffer(Buffer);
+ }
+
+ if (BQ == nullptr) {
+ bool Success = false;
+ new (&BufferQueueStorage)
+ BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
+ if (!Success)
+ return;
+ BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
+ } else {
+ BQ->finalize();
+
+ if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
+ BufferQueue::ErrorCode::Ok)
+ return;
+ }
+
+ if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
+ return;
+
+ new (&ProfileBufferArrayAllocatorStorage)
+ ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
+ ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
+ &ProfileBufferArrayAllocatorStorage);
+
+ new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
+ ProfileBuffers =
+ reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
+
+ new (&ThreadDataAllocatorStorage)
+ ThreadDataAllocator(Buffer.Data, Buffer.Size);
+ TDAllocator =
+ reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
+ new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
+ TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);
+
+ atomic_store(&CollectorInitialized, 1, memory_order_release);
+}
+
+XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
+ SpinMutexLock Lock(&GlobalMutex);
+
+ if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
+ return {nullptr, 0};
+
+ static pthread_once_t Once = PTHREAD_ONCE_INIT;
+ static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
+ FileHeaderStorage;
+ pthread_once(
+ &Once, +[]() XRAY_NEVER_INSTRUMENT {
+ new (&FileHeaderStorage) XRayProfilingFileHeader{};
+ });
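+ // The header lives in function-local aligned storage and is constructed
+ // exactly once; later iterations over the profile reuse the same object, with
+ // only Timestamp and PID refreshed below.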
+
+ if (UNLIKELY(B.Data == nullptr)) {
+ // The first buffer should always contain the file header information.
+ auto &FileHeader =
+ *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
+ FileHeader.Timestamp = NanoTime();
+ FileHeader.PID = internal_getpid();
+ return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
+ }
+
+ if (UNLIKELY(B.Data == &FileHeaderStorage))
+ return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};
+
+ BlockHeader Header;
+ internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
+ auto NextBlock = Header.BlockNum + 1;
+ if (NextBlock < ProfileBuffers->size())
+ return {(*ProfileBuffers)[NextBlock].Data,
+ (*ProfileBuffers)[NextBlock].Size};
+ return {nullptr, 0};
+}
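+
+// A minimal sketch of how a consumer can walk the serialized profile by feeding
+// each returned buffer back in, assuming it has access to this service (the
+// actual flushing path in the profiling mode may differ):
+//
+//   XRayBuffer B{nullptr, 0};
+//   while (true) {
+//     B = profileCollectorService::nextBuffer(B);
+//     if (B.Data == nullptr)
+//       break;
+//     // ... persist B.Size bytes starting at B.Data ...
+//   }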
+
+} // namespace profileCollectorService
+} // namespace __xray