Diffstat (limited to 'lib/xray/xray_profile_collector.cpp')
-rw-r--r-- | lib/xray/xray_profile_collector.cpp | 414 |
1 file changed, 414 insertions, 0 deletions
diff --git a/lib/xray/xray_profile_collector.cpp b/lib/xray/xray_profile_collector.cpp
new file mode 100644
index 000000000000..bef2504f2a16
--- /dev/null
+++ b/lib/xray/xray_profile_collector.cpp
@@ -0,0 +1,414 @@
+//===-- xray_profile_collector.cpp -----------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of XRay, a dynamic runtime instrumentation system.
+//
+// This implements the interface for the profileCollectorService.
+//
+//===----------------------------------------------------------------------===//
+#include "xray_profile_collector.h"
+#include "sanitizer_common/sanitizer_common.h"
+#include "xray_allocator.h"
+#include "xray_defs.h"
+#include "xray_profiling_flags.h"
+#include "xray_segmented_array.h"
+#include <memory>
+#include <pthread.h>
+#include <utility>
+
+namespace __xray {
+namespace profileCollectorService {
+
+namespace {
+
+SpinMutex GlobalMutex;
+struct ThreadTrie {
+  tid_t TId;
+  typename std::aligned_storage<sizeof(FunctionCallTrie)>::type TrieStorage;
+};
+
+struct ProfileBuffer {
+  void *Data;
+  size_t Size;
+};
+
+// Current version of the profile format.
+constexpr u64 XRayProfilingVersion = 0x20180424;
+
+// Identifier for XRay profiling files: 'xrayprof' in hex.
+constexpr u64 XRayMagicBytes = 0x7872617970726f66;
+
+struct XRayProfilingFileHeader {
+  const u64 MagicBytes = XRayMagicBytes;
+  const u64 Version = XRayProfilingVersion;
+  u64 Timestamp = 0; // System time in nanoseconds.
+  u64 PID = 0;       // Process ID.
+};
+
+struct BlockHeader {
+  u32 BlockSize;
+  u32 BlockNum;
+  u64 ThreadId;
+};
+
+struct ThreadData {
+  BufferQueue *BQ;
+  FunctionCallTrie::Allocators::Buffers Buffers;
+  FunctionCallTrie::Allocators Allocators;
+  FunctionCallTrie FCT;
+  tid_t TId;
+};
+
+using ThreadDataArray = Array<ThreadData>;
+using ThreadDataAllocator = ThreadDataArray::AllocatorType;
+
+// We use a separate buffer queue for the backing store for the allocator used
+// by the ThreadData array. This lets us host the buffers, allocators, and
+// tries associated with a thread by moving the data into the array instead of
+// attempting to copy the data to a separately backed set of tries.
+static typename std::aligned_storage<
+    sizeof(BufferQueue), alignof(BufferQueue)>::type BufferQueueStorage;
+static BufferQueue *BQ = nullptr;
+static BufferQueue::Buffer Buffer;
+static typename std::aligned_storage<sizeof(ThreadDataAllocator),
+                                     alignof(ThreadDataAllocator)>::type
+    ThreadDataAllocatorStorage;
+static typename std::aligned_storage<sizeof(ThreadDataArray),
+                                     alignof(ThreadDataArray)>::type
+    ThreadDataArrayStorage;
+
+static ThreadDataAllocator *TDAllocator = nullptr;
+static ThreadDataArray *TDArray = nullptr;
+
+using ProfileBufferArray = Array<ProfileBuffer>;
+using ProfileBufferArrayAllocator = typename ProfileBufferArray::AllocatorType;
+
+// These need to be global aligned storage to avoid dynamic initialization. We
+// need these to be aligned to allow us to placement new objects into the
+// storage, and have pointers to those objects be appropriately aligned.
+static typename std::aligned_storage<sizeof(ProfileBufferArray)>::type
+    ProfileBuffersStorage;
+static typename std::aligned_storage<sizeof(ProfileBufferArrayAllocator)>::type
+    ProfileBufferArrayAllocatorStorage;
+
+static ProfileBufferArrayAllocator *ProfileBuffersAllocator = nullptr;
+static ProfileBufferArray *ProfileBuffers = nullptr;
+
+// Use a global flag to determine whether the collector implementation has been
+// initialized.
+static atomic_uint8_t CollectorInitialized{0};
+
+} // namespace
+
+void post(BufferQueue *Q, FunctionCallTrie &&T,
+          FunctionCallTrie::Allocators &&A,
+          FunctionCallTrie::Allocators::Buffers &&B,
+          tid_t TId) XRAY_NEVER_INSTRUMENT {
+  DCHECK_NE(Q, nullptr);
+
+  // Bail out early if the collector has not been initialized.
+  if (!atomic_load(&CollectorInitialized, memory_order_acquire)) {
+    T.~FunctionCallTrie();
+    A.~Allocators();
+    Q->releaseBuffer(B.NodeBuffer);
+    Q->releaseBuffer(B.RootsBuffer);
+    Q->releaseBuffer(B.ShadowStackBuffer);
+    Q->releaseBuffer(B.NodeIdPairBuffer);
+    B.~Buffers();
+    return;
+  }
+
+  {
+    SpinMutexLock Lock(&GlobalMutex);
+    DCHECK_NE(TDAllocator, nullptr);
+    DCHECK_NE(TDArray, nullptr);
+
+    if (TDArray->AppendEmplace(Q, std::move(B), std::move(A), std::move(T),
+                               TId) == nullptr) {
+      // If we fail to add the data to the array, we should destroy the objects
+      // handed to us.
+      T.~FunctionCallTrie();
+      A.~Allocators();
+      Q->releaseBuffer(B.NodeBuffer);
+      Q->releaseBuffer(B.RootsBuffer);
+      Q->releaseBuffer(B.ShadowStackBuffer);
+      Q->releaseBuffer(B.NodeIdPairBuffer);
+      B.~Buffers();
+    }
+  }
+}
+
+// A PathArray represents the function ids that make up a stack trace. In this
+// context a path is almost always represented from the leaf function in a call
+// stack to a root of the call trie.
+using PathArray = Array<int32_t>;
+
+struct ProfileRecord {
+  using PathAllocator = typename PathArray::AllocatorType;
+
+  // The Path in this record is the function ids from the leaf to the root of
+  // the function call stack as represented in a FunctionCallTrie.
+  PathArray Path;
+  const FunctionCallTrie::Node *Node;
+};
+
+namespace {
+
+using ProfileRecordArray = Array<ProfileRecord>;
+
+// Perform a depth-first traversal of each root of the FunctionCallTrie to
+// generate the path(s) and the data associated with each path.
+static void populateRecords(ProfileRecordArray &PRs,
+                            ProfileRecord::PathAllocator &PA,
+                            const FunctionCallTrie &Trie)
+    XRAY_NEVER_INSTRUMENT {
+  using StackArray = Array<const FunctionCallTrie::Node *>;
+  using StackAllocator = typename StackArray::AllocatorType;
+  StackAllocator StackAlloc(profilingFlags()->stack_allocator_max);
+  StackArray DFSStack(StackAlloc);
+  for (const auto *R : Trie.getRoots()) {
+    DFSStack.Append(R);
+    while (!DFSStack.empty()) {
+      auto *Node = DFSStack.back();
+      DFSStack.trim(1);
+      if (Node == nullptr)
+        continue;
+      auto Record = PRs.AppendEmplace(PathArray{PA}, Node);
+      if (Record == nullptr)
+        return;
+      DCHECK_NE(Record, nullptr);
+
+      // Traverse the Node's parents and, as we're doing so, get the FIds in
+      // the order they appear.
+      for (auto N = Node; N != nullptr; N = N->Parent)
+        Record->Path.Append(N->FId);
+      DCHECK(!Record->Path.empty());
+
+      for (const auto C : Node->Callees)
+        DFSStack.Append(C.NodePtr);
+    }
+  }
+}
+
+static void serializeRecords(ProfileBuffer *Buffer, const BlockHeader &Header,
+                             const ProfileRecordArray &ProfileRecords)
+    XRAY_NEVER_INSTRUMENT {
+  auto NextPtr = static_cast<uint8_t *>(
+                     internal_memcpy(Buffer->Data, &Header, sizeof(Header))) +
+                 sizeof(Header);
+  for (const auto &Record : ProfileRecords) {
+    // The list of ids follows:
+    for (const auto FId : Record.Path)
+      NextPtr =
+          static_cast<uint8_t *>(internal_memcpy(NextPtr, &FId, sizeof(FId))) +
+          sizeof(FId);
+
+    // Add the sentinel here.
+    constexpr int32_t SentinelFId = 0;
+    NextPtr = static_cast<uint8_t *>(
+                  internal_memset(NextPtr, SentinelFId, sizeof(SentinelFId))) +
+              sizeof(SentinelFId);
+
+    // Add the node data here.
+    NextPtr =
+        static_cast<uint8_t *>(internal_memcpy(
+            NextPtr, &Record.Node->CallCount, sizeof(Record.Node->CallCount))) +
+        sizeof(Record.Node->CallCount);
+    NextPtr = static_cast<uint8_t *>(
+                  internal_memcpy(NextPtr, &Record.Node->CumulativeLocalTime,
+                                  sizeof(Record.Node->CumulativeLocalTime))) +
+              sizeof(Record.Node->CumulativeLocalTime);
+  }
+
+  DCHECK_EQ(NextPtr - static_cast<uint8_t *>(Buffer->Data), Buffer->Size);
+}
+
+} // namespace
+
+void serialize() XRAY_NEVER_INSTRUMENT {
+  if (!atomic_load(&CollectorInitialized, memory_order_acquire))
+    return;
+
+  SpinMutexLock Lock(&GlobalMutex);
+
+  // Clear out the global ProfileBuffers if it's not empty.
+  for (auto &B : *ProfileBuffers)
+    deallocateBuffer(reinterpret_cast<unsigned char *>(B.Data), B.Size);
+  ProfileBuffers->trim(ProfileBuffers->size());
+
+  DCHECK_NE(TDArray, nullptr);
+  if (TDArray->empty())
+    return;
+
+  // Then repopulate the global ProfileBuffers.
+  u32 I = 0;
+  auto MaxSize = profilingFlags()->global_allocator_max;
+  auto ProfileArena = allocateBuffer(MaxSize);
+  if (ProfileArena == nullptr)
+    return;
+
+  auto ProfileArenaCleanup = at_scope_exit(
+      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(ProfileArena, MaxSize); });
+
+  auto PathArena = allocateBuffer(profilingFlags()->global_allocator_max);
+  if (PathArena == nullptr)
+    return;
+
+  auto PathArenaCleanup = at_scope_exit(
+      [&]() XRAY_NEVER_INSTRUMENT { deallocateBuffer(PathArena, MaxSize); });
+
+  for (const auto &ThreadTrie : *TDArray) {
+    using ProfileRecordAllocator = typename ProfileRecordArray::AllocatorType;
+    ProfileRecordAllocator PRAlloc(ProfileArena,
+                                   profilingFlags()->global_allocator_max);
+    ProfileRecord::PathAllocator PathAlloc(
+        PathArena, profilingFlags()->global_allocator_max);
+    ProfileRecordArray ProfileRecords(PRAlloc);
+
+    // First, we want to compute the amount of space we're going to need. We
+    // use a local allocator and an __xray::Array<...> to store the
+    // intermediary data, then compute the size as we go along. Then we
+    // allocate the contiguous space to contain the thread buffer data.
+    if (ThreadTrie.FCT.getRoots().empty())
+      continue;
+
+    populateRecords(ProfileRecords, PathAlloc, ThreadTrie.FCT);
+    DCHECK(!ThreadTrie.FCT.getRoots().empty());
+    DCHECK(!ProfileRecords.empty());
+
+    // Go through each record to compute the sizes.
+    //
+    // header size = block size (4 bytes)
+    //   + block number (4 bytes)
+    //   + thread id (8 bytes)
+    // record size = path ids (4 bytes * number of ids)
+    //   + sentinel (4 bytes)
+    //   + call count (8 bytes)
+    //   + local time (8 bytes)
+    u32 CumulativeSizes = 0;
+    for (const auto &Record : ProfileRecords)
+      CumulativeSizes += 20 + (4 * Record.Path.size());
+
+    BlockHeader Header{16 + CumulativeSizes, I++, ThreadTrie.TId};
+    auto B = ProfileBuffers->Append({});
+    B->Size = sizeof(Header) + CumulativeSizes;
+    B->Data = allocateBuffer(B->Size);
+    DCHECK_NE(B->Data, nullptr);
+    serializeRecords(B, Header, ProfileRecords);
+  }
+}
+
+void reset() XRAY_NEVER_INSTRUMENT {
+  atomic_store(&CollectorInitialized, 0, memory_order_release);
+  SpinMutexLock Lock(&GlobalMutex);
+
+  if (ProfileBuffers != nullptr) {
+    // Clear out the profile buffers that have been serialized.
+    for (auto &B : *ProfileBuffers)
+      deallocateBuffer(reinterpret_cast<uint8_t *>(B.Data), B.Size);
+    ProfileBuffers->trim(ProfileBuffers->size());
+    ProfileBuffers = nullptr;
+  }
+
+  if (TDArray != nullptr) {
+    // Release the resources as required.
+    for (auto &TD : *TDArray) {
+      TD.BQ->releaseBuffer(TD.Buffers.NodeBuffer);
+      TD.BQ->releaseBuffer(TD.Buffers.RootsBuffer);
+      TD.BQ->releaseBuffer(TD.Buffers.ShadowStackBuffer);
+      TD.BQ->releaseBuffer(TD.Buffers.NodeIdPairBuffer);
+    }
+    // We don't bother destroying the array here because we've already
+    // potentially freed the backing store for the array. Instead we reset the
+    // pointer to nullptr and re-use the storage later (placement-new'ing into
+    // the storage as-is).
+    TDArray = nullptr;
+  }
+
+  if (TDAllocator != nullptr) {
+    TDAllocator->~Allocator();
+    TDAllocator = nullptr;
+  }
+
+  if (Buffer.Data != nullptr) {
+    BQ->releaseBuffer(Buffer);
+  }
+
+  if (BQ == nullptr) {
+    bool Success = false;
+    new (&BufferQueueStorage)
+        BufferQueue(profilingFlags()->global_allocator_max, 1, Success);
+    if (!Success)
+      return;
+    BQ = reinterpret_cast<BufferQueue *>(&BufferQueueStorage);
+  } else {
+    BQ->finalize();
+
+    if (BQ->init(profilingFlags()->global_allocator_max, 1) !=
+        BufferQueue::ErrorCode::Ok)
+      return;
+  }
+
+  if (BQ->getBuffer(Buffer) != BufferQueue::ErrorCode::Ok)
+    return;
+
+  new (&ProfileBufferArrayAllocatorStorage)
+      ProfileBufferArrayAllocator(profilingFlags()->global_allocator_max);
+  ProfileBuffersAllocator = reinterpret_cast<ProfileBufferArrayAllocator *>(
+      &ProfileBufferArrayAllocatorStorage);
+
+  new (&ProfileBuffersStorage) ProfileBufferArray(*ProfileBuffersAllocator);
+  ProfileBuffers =
+      reinterpret_cast<ProfileBufferArray *>(&ProfileBuffersStorage);
+
+  new (&ThreadDataAllocatorStorage)
+      ThreadDataAllocator(Buffer.Data, Buffer.Size);
+  TDAllocator =
+      reinterpret_cast<ThreadDataAllocator *>(&ThreadDataAllocatorStorage);
+  new (&ThreadDataArrayStorage) ThreadDataArray(*TDAllocator);
+  TDArray = reinterpret_cast<ThreadDataArray *>(&ThreadDataArrayStorage);
+
+  atomic_store(&CollectorInitialized, 1, memory_order_release);
+}
+
+XRayBuffer nextBuffer(XRayBuffer B) XRAY_NEVER_INSTRUMENT {
+  SpinMutexLock Lock(&GlobalMutex);
+
+  if (ProfileBuffers == nullptr || ProfileBuffers->size() == 0)
+    return {nullptr, 0};
+
+  static pthread_once_t Once = PTHREAD_ONCE_INIT;
+  static typename std::aligned_storage<sizeof(XRayProfilingFileHeader)>::type
+      FileHeaderStorage;
+  pthread_once(
+      &Once, +[]() XRAY_NEVER_INSTRUMENT {
+        new (&FileHeaderStorage) XRayProfilingFileHeader{};
+      });
+
+  if (UNLIKELY(B.Data == nullptr)) {
+    // The first buffer should always contain the file header information.
+    auto &FileHeader =
+        *reinterpret_cast<XRayProfilingFileHeader *>(&FileHeaderStorage);
+    FileHeader.Timestamp = NanoTime();
+    FileHeader.PID = internal_getpid();
+    return {&FileHeaderStorage, sizeof(XRayProfilingFileHeader)};
+  }
+
+  if (UNLIKELY(B.Data == &FileHeaderStorage))
+    return {(*ProfileBuffers)[0].Data, (*ProfileBuffers)[0].Size};
+
+  BlockHeader Header;
+  internal_memcpy(&Header, B.Data, sizeof(BlockHeader));
+  auto NextBlock = Header.BlockNum + 1;
+  if (NextBlock < ProfileBuffers->size())
+    return {(*ProfileBuffers)[NextBlock].Data,
+            (*ProfileBuffers)[NextBlock].Size};
+  return {nullptr, 0};
+}
+
+} // namespace profileCollectorService
+} // namespace __xray
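
Editor's note: the global aligned-storage declarations above (BufferQueueStorage, ThreadDataAllocatorStorage, ProfileBuffersStorage, and friends) all follow one pattern: reserve static storage with no dynamic initializer, then placement-new the object into it once its constructor arguments (here, runtime flags) are available. The following self-contained sketch isolates that pattern; Widget, WidgetStorage, initWidget, and Limit are hypothetical names, not part of this commit.

#include <new>
#include <type_traits>

struct Widget {
  explicit Widget(int Limit) : Limit(Limit) {}
  int Limit;
};

// Static storage with the right size and alignment, but no constructor run at
// program startup (avoids dynamic initialization order issues).
static typename std::aligned_storage<sizeof(Widget), alignof(Widget)>::type
    WidgetStorage;
static Widget *W = nullptr;

void initWidget(int Limit) {
  // Construct the object in place once its arguments are known.
  new (&WidgetStorage) Widget(Limit);
  W = reinterpret_cast<Widget *>(&WidgetStorage);
}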
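Editor's note: nextBuffer() implements a pull-style iterator. A call with a null XRayBuffer yields the file header; a call with the header yields block 0; each later call reads BlockNum out of the previous block to find the next one, and {nullptr, 0} signals the end. A minimal consumer sketch under that protocol; drainProfile, writeAll, and Fd are hypothetical, and only nextBuffer comes from the code above (it assumes serialize() has already been called).

#include "xray_profile_collector.h"
#include <cstddef>

extern void writeAll(int Fd, const void *Data, size_t Size); // hypothetical

void drainProfile(int Fd) {
  // First call returns the file header, then one buffer per thread block.
  XRayBuffer B{nullptr, 0};
  for (B = __xray::profileCollectorService::nextBuffer(B); B.Data != nullptr;
       B = __xray::profileCollectorService::nextBuffer(B))
    writeAll(Fd, B.Data, B.Size);
}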
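Editor's note: concatenating those buffers yields the on-disk layout implied by serialize() and serializeRecords(): a 32-byte file header (magic, version, timestamp, PID as u64), then one block per thread, each a 16-byte BlockHeader followed by records; each record is a leaf-to-root path of int32 function ids, a 4-byte zero sentinel, and two u64 counters (call count, cumulative local time). A record with a 3-deep path thus occupies 4*3 + 20 = 32 bytes, and a block holding only that record has BlockSize = 16 + 32 = 48. The standalone reader below is a sketch under the assumptions that the host is little-endian and the structs are laid out with no internal padding (true of the declarations above on common ABIs); readValue and main are not part of the commit.

#include <cstdint>
#include <cstdio>
#include <vector>

template <typename T> static bool readValue(FILE *F, T &Out) {
  return fread(&Out, sizeof(T), 1, F) == 1;
}

int main(int argc, char **argv) {
  if (argc != 2) {
    fprintf(stderr, "usage: %s <xray-profile>\n", argv[0]);
    return 1;
  }
  FILE *F = fopen(argv[1], "rb");
  if (F == nullptr)
    return 1;

  // XRayProfilingFileHeader: MagicBytes, Version, Timestamp, PID (4 x u64).
  uint64_t Magic, Version, Timestamp, PID;
  if (!readValue(F, Magic) || !readValue(F, Version) ||
      !readValue(F, Timestamp) || !readValue(F, PID) ||
      Magic != 0x7872617970726f66ull) { // 'xrayprof'
    fclose(F);
    return 1;
  }

  // One BlockHeader plus records per thread, as written by serializeRecords().
  uint32_t BlockSize, BlockNum;
  uint64_t ThreadId;
  while (readValue(F, BlockSize) && readValue(F, BlockNum) &&
         readValue(F, ThreadId)) {
    uint64_t Remaining = BlockSize - 16; // BlockSize counts its 16-byte header.
    while (Remaining >= 20) {            // Smallest possible record is 20 bytes.
      std::vector<int32_t> Path;         // Function ids, leaf to root.
      int32_t FId = 0;
      while (readValue(F, FId) && FId != 0)
        Path.push_back(FId);
      uint64_t CallCount = 0, CumulativeLocalTime = 0;
      if (!readValue(F, CallCount) || !readValue(F, CumulativeLocalTime)) {
        fclose(F);
        return 1;
      }
      Remaining -= 20 + 4 * Path.size();
      printf("thread %llu: path depth %zu, %llu calls, %llu local time\n",
             (unsigned long long)ThreadId, Path.size(),
             (unsigned long long)CallCount,
             (unsigned long long)CumulativeLocalTime);
    }
  }
  fclose(F);
  return 0;
}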