Diffstat (limited to 'lib/tsan/rtl/tsan_clock.cpp')
-rw-r--r-- | lib/tsan/rtl/tsan_clock.cpp | 597
1 files changed, 597 insertions, 0 deletions
diff --git a/lib/tsan/rtl/tsan_clock.cpp b/lib/tsan/rtl/tsan_clock.cpp
new file mode 100644
index 0000000000000..4b7aa0653da6b
--- /dev/null
+++ b/lib/tsan/rtl/tsan_clock.cpp
@@ -0,0 +1,597 @@
+//===-- tsan_clock.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+//===----------------------------------------------------------------------===//
+#include "tsan_clock.h"
+#include "tsan_rtl.h"
+#include "sanitizer_common/sanitizer_placement_new.h"
+
+// SyncClock and ThreadClock implement vector clocks for sync variables
+// (mutexes, atomic variables, file descriptors, etc) and threads, respectively.
+// ThreadClock contains a fixed-size vector clock for the maximum number of
+// threads. SyncClock contains a growable vector clock for the currently
+// necessary number of threads.
+// Together they implement a very simple model of operations, namely:
+//
+// void ThreadClock::acquire(const SyncClock *src) {
+//   for (int i = 0; i < kMaxThreads; i++)
+//     clock[i] = max(clock[i], src->clock[i]);
+// }
+//
+// void ThreadClock::release(SyncClock *dst) const {
+//   for (int i = 0; i < kMaxThreads; i++)
+//     dst->clock[i] = max(dst->clock[i], clock[i]);
+// }
+//
+// void ThreadClock::ReleaseStore(SyncClock *dst) const {
+//   for (int i = 0; i < kMaxThreads; i++)
+//     dst->clock[i] = clock[i];
+// }
+//
+// void ThreadClock::acq_rel(SyncClock *dst) {
+//   acquire(dst);
+//   release(dst);
+// }
+//
+// Conformance to this model is extensively verified in tsan_clock_test.cpp.
+// However, the implementation is significantly more complex. The complexity
+// allows us to implement important classes of use cases in O(1) instead of
+// O(N).
+//
+// The use cases are:
+// 1. Singleton/once atomic that has a single release-store operation followed
+//    by zillions of acquire-loads (the acquire-load is O(1)).
+// 2. Thread-local mutex (both lock and unlock can be O(1)).
+// 3. Leaf mutex (unlock is O(1)).
+// 4. A mutex shared by 2 threads (both lock and unlock can be O(1)).
+// 5. An atomic with a single writer (writes can be O(1)).
+// The implementation dynamically adapts to the workload. So if an atomic is in
+// read-only phase, these reads will be O(1); if it later switches to
+// read/write phase, the implementation will correctly handle that by
+// switching to O(N).
+//
+// Thread-safety note: all const operations on SyncClock's are conducted under
+// a shared lock; all non-const operations on SyncClock's are conducted under
+// an exclusive lock; ThreadClock's are private to respective threads and so
+// do not need any protection.
+//
+// Description of SyncClock state:
+// clk_ - variable size vector clock, low kClkBits hold timestamp,
+//   the remaining bits hold "acquired" flag (the actual value is thread's
+//   reused counter);
+//   if acquired == thr->reused_, then the respective thread has already
+//   acquired this clock (except possibly for dirty elements).
+// dirty_ - holds up to two indices in the vector clock that other threads
+//   need to acquire regardless of "acquired" flag value;
+// release_store_tid_ - denotes that the clock state is a result of
+//   release-store operation by the thread with release_store_tid_ index.
+// release_store_reused_ - reuse count of release_store_tid_.
+
+// We don't have ThreadState in these methods, so this is an ugly hack that
+// works only in C++.
+#if !SANITIZER_GO
+# define CPP_STAT_INC(typ) StatInc(cur_thread(), typ)
+#else
+# define CPP_STAT_INC(typ) (void)0
+#endif
+
+namespace __tsan {
+
+static atomic_uint32_t *ref_ptr(ClockBlock *cb) {
+  return reinterpret_cast<atomic_uint32_t *>(&cb->table[ClockBlock::kRefIdx]);
+}
+
+// Drop reference to the first level block idx.
+static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
+  ClockBlock *cb = ctx->clock_alloc.Map(idx);
+  atomic_uint32_t *ref = ref_ptr(cb);
+  u32 v = atomic_load(ref, memory_order_acquire);
+  for (;;) {
+    CHECK_GT(v, 0);
+    if (v == 1)
+      break;
+    if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel))
+      return;
+  }
+  // First level block owns second level blocks, so free them as well.
+  for (uptr i = 0; i < blocks; i++)
+    ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]);
+  ctx->clock_alloc.Free(c, idx);
+}
+
+ThreadClock::ThreadClock(unsigned tid, unsigned reused)
+    : tid_(tid)
+    , reused_(reused + 1)  // 0 has special meaning
+    , cached_idx_()
+    , cached_size_()
+    , cached_blocks_() {
+  CHECK_LT(tid, kMaxTidInClock);
+  CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
+  nclk_ = tid_ + 1;
+  last_acquire_ = 0;
+  internal_memset(clk_, 0, sizeof(clk_));
+}
+
+void ThreadClock::ResetCached(ClockCache *c) {
+  if (cached_idx_) {
+    UnrefClockBlock(c, cached_idx_, cached_blocks_);
+    cached_idx_ = 0;
+    cached_size_ = 0;
+    cached_blocks_ = 0;
+  }
+}
+
+void ThreadClock::acquire(ClockCache *c, SyncClock *src) {
+  DCHECK_LE(nclk_, kMaxTid);
+  DCHECK_LE(src->size_, kMaxTid);
+  CPP_STAT_INC(StatClockAcquire);
+
+  // Check if it's empty -> no need to do anything.
+  const uptr nclk = src->size_;
+  if (nclk == 0) {
+    CPP_STAT_INC(StatClockAcquireEmpty);
+    return;
+  }
+
+  bool acquired = false;
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    SyncClock::Dirty dirty = src->dirty_[i];
+    unsigned tid = dirty.tid;
+    if (tid != kInvalidTid) {
+      if (clk_[tid] < dirty.epoch) {
+        clk_[tid] = dirty.epoch;
+        acquired = true;
+      }
+    }
+  }
+
+  // Check if we've already acquired src after the last release operation on src.
+  if (tid_ >= nclk || src->elem(tid_).reused != reused_) {
+    // O(N) acquire.
+    CPP_STAT_INC(StatClockAcquireFull);
+    nclk_ = max(nclk_, nclk);
+    u64 *dst_pos = &clk_[0];
+    for (ClockElem &src_elem : *src) {
+      u64 epoch = src_elem.epoch;
+      if (*dst_pos < epoch) {
+        *dst_pos = epoch;
+        acquired = true;
+      }
+      dst_pos++;
+    }
+
+    // Remember that this thread has acquired this clock.
+    if (nclk > tid_)
+      src->elem(tid_).reused = reused_;
+  }
+
+  if (acquired) {
+    CPP_STAT_INC(StatClockAcquiredSomething);
+    last_acquire_ = clk_[tid_];
+    ResetCached(c);
+  }
+}
+
+void ThreadClock::release(ClockCache *c, SyncClock *dst) {
+  DCHECK_LE(nclk_, kMaxTid);
+  DCHECK_LE(dst->size_, kMaxTid);
+
+  if (dst->size_ == 0) {
+    // ReleaseStore will correctly set release_store_tid_,
+    // which can be important for future operations.
+    ReleaseStore(c, dst);
+    return;
+  }
+
+  CPP_STAT_INC(StatClockRelease);
+  // Check if we need to resize dst.
+  if (dst->size_ < nclk_)
+    dst->Resize(c, nclk_);
+
+  // Check if we had not acquired anything from other threads
+  // since the last release on dst. If so, we need to update
+  // only dst->elem(tid_).
+  if (dst->elem(tid_).epoch > last_acquire_) {
+    UpdateCurrentThread(c, dst);
+    if (dst->release_store_tid_ != tid_ ||
+        dst->release_store_reused_ != reused_)
+      dst->release_store_tid_ = kInvalidTid;
+    return;
+  }
+
+  // O(N) release.
+  CPP_STAT_INC(StatClockReleaseFull);
+  dst->Unshare(c);
+  // First, remember whether we've acquired dst.
+  bool acquired = IsAlreadyAcquired(dst);
+  if (acquired)
+    CPP_STAT_INC(StatClockReleaseAcquired);
+  // Update dst->clk_.
+  dst->FlushDirty();
+  uptr i = 0;
+  for (ClockElem &ce : *dst) {
+    ce.epoch = max(ce.epoch, clk_[i]);
+    ce.reused = 0;
+    i++;
+  }
+  // Clear 'acquired' flag in the remaining elements.
+  if (nclk_ < dst->size_)
+    CPP_STAT_INC(StatClockReleaseClearTail);
+  for (uptr i = nclk_; i < dst->size_; i++)
+    dst->elem(i).reused = 0;
+  dst->release_store_tid_ = kInvalidTid;
+  dst->release_store_reused_ = 0;
+  // If we've acquired dst, remember this fact,
+  // so that we don't need to acquire it on next acquire.
+  if (acquired)
+    dst->elem(tid_).reused = reused_;
+}
+
+void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
+  DCHECK_LE(nclk_, kMaxTid);
+  DCHECK_LE(dst->size_, kMaxTid);
+  CPP_STAT_INC(StatClockStore);
+
+  if (dst->size_ == 0 && cached_idx_ != 0) {
+    // Reuse the cached clock.
+    // Note: we could reuse/cache the cached clock in more cases:
+    // we could update the existing clock and cache it, or replace it with the
+    // currently cached clock and release the old one. And for a shared
+    // existing clock, we could replace it with the currently cached;
+    // or unshare, update and cache. But, for simplicity, we currently reuse
+    // cached clock only when the target clock is empty.
+    dst->tab_ = ctx->clock_alloc.Map(cached_idx_);
+    dst->tab_idx_ = cached_idx_;
+    dst->size_ = cached_size_;
+    dst->blocks_ = cached_blocks_;
+    CHECK_EQ(dst->dirty_[0].tid, kInvalidTid);
+    // The cached clock is shared (immutable),
+    // so this is where we store the current clock.
+    dst->dirty_[0].tid = tid_;
+    dst->dirty_[0].epoch = clk_[tid_];
+    dst->release_store_tid_ = tid_;
+    dst->release_store_reused_ = reused_;
+    // Remember that we don't need to acquire it in future.
+    dst->elem(tid_).reused = reused_;
+    // Grab a reference.
+    atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
+    return;
+  }
+
+  // Check if we need to resize dst.
+  if (dst->size_ < nclk_)
+    dst->Resize(c, nclk_);
+
+  if (dst->release_store_tid_ == tid_ &&
+      dst->release_store_reused_ == reused_ &&
+      dst->elem(tid_).epoch > last_acquire_) {
+    CPP_STAT_INC(StatClockStoreFast);
+    UpdateCurrentThread(c, dst);
+    return;
+  }
+
+  // O(N) release-store.
+  CPP_STAT_INC(StatClockStoreFull);
+  dst->Unshare(c);
+  // Note: dst can be larger than this ThreadClock.
+  // This is fine since clk_ beyond size is all zeros.
+  uptr i = 0;
+  for (ClockElem &ce : *dst) {
+    ce.epoch = clk_[i];
+    ce.reused = 0;
+    i++;
+  }
+  for (uptr i = 0; i < kDirtyTids; i++)
+    dst->dirty_[i].tid = kInvalidTid;
+  dst->release_store_tid_ = tid_;
+  dst->release_store_reused_ = reused_;
+  // Remember that we don't need to acquire it in future.
+  dst->elem(tid_).reused = reused_;
+
+  // If the resulting clock is cachable, cache it for future release operations.
+  // The clock is always cachable if we released to an empty sync object.
+  if (cached_idx_ == 0 && dst->Cachable()) {
+    // Grab a reference to the ClockBlock.
+    atomic_uint32_t *ref = ref_ptr(dst->tab_);
+    if (atomic_load(ref, memory_order_acquire) == 1)
+      atomic_store_relaxed(ref, 2);
+    else
+      atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
+    cached_idx_ = dst->tab_idx_;
+    cached_size_ = dst->size_;
+    cached_blocks_ = dst->blocks_;
+  }
+}
+
+void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
+  CPP_STAT_INC(StatClockAcquireRelease);
+  acquire(c, dst);
+  ReleaseStore(c, dst);
+}
+
+// Updates only a single element related to the current thread in dst->clk_.
+void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
+  // Update the thread's time, but preserve 'acquired' flag.
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    SyncClock::Dirty *dirty = &dst->dirty_[i];
+    const unsigned tid = dirty->tid;
+    if (tid == tid_ || tid == kInvalidTid) {
+      CPP_STAT_INC(StatClockReleaseFast);
+      dirty->tid = tid_;
+      dirty->epoch = clk_[tid_];
+      return;
+    }
+  }
+  // Reset all 'acquired' flags, O(N).
+  // We are going to touch dst elements, so we need to unshare it.
+  dst->Unshare(c);
+  CPP_STAT_INC(StatClockReleaseSlow);
+  dst->elem(tid_).epoch = clk_[tid_];
+  for (uptr i = 0; i < dst->size_; i++)
+    dst->elem(i).reused = 0;
+  dst->FlushDirty();
+}
+
+// Checks whether the current thread has already acquired src.
+bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
+  if (src->elem(tid_).reused != reused_)
+    return false;
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    SyncClock::Dirty dirty = src->dirty_[i];
+    if (dirty.tid != kInvalidTid) {
+      if (clk_[dirty.tid] < dirty.epoch)
+        return false;
+    }
+  }
+  return true;
+}
+
+// Sets a single element in the vector clock.
+// This function is called only from weird places like AcquireGlobal.
+void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
+  DCHECK_LT(tid, kMaxTid);
+  DCHECK_GE(v, clk_[tid]);
+  clk_[tid] = v;
+  if (nclk_ <= tid)
+    nclk_ = tid + 1;
+  last_acquire_ = clk_[tid_];
+  ResetCached(c);
+}
+
+void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
+  printf("clock=[");
+  for (uptr i = 0; i < nclk_; i++)
+    printf("%s%llu", i == 0 ? "" : ",", clk_[i]);
+  printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_);
+}
+
+SyncClock::SyncClock() {
+  ResetImpl();
+}
+
+SyncClock::~SyncClock() {
+  // Reset must be called before dtor.
+  CHECK_EQ(size_, 0);
+  CHECK_EQ(blocks_, 0);
+  CHECK_EQ(tab_, 0);
+  CHECK_EQ(tab_idx_, 0);
+}
+
+void SyncClock::Reset(ClockCache *c) {
+  if (size_)
+    UnrefClockBlock(c, tab_idx_, blocks_);
+  ResetImpl();
+}
+
+void SyncClock::ResetImpl() {
+  tab_ = 0;
+  tab_idx_ = 0;
+  size_ = 0;
+  blocks_ = 0;
+  release_store_tid_ = kInvalidTid;
+  release_store_reused_ = 0;
+  for (uptr i = 0; i < kDirtyTids; i++)
+    dirty_[i].tid = kInvalidTid;
+}
+
+void SyncClock::Resize(ClockCache *c, uptr nclk) {
+  CPP_STAT_INC(StatClockReleaseResize);
+  Unshare(c);
+  if (nclk <= capacity()) {
+    // Memory is already allocated, just increase the size.
+    size_ = nclk;
+    return;
+  }
+  if (size_ == 0) {
+    // Grow from 0 to one-level table.
+    CHECK_EQ(size_, 0);
+    CHECK_EQ(blocks_, 0);
+    CHECK_EQ(tab_, 0);
+    CHECK_EQ(tab_idx_, 0);
+    tab_idx_ = ctx->clock_alloc.Alloc(c);
+    tab_ = ctx->clock_alloc.Map(tab_idx_);
+    internal_memset(tab_, 0, sizeof(*tab_));
+    atomic_store_relaxed(ref_ptr(tab_), 1);
+    size_ = 1;
+  } else if (size_ > blocks_ * ClockBlock::kClockCount) {
+    u32 idx = ctx->clock_alloc.Alloc(c);
+    ClockBlock *new_cb = ctx->clock_alloc.Map(idx);
+    uptr top = size_ - blocks_ * ClockBlock::kClockCount;
+    CHECK_LT(top, ClockBlock::kClockCount);
+    const uptr move = top * sizeof(tab_->clock[0]);
+    internal_memcpy(&new_cb->clock[0], tab_->clock, move);
+    internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move);
+    internal_memset(tab_->clock, 0, move);
+    append_block(idx);
+  }
+  // At this point we have the first level table allocated and all clock
+  // elements are evacuated from it to a second level block.
+  // Add second level tables as necessary.
+  while (nclk > capacity()) {
+    u32 idx = ctx->clock_alloc.Alloc(c);
+    ClockBlock *cb = ctx->clock_alloc.Map(idx);
+    internal_memset(cb, 0, sizeof(*cb));
+    append_block(idx);
+  }
+  size_ = nclk;
+}
+
+// Flushes all dirty elements into the main clock array.
+void SyncClock::FlushDirty() {
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    Dirty *dirty = &dirty_[i];
+    if (dirty->tid != kInvalidTid) {
+      CHECK_LT(dirty->tid, size_);
+      elem(dirty->tid).epoch = dirty->epoch;
+      dirty->tid = kInvalidTid;
+    }
+  }
+}
+
+bool SyncClock::IsShared() const {
+  if (size_ == 0)
+    return false;
+  atomic_uint32_t *ref = ref_ptr(tab_);
+  u32 v = atomic_load(ref, memory_order_acquire);
+  CHECK_GT(v, 0);
+  return v > 1;
+}
+
+// Unshares the current clock if it's shared.
+// Shared clocks are immutable, so they need to be unshared before any updates.
+// Note: this does not apply to dirty entries as they are not shared.
+void SyncClock::Unshare(ClockCache *c) {
+  if (!IsShared())
+    return;
+  // First, copy current state into old.
+  SyncClock old;
+  old.tab_ = tab_;
+  old.tab_idx_ = tab_idx_;
+  old.size_ = size_;
+  old.blocks_ = blocks_;
+  old.release_store_tid_ = release_store_tid_;
+  old.release_store_reused_ = release_store_reused_;
+  for (unsigned i = 0; i < kDirtyTids; i++)
+    old.dirty_[i] = dirty_[i];
+  // Then, clear current object.
+  ResetImpl();
+  // Allocate brand new clock in the current object.
+  Resize(c, old.size_);
+  // Now copy state back into this object.
+  Iter old_iter(&old);
+  for (ClockElem &ce : *this) {
+    ce = *old_iter;
+    ++old_iter;
+  }
+  release_store_tid_ = old.release_store_tid_;
+  release_store_reused_ = old.release_store_reused_;
+  for (unsigned i = 0; i < kDirtyTids; i++)
+    dirty_[i] = old.dirty_[i];
+  // Drop reference to old and delete if necessary.
+  old.Reset(c);
+}
+
+// Can we cache this clock for future release operations?
+ALWAYS_INLINE bool SyncClock::Cachable() const {
+  if (size_ == 0)
+    return false;
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    if (dirty_[i].tid != kInvalidTid)
+      return false;
+  }
+  return atomic_load_relaxed(ref_ptr(tab_)) == 1;
+}
+
+// elem linearizes the two-level structure into a linear array.
+// Note: this is used only for one-time accesses; vector operations use
+// the iterator as it is much faster.
+ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const {
+  DCHECK_LT(tid, size_);
+  const uptr block = tid / ClockBlock::kClockCount;
+  DCHECK_LE(block, blocks_);
+  tid %= ClockBlock::kClockCount;
+  if (block == blocks_)
+    return tab_->clock[tid];
+  u32 idx = get_block(block);
+  ClockBlock *cb = ctx->clock_alloc.Map(idx);
+  return cb->clock[tid];
+}
+
+ALWAYS_INLINE uptr SyncClock::capacity() const {
+  if (size_ == 0)
+    return 0;
+  uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]);
+  // How many clock elements we can fit into the first level block.
+  // +1 for ref counter.
+  uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio;
+  return blocks_ * ClockBlock::kClockCount + top;
+}
+
+ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const {
+  DCHECK(size_);
+  DCHECK_LT(bi, blocks_);
+  return tab_->table[ClockBlock::kBlockIdx - bi];
+}
+
+ALWAYS_INLINE void SyncClock::append_block(u32 idx) {
+  uptr bi = blocks_++;
+  CHECK_EQ(get_block(bi), 0);
+  tab_->table[ClockBlock::kBlockIdx - bi] = idx;
+}
+
+// Used only by tests.
+u64 SyncClock::get(unsigned tid) const {
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    Dirty dirty = dirty_[i];
+    if (dirty.tid == tid)
+      return dirty.epoch;
+  }
+  return elem(tid).epoch;
+}
+
+// Used only by Iter test.
+u64 SyncClock::get_clean(unsigned tid) const {
+  return elem(tid).epoch;
+}
+
+void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
+  printf("clock=[");
+  for (uptr i = 0; i < size_; i++)
+    printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch);
+  printf("] reused=[");
+  for (uptr i = 0; i < size_; i++)
+    printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
+  printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]",
+      release_store_tid_, release_store_reused_,
+      dirty_[0].tid, dirty_[0].epoch,
+      dirty_[1].tid, dirty_[1].epoch);
+}
+
+void SyncClock::Iter::Next() {
+  // Finished with the current block, move on to the next one.
+  block_++;
+  if (block_ < parent_->blocks_) {
+    // Iterate over the next second level block.
+    u32 idx = parent_->get_block(block_);
+    ClockBlock *cb = ctx->clock_alloc.Map(idx);
+    pos_ = &cb->clock[0];
+    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
+        ClockBlock::kClockCount);
+    return;
+  }
+  if (block_ == parent_->blocks_ &&
+      parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) {
+    // Iterate over elements in the first level block.
+    pos_ = &parent_->tab_->clock[0];
+    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
+        ClockBlock::kClockCount);
+    return;
+  }
+  parent_ = nullptr;  // denotes end
+}
+}  // namespace __tsan
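
The header comment of the new file describes the simple vector-clock model that the optimized SyncClock/ThreadClock implementation must stay observably equivalent to (as verified by tsan_clock_test.cpp). The standalone sketch below is not part of this commit; it restates that model as compilable C++ so the fast paths above can be read against the naive element-wise-max semantics. All names in it (vc_sketch, VectorClock, kMaxThreads) are illustrative assumptions, not identifiers from the TSan runtime.

// Illustrative sketch only: a naive O(N) vector clock with the same observable
// acquire/release semantics as the model in tsan_clock.cpp's header comment.
// Names here (vc_sketch, VectorClock, kMaxThreads) are made up for illustration.
#include <algorithm>
#include <cstdint>

namespace vc_sketch {

constexpr int kMaxThreads = 4;  // tiny bound, purely for illustration

struct VectorClock {
  uint64_t clock[kMaxThreads] = {};

  // acquire: pull a sync object's clock into the thread's clock
  // (element-wise max), establishing happens-before edges.
  void Acquire(const VectorClock &src) {
    for (int i = 0; i < kMaxThreads; i++)
      clock[i] = std::max(clock[i], src.clock[i]);
  }

  // release: merge the thread's clock into the sync object's clock.
  void Release(VectorClock *dst) const {
    for (int i = 0; i < kMaxThreads; i++)
      dst->clock[i] = std::max(dst->clock[i], clock[i]);
  }

  // release-store: overwrite the sync object's clock with the thread's clock.
  void ReleaseStore(VectorClock *dst) const {
    for (int i = 0; i < kMaxThreads; i++)
      dst->clock[i] = clock[i];
  }
};

}  // namespace vc_sketch

int main() {
  using vc_sketch::VectorClock;
  VectorClock t0, t1, sync;  // two thread clocks and one sync-object clock
  t0.clock[0] = 5;           // thread 0 has reached epoch 5
  t0.ReleaseStore(&sync);    // e.g. a release-store on an atomic variable
  t1.Acquire(sync);          // a later acquire-load by thread 1
  // Thread 1 now knows about everything thread 0 did up to epoch 5, which is
  // the happens-before edge a race detector needs to record.
  return t1.clock[0] == 5 ? 0 : 1;
}

Every operation in this sketch scans all kMaxThreads slots. The point of the committed implementation is that the dirty_ entries, the per-element reused/"acquired" flags, and the cached release-store block let the common cases listed in the header comment (singleton atomics, thread-local and leaf mutexes, two-thread mutexes, single-writer atomics) skip that full O(N) scan.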