Diffstat (limited to 'lib/tsan/rtl/tsan_clock.cpp')
-rw-r--r-- | lib/tsan/rtl/tsan_clock.cpp | 597
1 files changed, 597 insertions, 0 deletions
diff --git a/lib/tsan/rtl/tsan_clock.cpp b/lib/tsan/rtl/tsan_clock.cpp
new file mode 100644
index 0000000000000..4b7aa0653da6b
--- /dev/null
+++ b/lib/tsan/rtl/tsan_clock.cpp
@@ -0,0 +1,597 @@
+//===-- tsan_clock.cpp ----------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is a part of ThreadSanitizer (TSan), a race detector.
+//
+//===----------------------------------------------------------------------===//
+#include "tsan_clock.h"
+#include "tsan_rtl.h"
+#include "sanitizer_common/sanitizer_placement_new.h"
+
+// SyncClock and ThreadClock implement vector clocks for sync variables
+// (mutexes, atomic variables, file descriptors, etc) and threads, respectively.
+// ThreadClock contains a fixed-size vector clock for the maximum number of
+// threads. SyncClock contains a growable vector clock for the currently
+// necessary number of threads.
+// Together they implement a very simple model of operations, namely:
+//
+// void ThreadClock::acquire(const SyncClock *src) {
+//   for (int i = 0; i < kMaxThreads; i++)
+//     clock[i] = max(clock[i], src->clock[i]);
+// }
+//
+// void ThreadClock::release(SyncClock *dst) const {
+//   for (int i = 0; i < kMaxThreads; i++)
+//     dst->clock[i] = max(dst->clock[i], clock[i]);
+// }
+//
+// void ThreadClock::ReleaseStore(SyncClock *dst) const {
+//   for (int i = 0; i < kMaxThreads; i++)
+//     dst->clock[i] = clock[i];
+// }
+//
+// void ThreadClock::acq_rel(SyncClock *dst) {
+//   acquire(dst);
+//   release(dst);
+// }
+//
+// Conformance to this model is extensively verified in tsan_clock_test.cpp.
+// However, the implementation is significantly more complex. The complexity
+// allows us to implement important classes of use cases in O(1) instead of
+// O(N).
+//
+// The use cases are:
+// 1. Singleton/once atomic that has a single release-store operation followed
+//    by zillions of acquire-loads (the acquire-load is O(1)).
+// 2. Thread-local mutex (both lock and unlock can be O(1)).
+// 3. Leaf mutex (unlock is O(1)).
+// 4. A mutex shared by 2 threads (both lock and unlock can be O(1)).
+// 5. An atomic with a single writer (writes can be O(1)).
+// The implementation dynamically adapts to the workload. So if an atomic is in
+// read-only phase, these reads will be O(1); if it later switches to
+// read/write phase, the implementation will correctly handle that by
+// switching to O(N).
+//
+// Thread-safety note: all const operations on SyncClock's are conducted under
+// a shared lock; all non-const operations on SyncClock's are conducted under
+// an exclusive lock; ThreadClock's are private to respective threads and so
+// do not need any protection.
+//
+// Description of SyncClock state:
+// clk_ - variable size vector clock, low kClkBits hold timestamp,
+//   the remaining bits hold "acquired" flag (the actual value is thread's
+//   reused counter);
+//   if acquired == thr->reused_, then the respective thread has already
+//   acquired this clock (except possibly for dirty elements).
+// dirty_ - holds up to two indices in the vector clock that other threads
+//   need to acquire regardless of "acquired" flag value;
+// release_store_tid_ - denotes that the clock state is a result of
+//   release-store operation by the thread with release_store_tid_ index.
+// release_store_reused_ - reuse count of release_store_tid_.
+
+// We don't have ThreadState in these methods, so this is an ugly hack that
+// works only in C++.
+#if !SANITIZER_GO
+# define CPP_STAT_INC(typ) StatInc(cur_thread(), typ)
+#else
+# define CPP_STAT_INC(typ) (void)0
+#endif
+
+namespace __tsan {
+
+static atomic_uint32_t *ref_ptr(ClockBlock *cb) {
+  return reinterpret_cast<atomic_uint32_t *>(&cb->table[ClockBlock::kRefIdx]);
+}
+
+// Drop reference to the first level block idx.
+static void UnrefClockBlock(ClockCache *c, u32 idx, uptr blocks) {
+  ClockBlock *cb = ctx->clock_alloc.Map(idx);
+  atomic_uint32_t *ref = ref_ptr(cb);
+  u32 v = atomic_load(ref, memory_order_acquire);
+  for (;;) {
+    CHECK_GT(v, 0);
+    if (v == 1)
+      break;
+    if (atomic_compare_exchange_strong(ref, &v, v - 1, memory_order_acq_rel))
+      return;
+  }
+  // First level block owns second level blocks, so free them as well.
+  for (uptr i = 0; i < blocks; i++)
+    ctx->clock_alloc.Free(c, cb->table[ClockBlock::kBlockIdx - i]);
+  ctx->clock_alloc.Free(c, idx);
+}
+
+ThreadClock::ThreadClock(unsigned tid, unsigned reused)
+    : tid_(tid)
+    , reused_(reused + 1)  // 0 has special meaning
+    , cached_idx_()
+    , cached_size_()
+    , cached_blocks_() {
+  CHECK_LT(tid, kMaxTidInClock);
+  CHECK_EQ(reused_, ((u64)reused_ << kClkBits) >> kClkBits);
+  nclk_ = tid_ + 1;
+  last_acquire_ = 0;
+  internal_memset(clk_, 0, sizeof(clk_));
+}
+
+void ThreadClock::ResetCached(ClockCache *c) {
+  if (cached_idx_) {
+    UnrefClockBlock(c, cached_idx_, cached_blocks_);
+    cached_idx_ = 0;
+    cached_size_ = 0;
+    cached_blocks_ = 0;
+  }
+}
+
+void ThreadClock::acquire(ClockCache *c, SyncClock *src) {
+  DCHECK_LE(nclk_, kMaxTid);
+  DCHECK_LE(src->size_, kMaxTid);
+  CPP_STAT_INC(StatClockAcquire);
+
+  // Check if it's empty -> no need to do anything.
+  const uptr nclk = src->size_;
+  if (nclk == 0) {
+    CPP_STAT_INC(StatClockAcquireEmpty);
+    return;
+  }
+
+  bool acquired = false;
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    SyncClock::Dirty dirty = src->dirty_[i];
+    unsigned tid = dirty.tid;
+    if (tid != kInvalidTid) {
+      if (clk_[tid] < dirty.epoch) {
+        clk_[tid] = dirty.epoch;
+        acquired = true;
+      }
+    }
+  }
+
+  // Check if we've already acquired src after the last release operation on src.
+  if (tid_ >= nclk || src->elem(tid_).reused != reused_) {
+    // O(N) acquire.
+    CPP_STAT_INC(StatClockAcquireFull);
+    nclk_ = max(nclk_, nclk);
+    u64 *dst_pos = &clk_[0];
+    for (ClockElem &src_elem : *src) {
+      u64 epoch = src_elem.epoch;
+      if (*dst_pos < epoch) {
+        *dst_pos = epoch;
+        acquired = true;
+      }
+      dst_pos++;
+    }
+
+    // Remember that this thread has acquired this clock.
+    if (nclk > tid_)
+      src->elem(tid_).reused = reused_;
+  }
+
+  if (acquired) {
+    CPP_STAT_INC(StatClockAcquiredSomething);
+    last_acquire_ = clk_[tid_];
+    ResetCached(c);
+  }
+}
+
+void ThreadClock::release(ClockCache *c, SyncClock *dst) {
+  DCHECK_LE(nclk_, kMaxTid);
+  DCHECK_LE(dst->size_, kMaxTid);
+
+  if (dst->size_ == 0) {
+    // ReleaseStore will correctly set release_store_tid_,
+    // which can be important for future operations.
+    ReleaseStore(c, dst);
+    return;
+  }
+
+  CPP_STAT_INC(StatClockRelease);
+  // Check if we need to resize dst.
+  if (dst->size_ < nclk_)
+    dst->Resize(c, nclk_);
+
+  // Check if we had not acquired anything from other threads
+  // since the last release on dst. If so, we need to update
+  // only dst->elem(tid_).
+  if (dst->elem(tid_).epoch > last_acquire_) {
+    UpdateCurrentThread(c, dst);
+    if (dst->release_store_tid_ != tid_ ||
+        dst->release_store_reused_ != reused_)
+      dst->release_store_tid_ = kInvalidTid;
+    return;
+  }
+
+  // O(N) release.
+  CPP_STAT_INC(StatClockReleaseFull);
+  dst->Unshare(c);
+  // First, remember whether we've acquired dst.
+  bool acquired = IsAlreadyAcquired(dst);
+  if (acquired)
+    CPP_STAT_INC(StatClockReleaseAcquired);
+  // Update dst->clk_.
+  dst->FlushDirty();
+  uptr i = 0;
+  for (ClockElem &ce : *dst) {
+    ce.epoch = max(ce.epoch, clk_[i]);
+    ce.reused = 0;
+    i++;
+  }
+  // Clear 'acquired' flag in the remaining elements.
+  if (nclk_ < dst->size_)
+    CPP_STAT_INC(StatClockReleaseClearTail);
+  for (uptr i = nclk_; i < dst->size_; i++)
+    dst->elem(i).reused = 0;
+  dst->release_store_tid_ = kInvalidTid;
+  dst->release_store_reused_ = 0;
+  // If we've acquired dst, remember this fact,
+  // so that we don't need to acquire it on next acquire.
+  if (acquired)
+    dst->elem(tid_).reused = reused_;
+}
+
+void ThreadClock::ReleaseStore(ClockCache *c, SyncClock *dst) {
+  DCHECK_LE(nclk_, kMaxTid);
+  DCHECK_LE(dst->size_, kMaxTid);
+  CPP_STAT_INC(StatClockStore);
+
+  if (dst->size_ == 0 && cached_idx_ != 0) {
+    // Reuse the cached clock.
+    // Note: we could reuse/cache the cached clock in more cases:
+    // we could update the existing clock and cache it, or replace it with the
+    // currently cached clock and release the old one. And for a shared
+    // existing clock, we could replace it with the currently cached;
+    // or unshare, update and cache. But, for simplicity, we currently reuse
+    // cached clock only when the target clock is empty.
+    dst->tab_ = ctx->clock_alloc.Map(cached_idx_);
+    dst->tab_idx_ = cached_idx_;
+    dst->size_ = cached_size_;
+    dst->blocks_ = cached_blocks_;
+    CHECK_EQ(dst->dirty_[0].tid, kInvalidTid);
+    // The cached clock is shared (immutable),
+    // so this is where we store the current clock.
+    dst->dirty_[0].tid = tid_;
+    dst->dirty_[0].epoch = clk_[tid_];
+    dst->release_store_tid_ = tid_;
+    dst->release_store_reused_ = reused_;
+    // Remember that we don't need to acquire it in future.
+    dst->elem(tid_).reused = reused_;
+    // Grab a reference.
+    atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
+    return;
+  }
+
+  // Check if we need to resize dst.
+  if (dst->size_ < nclk_)
+    dst->Resize(c, nclk_);
+
+  if (dst->release_store_tid_ == tid_ &&
+      dst->release_store_reused_ == reused_ &&
+      dst->elem(tid_).epoch > last_acquire_) {
+    CPP_STAT_INC(StatClockStoreFast);
+    UpdateCurrentThread(c, dst);
+    return;
+  }
+
+  // O(N) release-store.
+  CPP_STAT_INC(StatClockStoreFull);
+  dst->Unshare(c);
+  // Note: dst can be larger than this ThreadClock.
+  // This is fine since clk_ beyond size is all zeros.
+  uptr i = 0;
+  for (ClockElem &ce : *dst) {
+    ce.epoch = clk_[i];
+    ce.reused = 0;
+    i++;
+  }
+  for (uptr i = 0; i < kDirtyTids; i++)
+    dst->dirty_[i].tid = kInvalidTid;
+  dst->release_store_tid_ = tid_;
+  dst->release_store_reused_ = reused_;
+  // Remember that we don't need to acquire it in future.
+  dst->elem(tid_).reused = reused_;
+
+  // If the resulting clock is cachable, cache it for future release operations.
+  // The clock is always cachable if we released to an empty sync object.
+  if (cached_idx_ == 0 && dst->Cachable()) {
+    // Grab a reference to the ClockBlock.
+    atomic_uint32_t *ref = ref_ptr(dst->tab_);
+    if (atomic_load(ref, memory_order_acquire) == 1)
+      atomic_store_relaxed(ref, 2);
+    else
+      atomic_fetch_add(ref_ptr(dst->tab_), 1, memory_order_relaxed);
+    cached_idx_ = dst->tab_idx_;
+    cached_size_ = dst->size_;
+    cached_blocks_ = dst->blocks_;
+  }
+}
+
+void ThreadClock::acq_rel(ClockCache *c, SyncClock *dst) {
+  CPP_STAT_INC(StatClockAcquireRelease);
+  acquire(c, dst);
+  ReleaseStore(c, dst);
+}
+
+// Updates only a single element related to the current thread in dst->clk_.
+void ThreadClock::UpdateCurrentThread(ClockCache *c, SyncClock *dst) const {
+  // Update the thread's time, but preserve 'acquired' flag.
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    SyncClock::Dirty *dirty = &dst->dirty_[i];
+    const unsigned tid = dirty->tid;
+    if (tid == tid_ || tid == kInvalidTid) {
+      CPP_STAT_INC(StatClockReleaseFast);
+      dirty->tid = tid_;
+      dirty->epoch = clk_[tid_];
+      return;
+    }
+  }
+  // Reset all 'acquired' flags, O(N).
+  // We are going to touch dst elements, so we need to unshare it.
+  dst->Unshare(c);
+  CPP_STAT_INC(StatClockReleaseSlow);
+  dst->elem(tid_).epoch = clk_[tid_];
+  for (uptr i = 0; i < dst->size_; i++)
+    dst->elem(i).reused = 0;
+  dst->FlushDirty();
+}
+
+// Checks whether the current thread has already acquired src.
+bool ThreadClock::IsAlreadyAcquired(const SyncClock *src) const {
+  if (src->elem(tid_).reused != reused_)
+    return false;
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    SyncClock::Dirty dirty = src->dirty_[i];
+    if (dirty.tid != kInvalidTid) {
+      if (clk_[dirty.tid] < dirty.epoch)
+        return false;
+    }
+  }
+  return true;
+}
+
+// Sets a single element in the vector clock.
+// This function is called only from weird places like AcquireGlobal.
+void ThreadClock::set(ClockCache *c, unsigned tid, u64 v) {
+  DCHECK_LT(tid, kMaxTid);
+  DCHECK_GE(v, clk_[tid]);
+  clk_[tid] = v;
+  if (nclk_ <= tid)
+    nclk_ = tid + 1;
+  last_acquire_ = clk_[tid_];
+  ResetCached(c);
+}
+
+void ThreadClock::DebugDump(int(*printf)(const char *s, ...)) {
+  printf("clock=[");
+  for (uptr i = 0; i < nclk_; i++)
+    printf("%s%llu", i == 0 ? "" : ",", clk_[i]);
+  printf("] tid=%u/%u last_acq=%llu", tid_, reused_, last_acquire_);
+}
+
+SyncClock::SyncClock() {
+  ResetImpl();
+}
+
+SyncClock::~SyncClock() {
+  // Reset must be called before dtor.
+  CHECK_EQ(size_, 0);
+  CHECK_EQ(blocks_, 0);
+  CHECK_EQ(tab_, 0);
+  CHECK_EQ(tab_idx_, 0);
+}
+
+void SyncClock::Reset(ClockCache *c) {
+  if (size_)
+    UnrefClockBlock(c, tab_idx_, blocks_);
+  ResetImpl();
+}
+
+void SyncClock::ResetImpl() {
+  tab_ = 0;
+  tab_idx_ = 0;
+  size_ = 0;
+  blocks_ = 0;
+  release_store_tid_ = kInvalidTid;
+  release_store_reused_ = 0;
+  for (uptr i = 0; i < kDirtyTids; i++)
+    dirty_[i].tid = kInvalidTid;
+}
+
+void SyncClock::Resize(ClockCache *c, uptr nclk) {
+  CPP_STAT_INC(StatClockReleaseResize);
+  Unshare(c);
+  if (nclk <= capacity()) {
+    // Memory is already allocated, just increase the size.
+    size_ = nclk;
+    return;
+  }
+  if (size_ == 0) {
+    // Grow from 0 to one-level table.
+    CHECK_EQ(size_, 0);
+    CHECK_EQ(blocks_, 0);
+    CHECK_EQ(tab_, 0);
+    CHECK_EQ(tab_idx_, 0);
+    tab_idx_ = ctx->clock_alloc.Alloc(c);
+    tab_ = ctx->clock_alloc.Map(tab_idx_);
+    internal_memset(tab_, 0, sizeof(*tab_));
+    atomic_store_relaxed(ref_ptr(tab_), 1);
+    size_ = 1;
+  } else if (size_ > blocks_ * ClockBlock::kClockCount) {
+    u32 idx = ctx->clock_alloc.Alloc(c);
+    ClockBlock *new_cb = ctx->clock_alloc.Map(idx);
+    uptr top = size_ - blocks_ * ClockBlock::kClockCount;
+    CHECK_LT(top, ClockBlock::kClockCount);
+    const uptr move = top * sizeof(tab_->clock[0]);
+    internal_memcpy(&new_cb->clock[0], tab_->clock, move);
+    internal_memset(&new_cb->clock[top], 0, sizeof(*new_cb) - move);
+    internal_memset(tab_->clock, 0, move);
+    append_block(idx);
+  }
+  // At this point we have the first level table allocated and all clock
+  // elements are evacuated from it to a second level block.
+  // Add second level tables as necessary.
+  while (nclk > capacity()) {
+    u32 idx = ctx->clock_alloc.Alloc(c);
+    ClockBlock *cb = ctx->clock_alloc.Map(idx);
+    internal_memset(cb, 0, sizeof(*cb));
+    append_block(idx);
+  }
+  size_ = nclk;
+}
+
+// Flushes all dirty elements into the main clock array.
+void SyncClock::FlushDirty() {
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    Dirty *dirty = &dirty_[i];
+    if (dirty->tid != kInvalidTid) {
+      CHECK_LT(dirty->tid, size_);
+      elem(dirty->tid).epoch = dirty->epoch;
+      dirty->tid = kInvalidTid;
+    }
+  }
+}
+
+bool SyncClock::IsShared() const {
+  if (size_ == 0)
+    return false;
+  atomic_uint32_t *ref = ref_ptr(tab_);
+  u32 v = atomic_load(ref, memory_order_acquire);
+  CHECK_GT(v, 0);
+  return v > 1;
+}
+
+// Unshares the current clock if it's shared.
+// Shared clocks are immutable, so they need to be unshared before any updates.
+// Note: this does not apply to dirty entries as they are not shared.
+void SyncClock::Unshare(ClockCache *c) {
+  if (!IsShared())
+    return;
+  // First, copy current state into old.
+  SyncClock old;
+  old.tab_ = tab_;
+  old.tab_idx_ = tab_idx_;
+  old.size_ = size_;
+  old.blocks_ = blocks_;
+  old.release_store_tid_ = release_store_tid_;
+  old.release_store_reused_ = release_store_reused_;
+  for (unsigned i = 0; i < kDirtyTids; i++)
+    old.dirty_[i] = dirty_[i];
+  // Then, clear current object.
+  ResetImpl();
+  // Allocate brand new clock in the current object.
+  Resize(c, old.size_);
+  // Now copy state back into this object.
+  Iter old_iter(&old);
+  for (ClockElem &ce : *this) {
+    ce = *old_iter;
+    ++old_iter;
+  }
+  release_store_tid_ = old.release_store_tid_;
+  release_store_reused_ = old.release_store_reused_;
+  for (unsigned i = 0; i < kDirtyTids; i++)
+    dirty_[i] = old.dirty_[i];
+  // Drop reference to old and delete if necessary.
+  old.Reset(c);
+}
+
+// Can we cache this clock for future release operations?
+ALWAYS_INLINE bool SyncClock::Cachable() const {
+  if (size_ == 0)
+    return false;
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    if (dirty_[i].tid != kInvalidTid)
+      return false;
+  }
+  return atomic_load_relaxed(ref_ptr(tab_)) == 1;
+}
+
+// elem linearizes the two-level structure into a linear array.
+// Note: this is used only for one-time accesses; vector operations use
+// the iterator as it is much faster.
+ALWAYS_INLINE ClockElem &SyncClock::elem(unsigned tid) const {
+  DCHECK_LT(tid, size_);
+  const uptr block = tid / ClockBlock::kClockCount;
+  DCHECK_LE(block, blocks_);
+  tid %= ClockBlock::kClockCount;
+  if (block == blocks_)
+    return tab_->clock[tid];
+  u32 idx = get_block(block);
+  ClockBlock *cb = ctx->clock_alloc.Map(idx);
+  return cb->clock[tid];
+}
+
+ALWAYS_INLINE uptr SyncClock::capacity() const {
+  if (size_ == 0)
+    return 0;
+  uptr ratio = sizeof(ClockBlock::clock[0]) / sizeof(ClockBlock::table[0]);
+  // How many clock elements we can fit into the first level block.
+  // +1 for ref counter.
+  uptr top = ClockBlock::kClockCount - RoundUpTo(blocks_ + 1, ratio) / ratio;
+  return blocks_ * ClockBlock::kClockCount + top;
+}
+
+ALWAYS_INLINE u32 SyncClock::get_block(uptr bi) const {
+  DCHECK(size_);
+  DCHECK_LT(bi, blocks_);
+  return tab_->table[ClockBlock::kBlockIdx - bi];
+}
+
+ALWAYS_INLINE void SyncClock::append_block(u32 idx) {
+  uptr bi = blocks_++;
+  CHECK_EQ(get_block(bi), 0);
+  tab_->table[ClockBlock::kBlockIdx - bi] = idx;
+}
+
+// Used only by tests.
+u64 SyncClock::get(unsigned tid) const {
+  for (unsigned i = 0; i < kDirtyTids; i++) {
+    Dirty dirty = dirty_[i];
+    if (dirty.tid == tid)
+      return dirty.epoch;
+  }
+  return elem(tid).epoch;
+}
+
+// Used only by Iter test.
+u64 SyncClock::get_clean(unsigned tid) const {
+  return elem(tid).epoch;
+}
+
+void SyncClock::DebugDump(int(*printf)(const char *s, ...)) {
+  printf("clock=[");
+  for (uptr i = 0; i < size_; i++)
+    printf("%s%llu", i == 0 ? "" : ",", elem(i).epoch);
+  printf("] reused=[");
+  for (uptr i = 0; i < size_; i++)
+    printf("%s%llu", i == 0 ? "" : ",", elem(i).reused);
+  printf("] release_store_tid=%d/%d dirty_tids=%d[%llu]/%d[%llu]",
+      release_store_tid_, release_store_reused_,
+      dirty_[0].tid, dirty_[0].epoch,
+      dirty_[1].tid, dirty_[1].epoch);
+}
+
+void SyncClock::Iter::Next() {
+  // Finished with the current block, move on to the next one.
+  block_++;
+  if (block_ < parent_->blocks_) {
+    // Iterate over the next second level block.
+    u32 idx = parent_->get_block(block_);
+    ClockBlock *cb = ctx->clock_alloc.Map(idx);
+    pos_ = &cb->clock[0];
+    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
+        ClockBlock::kClockCount);
+    return;
+  }
+  if (block_ == parent_->blocks_ &&
+      parent_->size_ > parent_->blocks_ * ClockBlock::kClockCount) {
+    // Iterate over elements in the first level block.
+    pos_ = &parent_->tab_->clock[0];
+    end_ = pos_ + min(parent_->size_ - block_ * ClockBlock::kClockCount,
+        ClockBlock::kClockCount);
+    return;
+  }
+  parent_ = nullptr;  // denotes end
+}
+}  // namespace __tsan
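
The header comment of the new file describes the simple vector-clock model that the optimized SyncClock/ThreadClock implementation must stay observably equivalent to (as verified by tsan_clock_test.cpp). The standalone sketch below is not part of this commit; it restates that model as compilable C++ so the fast paths above can be read against the naive element-wise-max semantics. All names in it (vc_sketch, VectorClock, kMaxThreads) are illustrative assumptions, not identifiers from the TSan runtime.

// Illustrative sketch only: a naive O(N) vector clock with the same observable
// acquire/release semantics as the model in tsan_clock.cpp's header comment.
// Names here (vc_sketch, VectorClock, kMaxThreads) are made up for illustration.
#include <algorithm>
#include <cstdint>

namespace vc_sketch {

constexpr int kMaxThreads = 4;  // tiny bound, purely for illustration

struct VectorClock {
  uint64_t clock[kMaxThreads] = {};

  // acquire: pull a sync object's clock into the thread's clock
  // (element-wise max), establishing happens-before edges.
  void Acquire(const VectorClock &src) {
    for (int i = 0; i < kMaxThreads; i++)
      clock[i] = std::max(clock[i], src.clock[i]);
  }

  // release: merge the thread's clock into the sync object's clock.
  void Release(VectorClock *dst) const {
    for (int i = 0; i < kMaxThreads; i++)
      dst->clock[i] = std::max(dst->clock[i], clock[i]);
  }

  // release-store: overwrite the sync object's clock with the thread's clock.
  void ReleaseStore(VectorClock *dst) const {
    for (int i = 0; i < kMaxThreads; i++)
      dst->clock[i] = clock[i];
  }
};

}  // namespace vc_sketch

int main() {
  using vc_sketch::VectorClock;
  VectorClock t0, t1, sync;  // two thread clocks and one sync-object clock
  t0.clock[0] = 5;           // thread 0 has reached epoch 5
  t0.ReleaseStore(&sync);    // e.g. a release-store on an atomic variable
  t1.Acquire(sync);          // a later acquire-load by thread 1
  // Thread 1 now knows about everything thread 0 did up to epoch 5, which is
  // the happens-before edge a race detector needs to record.
  return t1.clock[0] == 5 ? 0 : 1;
}

Every operation in this sketch scans all kMaxThreads slots. The point of the committed implementation is that the dirty_ entries, the per-element reused/"acquired" flags, and the cached release-store block let the common cases listed in the header comment (singleton atomics, thread-local and leaf mutexes, two-thread mutexes, single-writer atomics) skip that full O(N) scan.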