6 files changed, 372 insertions, 0 deletions
diff --git a/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/func_entry_exit.cpp b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/func_entry_exit.cpp
new file mode 100644
index 000000000000..5e0ba1d6981b
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/func_entry_exit.cpp
@@ -0,0 +1,20 @@
+// Synthetic benchmark for __tsan_func_entry/exit (spends ~75% there).
+
+void foo(bool x);  // defined at the bottom of this file; declared here so main() can call it
+
+int main() {
+  volatile int kRepeat1 = 1 << 30;  // volatile: the iteration count has to be loaded at run time
+  const int kRepeat = kRepeat1;  // read the volatile once so the loop bound is a plain int
+  for (int i = 0; i < kRepeat; i++)
+    foo(false);  // with x == false foo() returns without calling bar(): just a call and a return
+}
+
+__attribute__((noinline)) void bar(volatile bool x) {  // noinline: stays a separate out-of-line function
+  if (x)  // calls back into foo() only when x is true; main() in this file only ever passes false to foo()
+    foo(x);
+}
+
+__attribute__((noinline)) void foo(bool x) {  // noinline: every call from main() stays a real function call
+  if (__builtin_expect(x, false))  // tells the compiler that x is expected to be false
+    bar(x);  // mutual recursion with bar(); only taken when x is true
+}
diff --git a/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mini_bench_local.cpp b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mini_bench_local.cpp
new file mode 100644
index 000000000000..accdcb63878f
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mini_bench_local.cpp
@@ -0,0 +1,49 @@
+// Mini-benchmark for tsan: non-shared memory writes.
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+int len;  // number of ints each thread writes per Run() call; set in main()
+int *a;  // n_threads * len ints allocated in main(); Run(idx) writes only the idx-th slice
+const int kNumIter = 1000;  // how many times each thread calls Run()
+
+__attribute__((noinline))  // keep Run() out of line
+void Run(int idx) {  // writes 0..len-1 into the slice of a that belongs to thread idx
+  for (int i = 0, n = len; i < n; i++)  // len is copied into n once per call
+    a[i + idx * n] = i;  // index stays inside [idx*len, idx*len + len), so threads never share an element
+}
+
+void *Thread(void *arg) {  // pthread entry point: calls Run(idx) kNumIter times
+  long idx = (long)arg;  // main() passes the thread index through the void* argument
+  printf("Thread %ld started\n", idx);
+  for (int i = 0; i < kNumIter; i++)
+    Run(idx);  // idx is narrowed from long to int at this call
+  printf("Thread %ld done\n", idx);
+  return 0;
+}
+
+// Usage: prog [n_threads len]; any other argument count uses 4 threads and len=1000000.
+int main(int argc, char **argv) {
+  int n_threads = 4;
+  len = 1000000;
+  if (argc == 3) {
+    n_threads = atoi(argv[1]);
+    assert(n_threads > 0 && n_threads <= 32);
+    len = atoi(argv[2]);
+    // Run() indexes a[] with int arithmetic, so n_threads * len must fit in an int.
+    assert(len >= 0 && len <= __INT_MAX__ / n_threads);
+  }
+  printf("%s: n_threads=%d len=%d iter=%d\n",
+         __FILE__, n_threads, len, kNumIter);
+  a = new int[n_threads * len];
+  pthread_t *t = new pthread_t[n_threads];
+  for (int i = 0; i < n_threads; i++) {
+    pthread_create(&t[i], 0, Thread, (void*)(long)i);  // via long: pointer-width, and Thread() casts back to long
+  }
+  for (int i = 0; i < n_threads; i++)
+    pthread_join(t[i], 0);
+  delete [] t;
+  delete [] a;
+  return 0;
+}
diff --git a/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mini_bench_shared.cpp b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mini_bench_shared.cpp
new file mode 100644
index 000000000000..f9b9f42f78a4
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mini_bench_shared.cpp
@@ -0,0 +1,51 @@
+// Mini-benchmark for tsan: shared memory reads.
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+
+int len;  // number of ints in the shared array; set in main()
+int *a;  // shared array of len ints, filled with a[i] = i by main() and read by every thread
+const int kNumIter = 1000;  // how many times each thread calls Run()
+
+__attribute__((noinline))  // keep Run() out of line
+void Run(int idx) {  // reads every element of the shared array a; idx is not used
+  for (int i = 0, n = len; i < n; i++)  // len is copied into n once per call
+    if (a[i] != i) abort();  // main() stores a[i] = i, so abort() only fires if a value is not what was written
+}
+
+void *Thread(void *arg) {  // pthread entry point: calls Run(idx) kNumIter times
+  long idx = (long)arg;  // main() passes the thread index through the void* argument
+  printf("Thread %ld started\n", idx);
+  for (int i = 0; i < kNumIter; i++)
+    Run(idx);  // idx is narrowed from long to int at this call
+  printf("Thread %ld done\n", idx);
+  return 0;
+}
+
+// Usage: prog [n_threads len]; any other argument count uses 4 threads and len=1000000.
+int main(int argc, char **argv) {
+  int n_threads = 4;
+  len = 1000000;
+  if (argc == 3) {
+    n_threads = atoi(argv[1]);
+    assert(n_threads > 0 && n_threads <= 32);
+    len = atoi(argv[2]);
+    assert(len >= 0);  // a negative length cannot be allocated by new[] below
+  }
+  printf("%s: n_threads=%d len=%d iter=%d\n",
+         __FILE__, n_threads, len, kNumIter);
+  a = new int[len];
+  for (int i = 0, n = len; i < n; i++)
+    a[i] = i;  // Run() aborts unless a[i] == i, so fill the array before any thread starts
+  pthread_t *t = new pthread_t[n_threads];
+  for (int i = 0; i < n_threads; i++) {
+    pthread_create(&t[i], 0, Thread, (void*)(long)i);  // via long: pointer-width, and Thread() casts back to long
+  }
+  for (int i = 0; i < n_threads; i++) {
+    pthread_join(t[i], 0);
+  }
+  delete [] t;
+  delete [] a;
+  return 0;
+}
diff --git a/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mop.cpp b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mop.cpp
new file mode 100644
index 000000000000..e87fab856969
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mop.cpp
@@ -0,0 +1,80 @@
+// Synthetic benchmark for __tsan_read/write{1,2,4,8}.
+// As compared to mini_bench_local/shared.cpp this benchmark passes through
+// deduplication logic (ContainsSameAccess).
+// First argument is access size (1, 2, 4, 8). Second optional arg switches
+// from writes to reads.
+
+#include <pthread.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <unistd.h>
+#include <linux/futex.h>
+#include <sys/syscall.h>
+#include <sys/time.h>
+
+template<typename T, bool write>
+void* thread(void *arg) {  // two threads (id 0 and 1) take turns doing T-sized accesses over data[]
+  const int kSize = 2 << 10;
+  static volatile long data[kSize];
+  static volatile int turn;  // futex word: the kernel compares exactly 32 bits, so this must be int, not long
+  const int kRepeat = 1 << 17;
+  const int id = !!arg;  // null arg -> id 0, non-null arg -> id 1
+  for (int i = 0; i < kRepeat; i++) {
+    for (;;) {  // sleep until turn equals this thread's id
+      int t = __atomic_load_n(&turn, __ATOMIC_ACQUIRE);
+      if (t == id)
+        break;
+      syscall(SYS_futex, &turn, FUTEX_WAIT, t, NULL, NULL, 0);  // NULL timeout: block until woken or turn != t
+    }
+    for (int j = 0; j < kSize; j++) {
+      if (write) {
+        ((volatile T*)&data[j])[0] = 1;
+        ((volatile T*)&data[j])[sizeof(T) == 8 ? 0 : 1] = 1;  // same slot again for 8-byte T, the next T otherwise
+      } else {
+        T v0 = ((volatile T*)&data[j])[0];
+        T v1 = ((volatile T*)&data[j])[sizeof(T) == 8 ? 0 : 1];
+        (void)v0;  // the loads are volatile; the values themselves are not needed
+        (void)v1;
+      }
+    }
+    __atomic_store_n(&turn, 1 - id, __ATOMIC_RELEASE);  // hand the turn to the other thread
+    syscall(SYS_futex, &turn, FUTEX_WAKE, 1, NULL, NULL, 0);  // wake at most one waiter; there is only one peer
+  }
+  return 0;
+}
+
+template<typename T, bool write>
+void test() {  // runs thread<T, write> on two threads: a new pthread and the calling thread
+  pthread_t th;
+  pthread_create(&th, 0, thread<T, write>, (void*)1);  // non-null arg: the new thread runs as id 1
+  thread<T, write>(0);  // null arg: the calling thread runs as id 0
+  pthread_join(th, 0);  // wait for the other thread before returning
+}
+
+template<bool write>
+void testw(int size) {  // maps a runtime access size in bytes to the matching test<T, write>()
+  switch (size) {  // no default: any other size does nothing; main() only passes 1, 2, 4 or 8
+  case 1: return test<char, write>();
+  case 2: return test<short, write>();
+  case 4: return test<int, write>();
+  case 8: return test<long long, write>();
+  }
+}
+
+int main(int argc, char** argv) {
+  int size = 8;  // default access size in bytes
+  bool write = true;  // default mode is writes
+  if (argc > 1) {
+    size = atoi(argv[1]);
+    if (size != 1 && size != 2 && size != 4 && size != 8)
+      size = 8;  // unsupported or unparsable sizes fall back to 8
+  }
+  if (argc > 2)
+    write = false;  // any second argument, whatever its value, selects reads
+  printf("%s%d\n", write ? "write" : "read", size);
+  if (write)  // write is a template parameter of testw, so the runtime flag is dispatched by hand
+    testw<true>(size);
+  else
+    testw<false>(size);
+  return 0;
+}
diff --git a/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/start_many_threads.cpp b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/start_many_threads.cpp
new file mode 100644
index 000000000000..1e86fa6c502e
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/start_many_threads.cpp
@@ -0,0 +1,52 @@
+// Mini-benchmark for creating a lot of threads.
+//
+// Some facts:
+// a) clang -O1 takes <15ms to start N=500 threads,
+//    consuming ~4MB more RAM than N=1.
+// b) clang -O1 -ftsan takes ~26s to start N=500 threads,
+//    eats 5GB more RAM than N=1 (which is somewhat expected but still a lot)
+//    but then it consumes ~4GB of extra memory when the threads shut down!
+//        (definitely not in the barrier_wait interceptor)
+//    Also, it takes 26s to run with N=500 vs just 1.1s to run with N=1.
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+
+pthread_barrier_t all_threads_ready;  // initialized in main() for n_threads + 1 participants
+
+void* Thread(void *unused) {  // worker: blocks on the shared barrier, then exits
+  pthread_barrier_wait(&all_threads_ready);  // released once main() and every worker have arrived
+  return 0;
+}
+
+int main(int argc, char **argv) {  // usage: prog [n_threads]; defaults to 100 threads
+  int n_threads;
+  if (argc == 1) {
+    n_threads = 100;
+  } else if (argc == 2) {
+    n_threads = atoi(argv[1]);
+    assert(n_threads >= 0);  // a negative count would break the barrier count and new[] below
+  } else {
+    printf("Usage: %s n_threads\n", argv[0]);
+    return 1;
+  }
+  printf("%s: n_threads=%d\n", __FILE__, n_threads);
+  pthread_barrier_init(&all_threads_ready, NULL, n_threads + 1);  // +1: main() waits on it too
+
+  pthread_t *t = new pthread_t[n_threads];
+  for (int i = 0; i < n_threads; i++) {
+    int status = pthread_create(&t[i], 0, Thread, (void*)(long)i);  // via long so the cast is pointer-width
+    assert(status == 0);
+  }
+  // sleep(5);  // FIXME: simplify measuring the memory usage.
+  pthread_barrier_wait(&all_threads_ready);
+  for (int i = 0; i < n_threads; i++) {
+    pthread_join(t[i], 0);
+  }
+  // sleep(5);  // FIXME: simplify measuring the memory usage.
+  delete [] t;
+  pthread_barrier_destroy(&all_threads_ready);  // every waiter has been joined, so nobody is still inside it
+  return 0;
+}
diff --git a/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/vts_many_threads_bench.cpp b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/vts_many_threads_bench.cpp
new file mode 100644
index 000000000000..f1056e20c874
--- /dev/null
+++ b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/vts_many_threads_bench.cpp
@@ -0,0 +1,120 @@
+// Mini-benchmark for tsan VTS worst case performance
+// Idea:
+// 1) Spawn M + N threads (M >> N)
+//    We'll call the 'M' threads as 'garbage threads'.
+// 2) Make sure all threads have been created, so no TIDs were reused
+// 3) Join the garbage threads
+// 4) Do many sync operations on the remaining N threads
+//
+// It turns out that due to O(M+N) VTS complexity, (4) is much slower
+// when M is large.
+//
+// Some numbers:
+// a) clang++ native O1 with n_iterations=200kk takes
+//      5s regardless of M
+//    clang++ tsanv2 O1 with n_iterations=20kk takes
+//      23.5s with M=200
+//      11.5s with M=1
+//    i.e. tsanv2 is ~23x to ~47x slower than native, depends on M.
+// b) g++ native O1 with n_iterations=200kk takes
+//      5.5s regardless of M
+//    g++ tsanv1 O1 with n_iterations=2kk takes
+//      39.5s with M=200
+//      20.5s with M=1
+//    i.e. tsanv1 is ~370x to ~720x slower than native, depends on M.
+
+#include <assert.h>
+#include <pthread.h>
+#include <stdio.h>
+#include <stdlib.h>
+
+class __attribute__((aligned(64))) Mutex {  // thin wrapper around pthread_mutex_t, aligned to 64 bytes
+ public:
+  Mutex()  { pthread_mutex_init(&m_, NULL); }  // NULL attr: default mutex attributes
+  ~Mutex() { pthread_mutex_destroy(&m_); }
+  void Lock() { pthread_mutex_lock(&m_); }  // the pthread return code is ignored
+  void Unlock() { pthread_mutex_unlock(&m_); }  // the pthread return code is ignored
+
+ private:
+  pthread_mutex_t m_;  // NOTE(review): the 64-byte alignment presumably keeps each mutex on its own cache line; confirm
+};
+
+const int kNumMutexes = 1024;  // size of the mutex pool that Thread() cycles through
+Mutex mutexes[kNumMutexes];  // global array, so every Mutex is constructed before main() runs
+
+int n_threads, n_iterations;  // both set in main() before any thread is created
+
+pthread_barrier_t all_threads_ready, main_threads_ready;  // both initialized in main()
+
+void* GarbageThread(void *unused) {  // garbage thread: waits until all threads exist, then exits
+  pthread_barrier_wait(&all_threads_ready);  // shared with the worker threads and main()
+  return 0;
+}
+
+void *Thread(void *arg) {  // worker: after both barriers, locks and unlocks mutexes n_iterations times
+  long idx = (long)arg;  // main() passes the thread index through the void* argument
+  pthread_barrier_wait(&all_threads_ready);
+
+  // Wait for the main thread to join the garbage threads.
+  pthread_barrier_wait(&main_threads_ready);
+
+  printf("Thread %ld go!\n", idx);
+  int offset = idx * kNumMutexes / n_threads;  // starting mutex, spread evenly over the pool by thread index
+  for (int i = 0; i < n_iterations; i++) {
+    mutexes[(offset + i) % kNumMutexes].Lock();  // walk the pool round-robin from offset
+    mutexes[(offset + i) % kNumMutexes].Unlock();  // same mutex that was just locked
+  }
+  printf("Thread %ld done\n", idx);
+  return 0;
+}
+
+int main(int argc, char **argv) {  // usage: prog [n_threads n_garbage_threads n_iterations]
+  int n_garbage_threads;
+  if (argc == 1) {
+    n_threads = 2;
+    n_garbage_threads = 200;
+    n_iterations = 20000000;
+  } else if (argc == 4) {
+    n_threads = atoi(argv[1]);
+    assert(n_threads > 0 && n_threads <= 32);
+    n_garbage_threads = atoi(argv[2]);
+    assert(n_garbage_threads > 0 && n_garbage_threads <= 16000);
+    n_iterations = atoi(argv[3]);
+  } else {
+    printf("Usage: %s n_threads n_garbage_threads n_iterations\n", argv[0]);
+    return 1;
+  }
+  printf("%s: n_threads=%d n_garbage_threads=%d n_iterations=%d\n",
+         __FILE__, n_threads, n_garbage_threads, n_iterations);
+
+  pthread_barrier_init(&all_threads_ready, NULL, n_garbage_threads + n_threads + 1);  // +1 for main()
+  pthread_barrier_init(&main_threads_ready, NULL, n_threads + 1);  // workers plus main()
+
+  pthread_t *t = new pthread_t[n_threads];
+  {
+    pthread_t *g_t = new pthread_t[n_garbage_threads];
+    for (int i = 0; i < n_garbage_threads; i++) {
+      int status = pthread_create(&g_t[i], 0, GarbageThread, NULL);
+      assert(status == 0);
+    }
+    for (int i = 0; i < n_threads; i++) {
+      int status = pthread_create(&t[i], 0, Thread, (void*)(long)i);  // via long so the cast is pointer-width
+      assert(status == 0);
+    }
+    pthread_barrier_wait(&all_threads_ready);  // returns only once every thread has been created and arrived
+    printf("All threads started! Killing the garbage threads.\n");
+    for (int i = 0; i < n_garbage_threads; i++) {
+      pthread_join(g_t[i], 0);
+    }
+    delete [] g_t;
+  }
+  printf("Resuming the main threads.\n");
+  pthread_barrier_wait(&main_threads_ready);
+  for (int i = 0; i < n_threads; i++) {
+    pthread_join(t[i], 0);
+  }
+  pthread_barrier_destroy(&all_threads_ready);  // every thread that used the barriers has been joined
+  pthread_barrier_destroy(&main_threads_ready);
+  delete [] t;
+  return 0;
+}