diff options
Diffstat (limited to 'contrib/llvm-project/compiler-rt/lib/tsan/benchmarks')
6 files changed, 372 insertions, 0 deletions
// diff --git a/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/func_entry_exit.cpp b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/func_entry_exit.cpp
// new file mode 100644
// index 000000000000..5e0ba1d6981b
// --- /dev/null
// +++ b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/func_entry_exit.cpp
// @@ -0,0 +1,20 @@

// Synthetic benchmark for __tsan_func_entry/exit (spends ~75% there).

void foo(bool x);

// Calls back into foo() when x is true. main() only ever passes false to
// foo(), so this function is not reached in a normal run.
__attribute__((noinline)) void bar(volatile bool x) {
  if (!x)
    return;
  foo(x);
}

// The function whose entry/exit is being measured; the branch into bar() is
// marked as unlikely.
__attribute__((noinline)) void foo(bool x) {
  if (__builtin_expect(x, false)) {
    bar(x);
  }
}

int main() {
  // The iteration count is read back from a volatile (presumably so that it
  // is not a compile-time constant for the loop below).
  volatile int repeat_source = 1 << 30;
  const int kRepeat = repeat_source;
  int i = 0;
  while (i < kRepeat) {
    foo(false);
    ++i;
  }
}

// diff --git a/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mini_bench_local.cpp b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mini_bench_local.cpp
// new file mode 100644
// index 000000000000..accdcb63878f
// --- /dev/null
// +++ b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mini_bench_local.cpp
// @@ -0,0 +1,49 @@

// Mini-benchmark for tsan: non-shared memory writes.
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

int len;
int *a;
const int kNumIter = 1000;

// Largest thread count accepted on the command line.
static const int kMaxThreads = 32;

// Writes 0..len-1 into the idx-th slice of `a`, i.e. the len elements that
// start at element idx * len.
__attribute__((noinline))
void Run(int idx) {
  // The slice offset is computed in size_t: idx * len does not fit in an int
  // once n_threads * len exceeds INT_MAX.
  const size_t base = (size_t)idx * (size_t)len;
  for (int i = 0, n = len; i < n; i++)
    a[base + i] = i;
}

// Thread entry point. `arg` carries the thread index as an integer stored in
// the pointer value. Runs kNumIter passes of Run() over this thread's slice
// and returns null.
void *Thread(void *arg) {
  long idx = (long)arg;
  printf("Thread %ld started\n", idx);
  for (int i = 0; i < kNumIter; i++)
    Run((int)idx);
  printf("Thread %ld done\n", idx);
  return 0;
}

// Usage: mini_bench_local [n_threads len]
// With any argument count other than two the defaults (4 threads, 1000000
// elements per thread) are used.
int main(int argc, char **argv) {
  int n_threads = 0;
  if (argc != 3) {
    n_threads = 4;
    len = 1000000;
  } else {
    n_threads = atoi(argv[1]);
    len = atoi(argv[2]);
    // Checked unconditionally rather than with assert(), so the limits still
    // hold when the benchmark is built with NDEBUG.
    if (n_threads <= 0 || n_threads > kMaxThreads || len < 0) {
      fprintf(stderr, "Usage: %s [n_threads(1..%d) len(>=0)]\n", argv[0],
              kMaxThreads);
      return 1;
    }
  }
  printf("%s: n_threads=%d len=%d iter=%d\n",
         __FILE__, n_threads, len, kNumIter);
  a = new int[(size_t)n_threads * (size_t)len];
  pthread_t *t = new pthread_t[n_threads];
  for (int i = 0; i < n_threads; i++) {
    // Go through intptr_t so the int -> pointer conversion is well-sized.
    const int status = pthread_create(&t[i], 0, Thread, (void *)(intptr_t)i);
    if (status != 0) {
      fprintf(stderr, "pthread_create failed: %s\n", strerror(status));
      exit(1);
    }
  }
  for (int i = 0; i < n_threads; i++) {
    pthread_join(t[i], 0);
  }
  delete [] t;
  delete [] a;
  return 0;
}

// diff --git a/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mini_bench_shared.cpp b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mini_bench_shared.cpp
// new file mode 100644
// index 000000000000..f9b9f42f78a4
// --- /dev/null
// +++ b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mini_bench_shared.cpp
// @@ -0,0 +1,51 @@

// Mini-benchmark for tsan: shared memory reads.
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>

int len;
int *a;
const int kNumIter = 1000;

// Largest thread count accepted on the command line.
static const int kMaxThreads = 32;

// Reads every element of the shared array `a` and aborts if a[i] != i.
// `idx` is not used: all threads read the same data.
__attribute__((noinline))
void Run(int idx) {
  for (int i = 0, n = len; i < n; i++)
    if (a[i] != i) abort();
}

// Thread entry point. `arg` carries the thread index as an integer stored in
// the pointer value. Runs kNumIter passes of Run() and returns null.
void *Thread(void *arg) {
  long idx = (long)arg;
  printf("Thread %ld started\n", idx);
  for (int i = 0; i < kNumIter; i++)
    Run((int)idx);
  printf("Thread %ld done\n", idx);
  return 0;
}

// Usage: mini_bench_shared [n_threads len]
// With any argument count other than two the defaults (4 threads, 1000000
// elements) are used.
int main(int argc, char **argv) {
  int n_threads = 0;
  if (argc != 3) {
    n_threads = 4;
    len = 1000000;
  } else {
    n_threads = atoi(argv[1]);
    len = atoi(argv[2]);
    // Checked unconditionally rather than with assert(), so the limits still
    // hold when the benchmark is built with NDEBUG.
    if (n_threads <= 0 || n_threads > kMaxThreads || len < 0) {
      fprintf(stderr, "Usage: %s [n_threads(1..%d) len(>=0)]\n", argv[0],
              kMaxThreads);
      return 1;
    }
  }
  printf("%s: n_threads=%d len=%d iter=%d\n",
         __FILE__, n_threads, len, kNumIter);
  a = new int[len];
  // Fill the array before any thread starts; the threads only read it.
  for (int i = 0, n = len; i < n; i++)
    a[i] = i;
  pthread_t *t = new pthread_t[n_threads];
  for (int i = 0; i < n_threads; i++) {
    // Go through intptr_t so the int -> pointer conversion is well-sized.
    const int status = pthread_create(&t[i], 0, Thread, (void *)(intptr_t)i);
    if (status != 0) {
      fprintf(stderr, "pthread_create failed: %s\n", strerror(status));
      exit(1);
    }
  }
  for (int i = 0; i < n_threads; i++) {
    pthread_join(t[i], 0);
  }
  delete [] t;
  delete [] a;
  return 0;
}

// diff --git a/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mop.cpp b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mop.cpp
// new file mode 100644
// index 000000000000..e87fab856969
// --- /dev/null
// +++ b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/mop.cpp
// @@ -0,0 +1,80 @@

// Synthetic benchmark for __tsan_read/write{1,2,4,8}.
// As compared to mini_bench_local/shared.cpp this benchmark passes through
// deduplication logic (ContainsSameAccess).
// First argument is access size (1, 2, 4, 8). Second optional arg switches
// from writes to reads.
+ +#include <pthread.h> +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <linux/futex.h> +#include <sys/syscall.h> +#include <sys/time.h> + +template<typename T, bool write> +void* thread(void *arg) { +  const int kSize = 2 << 10; +  static volatile long data[kSize]; +  static volatile long turn; +  const int kRepeat = 1 << 17; +  const int id = !!arg; +  for (int i = 0; i < kRepeat; i++) { +    for (;;) { +      int t = __atomic_load_n(&turn, __ATOMIC_ACQUIRE); +      if (t == id) +        break; +      syscall(SYS_futex, &turn, FUTEX_WAIT, t, 0, 0, 0); +    } +    for (int j = 0; j < kSize; j++) { +      if (write) { +        ((volatile T*)&data[j])[0] = 1; +        ((volatile T*)&data[j])[sizeof(T) == 8 ? 0 : 1] = 1; +      } else { +        T v0 = ((volatile T*)&data[j])[0]; +        T v1 = ((volatile T*)&data[j])[sizeof(T) == 8 ? 0 : 1]; +        (void)v0; +        (void)v1; +      } +    } +    __atomic_store_n(&turn, 1 - id, __ATOMIC_RELEASE); +    syscall(SYS_futex, &turn, FUTEX_WAKE, 0, 0, 0, 0); +  } +  return 0; +} + +template<typename T, bool write> +void test() { +  pthread_t th; +  pthread_create(&th, 0, thread<T, write>, (void*)1); +  thread<T, write>(0); +  pthread_join(th, 0);   +} + +template<bool write> +void testw(int size) { +  switch (size) { +  case 1: return test<char, write>(); +  case 2: return test<short, write>(); +  case 4: return test<int, write>(); +  case 8: return test<long long, write>(); +  } +} + +int main(int argc, char** argv) { +  int size = 8; +  bool write = true; +  if (argc > 1) { +    size = atoi(argv[1]); +    if (size != 1 && size != 2 && size != 4 && size != 8) +      size = 8; +  } +  if (argc > 2) +    write = false; +  printf("%s%d\n", write ? 
"write" : "read", size); +  if (write) +    testw<true>(size); +  else +    testw<false>(size); +  return 0; +} diff --git a/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/start_many_threads.cpp b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/start_many_threads.cpp new file mode 100644 index 000000000000..1e86fa6c502e --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/start_many_threads.cpp @@ -0,0 +1,52 @@ +// Mini-benchmark for creating a lot of threads. +// +// Some facts: +// a) clang -O1 takes <15ms to start N=500 threads, +//    consuming ~4MB more RAM than N=1. +// b) clang -O1 -ftsan takes ~26s to start N=500 threads, +//    eats 5GB more RAM than N=1 (which is somewhat expected but still a lot) +//    but then it consumes ~4GB of extra memory when the threads shut down! +//        (definitely not in the barrier_wait interceptor) +//    Also, it takes 26s to run with N=500 vs just 1.1s to run with N=1. +#include <assert.h> +#include <pthread.h> +#include <stdio.h> +#include <stdlib.h> +#include <unistd.h> + +pthread_barrier_t all_threads_ready; + +void* Thread(void *unused) { +  pthread_barrier_wait(&all_threads_ready); +  return 0; +} + +int main(int argc, char **argv) { +  int n_threads; +  if (argc == 1) { +    n_threads = 100; +  } else if (argc == 2) { +    n_threads = atoi(argv[1]); +  } else { +    printf("Usage: %s n_threads\n", argv[0]); +    return 1; +  } +  printf("%s: n_threads=%d\n", __FILE__, n_threads); + +  pthread_barrier_init(&all_threads_ready, NULL, n_threads + 1); + +  pthread_t *t = new pthread_t[n_threads]; +  for (int i = 0; i < n_threads; i++) { +    int status = pthread_create(&t[i], 0, Thread, (void*)i); +    assert(status == 0); +  } +  // sleep(5);  // FIXME: simplify measuring the memory usage. +  pthread_barrier_wait(&all_threads_ready); +  for (int i = 0; i < n_threads; i++) { +    pthread_join(t[i], 0); +  } +  // sleep(5);  // FIXME: simplify measuring the memory usage. 
+  delete [] t; + +  return 0; +} diff --git a/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/vts_many_threads_bench.cpp b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/vts_many_threads_bench.cpp new file mode 100644 index 000000000000..f1056e20c874 --- /dev/null +++ b/contrib/llvm-project/compiler-rt/lib/tsan/benchmarks/vts_many_threads_bench.cpp @@ -0,0 +1,120 @@ +// Mini-benchmark for tsan VTS worst case performance +// Idea: +// 1) Spawn M + N threads (M >> N) +//    We'll call the 'M' threads as 'garbage threads'. +// 2) Make sure all threads have created thus no TIDs were reused +// 3) Join the garbage threads +// 4) Do many sync operations on the remaining N threads +// +// It turns out that due to O(M+N) VTS complexity the (4) is much slower with +// when N is large. +// +// Some numbers: +// a) clang++ native O1 with n_iterations=200kk takes +//      5s regardless of M +//    clang++ tsanv2 O1 with n_iterations=20kk takes +//      23.5s with M=200 +//      11.5s with M=1 +//    i.e. tsanv2 is ~23x to ~47x slower than native, depends on M. +// b) g++ native O1 with n_iterations=200kk takes +//      5.5s regardless of M +//    g++ tsanv1 O1 with n_iterations=2kk takes +//      39.5s with M=200 +//      20.5s with M=1 +//    i.e. tsanv1 is ~370x to ~720x slower than native, depends on M. 

#include <assert.h>
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

// Thin wrapper around pthread_mutex_t, aligned to 64 bytes.
class __attribute__((aligned(64))) Mutex {
 public:
  Mutex()  { pthread_mutex_init(&m_, NULL); }
  ~Mutex() { pthread_mutex_destroy(&m_); }
  void Lock() { pthread_mutex_lock(&m_); }
  void Unlock() { pthread_mutex_unlock(&m_); }

 private:
  pthread_mutex_t m_;
};

const int kNumMutexes = 1024;
Mutex mutexes[kNumMutexes];

int n_threads, n_iterations;

pthread_barrier_t all_threads_ready, main_threads_ready;

// Limits on the thread counts accepted on the command line.
static const int kMaxThreads = 32;
static const int kMaxGarbageThreads = 16000;

// Exits with a message naming `what` if `status` is a non-zero pthread error
// code; does nothing otherwise.
static void CheckOk(int status, const char *what) {
  if (status == 0)
    return;
  fprintf(stderr, "%s failed: %s\n", what, strerror(status));
  exit(1);
}

// Entry point of a garbage thread: waits until every thread has been created
// and then returns null.
void* GarbageThread(void *unused) {
  pthread_barrier_wait(&all_threads_ready);
  return 0;
}

// Entry point of a main (measured) thread. `arg` carries the thread index as
// an integer stored in the pointer value. After both barriers it locks and
// unlocks n_iterations mutexes, walking the mutex array from a per-thread
// starting offset. Returns null.
void *Thread(void *arg) {
  long idx = (long)arg;
  pthread_barrier_wait(&all_threads_ready);

  // Wait for the main thread to join the garbage threads.
  pthread_barrier_wait(&main_threads_ready);

  printf("Thread %ld go!\n", idx);
  // Unsigned arithmetic: offset + i would overflow a signed int when
  // n_iterations is close to INT_MAX.
  const unsigned offset = (unsigned)(idx * kNumMutexes / n_threads);
  for (int i = 0; i < n_iterations; i++) {
    Mutex &m = mutexes[(offset + (unsigned)i) % kNumMutexes];
    m.Lock();
    m.Unlock();
  }
  printf("Thread %ld done\n", idx);
  return 0;
}

// Usage: vts_many_threads_bench [n_threads n_garbage_threads n_iterations]
// Without arguments the defaults are 2 threads, 200 garbage threads and
// 20000000 iterations.
int main(int argc, char **argv) {
  int n_garbage_threads;
  if (argc == 1) {
    n_threads = 2;
    n_garbage_threads = 200;
    n_iterations = 20000000;
  } else if (argc == 4) {
    n_threads = atoi(argv[1]);
    n_garbage_threads = atoi(argv[2]);
    n_iterations = atoi(argv[3]);
    // Checked unconditionally rather than with assert(), so the limits still
    // hold when the benchmark is built with NDEBUG.
    if (n_threads <= 0 || n_threads > kMaxThreads ||
        n_garbage_threads <= 0 || n_garbage_threads > kMaxGarbageThreads) {
      fprintf(stderr,
              "n_threads must be in 1..%d and n_garbage_threads in 1..%d\n",
              kMaxThreads, kMaxGarbageThreads);
      return 1;
    }
  } else {
    printf("Usage: %s n_threads n_garbage_threads n_iterations\n", argv[0]);
    return 1;
  }
  printf("%s: n_threads=%d n_garbage_threads=%d n_iterations=%d\n",
         __FILE__, n_threads, n_garbage_threads, n_iterations);

  // all_threads_ready: every garbage thread, every main thread, and main().
  CheckOk(pthread_barrier_init(&all_threads_ready, NULL,
                               n_garbage_threads + n_threads + 1),
          "pthread_barrier_init");
  // main_threads_ready: every main thread and main().
  CheckOk(pthread_barrier_init(&main_threads_ready, NULL, n_threads + 1),
          "pthread_barrier_init");

  pthread_t *t = new pthread_t[n_threads];
  {
    pthread_t *g_t = new pthread_t[n_garbage_threads];
    // A failed create must stop the run: the barriers are sized for the full
    // thread count and would otherwise never be released.
    for (int i = 0; i < n_garbage_threads; i++) {
      CheckOk(pthread_create(&g_t[i], 0, GarbageThread, NULL),
              "pthread_create");
    }
    for (int i = 0; i < n_threads; i++) {
      CheckOk(pthread_create(&t[i], 0, Thread, (void *)(intptr_t)i),
              "pthread_create");
    }
    pthread_barrier_wait(&all_threads_ready);
    printf("All threads started! Killing the garbage threads.\n");
    for (int i = 0; i < n_garbage_threads; i++) {
      pthread_join(g_t[i], 0);
    }
    delete [] g_t;
  }
  printf("Resuming the main threads.\n");
  pthread_barrier_wait(&main_threads_ready);


  for (int i = 0; i < n_threads; i++) {
    pthread_join(t[i], 0);
  }
  // Every thread has been joined, so no one can still be using the barriers.
  pthread_barrier_destroy(&main_threads_ready);
  pthread_barrier_destroy(&all_threads_ready);
  delete [] t;
  return 0;
}
