1// Mini-benchmark for tsan VTS worst case performance
2// Idea:
3// 1) Spawn M + N threads (M >> N)
4// We'll call the 'M' threads as 'garbage threads'.
5// 2) Make sure all threads have created thus no TIDs were reused
6// 3) Join the garbage threads
7// 4) Do many sync operations on the remaining N threads
8//
9// It turns out that due to O(M+N) VTS complexity the (4) is much slower with
10// when N is large.
11//
12// Some numbers:
13// a) clang++ native O1 with n_iterations=200kk takes
14// 5s regardless of M
15// clang++ tsanv2 O1 with n_iterations=20kk takes
16// 23.5s with M=200
17// 11.5s with M=1
18// i.e. tsanv2 is ~23x to ~47x slower than native, depends on M.
19// b) g++ native O1 with n_iterations=200kk takes
20// 5.5s regardless of M
21// g++ tsanv1 O1 with n_iterations=2kk takes
22// 39.5s with M=200
23// 20.5s with M=1
24// i.e. tsanv1 is ~370x to ~720x slower than native, depends on M.
25
26#include <assert.h>
27#include <pthread.h>
28#include <stdio.h>
29#include <stdlib.h>
30
31class __attribute__((aligned(64))) Mutex {
32 public:
33 Mutex() { pthread_mutex_init(mutex: &m_, NULL); }
34 ~Mutex() { pthread_mutex_destroy(mutex: &m_); }
35 void Lock() { pthread_mutex_lock(mutex: &m_); }
36 void Unlock() { pthread_mutex_unlock(mutex: &m_); }
37
38 private:
39 pthread_mutex_t m_;
40};
41
42const int kNumMutexes = 1024;
43Mutex mutexes[kNumMutexes];
44
45int n_threads, n_iterations;
46
47pthread_barrier_t all_threads_ready, main_threads_ready;
48
49void* GarbageThread(void *unused) {
50 pthread_barrier_wait(barrier: &all_threads_ready);
51 return 0;
52}
53
54void *Thread(void *arg) {
55 long idx = (long)arg;
56 pthread_barrier_wait(barrier: &all_threads_ready);
57
58 // Wait for the main thread to join the garbage threads.
59 pthread_barrier_wait(barrier: &main_threads_ready);
60
61 printf(format: "Thread %ld go!\n", idx);
62 int offset = idx * kNumMutexes / n_threads;
63 for (int i = 0; i < n_iterations; i++) {
64 mutexes[(offset + i) % kNumMutexes].Lock();
65 mutexes[(offset + i) % kNumMutexes].Unlock();
66 }
67 printf(format: "Thread %ld done\n", idx);
68 return 0;
69}
70
71int main(int argc, char **argv) {
72 int n_garbage_threads;
73 if (argc == 1) {
74 n_threads = 2;
75 n_garbage_threads = 200;
76 n_iterations = 20000000;
77 } else if (argc == 4) {
78 n_threads = atoi(nptr: argv[1]);
79 assert(n_threads > 0 && n_threads <= 32);
80 n_garbage_threads = atoi(nptr: argv[2]);
81 assert(n_garbage_threads > 0 && n_garbage_threads <= 16000);
82 n_iterations = atoi(nptr: argv[3]);
83 } else {
84 printf(format: "Usage: %s n_threads n_garbage_threads n_iterations\n", argv[0]);
85 return 1;
86 }
87 printf(format: "%s: n_threads=%d n_garbage_threads=%d n_iterations=%d\n",
88 __FILE__, n_threads, n_garbage_threads, n_iterations);
89
90 pthread_barrier_init(barrier: &all_threads_ready, NULL, count: n_garbage_threads + n_threads + 1);
91 pthread_barrier_init(barrier: &main_threads_ready, NULL, count: n_threads + 1);
92
93 pthread_t *t = new pthread_t[n_threads];
94 {
95 pthread_t *g_t = new pthread_t[n_garbage_threads];
96 for (int i = 0; i < n_garbage_threads; i++) {
97 int status = pthread_create(newthread: &g_t[i], attr: 0, start_routine: GarbageThread, NULL);
98 assert(status == 0);
99 }
100 for (int i = 0; i < n_threads; i++) {
101 int status = pthread_create(newthread: &t[i], attr: 0, start_routine: Thread, arg: (void*)i);
102 assert(status == 0);
103 }
104 pthread_barrier_wait(barrier: &all_threads_ready);
105 printf(format: "All threads started! Killing the garbage threads.\n");
106 for (int i = 0; i < n_garbage_threads; i++) {
107 pthread_join(th: g_t[i], thread_return: 0);
108 }
109 delete [] g_t;
110 }
111 printf(format: "Resuming the main threads.\n");
112 pthread_barrier_wait(barrier: &main_threads_ready);
113
114
115 for (int i = 0; i < n_threads; i++) {
116 pthread_join(th: t[i], thread_return: 0);
117 }
118 delete [] t;
119 return 0;
120}
121

source code of compiler-rt/lib/tsan/benchmarks/vts_many_threads_bench.cpp