// SPDX-License-Identifier: GPL-2.0
#ifdef HAVE_EVENTFD_SUPPORT
/*
 * Copyright (C) 2018 Davidlohr Bueso.
 *
 * This program benchmarks concurrent epoll_wait(2) monitoring multiple
 * file descriptors under one of two load balancing models. The first,
 * and default, is the single/combined queueing (which refers to a single
 * epoll instance for N worker threads):
 *
 *                          |---> [worker A]
 *                          |---> [worker B]
 * [combined queue]  .---> [worker C]
 *                          |---> [worker D]
 *                          |---> [worker E]
 *
 * While the second model, enabled via the --multiq option, uses multiple
 * queueing (which refers to one epoll instance per worker). For example,
 * short lived tcp connections in a high throughput httpd server will
 * distribute the accept()'ing connections across CPUs. In this case each
 * worker does a limited amount of processing.
 *
 *             [queue A]  ---> [worker]
 *             [queue B]  ---> [worker]
 *             [queue C]  ---> [worker]
 *             [queue D]  ---> [worker]
 *             [queue E]  ---> [worker]
 *
 * Naturally, the single queue will enforce more concurrency on the epoll
 * instance, and can therefore scale poorly compared to multiple queues.
 * However, this benchmark reports raw data and must be taken with a grain
 * of salt when choosing how to make use of sys_epoll.
 *
 * Each thread has a number of private, nonblocking file descriptors,
 * referred to as fdmap. A writer thread will constantly be writing to
 * the fdmaps of all threads, minimizing each thread's chances of
 * epoll_wait not finding any ready read events and blocking, as this
 * is not what we want to stress. The size of the fdmap can be adjusted
 * by the user; enlarging the value will increase the chances of
 * epoll_wait(2) blocking as the linear writer thread will take "longer",
 * at least at a high level.
 *
 * Note that because fds are private to each thread, this workload does
 * not stress scenarios where multiple tasks are awoken per ready IO;
 * i.e. EPOLLEXCLUSIVE semantics.
 *
 * The end result/metric is throughput: number of ops/second where an
 * operation consists of:
 *
 *    epoll_wait(2) + [others]
 *
 * ... where [others] is the cost of re-adding the fd (EPOLLET),
 * or rearming it (EPOLLONESHOT).
 *
 * The purpose of this program is to be useful for measuring kernel
 * related changes to sys_epoll, and not for comparing different IO
 * polling methods, for example. Hence everything is very ad hoc and
 * outputs raw microbenchmark numbers. Also, this uses eventfd; similar
 * tools tend to use pipes or sockets, but the result is the same.
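 *
 * Example invocations (illustrative only; the numbers are arbitrary and the
 * options are defined in the table below):
 *
 *    perf bench epoll wait              # single queue, nr_cpus - 1 worker threads
 *    perf bench epoll wait -t 4 -f 128  # 4 worker threads, 128 eventfds per thread
 *    perf bench epoll wait -m -R        # one epoll instance per thread, random writes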
 */

/* For the CLR_() macros */
#include <string.h>
#include <pthread.h>
#include <unistd.h>

#include <errno.h>
#include <inttypes.h>
#include <signal.h>
#include <stdlib.h>
#include <linux/compiler.h>
#include <linux/kernel.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <sys/types.h>
#include <perf/cpumap.h>

#include "../util/stat.h"
#include "../util/mutex.h"
#include <subcmd/parse-options.h>
#include "bench.h"

#include <err.h>

#define printinfo(fmt, arg...) \
	do { if (__verbose) { printf(fmt, ## arg); fflush(stdout); } } while (0)

static unsigned int nthreads = 0;
static unsigned int nsecs = 8;
static bool wdone, done, __verbose, randomize, nonblocking;

/*
 * epoll related shared variables.
 */

/* Maximum number of nesting allowed inside epoll sets */
#define EPOLL_MAXNESTS 4

static int epollfd;
static int *epollfdp;
static bool noaffinity;
static unsigned int nested = 0;
static bool et; /* edge-trigger */
static bool oneshot;
static bool multiq; /* use an epoll instance per thread */

/* amount of fds to monitor, per thread */
static unsigned int nfds = 64;

static struct mutex thread_lock;
static unsigned int threads_starting;
static struct stats throughput_stats;
static struct cond thread_parent, thread_worker;

struct worker {
	int tid;
	int epollfd; /* for --multiq */
	pthread_t thread;
	unsigned long ops;
	int *fdmap;
};

static const struct option options[] = {
	/* general benchmark options */
	OPT_UINTEGER('t', "threads", &nthreads, "Specify number of threads"),
	OPT_UINTEGER('r', "runtime", &nsecs, "Specify runtime (in seconds)"),
	OPT_UINTEGER('f', "nfds", &nfds, "Specify number of file descriptors to monitor for each thread"),
	OPT_BOOLEAN('n', "noaffinity", &noaffinity, "Disables CPU affinity"),
	OPT_BOOLEAN('R', "randomize", &randomize, "Enable random write behaviour (default is linear)"),
	OPT_BOOLEAN('v', "verbose", &__verbose, "Verbose mode"),

	/* epoll specific options */
	OPT_BOOLEAN('m', "multiq", &multiq, "Use multiple epoll instances (one per thread)"),
	OPT_BOOLEAN('B', "nonblocking", &nonblocking, "Nonblocking epoll_wait(2) behaviour"),
	OPT_UINTEGER('N', "nested", &nested, "Nesting level epoll hierarchy (default is 0, no nesting)"),
	OPT_BOOLEAN('S', "oneshot", &oneshot, "Use EPOLLONESHOT semantics"),
	OPT_BOOLEAN('E', "edge", &et, "Use Edge-triggered interface (default is LT)"),

	OPT_END()
};

static const char * const bench_epoll_wait_usage[] = {
	"perf bench epoll wait <options>",
	NULL
};

/*
 * Arrange the N elements of ARRAY in random order.
 * Only effective if N is much smaller than RAND_MAX;
 * if this may not be the case, use a better random
 * number generator. -- Ben Pfaff.
 */
static void shuffle(void *array, size_t n, size_t size)
{
	char *carray = array;
	void *aux;
	size_t i;

	if (n <= 1)
		return;

	aux = calloc(1, size);
	if (!aux)
		err(EXIT_FAILURE, "calloc");

	for (i = 1; i < n; ++i) {
		size_t j = i + rand() / (RAND_MAX / (n - i) + 1);
		j *= size;

		memcpy(aux, &carray[j], size);
		memcpy(&carray[j], &carray[i*size], size);
		memcpy(&carray[i*size], aux, size);
	}

	free(aux);
}

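/*
 * Worker loop: wait for one ready event at a time (to stress the
 * epoll_wait(2) path rather than batching), drain the eventfd counter,
 * optionally re-add (EPOLLET) or rearm (EPOLLONESHOT) the fd, and count
 * each iteration as one operation.
 */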
static void *workerfn(void *arg)
{
	int fd, ret, r;
	struct worker *w = (struct worker *) arg;
	unsigned long ops = w->ops;
	struct epoll_event ev;
	uint64_t val;
	int to = nonblocking ? 0 : -1;
	int efd = multiq ? w->epollfd : epollfd;

	mutex_lock(&thread_lock);
	threads_starting--;
	if (!threads_starting)
		cond_signal(&thread_parent);
	cond_wait(&thread_worker, &thread_lock);
	mutex_unlock(&thread_lock);

	do {
		/*
		 * Wait for the EPOLLIN event (indefinitely, unless
		 * --nonblocking was given). In order to stress the
		 * epoll_wait(2) syscall, fetch one event per call
		 * instead of a larger batch (maxevents) limit.
		 */
		do {
			ret = epoll_wait(efd, &ev, 1, to);
		} while (ret < 0 && errno == EINTR);
		if (ret < 0)
			err(EXIT_FAILURE, "epoll_wait");

		fd = ev.data.fd;

		do {
			r = read(fd, &val, sizeof(val));
		} while (!done && (r < 0 && errno == EAGAIN));

		if (et) {
			/* re-add the fd; the epoll_ctl(2) call is part of the measured op */
			ev.events = EPOLLIN | EPOLLET;
			ret = epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev);
		}

		if (oneshot) {
			/* rearm the file descriptor with a new event mask */
			ev.events |= EPOLLIN | EPOLLONESHOT;
			ret = epoll_ctl(efd, EPOLL_CTL_MOD, fd, &ev);
		}

		ops++;
	} while (!done);

	if (multiq)
		close(w->epollfd);

	w->ops = ops;
	return NULL;
}

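/*
 * Build a chain of 'nested' epoll instances (each one added as a member of
 * the previous), then attach the head of the chain to the benchmark's epoll
 * fd. This exercises epoll's handling of nested epoll file descriptors.
 */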
static void nest_epollfd(struct worker *w)
{
	unsigned int i;
	struct epoll_event ev;
	int efd = multiq ? w->epollfd : epollfd;

	if (nested > EPOLL_MAXNESTS)
		nested = EPOLL_MAXNESTS;

	epollfdp = calloc(nested, sizeof(*epollfdp));
	if (!epollfdp)
		err(EXIT_FAILURE, "calloc");

	for (i = 0; i < nested; i++) {
		epollfdp[i] = epoll_create(1);
		if (epollfdp[i] < 0)
			err(EXIT_FAILURE, "epoll_create");
	}

	ev.events = EPOLLHUP; /* anything */
	ev.data.u64 = i; /* any number */

	for (i = nested - 1; i; i--) {
		if (epoll_ctl(epollfdp[i - 1], EPOLL_CTL_ADD,
			      epollfdp[i], &ev) < 0)
			err(EXIT_FAILURE, "epoll_ctl");
	}

	if (epoll_ctl(efd, EPOLL_CTL_ADD, *epollfdp, &ev) < 0)
		err(EXIT_FAILURE, "epoll_ctl");
}

static void toggle_done(int sig __maybe_unused,
			siginfo_t *info __maybe_unused,
			void *uc __maybe_unused)
{
	/* inform all threads that we're done for the day */
	done = true;
	gettimeofday(&bench__end, NULL);
	timersub(&bench__end, &bench__start, &bench__runtime);
}

static void print_summary(void)
{
	unsigned long avg = avg_stats(&throughput_stats);
	double stddev = stddev_stats(&throughput_stats);

	printf("\nAveraged %lu operations/sec (+- %.2f%%), total secs = %d\n",
	       avg, rel_stddev_stats(stddev, avg),
	       (int)bench__runtime.tv_sec);
}

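/*
 * Create and set up the worker threads: one epoll instance per worker when
 * --multiq is used (plus optional nesting), an fdmap of 'nfds' nonblocking
 * eventfds registered with the chosen event mask, and CPU affinity unless
 * --noaffinity was given.
 */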
static int do_threads(struct worker *worker, struct perf_cpu_map *cpu)
{
	pthread_attr_t thread_attr, *attrp = NULL;
	cpu_set_t *cpuset;
	unsigned int i, j;
	int ret = 0, events = EPOLLIN;
	int nrcpus;
	size_t size;

	if (oneshot)
		events |= EPOLLONESHOT;
	if (et)
		events |= EPOLLET;

	printinfo("starting worker/consumer %sthreads%s\n",
		  noaffinity ? "" : "CPU affinity ",
		  nonblocking ? " (nonblocking)" : "");
	if (!noaffinity)
		pthread_attr_init(&thread_attr);

	nrcpus = perf_cpu_map__nr(cpu);
	cpuset = CPU_ALLOC(nrcpus);
	BUG_ON(!cpuset);
	size = CPU_ALLOC_SIZE(nrcpus);

	for (i = 0; i < nthreads; i++) {
		struct worker *w = &worker[i];

		if (multiq) {
			w->epollfd = epoll_create(1);
			if (w->epollfd < 0)
				err(EXIT_FAILURE, "epoll_create");

			if (nested)
				nest_epollfd(w);
		}

		w->tid = i;
		w->fdmap = calloc(nfds, sizeof(int));
		if (!w->fdmap)
			return 1;

		for (j = 0; j < nfds; j++) {
			int efd = multiq ? w->epollfd : epollfd;
			struct epoll_event ev;

			w->fdmap[j] = eventfd(0, EFD_NONBLOCK);
			if (w->fdmap[j] < 0)
				err(EXIT_FAILURE, "eventfd");

			ev.data.fd = w->fdmap[j];
			ev.events = events;

			ret = epoll_ctl(efd, EPOLL_CTL_ADD,
					w->fdmap[j], &ev);
			if (ret < 0)
				err(EXIT_FAILURE, "epoll_ctl");
		}

		if (!noaffinity) {
			CPU_ZERO_S(size, cpuset);
			CPU_SET_S(perf_cpu_map__cpu(cpu, i % perf_cpu_map__nr(cpu)).cpu,
				  size, cpuset);

			ret = pthread_attr_setaffinity_np(&thread_attr, size, cpuset);
			if (ret) {
				CPU_FREE(cpuset);
				err(EXIT_FAILURE, "pthread_attr_setaffinity_np");
			}

			attrp = &thread_attr;
		}

		ret = pthread_create(&w->thread, attrp, workerfn,
				     (void *)(struct worker *) w);
		if (ret) {
			CPU_FREE(cpuset);
			err(EXIT_FAILURE, "pthread_create");
		}
	}

	CPU_FREE(cpuset);
	if (!noaffinity)
		pthread_attr_destroy(&thread_attr);

	return ret;
}

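/*
 * Single writer thread: loop over every worker's fdmap and write a 1 to
 * each eventfd so the readers (almost) always find ready events, with a
 * short nanosleep() between full passes. With --randomize, the visiting
 * order of workers and fds is shuffled on each pass.
 */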
static void *writerfn(void *p)
{
	struct worker *worker = p;
	size_t i, j, iter;
	const uint64_t val = 1;
	ssize_t sz;
	struct timespec ts = { .tv_sec = 0,
			       .tv_nsec = 500 };

	printinfo("starting writer-thread: doing %s writes ...\n",
		  randomize ? "random" : "linear");

	for (iter = 0; !wdone; iter++) {
		if (randomize) {
			shuffle((void *)worker, nthreads, sizeof(*worker));
		}

		for (i = 0; i < nthreads; i++) {
			struct worker *w = &worker[i];

			if (randomize) {
				shuffle((void *)w->fdmap, nfds, sizeof(int));
			}

			for (j = 0; j < nfds; j++) {
				do {
					sz = write(w->fdmap[j], &val, sizeof(val));
				} while (!wdone && (sz < 0 && errno == EAGAIN));
			}
		}

		nanosleep(&ts, NULL);
	}

	printinfo("exiting writer-thread (total full-loops: %zu)\n", iter);
	return NULL;
}

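/*
 * qsort(3) comparator: order workers by ascending tid, used to restore the
 * original thread order after the writer has shuffled the worker array.
 */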
static int cmpworker(const void *p1, const void *p2)
{
	struct worker *w1 = (struct worker *) p1;
	struct worker *w2 = (struct worker *) p2;

	/* return negative/zero/positive as required by qsort(3) */
	return w1->tid - w2->tid;
}

int bench_epoll_wait(int argc, const char **argv)
{
	int ret = 0;
	struct sigaction act;
	unsigned int i;
	struct worker *worker = NULL;
	struct perf_cpu_map *cpu;
	pthread_t wthread;
	struct rlimit rl, prevrl;

	argc = parse_options(argc, argv, options, bench_epoll_wait_usage, 0);
	if (argc) {
		usage_with_options(bench_epoll_wait_usage, options);
		exit(EXIT_FAILURE);
	}

	memset(&act, 0, sizeof(act));
	sigfillset(&act.sa_mask);
	act.sa_sigaction = toggle_done;
	sigaction(SIGINT, &act, NULL);

	cpu = perf_cpu_map__new_online_cpus();
	if (!cpu)
		goto errmem;

	/* a single, main epoll instance */
	if (!multiq) {
		epollfd = epoll_create(1);
		if (epollfd < 0)
			err(EXIT_FAILURE, "epoll_create");

		/*
		 * Deal with nested epolls, if any.
		 */
		if (nested)
			nest_epollfd(NULL);
	}

	printinfo("Using %s queue model\n", multiq ? "multi" : "single");
	printinfo("Nesting level(s): %d\n", nested);

	/* default to the number of CPUs and leave one for the writer pthread */
	if (!nthreads)
		nthreads = perf_cpu_map__nr(cpu) - 1;

	worker = calloc(nthreads, sizeof(*worker));
	if (!worker)
		goto errmem;

	if (getrlimit(RLIMIT_NOFILE, &prevrl))
		err(EXIT_FAILURE, "getrlimit");
	rl.rlim_cur = rl.rlim_max = nfds * nthreads * 2 + 50;
	printinfo("Setting RLIMIT_NOFILE rlimit from %" PRIu64 " to: %" PRIu64 "\n",
		  (uint64_t)prevrl.rlim_max, (uint64_t)rl.rlim_max);
	if (setrlimit(RLIMIT_NOFILE, &rl) < 0)
		err(EXIT_FAILURE, "setrlimit");

	printf("Run summary [PID %d]: %d threads monitoring%s on "
	       "%d file-descriptors for %d secs.\n\n",
	       getpid(), nthreads, oneshot ? " (EPOLLONESHOT semantics)" : "", nfds, nsecs);

	init_stats(&throughput_stats);
	mutex_init(&thread_lock);
	cond_init(&thread_parent);
	cond_init(&thread_worker);

	threads_starting = nthreads;

	gettimeofday(&bench__start, NULL);

	do_threads(worker, cpu);

	mutex_lock(&thread_lock);
	while (threads_starting)
		cond_wait(&thread_parent, &thread_lock);
	cond_broadcast(&thread_worker);
	mutex_unlock(&thread_lock);

	/*
	 * At this point the workers should be blocked waiting for read events
	 * to become ready. Launch the writer which will constantly be writing
	 * to each thread's fdmap.
	 */
	ret = pthread_create(&wthread, NULL, writerfn,
			     (void *)(struct worker *) worker);
	if (ret)
		err(EXIT_FAILURE, "pthread_create");

	sleep(nsecs);
	toggle_done(0, NULL, NULL);
	printinfo("main thread: toggling done\n");

	sleep(1); /* meh */
	wdone = true;
	ret = pthread_join(wthread, NULL);
	if (ret)
		err(EXIT_FAILURE, "pthread_join");

	/* cleanup & report results */
	cond_destroy(&thread_parent);
	cond_destroy(&thread_worker);
	mutex_destroy(&thread_lock);

	/* sort the array back before reporting */
	if (randomize)
		qsort(worker, nthreads, sizeof(struct worker), cmpworker);

	for (i = 0; i < nthreads; i++) {
		unsigned long t = bench__runtime.tv_sec > 0 ?
			worker[i].ops / bench__runtime.tv_sec : 0;

		update_stats(&throughput_stats, t);

		if (nfds == 1)
			printf("[thread %2d] fdmap: %p [ %04lu ops/sec ]\n",
			       worker[i].tid, &worker[i].fdmap[0], t);
		else
			printf("[thread %2d] fdmap: %p ... %p [ %04lu ops/sec ]\n",
			       worker[i].tid, &worker[i].fdmap[0],
			       &worker[i].fdmap[nfds - 1], t);
	}

	print_summary();

	close(epollfd);
	perf_cpu_map__put(cpu);
	for (i = 0; i < nthreads; i++)
		free(worker[i].fdmap);

	free(worker);
	return ret;
errmem:
	err(EXIT_FAILURE, "calloc");
}
#endif // HAVE_EVENTFD_SUPPORT