1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * numa.c |
4 | * |
 * numa: Simulate NUMA-sensitive workloads and measure their NUMA performance
6 | */ |
7 | |
8 | #include <inttypes.h> |
9 | |
10 | #include <subcmd/parse-options.h> |
11 | #include "../util/cloexec.h" |
12 | |
13 | #include "bench.h" |
14 | |
15 | #include <errno.h> |
16 | #include <sched.h> |
17 | #include <stdio.h> |
18 | #include <assert.h> |
19 | #include <debug.h> |
20 | #include <malloc.h> |
21 | #include <signal.h> |
22 | #include <stdlib.h> |
23 | #include <string.h> |
24 | #include <unistd.h> |
25 | #include <sys/mman.h> |
26 | #include <sys/time.h> |
27 | #include <sys/resource.h> |
28 | #include <sys/wait.h> |
29 | #include <sys/prctl.h> |
30 | #include <sys/types.h> |
31 | #include <linux/kernel.h> |
32 | #include <linux/time64.h> |
33 | #include <linux/numa.h> |
34 | #include <linux/zalloc.h> |
35 | |
36 | #include "../util/header.h" |
37 | #include "../util/mutex.h" |
38 | #include <numa.h> |
39 | #include <numaif.h> |
40 | |
/* Fallback for environments whose headers do not define RUSAGE_THREAD: */
#if !defined(RUSAGE_THREAD)
# define RUSAGE_THREAD	1
#endif
44 | |
/*
 * Regular printout to the terminal. Nothing is printed while the global
 * state 'g' is still NULL or when show_details is negative (suppressed
 * if -q is specified - presumably -q yields a negative show_details; that
 * mapping is done outside this part of the file):
 */
#define tprintf(...)						\
	do {							\
		if (g && g->p.show_details >= 0)		\
			printf(__VA_ARGS__);			\
	} while (0)
49 | |
/*
 * Debug printf: only prints once 'g' exists and show_details is at
 * least 1. Any earlier definition of dprintf is dropped first.
 */
#undef dprintf
#define dprintf(...)						\
	do {							\
		if (g && g->p.show_details >= 1)		\
			printf(__VA_ARGS__);			\
	} while (0)
55 | |
56 | struct thread_data { |
57 | int curr_cpu; |
58 | cpu_set_t *bind_cpumask; |
59 | int bind_node; |
60 | u8 *process_data; |
61 | int process_nr; |
62 | int thread_nr; |
63 | int task_nr; |
64 | unsigned int loops_done; |
65 | u64 val; |
66 | u64 runtime_ns; |
67 | u64 system_time_ns; |
68 | u64 user_time_ns; |
69 | double speed_gbs; |
70 | struct mutex *process_lock; |
71 | }; |
72 | |
73 | /* Parameters set by options: */ |
74 | |
75 | struct params { |
76 | /* Startup synchronization: */ |
77 | bool serialize_startup; |
78 | |
79 | /* Task hierarchy: */ |
80 | int nr_proc; |
81 | int nr_threads; |
82 | |
83 | /* Working set sizes: */ |
84 | const char *mb_global_str; |
85 | const char *mb_proc_str; |
86 | const char *mb_proc_locked_str; |
87 | const char *mb_thread_str; |
88 | |
89 | double mb_global; |
90 | double mb_proc; |
91 | double mb_proc_locked; |
92 | double mb_thread; |
93 | |
94 | /* Access patterns to the working set: */ |
95 | bool data_reads; |
96 | bool data_writes; |
97 | bool data_backwards; |
98 | bool data_zero_memset; |
99 | bool data_rand_walk; |
100 | u32 nr_loops; |
101 | u32 nr_secs; |
102 | u32 sleep_usecs; |
103 | |
104 | /* Working set initialization: */ |
105 | bool init_zero; |
106 | bool init_random; |
107 | bool init_cpu0; |
108 | |
109 | /* Misc options: */ |
110 | int show_details; |
111 | int run_all; |
112 | int thp; |
113 | |
114 | long bytes_global; |
115 | long bytes_process; |
116 | long bytes_process_locked; |
117 | long bytes_thread; |
118 | |
119 | int nr_tasks; |
120 | |
121 | bool show_convergence; |
122 | bool measure_convergence; |
123 | |
124 | int perturb_secs; |
125 | int nr_cpus; |
126 | int nr_nodes; |
127 | |
128 | /* Affinity options -C and -N: */ |
129 | char *cpu_list_str; |
130 | char *node_list_str; |
131 | }; |
132 | |
133 | |
134 | /* Global, read-writable area, accessible to all processes and threads: */ |
135 | |
136 | struct global_info { |
137 | u8 *data; |
138 | |
139 | struct mutex startup_mutex; |
140 | struct cond startup_cond; |
141 | int nr_tasks_started; |
142 | |
143 | struct mutex start_work_mutex; |
144 | struct cond start_work_cond; |
145 | int nr_tasks_working; |
146 | bool start_work; |
147 | |
148 | struct mutex stop_work_mutex; |
149 | u64 bytes_done; |
150 | |
151 | struct thread_data *threads; |
152 | |
153 | /* Convergence latency measurement: */ |
154 | bool all_converged; |
155 | bool stop_work; |
156 | |
157 | int print_once; |
158 | |
159 | struct params p; |
160 | }; |
161 | |
162 | static struct global_info *g = NULL; |
163 | |
164 | static int parse_cpus_opt(const struct option *opt, const char *arg, int unset); |
165 | static int parse_nodes_opt(const struct option *opt, const char *arg, int unset); |
166 | |
167 | struct params p0; |
168 | |
169 | static const struct option options[] = { |
170 | OPT_INTEGER('p', "nr_proc" , &p0.nr_proc, "number of processes" ), |
171 | OPT_INTEGER('t', "nr_threads" , &p0.nr_threads, "number of threads per process" ), |
172 | |
173 | OPT_STRING('G', "mb_global" , &p0.mb_global_str, "MB" , "global memory (MBs)" ), |
174 | OPT_STRING('P', "mb_proc" , &p0.mb_proc_str, "MB" , "process memory (MBs)" ), |
175 | OPT_STRING('L', "mb_proc_locked" , &p0.mb_proc_locked_str,"MB" , "process serialized/locked memory access (MBs), <= process_memory" ), |
176 | OPT_STRING('T', "mb_thread" , &p0.mb_thread_str, "MB" , "thread memory (MBs)" ), |
177 | |
178 | OPT_UINTEGER('l', "nr_loops" , &p0.nr_loops, "max number of loops to run (default: unlimited)" ), |
179 | OPT_UINTEGER('s', "nr_secs" , &p0.nr_secs, "max number of seconds to run (default: 5 secs)" ), |
180 | OPT_UINTEGER('u', "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration" ), |
181 | |
182 | OPT_BOOLEAN('R', "data_reads" , &p0.data_reads, "access the data via reads (can be mixed with -W)" ), |
183 | OPT_BOOLEAN('W', "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)" ), |
184 | OPT_BOOLEAN('B', "data_backwards" , &p0.data_backwards, "access the data backwards as well" ), |
185 | OPT_BOOLEAN('Z', "data_zero_memset" , &p0.data_zero_memset,"access the data via glibc bzero only" ), |
186 | OPT_BOOLEAN('r', "data_rand_walk" , &p0.data_rand_walk, "access the data with random (32bit LFSR) walk" ), |
187 | |
188 | |
189 | OPT_BOOLEAN('z', "init_zero" , &p0.init_zero, "bzero the initial allocations" ), |
190 | OPT_BOOLEAN('I', "init_random" , &p0.init_random, "randomize the contents of the initial allocations" ), |
191 | OPT_BOOLEAN('0', "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0" ), |
192 | OPT_INTEGER('x', "perturb_secs" , &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability" ), |
193 | |
194 | OPT_INCR ('d', "show_details" , &p0.show_details, "Show details" ), |
195 | OPT_INCR ('a', "all" , &p0.run_all, "Run all tests in the suite" ), |
196 | OPT_INTEGER('H', "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE" ), |
197 | OPT_BOOLEAN('c', "show_convergence" , &p0.show_convergence, "show convergence details, " |
198 | "convergence is reached when each process (all its threads) is running on a single NUMA node." ), |
199 | OPT_BOOLEAN('m', "measure_convergence" , &p0.measure_convergence, "measure convergence latency" ), |
200 | OPT_BOOLEAN('q', "quiet" , &quiet, |
201 | "quiet mode (do not show any warnings or messages)" ), |
202 | OPT_BOOLEAN('S', "serialize-startup" , &p0.serialize_startup,"serialize thread startup" ), |
203 | |
204 | /* Special option string parsing callbacks: */ |
205 | OPT_CALLBACK('C', "cpus" , NULL, "cpu[,cpu2,...cpuN]" , |
206 | "bind the first N tasks to these specific cpus (the rest is unbound)" , |
207 | parse_cpus_opt), |
208 | OPT_CALLBACK('M', "memnodes" , NULL, "node[,node2,...nodeN]" , |
209 | "bind the first N tasks to these specific memory nodes (the rest is unbound)" , |
210 | parse_nodes_opt), |
211 | OPT_END() |
212 | }; |
213 | |
/* NULL-terminated usage text for the generic "numa" form: */
static const char * const bench_numa_usage[] = {
	"perf bench numa <options>",
	NULL,
};
218 | |
/* NULL-terminated usage text for the "numa mem" form: */
static const char * const numa_usage[] = {
	"perf bench numa mem [<options>]",
	NULL,
};
223 | |
224 | /* |
225 | * To get number of numa nodes present. |
226 | */ |
227 | static int nr_numa_nodes(void) |
228 | { |
229 | int i, nr_nodes = 0; |
230 | |
231 | for (i = 0; i < g->p.nr_nodes; i++) { |
232 | if (numa_bitmask_isbitset(numa_nodes_ptr, i)) |
233 | nr_nodes++; |
234 | } |
235 | |
236 | return nr_nodes; |
237 | } |
238 | |
239 | /* |
240 | * To check if given numa node is present. |
241 | */ |
242 | static int is_node_present(int node) |
243 | { |
244 | return numa_bitmask_isbitset(numa_nodes_ptr, node); |
245 | } |
246 | |
247 | /* |
248 | * To check given numa node has cpus. |
249 | */ |
250 | static bool node_has_cpus(int node) |
251 | { |
252 | struct bitmask *cpumask = numa_allocate_cpumask(); |
253 | bool ret = false; /* fall back to nocpus */ |
254 | int cpu; |
255 | |
256 | BUG_ON(!cpumask); |
257 | if (!numa_node_to_cpus(node, cpumask)) { |
258 | for (cpu = 0; cpu < (int)cpumask->size; cpu++) { |
259 | if (numa_bitmask_isbitset(cpumask, cpu)) { |
260 | ret = true; |
261 | break; |
262 | } |
263 | } |
264 | } |
265 | numa_free_cpumask(cpumask); |
266 | |
267 | return ret; |
268 | } |
269 | |
270 | static cpu_set_t *bind_to_cpu(int target_cpu) |
271 | { |
272 | int nrcpus = numa_num_possible_cpus(); |
273 | cpu_set_t *orig_mask, *mask; |
274 | size_t size; |
275 | |
276 | orig_mask = CPU_ALLOC(nrcpus); |
277 | BUG_ON(!orig_mask); |
278 | size = CPU_ALLOC_SIZE(nrcpus); |
279 | CPU_ZERO_S(size, orig_mask); |
280 | |
281 | if (sched_getaffinity(pid: 0, mask: size, orig_mask)) |
282 | goto err_out; |
283 | |
284 | mask = CPU_ALLOC(nrcpus); |
285 | if (!mask) |
286 | goto err_out; |
287 | |
288 | CPU_ZERO_S(size, mask); |
289 | |
290 | if (target_cpu == -1) { |
291 | int cpu; |
292 | |
293 | for (cpu = 0; cpu < g->p.nr_cpus; cpu++) |
294 | CPU_SET_S(cpu, size, mask); |
295 | } else { |
296 | if (target_cpu < 0 || target_cpu >= g->p.nr_cpus) |
297 | goto err; |
298 | |
299 | CPU_SET_S(target_cpu, size, mask); |
300 | } |
301 | |
302 | if (sched_setaffinity(0, size, mask)) |
303 | goto err; |
304 | |
305 | return orig_mask; |
306 | |
307 | err: |
308 | CPU_FREE(mask); |
309 | err_out: |
310 | CPU_FREE(orig_mask); |
311 | |
312 | /* BUG_ON due to failure in allocation of orig_mask/mask */ |
313 | BUG_ON(-1); |
314 | return NULL; |
315 | } |
316 | |
317 | static cpu_set_t *bind_to_node(int target_node) |
318 | { |
319 | int nrcpus = numa_num_possible_cpus(); |
320 | size_t size; |
321 | cpu_set_t *orig_mask, *mask; |
322 | int cpu; |
323 | |
324 | orig_mask = CPU_ALLOC(nrcpus); |
325 | BUG_ON(!orig_mask); |
326 | size = CPU_ALLOC_SIZE(nrcpus); |
327 | CPU_ZERO_S(size, orig_mask); |
328 | |
329 | if (sched_getaffinity(0, size, orig_mask)) |
330 | goto err_out; |
331 | |
332 | mask = CPU_ALLOC(nrcpus); |
333 | if (!mask) |
334 | goto err_out; |
335 | |
336 | CPU_ZERO_S(size, mask); |
337 | |
338 | if (target_node == NUMA_NO_NODE) { |
339 | for (cpu = 0; cpu < g->p.nr_cpus; cpu++) |
340 | CPU_SET_S(cpu, size, mask); |
341 | } else { |
342 | struct bitmask *cpumask = numa_allocate_cpumask(); |
343 | |
344 | if (!cpumask) |
345 | goto err; |
346 | |
347 | if (!numa_node_to_cpus(target_node, cpumask)) { |
348 | for (cpu = 0; cpu < (int)cpumask->size; cpu++) { |
349 | if (numa_bitmask_isbitset(cpumask, cpu)) |
350 | CPU_SET_S(cpu, size, mask); |
351 | } |
352 | } |
353 | numa_free_cpumask(cpumask); |
354 | } |
355 | |
356 | if (sched_setaffinity(0, size, mask)) |
357 | goto err; |
358 | |
359 | return orig_mask; |
360 | |
361 | err: |
362 | CPU_FREE(mask); |
363 | err_out: |
364 | CPU_FREE(orig_mask); |
365 | |
366 | /* BUG_ON due to failure in allocation of orig_mask/mask */ |
367 | BUG_ON(-1); |
368 | return NULL; |
369 | } |
370 | |
/*
 * Apply the given affinity mask to the calling thread.
 *
 * The mask is expected to be sized for numa_num_possible_cpus() CPUs.
 * On failure the mask is freed and BUG_ON() fires; on success the mask
 * stays owned by the caller.
 */
static void bind_to_cpumask(cpu_set_t *mask)
{
	int ret;
	size_t size = CPU_ALLOC_SIZE(numa_num_possible_cpus());

	ret = sched_setaffinity(0, size, mask);
	if (ret) {
		CPU_FREE(mask);
		BUG_ON(ret);
	}
}
382 | |
383 | static void mempol_restore(void) |
384 | { |
385 | int ret; |
386 | |
387 | ret = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-1); |
388 | |
389 | BUG_ON(ret); |
390 | } |
391 | |
392 | static void bind_to_memnode(int node) |
393 | { |
394 | struct bitmask *node_mask; |
395 | int ret; |
396 | |
397 | if (node == NUMA_NO_NODE) |
398 | return; |
399 | |
400 | node_mask = numa_allocate_nodemask(); |
401 | BUG_ON(!node_mask); |
402 | |
403 | numa_bitmask_clearall(node_mask); |
404 | numa_bitmask_setbit(node_mask, node); |
405 | |
406 | ret = set_mempolicy(MPOL_BIND, node_mask->maskp, node_mask->size + 1); |
407 | dprintf("binding to node %d, mask: %016lx => %d\n" , node, *node_mask->maskp, ret); |
408 | |
409 | numa_bitmask_free(node_mask); |
410 | BUG_ON(ret); |
411 | } |
412 | |
/* 2MB: padding and alignment unit used by alloc_data() */
#define HPSIZE			(2 * 1024 * 1024)
414 | |
/*
 * Set the name of the calling task (PR_SET_NAME) from a printf-style
 * format; the formatted name is truncated to fit a 20 byte buffer.
 */
#define set_taskname(...)				\
do {							\
	char name[20];					\
							\
	snprintf(name, sizeof(name), __VA_ARGS__);	\
	prctl(PR_SET_NAME, name);			\
} while (0)
422 | |
423 | static u8 *alloc_data(ssize_t bytes0, int map_flags, |
424 | int init_zero, int init_cpu0, int thp, int init_random) |
425 | { |
426 | cpu_set_t *orig_mask = NULL; |
427 | ssize_t bytes; |
428 | u8 *buf; |
429 | int ret; |
430 | |
431 | if (!bytes0) |
432 | return NULL; |
433 | |
434 | /* Allocate and initialize all memory on CPU#0: */ |
435 | if (init_cpu0) { |
436 | int node = numa_node_of_cpu(0); |
437 | |
438 | orig_mask = bind_to_node(node); |
439 | bind_to_memnode(node); |
440 | } |
441 | |
442 | bytes = bytes0 + HPSIZE; |
443 | |
444 | buf = (void *)mmap(0, bytes, PROT_READ|PROT_WRITE, MAP_ANON|map_flags, -1, 0); |
445 | BUG_ON(buf == (void *)-1); |
446 | |
447 | if (map_flags == MAP_PRIVATE) { |
448 | if (thp > 0) { |
449 | ret = madvise(buf, bytes, MADV_HUGEPAGE); |
450 | if (ret && !g->print_once) { |
451 | g->print_once = 1; |
452 | printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n" ); |
453 | } |
454 | } |
455 | if (thp < 0) { |
456 | ret = madvise(buf, bytes, MADV_NOHUGEPAGE); |
457 | if (ret && !g->print_once) { |
458 | g->print_once = 1; |
459 | printf("WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n" ); |
460 | } |
461 | } |
462 | } |
463 | |
464 | if (init_zero) { |
465 | bzero(buf, bytes); |
466 | } else { |
467 | /* Initialize random contents, different in each word: */ |
468 | if (init_random) { |
469 | u64 *wbuf = (void *)buf; |
470 | long off = rand(); |
471 | long i; |
472 | |
473 | for (i = 0; i < bytes/8; i++) |
474 | wbuf[i] = i + off; |
475 | } |
476 | } |
477 | |
478 | /* Align to 2MB boundary: */ |
479 | buf = (void *)(((unsigned long)buf + HPSIZE-1) & ~(HPSIZE-1)); |
480 | |
481 | /* Restore affinity: */ |
482 | if (init_cpu0) { |
483 | bind_to_cpumask(orig_mask); |
484 | CPU_FREE(orig_mask); |
485 | mempol_restore(); |
486 | } |
487 | |
488 | return buf; |
489 | } |
490 | |
/*
 * Unmap a buffer of the given size. A NULL pointer is ignored;
 * a failing munmap() is a BUG.
 */
static void free_data(void *data, ssize_t bytes)
{
	if (data) {
		int err = munmap(data, bytes);

		BUG_ON(err);
	}
}
501 | |
502 | /* |
503 | * Create a shared memory buffer that can be shared between processes, zeroed: |
504 | */ |
505 | static void * zalloc_shared_data(ssize_t bytes) |
506 | { |
507 | return alloc_data(bytes, MAP_SHARED, 1, g->p.init_cpu0, g->p.thp, g->p.init_random); |
508 | } |
509 | |
510 | /* |
511 | * Create a shared memory buffer that can be shared between processes: |
512 | */ |
513 | static void * setup_shared_data(ssize_t bytes) |
514 | { |
515 | return alloc_data(bytes, MAP_SHARED, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); |
516 | } |
517 | |
518 | /* |
519 | * Allocate process-local memory - this will either be shared between |
520 | * threads of this process, or only be accessed by this thread: |
521 | */ |
522 | static void * setup_private_data(ssize_t bytes) |
523 | { |
524 | return alloc_data(bytes, MAP_PRIVATE, 0, g->p.init_cpu0, g->p.thp, g->p.init_random); |
525 | } |
526 | |
527 | static int parse_cpu_list(const char *arg) |
528 | { |
529 | p0.cpu_list_str = strdup(arg); |
530 | |
531 | dprintf("got CPU list: {%s}\n" , p0.cpu_list_str); |
532 | |
533 | return 0; |
534 | } |
535 | |
536 | static int parse_setup_cpu_list(void) |
537 | { |
538 | struct thread_data *td; |
539 | char *str0, *str; |
540 | int t; |
541 | |
542 | if (!g->p.cpu_list_str) |
543 | return 0; |
544 | |
545 | dprintf("g->p.nr_tasks: %d\n" , g->p.nr_tasks); |
546 | |
547 | str0 = str = strdup(g->p.cpu_list_str); |
548 | t = 0; |
549 | |
550 | BUG_ON(!str); |
551 | |
552 | tprintf("# binding tasks to CPUs:\n" ); |
553 | tprintf("# " ); |
554 | |
555 | while (true) { |
556 | int bind_cpu, bind_cpu_0, bind_cpu_1; |
557 | char *tok, *tok_end, *tok_step, *tok_len, *tok_mul; |
558 | int bind_len; |
559 | int step; |
560 | int mul; |
561 | |
562 | tok = strsep(&str, "," ); |
563 | if (!tok) |
564 | break; |
565 | |
566 | tok_end = strstr(tok, "-" ); |
567 | |
568 | dprintf("\ntoken: {%s}, end: {%s}\n" , tok, tok_end); |
569 | if (!tok_end) { |
570 | /* Single CPU specified: */ |
571 | bind_cpu_0 = bind_cpu_1 = atol(tok); |
572 | } else { |
573 | /* CPU range specified (for example: "5-11"): */ |
574 | bind_cpu_0 = atol(tok); |
575 | bind_cpu_1 = atol(tok_end + 1); |
576 | } |
577 | |
578 | step = 1; |
579 | tok_step = strstr(tok, "#" ); |
580 | if (tok_step) { |
581 | step = atol(tok_step + 1); |
582 | BUG_ON(step <= 0 || step >= g->p.nr_cpus); |
583 | } |
584 | |
585 | /* |
586 | * Mask length. |
587 | * Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4', |
588 | * where the _4 means the next 4 CPUs are allowed. |
589 | */ |
590 | bind_len = 1; |
591 | tok_len = strstr(tok, "_" ); |
592 | if (tok_len) { |
593 | bind_len = atol(tok_len + 1); |
594 | BUG_ON(bind_len <= 0 || bind_len > g->p.nr_cpus); |
595 | } |
596 | |
597 | /* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */ |
598 | mul = 1; |
599 | tok_mul = strstr(tok, "x" ); |
600 | if (tok_mul) { |
601 | mul = atol(tok_mul + 1); |
602 | BUG_ON(mul <= 0); |
603 | } |
604 | |
605 | dprintf("CPUs: %d_%d-%d#%dx%d\n" , bind_cpu_0, bind_len, bind_cpu_1, step, mul); |
606 | |
607 | if (bind_cpu_0 >= g->p.nr_cpus || bind_cpu_1 >= g->p.nr_cpus) { |
608 | printf("\nTest not applicable, system has only %d CPUs.\n" , g->p.nr_cpus); |
609 | return -1; |
610 | } |
611 | |
612 | if (is_cpu_online(cpu: bind_cpu_0) != 1 || is_cpu_online(cpu: bind_cpu_1) != 1) { |
613 | printf("\nTest not applicable, bind_cpu_0 or bind_cpu_1 is offline\n" ); |
614 | return -1; |
615 | } |
616 | |
617 | BUG_ON(bind_cpu_0 < 0 || bind_cpu_1 < 0); |
618 | BUG_ON(bind_cpu_0 > bind_cpu_1); |
619 | |
620 | for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) { |
621 | size_t size = CPU_ALLOC_SIZE(g->p.nr_cpus); |
622 | int i; |
623 | |
624 | for (i = 0; i < mul; i++) { |
625 | int cpu; |
626 | |
627 | if (t >= g->p.nr_tasks) { |
628 | printf("\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #" , bind_cpu); |
629 | goto out; |
630 | } |
631 | td = g->threads + t; |
632 | |
633 | if (t) |
634 | tprintf("," ); |
635 | if (bind_len > 1) { |
636 | tprintf("%2d/%d" , bind_cpu, bind_len); |
637 | } else { |
638 | tprintf("%2d" , bind_cpu); |
639 | } |
640 | |
641 | td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus); |
642 | BUG_ON(!td->bind_cpumask); |
643 | CPU_ZERO_S(size, td->bind_cpumask); |
644 | for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) { |
645 | if (cpu < 0 || cpu >= g->p.nr_cpus) { |
646 | CPU_FREE(td->bind_cpumask); |
647 | BUG_ON(-1); |
648 | } |
649 | CPU_SET_S(cpu, size, td->bind_cpumask); |
650 | } |
651 | t++; |
652 | } |
653 | } |
654 | } |
655 | out: |
656 | |
657 | tprintf("\n" ); |
658 | |
659 | if (t < g->p.nr_tasks) |
660 | printf("# NOTE: %d tasks bound, %d tasks unbound\n" , t, g->p.nr_tasks - t); |
661 | |
662 | free(str0); |
663 | return 0; |
664 | } |
665 | |
666 | static int parse_cpus_opt(const struct option *opt __maybe_unused, |
667 | const char *arg, int unset __maybe_unused) |
668 | { |
669 | if (!arg) |
670 | return -1; |
671 | |
672 | return parse_cpu_list(arg); |
673 | } |
674 | |
675 | static int parse_node_list(const char *arg) |
676 | { |
677 | p0.node_list_str = strdup(arg); |
678 | |
679 | dprintf("got NODE list: {%s}\n" , p0.node_list_str); |
680 | |
681 | return 0; |
682 | } |
683 | |
684 | static int parse_setup_node_list(void) |
685 | { |
686 | struct thread_data *td; |
687 | char *str0, *str; |
688 | int t; |
689 | |
690 | if (!g->p.node_list_str) |
691 | return 0; |
692 | |
693 | dprintf("g->p.nr_tasks: %d\n" , g->p.nr_tasks); |
694 | |
695 | str0 = str = strdup(g->p.node_list_str); |
696 | t = 0; |
697 | |
698 | BUG_ON(!str); |
699 | |
700 | tprintf("# binding tasks to NODEs:\n" ); |
701 | tprintf("# " ); |
702 | |
703 | while (true) { |
704 | int bind_node, bind_node_0, bind_node_1; |
705 | char *tok, *tok_end, *tok_step, *tok_mul; |
706 | int step; |
707 | int mul; |
708 | |
709 | tok = strsep(&str, "," ); |
710 | if (!tok) |
711 | break; |
712 | |
713 | tok_end = strstr(tok, "-" ); |
714 | |
715 | dprintf("\ntoken: {%s}, end: {%s}\n" , tok, tok_end); |
716 | if (!tok_end) { |
717 | /* Single NODE specified: */ |
718 | bind_node_0 = bind_node_1 = atol(tok); |
719 | } else { |
720 | /* NODE range specified (for example: "5-11"): */ |
721 | bind_node_0 = atol(tok); |
722 | bind_node_1 = atol(tok_end + 1); |
723 | } |
724 | |
725 | step = 1; |
726 | tok_step = strstr(tok, "#" ); |
727 | if (tok_step) { |
728 | step = atol(tok_step + 1); |
729 | BUG_ON(step <= 0 || step >= g->p.nr_nodes); |
730 | } |
731 | |
732 | /* Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" */ |
733 | mul = 1; |
734 | tok_mul = strstr(tok, "x" ); |
735 | if (tok_mul) { |
736 | mul = atol(tok_mul + 1); |
737 | BUG_ON(mul <= 0); |
738 | } |
739 | |
740 | dprintf("NODEs: %d-%d #%d\n" , bind_node_0, bind_node_1, step); |
741 | |
742 | if (bind_node_0 >= g->p.nr_nodes || bind_node_1 >= g->p.nr_nodes) { |
743 | printf("\nTest not applicable, system has only %d nodes.\n" , g->p.nr_nodes); |
744 | return -1; |
745 | } |
746 | |
747 | BUG_ON(bind_node_0 < 0 || bind_node_1 < 0); |
748 | BUG_ON(bind_node_0 > bind_node_1); |
749 | |
750 | for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) { |
751 | int i; |
752 | |
753 | for (i = 0; i < mul; i++) { |
754 | if (t >= g->p.nr_tasks || !node_has_cpus(node: bind_node)) { |
755 | printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n" , bind_node); |
756 | goto out; |
757 | } |
758 | td = g->threads + t; |
759 | |
760 | if (!t) |
761 | tprintf(" %2d" , bind_node); |
762 | else |
763 | tprintf(",%2d" , bind_node); |
764 | |
765 | td->bind_node = bind_node; |
766 | t++; |
767 | } |
768 | } |
769 | } |
770 | out: |
771 | |
772 | tprintf("\n" ); |
773 | |
774 | if (t < g->p.nr_tasks) |
775 | printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n" , t, g->p.nr_tasks - t); |
776 | |
777 | free(str0); |
778 | return 0; |
779 | } |
780 | |
781 | static int parse_nodes_opt(const struct option *opt __maybe_unused, |
782 | const char *arg, int unset __maybe_unused) |
783 | { |
784 | if (!arg) |
785 | return -1; |
786 | |
787 | return parse_node_list(arg); |
788 | } |
789 | |
/*
 * Advance a 32-bit linear feedback shift register by one step: shift
 * right by one and, if the bit shifted out was set, XOR in the taps.
 */
static inline uint32_t lfsr_32(uint32_t lfsr)
{
	/* Feedback taps at bits 1, 5, 6 and 31: */
	const uint32_t taps = (UINT32_C(1) << 1) | (UINT32_C(1) << 5) |
			      (UINT32_C(1) << 6) | (UINT32_C(1) << 31);
	uint32_t next = lfsr >> 1;

	if (lfsr & 0x1u)
		next ^= taps;

	return next;
}
795 | |
796 | /* |
797 | * Make sure there's real data dependency to RAM (when read |
798 | * accesses are enabled), so the compiler, the CPU and the |
799 | * kernel (KSM, zero page, etc.) cannot optimize away RAM |
800 | * accesses: |
801 | */ |
802 | static inline u64 access_data(u64 *data, u64 val) |
803 | { |
804 | if (g->p.data_reads) |
805 | val += *data; |
806 | if (g->p.data_writes) |
807 | *data = val + 1; |
808 | return val; |
809 | } |
810 | |
811 | /* |
812 | * The worker process does two types of work, a forwards going |
813 | * loop and a backwards going loop. |
814 | * |
815 | * We do this so that on multiprocessor systems we do not create |
816 | * a 'train' of processing, with highly synchronized processes, |
817 | * skewing the whole benchmark. |
818 | */ |
819 | static u64 do_work(u8 *__data, long bytes, int nr, int nr_max, int loop, u64 val) |
820 | { |
821 | long words = bytes/sizeof(u64); |
822 | u64 *data = (void *)__data; |
823 | long chunk_0, chunk_1; |
824 | u64 *d0, *d, *d1; |
825 | long off; |
826 | long i; |
827 | |
828 | BUG_ON(!data && words); |
829 | BUG_ON(data && !words); |
830 | |
831 | if (!data) |
832 | return val; |
833 | |
834 | /* Very simple memset() work variant: */ |
835 | if (g->p.data_zero_memset && !g->p.data_rand_walk) { |
836 | bzero(data, bytes); |
837 | return val; |
838 | } |
839 | |
840 | /* Spread out by PID/TID nr and by loop nr: */ |
841 | chunk_0 = words/nr_max; |
842 | chunk_1 = words/g->p.nr_loops; |
843 | off = nr*chunk_0 + loop*chunk_1; |
844 | |
845 | while (off >= words) |
846 | off -= words; |
847 | |
848 | if (g->p.data_rand_walk) { |
849 | u32 lfsr = nr + loop + val; |
850 | long j; |
851 | |
852 | for (i = 0; i < words/1024; i++) { |
853 | long start, end; |
854 | |
855 | lfsr = lfsr_32(lfsr); |
856 | |
857 | start = lfsr % words; |
858 | end = min(start + 1024, words-1); |
859 | |
860 | if (g->p.data_zero_memset) { |
861 | bzero(data + start, (end-start) * sizeof(u64)); |
862 | } else { |
863 | for (j = start; j < end; j++) |
864 | val = access_data(data: data + j, val); |
865 | } |
866 | } |
867 | } else if (!g->p.data_backwards || (nr + loop) & 1) { |
868 | /* Process data forwards: */ |
869 | |
870 | d0 = data + off; |
871 | d = data + off + 1; |
872 | d1 = data + words; |
873 | |
874 | for (;;) { |
875 | if (unlikely(d >= d1)) |
876 | d = data; |
877 | if (unlikely(d == d0)) |
878 | break; |
879 | |
880 | val = access_data(data: d, val); |
881 | |
882 | d++; |
883 | } |
884 | } else { |
885 | /* Process data backwards: */ |
886 | |
887 | d0 = data + off; |
888 | d = data + off - 1; |
889 | d1 = data + words; |
890 | |
891 | for (;;) { |
892 | if (unlikely(d < data)) |
893 | d = data + words-1; |
894 | if (unlikely(d == d0)) |
895 | break; |
896 | |
897 | val = access_data(data: d, val); |
898 | |
899 | d--; |
900 | } |
901 | } |
902 | |
903 | return val; |
904 | } |
905 | |
906 | static void update_curr_cpu(int task_nr, unsigned long bytes_worked) |
907 | { |
908 | unsigned int cpu; |
909 | |
910 | cpu = sched_getcpu(); |
911 | |
912 | g->threads[task_nr].curr_cpu = cpu; |
913 | prctl(0, bytes_worked); |
914 | } |
915 | |
916 | /* |
917 | * Count the number of nodes a process's threads |
918 | * are spread out on. |
919 | * |
920 | * A count of 1 means that the process is compressed |
921 | * to a single node. A count of g->p.nr_nodes means it's |
922 | * spread out on the whole system. |
923 | */ |
924 | static int count_process_nodes(int process_nr) |
925 | { |
926 | char *node_present; |
927 | int nodes; |
928 | int n, t; |
929 | |
930 | node_present = (char *)malloc(g->p.nr_nodes * sizeof(char)); |
931 | BUG_ON(!node_present); |
932 | for (nodes = 0; nodes < g->p.nr_nodes; nodes++) |
933 | node_present[nodes] = 0; |
934 | |
935 | for (t = 0; t < g->p.nr_threads; t++) { |
936 | struct thread_data *td; |
937 | int task_nr; |
938 | int node; |
939 | |
940 | task_nr = process_nr*g->p.nr_threads + t; |
941 | td = g->threads + task_nr; |
942 | |
943 | node = numa_node_of_cpu(td->curr_cpu); |
944 | if (node < 0) /* curr_cpu was likely still -1 */ { |
945 | free(node_present); |
946 | return 0; |
947 | } |
948 | |
949 | node_present[node] = 1; |
950 | } |
951 | |
952 | nodes = 0; |
953 | |
954 | for (n = 0; n < g->p.nr_nodes; n++) |
955 | nodes += node_present[n]; |
956 | |
957 | free(node_present); |
958 | return nodes; |
959 | } |
960 | |
961 | /* |
962 | * Count the number of distinct process-threads a node contains. |
963 | * |
964 | * A count of 1 means that the node contains only a single |
965 | * process. If all nodes on the system contain at most one |
966 | * process then we are well-converged. |
967 | */ |
968 | static int count_node_processes(int node) |
969 | { |
970 | int processes = 0; |
971 | int t, p; |
972 | |
973 | for (p = 0; p < g->p.nr_proc; p++) { |
974 | for (t = 0; t < g->p.nr_threads; t++) { |
975 | struct thread_data *td; |
976 | int task_nr; |
977 | int n; |
978 | |
979 | task_nr = p*g->p.nr_threads + t; |
980 | td = g->threads + task_nr; |
981 | |
982 | n = numa_node_of_cpu(td->curr_cpu); |
983 | if (n == node) { |
984 | processes++; |
985 | break; |
986 | } |
987 | } |
988 | } |
989 | |
990 | return processes; |
991 | } |
992 | |
993 | static void calc_convergence_compression(int *strong) |
994 | { |
995 | unsigned int nodes_min, nodes_max; |
996 | int p; |
997 | |
998 | nodes_min = -1; |
999 | nodes_max = 0; |
1000 | |
1001 | for (p = 0; p < g->p.nr_proc; p++) { |
1002 | unsigned int nodes = count_process_nodes(process_nr: p); |
1003 | |
1004 | if (!nodes) { |
1005 | *strong = 0; |
1006 | return; |
1007 | } |
1008 | |
1009 | nodes_min = min(nodes, nodes_min); |
1010 | nodes_max = max(nodes, nodes_max); |
1011 | } |
1012 | |
1013 | /* Strong convergence: all threads compress on a single node: */ |
1014 | if (nodes_min == 1 && nodes_max == 1) { |
1015 | *strong = 1; |
1016 | } else { |
1017 | *strong = 0; |
1018 | tprintf(" {%d-%d}" , nodes_min, nodes_max); |
1019 | } |
1020 | } |
1021 | |
1022 | static void calc_convergence(double runtime_ns_max, double *convergence) |
1023 | { |
1024 | unsigned int loops_done_min, loops_done_max; |
1025 | int process_groups; |
1026 | int *nodes; |
1027 | int distance; |
1028 | int nr_min; |
1029 | int nr_max; |
1030 | int strong; |
1031 | int sum; |
1032 | int nr; |
1033 | int node; |
1034 | int cpu; |
1035 | int t; |
1036 | |
1037 | if (!g->p.show_convergence && !g->p.measure_convergence) |
1038 | return; |
1039 | |
1040 | nodes = (int *)malloc(g->p.nr_nodes * sizeof(int)); |
1041 | BUG_ON(!nodes); |
1042 | for (node = 0; node < g->p.nr_nodes; node++) |
1043 | nodes[node] = 0; |
1044 | |
1045 | loops_done_min = -1; |
1046 | loops_done_max = 0; |
1047 | |
1048 | for (t = 0; t < g->p.nr_tasks; t++) { |
1049 | struct thread_data *td = g->threads + t; |
1050 | unsigned int loops_done; |
1051 | |
1052 | cpu = td->curr_cpu; |
1053 | |
1054 | /* Not all threads have written it yet: */ |
1055 | if (cpu < 0) |
1056 | continue; |
1057 | |
1058 | node = numa_node_of_cpu(cpu); |
1059 | |
1060 | nodes[node]++; |
1061 | |
1062 | loops_done = td->loops_done; |
1063 | loops_done_min = min(loops_done, loops_done_min); |
1064 | loops_done_max = max(loops_done, loops_done_max); |
1065 | } |
1066 | |
1067 | nr_max = 0; |
1068 | nr_min = g->p.nr_tasks; |
1069 | sum = 0; |
1070 | |
1071 | for (node = 0; node < g->p.nr_nodes; node++) { |
1072 | if (!is_node_present(node)) |
1073 | continue; |
1074 | nr = nodes[node]; |
1075 | nr_min = min(nr, nr_min); |
1076 | nr_max = max(nr, nr_max); |
1077 | sum += nr; |
1078 | } |
1079 | BUG_ON(nr_min > nr_max); |
1080 | |
1081 | BUG_ON(sum > g->p.nr_tasks); |
1082 | |
1083 | if (0 && (sum < g->p.nr_tasks)) { |
1084 | free(nodes); |
1085 | return; |
1086 | } |
1087 | |
1088 | /* |
1089 | * Count the number of distinct process groups present |
1090 | * on nodes - when we are converged this will decrease |
1091 | * to g->p.nr_proc: |
1092 | */ |
1093 | process_groups = 0; |
1094 | |
1095 | for (node = 0; node < g->p.nr_nodes; node++) { |
1096 | int processes; |
1097 | |
1098 | if (!is_node_present(node)) |
1099 | continue; |
1100 | processes = count_node_processes(node); |
1101 | nr = nodes[node]; |
1102 | tprintf(" %2d/%-2d" , nr, processes); |
1103 | |
1104 | process_groups += processes; |
1105 | } |
1106 | |
1107 | distance = nr_max - nr_min; |
1108 | |
1109 | tprintf(" [%2d/%-2d]" , distance, process_groups); |
1110 | |
1111 | tprintf(" l:%3d-%-3d (%3d)" , |
1112 | loops_done_min, loops_done_max, loops_done_max-loops_done_min); |
1113 | |
1114 | if (loops_done_min && loops_done_max) { |
1115 | double skew = 1.0 - (double)loops_done_min/loops_done_max; |
1116 | |
1117 | tprintf(" [%4.1f%%]" , skew * 100.0); |
1118 | } |
1119 | |
1120 | calc_convergence_compression(strong: &strong); |
1121 | |
1122 | if (strong && process_groups == g->p.nr_proc) { |
1123 | if (!*convergence) { |
1124 | *convergence = runtime_ns_max; |
1125 | tprintf(" (%6.1fs converged)\n" , *convergence / NSEC_PER_SEC); |
1126 | if (g->p.measure_convergence) { |
1127 | g->all_converged = true; |
1128 | g->stop_work = true; |
1129 | } |
1130 | } |
1131 | } else { |
1132 | if (*convergence) { |
1133 | tprintf(" (%6.1fs de-converged)" , runtime_ns_max / NSEC_PER_SEC); |
1134 | *convergence = 0; |
1135 | } |
1136 | tprintf("\n" ); |
1137 | } |
1138 | |
1139 | free(nodes); |
1140 | } |
1141 | |
1142 | static void show_summary(double runtime_ns_max, int l, double *convergence) |
1143 | { |
1144 | tprintf("\r # %5.1f%% [%.1f mins]" , |
1145 | (double)(l+1)/g->p.nr_loops*100.0, runtime_ns_max / NSEC_PER_SEC / 60.0); |
1146 | |
1147 | calc_convergence(runtime_ns_max, convergence); |
1148 | |
1149 | if (g->p.show_details >= 0) |
1150 | fflush(stdout); |
1151 | } |
1152 | |
1153 | static void *worker_thread(void *__tdata) |
1154 | { |
1155 | struct thread_data *td = __tdata; |
1156 | struct timeval start0, start, stop, diff; |
1157 | int process_nr = td->process_nr; |
1158 | int thread_nr = td->thread_nr; |
1159 | unsigned long last_perturbance; |
1160 | int task_nr = td->task_nr; |
1161 | int details = g->p.show_details; |
1162 | int first_task, last_task; |
1163 | double convergence = 0; |
1164 | u64 val = td->val; |
1165 | double runtime_ns_max; |
1166 | u8 *global_data; |
1167 | u8 *process_data; |
1168 | u8 *thread_data; |
1169 | u64 bytes_done, secs; |
1170 | long work_done; |
1171 | u32 l; |
1172 | struct rusage rusage; |
1173 | |
1174 | bind_to_cpumask(td->bind_cpumask); |
1175 | bind_to_memnode(node: td->bind_node); |
1176 | |
1177 | set_taskname("thread %d/%d" , process_nr, thread_nr); |
1178 | |
1179 | global_data = g->data; |
1180 | process_data = td->process_data; |
1181 | thread_data = setup_private_data(g->p.bytes_thread); |
1182 | |
1183 | bytes_done = 0; |
1184 | |
1185 | last_task = 0; |
1186 | if (process_nr == g->p.nr_proc-1 && thread_nr == g->p.nr_threads-1) |
1187 | last_task = 1; |
1188 | |
1189 | first_task = 0; |
1190 | if (process_nr == 0 && thread_nr == 0) |
1191 | first_task = 1; |
1192 | |
1193 | if (details >= 2) { |
1194 | printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n" , |
1195 | process_nr, thread_nr, global_data, process_data, thread_data); |
1196 | } |
1197 | |
1198 | if (g->p.serialize_startup) { |
1199 | mutex_lock(&g->startup_mutex); |
1200 | g->nr_tasks_started++; |
1201 | /* The last thread wakes the main process. */ |
1202 | if (g->nr_tasks_started == g->p.nr_tasks) |
1203 | cond_signal(cnd: &g->startup_cond); |
1204 | |
1205 | mutex_unlock(mtx: &g->startup_mutex); |
1206 | |
1207 | /* Here we will wait for the main process to start us all at once: */ |
1208 | mutex_lock(&g->start_work_mutex); |
1209 | g->start_work = false; |
1210 | g->nr_tasks_working++; |
1211 | while (!g->start_work) |
1212 | cond_wait(cnd: &g->start_work_cond, mtx: &g->start_work_mutex); |
1213 | |
1214 | mutex_unlock(mtx: &g->start_work_mutex); |
1215 | } |
1216 | |
1217 | gettimeofday(&start0, NULL); |
1218 | |
1219 | start = stop = start0; |
1220 | last_perturbance = start.tv_sec; |
1221 | |
1222 | for (l = 0; l < g->p.nr_loops; l++) { |
1223 | start = stop; |
1224 | |
1225 | if (g->stop_work) |
1226 | break; |
1227 | |
1228 | val += do_work(data: global_data, bytes: g->p.bytes_global, nr: process_nr, nr_max: g->p.nr_proc, loop: l, val); |
1229 | val += do_work(data: process_data, bytes: g->p.bytes_process, nr: thread_nr, nr_max: g->p.nr_threads, loop: l, val); |
1230 | val += do_work(data: thread_data, bytes: g->p.bytes_thread, nr: 0, nr_max: 1, loop: l, val); |
1231 | |
1232 | if (g->p.sleep_usecs) { |
1233 | mutex_lock(td->process_lock); |
1234 | usleep(g->p.sleep_usecs); |
1235 | mutex_unlock(mtx: td->process_lock); |
1236 | } |
1237 | /* |
1238 | * Amount of work to be done under a process-global lock: |
1239 | */ |
1240 | if (g->p.bytes_process_locked) { |
1241 | mutex_lock(td->process_lock); |
1242 | val += do_work(data: process_data, bytes: g->p.bytes_process_locked, nr: thread_nr, nr_max: g->p.nr_threads, loop: l, val); |
1243 | mutex_unlock(mtx: td->process_lock); |
1244 | } |
1245 | |
1246 | work_done = g->p.bytes_global + g->p.bytes_process + |
1247 | g->p.bytes_process_locked + g->p.bytes_thread; |
1248 | |
1249 | update_curr_cpu(task_nr, bytes_worked: work_done); |
1250 | bytes_done += work_done; |
1251 | |
1252 | if (details < 0 && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs) |
1253 | continue; |
1254 | |
1255 | td->loops_done = l; |
1256 | |
1257 | gettimeofday(&stop, NULL); |
1258 | |
1259 | /* Check whether our max runtime timed out: */ |
1260 | if (g->p.nr_secs) { |
1261 | timersub(&stop, &start0, &diff); |
1262 | if ((u32)diff.tv_sec >= g->p.nr_secs) { |
1263 | g->stop_work = true; |
1264 | break; |
1265 | } |
1266 | } |
1267 | |
1268 | /* Update the summary at most once per second: */ |
1269 | if (start.tv_sec == stop.tv_sec) |
1270 | continue; |
1271 | |
1272 | /* |
1273 | * Perturb the first task's equilibrium every g->p.perturb_secs seconds, |
1274 | * by migrating to CPU#0: |
1275 | */ |
1276 | if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) { |
1277 | cpu_set_t *orig_mask; |
1278 | int target_cpu; |
1279 | int this_cpu; |
1280 | |
1281 | last_perturbance = stop.tv_sec; |
1282 | |
1283 | /* |
1284 | * Depending on where we are running, move into |
1285 | * the other half of the system, to create some |
1286 | * real disturbance: |
1287 | */ |
1288 | this_cpu = g->threads[task_nr].curr_cpu; |
1289 | if (this_cpu < g->p.nr_cpus/2) |
1290 | target_cpu = g->p.nr_cpus-1; |
1291 | else |
1292 | target_cpu = 0; |
1293 | |
1294 | orig_mask = bind_to_cpu(target_cpu); |
1295 | |
1296 | /* Here we are running on the target CPU already */ |
1297 | if (details >= 1) |
1298 | printf(" (injecting perturbalance, moved to CPU#%d)\n" , target_cpu); |
1299 | |
1300 | bind_to_cpumask(orig_mask); |
1301 | CPU_FREE(orig_mask); |
1302 | } |
1303 | |
1304 | if (details >= 3) { |
1305 | timersub(&stop, &start, &diff); |
1306 | runtime_ns_max = diff.tv_sec * NSEC_PER_SEC; |
1307 | runtime_ns_max += diff.tv_usec * NSEC_PER_USEC; |
1308 | |
1309 | if (details >= 0) { |
1310 | printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016" PRIx64"]\n" , |
1311 | process_nr, thread_nr, runtime_ns_max / bytes_done, val); |
1312 | } |
1313 | fflush(stdout); |
1314 | } |
1315 | if (!last_task) |
1316 | continue; |
1317 | |
1318 | timersub(&stop, &start0, &diff); |
1319 | runtime_ns_max = diff.tv_sec * NSEC_PER_SEC; |
1320 | runtime_ns_max += diff.tv_usec * NSEC_PER_USEC; |
1321 | |
1322 | show_summary(runtime_ns_max, l, convergence: &convergence); |
1323 | } |
1324 | |
1325 | gettimeofday(&stop, NULL); |
1326 | timersub(&stop, &start0, &diff); |
1327 | td->runtime_ns = diff.tv_sec * NSEC_PER_SEC; |
1328 | td->runtime_ns += diff.tv_usec * NSEC_PER_USEC; |
1329 | secs = td->runtime_ns / NSEC_PER_SEC; |
1330 | td->speed_gbs = secs ? bytes_done / secs / 1e9 : 0; |
1331 | |
1332 | getrusage(RUSAGE_THREAD, &rusage); |
1333 | td->system_time_ns = rusage.ru_stime.tv_sec * NSEC_PER_SEC; |
1334 | td->system_time_ns += rusage.ru_stime.tv_usec * NSEC_PER_USEC; |
1335 | td->user_time_ns = rusage.ru_utime.tv_sec * NSEC_PER_SEC; |
1336 | td->user_time_ns += rusage.ru_utime.tv_usec * NSEC_PER_USEC; |
1337 | |
1338 | free_data(data: thread_data, bytes: g->p.bytes_thread); |
1339 | |
1340 | mutex_lock(&g->stop_work_mutex); |
1341 | g->bytes_done += bytes_done; |
1342 | mutex_unlock(mtx: &g->stop_work_mutex); |
1343 | |
1344 | return NULL; |
1345 | } |
1346 | |
1347 | /* |
1348 | * A worker process starts a couple of threads: |
1349 | */ |
1350 | static void worker_process(int process_nr) |
1351 | { |
1352 | struct mutex process_lock; |
1353 | struct thread_data *td; |
1354 | pthread_t *pthreads; |
1355 | u8 *process_data; |
1356 | int task_nr; |
1357 | int ret; |
1358 | int t; |
1359 | |
1360 | mutex_init(&process_lock); |
1361 | set_taskname("process %d" , process_nr); |
1362 | |
1363 | /* |
1364 | * Pick up the memory policy and the CPU binding of our first thread, |
1365 | * so that we initialize memory accordingly: |
1366 | */ |
1367 | task_nr = process_nr*g->p.nr_threads; |
1368 | td = g->threads + task_nr; |
1369 | |
1370 | bind_to_memnode(node: td->bind_node); |
1371 | bind_to_cpumask(td->bind_cpumask); |
1372 | |
1373 | pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t)); |
1374 | process_data = setup_private_data(g->p.bytes_process); |
1375 | |
1376 | if (g->p.show_details >= 3) { |
1377 | printf(" # process %2d global mem: %p, process mem: %p\n" , |
1378 | process_nr, g->data, process_data); |
1379 | } |
1380 | |
1381 | for (t = 0; t < g->p.nr_threads; t++) { |
1382 | task_nr = process_nr*g->p.nr_threads + t; |
1383 | td = g->threads + task_nr; |
1384 | |
1385 | td->process_data = process_data; |
1386 | td->process_nr = process_nr; |
1387 | td->thread_nr = t; |
1388 | td->task_nr = task_nr; |
1389 | td->val = rand(); |
1390 | td->curr_cpu = -1; |
1391 | td->process_lock = &process_lock; |
1392 | |
1393 | ret = pthread_create(pthreads + t, NULL, worker_thread, td); |
1394 | BUG_ON(ret); |
1395 | } |
1396 | |
1397 | for (t = 0; t < g->p.nr_threads; t++) { |
1398 | ret = pthread_join(pthreads[t], NULL); |
1399 | BUG_ON(ret); |
1400 | } |
1401 | |
1402 | free_data(data: process_data, bytes: g->p.bytes_process); |
1403 | free(pthreads); |
1404 | } |
1405 | |
1406 | static void print_summary(void) |
1407 | { |
1408 | if (g->p.show_details < 0) |
1409 | return; |
1410 | |
1411 | printf("\n ###\n" ); |
1412 | printf(" # %d %s will execute (on %d nodes, %d CPUs):\n" , |
1413 | g->p.nr_tasks, g->p.nr_tasks == 1 ? "task" : "tasks" , nr_numa_nodes(), g->p.nr_cpus); |
1414 | printf(" # %5dx %5ldMB global shared mem operations\n" , |
1415 | g->p.nr_loops, g->p.bytes_global/1024/1024); |
1416 | printf(" # %5dx %5ldMB process shared mem operations\n" , |
1417 | g->p.nr_loops, g->p.bytes_process/1024/1024); |
1418 | printf(" # %5dx %5ldMB thread local mem operations\n" , |
1419 | g->p.nr_loops, g->p.bytes_thread/1024/1024); |
1420 | |
1421 | printf(" ###\n" ); |
1422 | |
1423 | printf("\n ###\n" ); fflush(stdout); |
1424 | } |
1425 | |
1426 | static void init_thread_data(void) |
1427 | { |
1428 | ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; |
1429 | int t; |
1430 | |
1431 | g->threads = zalloc_shared_data(bytes: size); |
1432 | |
1433 | for (t = 0; t < g->p.nr_tasks; t++) { |
1434 | struct thread_data *td = g->threads + t; |
1435 | size_t cpuset_size = CPU_ALLOC_SIZE(g->p.nr_cpus); |
1436 | int cpu; |
1437 | |
1438 | /* Allow all nodes by default: */ |
1439 | td->bind_node = NUMA_NO_NODE; |
1440 | |
1441 | /* Allow all CPUs by default: */ |
1442 | td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus); |
1443 | BUG_ON(!td->bind_cpumask); |
1444 | CPU_ZERO_S(cpuset_size, td->bind_cpumask); |
1445 | for (cpu = 0; cpu < g->p.nr_cpus; cpu++) |
1446 | CPU_SET_S(cpu, cpuset_size, td->bind_cpumask); |
1447 | } |
1448 | } |
1449 | |
1450 | static void deinit_thread_data(void) |
1451 | { |
1452 | ssize_t size = sizeof(*g->threads)*g->p.nr_tasks; |
1453 | int t; |
1454 | |
1455 | /* Free the bind_cpumask allocated for thread_data */ |
1456 | for (t = 0; t < g->p.nr_tasks; t++) { |
1457 | struct thread_data *td = g->threads + t; |
1458 | CPU_FREE(td->bind_cpumask); |
1459 | } |
1460 | |
1461 | free_data(data: g->threads, bytes: size); |
1462 | } |
1463 | |
1464 | static int init(void) |
1465 | { |
1466 | g = (void *)alloc_data(sizeof(*g), MAP_SHARED, 1, 0, 0 /* THP */, 0); |
1467 | |
1468 | /* Copy over options: */ |
1469 | g->p = p0; |
1470 | |
1471 | g->p.nr_cpus = numa_num_configured_cpus(); |
1472 | |
1473 | g->p.nr_nodes = numa_max_node() + 1; |
1474 | |
1475 | /* char array in count_process_nodes(): */ |
1476 | BUG_ON(g->p.nr_nodes < 0); |
1477 | |
1478 | if (quiet && !g->p.show_details) |
1479 | g->p.show_details = -1; |
1480 | |
1481 | /* Some memory should be specified: */ |
1482 | if (!g->p.mb_global_str && !g->p.mb_proc_str && !g->p.mb_thread_str) |
1483 | return -1; |
1484 | |
1485 | if (g->p.mb_global_str) { |
1486 | g->p.mb_global = atof(g->p.mb_global_str); |
1487 | BUG_ON(g->p.mb_global < 0); |
1488 | } |
1489 | |
1490 | if (g->p.mb_proc_str) { |
1491 | g->p.mb_proc = atof(g->p.mb_proc_str); |
1492 | BUG_ON(g->p.mb_proc < 0); |
1493 | } |
1494 | |
1495 | if (g->p.mb_proc_locked_str) { |
1496 | g->p.mb_proc_locked = atof(g->p.mb_proc_locked_str); |
1497 | BUG_ON(g->p.mb_proc_locked < 0); |
1498 | BUG_ON(g->p.mb_proc_locked > g->p.mb_proc); |
1499 | } |
1500 | |
1501 | if (g->p.mb_thread_str) { |
1502 | g->p.mb_thread = atof(g->p.mb_thread_str); |
1503 | BUG_ON(g->p.mb_thread < 0); |
1504 | } |
1505 | |
1506 | BUG_ON(g->p.nr_threads <= 0); |
1507 | BUG_ON(g->p.nr_proc <= 0); |
1508 | |
1509 | g->p.nr_tasks = g->p.nr_proc*g->p.nr_threads; |
1510 | |
1511 | g->p.bytes_global = g->p.mb_global *1024L*1024L; |
1512 | g->p.bytes_process = g->p.mb_proc *1024L*1024L; |
1513 | g->p.bytes_process_locked = g->p.mb_proc_locked *1024L*1024L; |
1514 | g->p.bytes_thread = g->p.mb_thread *1024L*1024L; |
1515 | |
1516 | g->data = setup_shared_data(g->p.bytes_global); |
1517 | |
1518 | /* Startup serialization: */ |
1519 | mutex_init_pshared(mtx: &g->start_work_mutex); |
1520 | cond_init_pshared(cnd: &g->start_work_cond); |
1521 | mutex_init_pshared(mtx: &g->startup_mutex); |
1522 | cond_init_pshared(cnd: &g->startup_cond); |
1523 | mutex_init_pshared(mtx: &g->stop_work_mutex); |
1524 | |
1525 | init_thread_data(); |
1526 | |
1527 | tprintf("#\n" ); |
1528 | if (parse_setup_cpu_list() || parse_setup_node_list()) |
1529 | return -1; |
1530 | tprintf("#\n" ); |
1531 | |
1532 | print_summary(); |
1533 | |
1534 | return 0; |
1535 | } |
1536 | |
1537 | static void deinit(void) |
1538 | { |
1539 | free_data(data: g->data, bytes: g->p.bytes_global); |
1540 | g->data = NULL; |
1541 | |
1542 | deinit_thread_data(); |
1543 | |
1544 | free_data(data: g, bytes: sizeof(*g)); |
1545 | g = NULL; |
1546 | } |
1547 | |
1548 | /* |
1549 | * Print a short or long result, depending on the verbosity setting: |
1550 | */ |
1551 | static void print_res(const char *name, double val, |
1552 | const char *txt_unit, const char *txt_short, const char *txt_long) |
1553 | { |
1554 | if (!name) |
1555 | name = "main," ; |
1556 | |
1557 | if (!quiet) |
1558 | printf(" %-30s %15.3f, %-15s %s\n" , name, val, txt_unit, txt_short); |
1559 | else |
1560 | printf(" %14.3f %s\n" , val, txt_long); |
1561 | } |
1562 | |
1563 | static int __bench_numa(const char *name) |
1564 | { |
1565 | struct timeval start, stop, diff; |
1566 | u64 runtime_ns_min, runtime_ns_sum; |
1567 | pid_t *pids, pid, wpid; |
1568 | double delta_runtime; |
1569 | double runtime_avg; |
1570 | double runtime_sec_max; |
1571 | double runtime_sec_min; |
1572 | int wait_stat; |
1573 | double bytes; |
1574 | int i, t, p; |
1575 | |
1576 | if (init()) |
1577 | return -1; |
1578 | |
1579 | pids = zalloc(g->p.nr_proc * sizeof(*pids)); |
1580 | pid = -1; |
1581 | |
1582 | if (g->p.serialize_startup) { |
1583 | tprintf(" #\n" ); |
1584 | tprintf(" # Startup synchronization: ..." ); fflush(stdout); |
1585 | } |
1586 | |
1587 | gettimeofday(&start, NULL); |
1588 | |
1589 | for (i = 0; i < g->p.nr_proc; i++) { |
1590 | pid = fork(); |
1591 | dprintf(" # process %2d: PID %d\n" , i, pid); |
1592 | |
1593 | BUG_ON(pid < 0); |
1594 | if (!pid) { |
1595 | /* Child process: */ |
1596 | worker_process(process_nr: i); |
1597 | |
1598 | exit(0); |
1599 | } |
1600 | pids[i] = pid; |
1601 | |
1602 | } |
1603 | |
1604 | if (g->p.serialize_startup) { |
1605 | bool threads_ready = false; |
1606 | double startup_sec; |
1607 | |
1608 | /* |
1609 | * Wait for all the threads to start up. The last thread will |
1610 | * signal this process. |
1611 | */ |
1612 | mutex_lock(&g->startup_mutex); |
1613 | while (g->nr_tasks_started != g->p.nr_tasks) |
1614 | cond_wait(cnd: &g->startup_cond, mtx: &g->startup_mutex); |
1615 | |
1616 | mutex_unlock(mtx: &g->startup_mutex); |
1617 | |
1618 | /* Wait for all threads to be at the start_work_cond. */ |
1619 | while (!threads_ready) { |
1620 | mutex_lock(&g->start_work_mutex); |
1621 | threads_ready = (g->nr_tasks_working == g->p.nr_tasks); |
1622 | mutex_unlock(mtx: &g->start_work_mutex); |
1623 | if (!threads_ready) |
1624 | usleep(1); |
1625 | } |
1626 | |
1627 | gettimeofday(&stop, NULL); |
1628 | |
1629 | timersub(&stop, &start, &diff); |
1630 | |
1631 | startup_sec = diff.tv_sec * NSEC_PER_SEC; |
1632 | startup_sec += diff.tv_usec * NSEC_PER_USEC; |
1633 | startup_sec /= NSEC_PER_SEC; |
1634 | |
1635 | tprintf(" threads initialized in %.6f seconds.\n" , startup_sec); |
1636 | tprintf(" #\n" ); |
1637 | |
1638 | start = stop; |
1639 | /* Start all threads running. */ |
1640 | mutex_lock(&g->start_work_mutex); |
1641 | g->start_work = true; |
1642 | mutex_unlock(mtx: &g->start_work_mutex); |
1643 | cond_broadcast(cnd: &g->start_work_cond); |
1644 | } else { |
1645 | gettimeofday(&start, NULL); |
1646 | } |
1647 | |
1648 | /* Parent process: */ |
1649 | |
1650 | |
1651 | for (i = 0; i < g->p.nr_proc; i++) { |
1652 | wpid = waitpid(pids[i], &wait_stat, 0); |
1653 | BUG_ON(wpid < 0); |
1654 | BUG_ON(!WIFEXITED(wait_stat)); |
1655 | |
1656 | } |
1657 | |
1658 | runtime_ns_sum = 0; |
1659 | runtime_ns_min = -1LL; |
1660 | |
1661 | for (t = 0; t < g->p.nr_tasks; t++) { |
1662 | u64 thread_runtime_ns = g->threads[t].runtime_ns; |
1663 | |
1664 | runtime_ns_sum += thread_runtime_ns; |
1665 | runtime_ns_min = min(thread_runtime_ns, runtime_ns_min); |
1666 | } |
1667 | |
1668 | gettimeofday(&stop, NULL); |
1669 | timersub(&stop, &start, &diff); |
1670 | |
1671 | BUG_ON(bench_format != BENCH_FORMAT_DEFAULT); |
1672 | |
1673 | tprintf("\n ###\n" ); |
1674 | tprintf("\n" ); |
1675 | |
1676 | runtime_sec_max = diff.tv_sec * NSEC_PER_SEC; |
1677 | runtime_sec_max += diff.tv_usec * NSEC_PER_USEC; |
1678 | runtime_sec_max /= NSEC_PER_SEC; |
1679 | |
1680 | runtime_sec_min = runtime_ns_min / NSEC_PER_SEC; |
1681 | |
1682 | bytes = g->bytes_done; |
1683 | runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / NSEC_PER_SEC; |
1684 | |
1685 | if (g->p.measure_convergence) { |
1686 | print_res(name, val: runtime_sec_max, |
1687 | txt_unit: "secs," , txt_short: "NUMA-convergence-latency" , txt_long: "secs latency to NUMA-converge" ); |
1688 | } |
1689 | |
1690 | print_res(name, val: runtime_sec_max, |
1691 | txt_unit: "secs," , txt_short: "runtime-max/thread" , txt_long: "secs slowest (max) thread-runtime" ); |
1692 | |
1693 | print_res(name, val: runtime_sec_min, |
1694 | txt_unit: "secs," , txt_short: "runtime-min/thread" , txt_long: "secs fastest (min) thread-runtime" ); |
1695 | |
1696 | print_res(name, val: runtime_avg, |
1697 | txt_unit: "secs," , txt_short: "runtime-avg/thread" , txt_long: "secs average thread-runtime" ); |
1698 | |
1699 | delta_runtime = (runtime_sec_max - runtime_sec_min)/2.0; |
1700 | print_res(name, val: delta_runtime / runtime_sec_max * 100.0, |
1701 | txt_unit: "%," , txt_short: "spread-runtime/thread" , txt_long: "% difference between max/avg runtime" ); |
1702 | |
1703 | print_res(name, val: bytes / g->p.nr_tasks / 1e9, |
1704 | txt_unit: "GB," , txt_short: "data/thread" , txt_long: "GB data processed, per thread" ); |
1705 | |
1706 | print_res(name, val: bytes / 1e9, |
1707 | txt_unit: "GB," , txt_short: "data-total" , txt_long: "GB data processed, total" ); |
1708 | |
1709 | print_res(name, val: runtime_sec_max * NSEC_PER_SEC / (bytes / g->p.nr_tasks), |
1710 | txt_unit: "nsecs," , txt_short: "runtime/byte/thread" ,txt_long: "nsecs/byte/thread runtime" ); |
1711 | |
1712 | print_res(name, val: bytes / g->p.nr_tasks / 1e9 / runtime_sec_max, |
1713 | txt_unit: "GB/sec," , txt_short: "thread-speed" , txt_long: "GB/sec/thread speed" ); |
1714 | |
1715 | print_res(name, val: bytes / runtime_sec_max / 1e9, |
1716 | txt_unit: "GB/sec," , txt_short: "total-speed" , txt_long: "GB/sec total speed" ); |
1717 | |
1718 | if (g->p.show_details >= 2) { |
1719 | char tname[14 + 2 * 11 + 1]; |
1720 | struct thread_data *td; |
1721 | for (p = 0; p < g->p.nr_proc; p++) { |
1722 | for (t = 0; t < g->p.nr_threads; t++) { |
1723 | memset(tname, 0, sizeof(tname)); |
1724 | td = g->threads + p*g->p.nr_threads + t; |
1725 | snprintf(buf: tname, size: sizeof(tname), fmt: "process%d:thread%d" , p, t); |
1726 | print_res(name: tname, val: td->speed_gbs, |
1727 | txt_unit: "GB/sec" , txt_short: "thread-speed" , txt_long: "GB/sec/thread speed" ); |
1728 | print_res(name: tname, val: td->system_time_ns / NSEC_PER_SEC, |
1729 | txt_unit: "secs" , txt_short: "thread-system-time" , txt_long: "system CPU time/thread" ); |
1730 | print_res(name: tname, val: td->user_time_ns / NSEC_PER_SEC, |
1731 | txt_unit: "secs" , txt_short: "thread-user-time" , txt_long: "user CPU time/thread" ); |
1732 | } |
1733 | } |
1734 | } |
1735 | |
1736 | free(pids); |
1737 | |
1738 | deinit(); |
1739 | |
1740 | return 0; |
1741 | } |
1742 | |
/* Capacity of one row of the built-in test table, including the NULL terminator. */
#define MAX_ARGS 50

/*
 * Count the entries of a NULL-terminated argument vector.
 *
 * @argv: NULL-terminated array of strings
 *
 * Returns the number of entries before the terminator. Trips BUG_ON() if
 * that count is MAX_ARGS or more.
 */
static int command_size(const char **argv)
{
	int nr_args;

	for (nr_args = 0; argv[nr_args]; nr_args++)
		;

	BUG_ON(nr_args >= MAX_ARGS);

	return nr_args;
}
1758 | |
1759 | static void init_params(struct params *p, const char *name, int argc, const char **argv) |
1760 | { |
1761 | int i; |
1762 | |
1763 | printf("\n # Running %s \"perf bench numa" , name); |
1764 | |
1765 | for (i = 0; i < argc; i++) |
1766 | printf(" %s" , argv[i]); |
1767 | |
1768 | printf("\"\n" ); |
1769 | |
1770 | memset(p, 0, sizeof(*p)); |
1771 | |
1772 | /* Initialize nonzero defaults: */ |
1773 | |
1774 | p->serialize_startup = 1; |
1775 | p->data_reads = true; |
1776 | p->data_writes = true; |
1777 | p->data_backwards = true; |
1778 | p->data_rand_walk = true; |
1779 | p->nr_loops = -1; |
1780 | p->init_random = true; |
1781 | p->mb_global_str = "1" ; |
1782 | p->nr_proc = 1; |
1783 | p->nr_threads = 1; |
1784 | p->nr_secs = 5; |
1785 | p->run_all = argc == 1; |
1786 | } |
1787 | |
1788 | static int run_bench_numa(const char *name, const char **argv) |
1789 | { |
1790 | int argc = command_size(argv); |
1791 | |
1792 | init_params(p: &p0, name, argc, argv); |
1793 | argc = parse_options(argc, argv, options, bench_numa_usage, 0); |
1794 | if (argc) |
1795 | goto err; |
1796 | |
1797 | if (__bench_numa(name)) |
1798 | goto err; |
1799 | |
1800 | return 0; |
1801 | |
1802 | err: |
1803 | return -1; |
1804 | } |
1805 | |
/*
 * Option tails shared by the rows of the built-in tests[] table. Each macro
 * expands to a comma-separated list of string literals that is appended to
 * a row's own options.
 *
 * The *_NOTHP variants reuse the base list and append a second
 * "--thp", "-1" pair after the base list's "--thp", " 1".
 * NOTE(review): presumably the later "--thp" value overrides the earlier
 * one during option parsing - confirm against the option table.
 */
#define OPT_BW_RAM		"-s",  "20", "-zZq",    "--thp", " 1", "--no-data_rand_walk"
#define OPT_BW_RAM_NOTHP	OPT_BW_RAM,		"--thp", "-1"

#define OPT_CONV		"-s", "100", "-zZ0qcm", "--thp", " 1"
#define OPT_CONV_NOTHP		OPT_CONV,		"--thp", "-1"

#define OPT_BW			"-s",  "20", "-zZ0q",   "--thp", " 1"
#define OPT_BW_NOTHP		OPT_BW,			"--thp", "-1"
1814 | |
/*
 * The built-in test-suite executed by "perf bench numa -a".
 *
 * (A minimum of 4 nodes and 16 GB of RAM is recommended.)
 *
 * Row layout, as consumed by bench_all(): element 0 is the test name used
 * to label the results, elements 1.. are the argument vector handed to
 * run_bench_numa(). Each row has MAX_ARGS slots; the unused trailing slots
 * are zero-initialized and act as the NULL terminator that command_size()
 * looks for.
 */
static const char *tests[][MAX_ARGS] = {
	/* Basic single-stream NUMA bandwidth measurements: */
	{ "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024",
	  "-C", "0", "-M", "0", OPT_BW_RAM },
	{ "RAM-bw-local-NOTHP,",
	  "mem", "-p", "1", "-t", "1", "-P", "1024",
	  "-C", "0", "-M", "0", OPT_BW_RAM_NOTHP },
	{ "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024",
	  "-C", "0", "-M", "1", OPT_BW_RAM },

	/* 2-stream NUMA bandwidth measurements: */
	{ "RAM-bw-local-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024",
	  "-C", "0,2", "-M", "0x2", OPT_BW_RAM },
	{ "RAM-bw-remote-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024",
	  "-C", "0,2", "-M", "1x2", OPT_BW_RAM },

	/* Cross-stream NUMA bandwidth measurement: */
	{ "RAM-bw-cross,", "mem", "-p", "2", "-t", "1", "-P", "1024",
	  "-C", "0,8", "-M", "1,0", OPT_BW_RAM },

	/* Convergence latency measurements: */
	{ " 1x3-convergence,", "mem", "-p", "1", "-t", "3", "-P", "512", OPT_CONV },
	{ " 1x4-convergence,", "mem", "-p", "1", "-t", "4", "-P", "512", OPT_CONV },
	{ " 1x6-convergence,", "mem", "-p", "1", "-t", "6", "-P", "1020", OPT_CONV },
	{ " 2x3-convergence,", "mem", "-p", "2", "-t", "3", "-P", "1020", OPT_CONV },
	{ " 3x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV },
	{ " 4x4-convergence,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV },
	{ " 4x4-convergence-NOTHP,",
	  "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV_NOTHP },
	{ " 4x6-convergence,", "mem", "-p", "4", "-t", "6", "-P", "1020", OPT_CONV },
	{ " 4x8-convergence,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_CONV },
	{ " 8x4-convergence,", "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV },
	{ " 8x4-convergence-NOTHP,",
	  "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV_NOTHP },
	{ " 3x1-convergence,", "mem", "-p", "3", "-t", "1", "-P", "512", OPT_CONV },
	{ " 4x1-convergence,", "mem", "-p", "4", "-t", "1", "-P", "512", OPT_CONV },
	{ " 8x1-convergence,", "mem", "-p", "8", "-t", "1", "-P", "512", OPT_CONV },
	{ "16x1-convergence,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_CONV },
	{ "32x1-convergence,", "mem", "-p", "32", "-t", "1", "-P", "128", OPT_CONV },

	/* Various NUMA process/thread layout bandwidth measurements: */
	{ " 2x1-bw-process,", "mem", "-p", "2", "-t", "1", "-P", "1024", OPT_BW },
	{ " 3x1-bw-process,", "mem", "-p", "3", "-t", "1", "-P", "1024", OPT_BW },
	{ " 4x1-bw-process,", "mem", "-p", "4", "-t", "1", "-P", "1024", OPT_BW },
	{ " 8x1-bw-process,", "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW },
	{ " 8x1-bw-process-NOTHP,",
	  "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW_NOTHP },
	{ "16x1-bw-process,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_BW },

	/* Single process, per-thread memory ("-T") instead of per-process ("-P"): */
	{ " 1x4-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW },
	{ " 1x8-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW },
	{ "1x16-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW },
	{ "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW },

	/* Multi-threaded processes sharing per-process memory: */
	{ " 2x3-bw-process,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW },
	{ " 4x4-bw-process,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW },
	{ " 4x6-bw-process,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW },
	{ " 4x8-bw-process,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW },
	{ " 4x8-bw-process-NOTHP,",
	  "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW_NOTHP },
	{ " 3x3-bw-process,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW },
	{ " 5x5-bw-process,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW },

	{ "2x16-bw-process,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW },
	{ "1x32-bw-process,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW },

	{ "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW },
	{ "numa02-bw-NOTHP,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW_NOTHP },
	{ "numa01-bw-thread,", "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW },
	{ "numa01-bw-thread-NOTHP,",
	  "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW_NOTHP },
};
1892 | |
1893 | static int bench_all(void) |
1894 | { |
1895 | int nr = ARRAY_SIZE(tests); |
1896 | int ret; |
1897 | int i; |
1898 | |
1899 | ret = system("echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'" ); |
1900 | BUG_ON(ret < 0); |
1901 | |
1902 | for (i = 0; i < nr; i++) { |
1903 | run_bench_numa(name: tests[i][0], argv: tests[i] + 1); |
1904 | } |
1905 | |
1906 | printf("\n" ); |
1907 | |
1908 | return 0; |
1909 | } |
1910 | |
1911 | int bench_numa(int argc, const char **argv) |
1912 | { |
1913 | init_params(p: &p0, name: "main," , argc, argv); |
1914 | argc = parse_options(argc, argv, options, bench_numa_usage, 0); |
1915 | if (argc) |
1916 | goto err; |
1917 | |
1918 | if (p0.run_all) |
1919 | return bench_all(); |
1920 | |
1921 | if (__bench_numa(NULL)) |
1922 | goto err; |
1923 | |
1924 | return 0; |
1925 | |
1926 | err: |
1927 | usage_with_options(numa_usage, options); |
1928 | return -1; |
1929 | } |
1930 | |