numa.c source code [linux/tools/perf/bench/numa.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* numa.c
4	*
5	* numa: Simulate NUMA-sensitive workload and measure their NUMA performance
6	*/
7
8	#include <inttypes.h>
9
10	#include <subcmd/parse-options.h>
11	#include "../util/cloexec.h"
12
13	#include "bench.h"
14
15	#include <errno.h>
16	#include <sched.h>
17	#include <stdio.h>
18	#include <assert.h>
19	#include <debug.h>
20	#include <malloc.h>
21	#include <signal.h>
22	#include <stdlib.h>
23	#include <string.h>
24	#include <unistd.h>
25	#include <sys/mman.h>
26	#include <sys/time.h>
27	#include <sys/resource.h>
28	#include <sys/wait.h>
29	#include <sys/prctl.h>
30	#include <sys/types.h>
31	#include <linux/kernel.h>
32	#include <linux/time64.h>
33	#include <linux/numa.h>
34	#include <linux/zalloc.h>
35
36	#include "../util/header.h"
37	#include "../util/mutex.h"
38	#include <numa.h>
39	#include <numaif.h>
40
#ifndef RUSAGE_THREAD
# define RUSAGE_THREAD 1
#endif

/*
 * Regular printout to the terminal, suppressed if -q is specified.
 * Nothing is printed while the global state 'g' is still NULL or when
 * g->p.show_details is negative:
 */
#define tprintf(x...) do { if (g && g->p.show_details >= 0) printf(x); } while (0)

/*
 * Debug printf, only active when g->p.show_details is at least 1.
 * Any dprintf definition inherited from an included header is dropped first:
 */
#undef dprintf
#define dprintf(x...) do { if (g && g->p.show_details >= 1) printf(x); } while (0)
55
56	struct thread_data {
57	int curr_cpu;
58	cpu_set_t *bind_cpumask;
59	int bind_node;
60	u8 *process_data;
61	int process_nr;
62	int thread_nr;
63	int task_nr;
64	unsigned int loops_done;
65	u64 val;
66	u64 runtime_ns;
67	u64 system_time_ns;
68	u64 user_time_ns;
69	double speed_gbs;
70	struct mutex *process_lock;
71	};
72
73	/ Parameters set by options: /
74
75	struct params {
76	/ Startup synchronization: /
77	bool serialize_startup;
78
79	/ Task hierarchy: /
80	int nr_proc;
81	int nr_threads;
82
83	/ Working set sizes: /
84	const char *mb_global_str;
85	const char *mb_proc_str;
86	const char *mb_proc_locked_str;
87	const char *mb_thread_str;
88
89	double mb_global;
90	double mb_proc;
91	double mb_proc_locked;
92	double mb_thread;
93
94	/ Access patterns to the working set: /
95	bool data_reads;
96	bool data_writes;
97	bool data_backwards;
98	bool data_zero_memset;
99	bool data_rand_walk;
100	u32 nr_loops;
101	u32 nr_secs;
102	u32 sleep_usecs;
103
104	/ Working set initialization: /
105	bool init_zero;
106	bool init_random;
107	bool init_cpu0;
108
109	/ Misc options: /
110	int show_details;
111	int run_all;
112	int thp;
113
114	long bytes_global;
115	long bytes_process;
116	long bytes_process_locked;
117	long bytes_thread;
118
119	int nr_tasks;
120
121	bool show_convergence;
122	bool measure_convergence;
123
124	int perturb_secs;
125	int nr_cpus;
126	int nr_nodes;
127
128	/ Affinity options -C and -N: /
129	char *cpu_list_str;
130	char *node_list_str;
131	};
132
133
134	/ Global, read-writable area, accessible to all processes and threads: /
135
136	struct global_info {
137	u8 *data;
138
139	struct mutex startup_mutex;
140	struct cond startup_cond;
141	int nr_tasks_started;
142
143	struct mutex start_work_mutex;
144	struct cond start_work_cond;
145	int nr_tasks_working;
146	bool start_work;
147
148	struct mutex stop_work_mutex;
149	u64 bytes_done;
150
151	struct thread_data *threads;
152
153	/ Convergence latency measurement: /
154	bool all_converged;
155	bool stop_work;
156
157	int print_once;
158
159	struct params p;
160	};
161
162	static struct global_info *g = NULL;
163
164	static int parse_cpus_opt(const struct option opt, const* char arg, int* unset);
165	static int parse_nodes_opt(const struct option opt, const* char arg, int* unset);
166
167	struct params p0;
168
169	static const struct option options[] = {
170	OPT_INTEGER(`'p'`, "nr_proc" , &p0.nr_proc, "number of processes"),
171	OPT_INTEGER(`'t'`, "nr_threads" , &p0.nr_threads, "number of threads per process"),
172
173	OPT_STRING(`'G'`, "mb_global" , &p0.mb_global_str, "MB", "global memory (MBs)"),
174	OPT_STRING(`'P'`, "mb_proc" , &p0.mb_proc_str, "MB", "process memory (MBs)"),
175	OPT_STRING(`'L'`, "mb_proc_locked", &p0.mb_proc_locked_str,"MB", "process serialized/locked memory access (MBs), <= process_memory"),
176	OPT_STRING(`'T'`, "mb_thread" , &p0.mb_thread_str, "MB", "thread memory (MBs)"),
177
178	OPT_UINTEGER(`'l'`, "nr_loops" , &p0.nr_loops, "max number of loops to run (default: unlimited)"),
179	OPT_UINTEGER(`'s'`, "nr_secs" , &p0.nr_secs, "max number of seconds to run (default: 5 secs)"),
180	OPT_UINTEGER(`'u'`, "usleep" , &p0.sleep_usecs, "usecs to sleep per loop iteration"),
181
182	OPT_BOOLEAN(`'R'`, "data_reads" , &p0.data_reads, "access the data via reads (can be mixed with -W)"),
183	OPT_BOOLEAN(`'W'`, "data_writes" , &p0.data_writes, "access the data via writes (can be mixed with -R)"),
184	OPT_BOOLEAN(`'B'`, "data_backwards", &p0.data_backwards, "access the data backwards as well"),
185	OPT_BOOLEAN(`'Z'`, "data_zero_memset", &p0.data_zero_memset,"access the data via glibc bzero only"),
186	OPT_BOOLEAN(`'r'`, "data_rand_walk", &p0.data_rand_walk, "access the data with random (32bit LFSR) walk"),
187
188
189	OPT_BOOLEAN(`'z'`, "init_zero" , &p0.init_zero, "bzero the initial allocations"),
190	OPT_BOOLEAN(`'I'`, "init_random" , &p0.init_random, "randomize the contents of the initial allocations"),
191	OPT_BOOLEAN(`'0'`, "init_cpu0" , &p0.init_cpu0, "do the initial allocations on CPU#0"),
192	OPT_INTEGER(`'x'`, "perturb_secs", &p0.perturb_secs, "perturb thread 0/0 every X secs, to test convergence stability"),
193
194	OPT_INCR (`'d'`, "show_details" , &p0.show_details, "Show details"),
195	OPT_INCR (`'a'`, "all" , &p0.run_all, "Run all tests in the suite"),
196	OPT_INTEGER(`'H'`, "thp" , &p0.thp, "MADV_NOHUGEPAGE < 0 < MADV_HUGEPAGE"),
197	OPT_BOOLEAN(`'c'`, "show_convergence", &p0.show_convergence, "show convergence details, "
198	"convergence is reached when each process (all its threads) is running on a single NUMA node."),
199	OPT_BOOLEAN(`'m'`, "measure_convergence", &p0.measure_convergence, "measure convergence latency"),
200	OPT_BOOLEAN(`'q'`, "quiet" , &quiet,
201	"quiet mode (do not show any warnings or messages)"),
202	OPT_BOOLEAN(`'S'`, "serialize-startup", &p0.serialize_startup,"serialize thread startup"),
203
204	/ Special option string parsing callbacks: /
205	OPT_CALLBACK(`'C'`, "cpus", NULL, "cpu[,cpu2,...cpuN]",
206	"bind the first N tasks to these specific cpus (the rest is unbound)",
207	parse_cpus_opt),
208	OPT_CALLBACK(`'M'`, "memnodes", NULL, "node[,node2,...nodeN]",
209	"bind the first N tasks to these specific memory nodes (the rest is unbound)",
210	parse_nodes_opt),
211	OPT_END()
212	};
213
/* Usage banner for the "perf bench numa" command group. */
static const char * const bench_numa_usage[] = {
	"perf bench numa <options>",
	NULL
};

/* Usage banner for the "perf bench numa mem" sub-command. */
static const char * const numa_usage[] = {
	"perf bench numa mem [<options>]",
	NULL
};
223
224	/*
225	* To get number of numa nodes present.
226	*/
227	static int nr_numa_nodes(void)
228	{
229	int i, nr_nodes = `0`;
230
231	for (i = `0`; i < g->p.nr_nodes; i++) {
232	if (numa_bitmask_isbitset(numa_nodes_ptr, i))
233	nr_nodes++;
234	}
235
236	return nr_nodes;
237	}
238
239	/*
240	* To check if given numa node is present.
241	*/
242	static int is_node_present(int node)
243	{
244	return numa_bitmask_isbitset(numa_nodes_ptr, node);
245	}
246
247	/*
248	* To check given numa node has cpus.
249	*/
250	static bool node_has_cpus(int node)
251	{
252	struct bitmask *cpumask = numa_allocate_cpumask();
253	bool ret = false; / fall back to nocpus /
254	int cpu;
255
256	BUG_ON(!cpumask);
257	if (!numa_node_to_cpus(node, cpumask)) {
258	for (cpu = `0`; cpu < (int)cpumask->size; cpu++) {
259	if (numa_bitmask_isbitset(cpumask, cpu)) {
260	ret = true;
261	break;
262	}
263	}
264	}
265	numa_free_cpumask(cpumask);
266
267	return ret;
268	}
269
270	static cpu_set_t bind_to_cpu(int* target_cpu)
271	{
272	int nrcpus = numa_num_possible_cpus();
273	cpu_set_t orig_mask, mask;
274	size_t size;
275
276	orig_mask = CPU_ALLOC(nrcpus);
277	BUG_ON(!orig_mask);
278	size = CPU_ALLOC_SIZE(nrcpus);
279	CPU_ZERO_S(size, orig_mask);
280
281	if (sched_getaffinity(pid: `0`, mask: size, orig_mask))
282	goto err_out;
283
284	mask = CPU_ALLOC(nrcpus);
285	if (!mask)
286	goto err_out;
287
288	CPU_ZERO_S(size, mask);
289
290	if (target_cpu == -`1`) {
291	int cpu;
292
293	for (cpu = `0`; cpu < g->p.nr_cpus; cpu++)
294	CPU_SET_S(cpu, size, mask);
295	} else {
296	if (target_cpu < `0` \|\| target_cpu >= g->p.nr_cpus)
297	goto err;
298
299	CPU_SET_S(target_cpu, size, mask);
300	}
301
302	if (sched_setaffinity(`0`, size, mask))
303	goto err;
304
305	return orig_mask;
306
307	err:
308	CPU_FREE(mask);
309	err_out:
310	CPU_FREE(orig_mask);
311
312	/ BUG_ON due to failure in allocation of orig_mask/mask /
313	BUG_ON(-`1`);
314	return NULL;
315	}
316
317	static cpu_set_t bind_to_node(int* target_node)
318	{
319	int nrcpus = numa_num_possible_cpus();
320	size_t size;
321	cpu_set_t orig_mask, mask;
322	int cpu;
323
324	orig_mask = CPU_ALLOC(nrcpus);
325	BUG_ON(!orig_mask);
326	size = CPU_ALLOC_SIZE(nrcpus);
327	CPU_ZERO_S(size, orig_mask);
328
329	if (sched_getaffinity(`0`, size, orig_mask))
330	goto err_out;
331
332	mask = CPU_ALLOC(nrcpus);
333	if (!mask)
334	goto err_out;
335
336	CPU_ZERO_S(size, mask);
337
338	if (target_node == NUMA_NO_NODE) {
339	for (cpu = `0`; cpu < g->p.nr_cpus; cpu++)
340	CPU_SET_S(cpu, size, mask);
341	} else {
342	struct bitmask *cpumask = numa_allocate_cpumask();
343
344	if (!cpumask)
345	goto err;
346
347	if (!numa_node_to_cpus(target_node, cpumask)) {
348	for (cpu = `0`; cpu < (int)cpumask->size; cpu++) {
349	if (numa_bitmask_isbitset(cpumask, cpu))
350	CPU_SET_S(cpu, size, mask);
351	}
352	}
353	numa_free_cpumask(cpumask);
354	}
355
356	if (sched_setaffinity(`0`, size, mask))
357	goto err;
358
359	return orig_mask;
360
361	err:
362	CPU_FREE(mask);
363	err_out:
364	CPU_FREE(orig_mask);
365
366	/ BUG_ON due to failure in allocation of orig_mask/mask /
367	BUG_ON(-`1`);
368	return NULL;
369	}
370
/*
 * Apply @mask as the CPU affinity of the calling thread. The mask is
 * expected to be sized for numa_num_possible_cpus() CPUs.
 *
 * On failure the mask is released and BUG_ON() fires; on success the
 * caller keeps ownership of @mask.
 */
static void bind_to_cpumask(cpu_set_t *mask)
{
	int ret;
	size_t size = CPU_ALLOC_SIZE(numa_num_possible_cpus());

	ret = sched_setaffinity(0, size, mask);
	if (ret) {
		CPU_FREE(mask);
		BUG_ON(ret);
	}
}
382
383	static void mempol_restore(void)
384	{
385	int ret;
386
387	ret = set_mempolicy(MPOL_DEFAULT, NULL, g->p.nr_nodes-`1`);
388
389	BUG_ON(ret);
390	}
391
392	static void bind_to_memnode(int node)
393	{
394	struct bitmask *node_mask;
395	int ret;
396
397	if (node == NUMA_NO_NODE)
398	return;
399
400	node_mask = numa_allocate_nodemask();
401	BUG_ON(!node_mask);
402
403	numa_bitmask_clearall(node_mask);
404	numa_bitmask_setbit(node_mask, node);
405
406	ret = set_mempolicy(MPOL_BIND, node_mask->maskp, node_mask->size + `1`);
407	dprintf("binding to node %d, mask: %016lx => %d\n", node, *node_mask->maskp, ret);
408
409	numa_bitmask_free(node_mask);
410	BUG_ON(ret);
411	}
412
/* Huge page size (2MB); alloc_data() pads by it and aligns buffers to it. */
#define HPSIZE (2*1024*1024)
414
/*
 * Set the name of the calling thread (PR_SET_NAME) from a printf-style
 * format. The name is formatted into a 20 byte buffer, so longer names
 * are truncated by snprintf().
 */
#define set_taskname(fmt...)				\
do {							\
	char name[20];					\
							\
	snprintf(name, sizeof(name), fmt);		\
	prctl(PR_SET_NAME, name);			\
} while (0)
422
423	static u8 alloc_data(ssize_t bytes0, int* map_flags,
424	int init_zero, int init_cpu0, int thp, int init_random)
425	{
426	cpu_set_t *orig_mask = NULL;
427	ssize_t bytes;
428	u8 *buf;
429	int ret;
430
431	if (!bytes0)
432	return NULL;
433
434	/ Allocate and initialize all memory on CPU#0: /
435	if (init_cpu0) {
436	int node = numa_node_of_cpu(`0`);
437
438	orig_mask = bind_to_node(node);
439	bind_to_memnode(node);
440	}
441
442	bytes = bytes0 + HPSIZE;
443
444	buf = (void *)mmap(`0`, bytes, PROT_READ\|PROT_WRITE, MAP_ANON\|map_flags, -`1`, `0`);
445	BUG_ON(buf == (void *)-`1`);
446
447	if (map_flags == MAP_PRIVATE) {
448	if (thp > `0`) {
449	ret = madvise(buf, bytes, MADV_HUGEPAGE);
450	if (ret && !g->print_once) {
451	g->print_once = `1`;
452	printf("WARNING: Could not enable THP - do: 'echo madvise > /sys/kernel/mm/transparent_hugepage/enabled'\n");
453	}
454	}
455	if (thp < `0`) {
456	ret = madvise(buf, bytes, MADV_NOHUGEPAGE);
457	if (ret && !g->print_once) {
458	g->print_once = `1`;
459	printf("WARNING: Could not disable THP: run a CONFIG_TRANSPARENT_HUGEPAGE kernel?\n");
460	}
461	}
462	}
463
464	if (init_zero) {
465	bzero(buf, bytes);
466	} else {
467	/ Initialize random contents, different in each word: /
468	if (init_random) {
469	u64 wbuf = (void* *)buf;
470	long off = rand();
471	long i;
472
473	for (i = `0`; i < bytes/`8`; i++)
474	wbuf[i] = i + off;
475	}
476	}
477
478	/ Align to 2MB boundary: /
479	buf = (void )(((unsigned* long)buf + HPSIZE-`1`) & ~(HPSIZE-`1`));
480
481	/ Restore affinity: /
482	if (init_cpu0) {
483	bind_to_cpumask(orig_mask);
484	CPU_FREE(orig_mask);
485	mempol_restore();
486	}
487
488	return buf;
489	}
490
/*
 * Unmap @bytes bytes starting at @data. A NULL @data is a no-op;
 * a failing munmap() ends in BUG_ON().
 *
 * NOTE(review): alloc_data() returns a pointer rounded up inside a
 * mapping that is HPSIZE bytes larger than requested, so unmapping from
 * that pointer presumably leaves the head/tail padding mapped - confirm
 * against the callers.
 */
static void free_data(void *data, ssize_t bytes)
{
	int ret;

	if (!data)
		return;

	ret = munmap(data, bytes);
	BUG_ON(ret);
}
501
502	/*
503	* Create a shared memory buffer that can be shared between processes, zeroed:
504	*/
505	static void * zalloc_shared_data(ssize_t bytes)
506	{
507	return alloc_data(bytes, MAP_SHARED, `1`, g->p.init_cpu0, g->p.thp, g->p.init_random);
508	}
509
510	/*
511	* Create a shared memory buffer that can be shared between processes:
512	*/
513	static void * setup_shared_data(ssize_t bytes)
514	{
515	return alloc_data(bytes, MAP_SHARED, `0`, g->p.init_cpu0, g->p.thp, g->p.init_random);
516	}
517
518	/*
519	* Allocate process-local memory - this will either be shared between
520	* threads of this process, or only be accessed by this thread:
521	*/
522	static void * setup_private_data(ssize_t bytes)
523	{
524	return alloc_data(bytes, MAP_PRIVATE, `0`, g->p.init_cpu0, g->p.thp, g->p.init_random);
525	}
526
527	static int parse_cpu_list(const char *arg)
528	{
529	p0.cpu_list_str = strdup(arg);
530
531	dprintf("got CPU list: {%s}\n", p0.cpu_list_str);
532
533	return `0`;
534	}
535
536	static int parse_setup_cpu_list(void)
537	{
538	struct thread_data *td;
539	char str0, str;
540	int t;
541
542	if (!g->p.cpu_list_str)
543	return `0`;
544
545	dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);
546
547	str0 = str = strdup(g->p.cpu_list_str);
548	t = `0`;
549
550	BUG_ON(!str);
551
552	tprintf("# binding tasks to CPUs:\n");
553	tprintf("# ");
554
555	while (true) {
556	int bind_cpu, bind_cpu_0, bind_cpu_1;
557	char tok, tok_end, tok_step, tok_len, *tok_mul;
558	int bind_len;
559	int step;
560	int mul;
561
562	tok = strsep(&str, ",");
563	if (!tok)
564	break;
565
566	tok_end = strstr(tok, "-");
567
568	dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
569	if (!tok_end) {
570	/ Single CPU specified: /
571	bind_cpu_0 = bind_cpu_1 = atol(tok);
572	} else {
573	/ CPU range specified (for example: "5-11"): /
574	bind_cpu_0 = atol(tok);
575	bind_cpu_1 = atol(tok_end + `1`);
576	}
577
578	step = `1`;
579	tok_step = strstr(tok, "#");
580	if (tok_step) {
581	step = atol(tok_step + `1`);
582	BUG_ON(step <= `0` \|\| step >= g->p.nr_cpus);
583	}
584
585	/*
586	* Mask length.
587	* Eg: "--cpus 8_4-16#4" means: '--cpus 8_4,12_4,16_4',
588	* where the _4 means the next 4 CPUs are allowed.
589	*/
590	bind_len = `1`;
591	tok_len = strstr(tok, "_");
592	if (tok_len) {
593	bind_len = atol(tok_len + `1`);
594	BUG_ON(bind_len <= `0` \|\| bind_len > g->p.nr_cpus);
595	}
596
597	/ Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" /
598	mul = `1`;
599	tok_mul = strstr(tok, "x");
600	if (tok_mul) {
601	mul = atol(tok_mul + `1`);
602	BUG_ON(mul <= `0`);
603	}
604
605	dprintf("CPUs: %d_%d-%d#%dx%d\n", bind_cpu_0, bind_len, bind_cpu_1, step, mul);
606
607	if (bind_cpu_0 >= g->p.nr_cpus \|\| bind_cpu_1 >= g->p.nr_cpus) {
608	printf("\nTest not applicable, system has only %d CPUs.\n", g->p.nr_cpus);
609	return -`1`;
610	}
611
612	if (is_cpu_online(cpu: bind_cpu_0) != `1` \|\| is_cpu_online(cpu: bind_cpu_1) != `1`) {
613	printf("\nTest not applicable, bind_cpu_0 or bind_cpu_1 is offline\n");
614	return -`1`;
615	}
616
617	BUG_ON(bind_cpu_0 < `0` \|\| bind_cpu_1 < `0`);
618	BUG_ON(bind_cpu_0 > bind_cpu_1);
619
620	for (bind_cpu = bind_cpu_0; bind_cpu <= bind_cpu_1; bind_cpu += step) {
621	size_t size = CPU_ALLOC_SIZE(g->p.nr_cpus);
622	int i;
623
624	for (i = `0`; i < mul; i++) {
625	int cpu;
626
627	if (t >= g->p.nr_tasks) {
628	printf("\n# NOTE: ignoring bind CPUs starting at CPU#%d\n #", bind_cpu);
629	goto out;
630	}
631	td = g->threads + t;
632
633	if (t)
634	tprintf(",");
635	if (bind_len > `1`) {
636	tprintf("%2d/%d", bind_cpu, bind_len);
637	} else {
638	tprintf("%2d", bind_cpu);
639	}
640
641	td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus);
642	BUG_ON(!td->bind_cpumask);
643	CPU_ZERO_S(size, td->bind_cpumask);
644	for (cpu = bind_cpu; cpu < bind_cpu+bind_len; cpu++) {
645	if (cpu < `0` \|\| cpu >= g->p.nr_cpus) {
646	CPU_FREE(td->bind_cpumask);
647	BUG_ON(-`1`);
648	}
649	CPU_SET_S(cpu, size, td->bind_cpumask);
650	}
651	t++;
652	}
653	}
654	}
655	out:
656
657	tprintf("\n");
658
659	if (t < g->p.nr_tasks)
660	printf("# NOTE: %d tasks bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
661
662	free(str0);
663	return `0`;
664	}
665
666	static int parse_cpus_opt(const struct option *opt __maybe_unused,
667	const char arg, int* unset __maybe_unused)
668	{
669	if (!arg)
670	return -`1`;
671
672	return parse_cpu_list(arg);
673	}
674
675	static int parse_node_list(const char *arg)
676	{
677	p0.node_list_str = strdup(arg);
678
679	dprintf("got NODE list: {%s}\n", p0.node_list_str);
680
681	return `0`;
682	}
683
684	static int parse_setup_node_list(void)
685	{
686	struct thread_data *td;
687	char str0, str;
688	int t;
689
690	if (!g->p.node_list_str)
691	return `0`;
692
693	dprintf("g->p.nr_tasks: %d\n", g->p.nr_tasks);
694
695	str0 = str = strdup(g->p.node_list_str);
696	t = `0`;
697
698	BUG_ON(!str);
699
700	tprintf("# binding tasks to NODEs:\n");
701	tprintf("# ");
702
703	while (true) {
704	int bind_node, bind_node_0, bind_node_1;
705	char tok, tok_end, tok_step, tok_mul;
706	int step;
707	int mul;
708
709	tok = strsep(&str, ",");
710	if (!tok)
711	break;
712
713	tok_end = strstr(tok, "-");
714
715	dprintf("\ntoken: {%s}, end: {%s}\n", tok, tok_end);
716	if (!tok_end) {
717	/ Single NODE specified: /
718	bind_node_0 = bind_node_1 = atol(tok);
719	} else {
720	/ NODE range specified (for example: "5-11"): /
721	bind_node_0 = atol(tok);
722	bind_node_1 = atol(tok_end + `1`);
723	}
724
725	step = `1`;
726	tok_step = strstr(tok, "#");
727	if (tok_step) {
728	step = atol(tok_step + `1`);
729	BUG_ON(step <= `0` \|\| step >= g->p.nr_nodes);
730	}
731
732	/ Multiplicator shortcut, "0x8" is a shortcut for: "0,0,0,0,0,0,0,0" /
733	mul = `1`;
734	tok_mul = strstr(tok, "x");
735	if (tok_mul) {
736	mul = atol(tok_mul + `1`);
737	BUG_ON(mul <= `0`);
738	}
739
740	dprintf("NODEs: %d-%d #%d\n", bind_node_0, bind_node_1, step);
741
742	if (bind_node_0 >= g->p.nr_nodes \|\| bind_node_1 >= g->p.nr_nodes) {
743	printf("\nTest not applicable, system has only %d nodes.\n", g->p.nr_nodes);
744	return -`1`;
745	}
746
747	BUG_ON(bind_node_0 < `0` \|\| bind_node_1 < `0`);
748	BUG_ON(bind_node_0 > bind_node_1);
749
750	for (bind_node = bind_node_0; bind_node <= bind_node_1; bind_node += step) {
751	int i;
752
753	for (i = `0`; i < mul; i++) {
754	if (t >= g->p.nr_tasks \|\| !node_has_cpus(node: bind_node)) {
755	printf("\n# NOTE: ignoring bind NODEs starting at NODE#%d\n", bind_node);
756	goto out;
757	}
758	td = g->threads + t;
759
760	if (!t)
761	tprintf(" %2d", bind_node);
762	else
763	tprintf(",%2d", bind_node);
764
765	td->bind_node = bind_node;
766	t++;
767	}
768	}
769	}
770	out:
771
772	tprintf("\n");
773
774	if (t < g->p.nr_tasks)
775	printf("# NOTE: %d tasks mem-bound, %d tasks unbound\n", t, g->p.nr_tasks - t);
776
777	free(str0);
778	return `0`;
779	}
780
781	static int parse_nodes_opt(const struct option *opt __maybe_unused,
782	const char arg, int* unset __maybe_unused)
783	{
784	if (!arg)
785	return -`1`;
786
787	return parse_node_list(arg);
788	}
789
/*
 * Advance a 32-bit linear feedback shift register by one step:
 * shift right by one and, if the bit shifted out was set, XOR in the
 * tap mask (bits 1, 5, 6 and 31).
 *
 * A state of 0 stays 0.
 */
static inline uint32_t lfsr_32(uint32_t lfsr)
{
	const uint32_t taps = ((uint32_t)1 << 1) | ((uint32_t)1 << 5) |
			      ((uint32_t)1 << 6) | ((uint32_t)1 << 31);

	/* 0u - (lfsr & 1) is all-ones when the low bit is set, else zero. */
	return (lfsr>>1) ^ ((0x0u - (lfsr & 0x1u)) & taps);
}
795
796	/*
797	* Make sure there's real data dependency to RAM (when read
798	* accesses are enabled), so the compiler, the CPU and the
799	* kernel (KSM, zero page, etc.) cannot optimize away RAM
800	* accesses:
801	*/
802	static inline u64 access_data(u64 *data, u64 val)
803	{
804	if (g->p.data_reads)
805	val += *data;
806	if (g->p.data_writes)
807	*data = val + `1`;
808	return val;
809	}
810
811	/*
812	* The worker process does two types of work, a forwards going
813	* loop and a backwards going loop.
814	*
815	* We do this so that on multiprocessor systems we do not create
816	* a 'train' of processing, with highly synchronized processes,
817	* skewing the whole benchmark.
818	*/
819	static u64 do_work(u8 __data, long* bytes, int nr, int nr_max, int loop, u64 val)
820	{
821	long words = bytes/sizeof(u64);
822	u64 data = (void* *)__data;
823	long chunk_0, chunk_1;
824	u64 d0, d, *d1;
825	long off;
826	long i;
827
828	BUG_ON(!data && words);
829	BUG_ON(data && !words);
830
831	if (!data)
832	return val;
833
834	/ Very simple memset() work variant: /
835	if (g->p.data_zero_memset && !g->p.data_rand_walk) {
836	bzero(data, bytes);
837	return val;
838	}
839
840	/ Spread out by PID/TID nr and by loop nr: /
841	chunk_0 = words/nr_max;
842	chunk_1 = words/g->p.nr_loops;
843	off = nrchunk_0 + loopchunk_1;
844
845	while (off >= words)
846	off -= words;
847
848	if (g->p.data_rand_walk) {
849	u32 lfsr = nr + loop + val;
850	long j;
851
852	for (i = `0`; i < words/`1024`; i++) {
853	long start, end;
854
855	lfsr = lfsr_32(lfsr);
856
857	start = lfsr % words;
858	end = min(start + `1024`, words-`1`);
859
860	if (g->p.data_zero_memset) {
861	bzero(data + start, (end-start) * sizeof(u64));
862	} else {
863	for (j = start; j < end; j++)
864	val = access_data(data: data + j, val);
865	}
866	}
867	} else if (!g->p.data_backwards \|\| (nr + loop) & `1`) {
868	/ Process data forwards: /
869
870	d0 = data + off;
871	d = data + off + `1`;
872	d1 = data + words;
873
874	for (;;) {
875	if (unlikely(d >= d1))
876	d = data;
877	if (unlikely(d == d0))
878	break;
879
880	val = access_data(data: d, val);
881
882	d++;
883	}
884	} else {
885	/ Process data backwards: /
886
887	d0 = data + off;
888	d = data + off - `1`;
889	d1 = data + words;
890
891	for (;;) {
892	if (unlikely(d < data))
893	d = data + words-`1`;
894	if (unlikely(d == d0))
895	break;
896
897	val = access_data(data: d, val);
898
899	d--;
900	}
901	}
902
903	return val;
904	}
905
906	static void update_curr_cpu(int task_nr, unsigned long bytes_worked)
907	{
908	unsigned int cpu;
909
910	cpu = sched_getcpu();
911
912	g->threads[task_nr].curr_cpu = cpu;
913	prctl(`0`, bytes_worked);
914	}
915
916	/*
917	* Count the number of nodes a process's threads
918	* are spread out on.
919	*
920	* A count of 1 means that the process is compressed
921	* to a single node. A count of g->p.nr_nodes means it's
922	* spread out on the whole system.
923	*/
924	static int count_process_nodes(int process_nr)
925	{
926	char *node_present;
927	int nodes;
928	int n, t;
929
930	node_present = (char )malloc(g->p.nr_nodes sizeof(char));
931	BUG_ON(!node_present);
932	for (nodes = `0`; nodes < g->p.nr_nodes; nodes++)
933	node_present[nodes] = `0`;
934
935	for (t = `0`; t < g->p.nr_threads; t++) {
936	struct thread_data *td;
937	int task_nr;
938	int node;
939
940	task_nr = process_nr*g->p.nr_threads + t;
941	td = g->threads + task_nr;
942
943	node = numa_node_of_cpu(td->curr_cpu);
944	if (node < `0`) / curr_cpu was likely still -1 / {
945	free(node_present);
946	return `0`;
947	}
948
949	node_present[node] = `1`;
950	}
951
952	nodes = `0`;
953
954	for (n = `0`; n < g->p.nr_nodes; n++)
955	nodes += node_present[n];
956
957	free(node_present);
958	return nodes;
959	}
960
961	/*
962	* Count the number of distinct process-threads a node contains.
963	*
964	* A count of 1 means that the node contains only a single
965	* process. If all nodes on the system contain at most one
966	* process then we are well-converged.
967	*/
968	static int count_node_processes(int node)
969	{
970	int processes = `0`;
971	int t, p;
972
973	for (p = `0`; p < g->p.nr_proc; p++) {
974	for (t = `0`; t < g->p.nr_threads; t++) {
975	struct thread_data *td;
976	int task_nr;
977	int n;
978
979	task_nr = p*g->p.nr_threads + t;
980	td = g->threads + task_nr;
981
982	n = numa_node_of_cpu(td->curr_cpu);
983	if (n == node) {
984	processes++;
985	break;
986	}
987	}
988	}
989
990	return processes;
991	}
992
993	static void calc_convergence_compression(int *strong)
994	{
995	unsigned int nodes_min, nodes_max;
996	int p;
997
998	nodes_min = -`1`;
999	nodes_max = `0`;
1000
1001	for (p = `0`; p < g->p.nr_proc; p++) {
1002	unsigned int nodes = count_process_nodes(process_nr: p);
1003
1004	if (!nodes) {
1005	*strong = `0`;
1006	return;
1007	}
1008
1009	nodes_min = min(nodes, nodes_min);
1010	nodes_max = max(nodes, nodes_max);
1011	}
1012
1013	/ Strong convergence: all threads compress on a single node: /
1014	if (nodes_min == `1` && nodes_max == `1`) {
1015	*strong = `1`;
1016	} else {
1017	*strong = `0`;
1018	tprintf(" {%d-%d}", nodes_min, nodes_max);
1019	}
1020	}
1021
1022	static void calc_convergence(double runtime_ns_max, double *convergence)
1023	{
1024	unsigned int loops_done_min, loops_done_max;
1025	int process_groups;
1026	int *nodes;
1027	int distance;
1028	int nr_min;
1029	int nr_max;
1030	int strong;
1031	int sum;
1032	int nr;
1033	int node;
1034	int cpu;
1035	int t;
1036
1037	if (!g->p.show_convergence && !g->p.measure_convergence)
1038	return;
1039
1040	nodes = (int )malloc(g->p.nr_nodes sizeof(int));
1041	BUG_ON(!nodes);
1042	for (node = `0`; node < g->p.nr_nodes; node++)
1043	nodes[node] = `0`;
1044
1045	loops_done_min = -`1`;
1046	loops_done_max = `0`;
1047
1048	for (t = `0`; t < g->p.nr_tasks; t++) {
1049	struct thread_data *td = g->threads + t;
1050	unsigned int loops_done;
1051
1052	cpu = td->curr_cpu;
1053
1054	/ Not all threads have written it yet: /
1055	if (cpu < `0`)
1056	continue;
1057
1058	node = numa_node_of_cpu(cpu);
1059
1060	nodes[node]++;
1061
1062	loops_done = td->loops_done;
1063	loops_done_min = min(loops_done, loops_done_min);
1064	loops_done_max = max(loops_done, loops_done_max);
1065	}
1066
1067	nr_max = `0`;
1068	nr_min = g->p.nr_tasks;
1069	sum = `0`;
1070
1071	for (node = `0`; node < g->p.nr_nodes; node++) {
1072	if (!is_node_present(node))
1073	continue;
1074	nr = nodes[node];
1075	nr_min = min(nr, nr_min);
1076	nr_max = max(nr, nr_max);
1077	sum += nr;
1078	}
1079	BUG_ON(nr_min > nr_max);
1080
1081	BUG_ON(sum > g->p.nr_tasks);
1082
1083	if (`0` && (sum < g->p.nr_tasks)) {
1084	free(nodes);
1085	return;
1086	}
1087
1088	/*
1089	* Count the number of distinct process groups present
1090	* on nodes - when we are converged this will decrease
1091	* to g->p.nr_proc:
1092	*/
1093	process_groups = `0`;
1094
1095	for (node = `0`; node < g->p.nr_nodes; node++) {
1096	int processes;
1097
1098	if (!is_node_present(node))
1099	continue;
1100	processes = count_node_processes(node);
1101	nr = nodes[node];
1102	tprintf(" %2d/%-2d", nr, processes);
1103
1104	process_groups += processes;
1105	}
1106
1107	distance = nr_max - nr_min;
1108
1109	tprintf(" [%2d/%-2d]", distance, process_groups);
1110
1111	tprintf(" l:%3d-%-3d (%3d)",
1112	loops_done_min, loops_done_max, loops_done_max-loops_done_min);
1113
1114	if (loops_done_min && loops_done_max) {
1115	double skew = `1.0` - (double)loops_done_min/loops_done_max;
1116
1117	tprintf(" [%4.1f%%]", skew * `100.0`);
1118	}
1119
1120	calc_convergence_compression(strong: &strong);
1121
1122	if (strong && process_groups == g->p.nr_proc) {
1123	if (!*convergence) {
1124	*convergence = runtime_ns_max;
1125	tprintf(" (%6.1fs converged)\n", *convergence / NSEC_PER_SEC);
1126	if (g->p.measure_convergence) {
1127	g->all_converged = true;
1128	g->stop_work = true;
1129	}
1130	}
1131	} else {
1132	if (*convergence) {
1133	tprintf(" (%6.1fs de-converged)", runtime_ns_max / NSEC_PER_SEC);
1134	*convergence = `0`;
1135	}
1136	tprintf("\n");
1137	}
1138
1139	free(nodes);
1140	}
1141
1142	static void show_summary(double runtime_ns_max, int l, double *convergence)
1143	{
1144	tprintf("\r # %5.1f%% [%.1f mins]",
1145	(double)(l+`1`)/g->p.nr_loops*`100.0`, runtime_ns_max / NSEC_PER_SEC / `60.0`);
1146
1147	calc_convergence(runtime_ns_max, convergence);
1148
1149	if (g->p.show_details >= `0`)
1150	fflush(stdout);
1151	}
1152
1153	static void worker_thread(void* *__tdata)
1154	{
1155	struct thread_data *td = __tdata;
1156	struct timeval start0, start, stop, diff;
1157	int process_nr = td->process_nr;
1158	int thread_nr = td->thread_nr;
1159	unsigned long last_perturbance;
1160	int task_nr = td->task_nr;
1161	int details = g->p.show_details;
1162	int first_task, last_task;
1163	double convergence = `0`;
1164	u64 val = td->val;
1165	double runtime_ns_max;
1166	u8 *global_data;
1167	u8 *process_data;
1168	u8 *thread_data;
1169	u64 bytes_done, secs;
1170	long work_done;
1171	u32 l;
1172	struct rusage rusage;
1173
1174	bind_to_cpumask(td->bind_cpumask);
1175	bind_to_memnode(node: td->bind_node);
1176
1177	set_taskname("thread %d/%d", process_nr, thread_nr);
1178
1179	global_data = g->data;
1180	process_data = td->process_data;
1181	thread_data = setup_private_data(g->p.bytes_thread);
1182
1183	bytes_done = `0`;
1184
1185	last_task = `0`;
1186	if (process_nr == g->p.nr_proc-`1` && thread_nr == g->p.nr_threads-`1`)
1187	last_task = `1`;
1188
1189	first_task = `0`;
1190	if (process_nr == `0` && thread_nr == `0`)
1191	first_task = `1`;
1192
1193	if (details >= `2`) {
1194	printf("# thread %2d / %2d global mem: %p, process mem: %p, thread mem: %p\n",
1195	process_nr, thread_nr, global_data, process_data, thread_data);
1196	}
1197
1198	if (g->p.serialize_startup) {
1199	mutex_lock(&g->startup_mutex);
1200	g->nr_tasks_started++;
1201	/ The last thread wakes the main process. /
1202	if (g->nr_tasks_started == g->p.nr_tasks)
1203	cond_signal(cnd: &g->startup_cond);
1204
1205	mutex_unlock(mtx: &g->startup_mutex);
1206
1207	/ Here we will wait for the main process to start us all at once: /
1208	mutex_lock(&g->start_work_mutex);
1209	g->start_work = false;
1210	g->nr_tasks_working++;
1211	while (!g->start_work)
1212	cond_wait(cnd: &g->start_work_cond, mtx: &g->start_work_mutex);
1213
1214	mutex_unlock(mtx: &g->start_work_mutex);
1215	}
1216
1217	gettimeofday(&start0, NULL);
1218
1219	start = stop = start0;
1220	last_perturbance = start.tv_sec;
1221
1222	for (l = `0`; l < g->p.nr_loops; l++) {
1223	start = stop;
1224
1225	if (g->stop_work)
1226	break;
1227
1228	val += do_work(data: global_data, bytes: g->p.bytes_global, nr: process_nr, nr_max: g->p.nr_proc, loop: l, val);
1229	val += do_work(data: process_data, bytes: g->p.bytes_process, nr: thread_nr, nr_max: g->p.nr_threads, loop: l, val);
1230	val += do_work(data: thread_data, bytes: g->p.bytes_thread, nr: `0`, nr_max: `1`, loop: l, val);
1231
1232	if (g->p.sleep_usecs) {
1233	mutex_lock(td->process_lock);
1234	usleep(g->p.sleep_usecs);
1235	mutex_unlock(mtx: td->process_lock);
1236	}
1237	/*
1238	* Amount of work to be done under a process-global lock:
1239	*/
1240	if (g->p.bytes_process_locked) {
1241	mutex_lock(td->process_lock);
1242	val += do_work(data: process_data, bytes: g->p.bytes_process_locked, nr: thread_nr, nr_max: g->p.nr_threads, loop: l, val);
1243	mutex_unlock(mtx: td->process_lock);
1244	}
1245
1246	work_done = g->p.bytes_global + g->p.bytes_process +
1247	g->p.bytes_process_locked + g->p.bytes_thread;
1248
1249	update_curr_cpu(task_nr, bytes_worked: work_done);
1250	bytes_done += work_done;
1251
1252	if (details < `0` && !g->p.perturb_secs && !g->p.measure_convergence && !g->p.nr_secs)
1253	continue;
1254
1255	td->loops_done = l;
1256
1257	gettimeofday(&stop, NULL);
1258
1259	/ Check whether our max runtime timed out: /
1260	if (g->p.nr_secs) {
1261	timersub(&stop, &start0, &diff);
1262	if ((u32)diff.tv_sec >= g->p.nr_secs) {
1263	g->stop_work = true;
1264	break;
1265	}
1266	}
1267
1268	/ Update the summary at most once per second: /
1269	if (start.tv_sec == stop.tv_sec)
1270	continue;
1271
1272	/*
1273	* Perturb the first task's equilibrium every g->p.perturb_secs seconds,
1274	* by migrating to CPU#0:
1275	*/
1276	if (first_task && g->p.perturb_secs && (int)(stop.tv_sec - last_perturbance) >= g->p.perturb_secs) {
1277	cpu_set_t *orig_mask;
1278	int target_cpu;
1279	int this_cpu;
1280
1281	last_perturbance = stop.tv_sec;
1282
1283	/*
1284	* Depending on where we are running, move into
1285	* the other half of the system, to create some
1286	* real disturbance:
1287	*/
1288	this_cpu = g->threads[task_nr].curr_cpu;
1289	if (this_cpu < g->p.nr_cpus/`2`)
1290	target_cpu = g->p.nr_cpus-`1`;
1291	else
1292	target_cpu = `0`;
1293
1294	orig_mask = bind_to_cpu(target_cpu);
1295
1296	/ Here we are running on the target CPU already /
1297	if (details >= `1`)
1298	printf(" (injecting perturbalance, moved to CPU#%d)\n", target_cpu);
1299
1300	bind_to_cpumask(orig_mask);
1301	CPU_FREE(orig_mask);
1302	}
1303
1304	if (details >= `3`) {
1305	timersub(&stop, &start, &diff);
1306	runtime_ns_max = diff.tv_sec * NSEC_PER_SEC;
1307	runtime_ns_max += diff.tv_usec * NSEC_PER_USEC;
1308
1309	if (details >= `0`) {
1310	printf(" #%2d / %2d: %14.2lf nsecs/op [val: %016"PRIx64"]\n",
1311	process_nr, thread_nr, runtime_ns_max / bytes_done, val);
1312	}
1313	fflush(stdout);
1314	}
1315	if (!last_task)
1316	continue;
1317
1318	timersub(&stop, &start0, &diff);
1319	runtime_ns_max = diff.tv_sec * NSEC_PER_SEC;
1320	runtime_ns_max += diff.tv_usec * NSEC_PER_USEC;
1321
1322	show_summary(runtime_ns_max, l, convergence: &convergence);
1323	}
1324
1325	gettimeofday(&stop, NULL);
1326	timersub(&stop, &start0, &diff);
1327	td->runtime_ns = diff.tv_sec * NSEC_PER_SEC;
1328	td->runtime_ns += diff.tv_usec * NSEC_PER_USEC;
1329	secs = td->runtime_ns / NSEC_PER_SEC;
1330	td->speed_gbs = secs ? bytes_done / secs / `1e9` : `0`;
1331
1332	getrusage(RUSAGE_THREAD, &rusage);
1333	td->system_time_ns = rusage.ru_stime.tv_sec * NSEC_PER_SEC;
1334	td->system_time_ns += rusage.ru_stime.tv_usec * NSEC_PER_USEC;
1335	td->user_time_ns = rusage.ru_utime.tv_sec * NSEC_PER_SEC;
1336	td->user_time_ns += rusage.ru_utime.tv_usec * NSEC_PER_USEC;
1337
1338	free_data(data: thread_data, bytes: g->p.bytes_thread);
1339
1340	mutex_lock(&g->stop_work_mutex);
1341	g->bytes_done += bytes_done;
1342	mutex_unlock(mtx: &g->stop_work_mutex);
1343
1344	return NULL;
1345	}
1346
1347	/*
1348	* A worker process starts a couple of threads:
1349	*/
1350	static void worker_process(int process_nr)
1351	{
1352	struct mutex process_lock;
1353	struct thread_data *td;
1354	pthread_t *pthreads;
1355	u8 *process_data;
1356	int task_nr;
1357	int ret;
1358	int t;
1359
1360	mutex_init(&process_lock);
1361	set_taskname("process %d", process_nr);
1362
1363	/*
1364	* Pick up the memory policy and the CPU binding of our first thread,
1365	* so that we initialize memory accordingly:
1366	*/
1367	task_nr = process_nr*g->p.nr_threads;
1368	td = g->threads + task_nr;
1369
1370	bind_to_memnode(node: td->bind_node);
1371	bind_to_cpumask(td->bind_cpumask);
1372
1373	pthreads = zalloc(g->p.nr_threads * sizeof(pthread_t));
1374	process_data = setup_private_data(g->p.bytes_process);
1375
1376	if (g->p.show_details >= `3`) {
1377	printf(" # process %2d global mem: %p, process mem: %p\n",
1378	process_nr, g->data, process_data);
1379	}
1380
1381	for (t = `0`; t < g->p.nr_threads; t++) {
1382	task_nr = process_nr*g->p.nr_threads + t;
1383	td = g->threads + task_nr;
1384
1385	td->process_data = process_data;
1386	td->process_nr = process_nr;
1387	td->thread_nr = t;
1388	td->task_nr = task_nr;
1389	td->val = rand();
1390	td->curr_cpu = -`1`;
1391	td->process_lock = &process_lock;
1392
1393	ret = pthread_create(pthreads + t, NULL, worker_thread, td);
1394	BUG_ON(ret);
1395	}
1396
1397	for (t = `0`; t < g->p.nr_threads; t++) {
1398	ret = pthread_join(pthreads[t], NULL);
1399	BUG_ON(ret);
1400	}
1401
1402	free_data(data: process_data, bytes: g->p.bytes_process);
1403	free(pthreads);
1404	}
1405
1406	static void print_summary(void)
1407	{
1408	if (g->p.show_details < `0`)
1409	return;
1410
1411	printf("\n ###\n");
1412	printf(" # %d %s will execute (on %d nodes, %d CPUs):\n",
1413	g->p.nr_tasks, g->p.nr_tasks == `1` ? "task" : "tasks", nr_numa_nodes(), g->p.nr_cpus);
1414	printf(" # %5dx %5ldMB global shared mem operations\n",
1415	g->p.nr_loops, g->p.bytes_global/`1024`/`1024`);
1416	printf(" # %5dx %5ldMB process shared mem operations\n",
1417	g->p.nr_loops, g->p.bytes_process/`1024`/`1024`);
1418	printf(" # %5dx %5ldMB thread local mem operations\n",
1419	g->p.nr_loops, g->p.bytes_thread/`1024`/`1024`);
1420
1421	printf(" ###\n");
1422
1423	printf("\n ###\n"); fflush(stdout);
1424	}
1425
1426	static void init_thread_data(void)
1427	{
1428	ssize_t size = sizeof(g->threads)g->p.nr_tasks;
1429	int t;
1430
1431	g->threads = zalloc_shared_data(bytes: size);
1432
1433	for (t = `0`; t < g->p.nr_tasks; t++) {
1434	struct thread_data *td = g->threads + t;
1435	size_t cpuset_size = CPU_ALLOC_SIZE(g->p.nr_cpus);
1436	int cpu;
1437
1438	/ Allow all nodes by default: /
1439	td->bind_node = NUMA_NO_NODE;
1440
1441	/ Allow all CPUs by default: /
1442	td->bind_cpumask = CPU_ALLOC(g->p.nr_cpus);
1443	BUG_ON(!td->bind_cpumask);
1444	CPU_ZERO_S(cpuset_size, td->bind_cpumask);
1445	for (cpu = `0`; cpu < g->p.nr_cpus; cpu++)
1446	CPU_SET_S(cpu, cpuset_size, td->bind_cpumask);
1447	}
1448	}
1449
1450	static void deinit_thread_data(void)
1451	{
1452	ssize_t size = sizeof(g->threads)g->p.nr_tasks;
1453	int t;
1454
1455	/ Free the bind_cpumask allocated for thread_data /
1456	for (t = `0`; t < g->p.nr_tasks; t++) {
1457	struct thread_data *td = g->threads + t;
1458	CPU_FREE(td->bind_cpumask);
1459	}
1460
1461	free_data(data: g->threads, bytes: size);
1462	}
1463
1464	static int init(void)
1465	{
1466	g = (void )alloc_data(sizeof(g), MAP_SHARED, `1`, `0`, `0` / THP /, `0`);
1467
1468	/ Copy over options: /
1469	g->p = p0;
1470
1471	g->p.nr_cpus = numa_num_configured_cpus();
1472
1473	g->p.nr_nodes = numa_max_node() + `1`;
1474
1475	/ char array in count_process_nodes(): /
1476	BUG_ON(g->p.nr_nodes < `0`);
1477
1478	if (quiet && !g->p.show_details)
1479	g->p.show_details = -`1`;
1480
1481	/ Some memory should be specified: /
1482	if (!g->p.mb_global_str && !g->p.mb_proc_str && !g->p.mb_thread_str)
1483	return -`1`;
1484
1485	if (g->p.mb_global_str) {
1486	g->p.mb_global = atof(g->p.mb_global_str);
1487	BUG_ON(g->p.mb_global < `0`);
1488	}
1489
1490	if (g->p.mb_proc_str) {
1491	g->p.mb_proc = atof(g->p.mb_proc_str);
1492	BUG_ON(g->p.mb_proc < `0`);
1493	}
1494
1495	if (g->p.mb_proc_locked_str) {
1496	g->p.mb_proc_locked = atof(g->p.mb_proc_locked_str);
1497	BUG_ON(g->p.mb_proc_locked < `0`);
1498	BUG_ON(g->p.mb_proc_locked > g->p.mb_proc);
1499	}
1500
1501	if (g->p.mb_thread_str) {
1502	g->p.mb_thread = atof(g->p.mb_thread_str);
1503	BUG_ON(g->p.mb_thread < `0`);
1504	}
1505
1506	BUG_ON(g->p.nr_threads <= `0`);
1507	BUG_ON(g->p.nr_proc <= `0`);
1508
1509	g->p.nr_tasks = g->p.nr_proc*g->p.nr_threads;
1510
1511	g->p.bytes_global = g->p.mb_global `1024L``1024L`;
1512	g->p.bytes_process = g->p.mb_proc `1024L``1024L`;
1513	g->p.bytes_process_locked = g->p.mb_proc_locked `1024L``1024L`;
1514	g->p.bytes_thread = g->p.mb_thread `1024L``1024L`;
1515
1516	g->data = setup_shared_data(g->p.bytes_global);
1517
1518	/ Startup serialization: /
1519	mutex_init_pshared(mtx: &g->start_work_mutex);
1520	cond_init_pshared(cnd: &g->start_work_cond);
1521	mutex_init_pshared(mtx: &g->startup_mutex);
1522	cond_init_pshared(cnd: &g->startup_cond);
1523	mutex_init_pshared(mtx: &g->stop_work_mutex);
1524
1525	init_thread_data();
1526
1527	tprintf("#\n");
1528	if (parse_setup_cpu_list() \|\| parse_setup_node_list())
1529	return -`1`;
1530	tprintf("#\n");
1531
1532	print_summary();
1533
1534	return `0`;
1535	}
1536
1537	static void deinit(void)
1538	{
1539	free_data(data: g->data, bytes: g->p.bytes_global);
1540	g->data = NULL;
1541
1542	deinit_thread_data();
1543
1544	free_data(data: g, bytes: sizeof(*g));
1545	g = NULL;
1546	}
1547
1548	/*
1549	* Print a short or long result, depending on the verbosity setting:
1550	*/
1551	static void print_res(const char name, double* val,
1552	const char txt_unit, const* char txt_short, const* char *txt_long)
1553	{
1554	if (!name)
1555	name = "main,";
1556
1557	if (!quiet)
1558	printf(" %-30s %15.3f, %-15s %s\n", name, val, txt_unit, txt_short);
1559	else
1560	printf(" %14.3f %s\n", val, txt_long);
1561	}
1562
1563	static int __bench_numa(const char *name)
1564	{
1565	struct timeval start, stop, diff;
1566	u64 runtime_ns_min, runtime_ns_sum;
1567	pid_t *pids, pid, wpid;
1568	double delta_runtime;
1569	double runtime_avg;
1570	double runtime_sec_max;
1571	double runtime_sec_min;
1572	int wait_stat;
1573	double bytes;
1574	int i, t, p;
1575
1576	if (init())
1577	return -`1`;
1578
1579	pids = zalloc(g->p.nr_proc * sizeof(*pids));
1580	pid = -`1`;
1581
1582	if (g->p.serialize_startup) {
1583	tprintf(" #\n");
1584	tprintf(" # Startup synchronization: ..."); fflush(stdout);
1585	}
1586
1587	gettimeofday(&start, NULL);
1588
1589	for (i = `0`; i < g->p.nr_proc; i++) {
1590	pid = fork();
1591	dprintf(" # process %2d: PID %d\n", i, pid);
1592
1593	BUG_ON(pid < `0`);
1594	if (!pid) {
1595	/ Child process: /
1596	worker_process(process_nr: i);
1597
1598	exit(`0`);
1599	}
1600	pids[i] = pid;
1601
1602	}
1603
1604	if (g->p.serialize_startup) {
1605	bool threads_ready = false;
1606	double startup_sec;
1607
1608	/*
1609	* Wait for all the threads to start up. The last thread will
1610	* signal this process.
1611	*/
1612	mutex_lock(&g->startup_mutex);
1613	while (g->nr_tasks_started != g->p.nr_tasks)
1614	cond_wait(cnd: &g->startup_cond, mtx: &g->startup_mutex);
1615
1616	mutex_unlock(mtx: &g->startup_mutex);
1617
1618	/ Wait for all threads to be at the start_work_cond. /
1619	while (!threads_ready) {
1620	mutex_lock(&g->start_work_mutex);
1621	threads_ready = (g->nr_tasks_working == g->p.nr_tasks);
1622	mutex_unlock(mtx: &g->start_work_mutex);
1623	if (!threads_ready)
1624	usleep(`1`);
1625	}
1626
1627	gettimeofday(&stop, NULL);
1628
1629	timersub(&stop, &start, &diff);
1630
1631	startup_sec = diff.tv_sec * NSEC_PER_SEC;
1632	startup_sec += diff.tv_usec * NSEC_PER_USEC;
1633	startup_sec /= NSEC_PER_SEC;
1634
1635	tprintf(" threads initialized in %.6f seconds.\n", startup_sec);
1636	tprintf(" #\n");
1637
1638	start = stop;
1639	/ Start all threads running. /
1640	mutex_lock(&g->start_work_mutex);
1641	g->start_work = true;
1642	mutex_unlock(mtx: &g->start_work_mutex);
1643	cond_broadcast(cnd: &g->start_work_cond);
1644	} else {
1645	gettimeofday(&start, NULL);
1646	}
1647
1648	/ Parent process: /
1649
1650
1651	for (i = `0`; i < g->p.nr_proc; i++) {
1652	wpid = waitpid(pids[i], &wait_stat, `0`);
1653	BUG_ON(wpid < `0`);
1654	BUG_ON(!WIFEXITED(wait_stat));
1655
1656	}
1657
1658	runtime_ns_sum = `0`;
1659	runtime_ns_min = -`1LL`;
1660
1661	for (t = `0`; t < g->p.nr_tasks; t++) {
1662	u64 thread_runtime_ns = g->threads[t].runtime_ns;
1663
1664	runtime_ns_sum += thread_runtime_ns;
1665	runtime_ns_min = min(thread_runtime_ns, runtime_ns_min);
1666	}
1667
1668	gettimeofday(&stop, NULL);
1669	timersub(&stop, &start, &diff);
1670
1671	BUG_ON(bench_format != BENCH_FORMAT_DEFAULT);
1672
1673	tprintf("\n ###\n");
1674	tprintf("\n");
1675
1676	runtime_sec_max = diff.tv_sec * NSEC_PER_SEC;
1677	runtime_sec_max += diff.tv_usec * NSEC_PER_USEC;
1678	runtime_sec_max /= NSEC_PER_SEC;
1679
1680	runtime_sec_min = runtime_ns_min / NSEC_PER_SEC;
1681
1682	bytes = g->bytes_done;
1683	runtime_avg = (double)runtime_ns_sum / g->p.nr_tasks / NSEC_PER_SEC;
1684
1685	if (g->p.measure_convergence) {
1686	print_res(name, val: runtime_sec_max,
1687	txt_unit: "secs,", txt_short: "NUMA-convergence-latency", txt_long: "secs latency to NUMA-converge");
1688	}
1689
1690	print_res(name, val: runtime_sec_max,
1691	txt_unit: "secs,", txt_short: "runtime-max/thread", txt_long: "secs slowest (max) thread-runtime");
1692
1693	print_res(name, val: runtime_sec_min,
1694	txt_unit: "secs,", txt_short: "runtime-min/thread", txt_long: "secs fastest (min) thread-runtime");
1695
1696	print_res(name, val: runtime_avg,
1697	txt_unit: "secs,", txt_short: "runtime-avg/thread", txt_long: "secs average thread-runtime");
1698
1699	delta_runtime = (runtime_sec_max - runtime_sec_min)/`2.0`;
1700	print_res(name, val: delta_runtime / runtime_sec_max * `100.0`,
1701	txt_unit: "%,", txt_short: "spread-runtime/thread", txt_long: "% difference between max/avg runtime");
1702
1703	print_res(name, val: bytes / g->p.nr_tasks / `1e9`,
1704	txt_unit: "GB,", txt_short: "data/thread", txt_long: "GB data processed, per thread");
1705
1706	print_res(name, val: bytes / `1e9`,
1707	txt_unit: "GB,", txt_short: "data-total", txt_long: "GB data processed, total");
1708
1709	print_res(name, val: runtime_sec_max * NSEC_PER_SEC / (bytes / g->p.nr_tasks),
1710	txt_unit: "nsecs,", txt_short: "runtime/byte/thread",txt_long: "nsecs/byte/thread runtime");
1711
1712	print_res(name, val: bytes / g->p.nr_tasks / `1e9` / runtime_sec_max,
1713	txt_unit: "GB/sec,", txt_short: "thread-speed", txt_long: "GB/sec/thread speed");
1714
1715	print_res(name, val: bytes / runtime_sec_max / `1e9`,
1716	txt_unit: "GB/sec,", txt_short: "total-speed", txt_long: "GB/sec total speed");
1717
1718	if (g->p.show_details >= `2`) {
1719	char tname[`14` + `2` * `11` + `1`];
1720	struct thread_data *td;
1721	for (p = `0`; p < g->p.nr_proc; p++) {
1722	for (t = `0`; t < g->p.nr_threads; t++) {
1723	memset(tname, `0`, sizeof(tname));
1724	td = g->threads + p*g->p.nr_threads + t;
1725	snprintf(buf: tname, size: sizeof(tname), fmt: "process%d:thread%d", p, t);
1726	print_res(name: tname, val: td->speed_gbs,
1727	txt_unit: "GB/sec", txt_short: "thread-speed", txt_long: "GB/sec/thread speed");
1728	print_res(name: tname, val: td->system_time_ns / NSEC_PER_SEC,
1729	txt_unit: "secs", txt_short: "thread-system-time", txt_long: "system CPU time/thread");
1730	print_res(name: tname, val: td->user_time_ns / NSEC_PER_SEC,
1731	txt_unit: "secs", txt_short: "thread-user-time", txt_long: "user CPU time/thread");
1732	}
1733	}
1734	}
1735
1736	free(pids);
1737
1738	deinit();
1739
1740	return `0`;
1741	}
1742
#define MAX_ARGS 50

/*
 * Count the entries of a NULL-terminated argument vector.
 *
 * The count must stay below MAX_ARGS, otherwise BUG_ON() fires.
 */
static int command_size(const char **argv)
{
	int size;

	for (size = 0; argv[size]; size++)
		;

	BUG_ON(size >= MAX_ARGS);

	return size;
}
1758
1759	static void init_params(struct params p, const* char name, int* argc, const char **argv)
1760	{
1761	int i;
1762
1763	printf("\n # Running %s \"perf bench numa", name);
1764
1765	for (i = `0`; i < argc; i++)
1766	printf(" %s", argv[i]);
1767
1768	printf("\"\n");
1769
1770	memset(p, `0`, sizeof(*p));
1771
1772	/ Initialize nonzero defaults: /
1773
1774	p->serialize_startup = `1`;
1775	p->data_reads = true;
1776	p->data_writes = true;
1777	p->data_backwards = true;
1778	p->data_rand_walk = true;
1779	p->nr_loops = -`1`;
1780	p->init_random = true;
1781	p->mb_global_str = "1";
1782	p->nr_proc = `1`;
1783	p->nr_threads = `1`;
1784	p->nr_secs = `5`;
1785	p->run_all = argc == `1`;
1786	}
1787
1788	static int run_bench_numa(const char name, const* char **argv)
1789	{
1790	int argc = command_size(argv);
1791
1792	init_params(p: &p0, name, argc, argv);
1793	argc = parse_options(argc, argv, options, bench_numa_usage, `0`);
1794	if (argc)
1795	goto err;
1796
1797	if (__bench_numa(name))
1798	goto err;
1799
1800	return `0`;
1801
1802	err:
1803	return -`1`;
1804	}
1805
/*
 * Option sets shared by the built-in tests below. Each *_NOTHP variant
 * appends "--thp", "-1" to its base set.
 */
#define OPT_BW_RAM		"-s", "20", "-zZq", "--thp", " 1", "--no-data_rand_walk"
#define OPT_BW_RAM_NOTHP	OPT_BW_RAM, "--thp", "-1"

#define OPT_CONV		"-s", "100", "-zZ0qcm", "--thp", " 1"
#define OPT_CONV_NOTHP		OPT_CONV, "--thp", "-1"

#define OPT_BW			"-s", "20", "-zZ0q", "--thp", " 1"
#define OPT_BW_NOTHP		OPT_BW, "--thp", "-1"
1814
1815	/*
1816	* The built-in test-suite executed by "perf bench numa -a".
1817	*
1818	* (A minimum of 4 nodes and 16 GB of RAM is recommended.)
1819	*/
1820	static const char *tests[][MAX_ARGS] = {
1821	/ Basic single-stream NUMA bandwidth measurements: /
1822	{ "RAM-bw-local,", "mem", "-p", "1", "-t", "1", "-P", "1024",
1823	"-C" , "0", "-M", "0", OPT_BW_RAM },
1824	{ "RAM-bw-local-NOTHP,",
1825	"mem", "-p", "1", "-t", "1", "-P", "1024",
1826	"-C" , "0", "-M", "0", OPT_BW_RAM_NOTHP },
1827	{ "RAM-bw-remote,", "mem", "-p", "1", "-t", "1", "-P", "1024",
1828	"-C" , "0", "-M", "1", OPT_BW_RAM },
1829
1830	/ 2-stream NUMA bandwidth measurements: /
1831	{ "RAM-bw-local-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024",
1832	"-C", "0,2", "-M", "0x2", OPT_BW_RAM },
1833	{ "RAM-bw-remote-2x,", "mem", "-p", "2", "-t", "1", "-P", "1024",
1834	"-C", "0,2", "-M", "1x2", OPT_BW_RAM },
1835
1836	/ Cross-stream NUMA bandwidth measurement: /
1837	{ "RAM-bw-cross,", "mem", "-p", "2", "-t", "1", "-P", "1024",
1838	"-C", "0,8", "-M", "1,0", OPT_BW_RAM },
1839
1840	/ Convergence latency measurements: /
1841	{ " 1x3-convergence,", "mem", "-p", "1", "-t", "3", "-P", "512", OPT_CONV },
1842	{ " 1x4-convergence,", "mem", "-p", "1", "-t", "4", "-P", "512", OPT_CONV },
1843	{ " 1x6-convergence,", "mem", "-p", "1", "-t", "6", "-P", "1020", OPT_CONV },
1844	{ " 2x3-convergence,", "mem", "-p", "2", "-t", "3", "-P", "1020", OPT_CONV },
1845	{ " 3x3-convergence,", "mem", "-p", "3", "-t", "3", "-P", "1020", OPT_CONV },
1846	{ " 4x4-convergence,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV },
1847	{ " 4x4-convergence-NOTHP,",
1848	"mem", "-p", "4", "-t", "4", "-P", "512", OPT_CONV_NOTHP },
1849	{ " 4x6-convergence,", "mem", "-p", "4", "-t", "6", "-P", "1020", OPT_CONV },
1850	{ " 4x8-convergence,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_CONV },
1851	{ " 8x4-convergence,", "mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV },
1852	{ " 8x4-convergence-NOTHP,",
1853	"mem", "-p", "8", "-t", "4", "-P", "512", OPT_CONV_NOTHP },
1854	{ " 3x1-convergence,", "mem", "-p", "3", "-t", "1", "-P", "512", OPT_CONV },
1855	{ " 4x1-convergence,", "mem", "-p", "4", "-t", "1", "-P", "512", OPT_CONV },
1856	{ " 8x1-convergence,", "mem", "-p", "8", "-t", "1", "-P", "512", OPT_CONV },
1857	{ "16x1-convergence,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_CONV },
1858	{ "32x1-convergence,", "mem", "-p", "32", "-t", "1", "-P", "128", OPT_CONV },
1859
1860	/ Various NUMA process/thread layout bandwidth measurements: /
1861	{ " 2x1-bw-process,", "mem", "-p", "2", "-t", "1", "-P", "1024", OPT_BW },
1862	{ " 3x1-bw-process,", "mem", "-p", "3", "-t", "1", "-P", "1024", OPT_BW },
1863	{ " 4x1-bw-process,", "mem", "-p", "4", "-t", "1", "-P", "1024", OPT_BW },
1864	{ " 8x1-bw-process,", "mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW },
1865	{ " 8x1-bw-process-NOTHP,",
1866	"mem", "-p", "8", "-t", "1", "-P", " 512", OPT_BW_NOTHP },
1867	{ "16x1-bw-process,", "mem", "-p", "16", "-t", "1", "-P", "256", OPT_BW },
1868
1869	{ " 1x4-bw-thread,", "mem", "-p", "1", "-t", "4", "-T", "256", OPT_BW },
1870	{ " 1x8-bw-thread,", "mem", "-p", "1", "-t", "8", "-T", "256", OPT_BW },
1871	{ "1x16-bw-thread,", "mem", "-p", "1", "-t", "16", "-T", "128", OPT_BW },
1872	{ "1x32-bw-thread,", "mem", "-p", "1", "-t", "32", "-T", "64", OPT_BW },
1873
1874	{ " 2x3-bw-process,", "mem", "-p", "2", "-t", "3", "-P", "512", OPT_BW },
1875	{ " 4x4-bw-process,", "mem", "-p", "4", "-t", "4", "-P", "512", OPT_BW },
1876	{ " 4x6-bw-process,", "mem", "-p", "4", "-t", "6", "-P", "512", OPT_BW },
1877	{ " 4x8-bw-process,", "mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW },
1878	{ " 4x8-bw-process-NOTHP,",
1879	"mem", "-p", "4", "-t", "8", "-P", "512", OPT_BW_NOTHP },
1880	{ " 3x3-bw-process,", "mem", "-p", "3", "-t", "3", "-P", "512", OPT_BW },
1881	{ " 5x5-bw-process,", "mem", "-p", "5", "-t", "5", "-P", "512", OPT_BW },
1882
1883	{ "2x16-bw-process,", "mem", "-p", "2", "-t", "16", "-P", "512", OPT_BW },
1884	{ "1x32-bw-process,", "mem", "-p", "1", "-t", "32", "-P", "2048", OPT_BW },
1885
1886	{ "numa02-bw,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW },
1887	{ "numa02-bw-NOTHP,", "mem", "-p", "1", "-t", "32", "-T", "32", OPT_BW_NOTHP },
1888	{ "numa01-bw-thread,", "mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW },
1889	{ "numa01-bw-thread-NOTHP,",
1890	"mem", "-p", "2", "-t", "16", "-T", "192", OPT_BW_NOTHP },
1891	};
1892
1893	static int bench_all(void)
1894	{
1895	int nr = ARRAY_SIZE(tests);
1896	int ret;
1897	int i;
1898
1899	ret = system("echo ' #'; echo ' # Running test on: '$(uname -a); echo ' #'");
1900	BUG_ON(ret < `0`);
1901
1902	for (i = `0`; i < nr; i++) {
1903	run_bench_numa(name: tests[i][`0`], argv: tests[i] + `1`);
1904	}
1905
1906	printf("\n");
1907
1908	return `0`;
1909	}
1910
1911	int bench_numa(int argc, const char **argv)
1912	{
1913	init_params(p: &p0, name: "main,", argc, argv);
1914	argc = parse_options(argc, argv, options, bench_numa_usage, `0`);
1915	if (argc)
1916	goto err;
1917
1918	if (p0.run_all)
1919	return bench_all();
1920
1921	if (__bench_numa(NULL))
1922	goto err;
1923
1924	return `0`;
1925
1926	err:
1927	usage_with_options(numa_usage, options);
1928	return -`1`;
1929	}
1930

source code of linux/tools/perf/bench/numa.c