builtin-sched.c source code [linux/tools/perf/builtin-sched.c]

1	// SPDX-License-Identifier: GPL-2.0
2	#include "builtin.h"
3	#include "perf-sys.h"
4
5	#include "util/cpumap.h"
6	#include "util/evlist.h"
7	#include "util/evsel.h"
8	#include "util/evsel_fprintf.h"
9	#include "util/mutex.h"
10	#include "util/symbol.h"
11	#include "util/thread.h"
12	#include "util/header.h"
13	#include "util/session.h"
14	#include "util/tool.h"
15	#include "util/cloexec.h"
16	#include "util/thread_map.h"
17	#include "util/color.h"
18	#include "util/stat.h"
19	#include "util/string2.h"
20	#include "util/callchain.h"
21	#include "util/time-utils.h"
22
23	#include <subcmd/pager.h>
24	#include <subcmd/parse-options.h>
25	#include "util/trace-event.h"
26
27	#include "util/debug.h"
28	#include "util/event.h"
29	#include "util/util.h"
30
31	#include <linux/kernel.h>
32	#include <linux/log2.h>
33	#include <linux/zalloc.h>
34	#include <sys/prctl.h>
35	#include <sys/resource.h>
36	#include <inttypes.h>
37
38	#include <errno.h>
39	#include <semaphore.h>
40	#include <pthread.h>
41	#include <math.h>
42	#include <api/fs/fs.h>
43	#include <perf/cpumap.h>
44	#include <linux/time64.h>
45	#include <linux/err.h>
46
47	#include <linux/ctype.h>
48
#define PR_SET_NAME	15	/* Set process name */
#define MAX_CPUS	4096	/* upper bound for per-cpu indices and bitmaps in this file */
#define COMM_LEN	20	/* size of task_desc.comm, including the NUL */
#define SYM_LEN		129
#define MAX_PID		1024000	/* fallback when kernel/pid_max cannot be read */

/* CPU list string and the bitmap associated with it. */
static const char *cpu_list;
static DECLARE_BITMAP(cpu_bitmap, MAX_NR_CPUS);

struct sched_atom;
59
60	struct task_desc {
61	unsigned long nr;
62	unsigned long pid;
63	char comm[COMM_LEN];
64
65	unsigned long nr_events;
66	unsigned long curr_event;
67	struct sched_atom **atoms;
68
69	pthread_t thread;
70	sem_t sleep_sem;
71
72	sem_t ready_for_work;
73	sem_t work_done_sem;
74
75	u64 cpu_usage;
76	};
77
/* Kind of action a sched_atom asks the replay worker to perform. */
enum sched_event_type {
	SCHED_EVENT_RUN,	/* burn CPU for atom->duration */
	SCHED_EVENT_SLEEP,	/* wait on atom->wait_sem, if any */
	SCHED_EVENT_WAKEUP,	/* post atom->wait_sem, if any */
	SCHED_EVENT_MIGRATION,	/* no action during replay */
};
84
85	struct sched_atom {
86	enum sched_event_type type;
87	int specific_wait;
88	u64 timestamp;
89	u64 duration;
90	unsigned long nr;
91	sem_t *wait_sem;
92	struct task_desc *wakee;
93	};
94
#define TASK_STATE_TO_CHAR_STR	"RSDTtZXxKWP"

/* task state bitmask, copied from include/linux/sched.h */
#define TASK_RUNNING		0
#define TASK_INTERRUPTIBLE	1
#define TASK_UNINTERRUPTIBLE	2
#define __TASK_STOPPED		4
#define __TASK_TRACED		8
/* in tsk->exit_state */
#define EXIT_DEAD		16
#define EXIT_ZOMBIE		32
#define EXIT_TRACE		(EXIT_ZOMBIE | EXIT_DEAD)
/* in tsk->state again */
#define TASK_DEAD		64
#define TASK_WAKEKILL		128
#define TASK_WAKING		256
#define TASK_PARKED		512
112
/* Life-cycle state of a work_atom used by the latency analysis. */
enum thread_state {
	THREAD_SLEEPING = 0,	/* default state of a zero-allocated atom */
	THREAD_WAIT_CPU,	/* runnable, waiting to be scheduled in */
	THREAD_SCHED_IN,	/* scheduled in, latency accounted */
	THREAD_IGNORE		/* inconsistent timestamps, atom is skipped */
};
119
120	struct work_atom {
121	struct list_head list;
122	enum thread_state state;
123	u64 sched_out_time;
124	u64 wake_up_time;
125	u64 sched_in_time;
126	u64 runtime;
127	};
128
129	struct work_atoms {
130	struct list_head work_list;
131	struct thread *thread;
132	struct rb_node node;
133	u64 max_lat;
134	u64 max_lat_start;
135	u64 max_lat_end;
136	u64 total_lat;
137	u64 nb_atoms;
138	u64 total_runtime;
139	int num_merged;
140	};
141
/* Comparison callback used to order two work_atoms entries. */
typedef int (*sort_fn_t)(struct work_atoms *, struct work_atoms *);

struct perf_sched;
145
/*
 * Set of callbacks a perf sched sub-command installs to consume
 * scheduler events; each returns an int status.
 */
struct trace_sched_handler {
	int (*switch_event)(struct perf_sched *sched, struct evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	int (*runtime_event)(struct perf_sched *sched, struct evsel *evsel,
			     struct perf_sample *sample, struct machine *machine);

	int (*wakeup_event)(struct perf_sched *sched, struct evsel *evsel,
			    struct perf_sample *sample, struct machine *machine);

	/* PERF_RECORD_FORK event, not sched_process_fork tracepoint */
	int (*fork_event)(struct perf_sched *sched, union perf_event *event,
			  struct machine *machine);

	int (*migrate_task_event)(struct perf_sched *sched,
				  struct evsel *evsel,
				  struct perf_sample *sample,
				  struct machine *machine);
};
165
166	#define COLOR_PIDS PERF_COLOR_BLUE
167	#define COLOR_CPUS PERF_COLOR_BG_RED
168
169	struct perf_sched_map {
170	DECLARE_BITMAP(comp_cpus_mask, MAX_CPUS);
171	struct perf_cpu *comp_cpus;
172	bool comp;
173	struct perf_thread_map *color_pids;
174	const char *color_pids_str;
175	struct perf_cpu_map *color_cpus;
176	const char *color_cpus_str;
177	struct perf_cpu_map *cpus;
178	const char *cpus_str;
179	};
180
181	struct perf_sched {
182	struct perf_tool tool;
183	const char *sort_order;
184	unsigned long nr_tasks;
185	struct task_desc **pid_to_task;
186	struct task_desc **tasks;
187	const struct trace_sched_handler *tp_handler;
188	struct mutex start_work_mutex;
189	struct mutex work_done_wait_mutex;
190	int profile_cpu;
191	/*
192	* Track the current task - that way we can know whether there's any
193	* weird events, such as a task being switched away that is not current.
194	*/
195	struct perf_cpu max_cpu;
196	u32 *curr_pid;
197	struct thread **curr_thread;
198	char next_shortname1;
199	char next_shortname2;
200	unsigned int replay_repeat;
201	unsigned long nr_run_events;
202	unsigned long nr_sleep_events;
203	unsigned long nr_wakeup_events;
204	unsigned long nr_sleep_corrections;
205	unsigned long nr_run_events_optimized;
206	unsigned long targetless_wakeups;
207	unsigned long multitarget_wakeups;
208	unsigned long nr_runs;
209	unsigned long nr_timestamps;
210	unsigned long nr_unordered_timestamps;
211	unsigned long nr_context_switch_bugs;
212	unsigned long nr_events;
213	unsigned long nr_lost_chunks;
214	unsigned long nr_lost_events;
215	u64 run_measurement_overhead;
216	u64 sleep_measurement_overhead;
217	u64 start_time;
218	u64 cpu_usage;
219	u64 runavg_cpu_usage;
220	u64 parent_cpu_usage;
221	u64 runavg_parent_cpu_usage;
222	u64 sum_runtime;
223	u64 sum_fluct;
224	u64 run_avg;
225	u64 all_runtime;
226	u64 all_count;
227	u64 *cpu_last_switched;
228	struct rb_root_cached atom_root, sorted_atom_root, merged_atom_root;
229	struct list_head sort_list, cmp_pid;
230	bool force;
231	bool skip_merge;
232	struct perf_sched_map map;
233
234	/ options for timehist command /
235	bool summary;
236	bool summary_only;
237	bool idle_hist;
238	bool show_callchain;
239	unsigned int max_stack;
240	bool show_cpu_visual;
241	bool show_wakeups;
242	bool show_next;
243	bool show_migrations;
244	bool show_state;
245	u64 skipped_samples;
246	const char *time_str;
247	struct perf_time_interval ptime;
248	struct perf_time_interval hist_time;
249	volatile bool thread_funcs_exit;
250	};
251
252	/ per thread run time data /
253	struct thread_runtime {
254	u64 last_time; / time of previous sched in/out event /
255	u64 dt_run; / run time /
256	u64 dt_sleep; / time between CPU access by sleep (off cpu) /
257	u64 dt_iowait; / time between CPU access by iowait (off cpu) /
258	u64 dt_preempt; / time between CPU access by preempt (off cpu) /
259	u64 dt_delay; / time between wakeup and sched-in /
260	u64 ready_to_run; / time of wakeup /
261
262	struct stats run_stats;
263	u64 total_run_time;
264	u64 total_sleep_time;
265	u64 total_iowait_time;
266	u64 total_preempt_time;
267	u64 total_delay_time;
268
269	int last_state;
270
271	char shortname[`3`];
272	bool comm_changed;
273
274	u64 migrations;
275	};
276
277	/ per event run time data /
278	struct evsel_runtime {
279	u64 last_time; /* time this event was last seen per cpu /
280	u32 ncpu; / highest cpu slot allocated /
281	};
282
283	/ per cpu idle time data /
284	struct idle_thread_runtime {
285	struct thread_runtime tr;
286	struct thread *last_thread;
287	struct rb_root_cached sorted_root;
288	struct callchain_root callchain;
289	struct callchain_cursor cursor;
290	};
291
/* track idle times per cpu */
static struct thread	**idle_threads;
static int		idle_max_cpu;
static char		idle_comm[] = "<idle>";
296
297	static u64 get_nsecs(void)
298	{
299	struct timespec ts;
300
301	clock_gettime(CLOCK_MONOTONIC, &ts);
302
303	return ts.tv_sec * NSEC_PER_SEC + ts.tv_nsec;
304	}
305
306	static void burn_nsecs(struct perf_sched *sched, u64 nsecs)
307	{
308	u64 T0 = get_nsecs(), T1;
309
310	do {
311	T1 = get_nsecs();
312	} while (T1 + sched->run_measurement_overhead < T0 + nsecs);
313	}
314
315	static void sleep_nsecs(u64 nsecs)
316	{
317	struct timespec ts;
318
319	ts.tv_nsec = nsecs % `999999999`;
320	ts.tv_sec = nsecs / `999999999`;
321
322	nanosleep(&ts, NULL);
323	}
324
325	static void calibrate_run_measurement_overhead(struct perf_sched *sched)
326	{
327	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
328	int i;
329
330	for (i = `0`; i < `10`; i++) {
331	T0 = get_nsecs();
332	burn_nsecs(sched, nsecs: `0`);
333	T1 = get_nsecs();
334	delta = T1-T0;
335	min_delta = min(min_delta, delta);
336	}
337	sched->run_measurement_overhead = min_delta;
338
339	printf("run measurement overhead: %" PRIu64 " nsecs\n", min_delta);
340	}
341
342	static void calibrate_sleep_measurement_overhead(struct perf_sched *sched)
343	{
344	u64 T0, T1, delta, min_delta = NSEC_PER_SEC;
345	int i;
346
347	for (i = `0`; i < `10`; i++) {
348	T0 = get_nsecs();
349	sleep_nsecs(nsecs: `10000`);
350	T1 = get_nsecs();
351	delta = T1-T0;
352	min_delta = min(min_delta, delta);
353	}
354	min_delta -= `10000`;
355	sched->sleep_measurement_overhead = min_delta;
356
357	printf("sleep measurement overhead: %" PRIu64 " nsecs\n", min_delta);
358	}
359
360	static struct sched_atom *
361	get_new_event(struct task_desc *task, u64 timestamp)
362	{
363	struct sched_atom event = zalloc(sizeof(event));
364	unsigned long idx = task->nr_events;
365	size_t size;
366
367	event->timestamp = timestamp;
368	event->nr = idx;
369
370	task->nr_events++;
371	size = sizeof(struct sched_atom ) task->nr_events;
372	task->atoms = realloc(task->atoms, size);
373	BUG_ON(!task->atoms);
374
375	task->atoms[idx] = event;
376
377	return event;
378	}
379
380	static struct sched_atom last_event(struct* task_desc *task)
381	{
382	if (!task->nr_events)
383	return NULL;
384
385	return task->atoms[task->nr_events - `1`];
386	}
387
388	static void add_sched_event_run(struct perf_sched sched, struct* task_desc *task,
389	u64 timestamp, u64 duration)
390	{
391	struct sched_atom event, curr_event = last_event(task);
392
393	/*
394	* optimize an existing RUN event by merging this one
395	* to it:
396	*/
397	if (curr_event && curr_event->type == SCHED_EVENT_RUN) {
398	sched->nr_run_events_optimized++;
399	curr_event->duration += duration;
400	return;
401	}
402
403	event = get_new_event(task, timestamp);
404
405	event->type = SCHED_EVENT_RUN;
406	event->duration = duration;
407
408	sched->nr_run_events++;
409	}
410
411	static void add_sched_event_wakeup(struct perf_sched sched, struct* task_desc *task,
412	u64 timestamp, struct task_desc *wakee)
413	{
414	struct sched_atom event, wakee_event;
415
416	event = get_new_event(task, timestamp);
417	event->type = SCHED_EVENT_WAKEUP;
418	event->wakee = wakee;
419
420	wakee_event = last_event(task: wakee);
421	if (!wakee_event \|\| wakee_event->type != SCHED_EVENT_SLEEP) {
422	sched->targetless_wakeups++;
423	return;
424	}
425	if (wakee_event->wait_sem) {
426	sched->multitarget_wakeups++;
427	return;
428	}
429
430	wakee_event->wait_sem = zalloc(sizeof(*wakee_event->wait_sem));
431	sem_init(wakee_event->wait_sem, `0`, `0`);
432	wakee_event->specific_wait = `1`;
433	event->wait_sem = wakee_event->wait_sem;
434
435	sched->nr_wakeup_events++;
436	}
437
438	static void add_sched_event_sleep(struct perf_sched sched, struct* task_desc *task,
439	u64 timestamp, u64 task_state __maybe_unused)
440	{
441	struct sched_atom *event = get_new_event(task, timestamp);
442
443	event->type = SCHED_EVENT_SLEEP;
444
445	sched->nr_sleep_events++;
446	}
447
448	static struct task_desc register_pid(struct* perf_sched *sched,
449	unsigned long pid, const char *comm)
450	{
451	struct task_desc *task;
452	static int pid_max;
453
454	if (sched->pid_to_task == NULL) {
455	if (sysctl__read_int("kernel/pid_max", &pid_max) < `0`)
456	pid_max = MAX_PID;
457	BUG_ON((sched->pid_to_task = calloc(pid_max, sizeof(struct task_desc *))) == NULL);
458	}
459	if (pid >= (unsigned long)pid_max) {
460	BUG_ON((sched->pid_to_task = realloc(sched->pid_to_task, (pid + `1`) *
461	sizeof(struct task_desc *))) == NULL);
462	while (pid >= (unsigned long)pid_max)
463	sched->pid_to_task[pid_max++] = NULL;
464	}
465
466	task = sched->pid_to_task[pid];
467
468	if (task)
469	return task;
470
471	task = zalloc(sizeof(*task));
472	task->pid = pid;
473	task->nr = sched->nr_tasks;
474	strcpy(p: task->comm, q: comm);
475	/*
476	* every task starts in sleeping state - this gets ignored
477	* if there's no wakeup pointing to this sleep state:
478	*/
479	add_sched_event_sleep(sched, task, timestamp: `0`, task_state: `0`);
480
481	sched->pid_to_task[pid] = task;
482	sched->nr_tasks++;
483	sched->tasks = realloc(sched->tasks, sched->nr_tasks * sizeof(struct task_desc *));
484	BUG_ON(!sched->tasks);
485	sched->tasks[task->nr] = task;
486
487	if (verbose > `0`)
488	printf("registered task #%ld, PID %ld (%s)\n", sched->nr_tasks, pid, comm);
489
490	return task;
491	}
492
493
494	static void print_task_traces(struct perf_sched *sched)
495	{
496	struct task_desc *task;
497	unsigned long i;
498
499	for (i = `0`; i < sched->nr_tasks; i++) {
500	task = sched->tasks[i];
501	printf("task %6ld (%20s:%10ld), nr_events: %ld\n",
502	task->nr, task->comm, task->pid, task->nr_events);
503	}
504	}
505
506	static void add_cross_task_wakeups(struct perf_sched *sched)
507	{
508	struct task_desc task1, task2;
509	unsigned long i, j;
510
511	for (i = `0`; i < sched->nr_tasks; i++) {
512	task1 = sched->tasks[i];
513	j = i + `1`;
514	if (j == sched->nr_tasks)
515	j = `0`;
516	task2 = sched->tasks[j];
517	add_sched_event_wakeup(sched, task: task1, timestamp: `0`, wakee: task2);
518	}
519	}
520
521	static void perf_sched__process_event(struct perf_sched *sched,
522	struct sched_atom *atom)
523	{
524	int ret = `0`;
525
526	switch (atom->type) {
527	case SCHED_EVENT_RUN:
528	burn_nsecs(sched, nsecs: atom->duration);
529	break;
530	case SCHED_EVENT_SLEEP:
531	if (atom->wait_sem)
532	ret = sem_wait(atom->wait_sem);
533	BUG_ON(ret);
534	break;
535	case SCHED_EVENT_WAKEUP:
536	if (atom->wait_sem)
537	ret = sem_post(atom->wait_sem);
538	BUG_ON(ret);
539	break;
540	case SCHED_EVENT_MIGRATION:
541	break;
542	default:
543	BUG_ON(`1`);
544	}
545	}
546
547	static u64 get_cpu_usage_nsec_parent(void)
548	{
549	struct rusage ru;
550	u64 sum;
551	int err;
552
553	err = getrusage(RUSAGE_SELF, &ru);
554	BUG_ON(err);
555
556	sum = ru.ru_utime.tv_sec * NSEC_PER_SEC + ru.ru_utime.tv_usec * NSEC_PER_USEC;
557	sum += ru.ru_stime.tv_sec * NSEC_PER_SEC + ru.ru_stime.tv_usec * NSEC_PER_USEC;
558
559	return sum;
560	}
561
562	static int self_open_counters(struct perf_sched sched, unsigned* long cur_task)
563	{
564	struct perf_event_attr attr;
565	char sbuf[STRERR_BUFSIZE], info[STRERR_BUFSIZE];
566	int fd;
567	struct rlimit limit;
568	bool need_privilege = false;
569
570	memset(&attr, `0`, sizeof(attr));
571
572	attr.type = PERF_TYPE_SOFTWARE;
573	attr.config = PERF_COUNT_SW_TASK_CLOCK;
574
575	force_again:
576	fd = sys_perf_event_open(attr: &attr, pid: `0`, cpu: -`1`, group_fd: -`1`,
577	flags: perf_event_open_cloexec_flag());
578
579	if (fd < `0`) {
580	if (errno == EMFILE) {
581	if (sched->force) {
582	BUG_ON(getrlimit(RLIMIT_NOFILE, &limit) == -`1`);
583	limit.rlim_cur += sched->nr_tasks - cur_task;
584	if (limit.rlim_cur > limit.rlim_max) {
585	limit.rlim_max = limit.rlim_cur;
586	need_privilege = true;
587	}
588	if (setrlimit(RLIMIT_NOFILE, &limit) == -`1`) {
589	if (need_privilege && errno == EPERM)
590	strcpy(p: info, q: "Need privilege\n");
591	} else
592	goto force_again;
593	} else
594	strcpy(p: info, q: "Have a try with -f option\n");
595	}
596	pr_err("Error: sys_perf_event_open() syscall returned "
597	"with %d (%s)\n%s", fd,
598	str_error_r(errno, sbuf, sizeof(sbuf)), info);
599	exit(EXIT_FAILURE);
600	}
601	return fd;
602	}
603
604	static u64 get_cpu_usage_nsec_self(int fd)
605	{
606	u64 runtime;
607	int ret;
608
609	ret = read(fd, &runtime, sizeof(runtime));
610	BUG_ON(ret != sizeof(runtime));
611
612	return runtime;
613	}
614
/* Heap-allocated argument bundle handed to thread_func(), which frees it. */
struct sched_thread_parms {
	struct task_desc	*task;	/* task this worker replays */
	struct perf_sched	*sched;
	int			fd;	/* counter fd from self_open_counters() */
};
620
621	static void thread_func(void* *ctx)
622	{
623	struct sched_thread_parms *parms = ctx;
624	struct task_desc *this_task = parms->task;
625	struct perf_sched *sched = parms->sched;
626	u64 cpu_usage_0, cpu_usage_1;
627	unsigned long i, ret;
628	char comm2[`22`];
629	int fd = parms->fd;
630
631	zfree(&parms);
632
633	sprintf(buf: comm2, fmt: ":%s", this_task->comm);
634	prctl(PR_SET_NAME, comm2);
635	if (fd < `0`)
636	return NULL;
637
638	while (!sched->thread_funcs_exit) {
639	ret = sem_post(&this_task->ready_for_work);
640	BUG_ON(ret);
641	mutex_lock(&sched->start_work_mutex);
642	mutex_unlock(mtx: &sched->start_work_mutex);
643
644	cpu_usage_0 = get_cpu_usage_nsec_self(fd);
645
646	for (i = `0`; i < this_task->nr_events; i++) {
647	this_task->curr_event = i;
648	perf_sched__process_event(sched, atom: this_task->atoms[i]);
649	}
650
651	cpu_usage_1 = get_cpu_usage_nsec_self(fd);
652	this_task->cpu_usage = cpu_usage_1 - cpu_usage_0;
653	ret = sem_post(&this_task->work_done_sem);
654	BUG_ON(ret);
655
656	mutex_lock(&sched->work_done_wait_mutex);
657	mutex_unlock(mtx: &sched->work_done_wait_mutex);
658	}
659	return NULL;
660	}
661
662	static void create_tasks(struct perf_sched *sched)
663	EXCLUSIVE_LOCK_FUNCTION(sched->start_work_mutex)
664	EXCLUSIVE_LOCK_FUNCTION(sched->work_done_wait_mutex)
665	{
666	struct task_desc *task;
667	pthread_attr_t attr;
668	unsigned long i;
669	int err;
670
671	err = pthread_attr_init(&attr);
672	BUG_ON(err);
673	err = pthread_attr_setstacksize(&attr,
674	(size_t) max(`16` * `1024`, (int)PTHREAD_STACK_MIN));
675	BUG_ON(err);
676	mutex_lock(&sched->start_work_mutex);
677	mutex_lock(&sched->work_done_wait_mutex);
678	for (i = `0`; i < sched->nr_tasks; i++) {
679	struct sched_thread_parms parms = malloc(sizeof(parms));
680	BUG_ON(parms == NULL);
681	parms->task = task = sched->tasks[i];
682	parms->sched = sched;
683	parms->fd = self_open_counters(sched, cur_task: i);
684	sem_init(&task->sleep_sem, `0`, `0`);
685	sem_init(&task->ready_for_work, `0`, `0`);
686	sem_init(&task->work_done_sem, `0`, `0`);
687	task->curr_event = `0`;
688	err = pthread_create(&task->thread, &attr, thread_func, parms);
689	BUG_ON(err);
690	}
691	}
692
693	static void destroy_tasks(struct perf_sched *sched)
694	UNLOCK_FUNCTION(sched->start_work_mutex)
695	UNLOCK_FUNCTION(sched->work_done_wait_mutex)
696	{
697	struct task_desc *task;
698	unsigned long i;
699	int err;
700
701	mutex_unlock(mtx: &sched->start_work_mutex);
702	mutex_unlock(mtx: &sched->work_done_wait_mutex);
703	/ Get rid of threads so they won't be upset by mutex destrunction /
704	for (i = `0`; i < sched->nr_tasks; i++) {
705	task = sched->tasks[i];
706	err = pthread_join(task->thread, NULL);
707	BUG_ON(err);
708	sem_destroy(&task->sleep_sem);
709	sem_destroy(&task->ready_for_work);
710	sem_destroy(&task->work_done_sem);
711	}
712	}
713
714	static void wait_for_tasks(struct perf_sched *sched)
715	EXCLUSIVE_LOCKS_REQUIRED(sched->work_done_wait_mutex)
716	EXCLUSIVE_LOCKS_REQUIRED(sched->start_work_mutex)
717	{
718	u64 cpu_usage_0, cpu_usage_1;
719	struct task_desc *task;
720	unsigned long i, ret;
721
722	sched->start_time = get_nsecs();
723	sched->cpu_usage = `0`;
724	mutex_unlock(mtx: &sched->work_done_wait_mutex);
725
726	for (i = `0`; i < sched->nr_tasks; i++) {
727	task = sched->tasks[i];
728	ret = sem_wait(&task->ready_for_work);
729	BUG_ON(ret);
730	sem_init(&task->ready_for_work, `0`, `0`);
731	}
732	mutex_lock(&sched->work_done_wait_mutex);
733
734	cpu_usage_0 = get_cpu_usage_nsec_parent();
735
736	mutex_unlock(mtx: &sched->start_work_mutex);
737
738	for (i = `0`; i < sched->nr_tasks; i++) {
739	task = sched->tasks[i];
740	ret = sem_wait(&task->work_done_sem);
741	BUG_ON(ret);
742	sem_init(&task->work_done_sem, `0`, `0`);
743	sched->cpu_usage += task->cpu_usage;
744	task->cpu_usage = `0`;
745	}
746
747	cpu_usage_1 = get_cpu_usage_nsec_parent();
748	if (!sched->runavg_cpu_usage)
749	sched->runavg_cpu_usage = sched->cpu_usage;
750	sched->runavg_cpu_usage = (sched->runavg_cpu_usage * (sched->replay_repeat - `1`) + sched->cpu_usage) / sched->replay_repeat;
751
752	sched->parent_cpu_usage = cpu_usage_1 - cpu_usage_0;
753	if (!sched->runavg_parent_cpu_usage)
754	sched->runavg_parent_cpu_usage = sched->parent_cpu_usage;
755	sched->runavg_parent_cpu_usage = (sched->runavg_parent_cpu_usage * (sched->replay_repeat - `1`) +
756	sched->parent_cpu_usage)/sched->replay_repeat;
757
758	mutex_lock(&sched->start_work_mutex);
759
760	for (i = `0`; i < sched->nr_tasks; i++) {
761	task = sched->tasks[i];
762	sem_init(&task->sleep_sem, `0`, `0`);
763	task->curr_event = `0`;
764	}
765	}
766
767	static void run_one_test(struct perf_sched *sched)
768	EXCLUSIVE_LOCKS_REQUIRED(sched->work_done_wait_mutex)
769	EXCLUSIVE_LOCKS_REQUIRED(sched->start_work_mutex)
770	{
771	u64 T0, T1, delta, avg_delta, fluct;
772
773	T0 = get_nsecs();
774	wait_for_tasks(sched);
775	T1 = get_nsecs();
776
777	delta = T1 - T0;
778	sched->sum_runtime += delta;
779	sched->nr_runs++;
780
781	avg_delta = sched->sum_runtime / sched->nr_runs;
782	if (delta < avg_delta)
783	fluct = avg_delta - delta;
784	else
785	fluct = delta - avg_delta;
786	sched->sum_fluct += fluct;
787	if (!sched->run_avg)
788	sched->run_avg = delta;
789	sched->run_avg = (sched->run_avg * (sched->replay_repeat - `1`) + delta) / sched->replay_repeat;
790
791	printf("#%-3ld: %0.3f, ", sched->nr_runs, (double)delta / NSEC_PER_MSEC);
792
793	printf("ravg: %0.2f, ", (double)sched->run_avg / NSEC_PER_MSEC);
794
795	printf("cpu: %0.2f / %0.2f",
796	(double)sched->cpu_usage / NSEC_PER_MSEC, (double)sched->runavg_cpu_usage / NSEC_PER_MSEC);
797
798	#if 0
799	/*
800	* rusage statistics done by the parent, these are less
801	* accurate than the sched->sum_exec_runtime based statistics:
802	*/
803	printf(" [%0.2f / %0.2f]",
804	(double)sched->parent_cpu_usage / NSEC_PER_MSEC,
805	(double)sched->runavg_parent_cpu_usage / NSEC_PER_MSEC);
806	#endif
807
808	printf("\n");
809
810	if (sched->nr_sleep_corrections)
811	printf(" (%ld sleep corrections)\n", sched->nr_sleep_corrections);
812	sched->nr_sleep_corrections = `0`;
813	}
814
815	static void test_calibrations(struct perf_sched *sched)
816	{
817	u64 T0, T1;
818
819	T0 = get_nsecs();
820	burn_nsecs(sched, NSEC_PER_MSEC);
821	T1 = get_nsecs();
822
823	printf("the run test took %" PRIu64 " nsecs\n", T1 - T0);
824
825	T0 = get_nsecs();
826	sleep_nsecs(NSEC_PER_MSEC);
827	T1 = get_nsecs();
828
829	printf("the sleep test took %" PRIu64 " nsecs\n", T1 - T0);
830	}
831
832	static int
833	replay_wakeup_event(struct perf_sched *sched,
834	struct evsel evsel, struct* perf_sample *sample,
835	struct machine *machine __maybe_unused)
836	{
837	const char *comm = evsel__strval(evsel, sample, "comm");
838	const u32 pid = evsel__intval(evsel, sample, "pid");
839	struct task_desc waker, wakee;
840
841	if (verbose > `0`) {
842	printf("sched_wakeup event %p\n", evsel);
843
844	printf(" ... pid %d woke up %s/%d\n", sample->tid, comm, pid);
845	}
846
847	waker = register_pid(sched, pid: sample->tid, comm: "<unknown>");
848	wakee = register_pid(sched, pid, comm);
849
850	add_sched_event_wakeup(sched, task: waker, timestamp: sample->time, wakee);
851	return `0`;
852	}
853
854	static int replay_switch_event(struct perf_sched *sched,
855	struct evsel *evsel,
856	struct perf_sample *sample,
857	struct machine *machine __maybe_unused)
858	{
859	const char *prev_comm = evsel__strval(evsel, sample, "prev_comm"),
860	*next_comm = evsel__strval(evsel, sample, "next_comm");
861	const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"),
862	next_pid = evsel__intval(evsel, sample, "next_pid");
863	const u64 prev_state = evsel__intval(evsel, sample, "prev_state");
864	struct task_desc prev, __maybe_unused next;
865	u64 timestamp0, timestamp = sample->time;
866	int cpu = sample->cpu;
867	s64 delta;
868
869	if (verbose > `0`)
870	printf("sched_switch event %p\n", evsel);
871
872	if (cpu >= MAX_CPUS \|\| cpu < `0`)
873	return `0`;
874
875	timestamp0 = sched->cpu_last_switched[cpu];
876	if (timestamp0)
877	delta = timestamp - timestamp0;
878	else
879	delta = `0`;
880
881	if (delta < `0`) {
882	pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
883	return -`1`;
884	}
885
886	pr_debug(" ... switch from %s/%d to %s/%d [ran %" PRIu64 " nsecs]\n",
887	prev_comm, prev_pid, next_comm, next_pid, delta);
888
889	prev = register_pid(sched, pid: prev_pid, comm: prev_comm);
890	next = register_pid(sched, pid: next_pid, comm: next_comm);
891
892	sched->cpu_last_switched[cpu] = timestamp;
893
894	add_sched_event_run(sched, task: prev, timestamp, duration: delta);
895	add_sched_event_sleep(sched, task: prev, timestamp, task_state: prev_state);
896
897	return `0`;
898	}
899
900	static int replay_fork_event(struct perf_sched *sched,
901	union perf_event *event,
902	struct machine *machine)
903	{
904	struct thread child, parent;
905
906	child = machine__findnew_thread(machine, pid: event->fork.pid,
907	tid: event->fork.tid);
908	parent = machine__findnew_thread(machine, pid: event->fork.ppid,
909	tid: event->fork.ptid);
910
911	if (child == NULL \|\| parent == NULL) {
912	pr_debug("thread does not exist on fork event: child %p, parent %p\n",
913	child, parent);
914	goto out_put;
915	}
916
917	if (verbose > `0`) {
918	printf("fork event\n");
919	printf("... parent: %s/%d\n", thread__comm_str(thread: parent), thread__tid(thread: parent));
920	printf("... child: %s/%d\n", thread__comm_str(thread: child), thread__tid(thread: child));
921	}
922
923	register_pid(sched, pid: thread__tid(thread: parent), comm: thread__comm_str(thread: parent));
924	register_pid(sched, pid: thread__tid(thread: child), comm: thread__comm_str(thread: child));
925	out_put:
926	thread__put(thread: child);
927	thread__put(thread: parent);
928	return `0`;
929	}
930
931	struct sort_dimension {
932	const char *name;
933	sort_fn_t cmp;
934	struct list_head list;
935	};
936
937	/*
938	* handle runtime stats saved per thread
939	*/
940	static struct thread_runtime thread__init_runtime(struct* thread *thread)
941	{
942	struct thread_runtime *r;
943
944	r = zalloc(sizeof(struct thread_runtime));
945	if (!r)
946	return NULL;
947
948	init_stats(stats: &r->run_stats);
949	thread__set_priv(thread, p: r);
950
951	return r;
952	}
953
/*
 * Return the thread_runtime attached to @thread, creating it on first
 * use.  Returns NULL (after a debug message) if it cannot be
 * allocated.
 */
static struct thread_runtime *thread__get_runtime(struct thread *thread)
{
	struct thread_runtime *rt = thread__priv(thread);

	if (rt != NULL)
		return rt;

	rt = thread__init_runtime(thread);
	if (rt == NULL)
		pr_debug("Failed to malloc memory for runtime data.\n");

	return rt;
}
967
968	static int
969	thread_lat_cmp(struct list_head list, struct* work_atoms l, struct* work_atoms *r)
970	{
971	struct sort_dimension *sort;
972	int ret = `0`;
973
974	BUG_ON(list_empty(list));
975
976	list_for_each_entry(sort, list, list) {
977	ret = sort->cmp(l, r);
978	if (ret)
979	return ret;
980	}
981
982	return ret;
983	}
984
985	static struct work_atoms *
986	thread_atoms_search(struct rb_root_cached root, struct* thread *thread,
987	struct list_head *sort_list)
988	{
989	struct rb_node *node = root->rb_root.rb_node;
990	struct work_atoms key = { .thread = thread };
991
992	while (node) {
993	struct work_atoms *atoms;
994	int cmp;
995
996	atoms = container_of(node, struct work_atoms, node);
997
998	cmp = thread_lat_cmp(list: sort_list, l: &key, r: atoms);
999	if (cmp > `0`)
1000	node = node->rb_left;
1001	else if (cmp < `0`)
1002	node = node->rb_right;
1003	else {
1004	BUG_ON(thread != atoms->thread);
1005	return atoms;
1006	}
1007	}
1008	return NULL;
1009	}
1010
1011	static void
1012	__thread_latency_insert(struct rb_root_cached root, struct* work_atoms *data,
1013	struct list_head *sort_list)
1014	{
1015	struct rb_node *new = &(root->rb_root.rb_node), parent = NULL;
1016	bool leftmost = true;
1017
1018	while (*new) {
1019	struct work_atoms *this;
1020	int cmp;
1021
1022	this = container_of(new, struct* work_atoms, node);
1023	parent = *new;
1024
1025	cmp = thread_lat_cmp(list: sort_list, l: data, r: this);
1026
1027	if (cmp > `0`)
1028	new = &((*new)->rb_left);
1029	else {
1030	new = &((*new)->rb_right);
1031	leftmost = false;
1032	}
1033	}
1034
1035	rb_link_node(node: &data->node, parent, rb_link: new);
1036	rb_insert_color_cached(node: &data->node, root, leftmost);
1037	}
1038
1039	static int thread_atoms_insert(struct perf_sched sched, struct* thread *thread)
1040	{
1041	struct work_atoms atoms = zalloc(sizeof(atoms));
1042	if (!atoms) {
1043	pr_err("No memory at %s\n", __func__);
1044	return -`1`;
1045	}
1046
1047	atoms->thread = thread__get(thread);
1048	INIT_LIST_HEAD(list: &atoms->work_list);
1049	__thread_latency_insert(root: &sched->atom_root, data: atoms, sort_list: &sched->cmp_pid);
1050	return `0`;
1051	}
1052
1053	static char sched_out_state(u64 prev_state)
1054	{
1055	const char *str = TASK_STATE_TO_CHAR_STR;
1056
1057	return str[prev_state];
1058	}
1059
1060	static int
1061	add_sched_out_event(struct work_atoms *atoms,
1062	char run_state,
1063	u64 timestamp)
1064	{
1065	struct work_atom atom = zalloc(sizeof(atom));
1066	if (!atom) {
1067	pr_err("Non memory at %s", __func__);
1068	return -`1`;
1069	}
1070
1071	atom->sched_out_time = timestamp;
1072
1073	if (run_state == `'R'`) {
1074	atom->state = THREAD_WAIT_CPU;
1075	atom->wake_up_time = atom->sched_out_time;
1076	}
1077
1078	list_add_tail(new: &atom->list, head: &atoms->work_list);
1079	return `0`;
1080	}
1081
1082	static void
1083	add_runtime_event(struct work_atoms *atoms, u64 delta,
1084	u64 timestamp __maybe_unused)
1085	{
1086	struct work_atom *atom;
1087
1088	BUG_ON(list_empty(&atoms->work_list));
1089
1090	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1091
1092	atom->runtime += delta;
1093	atoms->total_runtime += delta;
1094	}
1095
1096	static void
1097	add_sched_in_event(struct work_atoms *atoms, u64 timestamp)
1098	{
1099	struct work_atom *atom;
1100	u64 delta;
1101
1102	if (list_empty(head: &atoms->work_list))
1103	return;
1104
1105	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1106
1107	if (atom->state != THREAD_WAIT_CPU)
1108	return;
1109
1110	if (timestamp < atom->wake_up_time) {
1111	atom->state = THREAD_IGNORE;
1112	return;
1113	}
1114
1115	atom->state = THREAD_SCHED_IN;
1116	atom->sched_in_time = timestamp;
1117
1118	delta = atom->sched_in_time - atom->wake_up_time;
1119	atoms->total_lat += delta;
1120	if (delta > atoms->max_lat) {
1121	atoms->max_lat = delta;
1122	atoms->max_lat_start = atom->wake_up_time;
1123	atoms->max_lat_end = timestamp;
1124	}
1125	atoms->nb_atoms++;
1126	}
1127
1128	static int latency_switch_event(struct perf_sched *sched,
1129	struct evsel *evsel,
1130	struct perf_sample *sample,
1131	struct machine *machine)
1132	{
1133	const u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"),
1134	next_pid = evsel__intval(evsel, sample, "next_pid");
1135	const u64 prev_state = evsel__intval(evsel, sample, "prev_state");
1136	struct work_atoms out_events, in_events;
1137	struct thread sched_out, sched_in;
1138	u64 timestamp0, timestamp = sample->time;
1139	int cpu = sample->cpu, err = -`1`;
1140	s64 delta;
1141
1142	BUG_ON(cpu >= MAX_CPUS \|\| cpu < `0`);
1143
1144	timestamp0 = sched->cpu_last_switched[cpu];
1145	sched->cpu_last_switched[cpu] = timestamp;
1146	if (timestamp0)
1147	delta = timestamp - timestamp0;
1148	else
1149	delta = `0`;
1150
1151	if (delta < `0`) {
1152	pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1153	return -`1`;
1154	}
1155
1156	sched_out = machine__findnew_thread(machine, pid: -`1`, tid: prev_pid);
1157	sched_in = machine__findnew_thread(machine, pid: -`1`, tid: next_pid);
1158	if (sched_out == NULL \|\| sched_in == NULL)
1159	goto out_put;
1160
1161	out_events = thread_atoms_search(root: &sched->atom_root, thread: sched_out, sort_list: &sched->cmp_pid);
1162	if (!out_events) {
1163	if (thread_atoms_insert(sched, thread: sched_out))
1164	goto out_put;
1165	out_events = thread_atoms_search(root: &sched->atom_root, thread: sched_out, sort_list: &sched->cmp_pid);
1166	if (!out_events) {
1167	pr_err("out-event: Internal tree error");
1168	goto out_put;
1169	}
1170	}
1171	if (add_sched_out_event(atoms: out_events, run_state: sched_out_state(prev_state), timestamp))
1172	return -`1`;
1173
1174	in_events = thread_atoms_search(root: &sched->atom_root, thread: sched_in, sort_list: &sched->cmp_pid);
1175	if (!in_events) {
1176	if (thread_atoms_insert(sched, thread: sched_in))
1177	goto out_put;
1178	in_events = thread_atoms_search(root: &sched->atom_root, thread: sched_in, sort_list: &sched->cmp_pid);
1179	if (!in_events) {
1180	pr_err("in-event: Internal tree error");
1181	goto out_put;
1182	}
1183	/*
1184	* Take came in we have not heard about yet,
1185	* add in an initial atom in runnable state:
1186	*/
1187	if (add_sched_out_event(atoms: in_events, run_state: `'R'`, timestamp))
1188	goto out_put;
1189	}
1190	add_sched_in_event(atoms: in_events, timestamp);
1191	err = `0`;
1192	out_put:
1193	thread__put(thread: sched_out);
1194	thread__put(thread: sched_in);
1195	return err;
1196	}
1197
1198	static int latency_runtime_event(struct perf_sched *sched,
1199	struct evsel *evsel,
1200	struct perf_sample *sample,
1201	struct machine *machine)
1202	{
1203	const u32 pid = evsel__intval(evsel, sample, "pid");
1204	const u64 runtime = evsel__intval(evsel, sample, "runtime");
1205	struct thread *thread = machine__findnew_thread(machine, pid: -`1`, tid: pid);
1206	struct work_atoms *atoms = thread_atoms_search(root: &sched->atom_root, thread, sort_list: &sched->cmp_pid);
1207	u64 timestamp = sample->time;
1208	int cpu = sample->cpu, err = -`1`;
1209
1210	if (thread == NULL)
1211	return -`1`;
1212
1213	BUG_ON(cpu >= MAX_CPUS \|\| cpu < `0`);
1214	if (!atoms) {
1215	if (thread_atoms_insert(sched, thread))
1216	goto out_put;
1217	atoms = thread_atoms_search(root: &sched->atom_root, thread, sort_list: &sched->cmp_pid);
1218	if (!atoms) {
1219	pr_err("in-event: Internal tree error");
1220	goto out_put;
1221	}
1222	if (add_sched_out_event(atoms, run_state: `'R'`, timestamp))
1223	goto out_put;
1224	}
1225
1226	add_runtime_event(atoms, delta: runtime, timestamp);
1227	err = `0`;
1228	out_put:
1229	thread__put(thread);
1230	return err;
1231	}
1232
1233	static int latency_wakeup_event(struct perf_sched *sched,
1234	struct evsel *evsel,
1235	struct perf_sample *sample,
1236	struct machine *machine)
1237	{
1238	const u32 pid = evsel__intval(evsel, sample, "pid");
1239	struct work_atoms *atoms;
1240	struct work_atom *atom;
1241	struct thread *wakee;
1242	u64 timestamp = sample->time;
1243	int err = -`1`;
1244
1245	wakee = machine__findnew_thread(machine, pid: -`1`, tid: pid);
1246	if (wakee == NULL)
1247	return -`1`;
1248	atoms = thread_atoms_search(root: &sched->atom_root, thread: wakee, sort_list: &sched->cmp_pid);
1249	if (!atoms) {
1250	if (thread_atoms_insert(sched, thread: wakee))
1251	goto out_put;
1252	atoms = thread_atoms_search(root: &sched->atom_root, thread: wakee, sort_list: &sched->cmp_pid);
1253	if (!atoms) {
1254	pr_err("wakeup-event: Internal tree error");
1255	goto out_put;
1256	}
1257	if (add_sched_out_event(atoms, run_state: `'S'`, timestamp))
1258	goto out_put;
1259	}
1260
1261	BUG_ON(list_empty(&atoms->work_list));
1262
1263	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1264
1265	/*
1266	* As we do not guarantee the wakeup event happens when
1267	* task is out of run queue, also may happen when task is
1268	* on run queue and wakeup only change ->state to TASK_RUNNING,
1269	* then we should not set the ->wake_up_time when wake up a
1270	* task which is on run queue.
1271	*
1272	* You WILL be missing events if you've recorded only
1273	* one CPU, or are only looking at only one, so don't
1274	* skip in this case.
1275	*/
1276	if (sched->profile_cpu == -`1` && atom->state != THREAD_SLEEPING)
1277	goto out_ok;
1278
1279	sched->nr_timestamps++;
1280	if (atom->sched_out_time > timestamp) {
1281	sched->nr_unordered_timestamps++;
1282	goto out_ok;
1283	}
1284
1285	atom->state = THREAD_WAIT_CPU;
1286	atom->wake_up_time = timestamp;
1287	out_ok:
1288	err = `0`;
1289	out_put:
1290	thread__put(thread: wakee);
1291	return err;
1292	}
1293
1294	static int latency_migrate_task_event(struct perf_sched *sched,
1295	struct evsel *evsel,
1296	struct perf_sample *sample,
1297	struct machine *machine)
1298	{
1299	const u32 pid = evsel__intval(evsel, sample, "pid");
1300	u64 timestamp = sample->time;
1301	struct work_atoms *atoms;
1302	struct work_atom *atom;
1303	struct thread *migrant;
1304	int err = -`1`;
1305
1306	/*
1307	* Only need to worry about migration when profiling one CPU.
1308	*/
1309	if (sched->profile_cpu == -`1`)
1310	return `0`;
1311
1312	migrant = machine__findnew_thread(machine, pid: -`1`, tid: pid);
1313	if (migrant == NULL)
1314	return -`1`;
1315	atoms = thread_atoms_search(root: &sched->atom_root, thread: migrant, sort_list: &sched->cmp_pid);
1316	if (!atoms) {
1317	if (thread_atoms_insert(sched, thread: migrant))
1318	goto out_put;
1319	register_pid(sched, pid: thread__tid(thread: migrant), comm: thread__comm_str(thread: migrant));
1320	atoms = thread_atoms_search(root: &sched->atom_root, thread: migrant, sort_list: &sched->cmp_pid);
1321	if (!atoms) {
1322	pr_err("migration-event: Internal tree error");
1323	goto out_put;
1324	}
1325	if (add_sched_out_event(atoms, run_state: `'R'`, timestamp))
1326	goto out_put;
1327	}
1328
1329	BUG_ON(list_empty(&atoms->work_list));
1330
1331	atom = list_entry(atoms->work_list.prev, struct work_atom, list);
1332	atom->sched_in_time = atom->sched_out_time = atom->wake_up_time = timestamp;
1333
1334	sched->nr_timestamps++;
1335
1336	if (atom->sched_out_time > timestamp)
1337	sched->nr_unordered_timestamps++;
1338	err = `0`;
1339	out_put:
1340	thread__put(thread: migrant);
1341	return err;
1342	}
1343
1344	static void output_lat_thread(struct perf_sched sched, struct* work_atoms *work_list)
1345	{
1346	int i;
1347	int ret;
1348	u64 avg;
1349	char max_lat_start[`32`], max_lat_end[`32`];
1350
1351	if (!work_list->nb_atoms)
1352	return;
1353	/*
1354	* Ignore idle threads:
1355	*/
1356	if (!strcmp(thread__comm_str(thread: work_list->thread), "swapper"))
1357	return;
1358
1359	sched->all_runtime += work_list->total_runtime;
1360	sched->all_count += work_list->nb_atoms;
1361
1362	if (work_list->num_merged > `1`) {
1363	ret = printf(" %s:(%d) ", thread__comm_str(thread: work_list->thread),
1364	work_list->num_merged);
1365	} else {
1366	ret = printf(" %s:%d ", thread__comm_str(thread: work_list->thread),
1367	thread__tid(thread: work_list->thread));
1368	}
1369
1370	for (i = `0`; i < `24` - ret; i++)
1371	printf(" ");
1372
1373	avg = work_list->total_lat / work_list->nb_atoms;
1374	timestamp__scnprintf_usec(timestamp: work_list->max_lat_start, buf: max_lat_start, sz: sizeof(max_lat_start));
1375	timestamp__scnprintf_usec(timestamp: work_list->max_lat_end, buf: max_lat_end, sz: sizeof(max_lat_end));
1376
1377	printf("\|%11.3f ms \|%9" PRIu64 " \| avg:%8.3f ms \| max:%8.3f ms \| max start: %12s s \| max end: %12s s\n",
1378	(double)work_list->total_runtime / NSEC_PER_MSEC,
1379	work_list->nb_atoms, (double)avg / NSEC_PER_MSEC,
1380	(double)work_list->max_lat / NSEC_PER_MSEC,
1381	max_lat_start, max_lat_end);
1382	}
1383
1384	static int pid_cmp(struct work_atoms l, struct* work_atoms *r)
1385	{
1386	pid_t l_tid, r_tid;
1387
1388	if (RC_CHK_EQUAL(l->thread, r->thread))
1389	return `0`;
1390	l_tid = thread__tid(thread: l->thread);
1391	r_tid = thread__tid(thread: r->thread);
1392	if (l_tid < r_tid)
1393	return -`1`;
1394	if (l_tid > r_tid)
1395	return `1`;
1396	return (int)(RC_CHK_ACCESS(l->thread) - RC_CHK_ACCESS(r->thread));
1397	}
1398
1399	static int avg_cmp(struct work_atoms l, struct* work_atoms *r)
1400	{
1401	u64 avgl, avgr;
1402
1403	if (!l->nb_atoms)
1404	return -`1`;
1405
1406	if (!r->nb_atoms)
1407	return `1`;
1408
1409	avgl = l->total_lat / l->nb_atoms;
1410	avgr = r->total_lat / r->nb_atoms;
1411
1412	if (avgl < avgr)
1413	return -`1`;
1414	if (avgl > avgr)
1415	return `1`;
1416
1417	return `0`;
1418	}
1419
1420	static int max_cmp(struct work_atoms l, struct* work_atoms *r)
1421	{
1422	if (l->max_lat < r->max_lat)
1423	return -`1`;
1424	if (l->max_lat > r->max_lat)
1425	return `1`;
1426
1427	return `0`;
1428	}
1429
1430	static int switch_cmp(struct work_atoms l, struct* work_atoms *r)
1431	{
1432	if (l->nb_atoms < r->nb_atoms)
1433	return -`1`;
1434	if (l->nb_atoms > r->nb_atoms)
1435	return `1`;
1436
1437	return `0`;
1438	}
1439
1440	static int runtime_cmp(struct work_atoms l, struct* work_atoms *r)
1441	{
1442	if (l->total_runtime < r->total_runtime)
1443	return -`1`;
1444	if (l->total_runtime > r->total_runtime)
1445	return `1`;
1446
1447	return `0`;
1448	}
1449
1450	static int sort_dimension__add(const char tok, struct* list_head *list)
1451	{
1452	size_t i;
1453	static struct sort_dimension avg_sort_dimension = {
1454	.name = "avg",
1455	.cmp = avg_cmp,
1456	};
1457	static struct sort_dimension max_sort_dimension = {
1458	.name = "max",
1459	.cmp = max_cmp,
1460	};
1461	static struct sort_dimension pid_sort_dimension = {
1462	.name = "pid",
1463	.cmp = pid_cmp,
1464	};
1465	static struct sort_dimension runtime_sort_dimension = {
1466	.name = "runtime",
1467	.cmp = runtime_cmp,
1468	};
1469	static struct sort_dimension switch_sort_dimension = {
1470	.name = "switch",
1471	.cmp = switch_cmp,
1472	};
1473	struct sort_dimension *available_sorts[] = {
1474	&pid_sort_dimension,
1475	&avg_sort_dimension,
1476	&max_sort_dimension,
1477	&switch_sort_dimension,
1478	&runtime_sort_dimension,
1479	};
1480
1481	for (i = `0`; i < ARRAY_SIZE(available_sorts); i++) {
1482	if (!strcmp(available_sorts[i]->name, tok)) {
1483	list_add_tail(new: &available_sorts[i]->list, head: list);
1484
1485	return `0`;
1486	}
1487	}
1488
1489	return -`1`;
1490	}
1491
1492	static void perf_sched__sort_lat(struct perf_sched *sched)
1493	{
1494	struct rb_node *node;
1495	struct rb_root_cached *root = &sched->atom_root;
1496	again:
1497	for (;;) {
1498	struct work_atoms *data;
1499	node = rb_first_cached(root);
1500	if (!node)
1501	break;
1502
1503	rb_erase_cached(node, root);
1504	data = rb_entry(node, struct work_atoms, node);
1505	__thread_latency_insert(root: &sched->sorted_atom_root, data, sort_list: &sched->sort_list);
1506	}
1507	if (root == &sched->atom_root) {
1508	root = &sched->merged_atom_root;
1509	goto again;
1510	}
1511	}
1512
1513	static int process_sched_wakeup_event(struct perf_tool *tool,
1514	struct evsel *evsel,
1515	struct perf_sample *sample,
1516	struct machine *machine)
1517	{
1518	struct perf_sched sched = container_of(tool, struct* perf_sched, tool);
1519
1520	if (sched->tp_handler->wakeup_event)
1521	return sched->tp_handler->wakeup_event(sched, evsel, sample, machine);
1522
1523	return `0`;
1524	}
1525
1526	static int process_sched_wakeup_ignore(struct perf_tool *tool __maybe_unused,
1527	struct evsel *evsel __maybe_unused,
1528	struct perf_sample *sample __maybe_unused,
1529	struct machine *machine __maybe_unused)
1530	{
1531	return `0`;
1532	}
1533
/*
 * The thread's private pointer is reused to carry a single boolean flag:
 * the pointer value is stored and read back through this union.
 */
union map_priv {
	void	*ptr;
	bool	 color;
};

/* Return the color flag stored in @thread's private pointer. */
static bool thread__has_color(struct thread *thread)
{
	union map_priv priv;

	priv.ptr = thread__priv(thread);

	return priv.color;
}
1547
1548	static struct thread*
1549	map__findnew_thread(struct perf_sched sched, struct* machine *machine, pid_t pid, pid_t tid)
1550	{
1551	struct thread *thread = machine__findnew_thread(machine, pid, tid);
1552	union map_priv priv = {
1553	.color = false,
1554	};
1555
1556	if (!sched->map.color_pids \|\| !thread \|\| thread__priv(thread))
1557	return thread;
1558
1559	if (thread_map__has(threads: sched->map.color_pids, pid: tid))
1560	priv.color = true;
1561
1562	thread__set_priv(thread, p: priv.ptr);
1563	return thread;
1564	}
1565
1566	static int map_switch_event(struct perf_sched sched, struct* evsel *evsel,
1567	struct perf_sample sample, struct* machine *machine)
1568	{
1569	const u32 next_pid = evsel__intval(evsel, sample, "next_pid");
1570	struct thread *sched_in;
1571	struct thread_runtime *tr;
1572	int new_shortname;
1573	u64 timestamp0, timestamp = sample->time;
1574	s64 delta;
1575	int i;
1576	struct perf_cpu this_cpu = {
1577	.cpu = sample->cpu,
1578	};
1579	int cpus_nr;
1580	bool new_cpu = false;
1581	const char *color = PERF_COLOR_NORMAL;
1582	char stimestamp[`32`];
1583
1584	BUG_ON(this_cpu.cpu >= MAX_CPUS \|\| this_cpu.cpu < `0`);
1585
1586	if (this_cpu.cpu > sched->max_cpu.cpu)
1587	sched->max_cpu = this_cpu;
1588
1589	if (sched->map.comp) {
1590	cpus_nr = bitmap_weight(src: sched->map.comp_cpus_mask, MAX_CPUS);
1591	if (!__test_and_set_bit(this_cpu.cpu, sched->map.comp_cpus_mask)) {
1592	sched->map.comp_cpus[cpus_nr++] = this_cpu;
1593	new_cpu = true;
1594	}
1595	} else
1596	cpus_nr = sched->max_cpu.cpu;
1597
1598	timestamp0 = sched->cpu_last_switched[this_cpu.cpu];
1599	sched->cpu_last_switched[this_cpu.cpu] = timestamp;
1600	if (timestamp0)
1601	delta = timestamp - timestamp0;
1602	else
1603	delta = `0`;
1604
1605	if (delta < `0`) {
1606	pr_err("hm, delta: %" PRIu64 " < 0 ?\n", delta);
1607	return -`1`;
1608	}
1609
1610	sched_in = map__findnew_thread(sched, machine, pid: -`1`, tid: next_pid);
1611	if (sched_in == NULL)
1612	return -`1`;
1613
1614	tr = thread__get_runtime(thread: sched_in);
1615	if (tr == NULL) {
1616	thread__put(thread: sched_in);
1617	return -`1`;
1618	}
1619
1620	sched->curr_thread[this_cpu.cpu] = thread__get(thread: sched_in);
1621
1622	printf(" ");
1623
1624	new_shortname = `0`;
1625	if (!tr->shortname[`0`]) {
1626	if (!strcmp(thread__comm_str(thread: sched_in), "swapper")) {
1627	/*
1628	* Don't allocate a letter-number for swapper:0
1629	* as a shortname. Instead, we use '.' for it.
1630	*/
1631	tr->shortname[`0`] = `'.'`;
1632	tr->shortname[`1`] = `' '`;
1633	} else {
1634	tr->shortname[`0`] = sched->next_shortname1;
1635	tr->shortname[`1`] = sched->next_shortname2;
1636
1637	if (sched->next_shortname1 < `'Z'`) {
1638	sched->next_shortname1++;
1639	} else {
1640	sched->next_shortname1 = `'A'`;
1641	if (sched->next_shortname2 < `'9'`)
1642	sched->next_shortname2++;
1643	else
1644	sched->next_shortname2 = `'0'`;
1645	}
1646	}
1647	new_shortname = `1`;
1648	}
1649
1650	for (i = `0`; i < cpus_nr; i++) {
1651	struct perf_cpu cpu = {
1652	.cpu = sched->map.comp ? sched->map.comp_cpus[i].cpu : i,
1653	};
1654	struct thread *curr_thread = sched->curr_thread[cpu.cpu];
1655	struct thread_runtime *curr_tr;
1656	const char *pid_color = color;
1657	const char *cpu_color = color;
1658
1659	if (curr_thread && thread__has_color(thread: curr_thread))
1660	pid_color = COLOR_PIDS;
1661
1662	if (sched->map.cpus && !perf_cpu_map__has(sched->map.cpus, cpu))
1663	continue;
1664
1665	if (sched->map.color_cpus && perf_cpu_map__has(sched->map.color_cpus, cpu))
1666	cpu_color = COLOR_CPUS;
1667
1668	if (cpu.cpu != this_cpu.cpu)
1669	color_fprintf(stdout, color, " ");
1670	else
1671	color_fprintf(stdout, cpu_color, "*");
1672
1673	if (sched->curr_thread[cpu.cpu]) {
1674	curr_tr = thread__get_runtime(thread: sched->curr_thread[cpu.cpu]);
1675	if (curr_tr == NULL) {
1676	thread__put(thread: sched_in);
1677	return -`1`;
1678	}
1679	color_fprintf(stdout, pid_color, "%2s ", curr_tr->shortname);
1680	} else
1681	color_fprintf(stdout, color, " ");
1682	}
1683
1684	if (sched->map.cpus && !perf_cpu_map__has(sched->map.cpus, this_cpu))
1685	goto out;
1686
1687	timestamp__scnprintf_usec(timestamp, buf: stimestamp, sz: sizeof(stimestamp));
1688	color_fprintf(stdout, color, " %12s secs ", stimestamp);
1689	if (new_shortname \|\| tr->comm_changed \|\| (verbose > `0` && thread__tid(thread: sched_in))) {
1690	const char *pid_color = color;
1691
1692	if (thread__has_color(thread: sched_in))
1693	pid_color = COLOR_PIDS;
1694
1695	color_fprintf(stdout, pid_color, "%s => %s:%d",
1696	tr->shortname, thread__comm_str(sched_in), thread__tid(sched_in));
1697	tr->comm_changed = false;
1698	}
1699
1700	if (sched->map.comp && new_cpu)
1701	color_fprintf(stdout, color, " (CPU %d)", this_cpu);
1702
1703	out:
1704	color_fprintf(stdout, color, "\n");
1705
1706	thread__put(thread: sched_in);
1707
1708	return `0`;
1709	}
1710
1711	static int process_sched_switch_event(struct perf_tool *tool,
1712	struct evsel *evsel,
1713	struct perf_sample *sample,
1714	struct machine *machine)
1715	{
1716	struct perf_sched sched = container_of(tool, struct* perf_sched, tool);
1717	int this_cpu = sample->cpu, err = `0`;
1718	u32 prev_pid = evsel__intval(evsel, sample, "prev_pid"),
1719	next_pid = evsel__intval(evsel, sample, "next_pid");
1720
1721	if (sched->curr_pid[this_cpu] != (u32)-`1`) {
1722	/*
1723	* Are we trying to switch away a PID that is
1724	* not current?
1725	*/
1726	if (sched->curr_pid[this_cpu] != prev_pid)
1727	sched->nr_context_switch_bugs++;
1728	}
1729
1730	if (sched->tp_handler->switch_event)
1731	err = sched->tp_handler->switch_event(sched, evsel, sample, machine);
1732
1733	sched->curr_pid[this_cpu] = next_pid;
1734	return err;
1735	}
1736
1737	static int process_sched_runtime_event(struct perf_tool *tool,
1738	struct evsel *evsel,
1739	struct perf_sample *sample,
1740	struct machine *machine)
1741	{
1742	struct perf_sched sched = container_of(tool, struct* perf_sched, tool);
1743
1744	if (sched->tp_handler->runtime_event)
1745	return sched->tp_handler->runtime_event(sched, evsel, sample, machine);
1746
1747	return `0`;
1748	}
1749
1750	static int perf_sched__process_fork_event(struct perf_tool *tool,
1751	union perf_event *event,
1752	struct perf_sample *sample,
1753	struct machine *machine)
1754	{
1755	struct perf_sched sched = container_of(tool, struct* perf_sched, tool);
1756
1757	/ run the fork event through the perf machinery /
1758	perf_event__process_fork(tool, event, sample, machine);
1759
1760	/ and then run additional processing needed for this command /
1761	if (sched->tp_handler->fork_event)
1762	return sched->tp_handler->fork_event(sched, event, machine);
1763
1764	return `0`;
1765	}
1766
1767	static int process_sched_migrate_task_event(struct perf_tool *tool,
1768	struct evsel *evsel,
1769	struct perf_sample *sample,
1770	struct machine *machine)
1771	{
1772	struct perf_sched sched = container_of(tool, struct* perf_sched, tool);
1773
1774	if (sched->tp_handler->migrate_task_event)
1775	return sched->tp_handler->migrate_task_event(sched, evsel, sample, machine);
1776
1777	return `0`;
1778	}
1779
1780	typedef int (tracepoint_handler)(struct* perf_tool *tool,
1781	struct evsel *evsel,
1782	struct perf_sample *sample,
1783	struct machine *machine);
1784
1785	static int perf_sched__process_tracepoint_sample(struct perf_tool *tool __maybe_unused,
1786	union perf_event *event __maybe_unused,
1787	struct perf_sample *sample,
1788	struct evsel *evsel,
1789	struct machine *machine)
1790	{
1791	int err = `0`;
1792
1793	if (evsel->handler != NULL) {
1794	tracepoint_handler f = evsel->handler;
1795	err = f(tool, evsel, sample, machine);
1796	}
1797
1798	return err;
1799	}
1800
1801	static int perf_sched__process_comm(struct perf_tool *tool __maybe_unused,
1802	union perf_event *event,
1803	struct perf_sample *sample,
1804	struct machine *machine)
1805	{
1806	struct thread *thread;
1807	struct thread_runtime *tr;
1808	int err;
1809
1810	err = perf_event__process_comm(tool, event, sample, machine);
1811	if (err)
1812	return err;
1813
1814	thread = machine__find_thread(machine, pid: sample->pid, tid: sample->tid);
1815	if (!thread) {
1816	pr_err("Internal error: can't find thread\n");
1817	return -`1`;
1818	}
1819
1820	tr = thread__get_runtime(thread);
1821	if (tr == NULL) {
1822	thread__put(thread);
1823	return -`1`;
1824	}
1825
1826	tr->comm_changed = true;
1827	thread__put(thread);
1828
1829	return `0`;
1830	}
1831
1832	static int perf_sched__read_events(struct perf_sched *sched)
1833	{
1834	struct evsel_str_handler handlers[] = {
1835	{ "sched:sched_switch", process_sched_switch_event, },
1836	{ "sched:sched_stat_runtime", process_sched_runtime_event, },
1837	{ "sched:sched_wakeup", process_sched_wakeup_event, },
1838	{ "sched:sched_waking", process_sched_wakeup_event, },
1839	{ "sched:sched_wakeup_new", process_sched_wakeup_event, },
1840	{ "sched:sched_migrate_task", process_sched_migrate_task_event, },
1841	};
1842	struct perf_session *session;
1843	struct perf_data data = {
1844	.path = input_name,
1845	.mode = PERF_DATA_MODE_READ,
1846	.force = sched->force,
1847	};
1848	int rc = -`1`;
1849
1850	session = perf_session__new(data: &data, tool: &sched->tool);
1851	if (IS_ERR(ptr: session)) {
1852	pr_debug("Error creating perf session");
1853	return PTR_ERR(ptr: session);
1854	}
1855
1856	symbol__init(env: &session->header.env);
1857
1858	/ prefer sched_waking if it is captured /
1859	if (evlist__find_tracepoint_by_name(evlist: session->evlist, name: "sched:sched_waking"))
1860	handlers[`2`].handler = process_sched_wakeup_ignore;
1861
1862	if (perf_session__set_tracepoints_handlers(session, handlers))
1863	goto out_delete;
1864
1865	if (perf_session__has_traces(session, msg: "record -R")) {
1866	int err = perf_session__process_events(session);
1867	if (err) {
1868	pr_err("Failed to process events, error %d", err);
1869	goto out_delete;
1870	}
1871
1872	sched->nr_events = session->evlist->stats.nr_events[`0`];
1873	sched->nr_lost_events = session->evlist->stats.total_lost;
1874	sched->nr_lost_chunks = session->evlist->stats.nr_events[PERF_RECORD_LOST];
1875	}
1876
1877	rc = `0`;
1878	out_delete:
1879	perf_session__delete(session);
1880	return rc;
1881	}
1882
1883	/*
1884	* scheduling times are printed as msec.usec
1885	*/
1886	static inline void print_sched_time(unsigned long long nsecs, int width)
1887	{
1888	unsigned long msecs;
1889	unsigned long usecs;
1890
1891	msecs = nsecs / NSEC_PER_MSEC;
1892	nsecs -= msecs * NSEC_PER_MSEC;
1893	usecs = nsecs / NSEC_PER_USEC;
1894	printf("%*lu.%03lu ", width, msecs, usecs);
1895	}
1896
1897	/*
1898	* returns runtime data for event, allocating memory for it the
1899	* first time it is used.
1900	*/
1901	static struct evsel_runtime evsel__get_runtime(struct* evsel *evsel)
1902	{
1903	struct evsel_runtime *r = evsel->priv;
1904
1905	if (r == NULL) {
1906	r = zalloc(sizeof(struct evsel_runtime));
1907	evsel->priv = r;
1908	}
1909
1910	return r;
1911	}
1912
1913	/*
1914	* save last time event was seen per cpu
1915	*/
1916	static void evsel__save_time(struct evsel *evsel, u64 timestamp, u32 cpu)
1917	{
1918	struct evsel_runtime *r = evsel__get_runtime(evsel);
1919
1920	if (r == NULL)
1921	return;
1922
1923	if ((cpu >= r->ncpu) \|\| (r->last_time == NULL)) {
1924	int i, n = __roundup_pow_of_two(n: cpu+`1`);
1925	void *p = r->last_time;
1926
1927	p = realloc(r->last_time, n * sizeof(u64));
1928	if (!p)
1929	return;
1930
1931	r->last_time = p;
1932	for (i = r->ncpu; i < n; ++i)
1933	r->last_time[i] = (u64) `0`;
1934
1935	r->ncpu = n;
1936	}
1937
1938	r->last_time[cpu] = timestamp;
1939	}
1940
1941	/ returns last time this event was seen on the given cpu /
1942	static u64 evsel__get_time(struct evsel *evsel, u32 cpu)
1943	{
1944	struct evsel_runtime *r = evsel__get_runtime(evsel);
1945
1946	if ((r == NULL) \|\| (r->last_time == NULL) \|\| (cpu >= r->ncpu))
1947	return `0`;
1948
1949	return r->last_time[cpu];
1950	}
1951
/* Width of the task-name column; widened when a longer name is formatted. */
static int comm_width = 30;

/*
 * Format @thread as "comm", "comm[tid]" or "comm[tid/pid]" and widen
 * comm_width if the result is longer than the current column.
 *
 * The string lives in a static buffer that is overwritten by the next
 * call.
 */
static char *timehist_get_commstr(struct thread *thread)
{
	static char str[32];
	const char *comm = thread__comm_str(thread);
	pid_t tid = thread__tid(thread);
	pid_t pid = thread__pid(thread);
	int len;

	if (pid == 0)
		len = scnprintf(str, sizeof(str), "%s", comm);
	else if (tid == pid)
		len = scnprintf(str, sizeof(str), "%s[%d]", comm, tid);
	else
		len = scnprintf(str, sizeof(str), "%s[%d/%d]", comm, tid, pid);

	if (comm_width < len)
		comm_width = len;

	return str;
}
1976
1977	static void timehist_header(struct perf_sched *sched)
1978	{
1979	u32 ncpus = sched->max_cpu.cpu + `1`;
1980	u32 i, j;
1981
1982	printf("%15s %6s ", "time", "cpu");
1983
1984	if (sched->show_cpu_visual) {
1985	printf(" ");
1986	for (i = `0`, j = `0`; i < ncpus; ++i) {
1987	printf("%x", j++);
1988	if (j > `15`)
1989	j = `0`;
1990	}
1991	printf(" ");
1992	}
1993
1994	printf(" %-*s %9s %9s %9s", comm_width,
1995	"task name", "wait time", "sch delay", "run time");
1996
1997	if (sched->show_state)
1998	printf(" %s", "state");
1999
2000	printf("\n");
2001
2002	/*
2003	* units row
2004	*/
2005	printf("%15s %-6s ", "", "");
2006
2007	if (sched->show_cpu_visual)
2008	printf(" %*s ", ncpus, "");
2009
2010	printf(" %-*s %9s %9s %9s", comm_width,
2011	"[tid/pid]", "(msec)", "(msec)", "(msec)");
2012
2013	if (sched->show_state)
2014	printf(" %5s", "");
2015
2016	printf("\n");
2017
2018	/*
2019	* separator
2020	*/
2021	printf("%.15s %.6s ", graph_dotted_line, graph_dotted_line);
2022
2023	if (sched->show_cpu_visual)
2024	printf(" %.*s ", ncpus, graph_dotted_line);
2025
2026	printf(" %.*s %.9s %.9s %.9s", comm_width,
2027	graph_dotted_line, graph_dotted_line, graph_dotted_line,
2028	graph_dotted_line);
2029
2030	if (sched->show_state)
2031	printf(" %.5s", graph_dotted_line);
2032
2033	printf("\n");
2034	}
2035
2036	static char task_state_char(struct thread thread, int* state)
2037	{
2038	static const char state_to_char[] = TASK_STATE_TO_CHAR_STR;
2039	unsigned bit = state ? ffs(state) : `0`;
2040
2041	/ 'I' for idle /
2042	if (thread__tid(thread) == `0`)
2043	return `'I'`;
2044
2045	return bit < sizeof(state_to_char) - `1` ? state_to_char[bit] : `'?'`;
2046	}
2047
2048	static void timehist_print_sample(struct perf_sched *sched,
2049	struct evsel *evsel,
2050	struct perf_sample *sample,
2051	struct addr_location *al,
2052	struct thread *thread,
2053	u64 t, int state)
2054	{
2055	struct thread_runtime *tr = thread__priv(thread);
2056	const char *next_comm = evsel__strval(evsel, sample, "next_comm");
2057	const u32 next_pid = evsel__intval(evsel, sample, "next_pid");
2058	u32 max_cpus = sched->max_cpu.cpu + `1`;
2059	char tstr[`64`];
2060	char nstr[`30`];
2061	u64 wait_time;
2062
2063	if (cpu_list && !test_bit(sample->cpu, cpu_bitmap))
2064	return;
2065
2066	timestamp__scnprintf_usec(timestamp: t, buf: tstr, sz: sizeof(tstr));
2067	printf("%15s [%04d] ", tstr, sample->cpu);
2068
2069	if (sched->show_cpu_visual) {
2070	u32 i;
2071	char c;
2072
2073	printf(" ");
2074	for (i = `0`; i < max_cpus; ++i) {
2075	/ flag idle times with 'i'; others are sched events /
2076	if (i == sample->cpu)
2077	c = (thread__tid(thread) == `0`) ? `'i'` : `'s'`;
2078	else
2079	c = `' '`;
2080	printf("%c", c);
2081	}
2082	printf(" ");
2083	}
2084
2085	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2086
2087	wait_time = tr->dt_sleep + tr->dt_iowait + tr->dt_preempt;
2088	print_sched_time(nsecs: wait_time, width: `6`);
2089
2090	print_sched_time(nsecs: tr->dt_delay, width: `6`);
2091	print_sched_time(nsecs: tr->dt_run, width: `6`);
2092
2093	if (sched->show_state)
2094	printf(" %5c ", task_state_char(thread, state));
2095
2096	if (sched->show_next) {
2097	snprintf(buf: nstr, size: sizeof(nstr), fmt: "next: %s[%d]", next_comm, next_pid);
2098	printf(" %-*s", comm_width, nstr);
2099	}
2100
2101	if (sched->show_wakeups && !sched->show_next)
2102	printf(" %-*s", comm_width, "");
2103
2104	if (thread__tid(thread) == `0`)
2105	goto out;
2106
2107	if (sched->show_callchain)
2108	printf(" ");
2109
2110	sample__fprintf_sym(sample, al, `0`,
2111	EVSEL__PRINT_SYM \| EVSEL__PRINT_ONELINE \|
2112	EVSEL__PRINT_CALLCHAIN_ARROW \|
2113	EVSEL__PRINT_SKIP_IGNORED,
2114	get_tls_callchain_cursor(), symbol_conf.bt_stop_list, stdout);
2115
2116	out:
2117	printf("\n");
2118	}
2119
/*
 * Explanation of delta-time stats:
 *
 *            t = time of current schedule out event
 *        tprev = time of previous sched out event
 *                also time of schedule-in event for current task
 *    last_time = time of last sched change event for current task
 *                (i.e., time process was last scheduled out)
 * ready_to_run = time of wakeup for current task
 *
 * -----|------------|------------|------------|------
 *    last         ready        tprev          t
 *    time         to run
 *
 *      |-------- dt_wait --------|
 *                   |- dt_delay -|-- dt_run --|
 *
 *   dt_run = run time of current task
 *  dt_wait = time between last schedule out event for task and tprev
 *            represents time spent off the cpu
 * dt_delay = time between wakeup and schedule-in of task
 */
2142
2143	static void timehist_update_runtime_stats(struct thread_runtime *r,
2144	u64 t, u64 tprev)
2145	{
2146	r->dt_delay = `0`;
2147	r->dt_sleep = `0`;
2148	r->dt_iowait = `0`;
2149	r->dt_preempt = `0`;
2150	r->dt_run = `0`;
2151
2152	if (tprev) {
2153	r->dt_run = t - tprev;
2154	if (r->ready_to_run) {
2155	if (r->ready_to_run > tprev)
2156	pr_debug("time travel: wakeup time for task > previous sched_switch event\n");
2157	else
2158	r->dt_delay = tprev - r->ready_to_run;
2159	}
2160
2161	if (r->last_time > tprev)
2162	pr_debug("time travel: last sched out time for task > previous sched_switch event\n");
2163	else if (r->last_time) {
2164	u64 dt_wait = tprev - r->last_time;
2165
2166	if (r->last_state == TASK_RUNNING)
2167	r->dt_preempt = dt_wait;
2168	else if (r->last_state == TASK_UNINTERRUPTIBLE)
2169	r->dt_iowait = dt_wait;
2170	else
2171	r->dt_sleep = dt_wait;
2172	}
2173	}
2174
2175	update_stats(stats: &r->run_stats, val: r->dt_run);
2176
2177	r->total_run_time += r->dt_run;
2178	r->total_delay_time += r->dt_delay;
2179	r->total_sleep_time += r->dt_sleep;
2180	r->total_iowait_time += r->dt_iowait;
2181	r->total_preempt_time += r->dt_preempt;
2182	}
2183
2184	static bool is_idle_sample(struct perf_sample *sample,
2185	struct evsel *evsel)
2186	{
2187	/ pid 0 == swapper == idle task /
2188	if (strcmp(evsel__name(evsel), "sched:sched_switch") == `0`)
2189	return evsel__intval(evsel, sample, "prev_pid") == `0`;
2190
2191	return sample->pid == `0`;
2192	}
2193
2194	static void save_task_callchain(struct perf_sched *sched,
2195	struct perf_sample *sample,
2196	struct evsel *evsel,
2197	struct machine *machine)
2198	{
2199	struct callchain_cursor *cursor;
2200	struct thread *thread;
2201
2202	/ want main thread for process - has maps /
2203	thread = machine__findnew_thread(machine, pid: sample->pid, tid: sample->pid);
2204	if (thread == NULL) {
2205	pr_debug("Failed to get thread for pid %d.\n", sample->pid);
2206	return;
2207	}
2208
2209	if (!sched->show_callchain \|\| sample->callchain == NULL)
2210	return;
2211
2212	cursor = get_tls_callchain_cursor();
2213
2214	if (thread__resolve_callchain(thread, cursor, evsel, sample,
2215	NULL, NULL, max_stack: sched->max_stack + `2`) != `0`) {
2216	if (verbose > `0`)
2217	pr_err("Failed to resolve callchain. Skipping\n");
2218
2219	return;
2220	}
2221
2222	callchain_cursor_commit(cursor);
2223
2224	while (true) {
2225	struct callchain_cursor_node *node;
2226	struct symbol *sym;
2227
2228	node = callchain_cursor_current(cursor);
2229	if (node == NULL)
2230	break;
2231
2232	sym = node->ms.sym;
2233	if (sym) {
2234	if (!strcmp(sym->name, "schedule") \|\|
2235	!strcmp(sym->name, "__schedule") \|\|
2236	!strcmp(sym->name, "preempt_schedule"))
2237	sym->ignore = `1`;
2238	}
2239
2240	callchain_cursor_advance(cursor);
2241	}
2242	}
2243
2244	static int init_idle_thread(struct thread *thread)
2245	{
2246	struct idle_thread_runtime *itr;
2247
2248	thread__set_comm(thread, comm: idle_comm, timestamp: `0`);
2249
2250	itr = zalloc(sizeof(*itr));
2251	if (itr == NULL)
2252	return -ENOMEM;
2253
2254	init_stats(stats: &itr->tr.run_stats);
2255	callchain_init(root: &itr->callchain);
2256	callchain_cursor_reset(cursor: &itr->cursor);
2257	thread__set_priv(thread, p: itr);
2258
2259	return `0`;
2260	}
2261
2262	/*
2263	* Track idle stats per cpu by maintaining a local thread
2264	* struct for the idle task on each cpu.
2265	*/
2266	static int init_idle_threads(int ncpu)
2267	{
2268	int i, ret;
2269
2270	idle_threads = zalloc(ncpu * sizeof(struct thread *));
2271	if (!idle_threads)
2272	return -ENOMEM;
2273
2274	idle_max_cpu = ncpu;
2275
2276	/ allocate the actual thread struct if needed /
2277	for (i = `0`; i < ncpu; ++i) {
2278	idle_threads[i] = thread__new(pid: `0`, tid: `0`);
2279	if (idle_threads[i] == NULL)
2280	return -ENOMEM;
2281
2282	ret = init_idle_thread(thread: idle_threads[i]);
2283	if (ret < `0`)
2284	return ret;
2285	}
2286
2287	return `0`;
2288	}
2289
2290	static void free_idle_threads(void)
2291	{
2292	int i;
2293
2294	if (idle_threads == NULL)
2295	return;
2296
2297	for (i = `0`; i < idle_max_cpu; ++i) {
2298	if ((idle_threads[i]))
2299	thread__delete(thread: idle_threads[i]);
2300	}
2301
2302	free(idle_threads);
2303	}
2304
2305	static struct thread get_idle_thread(int* cpu)
2306	{
2307	/*
2308	* expand/allocate array of pointers to local thread
2309	* structs if needed
2310	*/
2311	if ((cpu >= idle_max_cpu) \|\| (idle_threads == NULL)) {
2312	int i, j = __roundup_pow_of_two(n: cpu+`1`);
2313	void *p;
2314
2315	p = realloc(idle_threads, j * sizeof(struct thread *));
2316	if (!p)
2317	return NULL;
2318
2319	idle_threads = (struct thread **) p;
2320	for (i = idle_max_cpu; i < j; ++i)
2321	idle_threads[i] = NULL;
2322
2323	idle_max_cpu = j;
2324	}
2325
2326	/ allocate a new thread struct if needed /
2327	if (idle_threads[cpu] == NULL) {
2328	idle_threads[cpu] = thread__new(pid: `0`, tid: `0`);
2329
2330	if (idle_threads[cpu]) {
2331	if (init_idle_thread(thread: idle_threads[cpu]) < `0`)
2332	return NULL;
2333	}
2334	}
2335
2336	return idle_threads[cpu];
2337	}
2338
2339	static void save_idle_callchain(struct perf_sched *sched,
2340	struct idle_thread_runtime *itr,
2341	struct perf_sample *sample)
2342	{
2343	struct callchain_cursor *cursor;
2344
2345	if (!sched->show_callchain \|\| sample->callchain == NULL)
2346	return;
2347
2348	cursor = get_tls_callchain_cursor();
2349	if (cursor == NULL)
2350	return;
2351
2352	callchain_cursor__copy(dst: &itr->cursor, src: cursor);
2353	}
2354
2355	static struct thread timehist_get_thread(struct* perf_sched *sched,
2356	struct perf_sample *sample,
2357	struct machine *machine,
2358	struct evsel *evsel)
2359	{
2360	struct thread *thread;
2361
2362	if (is_idle_sample(sample, evsel)) {
2363	thread = get_idle_thread(cpu: sample->cpu);
2364	if (thread == NULL)
2365	pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
2366
2367	} else {
2368	/ there were samples with tid 0 but non-zero pid /
2369	thread = machine__findnew_thread(machine, pid: sample->pid,
2370	tid: sample->tid ?: sample->pid);
2371	if (thread == NULL) {
2372	pr_debug("Failed to get thread for tid %d. skipping sample.\n",
2373	sample->tid);
2374	}
2375
2376	save_task_callchain(sched, sample, evsel, machine);
2377	if (sched->idle_hist) {
2378	struct thread *idle;
2379	struct idle_thread_runtime *itr;
2380
2381	idle = get_idle_thread(cpu: sample->cpu);
2382	if (idle == NULL) {
2383	pr_err("Failed to get idle thread for cpu %d.\n", sample->cpu);
2384	return NULL;
2385	}
2386
2387	itr = thread__priv(thread: idle);
2388	if (itr == NULL)
2389	return NULL;
2390
2391	itr->last_thread = thread;
2392
2393	/ copy task callchain when entering to idle /
2394	if (evsel__intval(evsel, sample, "next_pid") == `0`)
2395	save_idle_callchain(sched, itr, sample);
2396	}
2397	}
2398
2399	return thread;
2400	}
2401
2402	static bool timehist_skip_sample(struct perf_sched *sched,
2403	struct thread *thread,
2404	struct evsel *evsel,
2405	struct perf_sample *sample)
2406	{
2407	bool rc = false;
2408
2409	if (thread__is_filtered(thread)) {
2410	rc = true;
2411	sched->skipped_samples++;
2412	}
2413
2414	if (sched->idle_hist) {
2415	if (strcmp(evsel__name(evsel), "sched:sched_switch"))
2416	rc = true;
2417	else if (evsel__intval(evsel, sample, "prev_pid") != `0` &&
2418	evsel__intval(evsel, sample, "next_pid") != `0`)
2419	rc = true;
2420	}
2421
2422	return rc;
2423	}
2424
2425	static void timehist_print_wakeup_event(struct perf_sched *sched,
2426	struct evsel *evsel,
2427	struct perf_sample *sample,
2428	struct machine *machine,
2429	struct thread *awakened)
2430	{
2431	struct thread *thread;
2432	char tstr[`64`];
2433
2434	thread = machine__findnew_thread(machine, pid: sample->pid, tid: sample->tid);
2435	if (thread == NULL)
2436	return;
2437
2438	/ show wakeup unless both awakee and awaker are filtered /
2439	if (timehist_skip_sample(sched, thread, evsel, sample) &&
2440	timehist_skip_sample(sched, thread: awakened, evsel, sample)) {
2441	return;
2442	}
2443
2444	timestamp__scnprintf_usec(timestamp: sample->time, buf: tstr, sz: sizeof(tstr));
2445	printf("%15s [%04d] ", tstr, sample->cpu);
2446	if (sched->show_cpu_visual)
2447	printf(" %*s ", sched->max_cpu.cpu + `1`, "");
2448
2449	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2450
2451	/ dt spacer /
2452	printf(" %9s %9s %9s ", "", "", "");
2453
2454	printf("awakened: %s", timehist_get_commstr(thread: awakened));
2455
2456	printf("\n");
2457	}
2458
2459	static int timehist_sched_wakeup_ignore(struct perf_tool *tool __maybe_unused,
2460	union perf_event *event __maybe_unused,
2461	struct evsel *evsel __maybe_unused,
2462	struct perf_sample *sample __maybe_unused,
2463	struct machine *machine __maybe_unused)
2464	{
2465	return `0`;
2466	}
2467
2468	static int timehist_sched_wakeup_event(struct perf_tool *tool,
2469	union perf_event *event __maybe_unused,
2470	struct evsel *evsel,
2471	struct perf_sample *sample,
2472	struct machine *machine)
2473	{
2474	struct perf_sched sched = container_of(tool, struct* perf_sched, tool);
2475	struct thread *thread;
2476	struct thread_runtime *tr = NULL;
2477	/ want pid of awakened task not pid in sample /
2478	const u32 pid = evsel__intval(evsel, sample, "pid");
2479
2480	thread = machine__findnew_thread(machine, pid: `0`, tid: pid);
2481	if (thread == NULL)
2482	return -`1`;
2483
2484	tr = thread__get_runtime(thread);
2485	if (tr == NULL)
2486	return -`1`;
2487
2488	if (tr->ready_to_run == `0`)
2489	tr->ready_to_run = sample->time;
2490
2491	/ show wakeups if requested /
2492	if (sched->show_wakeups &&
2493	!perf_time__skip_sample(ptime: &sched->ptime, timestamp: sample->time))
2494	timehist_print_wakeup_event(sched, evsel, sample, machine, awakened: thread);
2495
2496	return `0`;
2497	}
2498
2499	static void timehist_print_migration_event(struct perf_sched *sched,
2500	struct evsel *evsel,
2501	struct perf_sample *sample,
2502	struct machine *machine,
2503	struct thread *migrated)
2504	{
2505	struct thread *thread;
2506	char tstr[`64`];
2507	u32 max_cpus;
2508	u32 ocpu, dcpu;
2509
2510	if (sched->summary_only)
2511	return;
2512
2513	max_cpus = sched->max_cpu.cpu + `1`;
2514	ocpu = evsel__intval(evsel, sample, "orig_cpu");
2515	dcpu = evsel__intval(evsel, sample, "dest_cpu");
2516
2517	thread = machine__findnew_thread(machine, pid: sample->pid, tid: sample->tid);
2518	if (thread == NULL)
2519	return;
2520
2521	if (timehist_skip_sample(sched, thread, evsel, sample) &&
2522	timehist_skip_sample(sched, thread: migrated, evsel, sample)) {
2523	return;
2524	}
2525
2526	timestamp__scnprintf_usec(timestamp: sample->time, buf: tstr, sz: sizeof(tstr));
2527	printf("%15s [%04d] ", tstr, sample->cpu);
2528
2529	if (sched->show_cpu_visual) {
2530	u32 i;
2531	char c;
2532
2533	printf(" ");
2534	for (i = `0`; i < max_cpus; ++i) {
2535	c = (i == sample->cpu) ? `'m'` : `' '`;
2536	printf("%c", c);
2537	}
2538	printf(" ");
2539	}
2540
2541	printf(" %-*s ", comm_width, timehist_get_commstr(thread));
2542
2543	/ dt spacer /
2544	printf(" %9s %9s %9s ", "", "", "");
2545
2546	printf("migrated: %s", timehist_get_commstr(thread: migrated));
2547	printf(" cpu %d => %d", ocpu, dcpu);
2548
2549	printf("\n");
2550	}
2551
2552	static int timehist_migrate_task_event(struct perf_tool *tool,
2553	union perf_event *event __maybe_unused,
2554	struct evsel *evsel,
2555	struct perf_sample *sample,
2556	struct machine *machine)
2557	{
2558	struct perf_sched sched = container_of(tool, struct* perf_sched, tool);
2559	struct thread *thread;
2560	struct thread_runtime *tr = NULL;
2561	/ want pid of migrated task not pid in sample /
2562	const u32 pid = evsel__intval(evsel, sample, "pid");
2563
2564	thread = machine__findnew_thread(machine, pid: `0`, tid: pid);
2565	if (thread == NULL)
2566	return -`1`;
2567
2568	tr = thread__get_runtime(thread);
2569	if (tr == NULL)
2570	return -`1`;
2571
2572	tr->migrations++;
2573
2574	/ show migrations if requested /
2575	timehist_print_migration_event(sched, evsel, sample, machine, migrated: thread);
2576
2577	return `0`;
2578	}
2579
2580	static int timehist_sched_change_event(struct perf_tool *tool,
2581	union perf_event *event,
2582	struct evsel *evsel,
2583	struct perf_sample *sample,
2584	struct machine *machine)
2585	{
2586	struct perf_sched sched = container_of(tool, struct* perf_sched, tool);
2587	struct perf_time_interval *ptime = &sched->ptime;
2588	struct addr_location al;
2589	struct thread *thread;
2590	struct thread_runtime *tr = NULL;
2591	u64 tprev, t = sample->time;
2592	int rc = `0`;
2593	int state = evsel__intval(evsel, sample, "prev_state");
2594
2595	addr_location__init(al: &al);
2596	if (machine__resolve(machine, al: &al, sample) < `0`) {
2597	pr_err("problem processing %d event. skipping it\n",
2598	event->header.type);
2599	rc = -`1`;
2600	goto out;
2601	}
2602
2603	thread = timehist_get_thread(sched, sample, machine, evsel);
2604	if (thread == NULL) {
2605	rc = -`1`;
2606	goto out;
2607	}
2608
2609	if (timehist_skip_sample(sched, thread, evsel, sample))
2610	goto out;
2611
2612	tr = thread__get_runtime(thread);
2613	if (tr == NULL) {
2614	rc = -`1`;
2615	goto out;
2616	}
2617
2618	tprev = evsel__get_time(evsel, cpu: sample->cpu);
2619
2620	/*
2621	* If start time given:
2622	* - sample time is under window user cares about - skip sample
2623	* - tprev is under window user cares about - reset to start of window
2624	*/
2625	if (ptime->start && ptime->start > t)
2626	goto out;
2627
2628	if (tprev && ptime->start > tprev)
2629	tprev = ptime->start;
2630
2631	/*
2632	* If end time given:
2633	* - previous sched event is out of window - we are done
2634	* - sample time is beyond window user cares about - reset it
2635	* to close out stats for time window interest
2636	*/
2637	if (ptime->end) {
2638	if (tprev > ptime->end)
2639	goto out;
2640
2641	if (t > ptime->end)
2642	t = ptime->end;
2643	}
2644
2645	if (!sched->idle_hist \|\| thread__tid(thread) == `0`) {
2646	if (!cpu_list \|\| test_bit(sample->cpu, cpu_bitmap))
2647	timehist_update_runtime_stats(r: tr, t, tprev);
2648
2649	if (sched->idle_hist) {
2650	struct idle_thread_runtime itr = (void* *)tr;
2651	struct thread_runtime *last_tr;
2652
2653	BUG_ON(thread__tid(thread) != `0`);
2654
2655	if (itr->last_thread == NULL)
2656	goto out;
2657
2658	/ add current idle time as last thread's runtime /
2659	last_tr = thread__get_runtime(thread: itr->last_thread);
2660	if (last_tr == NULL)
2661	goto out;
2662
2663	timehist_update_runtime_stats(r: last_tr, t, tprev);
2664	/*
2665	* remove delta time of last thread as it's not updated
2666	* and otherwise it will show an invalid value next
2667	* time. we only care total run time and run stat.
2668	*/
2669	last_tr->dt_run = `0`;
2670	last_tr->dt_delay = `0`;
2671	last_tr->dt_sleep = `0`;
2672	last_tr->dt_iowait = `0`;
2673	last_tr->dt_preempt = `0`;
2674
2675	if (itr->cursor.nr)
2676	callchain_append(root: &itr->callchain, cursor: &itr->cursor, period: t - tprev);
2677
2678	itr->last_thread = NULL;
2679	}
2680	}
2681
2682	if (!sched->summary_only)
2683	timehist_print_sample(sched, evsel, sample, al: &al, thread, t, state);
2684
2685	out:
2686	if (sched->hist_time.start == `0` && t >= ptime->start)
2687	sched->hist_time.start = t;
2688	if (ptime->end == `0` \|\| t <= ptime->end)
2689	sched->hist_time.end = t;
2690
2691	if (tr) {
2692	/ time of this sched_switch event becomes last time task seen /
2693	tr->last_time = sample->time;
2694
2695	/ last state is used to determine where to account wait time /
2696	tr->last_state = state;
2697
2698	/ sched out event for task so reset ready to run time /
2699	tr->ready_to_run = `0`;
2700	}
2701
2702	evsel__save_time(evsel, timestamp: sample->time, cpu: sample->cpu);
2703
2704	addr_location__exit(al: &al);
2705	return rc;
2706	}
2707
2708	static int timehist_sched_switch_event(struct perf_tool *tool,
2709	union perf_event *event,
2710	struct evsel *evsel,
2711	struct perf_sample *sample,
2712	struct machine *machine __maybe_unused)
2713	{
2714	return timehist_sched_change_event(tool, event, evsel, sample, machine);
2715	}
2716
2717	static int process_lost(struct perf_tool *tool __maybe_unused,
2718	union perf_event *event,
2719	struct perf_sample *sample,
2720	struct machine *machine __maybe_unused)
2721	{
2722	char tstr[`64`];
2723
2724	timestamp__scnprintf_usec(timestamp: sample->time, buf: tstr, sz: sizeof(tstr));
2725	printf("%15s ", tstr);
2726	printf("lost %" PRI_lu64 " events on cpu %d\n", event->lost.lost, sample->cpu);
2727
2728	return `0`;
2729	}
2730
2731
2732	static void print_thread_runtime(struct thread *t,
2733	struct thread_runtime *r)
2734	{
2735	double mean = avg_stats(stats: &r->run_stats);
2736	float stddev;
2737
2738	printf("%*s %5d %9" PRIu64 " ",
2739	comm_width, timehist_get_commstr(t), thread__ppid(t),
2740	(u64) r->run_stats.n);
2741
2742	print_sched_time(nsecs: r->total_run_time, width: `8`);
2743	stddev = rel_stddev_stats(stddev: stddev_stats(stats: &r->run_stats), avg: mean);
2744	print_sched_time(nsecs: r->run_stats.min, width: `6`);
2745	printf(" ");
2746	print_sched_time(nsecs: (u64) mean, width: `6`);
2747	printf(" ");
2748	print_sched_time(nsecs: r->run_stats.max, width: `6`);
2749	printf(" ");
2750	printf("%5.2f", stddev);
2751	printf(" %5" PRIu64, r->migrations);
2752	printf("\n");
2753	}
2754
2755	static void print_thread_waittime(struct thread *t,
2756	struct thread_runtime *r)
2757	{
2758	printf("%*s %5d %9" PRIu64 " ",
2759	comm_width, timehist_get_commstr(t), thread__ppid(t),
2760	(u64) r->run_stats.n);
2761
2762	print_sched_time(nsecs: r->total_run_time, width: `8`);
2763	print_sched_time(nsecs: r->total_sleep_time, width: `6`);
2764	printf(" ");
2765	print_sched_time(nsecs: r->total_iowait_time, width: `6`);
2766	printf(" ");
2767	print_sched_time(nsecs: r->total_preempt_time, width: `6`);
2768	printf(" ");
2769	print_sched_time(nsecs: r->total_delay_time, width: `6`);
2770	printf("\n");
2771	}
2772
/*
 * Totals accumulated by show_thread_runtime() while the summary walks
 * every thread of the machine.
 */
struct total_run_stats {
	struct perf_sched *sched;	/* consulted for show_state */
	u64 sched_count;		/* sum of run_stats.n over counted threads */
	u64 task_count;			/* threads with at least one recorded run */
	u64 total_run_time;		/* sum of total_run_time over counted threads */
};
2779
2780	static int show_thread_runtime(struct thread t, void* *priv)
2781	{
2782	struct total_run_stats *stats = priv;
2783	struct thread_runtime *r;
2784
2785	if (thread__is_filtered(thread: t))
2786	return `0`;
2787
2788	r = thread__priv(thread: t);
2789	if (r && r->run_stats.n) {
2790	stats->task_count++;
2791	stats->sched_count += r->run_stats.n;
2792	stats->total_run_time += r->total_run_time;
2793
2794	if (stats->sched->show_state)
2795	print_thread_waittime(t, r);
2796	else
2797	print_thread_runtime(t, r);
2798	}
2799
2800	return `0`;
2801	}
2802
2803	static size_t callchain__fprintf_folded(FILE fp, struct* callchain_node *node)
2804	{
2805	const char *sep = " <- ";
2806	struct callchain_list *chain;
2807	size_t ret = `0`;
2808	char bf[`1024`];
2809	bool first;
2810
2811	if (node == NULL)
2812	return `0`;
2813
2814	ret = callchain__fprintf_folded(fp, node->parent);
2815	first = (ret == `0`);
2816
2817	list_for_each_entry(chain, &node->val, list) {
2818	if (chain->ip >= PERF_CONTEXT_MAX)
2819	continue;
2820	if (chain->ms.sym && chain->ms.sym->ignore)
2821	continue;
2822	ret += fprintf(fp, "%s%s", first ? "" : sep,
2823	callchain_list__sym_name(cl: chain, bf, bfsize: sizeof(bf),
2824	show_dso: false));
2825	first = false;
2826	}
2827
2828	return ret;
2829	}
2830
2831	static size_t timehist_print_idlehist_callchain(struct rb_root_cached *root)
2832	{
2833	size_t ret = `0`;
2834	FILE *fp = stdout;
2835	struct callchain_node *chain;
2836	struct rb_node *rb_node = rb_first_cached(root);
2837
2838	printf(" %16s %8s %s\n", "Idle time (msec)", "Count", "Callchains");
2839	printf(" %.16s %.8s %.50s\n", graph_dotted_line, graph_dotted_line,
2840	graph_dotted_line);
2841
2842	while (rb_node) {
2843	chain = rb_entry(rb_node, struct callchain_node, rb_node);
2844	rb_node = rb_next(rb_node);
2845
2846	ret += fprintf(fp, " ");
2847	print_sched_time(nsecs: chain->hit, width: `12`);
2848	ret += `16`; / print_sched_time returns 2nd arg + 4 /
2849	ret += fprintf(fp, " %8d ", chain->count);
2850	ret += callchain__fprintf_folded(fp, chain);
2851	ret += fprintf(fp, "\n");
2852	}
2853
2854	return ret;
2855	}
2856
2857	static void timehist_print_summary(struct perf_sched *sched,
2858	struct perf_session *session)
2859	{
2860	struct machine *m = &session->machines.host;
2861	struct total_run_stats totals;
2862	u64 task_count;
2863	struct thread *t;
2864	struct thread_runtime *r;
2865	int i;
2866	u64 hist_time = sched->hist_time.end - sched->hist_time.start;
2867
2868	memset(&totals, `0`, sizeof(totals));
2869	totals.sched = sched;
2870
2871	if (sched->idle_hist) {
2872	printf("\nIdle-time summary\n");
2873	printf("%*s parent sched-out ", comm_width, "comm");
2874	printf(" idle-time min-idle avg-idle max-idle stddev migrations\n");
2875	} else if (sched->show_state) {
2876	printf("\nWait-time summary\n");
2877	printf("%*s parent sched-in ", comm_width, "comm");
2878	printf(" run-time sleep iowait preempt delay\n");
2879	} else {
2880	printf("\nRuntime summary\n");
2881	printf("%*s parent sched-in ", comm_width, "comm");
2882	printf(" run-time min-run avg-run max-run stddev migrations\n");
2883	}
2884	printf("%*s (count) ", comm_width, "");
2885	printf(" (msec) (msec) (msec) (msec) %s\n",
2886	sched->show_state ? "(msec)" : "%");
2887	printf("%.117s\n", graph_dotted_line);
2888
2889	machine__for_each_thread(machine: m, fn: show_thread_runtime, priv: &totals);
2890	task_count = totals.task_count;
2891	if (!task_count)
2892	printf("<no still running tasks>\n");
2893
2894	/ CPU idle stats not tracked when samples were skipped /
2895	if (sched->skipped_samples && !sched->idle_hist)
2896	return;
2897
2898	printf("\nIdle stats:\n");
2899	for (i = `0`; i < idle_max_cpu; ++i) {
2900	if (cpu_list && !test_bit(i, cpu_bitmap))
2901	continue;
2902
2903	t = idle_threads[i];
2904	if (!t)
2905	continue;
2906
2907	r = thread__priv(thread: t);
2908	if (r && r->run_stats.n) {
2909	totals.sched_count += r->run_stats.n;
2910	printf(" CPU %2d idle for ", i);
2911	print_sched_time(nsecs: r->total_run_time, width: `6`);
2912	printf(" msec (%6.2f%%)\n", `100.0` * r->total_run_time / hist_time);
2913	} else
2914	printf(" CPU %2d idle entire time window\n", i);
2915	}
2916
2917	if (sched->idle_hist && sched->show_callchain) {
2918	callchain_param.mode = CHAIN_FOLDED;
2919	callchain_param.value = CCVAL_PERIOD;
2920
2921	callchain_register_param(param: &callchain_param);
2922
2923	printf("\nIdle stats by callchain:\n");
2924	for (i = `0`; i < idle_max_cpu; ++i) {
2925	struct idle_thread_runtime *itr;
2926
2927	t = idle_threads[i];
2928	if (!t)
2929	continue;
2930
2931	itr = thread__priv(thread: t);
2932	if (itr == NULL)
2933	continue;
2934
2935	callchain_param.sort(&itr->sorted_root.rb_root, &itr->callchain,
2936	`0`, &callchain_param);
2937
2938	printf(" CPU %2d:", i);
2939	print_sched_time(nsecs: itr->tr.total_run_time, width: `6`);
2940	printf(" msec\n");
2941	timehist_print_idlehist_callchain(root: &itr->sorted_root);
2942	printf("\n");
2943	}
2944	}
2945
2946	printf("\n"
2947	" Total number of unique tasks: %" PRIu64 "\n"
2948	"Total number of context switches: %" PRIu64 "\n",
2949	totals.task_count, totals.sched_count);
2950
2951	printf(" Total run time (msec): ");
2952	print_sched_time(nsecs: totals.total_run_time, width: `2`);
2953	printf("\n");
2954
2955	printf(" Total scheduling time (msec): ");
2956	print_sched_time(nsecs: hist_time, width: `2`);
2957	printf(" (x %d)\n", sched->max_cpu.cpu);
2958	}
2959
/*
 * Signature of the per-evsel tracepoint handlers that
 * perf_timehist__process_sample() dispatches to through evsel->handler.
 */
typedef int (*sched_handler)(struct perf_tool *tool,
			     union perf_event *event,
			     struct evsel *evsel,
			     struct perf_sample *sample,
			     struct machine *machine);
2965
2966	static int perf_timehist__process_sample(struct perf_tool *tool,
2967	union perf_event *event,
2968	struct perf_sample *sample,
2969	struct evsel *evsel,
2970	struct machine *machine)
2971	{
2972	struct perf_sched sched = container_of(tool, struct* perf_sched, tool);
2973	int err = `0`;
2974	struct perf_cpu this_cpu = {
2975	.cpu = sample->cpu,
2976	};
2977
2978	if (this_cpu.cpu > sched->max_cpu.cpu)
2979	sched->max_cpu = this_cpu;
2980
2981	if (evsel->handler != NULL) {
2982	sched_handler f = evsel->handler;
2983
2984	err = f(tool, event, evsel, sample, machine);
2985	}
2986
2987	return err;
2988	}
2989
2990	static int timehist_check_attr(struct perf_sched *sched,
2991	struct evlist *evlist)
2992	{
2993	struct evsel *evsel;
2994	struct evsel_runtime *er;
2995
2996	list_for_each_entry(evsel, &evlist->core.entries, core.node) {
2997	er = evsel__get_runtime(evsel);
2998	if (er == NULL) {
2999	pr_err("Failed to allocate memory for evsel runtime data\n");
3000	return -`1`;
3001	}
3002
3003	if (sched->show_callchain && !evsel__has_callchain(evsel)) {
3004	pr_info("Samples do not have callchains.\n");
3005	sched->show_callchain = `0`;
3006	symbol_conf.use_callchain = `0`;
3007	}
3008	}
3009
3010	return `0`;
3011	}
3012
3013	static int perf_sched__timehist(struct perf_sched *sched)
3014	{
3015	struct evsel_str_handler handlers[] = {
3016	{ "sched:sched_switch", timehist_sched_switch_event, },
3017	{ "sched:sched_wakeup", timehist_sched_wakeup_event, },
3018	{ "sched:sched_waking", timehist_sched_wakeup_event, },
3019	{ "sched:sched_wakeup_new", timehist_sched_wakeup_event, },
3020	};
3021	const struct evsel_str_handler migrate_handlers[] = {
3022	{ "sched:sched_migrate_task", timehist_migrate_task_event, },
3023	};
3024	struct perf_data data = {
3025	.path = input_name,
3026	.mode = PERF_DATA_MODE_READ,
3027	.force = sched->force,
3028	};
3029
3030	struct perf_session *session;
3031	struct evlist *evlist;
3032	int err = -`1`;
3033
3034	/*
3035	* event handlers for timehist option
3036	*/
3037	sched->tool.sample = perf_timehist__process_sample;
3038	sched->tool.mmap = perf_event__process_mmap;
3039	sched->tool.comm = perf_event__process_comm;
3040	sched->tool.exit = perf_event__process_exit;
3041	sched->tool.fork = perf_event__process_fork;
3042	sched->tool.lost = process_lost;
3043	sched->tool.attr = perf_event__process_attr;
3044	sched->tool.tracing_data = perf_event__process_tracing_data;
3045	sched->tool.build_id = perf_event__process_build_id;
3046
3047	sched->tool.ordered_events = true;
3048	sched->tool.ordering_requires_timestamps = true;
3049
3050	symbol_conf.use_callchain = sched->show_callchain;
3051
3052	session = perf_session__new(data: &data, tool: &sched->tool);
3053	if (IS_ERR(ptr: session))
3054	return PTR_ERR(ptr: session);
3055
3056	if (cpu_list) {
3057	err = perf_session__cpu_bitmap(session, cpu_list, cpu_bitmap: cpu_bitmap);
3058	if (err < `0`)
3059	goto out;
3060	}
3061
3062	evlist = session->evlist;
3063
3064	symbol__init(env: &session->header.env);
3065
3066	if (perf_time__parse_str(ptime: &sched->ptime, ostr: sched->time_str) != `0`) {
3067	pr_err("Invalid time string\n");
3068	return -EINVAL;
3069	}
3070
3071	if (timehist_check_attr(sched, evlist) != `0`)
3072	goto out;
3073
3074	setup_pager();
3075
3076	/ prefer sched_waking if it is captured /
3077	if (evlist__find_tracepoint_by_name(evlist: session->evlist, name: "sched:sched_waking"))
3078	handlers[`1`].handler = timehist_sched_wakeup_ignore;
3079
3080	/ setup per-evsel handlers /
3081	if (perf_session__set_tracepoints_handlers(session, handlers))
3082	goto out;
3083
3084	/ sched_switch event at a minimum needs to exist /
3085	if (!evlist__find_tracepoint_by_name(evlist: session->evlist, name: "sched:sched_switch")) {
3086	pr_err("No sched_switch events found. Have you run 'perf sched record'?\n");
3087	goto out;
3088	}
3089
3090	if (sched->show_migrations &&
3091	perf_session__set_tracepoints_handlers(session, migrate_handlers))
3092	goto out;
3093
3094	/ pre-allocate struct for per-CPU idle stats /
3095	sched->max_cpu.cpu = session->header.env.nr_cpus_online;
3096	if (sched->max_cpu.cpu == `0`)
3097	sched->max_cpu.cpu = `4`;
3098	if (init_idle_threads(ncpu: sched->max_cpu.cpu))
3099	goto out;
3100
3101	/ summary_only implies summary option, but don't overwrite summary if set /
3102	if (sched->summary_only)
3103	sched->summary = sched->summary_only;
3104
3105	if (!sched->summary_only)
3106	timehist_header(sched);
3107
3108	err = perf_session__process_events(session);
3109	if (err) {
3110	pr_err("Failed to process events, error %d", err);
3111	goto out;
3112	}
3113
3114	sched->nr_events = evlist->stats.nr_events[`0`];
3115	sched->nr_lost_events = evlist->stats.total_lost;
3116	sched->nr_lost_chunks = evlist->stats.nr_events[PERF_RECORD_LOST];
3117
3118	if (sched->summary)
3119	timehist_print_summary(sched, session);
3120
3121	out:
3122	free_idle_threads();
3123	perf_session__delete(session);
3124
3125	return err;
3126	}
3127
3128
3129	static void print_bad_events(struct perf_sched *sched)
3130	{
3131	if (sched->nr_unordered_timestamps && sched->nr_timestamps) {
3132	printf(" INFO: %.3f%% unordered timestamps (%ld out of %ld)\n",
3133	(double)sched->nr_unordered_timestamps/(double)sched->nr_timestamps*`100.0`,
3134	sched->nr_unordered_timestamps, sched->nr_timestamps);
3135	}
3136	if (sched->nr_lost_events && sched->nr_events) {
3137	printf(" INFO: %.3f%% lost events (%ld out of %ld, in %ld chunks)\n",
3138	(double)sched->nr_lost_events/(double)sched->nr_events * `100.0`,
3139	sched->nr_lost_events, sched->nr_events, sched->nr_lost_chunks);
3140	}
3141	if (sched->nr_context_switch_bugs && sched->nr_timestamps) {
3142	printf(" INFO: %.3f%% context switch bugs (%ld out of %ld)",
3143	(double)sched->nr_context_switch_bugs/(double)sched->nr_timestamps*`100.0`,
3144	sched->nr_context_switch_bugs, sched->nr_timestamps);
3145	if (sched->nr_lost_events)
3146	printf(" (due to lost events?)");
3147	printf("\n");
3148	}
3149	}
3150
3151	static void __merge_work_atoms(struct rb_root_cached root, struct* work_atoms *data)
3152	{
3153	struct rb_node *new = &(root->rb_root.rb_node), parent = NULL;
3154	struct work_atoms *this;
3155	const char comm = thread__comm_str(thread: data->thread), this_comm;
3156	bool leftmost = true;
3157
3158	while (*new) {
3159	int cmp;
3160
3161	this = container_of(new, struct* work_atoms, node);
3162	parent = *new;
3163
3164	this_comm = thread__comm_str(thread: this->thread);
3165	cmp = strcmp(comm, this_comm);
3166	if (cmp > `0`) {
3167	new = &((*new)->rb_left);
3168	} else if (cmp < `0`) {
3169	new = &((*new)->rb_right);
3170	leftmost = false;
3171	} else {
3172	this->num_merged++;
3173	this->total_runtime += data->total_runtime;
3174	this->nb_atoms += data->nb_atoms;
3175	this->total_lat += data->total_lat;
3176	list_splice(list: &data->work_list, head: &this->work_list);
3177	if (this->max_lat < data->max_lat) {
3178	this->max_lat = data->max_lat;
3179	this->max_lat_start = data->max_lat_start;
3180	this->max_lat_end = data->max_lat_end;
3181	}
3182	zfree(&data);
3183	return;
3184	}
3185	}
3186
3187	data->num_merged++;
3188	rb_link_node(node: &data->node, parent, rb_link: new);
3189	rb_insert_color_cached(node: &data->node, root, leftmost);
3190	}
3191
3192	static void perf_sched__merge_lat(struct perf_sched *sched)
3193	{
3194	struct work_atoms *data;
3195	struct rb_node *node;
3196
3197	if (sched->skip_merge)
3198	return;
3199
3200	while ((node = rb_first_cached(&sched->atom_root))) {
3201	rb_erase_cached(node, root: &sched->atom_root);
3202	data = rb_entry(node, struct work_atoms, node);
3203	__merge_work_atoms(root: &sched->merged_atom_root, data);
3204	}
3205	}
3206
3207	static int perf_sched__lat(struct perf_sched *sched)
3208	{
3209	struct rb_node *next;
3210
3211	setup_pager();
3212
3213	if (perf_sched__read_events(sched))
3214	return -`1`;
3215
3216	perf_sched__merge_lat(sched);
3217	perf_sched__sort_lat(sched);
3218
3219	printf("\n -------------------------------------------------------------------------------------------------------------------------------------------\n");
3220	printf(" Task \| Runtime ms \| Switches \| Avg delay ms \| Max delay ms \| Max delay start \| Max delay end \|\n");
3221	printf(" -------------------------------------------------------------------------------------------------------------------------------------------\n");
3222
3223	next = rb_first_cached(&sched->sorted_atom_root);
3224
3225	while (next) {
3226	struct work_atoms *work_list;
3227
3228	work_list = rb_entry(next, struct work_atoms, node);
3229	output_lat_thread(sched, work_list);
3230	next = rb_next(next);
3231	thread__zput(work_list->thread);
3232	}
3233
3234	printf(" -----------------------------------------------------------------------------------------------------------------\n");
3235	printf(" TOTAL: \|%11.3f ms \|%9" PRIu64 " \|\n",
3236	(double)sched->all_runtime / NSEC_PER_MSEC, sched->all_count);
3237
3238	printf(" ---------------------------------------------------\n");
3239
3240	print_bad_events(sched);
3241	printf("\n");
3242
3243	return `0`;
3244	}
3245
3246	static int setup_map_cpus(struct perf_sched *sched)
3247	{
3248	struct perf_cpu_map *map;
3249
3250	sched->max_cpu.cpu = sysconf(_SC_NPROCESSORS_CONF);
3251
3252	if (sched->map.comp) {
3253	sched->map.comp_cpus = zalloc(sched->max_cpu.cpu * sizeof(int));
3254	if (!sched->map.comp_cpus)
3255	return -`1`;
3256	}
3257
3258	if (!sched->map.cpus_str)
3259	return `0`;
3260
3261	map = perf_cpu_map__new(sched->map.cpus_str);
3262	if (!map) {
3263	pr_err("failed to get cpus map from %s\n", sched->map.cpus_str);
3264	return -`1`;
3265	}
3266
3267	sched->map.cpus = map;
3268	return `0`;
3269	}
3270
3271	static int setup_color_pids(struct perf_sched *sched)
3272	{
3273	struct perf_thread_map *map;
3274
3275	if (!sched->map.color_pids_str)
3276	return `0`;
3277
3278	map = thread_map__new_by_tid_str(tid_str: sched->map.color_pids_str);
3279	if (!map) {
3280	pr_err("failed to get thread map from %s\n", sched->map.color_pids_str);
3281	return -`1`;
3282	}
3283
3284	sched->map.color_pids = map;
3285	return `0`;
3286	}
3287
3288	static int setup_color_cpus(struct perf_sched *sched)
3289	{
3290	struct perf_cpu_map *map;
3291
3292	if (!sched->map.color_cpus_str)
3293	return `0`;
3294
3295	map = perf_cpu_map__new(sched->map.color_cpus_str);
3296	if (!map) {
3297	pr_err("failed to get thread map from %s\n", sched->map.color_cpus_str);
3298	return -`1`;
3299	}
3300
3301	sched->map.color_cpus = map;
3302	return `0`;
3303	}
3304
/*
 * Implementation of 'perf sched map': set up the cpu and colour maps,
 * read the events and report the bad-event statistics.
 *
 * Returns 0 on success, -1 when a setup step or reading the events fails.
 */
static int perf_sched__map(struct perf_sched *sched)
{
	/* the steps run in order; the first failure stops the chain */
	if (setup_map_cpus(sched) ||
	    setup_color_pids(sched) ||
	    setup_color_cpus(sched))
		return -1;

	setup_pager();

	if (perf_sched__read_events(sched))
		return -1;

	print_bad_events(sched);
	return 0;
}
3322
3323	static int perf_sched__replay(struct perf_sched *sched)
3324	{
3325	unsigned long i;
3326
3327	calibrate_run_measurement_overhead(sched);
3328	calibrate_sleep_measurement_overhead(sched);
3329
3330	test_calibrations(sched);
3331
3332	if (perf_sched__read_events(sched))
3333	return -`1`;
3334
3335	printf("nr_run_events: %ld\n", sched->nr_run_events);
3336	printf("nr_sleep_events: %ld\n", sched->nr_sleep_events);
3337	printf("nr_wakeup_events: %ld\n", sched->nr_wakeup_events);
3338
3339	if (sched->targetless_wakeups)
3340	printf("target-less wakeups: %ld\n", sched->targetless_wakeups);
3341	if (sched->multitarget_wakeups)
3342	printf("multi-target wakeups: %ld\n", sched->multitarget_wakeups);
3343	if (sched->nr_run_events_optimized)
3344	printf("run atoms optimized: %ld\n",
3345	sched->nr_run_events_optimized);
3346
3347	print_task_traces(sched);
3348	add_cross_task_wakeups(sched);
3349
3350	sched->thread_funcs_exit = false;
3351	create_tasks(sched);
3352	printf("------------------------------------------------------------\n");
3353	for (i = `0`; i < sched->replay_repeat; i++)
3354	run_one_test(sched);
3355
3356	sched->thread_funcs_exit = true;
3357	destroy_tasks(sched);
3358	return `0`;
3359	}
3360
3361	static void setup_sorting(struct perf_sched sched, const* struct option *options,
3362	const char * const usage_msg[])
3363	{
3364	char tmp, tok, *str = strdup(sched->sort_order);
3365
3366	for (tok = strtok_r(str, ", ", &tmp);
3367	tok; tok = strtok_r(NULL, ", ", &tmp)) {
3368	if (sort_dimension__add(tok, list: &sched->sort_list) < `0`) {
3369	usage_with_options_msg(usage_msg, options,
3370	"Unknown --sort key: `%s'", tok);
3371	}
3372	}
3373
3374	free(str);
3375
3376	sort_dimension__add(tok: "pid", list: &sched->cmp_pid);
3377	}
3378
/*
 * Report whether the schedstat tracepoints are exposed.
 *
 * Returns true when the "sched:sched_stat_wait" format can be looked up,
 * false when the lookup yields an error pointer.
 */
static bool schedstat_events_exposed(void)
{
	/*
	 * Select "sched:sched_stat_wait" event to check
	 * whether schedstat tracepoints are exposed.
	 */
	return !IS_ERR(trace_event__tp_format("sched", "sched_stat_wait"));
}
3388
/*
 * Implement 'perf sched record' by building an argument vector for
 * cmd_record() and invoking it.
 *
 * The vector consists of the fixed record arguments, one wakeup tracepoint
 * (sched_waking when available, sched_wakeup otherwise), the schedstat
 * tracepoints when they are exposed, and finally the caller's own arguments
 * (argv[1] onwards).
 *
 * Returns the result of cmd_record(), or -ENOMEM if an allocation fails.
 */
static int __cmd_record(int argc, const char **argv)
{
	unsigned int rec_argc, i, j;
	char **rec_argv;
	const char **rec_argv_copy;
	const char * const record_args[] = {
		"record",
		"-a",
		"-R",
		"-m", "1024",
		"-c", "1",
		"-e", "sched:sched_switch",
		"-e", "sched:sched_stat_runtime",
		"-e", "sched:sched_process_fork",
		"-e", "sched:sched_wakeup_new",
		"-e", "sched:sched_migrate_task",
	};

	/*
	 * The tracepoints trace_sched_stat_{wait, sleep, iowait}
	 * are not exposed to user if CONFIG_SCHEDSTATS is not set,
	 * to prevent "perf sched record" execution failure, determine
	 * whether to record schedstat events according to actual situation.
	 */
	const char * const schedstat_args[] = {
		"-e", "sched:sched_stat_wait",
		"-e", "sched:sched_stat_sleep",
		"-e", "sched:sched_stat_iowait",
	};
	unsigned int schedstat_argc = schedstat_events_exposed() ?
		ARRAY_SIZE(schedstat_args) : 0;

	struct tep_event *waking_event;
	int ret;

	/*
	 * +2 for either "-e", "sched:sched_wakeup" or
	 * "-e", "sched:sched_waking"
	 */
	rec_argc = ARRAY_SIZE(record_args) + 2 + schedstat_argc + argc - 1;
	rec_argv = calloc(rec_argc + 1, sizeof(char *));
	if (rec_argv == NULL)
		return -ENOMEM;
	rec_argv_copy = calloc(rec_argc + 1, sizeof(char *));
	if (rec_argv_copy == NULL) {
		free(rec_argv);
		return -ENOMEM;
	}

	for (i = 0; i < ARRAY_SIZE(record_args); i++)
		rec_argv[i] = strdup(record_args[i]);

	rec_argv[i++] = strdup("-e");
	waking_event = trace_event__tp_format("sched", "sched_waking");
	if (!IS_ERR(waking_event))
		rec_argv[i++] = strdup("sched:sched_waking");
	else
		rec_argv[i++] = strdup("sched:sched_wakeup");

	for (j = 0; j < schedstat_argc; j++)
		rec_argv[i++] = strdup(schedstat_args[j]);

	for (j = 1; j < (unsigned int)argc; j++, i++)
		rec_argv[i] = strdup(argv[j]);

	BUG_ON(i != rec_argc);

	/*
	 * A failed strdup() would leave a NULL in the middle of the vector and
	 * silently truncate the argument list, so treat it as an error.
	 */
	ret = 0;
	for (i = 0; i < rec_argc; i++) {
		if (rec_argv[i] == NULL) {
			ret = -ENOMEM;
			break;
		}
	}

	if (!ret) {
		/*
		 * cmd_record() gets a copy of the pointer array; the strings are
		 * released through rec_argv afterwards.  NOTE(review): presumably
		 * the callee may rearrange its argv - confirm.
		 */
		memcpy(rec_argv_copy, rec_argv, sizeof(char *) * rec_argc);
		ret = cmd_record(rec_argc, rec_argv_copy);
	}

	for (i = 0; i < rec_argc; i++)
		free(rec_argv[i]);
	free(rec_argv);
	free(rec_argv_copy);

	return ret;
}
3466
3467	int cmd_sched(int argc, const char **argv)
3468	{
3469	static const char default_sort_order[] = "avg, max, switch, runtime";
3470	struct perf_sched sched = {
3471	.tool = {
3472	.sample = perf_sched__process_tracepoint_sample,
3473	.comm = perf_sched__process_comm,
3474	.namespaces = perf_event__process_namespaces,
3475	.lost = perf_event__process_lost,
3476	.fork = perf_sched__process_fork_event,
3477	.ordered_events = true,
3478	},
3479	.cmp_pid = LIST_HEAD_INIT(sched.cmp_pid),
3480	.sort_list = LIST_HEAD_INIT(sched.sort_list),
3481	.sort_order = default_sort_order,
3482	.replay_repeat = `10`,
3483	.profile_cpu = -`1`,
3484	.next_shortname1 = `'A'`,
3485	.next_shortname2 = `'0'`,
3486	.skip_merge = `0`,
3487	.show_callchain = `1`,
3488	.max_stack = `5`,
3489	};
3490	const struct option sched_options[] = {
3491	OPT_STRING(`'i'`, "input", &input_name, "file",
3492	"input file name"),
3493	OPT_INCR(`'v'`, "verbose", &verbose,
3494	"be more verbose (show symbol address, etc)"),
3495	OPT_BOOLEAN(`'D'`, "dump-raw-trace", &dump_trace,
3496	"dump raw trace in ASCII"),
3497	OPT_BOOLEAN(`'f'`, "force", &sched.force, "don't complain, do it"),
3498	OPT_END()
3499	};
3500	const struct option latency_options[] = {
3501	OPT_STRING(`'s'`, "sort", &sched.sort_order, "key[,key2...]",
3502	"sort by key(s): runtime, switch, avg, max"),
3503	OPT_INTEGER(`'C'`, "CPU", &sched.profile_cpu,
3504	"CPU to profile on"),
3505	OPT_BOOLEAN(`'p'`, "pids", &sched.skip_merge,
3506	"latency stats per pid instead of per comm"),
3507	OPT_PARENT(sched_options)
3508	};
3509	const struct option replay_options[] = {
3510	OPT_UINTEGER(`'r'`, "repeat", &sched.replay_repeat,
3511	"repeat the workload replay N times (-1: infinite)"),
3512	OPT_PARENT(sched_options)
3513	};
3514	const struct option map_options[] = {
3515	OPT_BOOLEAN(`0`, "compact", &sched.map.comp,
3516	"map output in compact mode"),
3517	OPT_STRING(`0`, "color-pids", &sched.map.color_pids_str, "pids",
3518	"highlight given pids in map"),
3519	OPT_STRING(`0`, "color-cpus", &sched.map.color_cpus_str, "cpus",
3520	"highlight given CPUs in map"),
3521	OPT_STRING(`0`, "cpus", &sched.map.cpus_str, "cpus",
3522	"display given CPUs in map"),
3523	OPT_PARENT(sched_options)
3524	};
3525	const struct option timehist_options[] = {
3526	OPT_STRING(`'k'`, "vmlinux", &symbol_conf.vmlinux_name,
3527	"file", "vmlinux pathname"),
3528	OPT_STRING(`0`, "kallsyms", &symbol_conf.kallsyms_name,
3529	"file", "kallsyms pathname"),
3530	OPT_BOOLEAN(`'g'`, "call-graph", &sched.show_callchain,
3531	"Display call chains if present (default on)"),
3532	OPT_UINTEGER(`0`, "max-stack", &sched.max_stack,
3533	"Maximum number of functions to display backtrace."),
3534	OPT_STRING(`0`, "symfs", &symbol_conf.symfs, "directory",
3535	"Look for files with symbols relative to this directory"),
3536	OPT_BOOLEAN(`'s'`, "summary", &sched.summary_only,
3537	"Show only syscall summary with statistics"),
3538	OPT_BOOLEAN(`'S'`, "with-summary", &sched.summary,
3539	"Show all syscalls and summary with statistics"),
3540	OPT_BOOLEAN(`'w'`, "wakeups", &sched.show_wakeups, "Show wakeup events"),
3541	OPT_BOOLEAN(`'n'`, "next", &sched.show_next, "Show next task"),
3542	OPT_BOOLEAN(`'M'`, "migrations", &sched.show_migrations, "Show migration events"),
3543	OPT_BOOLEAN(`'V'`, "cpu-visual", &sched.show_cpu_visual, "Add CPU visual"),
3544	OPT_BOOLEAN(`'I'`, "idle-hist", &sched.idle_hist, "Show idle events only"),
3545	OPT_STRING(`0`, "time", &sched.time_str, "str",
3546	"Time span for analysis (start,stop)"),
3547	OPT_BOOLEAN(`0`, "state", &sched.show_state, "Show task state when sched-out"),
3548	OPT_STRING(`'p'`, "pid", &symbol_conf.pid_list_str, "pid[,pid...]",
3549	"analyze events only for given process id(s)"),
3550	OPT_STRING(`'t'`, "tid", &symbol_conf.tid_list_str, "tid[,tid...]",
3551	"analyze events only for given thread id(s)"),
3552	OPT_STRING(`'C'`, "cpu", &cpu_list, "cpu", "list of cpus to profile"),
3553	OPT_PARENT(sched_options)
3554	};
3555
3556	const char * const latency_usage[] = {
3557	"perf sched latency [<options>]",
3558	NULL
3559	};
3560	const char * const replay_usage[] = {
3561	"perf sched replay [<options>]",
3562	NULL
3563	};
3564	const char * const map_usage[] = {
3565	"perf sched map [<options>]",
3566	NULL
3567	};
3568	const char * const timehist_usage[] = {
3569	"perf sched timehist [<options>]",
3570	NULL
3571	};
3572	const char *const sched_subcommands[] = { "record", "latency", "map",
3573	"replay", "script",
3574	"timehist", NULL };
3575	const char *sched_usage[] = {
3576	NULL,
3577	NULL
3578	};
3579	struct trace_sched_handler lat_ops = {
3580	.wakeup_event = latency_wakeup_event,
3581	.switch_event = latency_switch_event,
3582	.runtime_event = latency_runtime_event,
3583	.migrate_task_event = latency_migrate_task_event,
3584	};
3585	struct trace_sched_handler map_ops = {
3586	.switch_event = map_switch_event,
3587	};
3588	struct trace_sched_handler replay_ops = {
3589	.wakeup_event = replay_wakeup_event,
3590	.switch_event = replay_switch_event,
3591	.fork_event = replay_fork_event,
3592	};
3593	unsigned int i;
3594	int ret = `0`;
3595
3596	mutex_init(&sched.start_work_mutex);
3597	mutex_init(&sched.work_done_wait_mutex);
3598	sched.curr_thread = calloc(MAX_CPUS, sizeof(*sched.curr_thread));
3599	if (!sched.curr_thread) {
3600	ret = -ENOMEM;
3601	goto out;
3602	}
3603	sched.cpu_last_switched = calloc(MAX_CPUS, sizeof(*sched.cpu_last_switched));
3604	if (!sched.cpu_last_switched) {
3605	ret = -ENOMEM;
3606	goto out;
3607	}
3608	sched.curr_pid = malloc(MAX_CPUS * sizeof(*sched.curr_pid));
3609	if (!sched.curr_pid) {
3610	ret = -ENOMEM;
3611	goto out;
3612	}
3613	for (i = `0`; i < MAX_CPUS; i++)
3614	sched.curr_pid[i] = -`1`;
3615
3616	argc = parse_options_subcommand(argc, argv, sched_options, sched_subcommands,
3617	sched_usage, PARSE_OPT_STOP_AT_NON_OPTION);
3618	if (!argc)
3619	usage_with_options(sched_usage, sched_options);
3620
3621	/*
3622	* Aliased to 'perf script' for now:
3623	*/
3624	if (!strcmp(argv[`0`], "script")) {
3625	ret = cmd_script(argc, argv);
3626	} else if (strlen(argv[`0`]) > `2` && strstarts(str: "record", prefix: argv[`0`])) {
3627	ret = __cmd_record(argc, argv);
3628	} else if (strlen(argv[`0`]) > `2` && strstarts(str: "latency", prefix: argv[`0`])) {
3629	sched.tp_handler = &lat_ops;
3630	if (argc > `1`) {
3631	argc = parse_options(argc, argv, latency_options, latency_usage, `0`);
3632	if (argc)
3633	usage_with_options(latency_usage, latency_options);
3634	}
3635	setup_sorting(sched: &sched, options: latency_options, usage_msg: latency_usage);
3636	ret = perf_sched__lat(sched: &sched);
3637	} else if (!strcmp(argv[`0`], "map")) {
3638	if (argc) {
3639	argc = parse_options(argc, argv, map_options, map_usage, `0`);
3640	if (argc)
3641	usage_with_options(map_usage, map_options);
3642	}
3643	sched.tp_handler = &map_ops;
3644	setup_sorting(sched: &sched, options: latency_options, usage_msg: latency_usage);
3645	ret = perf_sched__map(sched: &sched);
3646	} else if (strlen(argv[`0`]) > `2` && strstarts(str: "replay", prefix: argv[`0`])) {
3647	sched.tp_handler = &replay_ops;
3648	if (argc) {
3649	argc = parse_options(argc, argv, replay_options, replay_usage, `0`);
3650	if (argc)
3651	usage_with_options(replay_usage, replay_options);
3652	}
3653	ret = perf_sched__replay(sched: &sched);
3654	} else if (!strcmp(argv[`0`], "timehist")) {
3655	if (argc) {
3656	argc = parse_options(argc, argv, timehist_options,
3657	timehist_usage, `0`);
3658	if (argc)
3659	usage_with_options(timehist_usage, timehist_options);
3660	}
3661	if ((sched.show_wakeups \|\| sched.show_next) &&
3662	sched.summary_only) {
3663	pr_err(" Error: -s and -[n\|w] are mutually exclusive.\n");
3664	parse_options_usage(timehist_usage, timehist_options, "s", true);
3665	if (sched.show_wakeups)
3666	parse_options_usage(NULL, timehist_options, "w", true);
3667	if (sched.show_next)
3668	parse_options_usage(NULL, timehist_options, "n", true);
3669	ret = -EINVAL;
3670	goto out;
3671	}
3672	ret = symbol__validate_sym_arguments();
3673	if (ret)
3674	goto out;
3675
3676	ret = perf_sched__timehist(sched: &sched);
3677	} else {
3678	usage_with_options(sched_usage, sched_options);
3679	}
3680
3681	out:
3682	free(sched.curr_pid);
3683	free(sched.cpu_last_switched);
3684	free(sched.curr_thread);
3685	mutex_destroy(mtx: &sched.start_work_mutex);
3686	mutex_destroy(mtx: &sched.work_done_wait_mutex);
3687
3688	return ret;
3689	}
3690

/* source code of linux/tools/perf/builtin-sched.c */