// SPDX-License-Identifier: GPL-2.0

/* Copyright (c) 2019 Facebook */

#include <assert.h>
#include <limits.h>
#include <unistd.h>
#include <sys/file.h>
#include <sys/time.h>
#include <linux/err.h>
#include <linux/zalloc.h>
#include <api/fs/fs.h>
#include <perf/bpf_perf.h>

#include "bpf_counter.h"
#include "bpf-utils.h"
#include "counts.h"
#include "debug.h"
#include "evsel.h"
#include "evlist.h"
#include "target.h"
#include "cgroup.h"
#include "cpumap.h"
#include "thread_map.h"

#include "bpf_skel/bpf_prog_profiler.skel.h"
#include "bpf_skel/bperf_u.h"
#include "bpf_skel/bperf_leader.skel.h"
#include "bpf_skel/bperf_follower.skel.h"

#define ATTR_MAP_SIZE 16

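/*
 * Fields like bpf_prog_info::func_info carry kernel-provided addresses as
 * __u64; convert such a value back into a usable pointer.
 */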
static inline void *u64_to_ptr(__u64 ptr)
{
	return (void *)(unsigned long)ptr;
}

static struct bpf_counter *bpf_counter_alloc(void)
{
	struct bpf_counter *counter;

	counter = zalloc(sizeof(*counter));
	if (counter)
		INIT_LIST_HEAD(&counter->list);
	return counter;
}

static int bpf_program_profiler__destroy(struct evsel *evsel)
{
	struct bpf_counter *counter, *tmp;

	list_for_each_entry_safe(counter, tmp,
				 &evsel->bpf_counter_list, list) {
		list_del_init(&counter->list);
		bpf_prog_profiler_bpf__destroy(counter->skel);
		free(counter);
	}
	assert(list_empty(&evsel->bpf_counter_list));

	return 0;
}

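/*
 * Use the target program's BTF to look up the name of its entry function.
 * The name is needed as the attach target for the profiler's fentry/fexit
 * programs. Returns a strdup()ed string that the caller must free, or NULL
 * on failure.
 */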
static char *bpf_target_prog_name(int tgt_fd)
{
	struct bpf_func_info *func_info;
	struct perf_bpil *info_linear;
	const struct btf_type *t;
	struct btf *btf = NULL;
	char *name = NULL;

	info_linear = get_bpf_prog_info_linear(tgt_fd, 1UL << PERF_BPIL_FUNC_INFO);
	if (IS_ERR_OR_NULL(info_linear)) {
		pr_debug("failed to get info_linear for prog FD %d\n", tgt_fd);
		return NULL;
	}

	if (info_linear->info.btf_id == 0) {
		pr_debug("prog FD %d doesn't have valid btf\n", tgt_fd);
		goto out;
	}

	btf = btf__load_from_kernel_by_id(info_linear->info.btf_id);
	if (libbpf_get_error(btf)) {
		pr_debug("failed to load btf for prog FD %d\n", tgt_fd);
		goto out;
	}

	func_info = u64_to_ptr(info_linear->info.func_info);
	t = btf__type_by_id(btf, func_info[0].type_id);
	if (!t) {
		pr_debug("btf %d doesn't have type %d\n",
			 info_linear->info.btf_id, func_info[0].type_id);
		goto out;
	}
	name = strdup(btf__name_by_offset(btf, t->name_off));
out:
	btf__free(btf);
	free(info_linear);
	return name;
}

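/*
 * Load one profiler skeleton for the BPF program identified by prog_id,
 * point its fentry/fexit programs at that program's entry function, and add
 * the resulting bpf_counter to evsel->bpf_counter_list.
 */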
static int bpf_program_profiler_load_one(struct evsel *evsel, u32 prog_id)
{
	struct bpf_prog_profiler_bpf *skel;
	struct bpf_counter *counter;
	struct bpf_program *prog;
	char *prog_name = NULL;
	int prog_fd;
	int err;

	prog_fd = bpf_prog_get_fd_by_id(prog_id);
	if (prog_fd < 0) {
		pr_err("Failed to open fd for bpf prog %u\n", prog_id);
		return -1;
	}
	counter = bpf_counter_alloc();
	if (!counter) {
		close(prog_fd);
		return -1;
	}

	skel = bpf_prog_profiler_bpf__open();
	if (!skel) {
		pr_err("Failed to open bpf skeleton\n");
		goto err_out;
	}

	skel->rodata->num_cpu = evsel__nr_cpus(evsel);

	bpf_map__set_max_entries(skel->maps.events, evsel__nr_cpus(evsel));
	bpf_map__set_max_entries(skel->maps.fentry_readings, 1);
	bpf_map__set_max_entries(skel->maps.accum_readings, 1);

	prog_name = bpf_target_prog_name(prog_fd);
	if (!prog_name) {
		pr_err("Failed to get program name for bpf prog %u. Does it have BTF?\n", prog_id);
		goto err_out;
	}

	bpf_object__for_each_program(prog, skel->obj) {
		err = bpf_program__set_attach_target(prog, prog_fd, prog_name);
		if (err) {
			pr_err("bpf_program__set_attach_target failed.\n"
			       "Does bpf prog %u have BTF?\n", prog_id);
			goto err_out;
		}
	}
	set_max_rlimit();
	err = bpf_prog_profiler_bpf__load(skel);
	if (err) {
		pr_err("bpf_prog_profiler_bpf__load failed\n");
		goto err_out;
	}

	assert(skel != NULL);
	counter->skel = skel;
	list_add(&counter->list, &evsel->bpf_counter_list);
	free(prog_name);
	close(prog_fd);
	return 0;
err_out:
	bpf_prog_profiler_bpf__destroy(skel);
	free(prog_name);
	free(counter);
	close(prog_fd);
	return -1;
}

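/*
 * target->bpf_str is a comma-separated list of BPF program IDs; load one
 * profiler instance per ID.
 */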
static int bpf_program_profiler__load(struct evsel *evsel, struct target *target)
{
	char *bpf_str, *bpf_str_, *tok, *saveptr = NULL, *p;
	u32 prog_id;
	int ret;

	bpf_str_ = bpf_str = strdup(target->bpf_str);
	if (!bpf_str)
		return -1;

	while ((tok = strtok_r(bpf_str, ",", &saveptr)) != NULL) {
		prog_id = strtoul(tok, &p, 10);
		if (prog_id == 0 || prog_id == UINT_MAX ||
		    (*p != '\0' && *p != ',')) {
			pr_err("Failed to parse bpf prog ids %s\n",
			       target->bpf_str);
			free(bpf_str_);
			return -1;
		}

		ret = bpf_program_profiler_load_one(evsel, prog_id);
		if (ret) {
			bpf_program_profiler__destroy(evsel);
			free(bpf_str_);
			return -1;
		}
		bpf_str = NULL;
	}
	free(bpf_str_);
	return 0;
}

static int bpf_program_profiler__enable(struct evsel *evsel)
{
	struct bpf_counter *counter;
	int ret;

	list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
		assert(counter->skel != NULL);
		ret = bpf_prog_profiler_bpf__attach(counter->skel);
		if (ret) {
			bpf_program_profiler__destroy(evsel);
			return ret;
		}
	}
	return 0;
}

static int bpf_program_profiler__disable(struct evsel *evsel)
{
	struct bpf_counter *counter;

	list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
		assert(counter->skel != NULL);
		bpf_prog_profiler_bpf__detach(counter->skel);
	}
	return 0;
}

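/*
 * Sum accum_readings from every profiler instance into evsel->counts,
 * translating BPF's possible-CPU indexing into the evsel's cpu map indexes.
 */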
static int bpf_program_profiler__read(struct evsel *evsel)
{
	// BPF_MAP_TYPE_PERCPU_ARRAY uses /sys/devices/system/cpu/possible
	// Sometimes possible > online, like on a Ryzen 3900X that has 24
	// threads but its possible showed 0-31 -acme
	int num_cpu_bpf = libbpf_num_possible_cpus();
	struct bpf_perf_event_value values[num_cpu_bpf];
	struct bpf_counter *counter;
	struct perf_counts_values *counts;
	int reading_map_fd;
	__u32 key = 0;
	int err, idx, bpf_cpu;

	if (list_empty(&evsel->bpf_counter_list))
		return -EAGAIN;

	perf_cpu_map__for_each_idx(idx, evsel__cpus(evsel)) {
		counts = perf_counts(evsel->counts, idx, 0);
		counts->val = 0;
		counts->ena = 0;
		counts->run = 0;
	}
	list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
		struct bpf_prog_profiler_bpf *skel = counter->skel;

		assert(skel != NULL);
		reading_map_fd = bpf_map__fd(skel->maps.accum_readings);

		err = bpf_map_lookup_elem(reading_map_fd, &key, values);
		if (err) {
			pr_err("failed to read value\n");
			return err;
		}

		for (bpf_cpu = 0; bpf_cpu < num_cpu_bpf; bpf_cpu++) {
			idx = perf_cpu_map__idx(evsel__cpus(evsel),
						(struct perf_cpu){.cpu = bpf_cpu});
			if (idx == -1)
				continue;
			counts = perf_counts(evsel->counts, idx, 0);
			counts->val += values[bpf_cpu].counter;
			counts->ena += values[bpf_cpu].enabled;
			counts->run += values[bpf_cpu].running;
		}
	}
	return 0;
}

static int bpf_program_profiler__install_pe(struct evsel *evsel, int cpu_map_idx,
					    int fd)
{
	struct bpf_prog_profiler_bpf *skel;
	struct bpf_counter *counter;
	int ret;

	list_for_each_entry(counter, &evsel->bpf_counter_list, list) {
		skel = counter->skel;
		assert(skel != NULL);

		ret = bpf_map_update_elem(bpf_map__fd(skel->maps.events),
					  &cpu_map_idx, &fd, BPF_ANY);
		if (ret)
			return ret;
	}
	return 0;
}

struct bpf_counter_ops bpf_program_profiler_ops = {
	.load = bpf_program_profiler__load,
	.enable = bpf_program_profiler__enable,
	.disable = bpf_program_profiler__disable,
	.read = bpf_program_profiler__read,
	.destroy = bpf_program_profiler__destroy,
	.install_pe = bpf_program_profiler__install_pe,
};

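/*
 * A pinned attr map left behind by a different perf binary may use another
 * layout; only reuse it if the key and value sizes match what we expect.
 */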
static bool bperf_attr_map_compatible(int attr_map_fd)
{
	struct bpf_map_info map_info = {0};
	__u32 map_info_len = sizeof(map_info);
	int err;

	err = bpf_obj_get_info_by_fd(attr_map_fd, &map_info, &map_info_len);

	if (err)
		return false;
	return (map_info.key_size == sizeof(struct perf_event_attr)) &&
		(map_info.value_size == sizeof(struct perf_event_attr_map_entry));
}

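/*
 * Open (creating and pinning it if necessary) the perf_event_attr map that
 * all bperf sessions share, and take an exclusive flock() on it. The caller
 * must unlock and close the returned fd.
 */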
static int bperf_lock_attr_map(struct target *target)
{
	char path[PATH_MAX];
	int map_fd, err;

	if (target->attr_map) {
		scnprintf(path, PATH_MAX, "%s", target->attr_map);
	} else {
		scnprintf(path, PATH_MAX, "%s/fs/bpf/%s", sysfs__mountpoint(),
			  BPF_PERF_DEFAULT_ATTR_MAP_PATH);
	}

	if (access(path, F_OK)) {
		map_fd = bpf_map_create(BPF_MAP_TYPE_HASH, NULL,
					sizeof(struct perf_event_attr),
					sizeof(struct perf_event_attr_map_entry),
					ATTR_MAP_SIZE, NULL);
		if (map_fd < 0)
			return -1;

		err = bpf_obj_pin(map_fd, path);
		if (err) {
			/* someone pinned the map in parallel? */
			close(map_fd);
			map_fd = bpf_obj_get(path);
			if (map_fd < 0)
				return -1;
		}
	} else {
		map_fd = bpf_obj_get(path);
		if (map_fd < 0)
			return -1;
	}

	if (!bperf_attr_map_compatible(map_fd)) {
		close(map_fd);
		return -1;
	}
	err = flock(map_fd, LOCK_EX);
	if (err) {
		close(map_fd);
		return -1;
	}
	return map_fd;
}

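/*
 * Decide how the follower should filter events for this target: globally,
 * per CPU, per thread (PID), or per process (TGID), and how many filter
 * entries that requires.
 */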
static int bperf_check_target(struct evsel *evsel,
			      struct target *target,
			      enum bperf_filter_type *filter_type,
			      __u32 *filter_entry_cnt)
{
	if (evsel->core.leader->nr_members > 1) {
		pr_err("bpf managed perf events do not yet support groups.\n");
		return -1;
	}

	/* determine filter type based on target */
	if (target->system_wide) {
		*filter_type = BPERF_FILTER_GLOBAL;
		*filter_entry_cnt = 1;
	} else if (target->cpu_list) {
		*filter_type = BPERF_FILTER_CPU;
		*filter_entry_cnt = perf_cpu_map__nr(evsel__cpus(evsel));
	} else if (target->tid) {
		*filter_type = BPERF_FILTER_PID;
		*filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
	} else if (target->pid || evsel->evlist->workload.pid != -1) {
		*filter_type = BPERF_FILTER_TGID;
		*filter_entry_cnt = perf_thread_map__nr(evsel->core.threads);
	} else {
		pr_err("bpf managed perf events do not yet support these targets.\n");
		return -1;
	}

	return 0;
}

static struct perf_cpu_map *all_cpu_map;

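/*
 * No live leader exists for this perf_event_attr: create a fresh leader
 * skeleton, attach its on_switch program, publish the new link and
 * diff_readings map IDs in the shared attr map, and open the perf events on
 * all CPUs. The local skeleton and link are destroyed before returning; the
 * fd held in evsel->bperf_leader_link_fd keeps the program alive.
 */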
static int bperf_reload_leader_program(struct evsel *evsel, int attr_map_fd,
				       struct perf_event_attr_map_entry *entry)
{
	struct bperf_leader_bpf *skel = bperf_leader_bpf__open();
	int link_fd, diff_map_fd, err;
	struct bpf_link *link = NULL;

	if (!skel) {
		pr_err("Failed to open leader skeleton\n");
		return -1;
	}

	bpf_map__set_max_entries(skel->maps.events, libbpf_num_possible_cpus());
	err = bperf_leader_bpf__load(skel);
	if (err) {
		pr_err("Failed to load leader skeleton\n");
		goto out;
	}

	link = bpf_program__attach(skel->progs.on_switch);
	if (IS_ERR(link)) {
		pr_err("Failed to attach leader program\n");
		err = PTR_ERR(link);
		goto out;
	}

	link_fd = bpf_link__fd(link);
	diff_map_fd = bpf_map__fd(skel->maps.diff_readings);
	entry->link_id = bpf_link_get_id(link_fd);
	entry->diff_map_id = bpf_map_get_id(diff_map_fd);
	err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, entry, BPF_ANY);
	assert(err == 0);

	evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry->link_id);
	assert(evsel->bperf_leader_link_fd >= 0);

	/*
	 * save leader_skel for install_pe, which is called within
	 * following evsel__open_per_cpu call
	 */
	evsel->leader_skel = skel;
	evsel__open_per_cpu(evsel, all_cpu_map, -1);

out:
	bperf_leader_bpf__destroy(skel);
	bpf_link__destroy(link);
	return err;
}

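/*
 * Attach this evsel to the shared leader program, creating the leader if no
 * other session holds it, then load and attach a per-session follower that
 * filters and accumulates readings for this target.
 */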
static int bperf__load(struct evsel *evsel, struct target *target)
{
	struct perf_event_attr_map_entry entry = {0xffffffff, 0xffffffff};
	int attr_map_fd, diff_map_fd = -1, err;
	enum bperf_filter_type filter_type;
	__u32 filter_entry_cnt, i;

	if (bperf_check_target(evsel, target, &filter_type, &filter_entry_cnt))
		return -1;

	if (!all_cpu_map) {
		all_cpu_map = perf_cpu_map__new_online_cpus();
		if (!all_cpu_map)
			return -1;
	}

	evsel->bperf_leader_prog_fd = -1;
	evsel->bperf_leader_link_fd = -1;

	/*
	 * Step 1: hold a fd on the leader program and the bpf_link; if
	 * the program is already gone, reload it. Use flock() to ensure
	 * exclusive access to the perf_event_attr map.
	 */
	attr_map_fd = bperf_lock_attr_map(target);
	if (attr_map_fd < 0) {
		pr_err("Failed to lock perf_event_attr map\n");
		return -1;
	}

	err = bpf_map_lookup_elem(attr_map_fd, &evsel->core.attr, &entry);
	if (err) {
		err = bpf_map_update_elem(attr_map_fd, &evsel->core.attr, &entry, BPF_ANY);
		if (err)
			goto out;
	}

	evsel->bperf_leader_link_fd = bpf_link_get_fd_by_id(entry.link_id);
	if (evsel->bperf_leader_link_fd < 0 &&
	    bperf_reload_leader_program(evsel, attr_map_fd, &entry)) {
		err = -1;
		goto out;
	}
	/*
	 * The bpf_link holds reference to the leader program, and the
	 * leader program holds reference to the maps. Therefore, if
	 * link_id is valid, diff_map_id should also be valid.
	 */
	evsel->bperf_leader_prog_fd = bpf_prog_get_fd_by_id(
		bpf_link_get_prog_id(evsel->bperf_leader_link_fd));
	assert(evsel->bperf_leader_prog_fd >= 0);

	diff_map_fd = bpf_map_get_fd_by_id(entry.diff_map_id);
	assert(diff_map_fd >= 0);

	/*
	 * bperf uses BPF_PROG_TEST_RUN to get accurate readings. Check
	 * whether the kernel supports it.
	 */
	err = bperf_trigger_reading(evsel->bperf_leader_prog_fd, 0);
	if (err) {
		pr_err("The kernel does not support test_run for raw_tp BPF programs.\n"
		       "Therefore, --use-bpf might show inaccurate readings\n");
		goto out;
	}

	/* Step 2: load the follower skeleton */
	evsel->follower_skel = bperf_follower_bpf__open();
	if (!evsel->follower_skel) {
		err = -1;
		pr_err("Failed to open follower skeleton\n");
		goto out;
	}

	/* attach fexit program to the leader program */
	bpf_program__set_attach_target(evsel->follower_skel->progs.fexit_XXX,
				       evsel->bperf_leader_prog_fd, "on_switch");

	/* connect to leader diff_reading map */
	bpf_map__reuse_fd(evsel->follower_skel->maps.diff_readings, diff_map_fd);

	/* set up reading map */
	bpf_map__set_max_entries(evsel->follower_skel->maps.accum_readings,
				 filter_entry_cnt);
	/* set up follower filter based on target */
	bpf_map__set_max_entries(evsel->follower_skel->maps.filter,
				 filter_entry_cnt);
	err = bperf_follower_bpf__load(evsel->follower_skel);
	if (err) {
		pr_err("Failed to load follower skeleton\n");
		bperf_follower_bpf__destroy(evsel->follower_skel);
		evsel->follower_skel = NULL;
		goto out;
	}

	for (i = 0; i < filter_entry_cnt; i++) {
		int filter_map_fd;
		__u32 key;

		if (filter_type == BPERF_FILTER_PID ||
		    filter_type == BPERF_FILTER_TGID)
			key = perf_thread_map__pid(evsel->core.threads, i);
		else if (filter_type == BPERF_FILTER_CPU)
			key = perf_cpu_map__cpu(evsel->core.cpus, i).cpu;
		else
			break;

		filter_map_fd = bpf_map__fd(evsel->follower_skel->maps.filter);
		bpf_map_update_elem(filter_map_fd, &key, &i, BPF_ANY);
	}

	evsel->follower_skel->bss->type = filter_type;

	err = bperf_follower_bpf__attach(evsel->follower_skel);

out:
	if (err && evsel->bperf_leader_link_fd >= 0)
		close(evsel->bperf_leader_link_fd);
	if (err && evsel->bperf_leader_prog_fd >= 0)
		close(evsel->bperf_leader_prog_fd);
	if (diff_map_fd >= 0)
		close(diff_map_fd);

	flock(attr_map_fd, LOCK_UN);
	close(attr_map_fd);

	return err;
}

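/*
 * install_pe callback for bperf: store the freshly opened perf_event fd into
 * the leader's events map, keyed by cpu map index, so the on_switch program
 * can read it.
 */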
static int bperf__install_pe(struct evsel *evsel, int cpu_map_idx, int fd)
{
	struct bperf_leader_bpf *skel = evsel->leader_skel;

	return bpf_map_update_elem(bpf_map__fd(skel->maps.events),
				   &cpu_map_idx, &fd, BPF_ANY);
}

/*
 * Trigger the leader prog on each CPU so that the accum_readings map has
 * the latest readings.
 */
static int bperf_sync_counters(struct evsel *evsel)
{
	int num_cpu, i, cpu;

	num_cpu = perf_cpu_map__nr(all_cpu_map);
	for (i = 0; i < num_cpu; i++) {
		cpu = perf_cpu_map__cpu(all_cpu_map, i).cpu;
		bperf_trigger_reading(evsel->bperf_leader_prog_fd, cpu);
	}
	return 0;
}

static int bperf__enable(struct evsel *evsel)
{
	evsel->follower_skel->bss->enabled = 1;
	return 0;
}

static int bperf__disable(struct evsel *evsel)
{
	evsel->follower_skel->bss->enabled = 0;
	return 0;
}

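/*
 * Trigger the leader on every CPU to flush the latest deltas into
 * accum_readings, then copy the per-entry values into evsel->counts
 * according to the follower's filter type.
 */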
static int bperf__read(struct evsel *evsel)
{
	struct bperf_follower_bpf *skel = evsel->follower_skel;
	__u32 num_cpu_bpf = cpu__max_cpu().cpu;
	struct bpf_perf_event_value values[num_cpu_bpf];
	struct perf_counts_values *counts;
	int reading_map_fd, err = 0;
	__u32 i;
	int j;

	bperf_sync_counters(evsel);
	reading_map_fd = bpf_map__fd(skel->maps.accum_readings);

	for (i = 0; i < bpf_map__max_entries(skel->maps.accum_readings); i++) {
		struct perf_cpu entry;
		__u32 cpu;

		err = bpf_map_lookup_elem(reading_map_fd, &i, values);
		if (err)
			goto out;
		switch (evsel->follower_skel->bss->type) {
		case BPERF_FILTER_GLOBAL:
			assert(i == 0);

			perf_cpu_map__for_each_cpu(entry, j, evsel__cpus(evsel)) {
				counts = perf_counts(evsel->counts, j, 0);
				counts->val = values[entry.cpu].counter;
				counts->ena = values[entry.cpu].enabled;
				counts->run = values[entry.cpu].running;
			}
			break;
		case BPERF_FILTER_CPU:
			cpu = perf_cpu_map__cpu(evsel__cpus(evsel), i).cpu;
			assert(cpu >= 0);
			counts = perf_counts(evsel->counts, i, 0);
			counts->val = values[cpu].counter;
			counts->ena = values[cpu].enabled;
			counts->run = values[cpu].running;
			break;
		case BPERF_FILTER_PID:
		case BPERF_FILTER_TGID:
			counts = perf_counts(evsel->counts, 0, i);
			counts->val = 0;
			counts->ena = 0;
			counts->run = 0;

			for (cpu = 0; cpu < num_cpu_bpf; cpu++) {
				counts->val += values[cpu].counter;
				counts->ena += values[cpu].enabled;
				counts->run += values[cpu].running;
			}
			break;
		default:
			break;
		}
	}
out:
	return err;
}

static int bperf__destroy(struct evsel *evsel)
{
	bperf_follower_bpf__destroy(evsel->follower_skel);
	close(evsel->bperf_leader_prog_fd);
	close(evsel->bperf_leader_link_fd);
	return 0;
}

/*
 * bperf: share hardware PMCs with BPF
 *
 * perf uses performance monitoring counters (PMC) to monitor system
 * performance. The PMCs are limited hardware resources. For example,
 * Intel CPUs have 3 fixed PMCs and 4 programmable PMCs per CPU.
 *
 * Modern data center systems use these PMCs in many different ways:
 * system level monitoring, (maybe nested) container level monitoring, per
 * process monitoring, profiling (in sample mode), etc. In some cases,
 * there are more active perf_events than available hardware PMCs. To allow
 * all perf_events to have a chance to run, it is necessary to do expensive
 * time multiplexing of events.
 *
 * On the other hand, many monitoring tools count the common metrics
 * (cycles, instructions). It is a waste to have multiple tools create
 * multiple perf_events of "cycles" and occupy multiple PMCs.
 *
 * bperf tries to reduce such waste by allowing multiple perf_events of
 * "cycles" or "instructions" (at different scopes) to share PMUs. Instead
 * of having each perf-stat session read its own perf_events, bperf uses
 * BPF programs to read the perf_events and aggregate readings to BPF maps.
 * The perf-stat session(s) then read the values from these BPF maps.
 *
 *                                 ||
 *        shared progs and maps <- || -> per session progs and maps
 *                                 ||
 *   ---------------               ||
 *   | perf_events |               ||
 *   ---------------      fexit    ||        -----------------
 *          |            ----------||------> | follower prog |
 *       ---------------/          ||        -----------------
 * cs -> | leader prog |           ||            |         |
 *   --> ---------------           ||    --------------  ------------------
 *  /      |         |             ||    | filter map |  | accum_readings |
 * |   ------------  ------------  ||    --------------  ------------------
 * |   | prev map |  | diff map |  ||                            |
 * |   ------------  ------------  ||                            |
 *  \                              ||                            |
 * = \ ===========================================================|==========
 *    \                                                          /  user space
 *     \                                                        /
 *      \                                                      /
 *   BPF_PROG_TEST_RUN                            BPF_MAP_LOOKUP_ELEM
 *        \                                              /
 *         \                                            /
 *          \------  perf-stat  ------------------------/
 *
 * The figure above shows the architecture of bperf. Note that the figure
 * is divided into 3 regions: shared progs and maps (top left), per session
 * progs and maps (top right), and user space (bottom).
 *
 * The leader prog is triggered on each context switch (cs). The leader
 * prog reads perf_events and stores the difference (current_reading -
 * previous_reading) to the diff map. For the same metric, e.g. "cycles",
 * multiple perf-stat sessions share the same leader prog.
 *
 * Each perf-stat session creates a follower prog as an fexit program on the
 * leader prog. It is possible to attach up to BPF_MAX_TRAMP_PROGS (38)
 * follower progs to the same leader prog. The follower prog checks the
 * current task and processor ID to decide whether to add the value from the
 * diff map to its accumulated reading map (accum_readings).
 *
 * Finally, perf-stat user space reads the value from the accum_readings map.
 *
 * Besides context switches, it is also necessary to trigger the leader prog
 * before perf-stat reads the value. Otherwise, the accum_readings map may
 * not have the latest reading from the perf_events. This is achieved by
 * triggering the event via sys_bpf(BPF_PROG_TEST_RUN) on each CPU.
 *
 * The comment before the definition of struct perf_event_attr_map_entry
 * describes how different sessions of perf-stat share information about
 * the leader prog.
 */
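
/*
 * Illustrative usage (flag names as in current perf-stat; see perf-stat(1)
 * for the exact set supported by a given build):
 *
 *   # two concurrent sessions counting "cycles" share one leader prog
 *   perf stat --bpf-counters -e cycles -a -- sleep 10
 *   perf stat --bpf-counters -e cycles -p 1234 -- sleep 10
 *
 *   # profile an already loaded BPF program by its prog ID
 *   perf stat --bpf-prog 123 -- sleep 10
 */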

struct bpf_counter_ops bperf_ops = {
	.load = bperf__load,
	.enable = bperf__enable,
	.disable = bperf__disable,
	.read = bperf__read,
	.install_pe = bperf__install_pe,
	.destroy = bperf__destroy,
};

extern struct bpf_counter_ops bperf_cgrp_ops;

static inline bool bpf_counter_skip(struct evsel *evsel)
{
	return evsel->bpf_counter_ops == NULL;
}

int bpf_counter__install_pe(struct evsel *evsel, int cpu_map_idx, int fd)
{
	if (bpf_counter_skip(evsel))
		return 0;
	return evsel->bpf_counter_ops->install_pe(evsel, cpu_map_idx, fd);
}

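/*
 * Pick the bpf_counter implementation for this evsel: a --bpf-prog target
 * selects the BPF program profiler; cgroup-expanded events with
 * --bpf-counters use the cgroup-aware bperf variant; otherwise
 * --bpf-counters, an explicit per-event request (evsel->bpf_counter), or a
 * name match against the configured bpf-counter-events list selects bperf.
 */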
int bpf_counter__load(struct evsel *evsel, struct target *target)
{
	if (target->bpf_str)
		evsel->bpf_counter_ops = &bpf_program_profiler_ops;
	else if (cgrp_event_expanded && target->use_bpf)
		evsel->bpf_counter_ops = &bperf_cgrp_ops;
	else if (target->use_bpf || evsel->bpf_counter ||
		 evsel__match_bpf_counter_events(evsel->name))
		evsel->bpf_counter_ops = &bperf_ops;

	if (evsel->bpf_counter_ops)
		return evsel->bpf_counter_ops->load(evsel, target);
	return 0;
}

int bpf_counter__enable(struct evsel *evsel)
{
	if (bpf_counter_skip(evsel))
		return 0;
	return evsel->bpf_counter_ops->enable(evsel);
}

int bpf_counter__disable(struct evsel *evsel)
{
	if (bpf_counter_skip(evsel))
		return 0;
	return evsel->bpf_counter_ops->disable(evsel);
}

int bpf_counter__read(struct evsel *evsel)
{
	if (bpf_counter_skip(evsel))
		return -EAGAIN;
	return evsel->bpf_counter_ops->read(evsel);
}

void bpf_counter__destroy(struct evsel *evsel)
{
	if (bpf_counter_skip(evsel))
		return;
	evsel->bpf_counter_ops->destroy(evsel);
	evsel->bpf_counter_ops = NULL;
	evsel->bpf_skel = NULL;
}