usdt.c source code [linux/tools/lib/bpf/usdt.c]

1	// SPDX-License-Identifier: (LGPL-2.1 OR BSD-2-Clause)
2	/* Copyright (c) 2022 Meta Platforms, Inc. and affiliates. */
3	#include <ctype.h>
4	#include <stdio.h>
5	#include <stdlib.h>
6	#include <string.h>
7	#include <libelf.h>
8	#include <gelf.h>
9	#include <unistd.h>
10	#include <linux/ptrace.h>
11	#include <linux/kernel.h>
12
13	/* s8 will be marked as poison while it's a reg of riscv */
14	#if defined(__riscv)
15	#define rv_s8 s8
16	#endif
17
18	#include "bpf.h"
19	#include "libbpf.h"
20	#include "libbpf_common.h"
21	#include "libbpf_internal.h"
22	#include "hashmap.h"
23
24	/* libbpf's USDT support consists of BPF-side state/code and user-space
25	* state/code working together in concert. BPF-side parts are defined in
26	* usdt.bpf.h header library. User-space state is encapsulated by struct
27	* usdt_manager and all the supporting code centered around usdt_manager.
28	*
29	* usdt.bpf.h defines two BPF maps that usdt_manager expects: USDT spec map
30	* and IP-to-spec-ID map, which is auxiliary map necessary for kernels that
31	* don't support BPF cookie (see below). These two maps are implicitly
32	* embedded into user's end BPF object file when user's code included
33	* usdt.bpf.h. This means that libbpf doesn't do anything special to create
34	* these USDT support maps. They are created by normal libbpf logic of
35	* instantiating BPF maps when opening and loading BPF object.
36	*
37	* As such, libbpf is basically unaware of the need to do anything
38	* USDT-related until the very first call to bpf_program__attach_usdt(), which
39	* can be called by user explicitly or happen automatically during skeleton
40	* attach (or, equivalently, through generic bpf_program__attach() call). At
41	* this point, libbpf will instantiate and initialize struct usdt_manager and
42	* store it in bpf_object. USDT manager is per-BPF object construct, as each
43	* independent BPF object might or might not have USDT programs, and thus all
44	* the expected USDT-related state. There is no coordination between two
45	* bpf_object in parts of USDT attachment, they are oblivious of each other's
46	* existence and libbpf is just oblivious, dealing with bpf_object-specific
47	* USDT state.
48	*
49	* Quick crash course on USDTs.
50	*
51	* From user-space application's point of view, USDT is essentially just
52	* a slightly special function call that normally has zero overhead, unless it
53	* is being traced by some external entity (e.g, BPF-based tool). Here's how
54	* a typical application can trigger USDT probe:
55	*
56	* #include <sys/sdt.h> // provided by systemtap-sdt-devel package
57	* // folly also provide similar functionality in folly/tracing/StaticTracepoint.h
58	*
59	* STAP_PROBE3(my_usdt_provider, my_usdt_probe_name, 123, x, &y);
60	*
61	* USDT is identified by its <provider-name>:<probe-name> pair of names. Each
62	* individual USDT has a fixed number of arguments (3 in the above example)
63	* and specifies values of each argument as if it was a function call.
64	*
65	* USDT call is actually not a function call, but is instead replaced by
66	* a single NOP instruction (thus zero overhead, effectively). But in addition
67	* to that, those USDT macros generate special SHT_NOTE ELF records in
68	* .note.stapsdt ELF section. Here's an example USDT definition as emitted by
69	* `readelf -n <binary>`:
70	*
71	* stapsdt 0x00000089 NT_STAPSDT (SystemTap probe descriptors)
72	* Provider: test
73	* Name: usdt12
74	* Location: 0x0000000000549df3, Base: 0x00000000008effa4, Semaphore: 0x0000000000a4606e
75	* Arguments: -4@-1204(%rbp) -4@%edi -8@-1216(%rbp) -8@%r8 -4@$5 -8@%r9 8@%rdx 8@%r10 -4@$-9 -2@%cx -2@%ax -1@%sil
76	*
77	* In this case we have USDT test:usdt12 with 12 arguments.
78	*
79	* Location and base are offsets used to calculate absolute IP address of that
80	* NOP instruction that kernel can replace with an interrupt instruction to
81	* trigger instrumentation code (BPF program for all that we care about).
82	*
83	* Semaphore above is an optional feature. It records an address of a 2-byte
84	* refcount variable (normally in '.probes' ELF section) used for signaling if
85	* there is anything that is attached to USDT. This is useful for user
86	* applications if, for example, they need to prepare some arguments that are
87	* passed only to USDTs and preparation is expensive. By checking if USDT is
88	* "activated", an application can avoid paying those costs unnecessarily.
89	* Recent enough kernel has built-in support for automatically managing this
90	* refcount, which libbpf expects and relies on. If USDT is defined without
91	* associated semaphore, this value will be zero. See selftests for semaphore
92	* examples.
93	*
94	* Arguments is the most interesting part. This USDT specification string is
95	* providing information about all the USDT arguments and their locations. The
96	* part before @ sign defined byte size of the argument (1, 2, 4, or 8) and
97	* whether the argument is signed or unsigned (negative size means signed).
98	* The part after @ sign is assembly-like definition of argument location
99	* (see [0] for more details). Technically, assembler can provide some pretty
100	* advanced definitions, but libbpf is currently supporting three most common
101	* cases:
102	* 1) immediate constant, see 5th and 9th args above (-4@$5 and -4@$-9);
103	* 2) register value, e.g., 8@%rdx, which means "unsigned 8-byte integer
104	* whose value is in register %rdx";
105	* 3) memory dereference addressed by register, e.g., -4@-1204(%rbp), which
106	* specifies signed 32-bit integer stored at offset -1204 bytes from
107	* memory address stored in %rbp.
108	*
109	* [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
110	*
111	* During attachment, libbpf parses all the relevant USDT specifications and
112	* prepares `struct usdt_spec` (USDT spec), which is then provided to BPF-side
113	* code through spec map. This allows BPF applications to quickly fetch the
114	* actual value at runtime using a simple BPF-side code.
115	*
116	* With basics out of the way, let's go over less immediately obvious aspects
117	* of supporting USDTs.
118	*
119	* First, there is no special USDT BPF program type. It is actually just
120	* a uprobe BPF program (which for kernel, at least currently, is just a kprobe
121	* program, so BPF_PROG_TYPE_KPROBE program type). With the only difference
122	* that uprobe is usually attached at the function entry, while USDT will
123	* normally be somewhere inside the function. But it should always be
124	* pointing to NOP instruction, which makes such uprobes the fastest uprobe
125	* kind.
126	*
127	* Second, it's important to realize that such STAP_PROBEn(provider, name, ...)
128	* macro invocations can end up being inlined many-many times, depending on
129	* specifics of each individual user application. So single conceptual USDT
130	* (identified by provider:name pair of identifiers) is, generally speaking,
131	* multiple uprobe locations (USDT call sites) in different places in user
132	* application. Further, again due to inlining, each USDT call site might end
133	* up having the same argument #N be located in a different place. In one call
134	* site it could be a constant, in another will end up in a register, and in
135	* yet another could be some other register or even somewhere on the stack.
136	*
137	* As such, "attaching to USDT" means (in general case) attaching the same
138	* uprobe BPF program to multiple target locations in user application, each
139	* potentially having a completely different USDT spec associated with it.
140	* To wire all this up together libbpf allocates a unique integer spec ID for
141	* each unique USDT spec. Spec IDs are allocated as sequential small integers
142	* so that they can be used as keys in array BPF map (for performance reasons).
143	* Spec ID allocation and accounting is big part of what usdt_manager is
144	* about. This state has to be maintained per-BPF object and coordinate
145	* between different USDT attachments within the same BPF object.
146	*
147	* Spec ID is the key in spec BPF map, value is the actual USDT spec laid out
148	* as struct usdt_spec. Each invocation of BPF program at runtime needs to
149	* know its associated spec ID. It gets it either through BPF cookie, which
150	* libbpf sets to spec ID during attach time, or, if kernel is too old to
151	* support BPF cookie, through IP-to-spec-ID map that libbpf maintains in such
152	* case. The latter means that some modes of operation can't be supported
153	* without BPF cookie. Such mode is attaching to shared library "generically",
154	* without specifying target process. In such case, it's impossible to
155	* calculate absolute IP addresses for IP-to-spec-ID map, and thus such mode
156	* is not supported without BPF cookie support.
157	*
158	* Note that libbpf is using BPF cookie functionality for its own internal
159	* needs, so user itself can't rely on BPF cookie feature. To that end, libbpf
160	* provides conceptually equivalent USDT cookie support. It's still u64
161	* user-provided value that can be associated with USDT attachment. Note that
162	* this will be the same value for all USDT call sites within the same single
163	* logical USDT attachment. This makes sense because to user attaching to
164	* USDT is a single BPF program triggered for singular USDT probe. The fact
165	* that this is done at multiple actual locations is a mostly hidden
166	* implementation details. This USDT cookie value can be fetched with
167	* bpf_usdt_cookie(ctx) API provided by usdt.bpf.h
168	*
169	* Lastly, while single USDT can have tons of USDT call sites, it doesn't
170	* necessarily have that many different USDT specs. It very well might be
171	* that 1000 USDT call sites only need 5 different USDT specs, because all the
172	* arguments are typically contained in a small set of registers or stack
173	* locations. As such, it's wasteful to allocate as many USDT spec IDs as
174	* there are USDT call sites. So libbpf tries to be frugal and performs
175	* on-the-fly deduplication during a single USDT attachment to only allocate
176	* the minimal required amount of unique USDT specs (and thus spec IDs). This
177	* is trivially achieved by using USDT spec string (Arguments string from USDT
178	* note) as a lookup key in a hashmap. USDT spec string uniquely defines
179	* everything about how to fetch USDT arguments, so two USDT call sites
180	* sharing USDT spec string can safely share the same USDT spec and spec ID.
181	* Note, this spec string deduplication is happening only during the same USDT
182	* attachment, so each USDT spec shares the same USDT cookie value. This is
183	* not generally true for other USDT attachments within the same BPF object,
184	* as even if USDT spec string is the same, USDT cookie value can be
185	* different. It was deemed excessive to try to deduplicate across independent
186	* USDT attachments by taking into account USDT spec string and USDT cookie
187	* value, which would complicate spec ID accounting significantly for little
188	* gain.
189	*/
190
/* ELF section used for prelink offset compensation of USDT addresses */
#define USDT_BASE_SEC ".stapsdt.base"
/* ELF section that normally holds USDT semaphore (refcount) variables */
#define USDT_SEMA_SEC ".probes"
/* ELF section holding the SHT_NOTE records that describe USDTs */
#define USDT_NOTE_SEC ".note.stapsdt"
/* Note type and owner name, as printed by `readelf -n` for NT_STAPSDT notes */
#define USDT_NOTE_TYPE 3
#define USDT_NOTE_NAME "stapsdt"
196
197	/ should match exactly enum __bpf_usdt_arg_type from usdt.bpf.h /
198	enum usdt_arg_type {
199	USDT_ARG_CONST,
200	USDT_ARG_REG,
201	USDT_ARG_REG_DEREF,
202	};
203
204	/ should match exactly struct __bpf_usdt_arg_spec from usdt.bpf.h /
205	struct usdt_arg_spec {
206	__u64 val_off;
207	enum usdt_arg_type arg_type;
208	short reg_off;
209	bool arg_signed;
210	char arg_bitshift;
211	};
212
213	/ should match BPF_USDT_MAX_ARG_CNT in usdt.bpf.h /
214	#define USDT_MAX_ARG_CNT 12
215
216	/ should match struct __bpf_usdt_spec from usdt.bpf.h /
217	struct usdt_spec {
218	struct usdt_arg_spec args[USDT_MAX_ARG_CNT];
219	__u64 usdt_cookie;
220	short arg_cnt;
221	};
222
/* One USDT definition as recorded in an ELF note. String members point at
 * data owned by the ELF object (see the note about elf_end() in
 * collect_usdt_targets()).
 */
struct usdt_note {
	const char *provider;
	const char *name;
	/* USDT args specification string, e.g.:
	 * "-4@%esi -4@-24(%rbp) -4@%ecx 2@%ax 8@%rdx"
	 */
	const char *args;
	/* "Location" of the note: address of the probe's NOP instruction */
	long loc_addr;
	/* "Base" of the note: link-time address of .stapsdt.base */
	long base_addr;
	/* "Semaphore" of the note; zero if USDT has no semaphore */
	long sema_addr;
};
234
235	struct usdt_target {
236	long abs_ip;
237	long rel_ip;
238	long sema_off;
239	struct usdt_spec spec;
240	const char *spec_str;
241	};
242
/* Per-BPF object USDT state */
struct usdt_manager {
	/* "__bpf_usdt_specs" map of the BPF object */
	struct bpf_map *specs_map;
	/* "__bpf_usdt_ip_to_spec_id" map of the BPF object */
	struct bpf_map *ip_to_spec_id_map;

	/* spec IDs returned by detached USDT links, kept for reuse */
	int *free_spec_ids;
	size_t free_spec_cnt;
	/* NOTE(review): presumably the next never-used spec ID; the allocator
	 * is outside this part of the file
	 */
	size_t next_free_spec_id;

	/* kernel features detected in usdt_manager_new() */
	bool has_bpf_cookie;
	bool has_sema_refcnt;
	bool has_uprobe_multi;
};
255
256	struct usdt_manager usdt_manager_new(struct* bpf_object *obj)
257	{
258	static const char *ref_ctr_sysfs_path = "/sys/bus/event_source/devices/uprobe/format/ref_ctr_offset";
259	struct usdt_manager *man;
260	struct bpf_map specs_map, ip_to_spec_id_map;
261
262	specs_map = bpf_object__find_map_by_name(obj, name: "__bpf_usdt_specs");
263	ip_to_spec_id_map = bpf_object__find_map_by_name(obj, name: "__bpf_usdt_ip_to_spec_id");
264	if (!specs_map \|\| !ip_to_spec_id_map) {
265	pr_warn("usdt: failed to find USDT support BPF maps, did you forget to include bpf/usdt.bpf.h?\n");
266	return ERR_PTR(error: -ESRCH);
267	}
268
269	man = calloc(`1`, sizeof(*man));
270	if (!man)
271	return ERR_PTR(error: -ENOMEM);
272
273	man->specs_map = specs_map;
274	man->ip_to_spec_id_map = ip_to_spec_id_map;
275
276	/ Detect if BPF cookie is supported for kprobes.*
277	* We don't need IP-to-ID mapping if we can use BPF cookies.
278	* Added in: 7adfc6c9b315 ("bpf: Add bpf_get_attach_cookie() BPF helper to access bpf_cookie value")
279	*/
280	man->has_bpf_cookie = kernel_supports(obj, feat_id: FEAT_BPF_COOKIE);
281
282	/ Detect kernel support for automatic refcounting of USDT semaphore.*
283	* If this is not supported, USDTs with semaphores will not be supported.
284	* Added in: a6ca88b241d5 ("trace_uprobe: support reference counter in fd-based uprobe")
285	*/
286	man->has_sema_refcnt = faccessat(AT_FDCWD, ref_ctr_sysfs_path, F_OK, AT_EACCESS) == `0`;
287
288	/*
289	* Detect kernel support for uprobe multi link to be used for attaching
290	* usdt probes.
291	*/
292	man->has_uprobe_multi = kernel_supports(obj, feat_id: FEAT_UPROBE_MULTI_LINK);
293	return man;
294	}
295
296	void usdt_manager_free(struct usdt_manager *man)
297	{
298	if (IS_ERR_OR_NULL(ptr: man))
299	return;
300
301	free(man->free_spec_ids);
302	free(man);
303	}
304
305	static int sanity_check_usdt_elf(Elf elf, const* char *path)
306	{
307	GElf_Ehdr ehdr;
308	int endianness;
309
310	if (elf_kind(elf) != ELF_K_ELF) {
311	pr_warn("usdt: unrecognized ELF kind %d for '%s'\n", elf_kind(elf), path);
312	return -EBADF;
313	}
314
315	switch (gelf_getclass(elf)) {
316	case ELFCLASS64:
317	if (sizeof(void *) != `8`) {
318	pr_warn("usdt: attaching to 64-bit ELF binary '%s' is not supported\n", path);
319	return -EBADF;
320	}
321	break;
322	case ELFCLASS32:
323	if (sizeof(void *) != `4`) {
324	pr_warn("usdt: attaching to 32-bit ELF binary '%s' is not supported\n", path);
325	return -EBADF;
326	}
327	break;
328	default:
329	pr_warn("usdt: unsupported ELF class for '%s'\n", path);
330	return -EBADF;
331	}
332
333	if (!gelf_getehdr(elf, &ehdr))
334	return -EINVAL;
335
336	if (ehdr.e_type != ET_EXEC && ehdr.e_type != ET_DYN) {
337	pr_warn("usdt: unsupported type of ELF binary '%s' (%d), only ET_EXEC and ET_DYN are supported\n",
338	path, ehdr.e_type);
339	return -EBADF;
340	}
341
342	#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
343	endianness = ELFDATA2LSB;
344	#elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__
345	endianness = ELFDATA2MSB;
346	#else
347	# error "Unrecognized __BYTE_ORDER__"
348	#endif
349	if (endianness != ehdr.e_ident[EI_DATA]) {
350	pr_warn("usdt: ELF endianness mismatch for '%s'\n", path);
351	return -EBADF;
352	}
353
354	return `0`;
355	}
356
357	static int find_elf_sec_by_name(Elf elf, const* char sec_name, GElf_Shdr shdr, Elf_Scn **scn)
358	{
359	Elf_Scn *sec = NULL;
360	size_t shstrndx;
361
362	if (elf_getshdrstrndx(elf, &shstrndx))
363	return -EINVAL;
364
365	/ check if ELF is corrupted and avoid calling elf_strptr if yes /
366	if (!elf_rawdata(elf_getscn(elf, shstrndx), NULL))
367	return -EINVAL;
368
369	while ((sec = elf_nextscn(elf, sec)) != NULL) {
370	char *name;
371
372	if (!gelf_getshdr(sec, shdr))
373	return -EINVAL;
374
375	name = elf_strptr(elf, shstrndx, shdr->sh_name);
376	if (name && strcmp(sec_name, name) == `0`) {
377	*scn = sec;
378	return `0`;
379	}
380	}
381
382	return -ENOENT;
383	}
384
/* Address range of either an ELF PT_LOAD segment or a memory mapping from
 * /proc/<pid>/maps
 */
struct elf_seg {
	long start;	/* first address of the range */
	long end;	/* one past the last address of the range */
	long offset;	/* file offset corresponding to start */
	bool is_exec;	/* range is executable */
};

/* qsort() comparator ordering segments by ascending start address.
 *
 * Returns negative, zero or positive value. Segments with equal start
 * addresses compare equal, which qsort() requires for a consistent ordering.
 */
static int cmp_elf_segs(const void *_a, const void *_b)
{
	const struct elf_seg *a = _a;
	const struct elf_seg *b = _b;

	return (a->start > b->start) - (a->start < b->start);
}
399
400	static int parse_elf_segs(Elf elf, const* char path, struct* elf_seg *segs, size_t seg_cnt)
401	{
402	GElf_Phdr phdr;
403	size_t n;
404	int i, err;
405	struct elf_seg *seg;
406	void *tmp;
407
408	*seg_cnt = `0`;
409
410	if (elf_getphdrnum(elf, &n)) {
411	err = -errno;
412	return err;
413	}
414
415	for (i = `0`; i < n; i++) {
416	if (!gelf_getphdr(elf, i, &phdr)) {
417	err = -errno;
418	return err;
419	}
420
421	pr_debug("usdt: discovered PHDR #%d in '%s': vaddr 0x%lx memsz 0x%lx offset 0x%lx type 0x%lx flags 0x%lx\n",
422	i, path, (long)phdr.p_vaddr, (long)phdr.p_memsz, (long)phdr.p_offset,
423	(long)phdr.p_type, (long)phdr.p_flags);
424	if (phdr.p_type != PT_LOAD)
425	continue;
426
427	tmp = libbpf_reallocarray(ptr: segs, nmemb: seg_cnt + `1`, size: sizeof(**segs));
428	if (!tmp)
429	return -ENOMEM;
430
431	*segs = tmp;
432	seg = segs + seg_cnt;
433	(*seg_cnt)++;
434
435	seg->start = phdr.p_vaddr;
436	seg->end = phdr.p_vaddr + phdr.p_memsz;
437	seg->offset = phdr.p_offset;
438	seg->is_exec = phdr.p_flags & PF_X;
439	}
440
441	if (*seg_cnt == `0`) {
442	pr_warn("usdt: failed to find PT_LOAD program headers in '%s'\n", path);
443	return -ESRCH;
444	}
445
446	qsort(segs, seg_cnt, sizeof(**segs), cmp_elf_segs);
447	return `0`;
448	}
449
450	static int parse_vma_segs(int pid, const char lib_path, struct* elf_seg *segs, size_t seg_cnt)
451	{
452	char path[PATH_MAX], line[PATH_MAX], mode[`16`];
453	size_t seg_start, seg_end, seg_off;
454	struct elf_seg *seg;
455	int tmp_pid, i, err;
456	FILE *f;
457
458	*seg_cnt = `0`;
459
460	/ Handle containerized binaries only accessible from*
461	* /proc/<pid>/root/<path>. They will be reported as just /<path> in
462	* /proc/<pid>/maps.
463	*/
464	if (sscanf(lib_path, "/proc/%d/root%s", &tmp_pid, path) == `2` && pid == tmp_pid)
465	goto proceed;
466
467	if (!realpath(lib_path, path)) {
468	pr_warn("usdt: failed to get absolute path of '%s' (err %d), using path as is...\n",
469	lib_path, -errno);
470	libbpf_strlcpy(dst: path, src: lib_path, sz: sizeof(path));
471	}
472
473	proceed:
474	sprintf(buf: line, fmt: "/proc/%d/maps", pid);
475	f = fopen(line, "re");
476	if (!f) {
477	err = -errno;
478	pr_warn("usdt: failed to open '%s' to get base addr of '%s': %d\n",
479	line, lib_path, err);
480	return err;
481	}
482
483	/ We need to handle lines with no path at the end:*
484	*
485	* 7f5c6f5d1000-7f5c6f5d3000 rw-p 001c7000 08:04 21238613 /usr/lib64/libc-2.17.so
486	* 7f5c6f5d3000-7f5c6f5d8000 rw-p 00000000 00:00 0
487	* 7f5c6f5d8000-7f5c6f5d9000 r-xp 00000000 103:01 362990598 /data/users/andriin/linux/tools/bpf/usdt/libhello_usdt.so
488	*/
489	while (fscanf(f, "%zx-%zx %s %zx %s %d%[^\n]\n",
490	&seg_start, &seg_end, mode, &seg_off, line) == `5`) {
491	void *tmp;
492
493	/ to handle no path case (see above) we need to capture line*
494	* without skipping any whitespaces. So we need to strip
495	* leading whitespaces manually here
496	*/
497	i = `0`;
498	while (isblank(line[i]))
499	i++;
500	if (strcmp(line + i, path) != `0`)
501	continue;
502
503	pr_debug("usdt: discovered segment for lib '%s': addrs %zx-%zx mode %s offset %zx\n",
504	path, seg_start, seg_end, mode, seg_off);
505
506	/ ignore non-executable sections for shared libs /
507	if (mode[`2`] != `'x'`)
508	continue;
509
510	tmp = libbpf_reallocarray(ptr: segs, nmemb: seg_cnt + `1`, size: sizeof(**segs));
511	if (!tmp) {
512	err = -ENOMEM;
513	goto err_out;
514	}
515
516	*segs = tmp;
517	seg = segs + seg_cnt;
518	*seg_cnt += `1`;
519
520	seg->start = seg_start;
521	seg->end = seg_end;
522	seg->offset = seg_off;
523	seg->is_exec = true;
524	}
525
526	if (*seg_cnt == `0`) {
527	pr_warn("usdt: failed to find '%s' (resolved to '%s') within PID %d memory mappings\n",
528	lib_path, path, pid);
529	err = -ESRCH;
530	goto err_out;
531	}
532
533	qsort(segs, seg_cnt, sizeof(**segs), cmp_elf_segs);
534	err = `0`;
535	err_out:
536	fclose(f);
537	return err;
538	}
539
540	static struct elf_seg find_elf_seg(struct* elf_seg segs, size_t seg_cnt, long* virtaddr)
541	{
542	struct elf_seg *seg;
543	int i;
544
545	/ for ELF binaries (both executables and shared libraries), we are*
546	* given virtual address (absolute for executables, relative for
547	* libraries) which should match address range of [seg_start, seg_end)
548	*/
549	for (i = `0`, seg = segs; i < seg_cnt; i++, seg++) {
550	if (seg->start <= virtaddr && virtaddr < seg->end)
551	return seg;
552	}
553	return NULL;
554	}
555
556	static struct elf_seg find_vma_seg(struct* elf_seg segs, size_t seg_cnt, long* offset)
557	{
558	struct elf_seg *seg;
559	int i;
560
561	/ for VMA segments from /proc/<pid>/maps file, provided "address" is*
562	* actually a file offset, so should be fall within logical
563	* offset-based range of [offset_start, offset_end)
564	*/
565	for (i = `0`, seg = segs; i < seg_cnt; i++, seg++) {
566	if (seg->offset <= offset && offset < seg->offset + (seg->end - seg->start))
567	return seg;
568	}
569	return NULL;
570	}
571
572	static int parse_usdt_note(Elf elf, const* char path, GElf_Nhdr nhdr,
573	const char *data, size_t name_off, size_t desc_off,
574	struct usdt_note *usdt_note);
575
576	static int parse_usdt_spec(struct usdt_spec spec, const* struct usdt_note *note, __u64 usdt_cookie);
577
578	static int collect_usdt_targets(struct usdt_manager man, Elf elf, const char *path, pid_t pid,
579	const char usdt_provider, const* char *usdt_name, __u64 usdt_cookie,
580	struct usdt_target *out_targets, size_t out_target_cnt)
581	{
582	size_t off, name_off, desc_off, seg_cnt = `0`, vma_seg_cnt = `0`, target_cnt = `0`;
583	struct elf_seg segs = NULL, vma_segs = NULL;
584	struct usdt_target targets = NULL, target;
585	long base_addr = `0`;
586	Elf_Scn notes_scn, base_scn;
587	GElf_Shdr base_shdr, notes_shdr;
588	GElf_Ehdr ehdr;
589	GElf_Nhdr nhdr;
590	Elf_Data *data;
591	int err;
592
593	*out_targets = NULL;
594	*out_target_cnt = `0`;
595
596	err = find_elf_sec_by_name(elf, USDT_NOTE_SEC, &notes_shdr, &notes_scn);
597	if (err) {
598	pr_warn("usdt: no USDT notes section (%s) found in '%s'\n", USDT_NOTE_SEC, path);
599	return err;
600	}
601
602	if (notes_shdr.sh_type != SHT_NOTE \|\| !gelf_getehdr(elf, &ehdr)) {
603	pr_warn("usdt: invalid USDT notes section (%s) in '%s'\n", USDT_NOTE_SEC, path);
604	return -EINVAL;
605	}
606
607	err = parse_elf_segs(elf, path, &segs, &seg_cnt);
608	if (err) {
609	pr_warn("usdt: failed to process ELF program segments for '%s': %d\n", path, err);
610	goto err_out;
611	}
612
613	/ .stapsdt.base ELF section is optional, but is used for prelink*
614	* offset compensation (see a big comment further below)
615	*/
616	if (find_elf_sec_by_name(elf, USDT_BASE_SEC, &base_shdr, &base_scn) == `0`)
617	base_addr = base_shdr.sh_addr;
618
619	data = elf_getdata(notes_scn, `0`);
620	off = `0`;
621	while ((off = gelf_getnote(data, off, &nhdr, &name_off, &desc_off)) > `0`) {
622	long usdt_abs_ip, usdt_rel_ip, usdt_sema_off = `0`;
623	struct usdt_note note;
624	struct elf_seg *seg = NULL;
625	void *tmp;
626
627	err = parse_usdt_note(elf, path, &nhdr, data->d_buf, name_off, desc_off, &note);
628	if (err)
629	goto err_out;
630
631	if (strcmp(note.provider, usdt_provider) != `0` \|\| strcmp(note.name, usdt_name) != `0`)
632	continue;
633
634	/ We need to compensate "prelink effect". See [0] for details,*
635	* relevant parts quoted here:
636	*
637	* Each SDT probe also expands into a non-allocated ELF note. You can
638	* find this by looking at SHT_NOTE sections and decoding the format;
639	* see below for details. Because the note is non-allocated, it means
640	* there is no runtime cost, and also preserved in both stripped files
641	* and .debug files.
642	*
643	* However, this means that prelink won't adjust the note's contents
644	* for address offsets. Instead, this is done via the .stapsdt.base
645	* section. This is a special section that is added to the text. We
646	* will only ever have one of these sections in a final link and it
647	* will only ever be one byte long. Nothing about this section itself
648	* matters, we just use it as a marker to detect prelink address
649	* adjustments.
650	*
651	* Each probe note records the link-time address of the .stapsdt.base
652	* section alongside the probe PC address. The decoder compares the
653	* base address stored in the note with the .stapsdt.base section's
654	* sh_addr. Initially these are the same, but the section header will
655	* be adjusted by prelink. So the decoder applies the difference to
656	* the probe PC address to get the correct prelinked PC address; the
657	* same adjustment is applied to the semaphore address, if any.
658	*
659	* [0] https://sourceware.org/systemtap/wiki/UserSpaceProbeImplementation
660	*/
661	usdt_abs_ip = note.loc_addr;
662	if (base_addr)
663	usdt_abs_ip += base_addr - note.base_addr;
664
665	/ When attaching uprobes (which is what USDTs basically are)*
666	* kernel expects file offset to be specified, not a relative
667	* virtual address, so we need to translate virtual address to
668	* file offset, for both ET_EXEC and ET_DYN binaries.
669	*/
670	seg = find_elf_seg(segs, seg_cnt, virtaddr: usdt_abs_ip);
671	if (!seg) {
672	err = -ESRCH;
673	pr_warn("usdt: failed to find ELF program segment for '%s:%s' in '%s' at IP 0x%lx\n",
674	usdt_provider, usdt_name, path, usdt_abs_ip);
675	goto err_out;
676	}
677	if (!seg->is_exec) {
678	err = -ESRCH;
679	pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx) for '%s:%s' at IP 0x%lx is not executable\n",
680	path, seg->start, seg->end, usdt_provider, usdt_name,
681	usdt_abs_ip);
682	goto err_out;
683	}
684	/ translate from virtual address to file offset /
685	usdt_rel_ip = usdt_abs_ip - seg->start + seg->offset;
686
687	if (ehdr.e_type == ET_DYN && !man->has_bpf_cookie) {
688	/ If we don't have BPF cookie support but need to*
689	* attach to a shared library, we'll need to know and
690	* record absolute addresses of attach points due to
691	* the need to lookup USDT spec by absolute IP of
692	* triggered uprobe. Doing this resolution is only
693	* possible when we have a specific PID of the process
694	* that's using specified shared library. BPF cookie
695	* removes the absolute address limitation as we don't
696	* need to do this lookup (we just use BPF cookie as
697	* an index of USDT spec), so for newer kernels with
698	* BPF cookie support libbpf supports USDT attachment
699	* to shared libraries with no PID filter.
700	*/
701	if (pid < `0`) {
702	pr_warn("usdt: attaching to shared libraries without specific PID is not supported on current kernel\n");
703	err = -ENOTSUP;
704	goto err_out;
705	}
706
707	/ vma_segs are lazily initialized only if necessary /
708	if (vma_seg_cnt == `0`) {
709	err = parse_vma_segs(pid, lib_path: path, segs: &vma_segs, seg_cnt: &vma_seg_cnt);
710	if (err) {
711	pr_warn("usdt: failed to get memory segments in PID %d for shared library '%s': %d\n",
712	pid, path, err);
713	goto err_out;
714	}
715	}
716
717	seg = find_vma_seg(segs: vma_segs, seg_cnt: vma_seg_cnt, offset: usdt_rel_ip);
718	if (!seg) {
719	err = -ESRCH;
720	pr_warn("usdt: failed to find shared lib memory segment for '%s:%s' in '%s' at relative IP 0x%lx\n",
721	usdt_provider, usdt_name, path, usdt_rel_ip);
722	goto err_out;
723	}
724
725	usdt_abs_ip = seg->start - seg->offset + usdt_rel_ip;
726	}
727
728	pr_debug("usdt: probe for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved abs_ip 0x%lx rel_ip 0x%lx) args '%s' in segment [0x%lx, 0x%lx) at offset 0x%lx\n",
729	usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ", path,
730	note.loc_addr, note.base_addr, usdt_abs_ip, usdt_rel_ip, note.args,
731	seg ? seg->start : `0`, seg ? seg->end : `0`, seg ? seg->offset : `0`);
732
733	/ Adjust semaphore address to be a file offset /
734	if (note.sema_addr) {
735	if (!man->has_sema_refcnt) {
736	pr_warn("usdt: kernel doesn't support USDT semaphore refcounting for '%s:%s' in '%s'\n",
737	usdt_provider, usdt_name, path);
738	err = -ENOTSUP;
739	goto err_out;
740	}
741
742	seg = find_elf_seg(segs, seg_cnt, virtaddr: note.sema_addr);
743	if (!seg) {
744	err = -ESRCH;
745	pr_warn("usdt: failed to find ELF loadable segment with semaphore of '%s:%s' in '%s' at 0x%lx\n",
746	usdt_provider, usdt_name, path, note.sema_addr);
747	goto err_out;
748	}
749	if (seg->is_exec) {
750	err = -ESRCH;
751	pr_warn("usdt: matched ELF binary '%s' segment [0x%lx, 0x%lx] for semaphore of '%s:%s' at 0x%lx is executable\n",
752	path, seg->start, seg->end, usdt_provider, usdt_name,
753	note.sema_addr);
754	goto err_out;
755	}
756
757	usdt_sema_off = note.sema_addr - seg->start + seg->offset;
758
759	pr_debug("usdt: sema for '%s:%s' in %s '%s': addr 0x%lx base 0x%lx (resolved 0x%lx) in segment [0x%lx, 0x%lx] at offset 0x%lx\n",
760	usdt_provider, usdt_name, ehdr.e_type == ET_EXEC ? "exec" : "lib ",
761	path, note.sema_addr, note.base_addr, usdt_sema_off,
762	seg->start, seg->end, seg->offset);
763	}
764
765	/ Record adjusted addresses and offsets and parse USDT spec /
766	tmp = libbpf_reallocarray(ptr: targets, nmemb: target_cnt + `1`, size: sizeof(*targets));
767	if (!tmp) {
768	err = -ENOMEM;
769	goto err_out;
770	}
771	targets = tmp;
772
773	target = &targets[target_cnt];
774	memset(target, `0`, sizeof(*target));
775
776	target->abs_ip = usdt_abs_ip;
777	target->rel_ip = usdt_rel_ip;
778	target->sema_off = usdt_sema_off;
779
780	/ notes.args references strings from ELF itself, so they can*
781	* be referenced safely until elf_end() call
782	*/
783	target->spec_str = note.args;
784
785	err = parse_usdt_spec(spec: &target->spec, note: &note, usdt_cookie);
786	if (err)
787	goto err_out;
788
789	target_cnt++;
790	}
791
792	*out_targets = targets;
793	*out_target_cnt = target_cnt;
794	err = target_cnt;
795
796	err_out:
797	free(segs);
798	free(vma_segs);
799	if (err < `0`)
800	free(targets);
801	return err;
802	}
803
804	struct bpf_link_usdt {
805	struct bpf_link link;
806
807	struct usdt_manager *usdt_man;
808
809	size_t spec_cnt;
810	int *spec_ids;
811
812	size_t uprobe_cnt;
813	struct {
814	long abs_ip;
815	struct bpf_link *link;
816	} *uprobes;
817
818	struct bpf_link *multi_link;
819	};
820
821	static int bpf_link_usdt_detach(struct bpf_link *link)
822	{
823	struct bpf_link_usdt usdt_link = container_of(link, struct* bpf_link_usdt, link);
824	struct usdt_manager *man = usdt_link->usdt_man;
825	int i;
826
827	bpf_link__destroy(link: usdt_link->multi_link);
828
829	/ When having multi_link, uprobe_cnt is 0 /
830	for (i = `0`; i < usdt_link->uprobe_cnt; i++) {
831	/ detach underlying uprobe link /
832	bpf_link__destroy(link: usdt_link->uprobes[i].link);
833	/ there is no need to update specs map because it will be*
834	* unconditionally overwritten on subsequent USDT attaches,
835	* but if BPF cookies are not used we need to remove entry
836	* from ip_to_spec_id map, otherwise we'll run into false
837	* conflicting IP errors
838	*/
839	if (!man->has_bpf_cookie) {
840	/ not much we can do about errors here /
841	(void)bpf_map_delete_elem(fd: bpf_map__fd(map: man->ip_to_spec_id_map),
842	key: &usdt_link->uprobes[i].abs_ip);
843	}
844	}
845
846	/ try to return the list of previously used spec IDs to usdt_manager*
847	* for future reuse for subsequent USDT attaches
848	*/
849	if (!man->free_spec_ids) {
850	/ if there were no free spec IDs yet, just transfer our IDs /
851	man->free_spec_ids = usdt_link->spec_ids;
852	man->free_spec_cnt = usdt_link->spec_cnt;
853	usdt_link->spec_ids = NULL;
854	} else {
855	/ otherwise concat IDs /
856	size_t new_cnt = man->free_spec_cnt + usdt_link->spec_cnt;
857	int *new_free_ids;
858
859	new_free_ids = libbpf_reallocarray(ptr: man->free_spec_ids, nmemb: new_cnt,
860	size: sizeof(*new_free_ids));
861	/ If we couldn't resize free_spec_ids, we'll just leak*
862	* a bunch of free IDs; this is very unlikely to happen and if
863	* system is so exhausted on memory, it's the least of user's
864	* concerns, probably.
865	* So just do our best here to return those IDs to usdt_manager.
866	* Another edge case when we can legitimately get NULL is when
867	* new_cnt is zero, which can happen in some edge cases, so we
868	* need to be careful about that.
869	*/
870	if (new_free_ids \|\| new_cnt == `0`) {
871	memcpy(new_free_ids + man->free_spec_cnt, usdt_link->spec_ids,
872	usdt_link->spec_cnt * sizeof(*usdt_link->spec_ids));
873	man->free_spec_ids = new_free_ids;
874	man->free_spec_cnt = new_cnt;
875	}
876	}
877
878	return `0`;
879	}
880
881	static void bpf_link_usdt_dealloc(struct bpf_link *link)
882	{
883	struct bpf_link_usdt usdt_link = container_of(link, struct* bpf_link_usdt, link);
884
885	free(usdt_link->spec_ids);
886	free(usdt_link->uprobes);
887	free(usdt_link);
888	}
889
/* hashmap hash callback: keys are C strings (USDT spec strings) stored as long */
static size_t specs_hash_fn(long key, void *ctx)
{
	return str_hash((char *)key);
}
894
/* hashmap equality callback: compare two keys as NUL-terminated C strings */
static bool specs_equal_fn(long key1, long key2, void *ctx)
{
	return strcmp((char *)key1, (char *)key2) == 0;
}
899
900	static int allocate_spec_id(struct usdt_manager man, struct* hashmap *specs_hash,
901	struct bpf_link_usdt link, struct* usdt_target *target,
902	int spec_id, bool is_new)
903	{
904	long tmp;
905	void *new_ids;
906	int err;
907
908	/ check if we already allocated spec ID for this spec string /
909	if (hashmap__find(specs_hash, target->spec_str, &tmp)) {
910	*spec_id = tmp;
911	*is_new = false;
912	return `0`;
913	}
914
915	/ otherwise it's a new ID that needs to be set up in specs map and*
916	* returned back to usdt_manager when USDT link is detached
917	*/
918	new_ids = libbpf_reallocarray(ptr: link->spec_ids, nmemb: link->spec_cnt + `1`, size: sizeof(*link->spec_ids));
919	if (!new_ids)
920	return -ENOMEM;
921	link->spec_ids = new_ids;
922
923	/ get next free spec ID, giving preference to free list, if not empty /
924	if (man->free_spec_cnt) {
925	*spec_id = man->free_spec_ids[man->free_spec_cnt - `1`];
926
927	/ cache spec ID for current spec string for future lookups /
928	err = hashmap__add(specs_hash, target->spec_str, *spec_id);
929	if (err)
930	return err;
931
932	man->free_spec_cnt--;
933	} else {
934	/ don't allocate spec ID bigger than what fits in specs map /
935	if (man->next_free_spec_id >= bpf_map__max_entries(map: man->specs_map))
936	return -E2BIG;
937
938	*spec_id = man->next_free_spec_id;
939
940	/ cache spec ID for current spec string for future lookups /
941	err = hashmap__add(specs_hash, target->spec_str, *spec_id);
942	if (err)
943	return err;
944
945	man->next_free_spec_id++;
946	}
947
948	/ remember new spec ID in the link for later return back to free list on detach /
949	link->spec_ids[link->spec_cnt] = *spec_id;
950	link->spec_cnt++;
951	*is_new = true;
952	return `0`;
953	}
954
955	struct bpf_link usdt_manager_attach_usdt(struct* usdt_manager man, const* struct bpf_program *prog,
956	pid_t pid, const char *path,
957	const char usdt_provider, const* char *usdt_name,
958	__u64 usdt_cookie)
959	{
960	unsigned long offsets = NULL, ref_ctr_offsets = NULL;
961	int i, err, spec_map_fd, ip_map_fd;
962	LIBBPF_OPTS(bpf_uprobe_opts, opts);
963	struct hashmap *specs_hash = NULL;
964	struct bpf_link_usdt *link = NULL;
965	struct usdt_target *targets = NULL;
966	__u64 *cookies = NULL;
967	struct elf_fd elf_fd;
968	size_t target_cnt;
969
970	spec_map_fd = bpf_map__fd(map: man->specs_map);
971	ip_map_fd = bpf_map__fd(map: man->ip_to_spec_id_map);
972
973	err = elf_open(binary_path: path, elf_fd: &elf_fd);
974	if (err)
975	return libbpf_err_ptr(err);
976
977	err = sanity_check_usdt_elf(elf_fd.elf, path);
978	if (err)
979	goto err_out;
980
981	/ normalize PID filter /
982	if (pid < `0`)
983	pid = -`1`;
984	else if (pid == `0`)
985	pid = getpid();
986
987	/ discover USDT in given binary, optionally limiting*
988	* activations to a given PID, if pid > 0
989	*/
990	err = collect_usdt_targets(man, elf_fd.elf, path, pid, usdt_provider, usdt_name,
991	usdt_cookie, &targets, &target_cnt);
992	if (err <= `0`) {
993	err = (err == `0`) ? -ENOENT : err;
994	goto err_out;
995	}
996
997	specs_hash = hashmap__new(hash_fn: specs_hash_fn, equal_fn: specs_equal_fn, NULL);
998	if (IS_ERR(ptr: specs_hash)) {
999	err = PTR_ERR(ptr: specs_hash);
1000	goto err_out;
1001	}
1002
1003	link = calloc(`1`, sizeof(*link));
1004	if (!link) {
1005	err = -ENOMEM;
1006	goto err_out;
1007	}
1008
1009	link->usdt_man = man;
1010	link->link.detach = &bpf_link_usdt_detach;
1011	link->link.dealloc = &bpf_link_usdt_dealloc;
1012
1013	if (man->has_uprobe_multi) {
1014	offsets = calloc(target_cnt, sizeof(*offsets));
1015	cookies = calloc(target_cnt, sizeof(*cookies));
1016	ref_ctr_offsets = calloc(target_cnt, sizeof(*ref_ctr_offsets));
1017
1018	if (!offsets \|\| !ref_ctr_offsets \|\| !cookies) {
1019	err = -ENOMEM;
1020	goto err_out;
1021	}
1022	} else {
1023	link->uprobes = calloc(target_cnt, sizeof(*link->uprobes));
1024	if (!link->uprobes) {
1025	err = -ENOMEM;
1026	goto err_out;
1027	}
1028	}
1029
1030	for (i = `0`; i < target_cnt; i++) {
1031	struct usdt_target *target = &targets[i];
1032	struct bpf_link *uprobe_link;
1033	bool is_new;
1034	int spec_id;
1035
1036	/ Spec ID can be either reused or newly allocated. If it is*
1037	* newly allocated, we'll need to fill out spec map, otherwise
1038	* entire spec should be valid and can be just used by a new
1039	* uprobe. We reuse spec when USDT arg spec is identical. We
1040	* also never share specs between two different USDT
1041	* attachments ("links"), so all the reused specs already
1042	* share USDT cookie value implicitly.
1043	*/
1044	err = allocate_spec_id(man, specs_hash, link, target, spec_id: &spec_id, is_new: &is_new);
1045	if (err)
1046	goto err_out;
1047
1048	if (is_new && bpf_map_update_elem(fd: spec_map_fd, key: &spec_id, value: &target->spec, flags: BPF_ANY)) {
1049	err = -errno;
1050	pr_warn("usdt: failed to set USDT spec #%d for '%s:%s' in '%s': %d\n",
1051	spec_id, usdt_provider, usdt_name, path, err);
1052	goto err_out;
1053	}
1054	if (!man->has_bpf_cookie &&
1055	bpf_map_update_elem(fd: ip_map_fd, key: &target->abs_ip, value: &spec_id, flags: BPF_NOEXIST)) {
1056	err = -errno;
1057	if (err == -EEXIST) {
1058	pr_warn("usdt: IP collision detected for spec #%d for '%s:%s' in '%s'\n",
1059	spec_id, usdt_provider, usdt_name, path);
1060	} else {
1061	pr_warn("usdt: failed to map IP 0x%lx to spec #%d for '%s:%s' in '%s': %d\n",
1062	target->abs_ip, spec_id, usdt_provider, usdt_name,
1063	path, err);
1064	}
1065	goto err_out;
1066	}
1067
1068	if (man->has_uprobe_multi) {
1069	offsets[i] = target->rel_ip;
1070	ref_ctr_offsets[i] = target->sema_off;
1071	cookies[i] = spec_id;
1072	} else {
1073	opts.ref_ctr_offset = target->sema_off;
1074	opts.bpf_cookie = man->has_bpf_cookie ? spec_id : `0`;
1075	uprobe_link = bpf_program__attach_uprobe_opts(prog, pid, binary_path: path,
1076	func_offset: target->rel_ip, opts: &opts);
1077	err = libbpf_get_error(ptr: uprobe_link);
1078	if (err) {
1079	pr_warn("usdt: failed to attach uprobe #%d for '%s:%s' in '%s': %d\n",
1080	i, usdt_provider, usdt_name, path, err);
1081	goto err_out;
1082	}
1083
1084	link->uprobes[i].link = uprobe_link;
1085	link->uprobes[i].abs_ip = target->abs_ip;
1086	link->uprobe_cnt++;
1087	}
1088	}
1089
1090	if (man->has_uprobe_multi) {
1091	LIBBPF_OPTS(bpf_uprobe_multi_opts, opts_multi,
1092	.ref_ctr_offsets = ref_ctr_offsets,
1093	.offsets = offsets,
1094	.cookies = cookies,
1095	.cnt = target_cnt,
1096	);
1097
1098	link->multi_link = bpf_program__attach_uprobe_multi(prog, pid, binary_path: path,
1099	NULL, opts: &opts_multi);
1100	if (!link->multi_link) {
1101	err = -errno;
1102	pr_warn("usdt: failed to attach uprobe multi for '%s:%s' in '%s': %d\n",
1103	usdt_provider, usdt_name, path, err);
1104	goto err_out;
1105	}
1106
1107	free(offsets);
1108	free(ref_ctr_offsets);
1109	free(cookies);
1110	}
1111
1112	free(targets);
1113	hashmap__free(map: specs_hash);
1114	elf_close(elf_fd: &elf_fd);
1115	return &link->link;
1116
1117	err_out:
1118	free(offsets);
1119	free(ref_ctr_offsets);
1120	free(cookies);
1121
1122	if (link)
1123	bpf_link__destroy(link: &link->link);
1124	free(targets);
1125	hashmap__free(map: specs_hash);
1126	elf_close(elf_fd: &elf_fd);
1127	return libbpf_err_ptr(err);
1128	}
1129
1130	/ Parse out USDT ELF note from '.note.stapsdt' section.*
1131	* Logic inspired by perf's code.
1132	*/
1133	static int parse_usdt_note(Elf elf, const* char path, GElf_Nhdr nhdr,
1134	const char *data, size_t name_off, size_t desc_off,
1135	struct usdt_note *note)
1136	{
1137	const char provider, name, *args;
1138	long addrs[`3`];
1139	size_t len;
1140
1141	/ sanity check USDT note name and type first /
1142	if (strncmp(data + name_off, USDT_NOTE_NAME, nhdr->n_namesz) != `0`)
1143	return -EINVAL;
1144	if (nhdr->n_type != USDT_NOTE_TYPE)
1145	return -EINVAL;
1146
1147	/ sanity check USDT note contents ("description" in ELF terminology) /
1148	len = nhdr->n_descsz;
1149	data = data + desc_off;
1150
1151	/ +3 is the very minimum required to store three empty strings /
1152	if (len < sizeof(addrs) + `3`)
1153	return -EINVAL;
1154
1155	/ get location, base, and semaphore addrs /
1156	memcpy(&addrs, data, sizeof(addrs));
1157
1158	/ parse string fields: provider, name, args /
1159	provider = data + sizeof(addrs);
1160
1161	name = (const char *)memchr(p: provider, c: `'\0'`, size: data + len - provider);
1162	if (!name) / non-zero-terminated provider /
1163	return -EINVAL;
1164	name++;
1165	if (name >= data + len \|\| name == `'\0'`) /* missing or empty name /
1166	return -EINVAL;
1167
1168	args = memchr(p: name, c: `'\0'`, size: data + len - name);
1169	if (!args) / non-zero-terminated name /
1170	return -EINVAL;
1171	++args;
1172	if (args >= data + len) / missing arguments spec /
1173	return -EINVAL;
1174
1175	note->provider = provider;
1176	note->name = name;
1177	if (args == `'\0'` \|\| args == `':'`)
1178	note->args = "";
1179	else
1180	note->args = args;
1181	note->loc_addr = addrs[`0`];
1182	note->base_addr = addrs[`1`];
1183	note->sema_addr = addrs[`2`];
1184
1185	return `0`;
1186	}
1187
1188	static int parse_usdt_arg(const char arg_str, int* arg_num, struct usdt_arg_spec arg, int* *arg_sz);
1189
1190	static int parse_usdt_spec(struct usdt_spec spec, const* struct usdt_note *note, __u64 usdt_cookie)
1191	{
1192	struct usdt_arg_spec *arg;
1193	const char *s;
1194	int arg_sz, len;
1195
1196	spec->usdt_cookie = usdt_cookie;
1197	spec->arg_cnt = `0`;
1198
1199	s = note->args;
1200	while (s[`0`]) {
1201	if (spec->arg_cnt >= USDT_MAX_ARG_CNT) {
1202	pr_warn("usdt: too many USDT arguments (> %d) for '%s:%s' with args spec '%s'\n",
1203	USDT_MAX_ARG_CNT, note->provider, note->name, note->args);
1204	return -E2BIG;
1205	}
1206
1207	arg = &spec->args[spec->arg_cnt];
1208	len = parse_usdt_arg(arg_str: s, arg_num: spec->arg_cnt, arg, arg_sz: &arg_sz);
1209	if (len < `0`)
1210	return len;
1211
1212	arg->arg_signed = arg_sz < `0`;
1213	if (arg_sz < `0`)
1214	arg_sz = -arg_sz;
1215
1216	switch (arg_sz) {
1217	case `1`: case `2`: case `4`: case `8`:
1218	arg->arg_bitshift = `64` - arg_sz * `8`;
1219	break;
1220	default:
1221	pr_warn("usdt: unsupported arg #%d (spec '%s') size: %d\n",
1222	spec->arg_cnt, s, arg_sz);
1223	return -EINVAL;
1224	}
1225
1226	s += len;
1227	spec->arg_cnt++;
1228	}
1229
1230	return `0`;
1231	}
1232
/* Architecture-specific logic for parsing USDT argument location specs */
1234
1235	#if defined(__x86_64__) \|\| defined(__i386__)
1236
1237	static int calc_pt_regs_off(const char *reg_name)
1238	{
1239	static struct {
1240	const char *names[`4`];
1241	size_t pt_regs_off;
1242	} reg_map[] = {
1243	#ifdef __x86_64__
1244	#define reg_off(reg64, reg32) offsetof(struct pt_regs, reg64)
1245	#else
1246	#define reg_off(reg64, reg32) offsetof(struct pt_regs, reg32)
1247	#endif
1248	{ {"rip", "eip", "", ""}, reg_off(rip, eip) },
1249	{ {"rax", "eax", "ax", "al"}, reg_off(rax, eax) },
1250	{ {"rbx", "ebx", "bx", "bl"}, reg_off(rbx, ebx) },
1251	{ {"rcx", "ecx", "cx", "cl"}, reg_off(rcx, ecx) },
1252	{ {"rdx", "edx", "dx", "dl"}, reg_off(rdx, edx) },
1253	{ {"rsi", "esi", "si", "sil"}, reg_off(rsi, esi) },
1254	{ {"rdi", "edi", "di", "dil"}, reg_off(rdi, edi) },
1255	{ {"rbp", "ebp", "bp", "bpl"}, reg_off(rbp, ebp) },
1256	{ {"rsp", "esp", "sp", "spl"}, reg_off(rsp, esp) },
1257	#undef reg_off
1258	#ifdef __x86_64__
1259	{ {"r8", "r8d", "r8w", "r8b"}, offsetof(struct pt_regs, r8) },
1260	{ {"r9", "r9d", "r9w", "r9b"}, offsetof(struct pt_regs, r9) },
1261	{ {"r10", "r10d", "r10w", "r10b"}, offsetof(struct pt_regs, r10) },
1262	{ {"r11", "r11d", "r11w", "r11b"}, offsetof(struct pt_regs, r11) },
1263	{ {"r12", "r12d", "r12w", "r12b"}, offsetof(struct pt_regs, r12) },
1264	{ {"r13", "r13d", "r13w", "r13b"}, offsetof(struct pt_regs, r13) },
1265	{ {"r14", "r14d", "r14w", "r14b"}, offsetof(struct pt_regs, r14) },
1266	{ {"r15", "r15d", "r15w", "r15b"}, offsetof(struct pt_regs, r15) },
1267	#endif
1268	};
1269	int i, j;
1270
1271	for (i = `0`; i < ARRAY_SIZE(reg_map); i++) {
1272	for (j = `0`; j < ARRAY_SIZE(reg_map[i].names); j++) {
1273	if (strcmp(reg_name, reg_map[i].names[j]) == `0`)
1274	return reg_map[i].pt_regs_off;
1275	}
1276	}
1277
1278	pr_warn("usdt: unrecognized register '%s'\n", reg_name);
1279	return -ENOENT;
1280	}
1281
1282	static int parse_usdt_arg(const char arg_str, int* arg_num, struct usdt_arg_spec arg, int* *arg_sz)
1283	{
1284	char reg_name[`16`];
1285	int len, reg_off;
1286	long off;
1287
1288	if (sscanf(arg_str, " %d @ %ld ( %%%15[^)] ) %n", arg_sz, &off, reg_name, &len) == `3`) {
1289	/ Memory dereference case, e.g., -4@-20(%rbp) /
1290	arg->arg_type = USDT_ARG_REG_DEREF;
1291	arg->val_off = off;
1292	reg_off = calc_pt_regs_off(reg_name);
1293	if (reg_off < `0`)
1294	return reg_off;
1295	arg->reg_off = reg_off;
1296	} else if (sscanf(arg_str, " %d @ ( %%%15[^)] ) %n", arg_sz, reg_name, &len) == `2`) {
1297	/ Memory dereference case without offset, e.g., 8@(%rsp) /
1298	arg->arg_type = USDT_ARG_REG_DEREF;
1299	arg->val_off = `0`;
1300	reg_off = calc_pt_regs_off(reg_name);
1301	if (reg_off < `0`)
1302	return reg_off;
1303	arg->reg_off = reg_off;
1304	} else if (sscanf(arg_str, " %d @ %%%15s %n", arg_sz, reg_name, &len) == `2`) {
1305	/ Register read case, e.g., -4@%eax /
1306	arg->arg_type = USDT_ARG_REG;
1307	arg->val_off = `0`;
1308
1309	reg_off = calc_pt_regs_off(reg_name);
1310	if (reg_off < `0`)
1311	return reg_off;
1312	arg->reg_off = reg_off;
1313	} else if (sscanf(arg_str, " %d @ $%ld %n", arg_sz, &off, &len) == `2`) {
1314	/ Constant value case, e.g., 4@$71 /
1315	arg->arg_type = USDT_ARG_CONST;
1316	arg->val_off = off;
1317	arg->reg_off = `0`;
1318	} else {
1319	pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str);
1320	return -EINVAL;
1321	}
1322
1323	return len;
1324	}
1325
1326	#elif defined(__s390x__)
1327
1328	/ Do not support __s390__ for now, since user_pt_regs is broken with -m31. /
1329
1330	static int parse_usdt_arg(const char arg_str, int* arg_num, struct usdt_arg_spec arg, int* *arg_sz)
1331	{
1332	unsigned int reg;
1333	int len;
1334	long off;
1335
1336	if (sscanf(arg_str, " %d @ %ld ( %%r%u ) %n", arg_sz, &off, &reg, &len) == `3`) {
1337	/ Memory dereference case, e.g., -2@-28(%r15) /
1338	arg->arg_type = USDT_ARG_REG_DEREF;
1339	arg->val_off = off;
1340	if (reg > `15`) {
1341	pr_warn("usdt: unrecognized register '%%r%u'\n", reg);
1342	return -EINVAL;
1343	}
1344	arg->reg_off = offsetof(user_pt_regs, gprs[reg]);
1345	} else if (sscanf(arg_str, " %d @ %%r%u %n", arg_sz, &reg, &len) == `2`) {
1346	/ Register read case, e.g., -8@%r0 /
1347	arg->arg_type = USDT_ARG_REG;
1348	arg->val_off = `0`;
1349	if (reg > `15`) {
1350	pr_warn("usdt: unrecognized register '%%r%u'\n", reg);
1351	return -EINVAL;
1352	}
1353	arg->reg_off = offsetof(user_pt_regs, gprs[reg]);
1354	} else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == `2`) {
1355	/ Constant value case, e.g., 4@71 /
1356	arg->arg_type = USDT_ARG_CONST;
1357	arg->val_off = off;
1358	arg->reg_off = `0`;
1359	} else {
1360	pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str);
1361	return -EINVAL;
1362	}
1363
1364	return len;
1365	}
1366
1367	#elif defined(__aarch64__)
1368
1369	static int calc_pt_regs_off(const char *reg_name)
1370	{
1371	int reg_num;
1372
1373	if (sscanf(reg_name, "x%d", &reg_num) == `1`) {
1374	if (reg_num >= `0` && reg_num < `31`)
1375	return offsetof(struct user_pt_regs, regs[reg_num]);
1376	} else if (strcmp(reg_name, "sp") == `0`) {
1377	return offsetof(struct user_pt_regs, sp);
1378	}
1379	pr_warn("usdt: unrecognized register '%s'\n", reg_name);
1380	return -ENOENT;
1381	}
1382
1383	static int parse_usdt_arg(const char arg_str, int* arg_num, struct usdt_arg_spec arg, int* *arg_sz)
1384	{
1385	char reg_name[`16`];
1386	int len, reg_off;
1387	long off;
1388
1389	if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] , %ld ] %n", arg_sz, reg_name, &off, &len) == `3`) {
1390	/ Memory dereference case, e.g., -4@[sp, 96] /
1391	arg->arg_type = USDT_ARG_REG_DEREF;
1392	arg->val_off = off;
1393	reg_off = calc_pt_regs_off(reg_name);
1394	if (reg_off < `0`)
1395	return reg_off;
1396	arg->reg_off = reg_off;
1397	} else if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] ] %n", arg_sz, reg_name, &len) == `2`) {
1398	/ Memory dereference case, e.g., -4@[sp] /
1399	arg->arg_type = USDT_ARG_REG_DEREF;
1400	arg->val_off = `0`;
1401	reg_off = calc_pt_regs_off(reg_name);
1402	if (reg_off < `0`)
1403	return reg_off;
1404	arg->reg_off = reg_off;
1405	} else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == `2`) {
1406	/ Constant value case, e.g., 4@5 /
1407	arg->arg_type = USDT_ARG_CONST;
1408	arg->val_off = off;
1409	arg->reg_off = `0`;
1410	} else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == `2`) {
1411	/ Register read case, e.g., -8@x4 /
1412	arg->arg_type = USDT_ARG_REG;
1413	arg->val_off = `0`;
1414	reg_off = calc_pt_regs_off(reg_name);
1415	if (reg_off < `0`)
1416	return reg_off;
1417	arg->reg_off = reg_off;
1418	} else {
1419	pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str);
1420	return -EINVAL;
1421	}
1422
1423	return len;
1424	}
1425
1426	#elif defined(__riscv)
1427
1428	static int calc_pt_regs_off(const char *reg_name)
1429	{
1430	static struct {
1431	const char *name;
1432	size_t pt_regs_off;
1433	} reg_map[] = {
1434	{ "ra", offsetof(struct user_regs_struct, ra) },
1435	{ "sp", offsetof(struct user_regs_struct, sp) },
1436	{ "gp", offsetof(struct user_regs_struct, gp) },
1437	{ "tp", offsetof(struct user_regs_struct, tp) },
1438	{ "a0", offsetof(struct user_regs_struct, a0) },
1439	{ "a1", offsetof(struct user_regs_struct, a1) },
1440	{ "a2", offsetof(struct user_regs_struct, a2) },
1441	{ "a3", offsetof(struct user_regs_struct, a3) },
1442	{ "a4", offsetof(struct user_regs_struct, a4) },
1443	{ "a5", offsetof(struct user_regs_struct, a5) },
1444	{ "a6", offsetof(struct user_regs_struct, a6) },
1445	{ "a7", offsetof(struct user_regs_struct, a7) },
1446	{ "s0", offsetof(struct user_regs_struct, s0) },
1447	{ "s1", offsetof(struct user_regs_struct, s1) },
1448	{ "s2", offsetof(struct user_regs_struct, s2) },
1449	{ "s3", offsetof(struct user_regs_struct, s3) },
1450	{ "s4", offsetof(struct user_regs_struct, s4) },
1451	{ "s5", offsetof(struct user_regs_struct, s5) },
1452	{ "s6", offsetof(struct user_regs_struct, s6) },
1453	{ "s7", offsetof(struct user_regs_struct, s7) },
1454	{ "s8", offsetof(struct user_regs_struct, rv_s8) },
1455	{ "s9", offsetof(struct user_regs_struct, s9) },
1456	{ "s10", offsetof(struct user_regs_struct, s10) },
1457	{ "s11", offsetof(struct user_regs_struct, s11) },
1458	{ "t0", offsetof(struct user_regs_struct, t0) },
1459	{ "t1", offsetof(struct user_regs_struct, t1) },
1460	{ "t2", offsetof(struct user_regs_struct, t2) },
1461	{ "t3", offsetof(struct user_regs_struct, t3) },
1462	{ "t4", offsetof(struct user_regs_struct, t4) },
1463	{ "t5", offsetof(struct user_regs_struct, t5) },
1464	{ "t6", offsetof(struct user_regs_struct, t6) },
1465	};
1466	int i;
1467
1468	for (i = `0`; i < ARRAY_SIZE(reg_map); i++) {
1469	if (strcmp(reg_name, reg_map[i].name) == `0`)
1470	return reg_map[i].pt_regs_off;
1471	}
1472
1473	pr_warn("usdt: unrecognized register '%s'\n", reg_name);
1474	return -ENOENT;
1475	}
1476
1477	static int parse_usdt_arg(const char arg_str, int* arg_num, struct usdt_arg_spec arg, int* *arg_sz)
1478	{
1479	char reg_name[`16`];
1480	int len, reg_off;
1481	long off;
1482
1483	if (sscanf(arg_str, " %d @ %ld ( %15[a-z0-9] ) %n", arg_sz, &off, reg_name, &len) == `3`) {
1484	/ Memory dereference case, e.g., -8@-88(s0) /
1485	arg->arg_type = USDT_ARG_REG_DEREF;
1486	arg->val_off = off;
1487	reg_off = calc_pt_regs_off(reg_name);
1488	if (reg_off < `0`)
1489	return reg_off;
1490	arg->reg_off = reg_off;
1491	} else if (sscanf(arg_str, " %d @ %ld %n", arg_sz, &off, &len) == `2`) {
1492	/ Constant value case, e.g., 4@5 /
1493	arg->arg_type = USDT_ARG_CONST;
1494	arg->val_off = off;
1495	arg->reg_off = `0`;
1496	} else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == `2`) {
1497	/ Register read case, e.g., -8@a1 /
1498	arg->arg_type = USDT_ARG_REG;
1499	arg->val_off = `0`;
1500	reg_off = calc_pt_regs_off(reg_name);
1501	if (reg_off < `0`)
1502	return reg_off;
1503	arg->reg_off = reg_off;
1504	} else {
1505	pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str);
1506	return -EINVAL;
1507	}
1508
1509	return len;
1510	}
1511
1512	#elif defined(__arm__)
1513
1514	static int calc_pt_regs_off(const char *reg_name)
1515	{
1516	static struct {
1517	const char *name;
1518	size_t pt_regs_off;
1519	} reg_map[] = {
1520	{ "r0", offsetof(struct pt_regs, uregs[`0`]) },
1521	{ "r1", offsetof(struct pt_regs, uregs[`1`]) },
1522	{ "r2", offsetof(struct pt_regs, uregs[`2`]) },
1523	{ "r3", offsetof(struct pt_regs, uregs[`3`]) },
1524	{ "r4", offsetof(struct pt_regs, uregs[`4`]) },
1525	{ "r5", offsetof(struct pt_regs, uregs[`5`]) },
1526	{ "r6", offsetof(struct pt_regs, uregs[`6`]) },
1527	{ "r7", offsetof(struct pt_regs, uregs[`7`]) },
1528	{ "r8", offsetof(struct pt_regs, uregs[`8`]) },
1529	{ "r9", offsetof(struct pt_regs, uregs[`9`]) },
1530	{ "r10", offsetof(struct pt_regs, uregs[`10`]) },
1531	{ "fp", offsetof(struct pt_regs, uregs[`11`]) },
1532	{ "ip", offsetof(struct pt_regs, uregs[`12`]) },
1533	{ "sp", offsetof(struct pt_regs, uregs[`13`]) },
1534	{ "lr", offsetof(struct pt_regs, uregs[`14`]) },
1535	{ "pc", offsetof(struct pt_regs, uregs[`15`]) },
1536	};
1537	int i;
1538
1539	for (i = `0`; i < ARRAY_SIZE(reg_map); i++) {
1540	if (strcmp(reg_name, reg_map[i].name) == `0`)
1541	return reg_map[i].pt_regs_off;
1542	}
1543
1544	pr_warn("usdt: unrecognized register '%s'\n", reg_name);
1545	return -ENOENT;
1546	}
1547
1548	static int parse_usdt_arg(const char arg_str, int* arg_num, struct usdt_arg_spec arg, int* *arg_sz)
1549	{
1550	char reg_name[`16`];
1551	int len, reg_off;
1552	long off;
1553
1554	if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] , #%ld ] %n",
1555	arg_sz, reg_name, &off, &len) == `3`) {
1556	/ Memory dereference case, e.g., -4@[fp, #96] /
1557	arg->arg_type = USDT_ARG_REG_DEREF;
1558	arg->val_off = off;
1559	reg_off = calc_pt_regs_off(reg_name);
1560	if (reg_off < `0`)
1561	return reg_off;
1562	arg->reg_off = reg_off;
1563	} else if (sscanf(arg_str, " %d @ \[ %15[a-z0-9] ] %n", arg_sz, reg_name, &len) == `2`) {
1564	/ Memory dereference case, e.g., -4@[sp] /
1565	arg->arg_type = USDT_ARG_REG_DEREF;
1566	arg->val_off = `0`;
1567	reg_off = calc_pt_regs_off(reg_name);
1568	if (reg_off < `0`)
1569	return reg_off;
1570	arg->reg_off = reg_off;
1571	} else if (sscanf(arg_str, " %d @ #%ld %n", arg_sz, &off, &len) == `2`) {
1572	/ Constant value case, e.g., 4@#5 /
1573	arg->arg_type = USDT_ARG_CONST;
1574	arg->val_off = off;
1575	arg->reg_off = `0`;
1576	} else if (sscanf(arg_str, " %d @ %15[a-z0-9] %n", arg_sz, reg_name, &len) == `2`) {
1577	/ Register read case, e.g., -8@r4 /
1578	arg->arg_type = USDT_ARG_REG;
1579	arg->val_off = `0`;
1580	reg_off = calc_pt_regs_off(reg_name);
1581	if (reg_off < `0`)
1582	return reg_off;
1583	arg->reg_off = reg_off;
1584	} else {
1585	pr_warn("usdt: unrecognized arg #%d spec '%s'\n", arg_num, arg_str);
1586	return -EINVAL;
1587	}
1588
1589	return len;
1590	}
1591
1592	#else
1593
/* Fallback for architectures without USDT argument parsing support:
 * warns and always fails with -ENOTSUP.
 */
static int parse_usdt_arg(const char *arg_str, int arg_num, struct usdt_arg_spec *arg, int *arg_sz)
{
	pr_warn("usdt: libbpf doesn't support USDTs on current architecture\n");
	return -ENOTSUP;
}
1599
1600	#endif
1601

source code of linux/tools/lib/bpf/usdt.c