// SPDX-License-Identifier: GPL-2.0
/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/security.h>
#include "trace.h"
#include "trace_probe.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned access
 * surprises
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
	perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int total_ref_count;

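/*
 * Check whether @p_event may be attached to @tp_event by the current
 * task. Raw sample data (PERF_SAMPLE_RAW) can expose kernel internals,
 * so it is subject to the tracepoint paranoia checks below unless the
 * event is per-task and the class is flagged TRACE_EVENT_FL_CAP_ANY.
 */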
static int perf_trace_event_perm(struct trace_event_call *tp_event,
				 struct perf_event *p_event)
{
	int ret;

	if (tp_event->perf_perm) {
		ret = tp_event->perf_perm(tp_event, p_event);
		if (ret)
			return ret;
	}

	/*
	 * We checked and allowed the parent to be created,
	 * so allow children without checking.
	 */
	if (p_event->parent)
		return 0;

	/*
	 * It's ok to check current process (owner) permissions in here,
	 * because code below is called only via perf_event_open syscall.
	 */

	/* The ftrace function trace is allowed only for root. */
	if (ftrace_event_is_function(tp_event)) {
		ret = perf_allow_tracepoint(&p_event->attr);
		if (ret)
			return ret;

		if (!is_sampling_event(p_event))
			return 0;

		/*
		 * We don't allow user space callchains for the function trace
		 * event, due to issues with page faults while tracing the page
		 * fault handler and its overall tricky nature.
		 */
		if (!p_event->attr.exclude_callchain_user)
			return -EINVAL;

		/*
		 * Same reason to disable user stack dump as for user space
		 * callchains above.
		 */
		if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
			return -EINVAL;
	}

	/* No tracing, just counting, so no obvious leak */
	if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
		return 0;

	/* Some events are ok to be traced by non-root users... */
	if (p_event->attach_state == PERF_ATTACH_TASK) {
		if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
			return 0;
	}

	/*
	 * ...otherwise raw tracepoint data can be a severe data leak,
	 * only allow root to have these.
	 */
	ret = perf_allow_tracepoint(&p_event->attr);
	if (ret)
		return ret;

	return 0;
}

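/*
 * Take a perf reference on @tp_event. The first perf user of an event
 * allocates its per-cpu hlist of attached events; the first perf user
 * of any trace event additionally allocates the shared per-context
 * scratch buffers used to build raw samples.
 */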
static int perf_trace_event_reg(struct trace_event_call *tp_event,
				struct perf_event *p_event)
{
	struct hlist_head __percpu *list;
	int ret = -ENOMEM;
	int cpu;

	p_event->tp_event = tp_event;
	if (tp_event->perf_refcount++ > 0)
		return 0;

	list = alloc_percpu(struct hlist_head);
	if (!list)
		goto fail;

	for_each_possible_cpu(cpu)
		INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

	tp_event->perf_events = list;

	if (!total_ref_count) {
		char __percpu *buf;
		int i;

		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			buf = (char __percpu *)alloc_percpu(perf_trace_t);
			if (!buf)
				goto fail;

			perf_trace_buf[i] = buf;
		}
	}

	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
	if (ret)
		goto fail;

	total_ref_count++;
	return 0;

fail:
	if (!total_ref_count) {
		int i;

		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			free_percpu(perf_trace_buf[i]);
			perf_trace_buf[i] = NULL;
		}
	}

	if (!--tp_event->perf_refcount) {
		free_percpu(tp_event->perf_events);
		tp_event->perf_events = NULL;
	}

	return ret;
}

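/*
 * Drop the reference taken in perf_trace_event_reg(). The last perf
 * user unregisters the perf callback, waits for in-flight tracepoint
 * handlers to finish, then frees the per-cpu hlist and, once no perf
 * trace users remain at all, the shared scratch buffers.
 */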
static void perf_trace_event_unreg(struct perf_event *p_event)
{
	struct trace_event_call *tp_event = p_event->tp_event;
	int i;

	if (--tp_event->perf_refcount > 0)
		return;

	tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);

	/*
	 * Ensure our callback won't be called anymore. The buffers
	 * will be freed after that.
	 */
	tracepoint_synchronize_unregister();

	free_percpu(tp_event->perf_events);
	tp_event->perf_events = NULL;

	if (!--total_ref_count) {
		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			free_percpu(perf_trace_buf[i]);
			perf_trace_buf[i] = NULL;
		}
	}
}

static int perf_trace_event_open(struct perf_event *p_event)
{
	struct trace_event_call *tp_event = p_event->tp_event;
	return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

static void perf_trace_event_close(struct perf_event *p_event)
{
	struct trace_event_call *tp_event = p_event->tp_event;
	tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

static int perf_trace_event_init(struct trace_event_call *tp_event,
				 struct perf_event *p_event)
{
	int ret;

	ret = perf_trace_event_perm(tp_event, p_event);
	if (ret)
		return ret;

	ret = perf_trace_event_reg(tp_event, p_event);
	if (ret)
		return ret;

	ret = perf_trace_event_open(p_event);
	if (ret) {
		perf_trace_event_unreg(p_event);
		return ret;
	}

	return 0;
}

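/*
 * Bind a perf tracepoint event to its trace_event_call: look up the
 * event id passed in attr.config under event_mutex, take a reference
 * on the matching trace event and initialize it; the reference is
 * dropped again if initialization fails.
 */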
int perf_trace_init(struct perf_event *p_event)
{
	struct trace_event_call *tp_event;
	u64 event_id = p_event->attr.config;
	int ret = -EINVAL;

	mutex_lock(&event_mutex);
	list_for_each_entry(tp_event, &ftrace_events, list) {
		if (tp_event->event.type == event_id &&
		    tp_event->class && tp_event->class->reg &&
		    trace_event_try_get_ref(tp_event)) {
			ret = perf_trace_event_init(tp_event, p_event);
			if (ret)
				trace_event_put_ref(tp_event);
			break;
		}
	}
	mutex_unlock(&event_mutex);

	return ret;
}

void perf_trace_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	trace_event_put_ref(p_event->tp_event);
	mutex_unlock(&event_mutex);
}

#ifdef CONFIG_KPROBE_EVENTS
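/*
 * Create a perf-only ("local") trace_kprobe described by the
 * perf_event_attr: either a symbol name (attr.kprobe_func) or a raw
 * address (attr.kprobe_addr), plus attr.probe_offset. The resulting
 * trace event is set up like any other but is private to this perf
 * event.
 */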
int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
{
	int ret;
	char *func = NULL;
	struct trace_event_call *tp_event;

	if (p_event->attr.kprobe_func) {
		func = strndup_user(u64_to_user_ptr(p_event->attr.kprobe_func),
				    KSYM_NAME_LEN);
		if (IS_ERR(func)) {
			ret = PTR_ERR(func);
			return (ret == -EINVAL) ? -E2BIG : ret;
		}

		if (func[0] == '\0') {
			kfree(func);
			func = NULL;
		}
	}

	tp_event = create_local_trace_kprobe(
		func, (void *)(unsigned long)(p_event->attr.kprobe_addr),
		p_event->attr.probe_offset, is_retprobe);
	if (IS_ERR(tp_event)) {
		ret = PTR_ERR(tp_event);
		goto out;
	}

	mutex_lock(&event_mutex);
	ret = perf_trace_event_init(tp_event, p_event);
	if (ret)
		destroy_local_trace_kprobe(tp_event);
	mutex_unlock(&event_mutex);
out:
	kfree(func);
	return ret;
}

void perf_kprobe_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	trace_event_put_ref(p_event->tp_event);
	mutex_unlock(&event_mutex);

	destroy_local_trace_kprobe(p_event->tp_event);
}
#endif /* CONFIG_KPROBE_EVENTS */

#ifdef CONFIG_UPROBE_EVENTS
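/*
 * Create a perf-only ("local") trace_uprobe described by the
 * perf_event_attr: the target file path (attr.uprobe_path), the probe
 * offset within that file, an optional reference counter offset and
 * whether this is a return probe.
 */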
int perf_uprobe_init(struct perf_event *p_event,
		     unsigned long ref_ctr_offset, bool is_retprobe)
{
	int ret;
	char *path = NULL;
	struct trace_event_call *tp_event;

	if (!p_event->attr.uprobe_path)
		return -EINVAL;

	path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path),
			    PATH_MAX);
	if (IS_ERR(path)) {
		ret = PTR_ERR(path);
		return (ret == -EINVAL) ? -E2BIG : ret;
	}
	if (path[0] == '\0') {
		ret = -EINVAL;
		goto out;
	}

	tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset,
					     ref_ctr_offset, is_retprobe);
	if (IS_ERR(tp_event)) {
		ret = PTR_ERR(tp_event);
		goto out;
	}

	/*
	 * A local trace_uprobe needs to hold event_mutex to call
	 * uprobe_buffer_enable() and uprobe_buffer_disable().
	 * event_mutex is not required for local trace_kprobes.
	 */
	mutex_lock(&event_mutex);
	ret = perf_trace_event_init(tp_event, p_event);
	if (ret)
		destroy_local_trace_uprobe(tp_event);
	mutex_unlock(&event_mutex);
out:
	kfree(path);
	return ret;
}

void perf_uprobe_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	trace_event_put_ref(p_event->tp_event);
	mutex_unlock(&event_mutex);
	destroy_local_trace_uprobe(p_event->tp_event);
}
#endif /* CONFIG_UPROBE_EVENTS */

int perf_trace_add(struct perf_event *p_event, int flags)
{
	struct trace_event_call *tp_event = p_event->tp_event;

	if (!(flags & PERF_EF_START))
		p_event->hw.state = PERF_HES_STOPPED;

	/*
	 * If TRACE_REG_PERF_ADD returns false, no custom action was performed
	 * and we need to take the default action of enqueueing our event on
	 * the right per-cpu hlist.
	 */
	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
		struct hlist_head __percpu *pcpu_list;
		struct hlist_head *list;

		pcpu_list = tp_event->perf_events;
		if (WARN_ON_ONCE(!pcpu_list))
			return -EINVAL;

		list = this_cpu_ptr(pcpu_list);
		hlist_add_head_rcu(&p_event->hlist_entry, list);
	}

	return 0;
}

void perf_trace_del(struct perf_event *p_event, int flags)
{
	struct trace_event_call *tp_event = p_event->tp_event;

	/*
	 * If TRACE_REG_PERF_DEL returns false, no custom action was performed
	 * and we need to take the default action of dequeueing our event from
	 * the right per-cpu hlist.
	 */
	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
		hlist_del_rcu(&p_event->hlist_entry);
}

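/*
 * Grab a software-event recursion context and return the per-cpu
 * scratch buffer belonging to it. A typical caller (see
 * perf_ftrace_function_call() below for a concrete example) does:
 *
 *	entry = perf_trace_buf_alloc(size, &regs, &rctx);
 *	if (!entry)
 *		return;
 *	... fill in the trace entry ...
 *	perf_trace_buf_submit(entry, size, rctx, type, count, regs,
 *			      head, task);
 *
 * The matching perf_trace_buf_submit() (or an explicit
 * perf_swevent_put_recursion_context()) ends the recursion context
 * obtained here.
 */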
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
{
	char *raw_data;
	int rctx;

	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough, wanted %d, have %d",
		      size, PERF_MAX_TRACE_SIZE))
		return NULL;

	*rctxp = rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		return NULL;

	if (regs)
		*regs = this_cpu_ptr(&__perf_regs[rctx]);
	raw_data = this_cpu_ptr(perf_trace_buf[rctx]);

	/* zero the dead bytes from align to not leak stack to user */
	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
	return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
NOKPROBE_SYMBOL(perf_trace_buf_alloc);

void perf_trace_buf_update(void *record, u16 type)
{
	struct trace_entry *entry = record;

	tracing_generic_entry_update(entry, type, tracing_gen_ctx());
}
NOKPROBE_SYMBOL(perf_trace_buf_update);

#ifdef CONFIG_FUNCTION_TRACER
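/*
 * ftrace callback used while a perf event is attached to the function
 * tracer (see perf_ftrace_event_register() below). It only acts when
 * running on the CPU recorded in ops->private, builds a struct
 * ftrace_entry in the perf trace buffer and submits it as a TRACE_FN
 * sample.
 */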
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
	struct ftrace_entry *entry;
	struct perf_event *event;
	struct hlist_head head;
	struct pt_regs regs;
	int rctx;
	int bit;

	if (!rcu_is_watching())
		return;

	bit = ftrace_test_recursion_trylock(ip, parent_ip);
	if (bit < 0)
		return;

	if ((unsigned long)ops->private != smp_processor_id())
		goto out;

	event = container_of(ops, struct perf_event, ftrace_ops);

	/*
	 * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
	 * the perf code does is hlist_for_each_entry_rcu(), so we can
	 * get away with simply setting the @head.first pointer in order
	 * to create a singular list.
	 */
	head.first = &event->hlist_entry;

#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
		    sizeof(u64)) - sizeof(u32))

	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

	memset(&regs, 0, sizeof(regs));
	perf_fetch_caller_regs(&regs);

	entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
	if (!entry)
		goto out;

	entry->ip = ip;
	entry->parent_ip = parent_ip;
	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
			      1, &regs, &head, NULL);

out:
	ftrace_test_recursion_unlock(bit);
#undef ENTRY_SIZE
}

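/*
 * ops->private carries the CPU the event is currently scheduled on.
 * It is initialized to nr_cpu_ids (never a valid CPU id), so the
 * callback above stays quiet until TRACE_REG_PERF_ADD installs the
 * real CPU id; TRACE_REG_PERF_DEL resets it again.
 */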
static int perf_ftrace_function_register(struct perf_event *event)
{
	struct ftrace_ops *ops = &event->ftrace_ops;

	ops->func = perf_ftrace_function_call;
	ops->private = (void *)(unsigned long)nr_cpu_ids;

	return register_ftrace_function(ops);
}

static int perf_ftrace_function_unregister(struct perf_event *event)
{
	struct ftrace_ops *ops = &event->ftrace_ops;
	int ret = unregister_ftrace_function(ops);
	ftrace_free_filter(ops);
	return ret;
}

int perf_ftrace_event_register(struct trace_event_call *call,
			       enum trace_reg type, void *data)
{
	struct perf_event *event = data;

	switch (type) {
	case TRACE_REG_REGISTER:
	case TRACE_REG_UNREGISTER:
		break;
	case TRACE_REG_PERF_REGISTER:
	case TRACE_REG_PERF_UNREGISTER:
		return 0;
	case TRACE_REG_PERF_OPEN:
		return perf_ftrace_function_register(data);
	case TRACE_REG_PERF_CLOSE:
		return perf_ftrace_function_unregister(data);
	case TRACE_REG_PERF_ADD:
		event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
		return 1;
	case TRACE_REG_PERF_DEL:
		event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
		return 1;
	}

	return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */
526 | |