// SPDX-License-Identifier: GPL-2.0
/*
 * trace event based perf event profiling/tracing
 *
 * Copyright (C) 2009 Red Hat Inc, Peter Zijlstra
 * Copyright (C) 2009-2010 Frederic Weisbecker <fweisbec@gmail.com>
 */

#include <linux/module.h>
#include <linux/kprobes.h>
#include <linux/security.h>
#include "trace.h"
#include "trace_probe.h"

static char __percpu *perf_trace_buf[PERF_NR_CONTEXTS];

/*
 * Force it to be aligned to unsigned long to avoid misaligned access
 * surprises
 */
typedef typeof(unsigned long [PERF_MAX_TRACE_SIZE / sizeof(unsigned long)])
	perf_trace_t;

/* Count the events in use (per event id, not per instance) */
static int total_ref_count;

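/*
 * Check whether @p_event may be attached to @tp_event by the current
 * task. Raw sample data (PERF_SAMPLE_RAW) can expose kernel internals,
 * so it is subject to the tracepoint paranoia checks below unless the
 * event is per-task and the class is flagged TRACE_EVENT_FL_CAP_ANY.
 */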
static int perf_trace_event_perm(struct trace_event_call *tp_event,
				 struct perf_event *p_event)
{
	int ret;

	if (tp_event->perf_perm) {
		ret = tp_event->perf_perm(tp_event, p_event);
		if (ret)
			return ret;
	}

	/*
	 * We checked and allowed the parent to be created,
	 * so allow children without checking.
	 */
	if (p_event->parent)
		return 0;

	/*
	 * It's ok to check current process (owner) permissions in here,
	 * because code below is called only via perf_event_open syscall.
	 */

	/* The ftrace function trace is allowed only for root. */
	if (ftrace_event_is_function(tp_event)) {
		ret = perf_allow_tracepoint(&p_event->attr);
		if (ret)
			return ret;

		if (!is_sampling_event(p_event))
			return 0;

		/*
		 * We don't allow user space callchains for the function trace
		 * event, due to issues with page faults while tracing the page
		 * fault handler and its overall tricky nature.
		 */
		if (!p_event->attr.exclude_callchain_user)
			return -EINVAL;

		/*
		 * Same reason to disable user stack dump as for user space
		 * callchains above.
		 */
		if (p_event->attr.sample_type & PERF_SAMPLE_STACK_USER)
			return -EINVAL;
	}

	/* No tracing, just counting, so no obvious leak */
	if (!(p_event->attr.sample_type & PERF_SAMPLE_RAW))
		return 0;

	/* Some events are ok to be traced by non-root users... */
	if (p_event->attach_state == PERF_ATTACH_TASK) {
		if (tp_event->flags & TRACE_EVENT_FL_CAP_ANY)
			return 0;
	}

	/*
	 * ...otherwise raw tracepoint data can be a severe data leak,
	 * only allow root to have these.
	 */
	ret = perf_allow_tracepoint(&p_event->attr);
	if (ret)
		return ret;

	return 0;
}

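/*
 * Take a perf reference on @tp_event. The first perf user of an event
 * allocates its per-cpu hlist of attached events; the first perf user
 * of any trace event additionally allocates the shared per-context
 * scratch buffers used to build raw samples.
 */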
static int perf_trace_event_reg(struct trace_event_call *tp_event,
				struct perf_event *p_event)
{
	struct hlist_head __percpu *list;
	int ret = -ENOMEM;
	int cpu;

	p_event->tp_event = tp_event;
	if (tp_event->perf_refcount++ > 0)
		return 0;

	list = alloc_percpu(struct hlist_head);
	if (!list)
		goto fail;

	for_each_possible_cpu(cpu)
		INIT_HLIST_HEAD(per_cpu_ptr(list, cpu));

	tp_event->perf_events = list;

	if (!total_ref_count) {
		char __percpu *buf;
		int i;

		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			buf = (char __percpu *)alloc_percpu(perf_trace_t);
			if (!buf)
				goto fail;

			perf_trace_buf[i] = buf;
		}
	}

	ret = tp_event->class->reg(tp_event, TRACE_REG_PERF_REGISTER, NULL);
	if (ret)
		goto fail;

	total_ref_count++;
	return 0;

fail:
	if (!total_ref_count) {
		int i;

		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			free_percpu(perf_trace_buf[i]);
			perf_trace_buf[i] = NULL;
		}
	}

	if (!--tp_event->perf_refcount) {
		free_percpu(tp_event->perf_events);
		tp_event->perf_events = NULL;
	}

	return ret;
}

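/*
 * Drop the reference taken in perf_trace_event_reg(). The last perf
 * user unregisters the perf callback, waits for in-flight tracepoint
 * handlers to finish, then frees the per-cpu hlist and, once no perf
 * trace users remain at all, the shared scratch buffers.
 */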
static void perf_trace_event_unreg(struct perf_event *p_event)
{
	struct trace_event_call *tp_event = p_event->tp_event;
	int i;

	if (--tp_event->perf_refcount > 0)
		return;

	tp_event->class->reg(tp_event, TRACE_REG_PERF_UNREGISTER, NULL);

	/*
	 * Ensure our callback won't be called anymore. The buffers
	 * will be freed after that.
	 */
	tracepoint_synchronize_unregister();

	free_percpu(tp_event->perf_events);
	tp_event->perf_events = NULL;

	if (!--total_ref_count) {
		for (i = 0; i < PERF_NR_CONTEXTS; i++) {
			free_percpu(perf_trace_buf[i]);
			perf_trace_buf[i] = NULL;
		}
	}
}

static int perf_trace_event_open(struct perf_event *p_event)
{
	struct trace_event_call *tp_event = p_event->tp_event;
	return tp_event->class->reg(tp_event, TRACE_REG_PERF_OPEN, p_event);
}

static void perf_trace_event_close(struct perf_event *p_event)
{
	struct trace_event_call *tp_event = p_event->tp_event;
	tp_event->class->reg(tp_event, TRACE_REG_PERF_CLOSE, p_event);
}

static int perf_trace_event_init(struct trace_event_call *tp_event,
				 struct perf_event *p_event)
{
	int ret;

	ret = perf_trace_event_perm(tp_event, p_event);
	if (ret)
		return ret;

	ret = perf_trace_event_reg(tp_event, p_event);
	if (ret)
		return ret;

	ret = perf_trace_event_open(p_event);
	if (ret) {
		perf_trace_event_unreg(p_event);
		return ret;
	}

	return 0;
}

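/*
 * Bind a perf tracepoint event to its trace_event_call: look up the
 * event id passed in attr.config under event_mutex, take a reference
 * on the matching trace event and initialize it; the reference is
 * dropped again if initialization fails.
 */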
int perf_trace_init(struct perf_event *p_event)
{
	struct trace_event_call *tp_event;
	u64 event_id = p_event->attr.config;
	int ret = -EINVAL;

	mutex_lock(&event_mutex);
	list_for_each_entry(tp_event, &ftrace_events, list) {
		if (tp_event->event.type == event_id &&
		    tp_event->class && tp_event->class->reg &&
		    trace_event_try_get_ref(tp_event)) {
			ret = perf_trace_event_init(tp_event, p_event);
			if (ret)
				trace_event_put_ref(tp_event);
			break;
		}
	}
	mutex_unlock(&event_mutex);

	return ret;
}

void perf_trace_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	trace_event_put_ref(p_event->tp_event);
	mutex_unlock(&event_mutex);
}

#ifdef CONFIG_KPROBE_EVENTS
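/*
 * Create a perf-only ("local") trace_kprobe described by the
 * perf_event_attr: either a symbol name (attr.kprobe_func) or a raw
 * address (attr.kprobe_addr), plus attr.probe_offset. The resulting
 * trace event is set up like any other but is private to this perf
 * event.
 */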
int perf_kprobe_init(struct perf_event *p_event, bool is_retprobe)
{
	int ret;
	char *func = NULL;
	struct trace_event_call *tp_event;

	if (p_event->attr.kprobe_func) {
		func = strndup_user(u64_to_user_ptr(p_event->attr.kprobe_func),
				    KSYM_NAME_LEN);
		if (IS_ERR(func)) {
			ret = PTR_ERR(func);
			return (ret == -EINVAL) ? -E2BIG : ret;
		}

		if (func[0] == '\0') {
			kfree(func);
			func = NULL;
		}
	}

	tp_event = create_local_trace_kprobe(
		func, (void *)(unsigned long)(p_event->attr.kprobe_addr),
		p_event->attr.probe_offset, is_retprobe);
	if (IS_ERR(tp_event)) {
		ret = PTR_ERR(tp_event);
		goto out;
	}

	mutex_lock(&event_mutex);
	ret = perf_trace_event_init(tp_event, p_event);
	if (ret)
		destroy_local_trace_kprobe(tp_event);
	mutex_unlock(&event_mutex);
out:
	kfree(func);
	return ret;
}

void perf_kprobe_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	trace_event_put_ref(p_event->tp_event);
	mutex_unlock(&event_mutex);

	destroy_local_trace_kprobe(p_event->tp_event);
}
#endif /* CONFIG_KPROBE_EVENTS */

#ifdef CONFIG_UPROBE_EVENTS
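/*
 * Create a perf-only ("local") trace_uprobe described by the
 * perf_event_attr: the target file path (attr.uprobe_path), the probe
 * offset within that file, an optional reference counter offset and
 * whether this is a return probe.
 */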
int perf_uprobe_init(struct perf_event *p_event,
		     unsigned long ref_ctr_offset, bool is_retprobe)
{
	int ret;
	char *path = NULL;
	struct trace_event_call *tp_event;

	if (!p_event->attr.uprobe_path)
		return -EINVAL;

	path = strndup_user(u64_to_user_ptr(p_event->attr.uprobe_path),
			    PATH_MAX);
	if (IS_ERR(path)) {
		ret = PTR_ERR(path);
		return (ret == -EINVAL) ? -E2BIG : ret;
	}
	if (path[0] == '\0') {
		ret = -EINVAL;
		goto out;
	}

	tp_event = create_local_trace_uprobe(path, p_event->attr.probe_offset,
					     ref_ctr_offset, is_retprobe);
	if (IS_ERR(tp_event)) {
		ret = PTR_ERR(tp_event);
		goto out;
	}

	/*
	 * A local trace_uprobe needs to hold event_mutex to call
	 * uprobe_buffer_enable() and uprobe_buffer_disable().
	 * event_mutex is not required for local trace_kprobes.
	 */
	mutex_lock(&event_mutex);
	ret = perf_trace_event_init(tp_event, p_event);
	if (ret)
		destroy_local_trace_uprobe(tp_event);
	mutex_unlock(&event_mutex);
out:
	kfree(path);
	return ret;
}

void perf_uprobe_destroy(struct perf_event *p_event)
{
	mutex_lock(&event_mutex);
	perf_trace_event_close(p_event);
	perf_trace_event_unreg(p_event);
	trace_event_put_ref(p_event->tp_event);
	mutex_unlock(&event_mutex);
	destroy_local_trace_uprobe(p_event->tp_event);
}
#endif /* CONFIG_UPROBE_EVENTS */

int perf_trace_add(struct perf_event *p_event, int flags)
{
	struct trace_event_call *tp_event = p_event->tp_event;

	if (!(flags & PERF_EF_START))
		p_event->hw.state = PERF_HES_STOPPED;

	/*
	 * If TRACE_REG_PERF_ADD returns false, no custom action was performed
	 * and we need to take the default action of enqueueing our event on
	 * the right per-cpu hlist.
	 */
	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_ADD, p_event)) {
		struct hlist_head __percpu *pcpu_list;
		struct hlist_head *list;

		pcpu_list = tp_event->perf_events;
		if (WARN_ON_ONCE(!pcpu_list))
			return -EINVAL;

		list = this_cpu_ptr(pcpu_list);
		hlist_add_head_rcu(&p_event->hlist_entry, list);
	}

	return 0;
}

void perf_trace_del(struct perf_event *p_event, int flags)
{
	struct trace_event_call *tp_event = p_event->tp_event;

	/*
	 * If TRACE_REG_PERF_DEL returns false, no custom action was performed
	 * and we need to take the default action of dequeueing our event from
	 * the right per-cpu hlist.
	 */
	if (!tp_event->class->reg(tp_event, TRACE_REG_PERF_DEL, p_event))
		hlist_del_rcu(&p_event->hlist_entry);
}

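/*
 * Grab a software-event recursion context and return the per-cpu
 * scratch buffer belonging to it. A typical caller (see
 * perf_ftrace_function_call() below for a concrete example) does:
 *
 *	entry = perf_trace_buf_alloc(size, &regs, &rctx);
 *	if (!entry)
 *		return;
 *	... fill in the trace entry ...
 *	perf_trace_buf_submit(entry, size, rctx, type, count, regs,
 *			      head, task);
 *
 * The matching perf_trace_buf_submit() (or an explicit
 * perf_swevent_put_recursion_context()) ends the recursion context
 * obtained here.
 */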
void *perf_trace_buf_alloc(int size, struct pt_regs **regs, int *rctxp)
{
	char *raw_data;
	int rctx;

	BUILD_BUG_ON(PERF_MAX_TRACE_SIZE % sizeof(unsigned long));

	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE,
		      "perf buffer not large enough, wanted %d, have %d",
		      size, PERF_MAX_TRACE_SIZE))
		return NULL;

	*rctxp = rctx = perf_swevent_get_recursion_context();
	if (rctx < 0)
		return NULL;

	if (regs)
		*regs = this_cpu_ptr(&__perf_regs[rctx]);
	raw_data = this_cpu_ptr(perf_trace_buf[rctx]);

	/* zero the dead bytes from align to not leak stack to user */
	memset(&raw_data[size - sizeof(u64)], 0, sizeof(u64));
	return raw_data;
}
EXPORT_SYMBOL_GPL(perf_trace_buf_alloc);
NOKPROBE_SYMBOL(perf_trace_buf_alloc);

void perf_trace_buf_update(void *record, u16 type)
{
	struct trace_entry *entry = record;

	tracing_generic_entry_update(entry, type, tracing_gen_ctx());
}
NOKPROBE_SYMBOL(perf_trace_buf_update);

#ifdef CONFIG_FUNCTION_TRACER
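/*
 * ftrace callback used while a perf event is attached to the function
 * tracer (see perf_ftrace_event_register() below). It only acts when
 * running on the CPU recorded in ops->private, builds a struct
 * ftrace_entry in the perf trace buffer and submits it as a TRACE_FN
 * sample.
 */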
static void
perf_ftrace_function_call(unsigned long ip, unsigned long parent_ip,
			  struct ftrace_ops *ops, struct ftrace_regs *fregs)
{
	struct ftrace_entry *entry;
	struct perf_event *event;
	struct hlist_head head;
	struct pt_regs regs;
	int rctx;
	int bit;

	if (!rcu_is_watching())
		return;

	bit = ftrace_test_recursion_trylock(ip, parent_ip);
	if (bit < 0)
		return;

	if ((unsigned long)ops->private != smp_processor_id())
		goto out;

	event = container_of(ops, struct perf_event, ftrace_ops);

	/*
	 * @event->hlist entry is NULL (per INIT_HLIST_NODE), and all
	 * the perf code does is hlist_for_each_entry_rcu(), so we can
	 * get away with simply setting the @head.first pointer in order
	 * to create a singular list.
	 */
	head.first = &event->hlist_entry;

#define ENTRY_SIZE (ALIGN(sizeof(struct ftrace_entry) + sizeof(u32), \
		    sizeof(u64)) - sizeof(u32))

	BUILD_BUG_ON(ENTRY_SIZE > PERF_MAX_TRACE_SIZE);

	memset(&regs, 0, sizeof(regs));
	perf_fetch_caller_regs(&regs);

	entry = perf_trace_buf_alloc(ENTRY_SIZE, NULL, &rctx);
	if (!entry)
		goto out;

	entry->ip = ip;
	entry->parent_ip = parent_ip;
	perf_trace_buf_submit(entry, ENTRY_SIZE, rctx, TRACE_FN,
			      1, &regs, &head, NULL);

out:
	ftrace_test_recursion_unlock(bit);
#undef ENTRY_SIZE
}

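/*
 * ops->private carries the CPU the event is currently scheduled on.
 * It is initialized to nr_cpu_ids (never a valid CPU id), so the
 * callback above stays quiet until TRACE_REG_PERF_ADD installs the
 * real CPU id; TRACE_REG_PERF_DEL resets it again.
 */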
static int perf_ftrace_function_register(struct perf_event *event)
{
	struct ftrace_ops *ops = &event->ftrace_ops;

	ops->func = perf_ftrace_function_call;
	ops->private = (void *)(unsigned long)nr_cpu_ids;

	return register_ftrace_function(ops);
}

static int perf_ftrace_function_unregister(struct perf_event *event)
{
	struct ftrace_ops *ops = &event->ftrace_ops;
	int ret = unregister_ftrace_function(ops);
	ftrace_free_filter(ops);
	return ret;
}

int perf_ftrace_event_register(struct trace_event_call *call,
			       enum trace_reg type, void *data)
{
	struct perf_event *event = data;

	switch (type) {
	case TRACE_REG_REGISTER:
	case TRACE_REG_UNREGISTER:
		break;
	case TRACE_REG_PERF_REGISTER:
	case TRACE_REG_PERF_UNREGISTER:
		return 0;
	case TRACE_REG_PERF_OPEN:
		return perf_ftrace_function_register(data);
	case TRACE_REG_PERF_CLOSE:
		return perf_ftrace_function_unregister(data);
	case TRACE_REG_PERF_ADD:
		event->ftrace_ops.private = (void *)(unsigned long)smp_processor_id();
		return 1;
	case TRACE_REG_PERF_DEL:
		event->ftrace_ops.private = (void *)(unsigned long)nr_cpu_ids;
		return 1;
	}

	return -EINVAL;
}
#endif /* CONFIG_FUNCTION_TRACER */
526 | |