1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* Copyright (c) 2019 Facebook */ |
3 | #include <linux/hash.h> |
4 | #include <linux/bpf.h> |
5 | #include <linux/filter.h> |
6 | #include <linux/ftrace.h> |
7 | #include <linux/rbtree_latch.h> |
8 | #include <linux/perf_event.h> |
9 | #include <linux/btf.h> |
10 | #include <linux/rcupdate_trace.h> |
11 | #include <linux/rcupdate_wait.h> |
12 | #include <linux/static_call.h> |
13 | #include <linux/bpf_verifier.h> |
14 | #include <linux/bpf_lsm.h> |
15 | #include <linux/delay.h> |
16 | |
/* Dummy _ops. The verifier will operate on the target program's ops. */
18 | const struct bpf_verifier_ops bpf_extension_verifier_ops = { |
19 | }; |
20 | const struct bpf_prog_ops bpf_extension_prog_ops = { |
21 | }; |
22 | |
/* btf_vmlinux has ~22k attachable functions. A 1k-bucket htab is enough. */
24 | #define TRAMPOLINE_HASH_BITS 10 |
25 | #define TRAMPOLINE_TABLE_SIZE (1 << TRAMPOLINE_HASH_BITS) |
26 | |
27 | static struct hlist_head trampoline_table[TRAMPOLINE_TABLE_SIZE]; |
28 | |
29 | /* serializes access to trampoline_table */ |
30 | static DEFINE_MUTEX(trampoline_mutex); |
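
/* Illustrative note: the 64-bit key comes from bpf_trampoline_compute_key()
 * (include/linux/bpf_verifier.h), which packs an id for the attach target's
 * object (the target prog id or BTF object id) into the upper 32 bits and
 * the target function's BTF type id into the lower bits, so there is one
 * trampoline per attachment target.
 */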
31 | |
32 | #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS |
33 | static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex); |
34 | |
35 | static int bpf_tramp_ftrace_ops_func(struct ftrace_ops *ops, enum ftrace_ops_cmd cmd) |
36 | { |
37 | struct bpf_trampoline *tr = ops->private; |
38 | int ret = 0; |
39 | |
40 | if (cmd == FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF) { |
		/* This is called inside register_ftrace_direct(), so
		 * tr->mutex is already locked.
		 */
44 | lockdep_assert_held_once(&tr->mutex); |
45 | |
46 | /* Instead of updating the trampoline here, we propagate |
47 | * -EAGAIN to register_ftrace_direct(). Then we can |
48 | * retry register_ftrace_direct() after updating the |
49 | * trampoline. |
50 | */ |
51 | if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && |
52 | !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) { |
53 | if (WARN_ON_ONCE(tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY)) |
54 | return -EBUSY; |
55 | |
56 | tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; |
57 | return -EAGAIN; |
58 | } |
59 | |
60 | return 0; |
61 | } |
62 | |
63 | /* The normal locking order is |
64 | * tr->mutex => direct_mutex (ftrace.c) => ftrace_lock (ftrace.c) |
65 | * |
66 | * The following two commands are called from |
67 | * |
68 | * prepare_direct_functions_for_ipmodify |
69 | * cleanup_direct_functions_after_ipmodify |
70 | * |
71 | * In both cases, direct_mutex is already locked. Use |
	 * mutex_trylock(&tr->mutex) to avoid a deadlock when racing with
	 * something else that is changing this same trampoline.
	 */
	if (!mutex_trylock(&tr->mutex)) {
		/* sleep 1 ms to make sure whatever is holding tr->mutex
		 * makes some progress.
		 */
		msleep(1);
81 | } |
82 | |
83 | switch (cmd) { |
84 | case FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_PEER: |
85 | tr->flags |= BPF_TRAMP_F_SHARE_IPMODIFY; |
86 | |
87 | if ((tr->flags & BPF_TRAMP_F_CALL_ORIG) && |
88 | !(tr->flags & BPF_TRAMP_F_ORIG_STACK)) |
			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
90 | break; |
91 | case FTRACE_OPS_CMD_DISABLE_SHARE_IPMODIFY_PEER: |
92 | tr->flags &= ~BPF_TRAMP_F_SHARE_IPMODIFY; |
93 | |
94 | if (tr->flags & BPF_TRAMP_F_ORIG_STACK) |
			ret = bpf_trampoline_update(tr, false /* lock_direct_mutex */);
96 | break; |
97 | default: |
98 | ret = -EINVAL; |
99 | break; |
100 | } |
101 | |
	mutex_unlock(&tr->mutex);
103 | return ret; |
104 | } |
105 | #endif |
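
/* Illustrative summary of the IPMODIFY <-> direct-call negotiation handled
 * by bpf_tramp_ftrace_ops_func() above (a control-flow sketch, not code
 * that runs here):
 *
 *	bpf_trampoline_update(tr, true)
 *	  register_fentry()
 *	    register_ftrace_direct()
 *	      ops_func(ops, FTRACE_OPS_CMD_ENABLE_SHARE_IPMODIFY_SELF)
 *	        sets BPF_TRAMP_F_SHARE_IPMODIFY, returns -EAGAIN
 *	  back in bpf_trampoline_update(): regenerate the trampoline with
 *	  BPF_TRAMP_F_ORIG_STACK set and retry the registration
 *
 * The *_PEER commands arrive from ftrace while direct_mutex is already
 * held, which is why only mutex_trylock(&tr->mutex) is safe there.
 */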
106 | |
107 | bool bpf_prog_has_trampoline(const struct bpf_prog *prog) |
108 | { |
109 | enum bpf_attach_type eatype = prog->expected_attach_type; |
110 | enum bpf_prog_type ptype = prog->type; |
111 | |
112 | return (ptype == BPF_PROG_TYPE_TRACING && |
113 | (eatype == BPF_TRACE_FENTRY || eatype == BPF_TRACE_FEXIT || |
114 | eatype == BPF_MODIFY_RETURN)) || |
115 | (ptype == BPF_PROG_TYPE_LSM && eatype == BPF_LSM_MAC); |
116 | } |
117 | |
118 | void bpf_image_ksym_add(void *data, unsigned int size, struct bpf_ksym *ksym) |
119 | { |
120 | ksym->start = (unsigned long) data; |
121 | ksym->end = ksym->start + size; |
122 | bpf_ksym_add(ksym); |
	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
			   PAGE_SIZE, false, ksym->name);
125 | } |
126 | |
127 | void bpf_image_ksym_del(struct bpf_ksym *ksym) |
128 | { |
129 | bpf_ksym_del(ksym); |
	perf_event_ksymbol(PERF_RECORD_KSYMBOL_TYPE_BPF, ksym->start,
			   PAGE_SIZE, true, ksym->name);
132 | } |
133 | |
134 | static struct bpf_trampoline *bpf_trampoline_lookup(u64 key) |
135 | { |
136 | struct bpf_trampoline *tr; |
137 | struct hlist_head *head; |
138 | int i; |
139 | |
140 | mutex_lock(&trampoline_mutex); |
	head = &trampoline_table[hash_64(key, TRAMPOLINE_HASH_BITS)];
	hlist_for_each_entry(tr, head, hlist) {
		if (tr->key == key) {
			refcount_inc(&tr->refcnt);
			goto out;
		}
	}
	tr = kzalloc(sizeof(*tr), GFP_KERNEL);
149 | if (!tr) |
150 | goto out; |
151 | #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS |
	tr->fops = kzalloc(sizeof(struct ftrace_ops), GFP_KERNEL);
	if (!tr->fops) {
		kfree(tr);
155 | tr = NULL; |
156 | goto out; |
157 | } |
158 | tr->fops->private = tr; |
159 | tr->fops->ops_func = bpf_tramp_ftrace_ops_func; |
160 | #endif |
161 | |
162 | tr->key = key; |
	INIT_HLIST_NODE(&tr->hlist);
	hlist_add_head(&tr->hlist, head);
	refcount_set(&tr->refcnt, 1);
166 | mutex_init(&tr->mutex); |
167 | for (i = 0; i < BPF_TRAMP_MAX; i++) |
168 | INIT_HLIST_HEAD(&tr->progs_hlist[i]); |
169 | out: |
	mutex_unlock(&trampoline_mutex);
171 | return tr; |
172 | } |
173 | |
174 | static int unregister_fentry(struct bpf_trampoline *tr, void *old_addr) |
175 | { |
176 | void *ip = tr->func.addr; |
177 | int ret; |
178 | |
179 | if (tr->func.ftrace_managed) |
		ret = unregister_ftrace_direct(tr->fops, (long)old_addr, false);
	else
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, NULL);
183 | |
184 | return ret; |
185 | } |
186 | |
187 | static int modify_fentry(struct bpf_trampoline *tr, void *old_addr, void *new_addr, |
188 | bool lock_direct_mutex) |
189 | { |
190 | void *ip = tr->func.addr; |
191 | int ret; |
192 | |
193 | if (tr->func.ftrace_managed) { |
194 | if (lock_direct_mutex) |
			ret = modify_ftrace_direct(tr->fops, (long)new_addr);
		else
			ret = modify_ftrace_direct_nolock(tr->fops, (long)new_addr);
	} else {
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, old_addr, new_addr);
200 | } |
201 | return ret; |
202 | } |
203 | |
204 | /* first time registering */ |
205 | static int register_fentry(struct bpf_trampoline *tr, void *new_addr) |
206 | { |
207 | void *ip = tr->func.addr; |
208 | unsigned long faddr; |
209 | int ret; |
210 | |
	faddr = ftrace_location((unsigned long)ip);
212 | if (faddr) { |
213 | if (!tr->fops) |
214 | return -ENOTSUPP; |
215 | tr->func.ftrace_managed = true; |
216 | } |
217 | |
218 | if (tr->func.ftrace_managed) { |
		ftrace_set_filter_ip(tr->fops, (unsigned long)ip, 0, 1);
		ret = register_ftrace_direct(tr->fops, (long)new_addr);
	} else {
		ret = bpf_arch_text_poke(ip, BPF_MOD_CALL, NULL, new_addr);
223 | } |
224 | |
225 | return ret; |
226 | } |
227 | |
228 | static struct bpf_tramp_links * |
229 | bpf_trampoline_get_progs(const struct bpf_trampoline *tr, int *total, bool *ip_arg) |
230 | { |
231 | struct bpf_tramp_link *link; |
232 | struct bpf_tramp_links *tlinks; |
233 | struct bpf_tramp_link **links; |
234 | int kind; |
235 | |
236 | *total = 0; |
	tlinks = kcalloc(BPF_TRAMP_MAX, sizeof(*tlinks), GFP_KERNEL);
	if (!tlinks)
		return ERR_PTR(-ENOMEM);
240 | |
241 | for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { |
242 | tlinks[kind].nr_links = tr->progs_cnt[kind]; |
243 | *total += tr->progs_cnt[kind]; |
244 | links = tlinks[kind].links; |
245 | |
246 | hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { |
247 | *ip_arg |= link->link.prog->call_get_func_ip; |
248 | *links++ = link; |
249 | } |
250 | } |
251 | return tlinks; |
252 | } |
253 | |
254 | static void bpf_tramp_image_free(struct bpf_tramp_image *im) |
255 | { |
	bpf_image_ksym_del(&im->ksym);
	arch_free_bpf_trampoline(im->image, im->size);
	bpf_jit_uncharge_modmem(im->size);
	percpu_ref_exit(&im->pcref);
260 | kfree_rcu(im, rcu); |
261 | } |
262 | |
263 | static void __bpf_tramp_image_put_deferred(struct work_struct *work) |
264 | { |
265 | struct bpf_tramp_image *im; |
266 | |
267 | im = container_of(work, struct bpf_tramp_image, work); |
268 | bpf_tramp_image_free(im); |
269 | } |
270 | |
271 | /* callback, fexit step 3 or fentry step 2 */ |
272 | static void __bpf_tramp_image_put_rcu(struct rcu_head *rcu) |
273 | { |
274 | struct bpf_tramp_image *im; |
275 | |
276 | im = container_of(rcu, struct bpf_tramp_image, rcu); |
277 | INIT_WORK(&im->work, __bpf_tramp_image_put_deferred); |
	schedule_work(&im->work);
279 | } |
280 | |
281 | /* callback, fexit step 2. Called after percpu_ref_kill confirms. */ |
282 | static void __bpf_tramp_image_release(struct percpu_ref *pcref) |
283 | { |
284 | struct bpf_tramp_image *im; |
285 | |
286 | im = container_of(pcref, struct bpf_tramp_image, pcref); |
	call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
288 | } |
289 | |
290 | /* callback, fexit or fentry step 1 */ |
291 | static void __bpf_tramp_image_put_rcu_tasks(struct rcu_head *rcu) |
292 | { |
293 | struct bpf_tramp_image *im; |
294 | |
295 | im = container_of(rcu, struct bpf_tramp_image, rcu); |
	if (im->ip_after_call)
		/* the case of fmod_ret/fexit trampoline and CONFIG_PREEMPTION=y */
		percpu_ref_kill(&im->pcref);
	else
		/* the case of fentry trampoline */
		call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu);
302 | } |
303 | |
304 | static void bpf_tramp_image_put(struct bpf_tramp_image *im) |
305 | { |
	/* The trampoline image that calls the original function is using:
	 * rcu_read_lock_trace to protect sleepable bpf progs
	 * rcu_read_lock to protect normal bpf progs
	 * percpu_ref to protect trampoline itself
	 * rcu tasks to protect the trampoline asm not covered by percpu_ref
	 * (the few asm insns before __bpf_tramp_enter and
	 *  after __bpf_tramp_exit)
	 *
	 * The trampoline is unreachable before bpf_tramp_image_put().
	 *
	 * First, patch the trampoline to avoid calling into fexit progs.
	 * The progs will be freed even if the original function is still
	 * executing or sleeping.
	 * In case of CONFIG_PREEMPTION=y use call_rcu_tasks() to wait for
	 * the first few asm instructions to execute and call into
	 * __bpf_tramp_enter->percpu_ref_get.
	 * Then use percpu_ref_kill to wait for the trampoline and the original
	 * function to finish.
	 * Then use call_rcu_tasks() to make sure the few asm insns in
	 * the trampoline epilogue are done as well.
	 *
	 * In the !PREEMPTION case a task that got interrupted in the first
	 * asm insns won't go through an RCU quiescent state which
	 * percpu_ref_kill will be waiting for. Hence the first
	 * call_rcu_tasks() is not necessary.
	 */
332 | if (im->ip_after_call) { |
		int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
					     NULL, im->ip_epilogue);
		WARN_ON(err);
		if (IS_ENABLED(CONFIG_PREEMPTION))
			call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
		else
			percpu_ref_kill(&im->pcref);
340 | return; |
341 | } |
342 | |
343 | /* The trampoline without fexit and fmod_ret progs doesn't call original |
344 | * function and doesn't use percpu_ref. |
345 | * Use call_rcu_tasks_trace() to wait for sleepable progs to finish. |
346 | * Then use call_rcu_tasks() to wait for the rest of trampoline asm |
347 | * and normal progs. |
348 | */ |
	call_rcu_tasks_trace(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
350 | } |
351 | |
352 | static struct bpf_tramp_image *bpf_tramp_image_alloc(u64 key, int size) |
353 | { |
354 | struct bpf_tramp_image *im; |
355 | struct bpf_ksym *ksym; |
356 | void *image; |
357 | int err = -ENOMEM; |
358 | |
	im = kzalloc(sizeof(*im), GFP_KERNEL);
360 | if (!im) |
361 | goto out; |
362 | |
363 | err = bpf_jit_charge_modmem(size); |
364 | if (err) |
365 | goto out_free_im; |
366 | im->size = size; |
367 | |
368 | err = -ENOMEM; |
369 | im->image = image = arch_alloc_bpf_trampoline(size); |
370 | if (!image) |
371 | goto out_uncharge; |
372 | |
	err = percpu_ref_init(&im->pcref, __bpf_tramp_image_release, 0, GFP_KERNEL);
	if (err)
		goto out_free_image;

	ksym = &im->ksym;
	INIT_LIST_HEAD_RCU(&ksym->lnode);
	snprintf(ksym->name, KSYM_NAME_LEN, "bpf_trampoline_%llu", key);
	bpf_image_ksym_add(image, size, ksym);
381 | return im; |
382 | |
383 | out_free_image: |
	arch_free_bpf_trampoline(im->image, im->size);
out_uncharge:
	bpf_jit_uncharge_modmem(size);
out_free_im:
	kfree(im);
out:
	return ERR_PTR(err);
391 | } |
392 | |
393 | static int bpf_trampoline_update(struct bpf_trampoline *tr, bool lock_direct_mutex) |
394 | { |
395 | struct bpf_tramp_image *im; |
396 | struct bpf_tramp_links *tlinks; |
397 | u32 orig_flags = tr->flags; |
398 | bool ip_arg = false; |
399 | int err, total, size; |
400 | |
	tlinks = bpf_trampoline_get_progs(tr, &total, &ip_arg);
	if (IS_ERR(tlinks))
		return PTR_ERR(tlinks);

	if (total == 0) {
		err = unregister_fentry(tr, tr->cur_image->image);
		bpf_tramp_image_put(tr->cur_image);
408 | tr->cur_image = NULL; |
409 | goto out; |
410 | } |
411 | |
412 | /* clear all bits except SHARE_IPMODIFY and TAIL_CALL_CTX */ |
413 | tr->flags &= (BPF_TRAMP_F_SHARE_IPMODIFY | BPF_TRAMP_F_TAIL_CALL_CTX); |
414 | |
415 | if (tlinks[BPF_TRAMP_FEXIT].nr_links || |
416 | tlinks[BPF_TRAMP_MODIFY_RETURN].nr_links) { |
417 | /* NOTE: BPF_TRAMP_F_RESTORE_REGS and BPF_TRAMP_F_SKIP_FRAME |
418 | * should not be set together. |
419 | */ |
420 | tr->flags |= BPF_TRAMP_F_CALL_ORIG | BPF_TRAMP_F_SKIP_FRAME; |
421 | } else { |
422 | tr->flags |= BPF_TRAMP_F_RESTORE_REGS; |
423 | } |
424 | |
425 | if (ip_arg) |
426 | tr->flags |= BPF_TRAMP_F_IP_ARG; |
427 | |
428 | #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS |
429 | again: |
430 | if ((tr->flags & BPF_TRAMP_F_SHARE_IPMODIFY) && |
431 | (tr->flags & BPF_TRAMP_F_CALL_ORIG)) |
432 | tr->flags |= BPF_TRAMP_F_ORIG_STACK; |
433 | #endif |
434 | |
	size = arch_bpf_trampoline_size(&tr->func.model, tr->flags,
					tlinks, tr->func.addr);
437 | if (size < 0) { |
438 | err = size; |
439 | goto out; |
440 | } |
441 | |
442 | if (size > PAGE_SIZE) { |
443 | err = -E2BIG; |
444 | goto out; |
445 | } |
446 | |
	im = bpf_tramp_image_alloc(tr->key, size);
	if (IS_ERR(im)) {
		err = PTR_ERR(im);
		goto out;
	}

	err = arch_prepare_bpf_trampoline(im, im->image, im->image + size,
					  &tr->func.model, tr->flags, tlinks,
					  tr->func.addr);
	if (err < 0)
		goto out_free;

	arch_protect_bpf_trampoline(im->image, im->size);
460 | |
461 | WARN_ON(tr->cur_image && total == 0); |
	if (tr->cur_image)
		/* progs already running at this address */
		err = modify_fentry(tr, tr->cur_image->image, im->image, lock_direct_mutex);
	else
		/* first time registering */
		err = register_fentry(tr, im->image);
468 | |
469 | #ifdef CONFIG_DYNAMIC_FTRACE_WITH_DIRECT_CALLS |
470 | if (err == -EAGAIN) { |
		/* -EAGAIN from bpf_tramp_ftrace_ops_func. Now that
		 * BPF_TRAMP_F_SHARE_IPMODIFY is set, we can generate the
		 * trampoline again and retry the registration.
		 */
475 | /* reset fops->func and fops->trampoline for re-register */ |
476 | tr->fops->func = NULL; |
477 | tr->fops->trampoline = 0; |
478 | |
479 | /* free im memory and reallocate later */ |
480 | bpf_tramp_image_free(im); |
481 | goto again; |
482 | } |
483 | #endif |
484 | if (err) |
485 | goto out_free; |
486 | |
	if (tr->cur_image)
		bpf_tramp_image_put(tr->cur_image);
	tr->cur_image = im;
out:
	/* If any error happens, restore previous flags */
	if (err)
		tr->flags = orig_flags;
	kfree(tlinks);
495 | return err; |
496 | |
497 | out_free: |
498 | bpf_tramp_image_free(im); |
499 | goto out; |
500 | } |
501 | |
502 | static enum bpf_tramp_prog_type bpf_attach_type_to_tramp(struct bpf_prog *prog) |
503 | { |
504 | switch (prog->expected_attach_type) { |
505 | case BPF_TRACE_FENTRY: |
506 | return BPF_TRAMP_FENTRY; |
507 | case BPF_MODIFY_RETURN: |
508 | return BPF_TRAMP_MODIFY_RETURN; |
509 | case BPF_TRACE_FEXIT: |
510 | return BPF_TRAMP_FEXIT; |
511 | case BPF_LSM_MAC: |
512 | if (!prog->aux->attach_func_proto->type) |
513 | /* The function returns void, we cannot modify its |
514 | * return value. |
515 | */ |
516 | return BPF_TRAMP_FEXIT; |
517 | else |
518 | return BPF_TRAMP_MODIFY_RETURN; |
519 | default: |
520 | return BPF_TRAMP_REPLACE; |
521 | } |
522 | } |
523 | |
524 | static int __bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) |
525 | { |
526 | enum bpf_tramp_prog_type kind; |
527 | struct bpf_tramp_link *link_exiting; |
528 | int err = 0; |
529 | int cnt = 0, i; |
530 | |
	kind = bpf_attach_type_to_tramp(link->link.prog);
532 | if (tr->extension_prog) |
533 | /* cannot attach fentry/fexit if extension prog is attached. |
534 | * cannot overwrite extension prog either. |
535 | */ |
536 | return -EBUSY; |
537 | |
538 | for (i = 0; i < BPF_TRAMP_MAX; i++) |
539 | cnt += tr->progs_cnt[i]; |
540 | |
541 | if (kind == BPF_TRAMP_REPLACE) { |
542 | /* Cannot attach extension if fentry/fexit are in use. */ |
543 | if (cnt) |
544 | return -EBUSY; |
545 | tr->extension_prog = link->link.prog; |
		return bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP, NULL,
					  link->link.prog->bpf_func);
	}
	if (cnt >= BPF_MAX_TRAMP_LINKS)
		return -E2BIG;
	if (!hlist_unhashed(&link->tramp_hlist))
552 | /* prog already linked */ |
553 | return -EBUSY; |
554 | hlist_for_each_entry(link_exiting, &tr->progs_hlist[kind], tramp_hlist) { |
555 | if (link_exiting->link.prog != link->link.prog) |
556 | continue; |
557 | /* prog already linked */ |
558 | return -EBUSY; |
559 | } |
560 | |
	hlist_add_head(&link->tramp_hlist, &tr->progs_hlist[kind]);
	tr->progs_cnt[kind]++;
	err = bpf_trampoline_update(tr, true /* lock_direct_mutex */);
	if (err) {
		hlist_del_init(&link->tramp_hlist);
566 | tr->progs_cnt[kind]--; |
567 | } |
568 | return err; |
569 | } |
570 | |
571 | int bpf_trampoline_link_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) |
572 | { |
573 | int err; |
574 | |
575 | mutex_lock(&tr->mutex); |
576 | err = __bpf_trampoline_link_prog(link, tr); |
	mutex_unlock(&tr->mutex);
578 | return err; |
579 | } |
580 | |
581 | static int __bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) |
582 | { |
583 | enum bpf_tramp_prog_type kind; |
584 | int err; |
585 | |
	kind = bpf_attach_type_to_tramp(link->link.prog);
	if (kind == BPF_TRAMP_REPLACE) {
		WARN_ON_ONCE(!tr->extension_prog);
		err = bpf_arch_text_poke(tr->func.addr, BPF_MOD_JUMP,
					 tr->extension_prog->bpf_func, NULL);
		tr->extension_prog = NULL;
		return err;
	}
	hlist_del_init(&link->tramp_hlist);
	tr->progs_cnt[kind]--;
	return bpf_trampoline_update(tr, true /* lock_direct_mutex */);
597 | } |
598 | |
599 | /* bpf_trampoline_unlink_prog() should never fail. */ |
600 | int bpf_trampoline_unlink_prog(struct bpf_tramp_link *link, struct bpf_trampoline *tr) |
601 | { |
602 | int err; |
603 | |
604 | mutex_lock(&tr->mutex); |
605 | err = __bpf_trampoline_unlink_prog(link, tr); |
	mutex_unlock(&tr->mutex);
607 | return err; |
608 | } |
609 | |
610 | #if defined(CONFIG_CGROUP_BPF) && defined(CONFIG_BPF_LSM) |
611 | static void bpf_shim_tramp_link_release(struct bpf_link *link) |
612 | { |
613 | struct bpf_shim_tramp_link *shim_link = |
614 | container_of(link, struct bpf_shim_tramp_link, link.link); |
615 | |
616 | /* paired with 'shim_link->trampoline = tr' in bpf_trampoline_link_cgroup_shim */ |
617 | if (!shim_link->trampoline) |
618 | return; |
619 | |
620 | WARN_ON_ONCE(bpf_trampoline_unlink_prog(&shim_link->link, shim_link->trampoline)); |
	bpf_trampoline_put(shim_link->trampoline);
622 | } |
623 | |
624 | static void bpf_shim_tramp_link_dealloc(struct bpf_link *link) |
625 | { |
626 | struct bpf_shim_tramp_link *shim_link = |
627 | container_of(link, struct bpf_shim_tramp_link, link.link); |
628 | |
	kfree(shim_link);
630 | } |
631 | |
632 | static const struct bpf_link_ops bpf_shim_tramp_link_lops = { |
633 | .release = bpf_shim_tramp_link_release, |
634 | .dealloc = bpf_shim_tramp_link_dealloc, |
635 | }; |
636 | |
637 | static struct bpf_shim_tramp_link *cgroup_shim_alloc(const struct bpf_prog *prog, |
638 | bpf_func_t bpf_func, |
639 | int cgroup_atype) |
640 | { |
641 | struct bpf_shim_tramp_link *shim_link = NULL; |
642 | struct bpf_prog *p; |
643 | |
	shim_link = kzalloc(sizeof(*shim_link), GFP_USER);
	if (!shim_link)
		return NULL;

	p = bpf_prog_alloc(1, 0);
	if (!p) {
		kfree(shim_link);
651 | return NULL; |
652 | } |
653 | |
654 | p->jited = false; |
655 | p->bpf_func = bpf_func; |
656 | |
657 | p->aux->cgroup_atype = cgroup_atype; |
658 | p->aux->attach_func_proto = prog->aux->attach_func_proto; |
659 | p->aux->attach_btf_id = prog->aux->attach_btf_id; |
660 | p->aux->attach_btf = prog->aux->attach_btf; |
	btf_get(p->aux->attach_btf);
	p->type = BPF_PROG_TYPE_LSM;
	p->expected_attach_type = BPF_LSM_MAC;
	bpf_prog_inc(p);
	bpf_link_init(&shim_link->link.link, BPF_LINK_TYPE_UNSPEC,
		      &bpf_shim_tramp_link_lops, p);
	bpf_cgroup_atype_get(p->aux->attach_btf_id, cgroup_atype);
668 | |
669 | return shim_link; |
670 | } |
671 | |
672 | static struct bpf_shim_tramp_link *cgroup_shim_find(struct bpf_trampoline *tr, |
673 | bpf_func_t bpf_func) |
674 | { |
675 | struct bpf_tramp_link *link; |
676 | int kind; |
677 | |
678 | for (kind = 0; kind < BPF_TRAMP_MAX; kind++) { |
679 | hlist_for_each_entry(link, &tr->progs_hlist[kind], tramp_hlist) { |
680 | struct bpf_prog *p = link->link.prog; |
681 | |
682 | if (p->bpf_func == bpf_func) |
683 | return container_of(link, struct bpf_shim_tramp_link, link); |
684 | } |
685 | } |
686 | |
687 | return NULL; |
688 | } |
689 | |
690 | int bpf_trampoline_link_cgroup_shim(struct bpf_prog *prog, |
691 | int cgroup_atype) |
692 | { |
693 | struct bpf_shim_tramp_link *shim_link = NULL; |
694 | struct bpf_attach_target_info tgt_info = {}; |
695 | struct bpf_trampoline *tr; |
696 | bpf_func_t bpf_func; |
697 | u64 key; |
698 | int err; |
699 | |
700 | err = bpf_check_attach_target(NULL, prog, NULL, |
				      prog->aux->attach_btf_id,
				      &tgt_info);
	if (err)
		return err;

	key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
					 prog->aux->attach_btf_id);

	bpf_lsm_find_cgroup_shim(prog, &bpf_func);
	tr = bpf_trampoline_get(key, &tgt_info);
711 | if (!tr) |
712 | return -ENOMEM; |
713 | |
714 | mutex_lock(&tr->mutex); |
715 | |
716 | shim_link = cgroup_shim_find(tr, bpf_func); |
717 | if (shim_link) { |
		/* Reusing existing shim attached by another program. */
		bpf_link_inc(&shim_link->link.link);

		mutex_unlock(&tr->mutex);
722 | bpf_trampoline_put(tr); /* bpf_trampoline_get above */ |
723 | return 0; |
724 | } |
725 | |
726 | /* Allocate and install new shim. */ |
727 | |
728 | shim_link = cgroup_shim_alloc(prog, bpf_func, cgroup_atype); |
729 | if (!shim_link) { |
730 | err = -ENOMEM; |
731 | goto err; |
732 | } |
733 | |
	err = __bpf_trampoline_link_prog(&shim_link->link, tr);
735 | if (err) |
736 | goto err; |
737 | |
738 | shim_link->trampoline = tr; |
739 | /* note, we're still holding tr refcnt from above */ |
740 | |
	mutex_unlock(&tr->mutex);
742 | |
743 | return 0; |
744 | err: |
	mutex_unlock(&tr->mutex);

	if (shim_link)
		bpf_link_put(&shim_link->link.link);
749 | |
750 | /* have to release tr while _not_ holding its mutex */ |
751 | bpf_trampoline_put(tr); /* bpf_trampoline_get above */ |
752 | |
753 | return err; |
754 | } |
755 | |
756 | void bpf_trampoline_unlink_cgroup_shim(struct bpf_prog *prog) |
757 | { |
758 | struct bpf_shim_tramp_link *shim_link = NULL; |
759 | struct bpf_trampoline *tr; |
760 | bpf_func_t bpf_func; |
761 | u64 key; |
762 | |
	key = bpf_trampoline_compute_key(NULL, prog->aux->attach_btf,
					 prog->aux->attach_btf_id);

	bpf_lsm_find_cgroup_shim(prog, &bpf_func);
767 | tr = bpf_trampoline_lookup(key); |
768 | if (WARN_ON_ONCE(!tr)) |
769 | return; |
770 | |
771 | mutex_lock(&tr->mutex); |
772 | shim_link = cgroup_shim_find(tr, bpf_func); |
	mutex_unlock(&tr->mutex);

	if (shim_link)
		bpf_link_put(&shim_link->link.link);
777 | |
778 | bpf_trampoline_put(tr); /* bpf_trampoline_lookup above */ |
779 | } |
780 | #endif |
781 | |
782 | struct bpf_trampoline *bpf_trampoline_get(u64 key, |
783 | struct bpf_attach_target_info *tgt_info) |
784 | { |
785 | struct bpf_trampoline *tr; |
786 | |
787 | tr = bpf_trampoline_lookup(key); |
788 | if (!tr) |
789 | return NULL; |
790 | |
791 | mutex_lock(&tr->mutex); |
792 | if (tr->func.addr) |
793 | goto out; |
794 | |
795 | memcpy(&tr->func.model, &tgt_info->fmodel, sizeof(tgt_info->fmodel)); |
796 | tr->func.addr = (void *)tgt_info->tgt_addr; |
797 | out: |
	mutex_unlock(&tr->mutex);
799 | return tr; |
800 | } |
801 | |
802 | void bpf_trampoline_put(struct bpf_trampoline *tr) |
803 | { |
804 | int i; |
805 | |
806 | if (!tr) |
807 | return; |
808 | mutex_lock(&trampoline_mutex); |
	if (!refcount_dec_and_test(&tr->refcnt))
810 | goto out; |
811 | WARN_ON_ONCE(mutex_is_locked(&tr->mutex)); |
812 | |
813 | for (i = 0; i < BPF_TRAMP_MAX; i++) |
814 | if (WARN_ON_ONCE(!hlist_empty(&tr->progs_hlist[i]))) |
815 | goto out; |
816 | |
817 | /* This code will be executed even when the last bpf_tramp_image |
818 | * is alive. All progs are detached from the trampoline and the |
819 | * trampoline image is patched with jmp into epilogue to skip |
820 | * fexit progs. The fentry-only trampoline will be freed via |
821 | * multiple rcu callbacks. |
822 | */ |
	hlist_del(&tr->hlist);
	if (tr->fops) {
		ftrace_free_filter(tr->fops);
		kfree(tr->fops);
	}
	kfree(tr);
829 | out: |
	mutex_unlock(&trampoline_mutex);
831 | } |
832 | |
833 | #define NO_START_TIME 1 |
834 | static __always_inline u64 notrace bpf_prog_start_time(void) |
835 | { |
836 | u64 start = NO_START_TIME; |
837 | |
838 | if (static_branch_unlikely(&bpf_stats_enabled_key)) { |
839 | start = sched_clock(); |
840 | if (unlikely(!start)) |
841 | start = NO_START_TIME; |
842 | } |
843 | return start; |
844 | } |
845 | |
846 | /* The logic is similar to bpf_prog_run(), but with an explicit |
847 | * rcu_read_lock() and migrate_disable() which are required |
848 | * for the trampoline. The macro is split into |
849 | * call __bpf_prog_enter |
850 | * call prog->bpf_func |
851 | * call __bpf_prog_exit |
852 | * |
853 | * __bpf_prog_enter returns: |
854 | * 0 - skip execution of the bpf prog |
855 | * 1 - execute bpf prog |
856 | * [2..MAX_U64] - execute bpf prog and record execution time. |
857 | * This is start time. |
858 | */ |
859 | static u64 notrace __bpf_prog_enter_recur(struct bpf_prog *prog, struct bpf_tramp_run_ctx *run_ctx) |
860 | __acquires(RCU) |
861 | { |
862 | rcu_read_lock(); |
863 | migrate_disable(); |
864 | |
	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
866 | |
867 | if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { |
868 | bpf_prog_inc_misses_counter(prog); |
869 | return 0; |
870 | } |
871 | return bpf_prog_start_time(); |
872 | } |
873 | |
874 | static void notrace update_prog_stats(struct bpf_prog *prog, |
875 | u64 start) |
876 | { |
877 | struct bpf_prog_stats *stats; |
878 | |
879 | if (static_branch_unlikely(&bpf_stats_enabled_key) && |
880 | /* static_key could be enabled in __bpf_prog_enter* |
881 | * and disabled in __bpf_prog_exit*. |
882 | * And vice versa. |
883 | * Hence check that 'start' is valid. |
884 | */ |
885 | start > NO_START_TIME) { |
886 | unsigned long flags; |
887 | |
888 | stats = this_cpu_ptr(prog->stats); |
		flags = u64_stats_update_begin_irqsave(&stats->syncp);
		u64_stats_inc(&stats->cnt);
		u64_stats_add(&stats->nsecs, sched_clock() - start);
		u64_stats_update_end_irqrestore(&stats->syncp, flags);
893 | } |
894 | } |
895 | |
896 | static void notrace __bpf_prog_exit_recur(struct bpf_prog *prog, u64 start, |
897 | struct bpf_tramp_run_ctx *run_ctx) |
898 | __releases(RCU) |
899 | { |
	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
901 | |
902 | update_prog_stats(prog, start); |
903 | this_cpu_dec(*(prog->active)); |
904 | migrate_enable(); |
905 | rcu_read_unlock(); |
906 | } |
907 | |
908 | static u64 notrace __bpf_prog_enter_lsm_cgroup(struct bpf_prog *prog, |
909 | struct bpf_tramp_run_ctx *run_ctx) |
910 | __acquires(RCU) |
911 | { |
912 | /* Runtime stats are exported via actual BPF_LSM_CGROUP |
913 | * programs, not the shims. |
914 | */ |
915 | rcu_read_lock(); |
916 | migrate_disable(); |
917 | |
	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
919 | |
920 | return NO_START_TIME; |
921 | } |
922 | |
923 | static void notrace __bpf_prog_exit_lsm_cgroup(struct bpf_prog *prog, u64 start, |
924 | struct bpf_tramp_run_ctx *run_ctx) |
925 | __releases(RCU) |
926 | { |
	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
928 | |
929 | migrate_enable(); |
930 | rcu_read_unlock(); |
931 | } |
932 | |
933 | u64 notrace __bpf_prog_enter_sleepable_recur(struct bpf_prog *prog, |
934 | struct bpf_tramp_run_ctx *run_ctx) |
935 | { |
936 | rcu_read_lock_trace(); |
937 | migrate_disable(); |
938 | might_fault(); |
939 | |
	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
941 | |
942 | if (unlikely(this_cpu_inc_return(*(prog->active)) != 1)) { |
943 | bpf_prog_inc_misses_counter(prog); |
944 | return 0; |
945 | } |
946 | return bpf_prog_start_time(); |
947 | } |
948 | |
949 | void notrace __bpf_prog_exit_sleepable_recur(struct bpf_prog *prog, u64 start, |
950 | struct bpf_tramp_run_ctx *run_ctx) |
951 | { |
	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
953 | |
954 | update_prog_stats(prog, start); |
955 | this_cpu_dec(*(prog->active)); |
956 | migrate_enable(); |
957 | rcu_read_unlock_trace(); |
958 | } |
959 | |
960 | static u64 notrace __bpf_prog_enter_sleepable(struct bpf_prog *prog, |
961 | struct bpf_tramp_run_ctx *run_ctx) |
962 | { |
963 | rcu_read_lock_trace(); |
964 | migrate_disable(); |
965 | might_fault(); |
966 | |
	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
968 | |
969 | return bpf_prog_start_time(); |
970 | } |
971 | |
972 | static void notrace __bpf_prog_exit_sleepable(struct bpf_prog *prog, u64 start, |
973 | struct bpf_tramp_run_ctx *run_ctx) |
974 | { |
	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
976 | |
977 | update_prog_stats(prog, start); |
978 | migrate_enable(); |
979 | rcu_read_unlock_trace(); |
980 | } |
981 | |
982 | static u64 notrace __bpf_prog_enter(struct bpf_prog *prog, |
983 | struct bpf_tramp_run_ctx *run_ctx) |
984 | __acquires(RCU) |
985 | { |
986 | rcu_read_lock(); |
987 | migrate_disable(); |
988 | |
	run_ctx->saved_run_ctx = bpf_set_run_ctx(&run_ctx->run_ctx);
990 | |
991 | return bpf_prog_start_time(); |
992 | } |
993 | |
994 | static void notrace __bpf_prog_exit(struct bpf_prog *prog, u64 start, |
995 | struct bpf_tramp_run_ctx *run_ctx) |
996 | __releases(RCU) |
997 | { |
	bpf_reset_run_ctx(run_ctx->saved_run_ctx);
999 | |
1000 | update_prog_stats(prog, start); |
1001 | migrate_enable(); |
1002 | rcu_read_unlock(); |
1003 | } |
1004 | |
1005 | void notrace __bpf_tramp_enter(struct bpf_tramp_image *tr) |
1006 | { |
	percpu_ref_get(&tr->pcref);
1008 | } |
1009 | |
1010 | void notrace __bpf_tramp_exit(struct bpf_tramp_image *tr) |
1011 | { |
	percpu_ref_put(&tr->pcref);
1013 | } |
1014 | |
1015 | bpf_trampoline_enter_t bpf_trampoline_enter(const struct bpf_prog *prog) |
1016 | { |
1017 | bool sleepable = prog->sleepable; |
1018 | |
1019 | if (bpf_prog_check_recur(prog)) |
1020 | return sleepable ? __bpf_prog_enter_sleepable_recur : |
1021 | __bpf_prog_enter_recur; |
1022 | |
1023 | if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && |
1024 | prog->expected_attach_type == BPF_LSM_CGROUP) |
1025 | return __bpf_prog_enter_lsm_cgroup; |
1026 | |
1027 | return sleepable ? __bpf_prog_enter_sleepable : __bpf_prog_enter; |
1028 | } |
1029 | |
1030 | bpf_trampoline_exit_t bpf_trampoline_exit(const struct bpf_prog *prog) |
1031 | { |
1032 | bool sleepable = prog->sleepable; |
1033 | |
1034 | if (bpf_prog_check_recur(prog)) |
1035 | return sleepable ? __bpf_prog_exit_sleepable_recur : |
1036 | __bpf_prog_exit_recur; |
1037 | |
1038 | if (resolve_prog_type(prog) == BPF_PROG_TYPE_LSM && |
1039 | prog->expected_attach_type == BPF_LSM_CGROUP) |
1040 | return __bpf_prog_exit_lsm_cgroup; |
1041 | |
1042 | return sleepable ? __bpf_prog_exit_sleepable : __bpf_prog_exit; |
1043 | } |
1044 | |
1045 | int __weak |
1046 | arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, void *image, void *image_end, |
1047 | const struct btf_func_model *m, u32 flags, |
1048 | struct bpf_tramp_links *tlinks, |
1049 | void *func_addr) |
1050 | { |
1051 | return -ENOTSUPP; |
1052 | } |
1053 | |
1054 | void * __weak arch_alloc_bpf_trampoline(unsigned int size) |
1055 | { |
1056 | void *image; |
1057 | |
1058 | if (WARN_ON_ONCE(size > PAGE_SIZE)) |
1059 | return NULL; |
1060 | image = bpf_jit_alloc_exec(PAGE_SIZE); |
1061 | if (image) |
1062 | set_vm_flush_reset_perms(image); |
1063 | return image; |
1064 | } |
1065 | |
1066 | void __weak arch_free_bpf_trampoline(void *image, unsigned int size) |
1067 | { |
1068 | WARN_ON_ONCE(size > PAGE_SIZE); |
1069 | /* bpf_jit_free_exec doesn't need "size", but |
1070 | * bpf_prog_pack_free() needs it. |
1071 | */ |
	bpf_jit_free_exec(image);
1073 | } |
1074 | |
1075 | void __weak arch_protect_bpf_trampoline(void *image, unsigned int size) |
1076 | { |
1077 | WARN_ON_ONCE(size > PAGE_SIZE); |
	set_memory_rox((long)image, 1);
1079 | } |
1080 | |
1081 | void __weak arch_unprotect_bpf_trampoline(void *image, unsigned int size) |
1082 | { |
1083 | WARN_ON_ONCE(size > PAGE_SIZE); |
	set_memory_nx((long)image, 1);
	set_memory_rw((long)image, 1);
1086 | } |
1087 | |
1088 | int __weak arch_bpf_trampoline_size(const struct btf_func_model *m, u32 flags, |
1089 | struct bpf_tramp_links *tlinks, void *func_addr) |
1090 | { |
1091 | return -ENOTSUPP; |
1092 | } |
1093 | |
1094 | static int __init init_trampolines(void) |
1095 | { |
1096 | int i; |
1097 | |
1098 | for (i = 0; i < TRAMPOLINE_TABLE_SIZE; i++) |
1099 | INIT_HLIST_HEAD(&trampoline_table[i]); |
1100 | return 0; |
1101 | } |
1102 | late_initcall(init_trampolines); |
1103 | |