1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Context tracking: Probe on high level context boundaries such as kernel, |
4 | * userspace, guest or idle. |
5 | * |
6 | * This is used by RCU to remove its dependency on the timer tick while a CPU |
7 | * runs in idle, userspace or guest mode. |
8 | * |
9 | * User/guest tracking started by Frederic Weisbecker: |
10 | * |
11 | * Copyright (C) 2012 Red Hat, Inc., Frederic Weisbecker |
12 | * |
13 | * Many thanks to Gilad Ben-Yossef, Paul McKenney, Ingo Molnar, Andrew Morton, |
14 | * Steven Rostedt, Peter Zijlstra for suggestions and improvements. |
15 | * |
16 | * RCU extended quiescent state bits imported from kernel/rcu/tree.c |
17 | * where the relevant authorship may be found. |
18 | */ |
19 | |
20 | #include <linux/context_tracking.h> |
21 | #include <linux/rcupdate.h> |
22 | #include <linux/sched.h> |
23 | #include <linux/hardirq.h> |
24 | #include <linux/export.h> |
25 | #include <linux/kprobes.h> |
26 | #include <trace/events/rcu.h> |
27 | |
28 | |
29 | DEFINE_PER_CPU(struct context_tracking, context_tracking) = { |
30 | #ifdef CONFIG_CONTEXT_TRACKING_IDLE |
31 | .dynticks_nesting = 1, |
32 | .dynticks_nmi_nesting = DYNTICK_IRQ_NONIDLE, |
33 | #endif |
34 | .state = ATOMIC_INIT(RCU_DYNTICKS_IDX), |
35 | }; |
36 | EXPORT_SYMBOL_GPL(context_tracking); |
37 | |
38 | #ifdef CONFIG_CONTEXT_TRACKING_IDLE |
39 | #define TPS(x) tracepoint_string(x) |
40 | |
41 | /* Record the current task on dyntick-idle entry. */ |
42 | static __always_inline void rcu_dynticks_task_enter(void) |
43 | { |
44 | #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) |
45 | WRITE_ONCE(current->rcu_tasks_idle_cpu, smp_processor_id()); |
46 | #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ |
47 | } |
48 | |
49 | /* Record no current task on dyntick-idle exit. */ |
50 | static __always_inline void rcu_dynticks_task_exit(void) |
51 | { |
52 | #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) |
53 | WRITE_ONCE(current->rcu_tasks_idle_cpu, -1); |
54 | #endif /* #if defined(CONFIG_TASKS_RCU) && defined(CONFIG_NO_HZ_FULL) */ |
55 | } |
56 | |
57 | /* Turn on heavyweight RCU tasks trace readers on idle/user entry. */ |
58 | static __always_inline void rcu_dynticks_task_trace_enter(void) |
59 | { |
60 | #ifdef CONFIG_TASKS_TRACE_RCU |
61 | if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) |
62 | current->trc_reader_special.b.need_mb = true; |
63 | #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ |
64 | } |
65 | |
66 | /* Turn off heavyweight RCU tasks trace readers on idle/user exit. */ |
67 | static __always_inline void rcu_dynticks_task_trace_exit(void) |
68 | { |
69 | #ifdef CONFIG_TASKS_TRACE_RCU |
70 | if (IS_ENABLED(CONFIG_TASKS_TRACE_RCU_READ_MB)) |
71 | current->trc_reader_special.b.need_mb = false; |
72 | #endif /* #ifdef CONFIG_TASKS_TRACE_RCU */ |
73 | } |
74 | |
75 | /* |
76 | * Record entry into an extended quiescent state. This is only to be |
77 | * called when not already in an extended quiescent state, that is, |
78 | * RCU is watching prior to the call to this function and is no longer |
79 | * watching upon return. |
80 | */ |
81 | static noinstr void ct_kernel_exit_state(int offset) |
82 | { |
83 | int seq; |
84 | |
85 | /* |
86 | * CPUs seeing atomic_add_return() must see prior RCU read-side |
87 | * critical sections, and we also must force ordering with the |
88 | * next idle sojourn. |
89 | */ |
90 | rcu_dynticks_task_trace_enter(); // Before ->dynticks update! |
	seq = ct_state_inc(offset);
92 | // RCU is no longer watching. Better be in extended quiescent state! |
93 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && (seq & RCU_DYNTICKS_IDX)); |
94 | } |
95 | |
96 | /* |
97 | * Record exit from an extended quiescent state. This is only to be |
98 | * called from an extended quiescent state, that is, RCU is not watching |
99 | * prior to the call to this function and is watching upon return. |
100 | */ |
101 | static noinstr void ct_kernel_enter_state(int offset) |
102 | { |
103 | int seq; |
104 | |
105 | /* |
106 | * CPUs seeing atomic_add_return() must see prior idle sojourns, |
107 | * and we also must force ordering with the next RCU read-side |
108 | * critical section. |
109 | */ |
	seq = ct_state_inc(offset);
111 | // RCU is now watching. Better not be in an extended quiescent state! |
112 | rcu_dynticks_task_trace_exit(); // After ->dynticks update! |
113 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !(seq & RCU_DYNTICKS_IDX)); |
114 | } |
115 | |
116 | /* |
117 | * Enter an RCU extended quiescent state, which can be either the |
118 | * idle loop or adaptive-tickless usermode execution. |
119 | * |
120 | * We crowbar the ->dynticks_nmi_nesting field to zero to allow for |
121 | * the possibility of usermode upcalls having messed up our count |
122 | * of interrupt nesting level during the prior busy period. |
123 | */ |
124 | static void noinstr ct_kernel_exit(bool user, int offset) |
125 | { |
126 | struct context_tracking *ct = this_cpu_ptr(&context_tracking); |
127 | |
128 | WARN_ON_ONCE(ct_dynticks_nmi_nesting() != DYNTICK_IRQ_NONIDLE); |
129 | WRITE_ONCE(ct->dynticks_nmi_nesting, 0); |
130 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && |
131 | ct_dynticks_nesting() == 0); |
132 | if (ct_dynticks_nesting() != 1) { |
133 | // RCU will still be watching, so just do accounting and leave. |
134 | ct->dynticks_nesting--; |
135 | return; |
136 | } |
137 | |
138 | instrumentation_begin(); |
139 | lockdep_assert_irqs_disabled(); |
140 | trace_rcu_dyntick(TPS("Start" ), oldnesting: ct_dynticks_nesting(), newnesting: 0, dynticks: ct_dynticks()); |
141 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); |
142 | rcu_preempt_deferred_qs(current); |
143 | |
144 | // instrumentation for the noinstr ct_kernel_exit_state() |
	instrument_atomic_write(&ct->state, sizeof(ct->state));
146 | |
147 | instrumentation_end(); |
148 | WRITE_ONCE(ct->dynticks_nesting, 0); /* Avoid irq-access tearing. */ |
149 | // RCU is watching here ... |
150 | ct_kernel_exit_state(offset); |
151 | // ... but is no longer watching here. |
152 | rcu_dynticks_task_enter(); |
153 | } |
154 | |
155 | /* |
156 | * Exit an RCU extended quiescent state, which can be either the |
157 | * idle loop or adaptive-tickless usermode execution. |
158 | * |
159 | * We crowbar the ->dynticks_nmi_nesting field to DYNTICK_IRQ_NONIDLE to |
160 | * allow for the possibility of usermode upcalls messing up our count of |
161 | * interrupt nesting level during the busy period that is just now starting. |
162 | */ |
163 | static void noinstr ct_kernel_enter(bool user, int offset) |
164 | { |
165 | struct context_tracking *ct = this_cpu_ptr(&context_tracking); |
166 | long oldval; |
167 | |
168 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); |
169 | oldval = ct_dynticks_nesting(); |
170 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && oldval < 0); |
171 | if (oldval) { |
172 | // RCU was already watching, so just do accounting and leave. |
173 | ct->dynticks_nesting++; |
174 | return; |
175 | } |
176 | rcu_dynticks_task_exit(); |
177 | // RCU is not watching here ... |
178 | ct_kernel_enter_state(offset); |
179 | // ... but is watching here. |
180 | instrumentation_begin(); |
181 | |
182 | // instrumentation for the noinstr ct_kernel_enter_state() |
	instrument_atomic_write(&ct->state, sizeof(ct->state));
184 | |
185 | trace_rcu_dyntick(TPS("End" ), oldnesting: ct_dynticks_nesting(), newnesting: 1, dynticks: ct_dynticks()); |
186 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !user && !is_idle_task(current)); |
187 | WRITE_ONCE(ct->dynticks_nesting, 1); |
188 | WARN_ON_ONCE(ct_dynticks_nmi_nesting()); |
189 | WRITE_ONCE(ct->dynticks_nmi_nesting, DYNTICK_IRQ_NONIDLE); |
190 | instrumentation_end(); |
191 | } |
192 | |
193 | /** |
194 | * ct_nmi_exit - inform RCU of exit from NMI context |
195 | * |
196 | * If we are returning from the outermost NMI handler that interrupted an |
197 | * RCU-idle period, update ct->state and ct->dynticks_nmi_nesting |
198 | * to let the RCU grace-period handling know that the CPU is back to |
199 | * being RCU-idle. |
200 | * |
201 | * If you add or remove a call to ct_nmi_exit(), be sure to test |
202 | * with CONFIG_RCU_EQS_DEBUG=y. |
203 | */ |
204 | void noinstr ct_nmi_exit(void) |
205 | { |
206 | struct context_tracking *ct = this_cpu_ptr(&context_tracking); |
207 | |
208 | instrumentation_begin(); |
209 | /* |
210 | * Check for ->dynticks_nmi_nesting underflow and bad ->dynticks. |
211 | * (We are exiting an NMI handler, so RCU better be paying attention |
212 | * to us!) |
213 | */ |
214 | WARN_ON_ONCE(ct_dynticks_nmi_nesting() <= 0); |
215 | WARN_ON_ONCE(rcu_dynticks_curr_cpu_in_eqs()); |
216 | |
217 | /* |
218 | * If the nesting level is not 1, the CPU wasn't RCU-idle, so |
219 | * leave it in non-RCU-idle state. |
220 | */ |
221 | if (ct_dynticks_nmi_nesting() != 1) { |
222 | trace_rcu_dyntick(TPS("--=" ), oldnesting: ct_dynticks_nmi_nesting(), newnesting: ct_dynticks_nmi_nesting() - 2, |
223 | dynticks: ct_dynticks()); |
224 | WRITE_ONCE(ct->dynticks_nmi_nesting, /* No store tearing. */ |
225 | ct_dynticks_nmi_nesting() - 2); |
226 | instrumentation_end(); |
227 | return; |
228 | } |
229 | |
230 | /* This NMI interrupted an RCU-idle CPU, restore RCU-idleness. */ |
231 | trace_rcu_dyntick(TPS("Startirq" ), oldnesting: ct_dynticks_nmi_nesting(), newnesting: 0, dynticks: ct_dynticks()); |
232 | WRITE_ONCE(ct->dynticks_nmi_nesting, 0); /* Avoid store tearing. */ |
233 | |
234 | // instrumentation for the noinstr ct_kernel_exit_state() |
	instrument_atomic_write(&ct->state, sizeof(ct->state));
236 | instrumentation_end(); |
237 | |
238 | // RCU is watching here ... |
239 | ct_kernel_exit_state(RCU_DYNTICKS_IDX); |
240 | // ... but is no longer watching here. |
241 | |
242 | if (!in_nmi()) |
243 | rcu_dynticks_task_enter(); |
244 | } |
245 | |
246 | /** |
247 | * ct_nmi_enter - inform RCU of entry to NMI context |
248 | * |
249 | * If the CPU was idle from RCU's viewpoint, update ct->state and |
250 | * ct->dynticks_nmi_nesting to let the RCU grace-period handling know |
251 | * that the CPU is active. This implementation permits nested NMIs, as |
252 | * long as the nesting level does not overflow an int. (You will probably |
253 | * run out of stack space first.) |
254 | * |
255 | * If you add or remove a call to ct_nmi_enter(), be sure to test |
256 | * with CONFIG_RCU_EQS_DEBUG=y. |
257 | */ |
258 | void noinstr ct_nmi_enter(void) |
259 | { |
260 | long incby = 2; |
261 | struct context_tracking *ct = this_cpu_ptr(&context_tracking); |
262 | |
263 | /* Complain about underflow. */ |
264 | WARN_ON_ONCE(ct_dynticks_nmi_nesting() < 0); |
265 | |
266 | /* |
267 | * If idle from RCU viewpoint, atomically increment ->dynticks |
268 | * to mark non-idle and increment ->dynticks_nmi_nesting by one. |
269 | * Otherwise, increment ->dynticks_nmi_nesting by two. This means |
270 | * if ->dynticks_nmi_nesting is equal to one, we are guaranteed |
271 | * to be in the outermost NMI handler that interrupted an RCU-idle |
272 | * period (observation due to Andy Lutomirski). |
273 | */ |
274 | if (rcu_dynticks_curr_cpu_in_eqs()) { |
275 | |
276 | if (!in_nmi()) |
277 | rcu_dynticks_task_exit(); |
278 | |
279 | // RCU is not watching here ... |
280 | ct_kernel_enter_state(RCU_DYNTICKS_IDX); |
281 | // ... but is watching here. |
282 | |
283 | instrumentation_begin(); |
284 | // instrumentation for the noinstr rcu_dynticks_curr_cpu_in_eqs() |
		instrument_atomic_read(&ct->state, sizeof(ct->state));
286 | // instrumentation for the noinstr ct_kernel_enter_state() |
		instrument_atomic_write(&ct->state, sizeof(ct->state));
288 | |
289 | incby = 1; |
290 | } else if (!in_nmi()) { |
291 | instrumentation_begin(); |
292 | rcu_irq_enter_check_tick(); |
293 | } else { |
294 | instrumentation_begin(); |
295 | } |
296 | |
	trace_rcu_dyntick(incby == 1 ? TPS("Endirq") : TPS("++="),
			  ct_dynticks_nmi_nesting(),
			  ct_dynticks_nmi_nesting() + incby, ct_dynticks());
300 | instrumentation_end(); |
301 | WRITE_ONCE(ct->dynticks_nmi_nesting, /* Prevent store tearing. */ |
302 | ct_dynticks_nmi_nesting() + incby); |
303 | barrier(); |
304 | } |
305 | |
306 | /** |
307 | * ct_idle_enter - inform RCU that current CPU is entering idle |
308 | * |
309 | * Enter idle mode, in other words, -leave- the mode in which RCU |
310 | * read-side critical sections can occur. (Though RCU read-side |
311 | * critical sections can occur in irq handlers in idle, a possibility |
312 | * handled by irq_enter() and irq_exit().) |
313 | * |
314 | * If you add or remove a call to ct_idle_enter(), be sure to test with |
315 | * CONFIG_RCU_EQS_DEBUG=y. |
316 | */ |
317 | void noinstr ct_idle_enter(void) |
318 | { |
319 | WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && !raw_irqs_disabled()); |
	ct_kernel_exit(false, RCU_DYNTICKS_IDX + CONTEXT_IDLE);
321 | } |
322 | EXPORT_SYMBOL_GPL(ct_idle_enter); |
323 | |
324 | /** |
325 | * ct_idle_exit - inform RCU that current CPU is leaving idle |
326 | * |
327 | * Exit idle mode, in other words, -enter- the mode in which RCU |
328 | * read-side critical sections can occur. |
329 | * |
330 | * If you add or remove a call to ct_idle_exit(), be sure to test with |
331 | * CONFIG_RCU_EQS_DEBUG=y. |
332 | */ |
333 | void noinstr ct_idle_exit(void) |
334 | { |
335 | unsigned long flags; |
336 | |
337 | raw_local_irq_save(flags); |
	ct_kernel_enter(false, RCU_DYNTICKS_IDX - CONTEXT_IDLE);
339 | raw_local_irq_restore(flags); |
340 | } |
341 | EXPORT_SYMBOL_GPL(ct_idle_exit); |
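
/*
 * Minimal usage sketch (illustrative, assuming an architecture idle loop
 * that halts with a wfi/hlt-style instruction): the low-power wait is
 * expected to be bracketed by these two calls, with interrupts disabled
 * around ct_idle_enter():
 *
 *	raw_local_irq_disable();
 *	ct_idle_enter();	// RCU stops watching this CPU
 *	arch_cpu_idle();	// may re-enable interrupts to take the wakeup
 *	ct_idle_exit();		// RCU is watching again
 *
 * In practice the generic idle loop and cpuidle code issue these calls
 * on behalf of the architecture.
 */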
342 | |
343 | /** |
344 | * ct_irq_enter - inform RCU that current CPU is entering irq away from idle |
345 | * |
346 | * Enter an interrupt handler, which might possibly result in exiting |
347 | * idle mode, in other words, entering the mode in which read-side critical |
348 | * sections can occur. The caller must have disabled interrupts. |
349 | * |
350 | * Note that the Linux kernel is fully capable of entering an interrupt |
351 | * handler that it never exits, for example when doing upcalls to user mode! |
352 | * This code assumes that the idle loop never does upcalls to user mode. |
353 | * If your architecture's idle loop does do upcalls to user mode (or does |
354 | * anything else that results in unbalanced calls to the irq_enter() and |
355 | * irq_exit() functions), RCU will give you what you deserve, good and hard. |
356 | * But very infrequently and irreproducibly. |
357 | * |
358 | * Use things like work queues to work around this limitation. |
359 | * |
360 | * You have been warned. |
361 | * |
362 | * If you add or remove a call to ct_irq_enter(), be sure to test with |
363 | * CONFIG_RCU_EQS_DEBUG=y. |
364 | */ |
365 | noinstr void ct_irq_enter(void) |
366 | { |
367 | lockdep_assert_irqs_disabled(); |
368 | ct_nmi_enter(); |
369 | } |
370 | |
371 | /** |
372 | * ct_irq_exit - inform RCU that current CPU is exiting irq towards idle |
373 | * |
374 | * Exit from an interrupt handler, which might possibly result in entering |
375 | * idle mode, in other words, leaving the mode in which read-side critical |
376 | * sections can occur. The caller must have disabled interrupts. |
377 | * |
378 | * This code assumes that the idle loop never does anything that might |
379 | * result in unbalanced calls to irq_enter() and irq_exit(). If your |
380 | * architecture's idle loop violates this assumption, RCU will give you what |
381 | * you deserve, good and hard. But very infrequently and irreproducibly. |
382 | * |
383 | * Use things like work queues to work around this limitation. |
384 | * |
385 | * You have been warned. |
386 | * |
387 | * If you add or remove a call to ct_irq_exit(), be sure to test with |
388 | * CONFIG_RCU_EQS_DEBUG=y. |
389 | */ |
390 | noinstr void ct_irq_exit(void) |
391 | { |
392 | lockdep_assert_irqs_disabled(); |
393 | ct_nmi_exit(); |
394 | } |
395 | |
396 | /* |
397 | * Wrapper for ct_irq_enter() where interrupts are enabled. |
398 | * |
399 | * If you add or remove a call to ct_irq_enter_irqson(), be sure to test |
400 | * with CONFIG_RCU_EQS_DEBUG=y. |
401 | */ |
402 | void ct_irq_enter_irqson(void) |
403 | { |
404 | unsigned long flags; |
405 | |
406 | local_irq_save(flags); |
407 | ct_irq_enter(); |
408 | local_irq_restore(flags); |
409 | } |
410 | |
411 | /* |
412 | * Wrapper for ct_irq_exit() where interrupts are enabled. |
413 | * |
414 | * If you add or remove a call to ct_irq_exit_irqson(), be sure to test |
415 | * with CONFIG_RCU_EQS_DEBUG=y. |
416 | */ |
417 | void ct_irq_exit_irqson(void) |
418 | { |
419 | unsigned long flags; |
420 | |
421 | local_irq_save(flags); |
422 | ct_irq_exit(); |
423 | local_irq_restore(flags); |
424 | } |
425 | #else |
426 | static __always_inline void ct_kernel_exit(bool user, int offset) { } |
427 | static __always_inline void ct_kernel_enter(bool user, int offset) { } |
428 | #endif /* #ifdef CONFIG_CONTEXT_TRACKING_IDLE */ |
429 | |
430 | #ifdef CONFIG_CONTEXT_TRACKING_USER |
431 | |
432 | #define CREATE_TRACE_POINTS |
433 | #include <trace/events/context_tracking.h> |
434 | |
435 | DEFINE_STATIC_KEY_FALSE(context_tracking_key); |
436 | EXPORT_SYMBOL_GPL(context_tracking_key); |
437 | |
438 | static noinstr bool context_tracking_recursion_enter(void) |
439 | { |
440 | int recursion; |
441 | |
442 | recursion = __this_cpu_inc_return(context_tracking.recursion); |
443 | if (recursion == 1) |
444 | return true; |
445 | |
	WARN_ONCE((recursion < 1), "Invalid context tracking recursion value %d\n", recursion);
447 | __this_cpu_dec(context_tracking.recursion); |
448 | |
449 | return false; |
450 | } |
451 | |
452 | static __always_inline void context_tracking_recursion_exit(void) |
453 | { |
454 | __this_cpu_dec(context_tracking.recursion); |
455 | } |
456 | |
457 | /** |
458 | * __ct_user_enter - Inform the context tracking that the CPU is going |
459 | * to enter user or guest space mode. |
460 | * |
461 | * This function must be called right before we switch from the kernel |
462 | * to user or guest space, when it's guaranteed the remaining kernel |
 * instructions to execute won't use any RCU read-side critical section,
 * because this function sets RCU in an extended quiescent state.
465 | */ |
466 | void noinstr __ct_user_enter(enum ctx_state state) |
467 | { |
468 | struct context_tracking *ct = this_cpu_ptr(&context_tracking); |
469 | lockdep_assert_irqs_disabled(); |
470 | |
471 | /* Kernel threads aren't supposed to go to userspace */ |
472 | WARN_ON_ONCE(!current->mm); |
473 | |
474 | if (!context_tracking_recursion_enter()) |
475 | return; |
476 | |
477 | if (__ct_state() != state) { |
478 | if (ct->active) { |
479 | /* |
480 | * At this stage, only low level arch entry code remains and |
481 | * then we'll run in userspace. We can assume there won't be |
482 | * any RCU read-side critical section until the next call to |
483 | * user_exit() or ct_irq_enter(). Let's remove RCU's dependency |
484 | * on the tick. |
485 | */ |
486 | if (state == CONTEXT_USER) { |
487 | instrumentation_begin(); |
488 | trace_user_enter(0); |
489 | vtime_user_enter(current); |
490 | instrumentation_end(); |
491 | } |
492 | /* |
			 * Unless the architecture uses the generic entry implementation, we may
			 * be past the last rescheduling opportunity in the entry code. Trigger a
			 * self IPI that will fire and reschedule once we resume in user/guest mode.
496 | */ |
497 | rcu_irq_work_resched(); |
498 | |
499 | /* |
500 | * Enter RCU idle mode right before resuming userspace. No use of RCU |
501 | * is permitted between this call and rcu_eqs_exit(). This way the |
502 | * CPU doesn't need to maintain the tick for RCU maintenance purposes |
503 | * when the CPU runs in userspace. |
504 | */ |
505 | ct_kernel_exit(true, RCU_DYNTICKS_IDX + state); |
506 | |
507 | /* |
508 | * Special case if we only track user <-> kernel transitions for tickless |
509 | * cputime accounting but we don't support RCU extended quiescent state. |
			 * In this case we don't care about any concurrency/ordering.
511 | */ |
512 | if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) |
513 | raw_atomic_set(&ct->state, state); |
514 | } else { |
515 | /* |
516 | * Even if context tracking is disabled on this CPU, because it's outside |
517 | * the full dynticks mask for example, we still have to keep track of the |
518 | * context transitions and states to prevent inconsistency on those of |
519 | * other CPUs. |
			 * If a task triggers an exception in userspace, sleeps in the exception
			 * handler and then migrates to another CPU, that new CPU must know where
			 * the exception returns to by the time we call exception_exit().
523 | * This information can only be provided by the previous CPU when it called |
524 | * exception_enter(). |
525 | * OTOH we can spare the calls to vtime and RCU when context_tracking.active |
526 | * is false because we know that CPU is not tickless. |
527 | */ |
528 | if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { |
529 | /* Tracking for vtime only, no concurrent RCU EQS accounting */ |
530 | raw_atomic_set(&ct->state, state); |
531 | } else { |
532 | /* |
533 | * Tracking for vtime and RCU EQS. Make sure we don't race |
534 | * with NMIs. OTOH we don't care about ordering here since |
535 | * RCU only requires RCU_DYNTICKS_IDX increments to be fully |
536 | * ordered. |
537 | */ |
538 | raw_atomic_add(state, &ct->state); |
539 | } |
540 | } |
541 | } |
542 | context_tracking_recursion_exit(); |
543 | } |
544 | EXPORT_SYMBOL_GPL(__ct_user_enter); |
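
/*
 * Usage sketch (illustrative, not a definitive entry-code template): on
 * the return-to-user path, architecture entry code with IRQs disabled is
 * expected to reach this function through the user_enter_irqoff() wrapper:
 *
 *	local_irq_disable();
 *	// ...final rescheduling and signal checks...
 *	user_enter_irqoff();	// ends up in __ct_user_enter(CONTEXT_USER)
 *	// only low-level, non-instrumentable code from here to userspace
 *
 * Guest entry reaches the same function with CONTEXT_GUEST, for instance
 * via context_tracking_guest_enter().
 */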
545 | |
546 | /* |
547 | * OBSOLETE: |
548 | * This function should be noinstr but the below local_irq_restore() is |
549 | * unsafe because it involves illegal RCU uses through tracing and lockdep. |
550 | * This is unlikely to be fixed as this function is obsolete. The preferred |
 * way is to call __ct_user_enter() through user_enter_irqoff()
552 | * or context_tracking_guest_enter(). It should be the arch entry code |
553 | * responsibility to call into context tracking with IRQs disabled. |
554 | */ |
555 | void ct_user_enter(enum ctx_state state) |
556 | { |
557 | unsigned long flags; |
558 | |
559 | /* |
	 * Some contexts may involve an exception occurring in an irq,
561 | * leading to that nesting: |
562 | * ct_irq_enter() rcu_eqs_exit(true) rcu_eqs_enter(true) ct_irq_exit() |
563 | * This would mess up the dyntick_nesting count though. And rcu_irq_*() |
564 | * helpers are enough to protect RCU uses inside the exception. So |
565 | * just return immediately if we detect we are in an IRQ. |
566 | */ |
567 | if (in_interrupt()) |
568 | return; |
569 | |
570 | local_irq_save(flags); |
571 | __ct_user_enter(state); |
572 | local_irq_restore(flags); |
573 | } |
574 | NOKPROBE_SYMBOL(ct_user_enter); |
575 | EXPORT_SYMBOL_GPL(ct_user_enter); |
576 | |
577 | /** |
578 | * user_enter_callable() - Unfortunate ASM callable version of user_enter() for |
579 | * archs that didn't manage to check the context tracking |
580 | * static key from low level code. |
581 | * |
582 | * This OBSOLETE function should be noinstr but it unsafely calls |
583 | * local_irq_restore(), involving illegal RCU uses through tracing and lockdep. |
584 | * This is unlikely to be fixed as this function is obsolete. The preferred |
585 | * way is to call user_enter_irqoff(). It should be the arch entry code |
586 | * responsibility to call into context tracking with IRQs disabled. |
587 | */ |
588 | void user_enter_callable(void) |
589 | { |
590 | user_enter(); |
591 | } |
592 | NOKPROBE_SYMBOL(user_enter_callable); |
593 | |
594 | /** |
595 | * __ct_user_exit - Inform the context tracking that the CPU is |
596 | * exiting user or guest mode and entering the kernel. |
597 | * |
 * This function must be called after we enter the kernel from user or
 * guest space, before any use of an RCU read-side critical section. This
 * potentially includes any high-level kernel code such as syscalls, exceptions,
 * signal handling, etc...
602 | * |
603 | * This call supports re-entrancy. This way it can be called from any exception |
604 | * handler without needing to know if we came from userspace or not. |
605 | */ |
606 | void noinstr __ct_user_exit(enum ctx_state state) |
607 | { |
608 | struct context_tracking *ct = this_cpu_ptr(&context_tracking); |
609 | |
610 | if (!context_tracking_recursion_enter()) |
611 | return; |
612 | |
613 | if (__ct_state() == state) { |
614 | if (ct->active) { |
615 | /* |
616 | * Exit RCU idle mode while entering the kernel because it can |
617 | * run a RCU read side critical section anytime. |
618 | */ |
619 | ct_kernel_enter(true, RCU_DYNTICKS_IDX - state); |
620 | if (state == CONTEXT_USER) { |
621 | instrumentation_begin(); |
622 | vtime_user_exit(current); |
623 | trace_user_exit(0); |
624 | instrumentation_end(); |
625 | } |
626 | |
627 | /* |
628 | * Special case if we only track user <-> kernel transitions for tickless |
629 | * cputime accounting but we don't support RCU extended quiescent state. |
			 * In this case we don't care about any concurrency/ordering.
631 | */ |
632 | if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) |
633 | raw_atomic_set(&ct->state, CONTEXT_KERNEL); |
634 | |
635 | } else { |
636 | if (!IS_ENABLED(CONFIG_CONTEXT_TRACKING_IDLE)) { |
637 | /* Tracking for vtime only, no concurrent RCU EQS accounting */ |
638 | raw_atomic_set(&ct->state, CONTEXT_KERNEL); |
639 | } else { |
640 | /* |
641 | * Tracking for vtime and RCU EQS. Make sure we don't race |
642 | * with NMIs. OTOH we don't care about ordering here since |
643 | * RCU only requires RCU_DYNTICKS_IDX increments to be fully |
644 | * ordered. |
645 | */ |
646 | raw_atomic_sub(state, &ct->state); |
647 | } |
648 | } |
649 | } |
650 | context_tracking_recursion_exit(); |
651 | } |
652 | EXPORT_SYMBOL_GPL(__ct_user_exit); |
653 | |
654 | /* |
655 | * OBSOLETE: |
656 | * This function should be noinstr but the below local_irq_save() is |
657 | * unsafe because it involves illegal RCU uses through tracing and lockdep. |
658 | * This is unlikely to be fixed as this function is obsolete. The preferred |
 * way is to call __ct_user_exit() through user_exit_irqoff()
660 | * or context_tracking_guest_exit(). It should be the arch entry code |
661 | * responsibility to call into context tracking with IRQs disabled. |
662 | */ |
663 | void ct_user_exit(enum ctx_state state) |
664 | { |
665 | unsigned long flags; |
666 | |
667 | if (in_interrupt()) |
668 | return; |
669 | |
670 | local_irq_save(flags); |
671 | __ct_user_exit(state); |
672 | local_irq_restore(flags); |
673 | } |
674 | NOKPROBE_SYMBOL(ct_user_exit); |
675 | EXPORT_SYMBOL_GPL(ct_user_exit); |
676 | |
677 | /** |
678 | * user_exit_callable() - Unfortunate ASM callable version of user_exit() for |
679 | * archs that didn't manage to check the context tracking |
680 | * static key from low level code. |
681 | * |
682 | * This OBSOLETE function should be noinstr but it unsafely calls local_irq_save(), |
683 | * involving illegal RCU uses through tracing and lockdep. This is unlikely |
684 | * to be fixed as this function is obsolete. The preferred way is to call |
685 | * user_exit_irqoff(). It should be the arch entry code responsibility to |
686 | * call into context tracking with IRQs disabled. |
687 | */ |
688 | void user_exit_callable(void) |
689 | { |
690 | user_exit(); |
691 | } |
692 | NOKPROBE_SYMBOL(user_exit_callable); |
693 | |
694 | void __init ct_cpu_track_user(int cpu) |
695 | { |
696 | static __initdata bool initialized = false; |
697 | |
698 | if (!per_cpu(context_tracking.active, cpu)) { |
699 | per_cpu(context_tracking.active, cpu) = true; |
700 | static_branch_inc(&context_tracking_key); |
701 | } |
702 | |
703 | if (initialized) |
704 | return; |
705 | |
706 | #ifdef CONFIG_HAVE_TIF_NOHZ |
707 | /* |
	 * Set TIF_NOHZ to init/0 and let it propagate to all tasks through fork.
709 | * This assumes that init is the only task at this early boot stage. |
710 | */ |
711 | set_tsk_thread_flag(&init_task, TIF_NOHZ); |
712 | #endif |
713 | WARN_ON_ONCE(!tasklist_empty()); |
714 | |
715 | initialized = true; |
716 | } |
717 | |
718 | #ifdef CONFIG_CONTEXT_TRACKING_USER_FORCE |
719 | void __init context_tracking_init(void) |
720 | { |
721 | int cpu; |
722 | |
723 | for_each_possible_cpu(cpu) |
724 | ct_cpu_track_user(cpu); |
725 | } |
726 | #endif |
727 | |
728 | #endif /* #ifdef CONFIG_CONTEXT_TRACKING_USER */ |
729 | |