// SPDX-License-Identifier: GPL-2.0-only
/*
 * sched_clock() for unstable CPU clocks
 *
 * Copyright (C) 2008 Red Hat, Inc., Peter Zijlstra
 *
 * Updates and enhancements:
 *  Copyright (C) 2008 Red Hat, Inc. Steven Rostedt <srostedt@redhat.com>
 *
 * Based on code by:
 *  Ingo Molnar <mingo@redhat.com>
 *  Guillaume Chazarain <guichaz@gmail.com>
 *
 *
 * What this file implements:
 *
 * cpu_clock(i) provides a fast (execution time) high resolution
 * clock with bounded drift between CPUs. The value of cpu_clock(i)
 * is monotonic for constant i. The timestamp returned is in nanoseconds.
 *
 * ######################### BIG FAT WARNING ##########################
 * # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
 * # go backwards !!                                                  #
 * ####################################################################
 *
 * There is no strict promise about the base, although it tends to start
 * at 0 on boot (but people really shouldn't rely on that).
 *
 * cpu_clock(i)       -- can be used from any context, including NMI.
 * local_clock()      -- is cpu_clock() on the current CPU.
 *
 * sched_clock_cpu(i)
 *
 * How it is implemented:
 *
 * The implementation either uses sched_clock() when
 * !CONFIG_HAVE_UNSTABLE_SCHED_CLOCK, in which case sched_clock() itself is
 * assumed to provide these properties (mostly it means the architecture
 * provides a globally synchronized highres time source).
 *
 * Otherwise it tries to create a semi-stable clock from a mixture of other
 * clocks, including:
 *
 *  - GTOD (clock monotonic)
 *  - sched_clock()
 *  - explicit idle events
 *
 * We use GTOD as the base and use sched_clock() deltas to improve resolution.
 * The deltas are filtered to provide monotonicity and to keep the result
 * within an expected window.
 *
 * Furthermore, explicit sleep and wakeup hooks allow us to account for time
 * that is otherwise invisible (TSC gets stopped).
 *
 */
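
/*
 * Illustrative sketch, not used by this file: how a caller might use
 * local_clock() to time a section of code. Timestamps taken on the same CPU
 * may be subtracted; mixing cpu_clock(i) and cpu_clock(j) for i != j may go
 * backwards, per the warning above. The function name and message below are
 * hypothetical, and we assume the section stays on one CPU (e.g. with
 * preemption disabled).
 */
static __maybe_unused void example_time_section(void)
{
	u64 t0, t1;

	t0 = local_clock();
	/* ... the work being measured ... */
	t1 = local_clock();

	pr_info("section took %llu ns\n", t1 - t0);
}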

/*
 * Scheduler clock - returns the current time in nanoseconds.
 * This is the default implementation.
 * Architectures and sub-architectures can override it.
 */
notrace unsigned long long __weak sched_clock(void)
{
	return (unsigned long long)(jiffies - INITIAL_JIFFIES)
					* (NSEC_PER_SEC / HZ);
}
EXPORT_SYMBOL_GPL(sched_clock);
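
/*
 * Worked example for the fallback above (illustrative numbers): with HZ=1000,
 * NSEC_PER_SEC / HZ = 1,000,000, so the clock advances in 1 ms steps and 250
 * elapsed jiffies read back as 250,000,000 ns. Resolution is limited to one
 * tick unless the architecture overrides sched_clock().
 */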

static DEFINE_STATIC_KEY_FALSE(sched_clock_running);

#ifdef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
/*
 * We must start with !__sched_clock_stable because the unstable -> stable
 * transition is accurate, while the stable -> unstable transition is not.
 *
 * Similarly we start with __sched_clock_stable_early, thereby assuming we
 * will become stable, such that there's only a single 1 -> 0 transition.
 */
static DEFINE_STATIC_KEY_FALSE(__sched_clock_stable);
static int __sched_clock_stable_early = 1;

/*
 * We want: ktime_get_ns() + __gtod_offset == sched_clock() + __sched_clock_offset
 */
__read_mostly u64 __sched_clock_offset;
static __read_mostly u64 __gtod_offset;
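
/*
 * Rearranging the invariant above gives the two updates used later in this
 * file (written out here as a reference sketch):
 *
 *   __sched_clock_offset = (ktime_get_ns() + __gtod_offset) - sched_clock()
 *   __gtod_offset        = (sched_clock() + __sched_clock_offset) - ktime_get_ns()
 *
 * which is what __set_sched_clock_stable() and __sched_clock_gtod_offset()
 * compute from a single snapshot of both clocks (scd->tick_gtod, scd->tick_raw).
 */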

struct sched_clock_data {
	u64			tick_raw;
	u64			tick_gtod;
	u64			clock;
};

static DEFINE_PER_CPU_SHARED_ALIGNED(struct sched_clock_data, sched_clock_data);

static __always_inline struct sched_clock_data *this_scd(void)
{
	return this_cpu_ptr(&sched_clock_data);
}

notrace static inline struct sched_clock_data *cpu_sdc(int cpu)
{
	return &per_cpu(sched_clock_data, cpu);
}

notrace int sched_clock_stable(void)
{
	return static_branch_likely(&__sched_clock_stable);
}

notrace static void __scd_stamp(struct sched_clock_data *scd)
{
	scd->tick_gtod = ktime_get_ns();
	scd->tick_raw = sched_clock();
}

notrace static void __set_sched_clock_stable(void)
{
	struct sched_clock_data *scd;

	/*
	 * Since we're still unstable and the tick is already running, we have
	 * to disable IRQs in order to get a consistent scd->tick* reading.
	 */
	local_irq_disable();
	scd = this_scd();
	/*
	 * Attempt to make the (initial) unstable->stable transition continuous.
	 */
	__sched_clock_offset = (scd->tick_gtod + __gtod_offset) - (scd->tick_raw);
	local_irq_enable();

	printk(KERN_INFO "sched_clock: Marking stable (%lld, %lld)->(%lld, %lld)\n",
			scd->tick_gtod, __gtod_offset,
			scd->tick_raw,  __sched_clock_offset);

	static_branch_enable(&__sched_clock_stable);
	tick_dep_clear(TICK_DEP_BIT_CLOCK_UNSTABLE);
}

/*
 * If we ever get here, we're screwed, because we found out -- typically after
 * the fact -- that TSC wasn't good. This means all our clocksources (including
 * ktime) could have reported wrong values.
 *
 * What we do here is an attempt to fix up and continue sort of where we left
 * off in a coherent manner.
 *
 * The only way to fully avoid random clock jumps is to boot with:
 * "tsc=unstable".
 */
notrace static void __sched_clock_work(struct work_struct *work)
{
	struct sched_clock_data *scd;
	int cpu;

	/* take a current timestamp and set 'now' */
	preempt_disable();
	scd = this_scd();
	__scd_stamp(scd);
	scd->clock = scd->tick_gtod + __gtod_offset;
	preempt_enable();

	/* clone to all CPUs */
	for_each_possible_cpu(cpu)
		per_cpu(sched_clock_data, cpu) = *scd;

	printk(KERN_WARNING "TSC found unstable after boot, most likely due to broken BIOS. Use 'tsc=unstable'.\n");
	printk(KERN_INFO "sched_clock: Marking unstable (%lld, %lld)<-(%lld, %lld)\n",
			scd->tick_gtod, __gtod_offset,
			scd->tick_raw,  __sched_clock_offset);

	static_branch_disable(&__sched_clock_stable);
}

static DECLARE_WORK(sched_clock_work, __sched_clock_work);

notrace static void __clear_sched_clock_stable(void)
{
	if (!sched_clock_stable())
		return;

	tick_dep_set(TICK_DEP_BIT_CLOCK_UNSTABLE);
	schedule_work(&sched_clock_work);
}

notrace void clear_sched_clock_stable(void)
{
	__sched_clock_stable_early = 0;

	smp_mb(); /* matches sched_clock_init_late() */

	if (static_key_count(&sched_clock_running.key) == 2)
		__clear_sched_clock_stable();
}

notrace static void __sched_clock_gtod_offset(void)
{
	struct sched_clock_data *scd = this_scd();

	__scd_stamp(scd);
	__gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod;
}

void __init sched_clock_init(void)
{
	/*
	 * Set __gtod_offset such that once we mark sched_clock_running,
	 * sched_clock_tick() continues where sched_clock() left off.
	 *
	 * Even if TSC is buggered, we're still UP at this point so it
	 * can't really be out of sync.
	 */
	local_irq_disable();
	__sched_clock_gtod_offset();
	local_irq_enable();

	static_branch_inc(&sched_clock_running);
}
/*
 * We run this as late_initcall() such that it runs after all built-in drivers,
 * notably: acpi_processor and intel_idle, which can mark the TSC as unstable.
 */
static int __init sched_clock_init_late(void)
{
	static_branch_inc(&sched_clock_running);
	/*
	 * Ensure that it is impossible to not do a static_key update.
	 *
	 * Either {set,clear}_sched_clock_stable() must see sched_clock_running
	 * and do the update, or we must see their __sched_clock_stable_early
	 * and do the update, or both.
	 */
	smp_mb(); /* matches {set,clear}_sched_clock_stable() */

	if (__sched_clock_stable_early)
		__set_sched_clock_stable();

	return 0;
}
late_initcall(sched_clock_init_late);
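
/*
 * Illustrative interleaving for the barrier pairing above (commentary, not
 * code): clear_sched_clock_stable() stores __sched_clock_stable_early = 0
 * before its smp_mb() and then checks the static key count;
 * sched_clock_init_late() bumps sched_clock_running before its smp_mb() and
 * then checks __sched_clock_stable_early. Whatever the ordering, at least one
 * side observes the other's store: either the late initcall sees the cleared
 * flag and skips __set_sched_clock_stable(), or the clearer sees count == 2
 * and queues the work to mark the clock unstable.
 */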

/*
 * min, max except they take wrapping into account
 */

static __always_inline u64 wrap_min(u64 x, u64 y)
{
	return (s64)(x - y) < 0 ? x : y;
}

static __always_inline u64 wrap_max(u64 x, u64 y)
{
	return (s64)(x - y) > 0 ? x : y;
}
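
/*
 * Illustrative example, not called from anywhere: near a 64-bit wrap a plain
 * min()/max() would pick the wrong side. With y just below the wrap point and
 * x just past it, (s64)(x - y) is a small positive number, so wrap_max()
 * correctly treats x as the later timestamp. The function name below is
 * hypothetical.
 */
static __maybe_unused void example_wrap_compare(void)
{
	u64 y = U64_MAX - 5;	/* 6 ns before the counter wraps */
	u64 x = 10;		/* 16 ns later, just after the wrap */

	WARN_ON(wrap_max(x, y) != x);	/* x is "later" although x < y */
	WARN_ON(wrap_min(x, y) != y);	/* y is "earlier" */
}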

/*
 * update the percpu scd from the raw @now value
 *
 *  - filter out backward motion
 *  - use the GTOD tick value to create a window to filter crazy TSC values
 */
static __always_inline u64 sched_clock_local(struct sched_clock_data *scd)
{
	u64 now, clock, old_clock, min_clock, max_clock, gtod;
	s64 delta;

again:
	now = sched_clock_noinstr();
	delta = now - scd->tick_raw;
	if (unlikely(delta < 0))
		delta = 0;

	old_clock = scd->clock;

	/*
	 * scd->clock = clamp(scd->tick_gtod + delta,
	 *		      max(scd->tick_gtod, scd->clock),
	 *		      scd->tick_gtod + TICK_NSEC);
	 */

	gtod = scd->tick_gtod + __gtod_offset;
	clock = gtod + delta;
	min_clock = wrap_max(gtod, old_clock);
	max_clock = wrap_max(old_clock, gtod + TICK_NSEC);

	clock = wrap_max(clock, min_clock);
	clock = wrap_min(clock, max_clock);

	if (!raw_try_cmpxchg64(&scd->clock, &old_clock, clock))
		goto again;

	return clock;
}
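
/*
 * Worked example for the clamp above (made-up numbers): say gtod = 1,000,000,
 * old_clock = 1,000,300, TICK_NSEC = 1,000,000 and the raw delta claims
 * 5,000,000. Then clock = 6,000,000 is pulled down to max_clock = 2,000,000;
 * had the delta been zero, clock = 1,000,000 would be lifted to
 * min_clock = 1,000,300. Either way the per-CPU clock never goes backwards
 * and never runs more than about one tick ahead of the GTOD base.
 */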

noinstr u64 local_clock_noinstr(void)
{
	u64 clock;

	if (static_branch_likely(&__sched_clock_stable))
		return sched_clock_noinstr() + __sched_clock_offset;

	if (!static_branch_likely(&sched_clock_running))
		return sched_clock_noinstr();

	clock = sched_clock_local(this_scd());

	return clock;
}

u64 local_clock(void)
{
	u64 now;

	preempt_disable_notrace();
	now = local_clock_noinstr();
	preempt_enable_notrace();

	return now;
}
EXPORT_SYMBOL_GPL(local_clock);

notrace static u64 sched_clock_remote(struct sched_clock_data *scd)
{
	struct sched_clock_data *my_scd = this_scd();
	u64 this_clock, remote_clock;
	u64 *ptr, old_val, val;

#if BITS_PER_LONG != 64
again:
	/*
	 * Careful here: The local and the remote clock values need to
	 * be read out atomically as we need to compare the values and
	 * then update either the local or the remote side. So the
	 * cmpxchg64 below only protects one readout.
	 *
	 * We must reread via sched_clock_local() in the retry case on
	 * 32-bit kernels as an NMI could use sched_clock_local() via the
	 * tracer and hit between the readout of
	 * the low 32-bit and the high 32-bit portion.
	 */
	this_clock = sched_clock_local(my_scd);
	/*
	 * We must enforce atomic readout on 32-bit, otherwise the
	 * update on the remote CPU can hit in between the readout of
	 * the low 32-bit and the high 32-bit portion.
	 */
	remote_clock = cmpxchg64(&scd->clock, 0, 0);
#else
	/*
	 * On 64-bit kernels the read of [my]scd->clock is atomic versus the
	 * update, so we can avoid the above 32-bit dance.
	 */
	sched_clock_local(my_scd);
again:
	this_clock = my_scd->clock;
	remote_clock = scd->clock;
#endif

	/*
	 * Use the opportunity that we have both locks
	 * taken to couple the two clocks: we take the
	 * larger time as the latest time for both
	 * runqueues. (this creates monotonic movement)
	 */
	if (likely((s64)(remote_clock - this_clock) < 0)) {
		ptr = &scd->clock;
		old_val = remote_clock;
		val = this_clock;
	} else {
		/*
		 * Should be rare, but possible:
		 */
		ptr = &my_scd->clock;
		old_val = this_clock;
		val = remote_clock;
	}

	if (!try_cmpxchg64(ptr, &old_val, val))
		goto again;

	return val;
}
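
/*
 * Worked example for the coupling above (illustrative): if this CPU reads
 * this_clock = 1000 and the remote scd holds remote_clock = 900, the remote
 * value is cmpxchg()'d up to 1000 and 1000 is returned. In the rarer opposite
 * case (remote_clock = 1100), the local scd is bumped to 1100 instead. Either
 * way the returned value never lags behind what either CPU has already handed
 * out, which is the "monotonic movement" the comment above refers to.
 */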

/*
 * Similar to cpu_clock(), but requires local IRQs to be disabled.
 *
 * See cpu_clock().
 */
notrace u64 sched_clock_cpu(int cpu)
{
	struct sched_clock_data *scd;
	u64 clock;

	if (sched_clock_stable())
		return sched_clock() + __sched_clock_offset;

	if (!static_branch_likely(&sched_clock_running))
		return sched_clock();

	preempt_disable_notrace();
	scd = cpu_sdc(cpu);

	if (cpu != smp_processor_id())
		clock = sched_clock_remote(scd);
	else
		clock = sched_clock_local(scd);
	preempt_enable_notrace();

	return clock;
}
EXPORT_SYMBOL_GPL(sched_clock_cpu);
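
/*
 * Illustrative usage (hypothetical helper, not kernel API): stamping an event
 * with the clock of the CPU it belongs to. Comparing two such stamps is only
 * meaningful when both were taken for the same @cpu.
 */
static __maybe_unused u64 example_stamp_for_cpu(int cpu)
{
	/* Handles both the local and the remote (coupled) case. */
	return sched_clock_cpu(cpu);
}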

notrace void sched_clock_tick(void)
{
	struct sched_clock_data *scd;

	if (sched_clock_stable())
		return;

	if (!static_branch_likely(&sched_clock_running))
		return;

	lockdep_assert_irqs_disabled();

	scd = this_scd();
	__scd_stamp(scd);
	sched_clock_local(scd);
}

notrace void sched_clock_tick_stable(void)
{
	if (!sched_clock_stable())
		return;

	/*
	 * Called under watchdog_lock.
	 *
	 * The watchdog just found this TSC to (still) be stable, so now is a
	 * good moment to update our __gtod_offset. Because once we find the
	 * TSC to be unstable, any computation will be computing crap.
	 */
	local_irq_disable();
	__sched_clock_gtod_offset();
	local_irq_enable();
}

/*
 * We are going deep-idle (irqs are disabled):
 */
notrace void sched_clock_idle_sleep_event(void)
{
	sched_clock_cpu(smp_processor_id());
}
EXPORT_SYMBOL_GPL(sched_clock_idle_sleep_event);

/*
 * We just idled; resync with ktime.
 */
notrace void sched_clock_idle_wakeup_event(void)
{
	unsigned long flags;

	if (sched_clock_stable())
		return;

	if (unlikely(timekeeping_suspended))
		return;

	local_irq_save(flags);
	sched_clock_tick();
	local_irq_restore(flags);
}
EXPORT_SYMBOL_GPL(sched_clock_idle_wakeup_event);
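
/*
 * Illustrative sketch of the intended pairing (real callers live in the
 * timekeeping/cpuidle code; the function below is hypothetical and not wired
 * into anything):
 */
static __maybe_unused void example_deep_idle(void)
{
	/* IRQs are off; the raw clock (e.g. TSC) may stop from here on. */
	sched_clock_idle_sleep_event();

	/* ... enter and leave the low-power state ... */

	/* Back up: resync the per-CPU clock with ktime. */
	sched_clock_idle_wakeup_event();
}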

#else /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */

void __init sched_clock_init(void)
{
	static_branch_inc(&sched_clock_running);
	local_irq_disable();
	generic_sched_clock_init();
	local_irq_enable();
}

notrace u64 sched_clock_cpu(int cpu)
{
	if (!static_branch_likely(&sched_clock_running))
		return 0;

	return sched_clock();
}

#endif /* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK */

/*
 * Running clock - returns the time that has elapsed while a guest has been
 * running.
 * On a guest this value should be local_clock minus the time the guest was
 * suspended by the hypervisor (for any reason).
 * On bare metal this function should return the same as local_clock.
 * Architectures and sub-architectures can override this.
 */
notrace u64 __weak running_clock(void)
{
	return local_clock();
}
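
/*
 * Illustrative override sketch (entirely hypothetical): a paravirtualized
 * guest could provide a strong definition of running_clock() that removes
 * time during which the hypervisor kept the guest suspended. The helper
 * hyp_suspended_ns() does not exist; it stands in for whatever interface the
 * hypervisor exposes for that accounting.
 *
 *	notrace u64 running_clock(void)
 *	{
 *		return local_clock() - hyp_suspended_ns();
 *	}
 */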