// SPDX-License-Identifier: GPL-2.0
/*
 * check TSC synchronization.
 *
 * Copyright (C) 2006, Red Hat, Inc., Ingo Molnar
 *
 * We check whether all boot CPUs have their TSCs synchronized,
 * print a warning if not and turn off the TSC clock-source.
 *
 * The warp-check is point-to-point between two CPUs: the CPU
 * initiating the bootup is the 'source CPU', the freshly booting
 * CPU is the 'target CPU'.
 *
 * Only two CPUs may participate - they can enter in any order.
 * ( The serial nature of the boot logic and the CPU hotplug lock
 *   protects against more than 2 CPUs entering this code. )
 */
#include <linux/workqueue.h>
#include <linux/topology.h>
#include <linux/spinlock.h>
#include <linux/kernel.h>
#include <linux/smp.h>
#include <linux/nmi.h>
#include <asm/tsc.h>

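/**
 * struct tsc_adjust - per-CPU bookkeeping for the TSC_ADJUST MSR
 * @bootval:	TSC_ADJUST value read out when this CPU came up
 * @adjusted:	value the MSR is expected to hold, restored on mismatch
 * @nextcheck:	jiffies stamp used to rate limit the periodic MSR check
 * @warned:	limits the mismatch warning to once per CPU (except on resume)
 */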
struct tsc_adjust {
	s64		bootval;
	s64		adjusted;
	unsigned long	nextcheck;
	bool		warned;
};

static DEFINE_PER_CPU(struct tsc_adjust, tsc_adjust);
static struct timer_list tsc_sync_check_timer;

/*
 * TSCs on different sockets may be reset asynchronously.
 * This may cause the TSC ADJUST value on socket 0 to be NOT 0.
 */
bool __read_mostly tsc_async_resets;

void mark_tsc_async_resets(char *reason)
{
	if (tsc_async_resets)
		return;
	tsc_async_resets = true;
	pr_info("tsc: Marking TSC async resets true due to %s\n", reason);
}

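/*
 * Note: per the Intel SDM, RDTSC returns the internal counter plus
 * IA32_TSC_ADJUST, and a write which changes TSC_ADJUST by some delta
 * moves the visible TSC by the same delta. Restoring the expected
 * value below therefore snaps a modified TSC back into place without
 * touching the underlying counter.
 */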
void tsc_verify_tsc_adjust(bool resume)
{
	struct tsc_adjust *adj = this_cpu_ptr(&tsc_adjust);
	s64 curval;

	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
		return;

	/* Skip unnecessary error messages if TSC already unstable */
	if (check_tsc_unstable())
		return;

	/* Rate limit the MSR check */
	if (!resume && time_before(jiffies, adj->nextcheck))
		return;

	adj->nextcheck = jiffies + HZ;

	rdmsrl(MSR_IA32_TSC_ADJUST, curval);
	if (adj->adjusted == curval)
		return;

	/* Restore the original value */
	wrmsrl(MSR_IA32_TSC_ADJUST, adj->adjusted);

	if (!adj->warned || resume) {
		pr_warn(FW_BUG "TSC ADJUST differs: CPU%u %lld --> %lld. Restoring\n",
			smp_processor_id(), adj->adjusted, curval);
		adj->warned = true;
	}
}

/*
 * Normally tsc_sync will be checked every time the system enters idle,
 * but there is still a caveat: a system may never enter idle, either
 * because it is too busy or deliberately configured not to.
 *
 * So set up a periodic timer (every 10 minutes) to make sure the check
 * is always on.
 */

#define SYNC_CHECK_INTERVAL		(HZ * 600)

static void tsc_sync_check_timer_fn(struct timer_list *unused)
{
	int next_cpu;

	tsc_verify_tsc_adjust(false);

	/* Run the check for all onlined CPUs in turn */
	next_cpu = cpumask_next(raw_smp_processor_id(), cpu_online_mask);
	if (next_cpu >= nr_cpu_ids)
		next_cpu = cpumask_first(cpu_online_mask);

	tsc_sync_check_timer.expires += SYNC_CHECK_INTERVAL;
	add_timer_on(&tsc_sync_check_timer, next_cpu);
}

static int __init start_sync_check_timer(void)
{
	if (!cpu_feature_enabled(X86_FEATURE_TSC_ADJUST) || tsc_clocksource_reliable)
		return 0;

	timer_setup(&tsc_sync_check_timer, tsc_sync_check_timer_fn, 0);
	tsc_sync_check_timer.expires = jiffies + SYNC_CHECK_INTERVAL;
	add_timer(&tsc_sync_check_timer);

	return 0;
}
late_initcall(start_sync_check_timer);

static void tsc_sanitize_first_cpu(struct tsc_adjust *cur, s64 bootval,
				   unsigned int cpu, bool bootcpu)
{
	/*
	 * First online CPU in a package stores the boot value in the
	 * adjustment value. This value might change later via the sync
	 * mechanism. If that fails we still can yell about boot values not
	 * being consistent.
	 *
	 * On the boot CPU we just force set the ADJUST value to 0 if it's
	 * non-zero. We don't do that on non boot CPUs because physical
	 * hotplug should have set the ADJUST register to a value > 0 so
	 * the TSC is in sync with the already running CPUs.
	 *
	 * Also don't force the ADJUST value to zero if that is a valid value
	 * for socket 0 as determined by the system arch. This is required
	 * when multiple sockets are reset asynchronously with each other
	 * and socket 0 may not have a TSC ADJUST value of 0.
	 */
	if (bootcpu && bootval != 0) {
		if (likely(!tsc_async_resets)) {
			pr_warn(FW_BUG "TSC ADJUST: CPU%u: %lld force to 0\n",
				cpu, bootval);
			wrmsrl(MSR_IA32_TSC_ADJUST, 0);
			bootval = 0;
		} else {
			pr_info("TSC ADJUST: CPU%u: %lld NOT forced to 0\n",
				cpu, bootval);
		}
	}
	cur->adjusted = bootval;
}

#ifndef CONFIG_SMP
bool __init tsc_store_and_check_tsc_adjust(bool bootcpu)
{
	struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
	s64 bootval;

	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
		return false;

	/* Skip unnecessary error messages if TSC already unstable */
	if (check_tsc_unstable())
		return false;

	rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
	cur->bootval = bootval;
	cur->nextcheck = jiffies + HZ;
	tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(), bootcpu);
	return false;
}

#else /* !CONFIG_SMP */

/*
 * Store and check the TSC ADJUST MSR if available
 */
bool tsc_store_and_check_tsc_adjust(bool bootcpu)
{
	struct tsc_adjust *ref, *cur = this_cpu_ptr(&tsc_adjust);
	unsigned int refcpu, cpu = smp_processor_id();
	struct cpumask *mask;
	s64 bootval;

	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
		return false;

	rdmsrl(MSR_IA32_TSC_ADJUST, bootval);
	cur->bootval = bootval;
	cur->nextcheck = jiffies + HZ;
	cur->warned = false;

	/*
	 * If a non-zero TSC value for socket 0 may be valid then the default
	 * adjusted value cannot be assumed to be zero either.
	 */
	if (tsc_async_resets)
		cur->adjusted = bootval;

	/*
	 * Check whether this CPU is the first in a package to come up. In
	 * this case do not check the boot value against another package
	 * because the new package might have been physically hotplugged,
	 * where TSC_ADJUST is expected to be different. When called on the
	 * boot CPU topology_core_cpumask() might not be available yet.
	 */
	mask = topology_core_cpumask(cpu);
	refcpu = mask ? cpumask_any_but(mask, cpu) : nr_cpu_ids;

	if (refcpu >= nr_cpu_ids) {
		tsc_sanitize_first_cpu(cur, bootval, smp_processor_id(),
				       bootcpu);
		return false;
	}

	ref = per_cpu_ptr(&tsc_adjust, refcpu);
	/*
	 * Compare the boot value and complain if it differs in the
	 * package.
	 */
	if (bootval != ref->bootval)
		printk_once(FW_BUG "TSC ADJUST differs within socket(s), fixing all errors\n");

	/*
	 * The TSC_ADJUST values in a package must be the same. If the boot
	 * value on this newly upcoming CPU differs from the adjustment
	 * value of the already online CPU in this package, set it to that
	 * adjusted value.
	 */
	if (bootval != ref->adjusted) {
		cur->adjusted = ref->adjusted;
		wrmsrl(MSR_IA32_TSC_ADJUST, ref->adjusted);
	}
	/*
	 * We have the TSCs forced to be in sync on this package. Skip sync
	 * test:
	 */
	return true;
}

/*
 * Entry/exit counters that make sure that both CPUs
 * run the measurement code at once:
 */
static atomic_t start_count;
static atomic_t stop_count;
static atomic_t test_runs;
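/*
 * Handshake, per test run: the target bumps start_count and waits; the
 * source waits for that, then bumps start_count to two, which releases
 * both CPUs into check_tsc_warp(). Both then meet again on stop_count.
 * The source evaluates the result and resets the shared state before
 * releasing the target for a possible retry.
 */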

/*
 * We use a raw spinlock in this exceptional case, because
 * we want to have the fastest, inlined, non-debug version
 * of a critical section, to be able to prove TSC time-warps:
 */
static arch_spinlock_t sync_lock = __ARCH_SPIN_LOCK_UNLOCKED;

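/*
 * Shared warp state, protected by sync_lock: last_tsc is the most
 * recent timestamp taken by either CPU, max_warp the largest backwards
 * jump observed so far, nr_warps the total number of warps seen, and
 * random_warps counts warps that were not all observed by the same CPU.
 */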
static cycles_t last_tsc;
static cycles_t max_warp;
static int nr_warps;
static int random_warps;

/*
 * TSC-warp measurement loop running on both CPUs. This is not called
 * if there is no TSC.
 */
static cycles_t check_tsc_warp(unsigned int timeout)
{
	cycles_t start, now, prev, end, cur_max_warp = 0;
	int i, cur_warps = 0;

	start = rdtsc_ordered();
	/*
	 * The measurement runs for 'timeout' msecs:
	 */
	end = start + (cycles_t) tsc_khz * timeout;

	for (i = 0; ; i++) {
		/*
		 * We take the global lock, measure TSC, save the
		 * previous TSC that was measured (possibly on
		 * another CPU) and update the previous TSC timestamp.
		 */
		arch_spin_lock(&sync_lock);
		prev = last_tsc;
		now = rdtsc_ordered();
		last_tsc = now;
		arch_spin_unlock(&sync_lock);

		/*
		 * Be nice every now and then (and also check whether
		 * measurement is done [we also insert a 10 million
		 * loops safety exit, so we don't lock up in case the
		 * TSC readout is totally broken]):
		 */
		if (unlikely(!(i & 7))) {
			if (now > end || i > 10000000)
				break;
			cpu_relax();
			touch_nmi_watchdog();
		}
		/*
		 * Outside the critical section we can now see whether
		 * we saw a time-warp of the TSC going backwards:
		 */
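		/*
		 * Example: if the other CPU stored last_tsc = 1000 and
		 * our locked read above returned now = 990, then
		 * prev (1000) > now (990) and we record a warp of 10
		 * cycles below.
		 */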
		if (unlikely(prev > now)) {
			arch_spin_lock(&sync_lock);
			max_warp = max(max_warp, prev - now);
			cur_max_warp = max_warp;
			/*
			 * Check whether this bounces back and forth. Only
			 * one CPU should observe time going backwards.
			 */
			if (cur_warps != nr_warps)
				random_warps++;
			nr_warps++;
			cur_warps = nr_warps;
			arch_spin_unlock(&sync_lock);
		}
	}
	WARN(!(now-start),
	     "Warning: zero tsc calibration delta: %Ld [max: %Ld]\n",
	     now-start, end-start);
	return cur_max_warp;
}

/*
 * If the target CPU coming online doesn't have any of its core-siblings
 * online, a timeout of 20 msec will be used for the TSC-warp measurement
 * loop. Otherwise a smaller timeout of 2 msec will be used, as we have some
 * information about this socket already (and this information grows as we
 * have more and more logical-siblings in that socket).
 *
 * Ideally we should be able to skip the TSC sync check on the other
 * core-siblings, if the first logical CPU in a socket passed the sync test.
 * But as the TSC is per-logical CPU and can potentially be modified wrongly
 * by the BIOS, even a sync test of shorter duration should be able to catch
 * such errors. Also this will catch the condition where all the cores in
 * the socket don't get reset at the same time.
 */
static inline unsigned int loop_timeout(int cpu)
{
	return (cpumask_weight(topology_core_cpumask(cpu)) > 1) ? 2 : 20;
}
345 | |
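/*
 * Marking the TSC unstable is deferred to a workqueue: the source side
 * of the check runs in an async SMP function call, i.e. hardirq
 * context, from which mark_tsc_unstable() cannot be called directly.
 */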
static void tsc_sync_mark_tsc_unstable(struct work_struct *work)
{
	mark_tsc_unstable("check_tsc_sync_source failed");
}

static DECLARE_WORK(tsc_sync_work, tsc_sync_mark_tsc_unstable);

/*
 * The freshly booted CPU initiates this via an async SMP function call.
 */
static void check_tsc_sync_source(void *__cpu)
{
	unsigned int cpu = (unsigned long)__cpu;
	int cpus = 2;

	/*
	 * Set the maximum number of test runs to
	 *  1 if the CPU does not provide the TSC_ADJUST MSR
	 *  3 if the MSR is available, so the target can try to adjust
	 */
	if (!boot_cpu_has(X86_FEATURE_TSC_ADJUST))
		atomic_set(&test_runs, 1);
	else
		atomic_set(&test_runs, 3);
retry:
	/* Wait for the target to start. */
	while (atomic_read(&start_count) != cpus - 1)
		cpu_relax();

	/*
	 * Trigger the target to continue into the measurement too:
	 */
	atomic_inc(&start_count);

	check_tsc_warp(loop_timeout(cpu));

	while (atomic_read(&stop_count) != cpus-1)
		cpu_relax();

	/*
	 * If the test was successful set the number of runs to zero and
	 * stop. If not, decrement the number of runs and check if we can
	 * retry. In case of random warps no retry is attempted.
	 */
	if (!nr_warps) {
		atomic_set(&test_runs, 0);

		pr_debug("TSC synchronization [CPU#%d -> CPU#%u]: passed\n",
			 smp_processor_id(), cpu);

	} else if (atomic_dec_and_test(&test_runs) || random_warps) {
		/* Force it to 0 if random warps brought us here */
		atomic_set(&test_runs, 0);

		pr_warn("TSC synchronization [CPU#%d -> CPU#%u]:\n",
			smp_processor_id(), cpu);
		pr_warn("Measured %Ld cycles TSC warp between CPUs, "
			"turning off TSC clock.\n", max_warp);
		if (random_warps)
			pr_warn("TSC warped randomly between CPUs\n");
		schedule_work(&tsc_sync_work);
	}

	/*
	 * Reset it - just in case we boot another CPU later:
	 */
	atomic_set(&start_count, 0);
	random_warps = 0;
	nr_warps = 0;
	max_warp = 0;
	last_tsc = 0;

	/*
	 * Let the target continue with the bootup:
	 */
	atomic_inc(&stop_count);

	/*
	 * Retry, if there is a chance to do so.
	 */
	if (atomic_read(&test_runs) > 0)
		goto retry;
}

/*
 * Freshly booted CPUs call into this:
 */
void check_tsc_sync_target(void)
{
	struct tsc_adjust *cur = this_cpu_ptr(&tsc_adjust);
	unsigned int cpu = smp_processor_id();
	cycles_t cur_max_warp, gbl_max_warp;
	int cpus = 2;

	/* Also aborts if there is no TSC. */
	if (unsynchronized_tsc())
		return;

	/*
	 * Store, verify and sanitize the TSC adjust register. If
	 * successful skip the test.
	 *
	 * The test is also skipped when the TSC is marked reliable. This
	 * is true for SoCs which have no fallback clocksource. On these
	 * SoCs the TSC is frequency synchronized, but still the TSC ADJUST
	 * register might have been wrecked by the BIOS.
	 */
	if (tsc_store_and_check_tsc_adjust(false) || tsc_clocksource_reliable)
		return;

	/* Kick the control CPU into the TSC synchronization function */
	smp_call_function_single(cpumask_first(cpu_online_mask), check_tsc_sync_source,
				 (unsigned long *)(unsigned long)cpu, 0);
retry:
	/*
	 * Register this CPU's participation and wait for the
	 * source CPU to start the measurement:
	 */
	atomic_inc(&start_count);
	while (atomic_read(&start_count) != cpus)
		cpu_relax();

	cur_max_warp = check_tsc_warp(loop_timeout(cpu));

	/*
	 * Store the maximum observed warp value for a potential retry:
	 */
	gbl_max_warp = max_warp;

	/*
	 * Ok, we are done:
	 */
	atomic_inc(&stop_count);

	/*
	 * Wait for the source CPU to print stuff:
	 */
	while (atomic_read(&stop_count) != cpus)
		cpu_relax();

	/*
	 * Reset it for the next sync test:
	 */
	atomic_set(&stop_count, 0);

	/*
	 * Check the number of remaining test runs. If not zero, the test
	 * failed and a retry with adjusted TSC is possible. If zero the
	 * test was either successful or failed terminally.
	 */
	if (!atomic_read(&test_runs))
		return;

	/*
	 * If the warp value of this CPU is 0, then the other CPU
	 * observed time going backwards so this TSC was ahead and
	 * needs to move backwards.
	 */
	if (!cur_max_warp)
		cur_max_warp = -gbl_max_warp;

	/*
	 * Add the result to the previous adjustment value.
	 *
	 * The adjustment value is slightly off by the overhead of the
	 * sync mechanism (observed values are ~200 TSC cycles), but this
	 * really depends on CPU, node distance and frequency. So
	 * compensating for this is hard to get right. Experiments show
	 * that the warp is no longer detectable when the observed warp
	 * value is used. In the worst case the adjustment needs to go
	 * through a 3rd run for fine tuning.
	 */
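	/*
	 * Example: if this CPU observed no warp itself while the source
	 * side measured gbl_max_warp = 500 cycles, cur_max_warp was set
	 * to -500 above, so the write below moves this TSC 500 cycles
	 * backwards to close the gap before the next run.
	 */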
	cur->adjusted += cur_max_warp;

	pr_warn("TSC ADJUST compensate: CPU%u observed %lld warp. Adjust: %lld\n",
		cpu, cur_max_warp, cur->adjusted);

	wrmsrl(MSR_IA32_TSC_ADJUST, cur->adjusted);
	goto retry;
}

#endif /* CONFIG_SMP */