// SPDX-License-Identifier: GPL-2.0-only
/*
 * menu.c - the menu idle governor
 *
 * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
 * Copyright (C) 2009 Intel Corporation
 * Author:
 *        Arjan van de Ven <arjan@linux.intel.com>
 */

#include <linux/kernel.h>
#include <linux/cpuidle.h>
#include <linux/time.h>
#include <linux/ktime.h>
#include <linux/hrtimer.h>
#include <linux/tick.h>
#include <linux/sched.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/stat.h>
#include <linux/math64.h>

#include "gov.h"

#define BUCKETS 12
#define INTERVAL_SHIFT 3
#define INTERVALS (1UL << INTERVAL_SHIFT)
#define RESOLUTION 1024
#define DECAY 8
#define MAX_INTERESTING (50000 * NSEC_PER_USEC)

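/*
 * Note on the fixed-point representation used below: each correction factor
 * is stored scaled so that RESOLUTION * DECAY represents unity, i.e. "the
 * observed idle time matched the next timer estimate exactly".  DECAY
 * controls how quickly old observations are aged out of the running
 * average: each update keeps 1 - 1/DECAY of the previous value.
 */
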
/*
 * Concepts and ideas behind the menu governor
 *
 * For the menu governor, there are 3 decision factors for picking a C
 * state:
 * 1) Energy break even point
 * 2) Performance impact
 * 3) Latency tolerance (from pmqos infrastructure)
 * These three factors are treated independently.
 *
 * Energy break even point
 * -----------------------
 * C state entry and exit have an energy cost, and a certain amount of time in
 * the C state is required to actually break even on this cost. CPUIDLE
 * provides us this duration in the "target_residency" field. So all that we
 * need is a good prediction of how long we'll be idle. Like the traditional
 * menu governor, we start with the actual known "next timer event" time.
 *
 * Since there are other sources of wakeups (interrupts for example) than
 * the next timer event, this estimation is rather optimistic. To get a
 * more realistic estimate, a correction factor is applied to the estimate,
 * that is based on historic behavior. For example, if in the past the actual
 * duration always was 50% of the next timer tick, the correction factor will
 * be 0.5.
 *
 * menu uses a running average for this correction factor, however it uses a
 * set of factors, not just a single factor. This stems from the realization
 * that the ratio is dependent on the order of magnitude of the expected
 * duration; if we expect 500 milliseconds of idle time the likelihood of
 * getting an interrupt very early is much higher than if we expect 50
 * microseconds of idle time. A second independent factor that has a big
 * impact on the actual factor is whether there is (disk) IO outstanding or
 * not.
 * (as a special twist, we consider every sleep longer than 50 milliseconds
 * as perfect; there are no power gains for sleeping longer than this)
 *
 * For these two reasons we keep an array of 12 independent factors that gets
 * indexed based on the magnitude of the expected duration as well as the
 * "is IO outstanding" property.
 *
 * Repeatable-interval-detector
 * ----------------------------
 * There are some cases where "next timer" is a completely unusable predictor:
 * Those cases where the interval is fixed, for example due to hardware
 * interrupt mitigation, but also due to fixed transfer rate devices such as
 * mice.
 * For this, we use a different predictor: We track the duration of the last 8
 * intervals and if the standard deviation of these 8 intervals is below a
 * threshold value, we use the average of these intervals as prediction.
 *
 * Limiting Performance Impact
 * ---------------------------
 * C states, especially those with large exit latencies, can have a real
 * noticeable impact on workloads, which is not acceptable for most sysadmins,
 * and in addition, less performance has a power price of its own.
 *
 * As a general rule of thumb, menu assumes that the following heuristic
 * holds:
 *     The busier the system, the less impact of C states is acceptable
 *
 * This rule-of-thumb is implemented using a performance-multiplier:
 * If the exit latency times the performance multiplier is longer than
 * the predicted duration, the C state is not considered a candidate
 * for selection due to a too high performance impact. So the higher
 * this multiplier is, the longer we need to be idle to pick a deep C
 * state, and thus the less likely a busy CPU will hit such a deep
 * C state.
 *
 * Currently only one input is used to determine this multiplier:
 * 10 points are added for each process that is waiting for IO on this CPU
 * (this value was experimentally determined).
 *
 * The iowait count gives a cpu-local, instantaneous input to the decision:
 * the more tasks on this CPU are blocked on IO, the more performance
 * critical its wakeups are assumed to be.
 */
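
/*
 * Rough illustrative example of how the inputs combine: with the next timer
 * 1 ms away and a correction factor that has converged to 0.5, the governor
 * predicts about 500 us of idle time; if the repeating-interval detector
 * reports a typical interval of 200 us, the smaller of the two values
 * (200 us) is what gets compared against the states' target residencies.
 */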

struct menu_device {
	int		needs_update;
	int		tick_wakeup;

	u64		next_timer_ns;
	unsigned int	bucket;
	unsigned int	correction_factor[BUCKETS];
	unsigned int	intervals[INTERVALS];
	int		interval_ptr;
};

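/*
 * which_bucket() below maps the expected sleep length onto one of the
 * BUCKETS correction-factor slots: six order-of-magnitude ranges (below
 * 10 us, 100 us, 1 ms, 10 ms, 100 ms, and anything longer), kept separately
 * for the "IO pending" and "no IO pending" cases.
 */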
static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
{
	int bucket = 0;

	/*
	 * We keep two groups of stats; one with IO pending, one without.
	 * This allows us to calculate
	 * E(duration)|iowait
	 */
	if (nr_iowaiters)
		bucket = BUCKETS/2;

	if (duration_ns < 10ULL * NSEC_PER_USEC)
		return bucket;
	if (duration_ns < 100ULL * NSEC_PER_USEC)
		return bucket + 1;
	if (duration_ns < 1000ULL * NSEC_PER_USEC)
		return bucket + 2;
	if (duration_ns < 10000ULL * NSEC_PER_USEC)
		return bucket + 3;
	if (duration_ns < 100000ULL * NSEC_PER_USEC)
		return bucket + 4;
	return bucket + 5;
}

/*
 * Return a multiplier for the exit latency that is intended
 * to take performance requirements into account.
 * The more performance critical we estimate the system
 * to be, the higher this multiplier, and thus the higher
 * the barrier to go to an expensive C state.
 */
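/*
 * Illustrative example: with two tasks waiting for IO on this CPU the
 * multiplier is 21, so menu_select() divides the predicted idle time by 21
 * when capping the acceptable exit latency; a state with a 100 us exit
 * latency then needs roughly 2.1 ms of predicted idle time to be considered.
 */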
static inline int performance_multiplier(unsigned int nr_iowaiters)
{
	/* for IO wait tasks (per cpu!) we add 10x each */
	return 1 + 10 * nr_iowaiters;
}

static DEFINE_PER_CPU(struct menu_device, menu_devices);

static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);

/*
 * Try detecting repeating patterns by keeping track of the last 8
 * intervals, and checking if the standard deviation of that set
 * of points is below a threshold. If it is... then use the
 * average of these 8 points as the estimated value.
 */
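/*
 * Worked example (illustrative): eight recorded intervals of roughly 500 us
 * each with a standard deviation of 15 us give a variance of 225 us^2, which
 * is below the 400 us^2 cutoff, so 500 us is returned as the typical
 * interval.  If one of the eight samples were a 20 ms outlier, the first
 * pass would fail, the outlier would be discarded via the threshold, and the
 * remaining seven samples (still at least 3/4 of INTERVALS) would be
 * re-evaluated.
 */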
static unsigned int get_typical_interval(struct menu_device *data)
{
	int i, divisor;
	unsigned int min, max, thresh, avg;
	uint64_t sum, variance;

	thresh = INT_MAX; /* Discard outliers above this value */

again:

	/* First calculate the average of past intervals */
	min = UINT_MAX;
	max = 0;
	sum = 0;
	divisor = 0;
	for (i = 0; i < INTERVALS; i++) {
		unsigned int value = data->intervals[i];
		if (value <= thresh) {
			sum += value;
			divisor++;
			if (value > max)
				max = value;

			if (value < min)
				min = value;
		}
	}

	if (!max)
		return UINT_MAX;

	if (divisor == INTERVALS)
		avg = sum >> INTERVAL_SHIFT;
	else
		avg = div_u64(sum, divisor);

	/* Then try to determine variance */
	variance = 0;
	for (i = 0; i < INTERVALS; i++) {
		unsigned int value = data->intervals[i];
		if (value <= thresh) {
			int64_t diff = (int64_t)value - avg;
			variance += diff * diff;
		}
	}
	if (divisor == INTERVALS)
		variance >>= INTERVAL_SHIFT;
	else
		do_div(variance, divisor);

	/*
	 * The typical interval is obtained when standard deviation is
	 * small (stddev <= 20 us, variance <= 400 us^2) or standard
	 * deviation is small compared to the average interval (avg >
	 * 6*stddev, avg^2 > 36*variance). The average is smaller than
	 * UINT_MAX aka U32_MAX, so computing its square does not
	 * overflow a u64. We simply reject this candidate average if
	 * the standard deviation is greater than 715 s (which is
	 * rather unlikely).
	 *
	 * Use this result only if there is no timer to wake us up sooner.
	 */
	if (likely(variance <= U64_MAX/36)) {
		if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3))
		    || variance <= 400) {
			return avg;
		}
	}

	/*
	 * If we have outliers to the upside in our distribution, discard
	 * those by setting the threshold to exclude these outliers, then
	 * calculate the average and standard deviation again. Once we get
	 * down to the bottom 3/4 of our samples, stop excluding samples.
	 *
	 * This can deal with workloads that have long pauses interspersed
	 * with sporadic activity with a bunch of short pauses.
	 */
	if ((divisor * 4) <= INTERVALS * 3)
		return UINT_MAX;

	thresh = max - 1;
	goto again;
}

/**
 * menu_select - selects the next idle state to enter
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 * @stop_tick: indication on whether or not to stop the tick
 */
static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
		       bool *stop_tick)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);
	s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
	u64 predicted_ns;
	u64 interactivity_req;
	unsigned int nr_iowaiters;
	ktime_t delta, delta_tick;
	int i, idx;

	if (data->needs_update) {
		menu_update(drv, dev);
		data->needs_update = 0;
	}

	nr_iowaiters = nr_iowait_cpu(dev->cpu);

	/* Find the shortest expected idle interval. */
	predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
	if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
		unsigned int timer_us;

		/* Determine the time till the closest timer. */
		delta = tick_nohz_get_sleep_length(&delta_tick);
		if (unlikely(delta < 0)) {
			delta = 0;
			delta_tick = 0;
		}

		data->next_timer_ns = delta;
		data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);

		/* Round up the result for half microseconds. */
		timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
					data->next_timer_ns *
						data->correction_factor[data->bucket],
				   RESOLUTION * DECAY * NSEC_PER_USEC);
		/* Use the lowest expected idle interval to pick the idle state. */
		predicted_ns = min((u64)timer_us * NSEC_PER_USEC, predicted_ns);
	} else {
		/*
		 * Because the next timer event is not going to be determined
		 * in this case, assume that without the tick the closest timer
		 * will be in distant future and that the closest tick will
		 * occur after 1/2 of the tick period.
		 */
		data->next_timer_ns = KTIME_MAX;
		delta_tick = TICK_NSEC / 2;
		data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
	}

	if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
	    ((data->next_timer_ns < drv->states[1].target_residency_ns ||
	      latency_req < drv->states[1].exit_latency_ns) &&
	     !dev->states_usage[0].disable)) {
		/*
		 * In this case state[0] will be used no matter what, so return
		 * it right away and keep the tick running if state[0] is a
		 * polling one.
		 */
		*stop_tick = !(drv->states[0].flags & CPUIDLE_FLAG_POLLING);
		return 0;
	}

	if (tick_nohz_tick_stopped()) {
		/*
		 * If the tick is already stopped, the cost of possible short
		 * idle duration misprediction is much higher, because the CPU
		 * may be stuck in a shallow idle state for a long time as a
		 * result of it. In that case say we might mispredict and use
		 * the known time till the closest timer event for the idle
		 * state selection.
		 */
		if (predicted_ns < TICK_NSEC)
			predicted_ns = data->next_timer_ns;
	} else {
		/*
		 * Use the performance multiplier and the user-configurable
		 * latency_req to determine the maximum exit latency.
		 */
		interactivity_req = div64_u64(predicted_ns,
					      performance_multiplier(nr_iowaiters));
		if (latency_req > interactivity_req)
			latency_req = interactivity_req;
	}

	/*
	 * Find the idle state with the lowest power while satisfying
	 * our constraints.
	 */
	idx = -1;
	for (i = 0; i < drv->state_count; i++) {
		struct cpuidle_state *s = &drv->states[i];

		if (dev->states_usage[i].disable)
			continue;

		if (idx == -1)
			idx = i; /* first enabled state */

		if (s->target_residency_ns > predicted_ns) {
			/*
			 * Use a physical idle state, not busy polling, unless
			 * a timer is going to trigger soon enough.
			 */
			if ((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) &&
			    s->exit_latency_ns <= latency_req &&
			    s->target_residency_ns <= data->next_timer_ns) {
				predicted_ns = s->target_residency_ns;
				idx = i;
				break;
			}
			if (predicted_ns < TICK_NSEC)
				break;

			if (!tick_nohz_tick_stopped()) {
				/*
				 * If the state selected so far is shallow,
				 * waking up early won't hurt, so retain the
				 * tick in that case and let the governor run
				 * again in the next iteration of the loop.
				 */
				predicted_ns = drv->states[idx].target_residency_ns;
				break;
			}

			/*
			 * If the state selected so far is shallow and this
			 * state's target residency matches the time till the
			 * closest timer event, select this one to avoid getting
			 * stuck in the shallow one for too long.
			 */
			if (drv->states[idx].target_residency_ns < TICK_NSEC &&
			    s->target_residency_ns <= delta_tick)
				idx = i;

			return idx;
		}
		if (s->exit_latency_ns > latency_req)
			break;

		idx = i;
	}

	if (idx == -1)
		idx = 0; /* No states enabled. Must use 0. */

	/*
	 * Don't stop the tick if the selected state is a polling one or if the
	 * expected idle duration is shorter than the tick period length.
	 */
	if (((drv->states[idx].flags & CPUIDLE_FLAG_POLLING) ||
	     predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) {
		*stop_tick = false;

		if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) {
			/*
			 * The tick is not going to be stopped and the target
			 * residency of the state to be returned is not within
			 * the time until the next timer event including the
			 * tick, so try to correct that.
			 */
			for (i = idx - 1; i >= 0; i--) {
				if (dev->states_usage[i].disable)
					continue;

				idx = i;
				if (drv->states[i].target_residency_ns <= delta_tick)
					break;
			}
		}
	}

	return idx;
}

/**
 * menu_reflect - records that data structures need update
 * @dev: the CPU
 * @index: the index of actual entered state
 *
 * NOTE: it's important to be fast here because this operation will add to
 * the overall exit latency.
 */
static void menu_reflect(struct cpuidle_device *dev, int index)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);

	dev->last_state_idx = index;
	data->needs_update = 1;
	data->tick_wakeup = tick_nohz_idle_got_tick();
}

/**
 * menu_update - attempts to guess what happened after entry
 * @drv: cpuidle driver containing state data
 * @dev: the CPU
 */
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev)
{
	struct menu_device *data = this_cpu_ptr(&menu_devices);
	int last_idx = dev->last_state_idx;
	struct cpuidle_state *target = &drv->states[last_idx];
	u64 measured_ns;
	unsigned int new_factor;

	/*
	 * Try to figure out how much time passed between entry to low
	 * power state and occurrence of the wakeup event.
	 *
	 * If the entered idle state didn't support residency measurements,
	 * we use them anyway if they are short, and if long,
	 * truncate to the whole expected time.
	 *
	 * Any measured amount of time will include the exit latency.
	 * Since we are interested in when the wakeup began, not when it
	 * was completed, we must subtract the exit latency. However, if
	 * the measured amount of time is less than the exit latency,
	 * assume the state was never reached and the exit latency is 0.
	 */
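	/*
	 * For example (illustrative numbers): a measured residency of 80 us
	 * in a state with a 30 us exit latency is recorded as 50 us of idle
	 * time, while a measured 40 us (less than twice the exit latency) is
	 * simply halved to 20 us.
	 */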

	if (data->tick_wakeup && data->next_timer_ns > TICK_NSEC) {
		/*
		 * The nohz code said that there wouldn't be any events within
		 * the tick boundary (if the tick was stopped), but the idle
		 * duration predictor had a differing opinion. Since the CPU
		 * was woken up by a tick (that wasn't stopped after all), the
		 * predictor was not quite right, so assume that the CPU could
		 * have been idle long (but not forever) to help the idle
		 * duration predictor do a better job next time.
		 */
		measured_ns = 9 * MAX_INTERESTING / 10;
	} else if ((drv->states[last_idx].flags & CPUIDLE_FLAG_POLLING) &&
		   dev->poll_time_limit) {
		/*
		 * The CPU exited the "polling" state due to a time limit, so
		 * the idle duration prediction leading to the selection of that
		 * state was inaccurate. If a better prediction had been made,
		 * the CPU might have been woken up from idle by the next timer.
		 * Assume that to be the case.
		 */
		measured_ns = data->next_timer_ns;
	} else {
		/* measured value */
		measured_ns = dev->last_residency_ns;

		/* Deduct exit latency */
		if (measured_ns > 2 * target->exit_latency_ns)
			measured_ns -= target->exit_latency_ns;
		else
			measured_ns /= 2;
	}

	/* Make sure our coefficients do not exceed unity */
	if (measured_ns > data->next_timer_ns)
		measured_ns = data->next_timer_ns;

	/* Update our correction ratio */
	new_factor = data->correction_factor[data->bucket];
	new_factor -= new_factor / DECAY;

	if (data->next_timer_ns > 0 && measured_ns < MAX_INTERESTING)
		new_factor += div64_u64(RESOLUTION * measured_ns,
					data->next_timer_ns);
	else
		/*
		 * we were idle so long that we count it as a perfect
		 * prediction
		 */
		new_factor += RESOLUTION;
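
	/*
	 * Illustrative arithmetic: starting from the unity factor
	 * (RESOLUTION * DECAY = 8192), an observation of half the predicted
	 * time gives 8192 - 8192/8 + 1024/2 = 7680; repeated half-length
	 * observations converge on 4096, i.e. a correction factor of 0.5.
	 */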

	/*
	 * We don't want 0 as factor; we always want at least
	 * a tiny bit of estimated time. Fortunately, due to rounding,
	 * new_factor will stay nonzero regardless of measured_ns values
	 * and the compiler can eliminate this test as long as DECAY > 1.
	 */
	if (DECAY == 1 && unlikely(new_factor == 0))
		new_factor = 1;

	data->correction_factor[data->bucket] = new_factor;

	/* update the repeating-pattern data */
	data->intervals[data->interval_ptr++] = ktime_to_us(measured_ns);
	if (data->interval_ptr >= INTERVALS)
		data->interval_ptr = 0;
}

/**
 * menu_enable_device - scans a CPU's states and does setup
 * @drv: cpuidle driver
 * @dev: the CPU
 */
static int menu_enable_device(struct cpuidle_driver *drv,
			      struct cpuidle_device *dev)
{
	struct menu_device *data = &per_cpu(menu_devices, dev->cpu);
	int i;

	memset(data, 0, sizeof(struct menu_device));

	/*
	 * if the correction factor is 0 (eg first time init or cpu hotplug
	 * etc), we actually want to start out with a unity factor.
	 */
	for (i = 0; i < BUCKETS; i++)
		data->correction_factor[i] = RESOLUTION * DECAY;

	return 0;
}

static struct cpuidle_governor menu_governor = {
	.name =		"menu",
	.rating =	20,
	.enable =	menu_enable_device,
	.select =	menu_select,
	.reflect =	menu_reflect,
};

/**
 * init_menu - initializes the governor
 */
static int __init init_menu(void)
{
	return cpuidle_register_governor(&menu_governor);
}

postcore_initcall(init_menu);