// SPDX-License-Identifier: GPL-2.0
/*
 * kernel/sched/loadavg.c
 *
 * This file contains the magic bits required to compute the global loadavg
6 | * figure. Its a silly number but people think its important. We go through |
7 | * great pains to make it work on big machines and tickless kernels. |
 */

/*
 * Global load-average calculations
 *
 * We take a distributed and async approach to calculating the global load-avg
 * in order to minimize overhead.
 *
 * The global load average is an exponentially decaying average of nr_running +
 * nr_uninterruptible.
 *
 * Once every LOAD_FREQ:
 *
 *	nr_active = 0;
 *	for_each_possible_cpu(cpu)
 *		nr_active += cpu_rq(cpu)->nr_running + cpu_rq(cpu)->nr_uninterruptible;
 *
 *	avenrun[n] = avenrun[0] * exp_n + nr_active * (1 - exp_n)
 *
 * Due to a number of reasons the above turns into the mess below:
 *
 *  - for_each_possible_cpu() is prohibitively expensive on machines with a
 *    serious number of CPUs, therefore we need to take a distributed approach
 *    to calculating nr_active.
 *
 *	\Sum_i x_i(t) = \Sum_i x_i(t) - x_i(t_0) | x_i(t_0) := 0
 *	              = \Sum_i { \Sum_j=1 x_i(t_j) - x_i(t_j-1) }
 *
 *    So, assuming nr_active := 0 when we start out -- true by definition -- we
 *    can simply take per-CPU deltas and fold those into a global accumulate
 *    to obtain the same result. See calc_load_fold_active().
 *
 *    Furthermore, in order to avoid synchronizing all per-CPU delta folding
 *    across the machine, we assume 10 ticks is sufficient time for every
 *    CPU to have completed this task.
 *
 *    This places an upper bound on the IRQ-off latency of the machine. Then
 *    again, being late doesn't lose the delta, it just wrecks the sample.
 *
 *  - cpu_rq()->nr_uninterruptible isn't accurately tracked per-CPU because
 *    this would add another cross-CPU cacheline miss and atomic operation
 *    to the wakeup path. Instead we increment on whatever CPU the task ran
 *    when it went into uninterruptible state and decrement on whatever CPU
 *    did the wakeup. This means that only the sum of nr_uninterruptible over
 *    all CPUs yields the correct result.
 *
 * This covers the NO_HZ=n code; for extra headaches, see the comment below.
 */
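
/*
 * For illustration only (made-up numbers): if, between two folds, CPU0 goes
 * from 2 to 3 active tasks (+1), CPU1 from 1 to 0 (-1) and CPU2 from 0 to 2
 * (+2), each CPU folds just its own delta into the global accumulator:
 *
 *	calc_load_tasks += +1 - 1 + 2 = +2
 *
 * which equals the change in \Sum_i x_i(t) without any single location ever
 * walking all CPUs.
 */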

/* Variables and functions for calc_load */
atomic_long_t calc_load_tasks;
unsigned long calc_load_update;
unsigned long avenrun[3];
EXPORT_SYMBOL(avenrun); /* should be removed */

/**
 * get_avenrun - get the load average array
 * @loads:	pointer to dest load array
 * @offset:	offset to add
 * @shift:	shift count to shift the result left
 *
 * These values are estimates at best, so no need for locking.
 */
void get_avenrun(unsigned long *loads, unsigned long offset, int shift)
{
	loads[0] = (avenrun[0] + offset) << shift;
	loads[1] = (avenrun[1] + offset) << shift;
	loads[2] = (avenrun[2] + offset) << shift;
}
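
/*
 * Example consumer -- a sketch of what fs/proc/loadavg.c does: it passes
 * FIXED_1/200 as @offset so the truncating decimal split below rounds to
 * the nearest 0.01:
 *
 *	unsigned long avnrun[3];
 *
 *	get_avenrun(avnrun, FIXED_1/200, 0);
 *	seq_printf(m, "%lu.%02lu", LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]));
 *
 * With FSHIFT == 11, avnrun[0] == 4260 prints as "2.08".
 */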

long calc_load_fold_active(struct rq *this_rq, long adjust)
{
	long nr_active, delta = 0;

	nr_active = this_rq->nr_running - adjust;
	nr_active += (int)this_rq->nr_uninterruptible;

	if (nr_active != this_rq->calc_load_active) {
		delta = nr_active - this_rq->calc_load_active;
		this_rq->calc_load_active = nr_active;
	}

	return delta;
}
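
/*
 * Worked example (made-up numbers): with 3 runnable plus 1 uninterruptible
 * task and a previously folded calc_load_active of 2, this returns a delta
 * of +2 and records 4, so a subsequent fold reports only further change.
 */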

/**
 * fixed_power_int - compute: x^n, in O(log n) time
 *
 * @x:         base of the power
 * @frac_bits: fractional bits of @x
 * @n:         power to raise @x to.
 *
 * By exploiting the relation between the definition of the natural power
 * function: x^n := x*x*...*x (x multiplied by itself n times), and
 * the binary encoding of numbers used by computers: n := \Sum n_i * 2^i,
 * (where: n_i \elem {0, 1}, the binary vector representing n),
 * we find: x^n := x^(\Sum n_i * 2^i) := \Prod x^(n_i * 2^i), which is
 * of course trivially computable in O(log_2 n), the length of our binary
 * vector.
 */
static unsigned long
fixed_power_int(unsigned long x, unsigned int frac_bits, unsigned int n)
{
	unsigned long result = 1UL << frac_bits;

	if (n) {
		for (;;) {
			if (n & 1) {
				result *= x;
				result += 1UL << (frac_bits - 1);
				result >>= frac_bits;
			}
			n >>= 1;
			if (!n)
				break;
			x *= x;
			x += 1UL << (frac_bits - 1);
			x >>= frac_bits;
		}
	}

	return result;
}
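
/*
 * A worked example, assuming FSHIFT == 11 (so FIXED_1 == 2048): one squaring
 * step on EXP_1 == 1884 (~0.92) computes
 *
 *	(1884 * 1884 + 1024) >> 11 == 1733
 *
 * i.e. ~0.846 in fixed-point, matching 0.92^2. For n == 3 (binary 11) the
 * loop multiplies result by x, squares x once, multiplies again, and
 * yields 1594 (~0.778).
 */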

/*
 * a1 = a0 * e + a * (1 - e)
 *
 * a2 = a1 * e + a * (1 - e)
 *    = (a0 * e + a * (1 - e)) * e + a * (1 - e)
 *    = a0 * e^2 + a * (1 - e) * (1 + e)
 *
 * a3 = a2 * e + a * (1 - e)
 *    = (a0 * e^2 + a * (1 - e) * (1 + e)) * e + a * (1 - e)
 *    = a0 * e^3 + a * (1 - e) * (1 + e + e^2)
 *
 * ...
 *
 * an = a0 * e^n + a * (1 - e) * (1 + e + ... + e^(n-1))		[1]
 *    = a0 * e^n + a * (1 - e) * (1 - e^n)/(1 - e)
 *    = a0 * e^n + a * (1 - e^n)
 *
 * [1] application of the geometric series:
 *
 *              n         1 - x^(n+1)
 *     S_n := \Sum x^i = -------------
 *             i=0           1 - x
 */
unsigned long
calc_load_n(unsigned long load, unsigned long exp,
	    unsigned long active, unsigned int n)
{
	return calc_load(load, fixed_power_int(exp, FSHIFT, n), active);
}
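
/*
 * In other words, to skip n missed sample windows in one go, calc_load_n()
 * decays @load by exp^n and blends in @active with weight (1 - exp^n). For
 * instance, with EXP_1 == 1884 and FIXED_1 == 2048, being 3 windows behind
 * with active == 0 shrinks avenrun[0] to ~77.8% of its old value in a
 * single call (see the fixed_power_int() example above).
 */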

#ifdef CONFIG_NO_HZ_COMMON
/*
 * Handle NO_HZ for the global load-average.
 *
 * Since the above described distributed algorithm to compute the global
 * load-average relies on per-CPU sampling from the tick, it is affected by
 * NO_HZ.
 *
 * The basic idea is to fold the nr_active delta into a global NO_HZ-delta upon
 * entering NO_HZ state such that we can include this as an 'extra' CPU delta
 * when we read the global state.
 *
 * Obviously reality has to ruin such a delightfully simple scheme:
 *
 *  - When we go NO_HZ idle during the window, we can negate our sample
 *    contribution, causing under-accounting.
 *
 *    We avoid this by keeping two NO_HZ-delta counters and flipping them
 *    when the window starts, thus separating old and new NO_HZ load.
 *
 *    The only trick is the slight shift in index flip for read vs write.
 *
 *        0s            5s            10s           15s
 *          +10           +10           +10           +10
 *        |-|-----------|-|-----------|-|-----------|-|
 *    r:0 0 1           1 0 0 1         1 0 0 1       1 0
 *    w:0 1 1           0 0 1 1         0 0 1 1       0 0
 *
 *    This ensures we'll fold the old NO_HZ contribution in this window while
 *    accumulating the new one.
 *
 *  - When we wake up from NO_HZ during the window, we push up our
 *    contribution, since we effectively move our sample point to a known
 *    busy state.
 *
 *    This is solved by pushing the window forward, and thus skipping the
 *    sample, for this CPU (effectively using the NO_HZ-delta for this CPU which
 *    was in effect at the time the window opened). This also solves the issue
 *    of having to deal with a CPU having been in NO_HZ for multiple LOAD_FREQ
 *    intervals.
 *
 * When making the ILB scale, we should try to pull this in as well.
 */
static atomic_long_t calc_load_nohz[2];
static int calc_load_idx;

static inline int calc_load_write_idx(void)
{
	int idx = calc_load_idx;

	/*
	 * See calc_global_nohz(), if we observe the new index, we also
	 * need to observe the new update time.
	 */
	smp_rmb();

	/*
	 * If the folding window started, make sure we start writing in the
	 * next NO_HZ-delta.
	 */
	if (!time_before(jiffies, READ_ONCE(calc_load_update)))
		idx++;

	return idx & 1;
}

static inline int calc_load_read_idx(void)
{
	return calc_load_idx & 1;
}

static void calc_load_nohz_fold(struct rq *rq)
{
	long delta;

	delta = calc_load_fold_active(rq, 0);
	if (delta) {
		int idx = calc_load_write_idx();

		atomic_long_add(delta, &calc_load_nohz[idx]);
	}
}

void calc_load_nohz_start(void)
{
	/*
	 * We're going into NO_HZ mode, if there's any pending delta, fold it
	 * into the pending NO_HZ delta.
	 */
	calc_load_nohz_fold(this_rq());
}

/*
 * Keep track of the load for NOHZ_FULL, must be called between
 * calc_load_nohz_{start,stop}().
 */
void calc_load_nohz_remote(struct rq *rq)
{
	calc_load_nohz_fold(rq);
}

void calc_load_nohz_stop(void)
{
	struct rq *this_rq = this_rq();

	/*
	 * If we're still before the pending sample window, we're done.
	 */
	this_rq->calc_load_update = READ_ONCE(calc_load_update);
	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	/*
	 * We woke inside or after the sample window, this means we're already
	 * accounted through the nohz accounting, so skip the entire deal and
	 * sync up for the next window.
	 */
	if (time_before(jiffies, this_rq->calc_load_update + 10))
		this_rq->calc_load_update += LOAD_FREQ;
}

static long calc_load_nohz_read(void)
{
	int idx = calc_load_read_idx();
	long delta = 0;

	if (atomic_long_read(&calc_load_nohz[idx]))
		delta = atomic_long_xchg(&calc_load_nohz[idx], 0);

	return delta;
}

/*
 * NO_HZ can leave us missing all per-CPU ticks calling
 * calc_load_fold_active(), but since a NO_HZ CPU folds its delta into
 * calc_load_nohz per calc_load_nohz_start(), all we need to do is fold
 * in the pending NO_HZ delta if our NO_HZ period crossed a load cycle boundary.
 *
 * Once we've updated the global active value, we need to apply the exponential
 * weights adjusted to the number of cycles missed.
 */
static void calc_global_nohz(void)
{
	unsigned long sample_window;
	long delta, active, n;

	sample_window = READ_ONCE(calc_load_update);
	if (!time_before(jiffies, sample_window + 10)) {
		/*
		 * Catch-up, fold however many we are behind still
		 */
		delta = jiffies - sample_window - 10;
		n = 1 + (delta / LOAD_FREQ);

		active = atomic_long_read(&calc_load_tasks);
		active = active > 0 ? active * FIXED_1 : 0;

		avenrun[0] = calc_load_n(avenrun[0], EXP_1, active, n);
		avenrun[1] = calc_load_n(avenrun[1], EXP_5, active, n);
		avenrun[2] = calc_load_n(avenrun[2], EXP_15, active, n);

		WRITE_ONCE(calc_load_update, sample_window + n * LOAD_FREQ);
	}

	/*
	 * Flip the NO_HZ index...
	 *
	 * Make sure we first write the new time then flip the index, so that
	 * calc_load_write_idx() will see the new time when it reads the new
	 * index, this avoids a double flip messing things up.
	 */
	smp_wmb();
	calc_load_idx++;
}
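
/*
 * Catch-up example (made-up numbers, HZ == 100 so LOAD_FREQ == 501): if the
 * machine slept through two extra windows, jiffies - sample_window - 10 is
 * roughly 2 * LOAD_FREQ, so n == 1 + 2 == 3 and each avenrun figure is
 * advanced three windows in a single calc_load_n() call.
 */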
#else /* !CONFIG_NO_HZ_COMMON */

static inline long calc_load_nohz_read(void) { return 0; }
static inline void calc_global_nohz(void) { }

#endif /* CONFIG_NO_HZ_COMMON */

/*
 * calc_global_load - update the avenrun load estimates 10 ticks after the
 * CPUs have updated calc_load_tasks.
 *
 * Called from the global timer code.
 */
void calc_global_load(void)
{
	unsigned long sample_window;
	long active, delta;

	sample_window = READ_ONCE(calc_load_update);
	if (time_before(jiffies, sample_window + 10))
		return;

	/*
	 * Fold the 'old' NO_HZ-delta to include all NO_HZ CPUs.
	 */
	delta = calc_load_nohz_read();
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	active = atomic_long_read(&calc_load_tasks);
	active = active > 0 ? active * FIXED_1 : 0;

	avenrun[0] = calc_load(avenrun[0], EXP_1, active);
	avenrun[1] = calc_load(avenrun[1], EXP_5, active);
	avenrun[2] = calc_load(avenrun[2], EXP_15, active);

	WRITE_ONCE(calc_load_update, sample_window + LOAD_FREQ);

	/*
	 * In case we went to NO_HZ for multiple LOAD_FREQ intervals
	 * catch up in bulk.
	 */
	calc_global_nohz();
}
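
/*
 * A worked numeric example, assuming FIXED_1 == 2048 and EXP_1 == 1884: with
 * avenrun[0] == 0 and two active tasks (active == 2 * FIXED_1 == 4096), one
 * sample window computes roughly
 *
 *	(0 * 1884 + 4096 * (2048 - 1884)) >> 11 == 328
 *
 * i.e. the 1-minute figure moves from 0.00 to ~0.16 and keeps converging
 * towards 2.00 over subsequent windows.
 */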

/*
 * Called from scheduler_tick() to periodically update this CPU's
 * active count.
 */
void calc_global_load_tick(struct rq *this_rq)
{
	long delta;

	if (time_before(jiffies, this_rq->calc_load_update))
		return;

	delta = calc_load_fold_active(this_rq, 0);
	if (delta)
		atomic_long_add(delta, &calc_load_tasks);

	this_rq->calc_load_update += LOAD_FREQ;
}