pelt.c source code [linux/kernel/sched/pelt.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Per Entity Load Tracking (PELT)
4	*
5	* Copyright (C) 2007 Red Hat, Inc., Ingo Molnar <mingo@redhat.com>
6	*
7	* Interactivity improvements by Mike Galbraith
8	* (C) 2007 Mike Galbraith <efault@gmx.de>
9	*
10	* Various enhancements by Dmitry Adamushko.
11	* (C) 2007 Dmitry Adamushko <dmitry.adamushko@gmail.com>
12	*
13	* Group scheduling enhancements by Srivatsa Vaddagiri
14	* Copyright IBM Corporation, 2007
15	* Author: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com>
16	*
17	* Scaled math optimizations by Thomas Gleixner
18	* Copyright (C) 2007, Thomas Gleixner <tglx@linutronix.de>
19	*
20	* Adaptive scheduling granularity, math enhancements by Peter Zijlstra
21	* Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra
22	*
23	* Move PELT related code from fair.c into this pelt.c file
24	* Author: Vincent Guittot <vincent.guittot@linaro.org>
25	*/
26
27	/*
28	* Approximate:
29	* val * y^n, where y^32 ~= 0.5 (~1 scheduling period)
30	*/
31	static u64 decay_load(u64 val, u64 n)
32	{
33	unsigned int local_n;
34
35	if (unlikely(n > LOAD_AVG_PERIOD * `63`))
36	return `0`;
37
38	/ after bounds checking we can collapse to 32-bit /
39	local_n = n;
40
41	/*
42	* As y^PERIOD = 1/2, we can combine
43	* y^n = 1/2^(n/PERIOD) * y^(n%PERIOD)
44	* With a look-up table which covers y^n (n<PERIOD)
45	*
46	* To achieve constant time decay_load.
47	*/
48	if (unlikely(local_n >= LOAD_AVG_PERIOD)) {
49	val >>= local_n / LOAD_AVG_PERIOD;
50	local_n %= LOAD_AVG_PERIOD;
51	}
52
53	val = mul_u64_u32_shr(a: val, mul: runnable_avg_yN_inv[local_n], shift: `32`);
54	return val;
55	}
56
57	static u32 __accumulate_pelt_segments(u64 periods, u32 d1, u32 d3)
58	{
59	u32 c1, c2, c3 = d3; / y^0 == 1 /
60
61	/*
62	* c1 = d1 y^p
63	*/
64	c1 = decay_load(val: (u64)d1, n: periods);
65
66	/*
67	* p-1
68	* c2 = 1024 \Sum y^n
69	* n=1
70	*
71	* inf inf
72	* = 1024 ( \Sum y^n - \Sum y^n - y^0 )
73	* n=0 n=p
74	*/
75	c2 = LOAD_AVG_MAX - decay_load(LOAD_AVG_MAX, n: periods) - `1024`;
76
77	return c1 + c2 + c3;
78	}
79
80	/*
81	* Accumulate the three separate parts of the sum; d1 the remainder
82	* of the last (incomplete) period, d2 the span of full periods and d3
83	* the remainder of the (incomplete) current period.
84	*
85	* d1 d2 d3
86	* ^ ^ ^
87	* \| \| \|
88	* \|<->\|<----------------->\|<--->\|
89	* ... \|---x---\|------\| ... \|------\|-----x (now)
90	*
91	* p-1
92	* u' = (u + d1) y^p + 1024 \Sum y^n + d3 y^0
93	* n=1
94	*
95	* = u y^p + (Step 1)
96	*
97	* p-1
98	* d1 y^p + 1024 \Sum y^n + d3 y^0 (Step 2)
99	* n=1
100	*/
101	static __always_inline u32
102	accumulate_sum(u64 delta, struct sched_avg *sa,
103	unsigned long load, unsigned long runnable, int running)
104	{
105	u32 contrib = (u32)delta; / p == 0 -> delta < 1024 /
106	u64 periods;
107
108	delta += sa->period_contrib;
109	periods = delta / `1024`; / A period is 1024us (~1ms) /
110
111	/*
112	* Step 1: decay old *_sum if we crossed period boundaries.
113	*/
114	if (periods) {
115	sa->load_sum = decay_load(val: sa->load_sum, n: periods);
116	sa->runnable_sum =
117	decay_load(val: sa->runnable_sum, n: periods);
118	sa->util_sum = decay_load(val: (u64)(sa->util_sum), n: periods);
119
120	/*
121	* Step 2
122	*/
123	delta %= `1024`;
124	if (load) {
125	/*
126	* This relies on the:
127	*
128	* if (!load)
129	* runnable = running = 0;
130	*
131	* clause from ___update_load_sum(); this results in
132	* the below usage of @contrib to disappear entirely,
133	* so no point in calculating it.
134	*/
135	contrib = __accumulate_pelt_segments(periods,
136	d1: `1024` - sa->period_contrib, d3: delta);
137	}
138	}
139	sa->period_contrib = delta;
140
141	if (load)
142	sa->load_sum += load * contrib;
143	if (runnable)
144	sa->runnable_sum += runnable * contrib << SCHED_CAPACITY_SHIFT;
145	if (running)
146	sa->util_sum += contrib << SCHED_CAPACITY_SHIFT;
147
148	return periods;
149	}
150
151	/*
152	* We can represent the historical contribution to runnable average as the
153	* coefficients of a geometric series. To do this we sub-divide our runnable
154	* history into segments of approximately 1ms (1024us); label the segment that
155	* occurred N-ms ago p_N, with p_0 corresponding to the current period, e.g.
156	*
157	* [<- 1024us ->\|<- 1024us ->\|<- 1024us ->\| ...
158	* p0 p1 p2
159	* (now) (~1ms ago) (~2ms ago)
160	*
161	* Let u_i denote the fraction of p_i that the entity was runnable.
162	*
163	* We then designate the fractions u_i as our co-efficients, yielding the
164	* following representation of historical load:
165	* u_0 + u_1y + u_2y^2 + u_3*y^3 + ...
166	*
167	* We choose y based on the with of a reasonably scheduling period, fixing:
168	* y^32 = 0.5
169	*
170	* This means that the contribution to load ~32ms ago (u_32) will be weighted
171	* approximately half as much as the contribution to load within the last ms
172	* (u_0).
173	*
174	* When a period "rolls over" and we have new u_0`, multiplying the previous
175	* sum again by y is sufficient to update:
176	* load_avg = u_0` + y(u_0 + u_1y + u_2*y^2 + ... )
177	* = u_0 + u_1y + u_2y^2 + ... [re-labeling u_i --> u_{i+1}]
178	*/
179	static __always_inline int
180	___update_load_sum(u64 now, struct sched_avg *sa,
181	unsigned long load, unsigned long runnable, int running)
182	{
183	u64 delta;
184
185	delta = now - sa->last_update_time;
186	/*
187	* This should only happen when time goes backwards, which it
188	* unfortunately does during sched clock init when we swap over to TSC.
189	*/
190	if ((s64)delta < `0`) {
191	sa->last_update_time = now;
192	return `0`;
193	}
194
195	/*
196	* Use 1024ns as the unit of measurement since it's a reasonable
197	* approximation of 1us and fast to compute.
198	*/
199	delta >>= `10`;
200	if (!delta)
201	return `0`;
202
203	sa->last_update_time += delta << `10`;
204
205	/*
206	* running is a subset of runnable (weight) so running can't be set if
207	* runnable is clear. But there are some corner cases where the current
208	* se has been already dequeued but cfs_rq->curr still points to it.
209	* This means that weight will be 0 but not running for a sched_entity
210	* but also for a cfs_rq if the latter becomes idle. As an example,
211	* this happens during idle_balance() which calls
212	* update_blocked_averages().
213	*
214	* Also see the comment in accumulate_sum().
215	*/
216	if (!load)
217	runnable = running = `0`;
218
219	/*
220	* Now we know we crossed measurement unit boundaries. The *_avg
221	* accrues by two steps:
222	*
223	* Step 1: accumulate *_sum since last_update_time. If we haven't
224	* crossed period boundaries, finish.
225	*/
226	if (!accumulate_sum(delta, sa, load, runnable, running))
227	return `0`;
228
229	return `1`;
230	}
231
232	/*
233	* When syncing _avg with _sum, we must take into account the current
234	* position in the PELT segment otherwise the remaining part of the segment
235	* will be considered as idle time whereas it's not yet elapsed and this will
236	* generate unwanted oscillation in the range [1002..1024[.
237	*
238	* The max value of *_sum varies with the position in the time segment and is
239	* equals to :
240	*
241	* LOAD_AVG_MAX*y + sa->period_contrib
242	*
243	* which can be simplified into:
244	*
245	* LOAD_AVG_MAX - 1024 + sa->period_contrib
246	*
247	* because LOAD_AVG_MAX*y == LOAD_AVG_MAX-1024
248	*
249	* The same care must be taken when a sched entity is added, updated or
250	* removed from a cfs_rq and we need to update sched_avg. Scheduler entities
251	* and the cfs rq, to which they are attached, have the same position in the
252	* time segment because they use the same clock. This means that we can use
253	* the period_contrib of cfs_rq when updating the sched_avg of a sched_entity
254	* if it's more convenient.
255	*/
256	static __always_inline void
257	___update_load_avg(struct sched_avg sa, unsigned* long load)
258	{
259	u32 divider = get_pelt_divider(avg: sa);
260
261	/*
262	* Step 2: update *_avg.
263	*/
264	sa->load_avg = div_u64(dividend: load * sa->load_sum, divisor: divider);
265	sa->runnable_avg = div_u64(dividend: sa->runnable_sum, divisor: divider);
266	WRITE_ONCE(sa->util_avg, sa->util_sum / divider);
267	}
268
269	/*
270	* sched_entity:
271	*
272	* task:
273	* se_weight() = se->load.weight
274	* se_runnable() = !!on_rq
275	*
276	* group: [ see update_cfs_group() ]
277	* se_weight() = tg->weight * grq->load_avg / tg->load_avg
278	* se_runnable() = grq->h_nr_running
279	*
280	* runnable_sum = se_runnable() * runnable = grq->runnable_sum
281	* runnable_avg = runnable_sum
282	*
283	* load_sum := runnable
284	* load_avg = se_weight(se) * load_sum
285	*
286	* cfq_rq:
287	*
288	* runnable_sum = \Sum se->avg.runnable_sum
289	* runnable_avg = \Sum se->avg.runnable_avg
290	*
291	* load_sum = \Sum se_weight(se) * se->avg.load_sum
292	* load_avg = \Sum se->avg.load_avg
293	*/
294
295	int __update_load_avg_blocked_se(u64 now, struct sched_entity *se)
296	{
297	if (___update_load_sum(now, sa: &se->avg, load: `0`, runnable: `0`, running: `0`)) {
298	___update_load_avg(sa: &se->avg, load: se_weight(se));
299	trace_pelt_se_tp(se);
300	return `1`;
301	}
302
303	return `0`;
304	}
305
306	int __update_load_avg_se(u64 now, struct cfs_rq cfs_rq, struct* sched_entity *se)
307	{
308	if (___update_load_sum(now, sa: &se->avg, load: !!se->on_rq, runnable: se_runnable(se),
309	running: cfs_rq->curr == se)) {
310
311	___update_load_avg(sa: &se->avg, load: se_weight(se));
312	cfs_se_util_change(avg: &se->avg);
313	trace_pelt_se_tp(se);
314	return `1`;
315	}
316
317	return `0`;
318	}
319
320	int __update_load_avg_cfs_rq(u64 now, struct cfs_rq *cfs_rq)
321	{
322	if (___update_load_sum(now, sa: &cfs_rq->avg,
323	scale_load_down(cfs_rq->load.weight),
324	runnable: cfs_rq->h_nr_running,
325	running: cfs_rq->curr != NULL)) {
326
327	___update_load_avg(sa: &cfs_rq->avg, load: `1`);
328	trace_pelt_cfs_tp(cfs_rq);
329	return `1`;
330	}
331
332	return `0`;
333	}
334
335	/*
336	* rt_rq:
337	*
338	* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
339	* util_sum = cpu_scale * load_sum
340	* runnable_sum = util_sum
341	*
342	* load_avg and runnable_avg are not supported and meaningless.
343	*
344	*/
345
346	int update_rt_rq_load_avg(u64 now, struct rq rq, int* running)
347	{
348	if (___update_load_sum(now, sa: &rq->avg_rt,
349	load: running,
350	runnable: running,
351	running)) {
352
353	___update_load_avg(sa: &rq->avg_rt, load: `1`);
354	trace_pelt_rt_tp(rq);
355	return `1`;
356	}
357
358	return `0`;
359	}
360
361	/*
362	* dl_rq:
363	*
364	* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
365	* util_sum = cpu_scale * load_sum
366	* runnable_sum = util_sum
367	*
368	* load_avg and runnable_avg are not supported and meaningless.
369	*
370	*/
371
372	int update_dl_rq_load_avg(u64 now, struct rq rq, int* running)
373	{
374	if (___update_load_sum(now, sa: &rq->avg_dl,
375	load: running,
376	runnable: running,
377	running)) {
378
379	___update_load_avg(sa: &rq->avg_dl, load: `1`);
380	trace_pelt_dl_tp(rq);
381	return `1`;
382	}
383
384	return `0`;
385	}
386
387	#ifdef CONFIG_SCHED_THERMAL_PRESSURE
388	/*
389	* thermal:
390	*
391	* load_sum = \Sum se->avg.load_sum but se->avg.load_sum is not tracked
392	*
393	* util_avg and runnable_load_avg are not supported and meaningless.
394	*
395	* Unlike rt/dl utilization tracking that track time spent by a cpu
396	* running a rt/dl task through util_avg, the average thermal pressure is
397	* tracked through load_avg. This is because thermal pressure signal is
398	* time weighted "delta" capacity unlike util_avg which is binary.
399	* "delta capacity" = actual capacity -
400	* capped capacity a cpu due to a thermal event.
401	*/
402
403	int update_thermal_load_avg(u64 now, struct rq *rq, u64 capacity)
404	{
405	if (___update_load_sum(now, &rq->avg_thermal,
406	capacity,
407	capacity,
408	capacity)) {
409	___update_load_avg(&rq->avg_thermal, `1`);
410	trace_pelt_thermal_tp(rq);
411	return `1`;
412	}
413
414	return `0`;
415	}
416	#endif
417
418	#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
419	/*
420	* irq:
421	*
422	* util_sum = \Sum se->avg.util_sum but se->avg.util_sum is not tracked
423	* util_sum = cpu_scale * load_sum
424	* runnable_sum = util_sum
425	*
426	* load_avg and runnable_avg are not supported and meaningless.
427	*
428	*/
429
430	int update_irq_load_avg(struct rq *rq, u64 running)
431	{
432	int ret = `0`;
433
434	/*
435	* We can't use clock_pelt because irq time is not accounted in
436	* clock_task. Instead we directly scale the running time to
437	* reflect the real amount of computation
438	*/
439	running = cap_scale(running, arch_scale_freq_capacity(cpu_of(rq)));
440	running = cap_scale(running, arch_scale_cpu_capacity(cpu_of(rq)));
441
442	/*
443	* We know the time that has been used by interrupt since last update
444	* but we don't when. Let be pessimistic and assume that interrupt has
445	* happened just before the update. This is not so far from reality
446	* because interrupt will most probably wake up task and trig an update
447	* of rq clock during which the metric is updated.
448	* We start to decay with normal context time and then we add the
449	* interrupt context time.
450	* We can safely remove running from rq->clock because
451	* rq->clock += delta with delta >= running
452	*/
453	ret = ___update_load_sum(now: rq->clock - running, sa: &rq->avg_irq,
454	load: `0`,
455	runnable: `0`,
456	running: `0`);
457	ret += ___update_load_sum(now: rq->clock, sa: &rq->avg_irq,
458	load: `1`,
459	runnable: `1`,
460	running: `1`);
461
462	if (ret) {
463	___update_load_avg(sa: &rq->avg_irq, load: `1`);
464	trace_pelt_irq_tp(rq);
465	}
466
467	return ret;
468	}
469	#endif
470

source code of linux/kernel/sched/pelt.c