energy_model.h source code [linux/include/linux/energy_model.h]

/* SPDX-License-Identifier: GPL-2.0 */
2	#ifndef _LINUX_ENERGY_MODEL_H
3	#define _LINUX_ENERGY_MODEL_H
4	#include <linux/cpumask.h>
5	#include <linux/device.h>
6	#include <linux/jump_label.h>
7	#include <linux/kobject.h>
8	#include <linux/rcupdate.h>
9	#include <linux/sched/cpufreq.h>
10	#include <linux/sched/topology.h>
11	#include <linux/types.h>
12
/**
 * struct em_perf_state - Performance state of a performance domain
 * @frequency:	The frequency in KHz, for consistency with CPUFreq
 * @power:	The power consumed at this level (by 1 CPU or by a registered
 *		device). It can be a total power: static and dynamic.
 * @cost:	The cost coefficient associated with this level, used during
 *		energy calculation. Equal to: power * max_frequency / frequency
 * @flags:	see "em_perf_state flags" description below.
 */
struct em_perf_state {
	unsigned long frequency;
	unsigned long power;
	unsigned long cost;
	unsigned long flags;
};
28
/*
 * em_perf_state flags:
 *
 * EM_PERF_STATE_INEFFICIENT: The performance state is inefficient. There is,
 * in this em_perf_domain, another performance state with a higher frequency
 * but a lower or equal power cost. Such inefficient states are ignored when
 * using em_pd_get_efficient_*() functions.
 */
#define EM_PERF_STATE_INEFFICIENT BIT(0)
38
/**
 * struct em_perf_domain - Performance domain
 * @table:		List of performance states, in ascending order
 * @nr_perf_states:	Number of performance states
 * @flags:		See "em_perf_domain flags"
 * @cpus:		Cpumask covering the CPUs of the domain. It's here
 *			for performance reasons to avoid potential cache
 *			misses during energy calculations in the scheduler
 *			and simplifies allocating/freeing that memory region.
 *
 * In case of CPU device, a "performance domain" represents a group of CPUs
 * whose performance is scaled together. All CPUs of a performance domain
 * must have the same micro-architecture. Performance domains often have
 * a 1-to-1 mapping with CPUFreq policies. In case of other devices the @cpus
 * field is unused.
 */
struct em_perf_domain {
	struct em_perf_state *table;
	int nr_perf_states;
	unsigned long flags;
	unsigned long cpus[];	/* flexible array member, must stay last */
};
61
/*
 * em_perf_domain flags:
 *
 * EM_PERF_DOMAIN_MICROWATTS: The power values are in micro-Watts or some
 * other scale.
 *
 * EM_PERF_DOMAIN_SKIP_INEFFICIENCIES: Skip inefficient states when estimating
 * energy consumption.
 *
 * EM_PERF_DOMAIN_ARTIFICIAL: The power values are artificial and might be
 * created by platform missing real power information.
 */
#define EM_PERF_DOMAIN_MICROWATTS BIT(0)
#define EM_PERF_DOMAIN_SKIP_INEFFICIENCIES BIT(1)
#define EM_PERF_DOMAIN_ARTIFICIAL BIT(2)
77
/* View the trailing 'cpus' bitmap of a performance domain as a cpumask. */
#define em_span_cpus(em) (to_cpumask((em)->cpus))
/* Non-zero when the domain has the EM_PERF_DOMAIN_ARTIFICIAL flag set. */
#define em_is_artificial(em) ((em)->flags & EM_PERF_DOMAIN_ARTIFICIAL)
80
81	#ifdef CONFIG_ENERGY_MODEL
/*
 * The max power value in micro-Watts. The limit of 64 Watts is set as
 * a safety net to not overflow multiplications on 32bit platforms. The
 * 32bit value limit for total Perf Domain power implies a limit of
 * maximum CPUs in such domain to 64.
 */
#define EM_MAX_POWER (64000000) /* 64 Watts */
89
/*
 * To avoid possible energy estimation overflow on 32bit machines add
 * limits to number of CPUs in the Perf. Domain.
 * We are safe on 64bit machine, thus some big number.
 */
#ifdef CONFIG_64BIT
#define EM_MAX_NUM_CPUS 4096
#else
#define EM_MAX_NUM_CPUS 16
#endif
100
/*
 * To avoid an overflow on 32bit machines while calculating the energy
 * use a different order in the operation. First divide by the 'cpu_scale'
 * which would reduce big value stored in the 'cost' field, then multiply by
 * the 'sum_util'. This would allow to handle existing platforms, which have
 * e.g. power ~1.3 Watt at max freq, so the 'cost' value > 1mln micro-Watts.
 * In such scenario, where there are 4 CPUs in the Perf. Domain the 'sum_util'
 * could be 4096, then multiplication: 'cost' * 'sum_util' would overflow.
 * This reordering of operations has some limitations, we lose small
 * precision in the estimation (comparing to 64bit platform w/o reordering).
 *
 * We are safe on 64bit machine.
 */
#ifdef CONFIG_64BIT
#define em_estimate_energy(cost, sum_util, scale_cpu) \
	(((cost) * (sum_util)) / (scale_cpu))
#else
#define em_estimate_energy(cost, sum_util, scale_cpu) \
	(((cost) / (scale_cpu)) * (sum_util))
#endif
121
struct em_data_callback {
	/**
	 * active_power() - Provide power at the next performance state of
	 *		a device
	 * @dev		: Device for which we do this operation (can be a CPU)
	 * @power	: Active power at the performance state
	 *		(modified)
	 * @freq	: Frequency at the performance state in kHz
	 *		(modified)
	 *
	 * active_power() must find the lowest performance state of 'dev' above
	 * 'freq' and update 'power' and 'freq' to the matching active power
	 * and frequency.
	 *
	 * In case of CPUs, the power is the one of a single CPU in the domain,
	 * expressed in micro-Watts or an abstract scale. It is expected to
	 * fit in the [0, EM_MAX_POWER] range.
	 *
	 * Return 0 on success.
	 */
	int (*active_power)(struct device *dev, unsigned long *power,
			    unsigned long *freq);

	/**
	 * get_cost() - Provide the cost at the given performance state of
	 *		a device
	 * @dev		: Device for which we do this operation (can be a CPU)
	 * @freq	: Frequency at the performance state in kHz
	 * @cost	: The cost value for the performance state
	 *		(modified)
	 *
	 * In case of CPUs, the cost is the one of a single CPU in the domain.
	 * It is expected to fit in the [0, EM_MAX_POWER] range due to internal
	 * usage in EAS calculation.
	 *
	 * Return 0 on success, or appropriate error value in case of failure.
	 */
	int (*get_cost)(struct device *dev, unsigned long freq,
			unsigned long *cost);
};
/* Install @cb as the active_power callback of the em_data_callback @em_cb. */
#define EM_SET_ACTIVE_POWER_CB(em_cb, cb) ((em_cb).active_power = (cb))
/* Initializer for an em_data_callback providing both callbacks. */
#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb)	\
	{ .active_power = _active_power_cb,		\
	  .get_cost = _cost_cb }
/* Initializer for an em_data_callback with no get_cost callback. */
#define EM_DATA_CB(_active_power_cb)			\
		EM_ADV_DATA_CB(_active_power_cb, NULL)
168
169	struct em_perf_domain em_cpu_get(int* cpu);
170	struct em_perf_domain em_pd_get(struct* device *dev);
171	int em_dev_register_perf_domain(struct device dev, unsigned* int nr_states,
172	struct em_data_callback cb, cpumask_t span,
173	bool microwatts);
174	void em_dev_unregister_perf_domain(struct device *dev);
175
176	/**
177	* em_pd_get_efficient_state() - Get an efficient performance state from the EM
178	* @pd : Performance domain for which we want an efficient frequency
179	* @freq : Frequency to map with the EM
180	*
181	* It is called from the scheduler code quite frequently and as a consequence
182	* doesn't implement any check.
183	*
184	* Return: An efficient performance state, high enough to meet @freq
185	* requirement.
186	*/
187	static inline
188	struct em_perf_state em_pd_get_efficient_state(struct* em_perf_domain *pd,
189	unsigned long freq)
190	{
191	struct em_perf_state *ps;
192	int i;
193
194	for (i = `0`; i < pd->nr_perf_states; i++) {
195	ps = &pd->table[i];
196	if (ps->frequency >= freq) {
197	if (pd->flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
198	ps->flags & EM_PERF_STATE_INEFFICIENT)
199	continue;
200	break;
201	}
202	}
203
204	return ps;
205	}
206
207	/**
208	* em_cpu_energy() - Estimates the energy consumed by the CPUs of a
209	* performance domain
210	* @pd : performance domain for which energy has to be estimated
211	* @max_util : highest utilization among CPUs of the domain
212	* @sum_util : sum of the utilization of all CPUs in the domain
213	* @allowed_cpu_cap : maximum allowed CPU capacity for the @pd, which
214	* might reflect reduced frequency (due to thermal)
215	*
216	* This function must be used only for CPU devices. There is no validation,
217	* i.e. if the EM is a CPU type and has cpumask allocated. It is called from
218	* the scheduler code quite frequently and that is why there is not checks.
219	*
220	* Return: the sum of the energy consumed by the CPUs of the domain assuming
221	* a capacity state satisfying the max utilization of the domain.
222	*/
223	static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
224	unsigned long max_util, unsigned long sum_util,
225	unsigned long allowed_cpu_cap)
226	{
227	unsigned long freq, scale_cpu;
228	struct em_perf_state *ps;
229	int cpu;
230
231	if (!sum_util)
232	return `0`;
233
234	/*
235	* In order to predict the performance state, map the utilization of
236	* the most utilized CPU of the performance domain to a requested
237	* frequency, like schedutil. Take also into account that the real
238	* frequency might be set lower (due to thermal capping). Thus, clamp
239	* max utilization to the allowed CPU capacity before calculating
240	* effective frequency.
241	*/
242	cpu = cpumask_first(to_cpumask(pd->cpus));
243	scale_cpu = arch_scale_cpu_capacity(cpu);
244	ps = &pd->table[pd->nr_perf_states - `1`];
245
246	max_util = map_util_perf(util: max_util);
247	max_util = min(max_util, allowed_cpu_cap);
248	freq = map_util_freq(util: max_util, freq: ps->frequency, cap: scale_cpu);
249
250	/*
251	* Find the lowest performance state of the Energy Model above the
252	* requested frequency.
253	*/
254	ps = em_pd_get_efficient_state(pd, freq);
255
256	/*
257	* The capacity of a CPU in the domain at the performance state (ps)
258	* can be computed as:
259	*
260	* ps->freq * scale_cpu
261	* ps->cap = -------------------- (1)
262	* cpu_max_freq
263	*
264	* So, ignoring the costs of idle states (which are not available in
265	* the EM), the energy consumed by this CPU at that performance state
266	* is estimated as:
267	*
268	* ps->power * cpu_util
269	* cpu_nrg = -------------------- (2)
270	* ps->cap
271	*
272	* since 'cpu_util / ps->cap' represents its percentage of busy time.
273	*
274	* NOTE: Although the result of this computation actually is in
275	* units of power, it can be manipulated as an energy value
276	* over a scheduling period, since it is assumed to be
277	* constant during that interval.
278	*
279	* By injecting (1) in (2), 'cpu_nrg' can be re-expressed as a product
280	* of two terms:
281	*
282	* ps->power * cpu_max_freq cpu_util
283	* cpu_nrg = ------------------------ * --------- (3)
284	* ps->freq scale_cpu
285	*
286	* The first term is static, and is stored in the em_perf_state struct
287	* as 'ps->cost'.
288	*
289	* Since all CPUs of the domain have the same micro-architecture, they
290	* share the same 'ps->cost', and the same CPU capacity. Hence, the
291	* total energy of the domain (which is the simple sum of the energy of
292	* all of its CPUs) can be factorized as:
293	*
294	* ps->cost * \Sum cpu_util
295	* pd_nrg = ------------------------ (4)
296	* scale_cpu
297	*/
298	return em_estimate_energy(ps->cost, sum_util, scale_cpu);
299	}
300
301	/**
302	* em_pd_nr_perf_states() - Get the number of performance states of a perf.
303	* domain
304	* @pd : performance domain for which this must be done
305	*
306	* Return: the number of performance states in the performance domain table
307	*/
308	static inline int em_pd_nr_perf_states(struct em_perf_domain *pd)
309	{
310	return pd->nr_perf_states;
311	}
312
313	#else
/*
 * CONFIG_ENERGY_MODEL disabled: the callback container is empty and the
 * helper macros discard their arguments.
 */
struct em_data_callback {};
#define EM_ADV_DATA_CB(_active_power_cb, _cost_cb) { }
#define EM_DATA_CB(_active_power_cb) { }
#define EM_SET_ACTIVE_POWER_CB(em_cb, cb) do { } while (0)
318
319	static inline
320	int em_dev_register_perf_domain(struct device dev, unsigned* int nr_states,
321	struct em_data_callback cb, cpumask_t span,
322	bool microwatts)
323	{
324	return -EINVAL;
325	}
/* Stub: nothing to unregister without CONFIG_ENERGY_MODEL. */
static inline void em_dev_unregister_perf_domain(struct device *dev)
{
}
/* Stub: always returns NULL without CONFIG_ENERGY_MODEL. */
static inline struct em_perf_domain *em_cpu_get(int cpu)
{
	return NULL;
}
/* Stub: always returns NULL without CONFIG_ENERGY_MODEL. */
static inline struct em_perf_domain *em_pd_get(struct device *dev)
{
	return NULL;
}
/* Stub: the energy estimate is always 0 without CONFIG_ENERGY_MODEL. */
static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
			unsigned long max_util, unsigned long sum_util,
			unsigned long allowed_cpu_cap)
{
	return 0;
}
/* Stub: reports 0 performance states without CONFIG_ENERGY_MODEL. */
static inline int em_pd_nr_perf_states(struct em_perf_domain *pd)
{
	return 0;
}
347	#endif
348
349	#endif
350

source code of linux/include/linux/energy_model.h