// SPDX-License-Identifier: GPL-2.0-only
/*
 * kernel/sched/cpupri.c
 *
 * CPU priority management
 *
 * Copyright (C) 2007-2008 Novell
 *
 * Author: Gregory Haskins <ghaskins@novell.com>
 *
 * This code tracks the priority of each CPU so that global migration
 * decisions are easy to calculate. Each CPU can be in one of the following
 * states:
 *
 * (INVALID), NORMAL, RT1, ... RT99, HIGHER
 *
 * going from the lowest priority to the highest. CPUs in the INVALID state
 * are not eligible for routing. The system maintains this state with
 * a 2 dimensional bitmap (the first dimension for priority class, the second
 * for CPUs in that class). Therefore a typical application without affinity
 * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
 * searches). For tasks with affinity restrictions, the algorithm has a
 * worst case complexity of O(min(101, nr_domcpus)), though the scenario that
 * yields the worst case search is fairly contrived.
 */

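/*
 * For orientation, the backing data structures live in cpupri.h and look
 * roughly like the sketch below (simplified; see the header for the
 * authoritative definitions):
 *
 *	struct cpupri_vec {
 *		atomic_t	count;		// nr of CPUs in this class
 *		cpumask_var_t	mask;		// which CPUs they are
 *	};
 *
 *	struct cpupri {
 *		struct cpupri_vec pri_to_cpu[CPUPRI_NR_PRIORITIES];
 *		int		 *cpu_to_pri;	// per-CPU current class
 *	};
 */
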
/*
 * p->rt_priority   p->prio   newpri   cpupri
 *
 *                               -1       -1 (CPUPRI_INVALID)
 *
 *                               99        0 (CPUPRI_NORMAL)
 *
 *             1        98       98        1
 *           ...
 *            49        50       50       49
 *            50        49       49       50
 *           ...
 *            99         0        0       99
 *
 *                              100      100 (CPUPRI_HIGHER)
 */
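/*
 * Worked examples of the mapping implemented below (MAX_RT_PRIO is 100,
 * so an RT task's p->prio of 0..98 inverts to a cpupri of 99..1):
 *
 *	convert_prio(0)   == 99			// highest RT prio
 *	convert_prio(98)  == 1			// lowest "real" RT prio
 *	convert_prio(99)  == CPUPRI_NORMAL	// MAX_RT_PRIO-1, i.e. 0
 *	convert_prio(100) == CPUPRI_HIGHER	// MAX_RT_PRIO, i.e. 100
 */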
static int convert_prio(int prio)
{
	int cpupri;

	switch (prio) {
	case CPUPRI_INVALID:
		cpupri = CPUPRI_INVALID;	/* -1 */
		break;

	case 0 ... 98:
		cpupri = MAX_RT_PRIO-1 - prio;	/* 1 ... 99 */
		break;

	case MAX_RT_PRIO-1:
		cpupri = CPUPRI_NORMAL;		/* 0 */
		break;

	case MAX_RT_PRIO:
		cpupri = CPUPRI_HIGHER;		/* 100 */
		break;
	}

	return cpupri;
}

static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p,
				struct cpumask *lowest_mask, int idx)
{
	struct cpupri_vec *vec = &cp->pri_to_cpu[idx];
	int skip = 0;

	if (!atomic_read(&(vec)->count))
		skip = 1;
	/*
	 * When looking at the vector, we need to read the counter,
	 * do a memory barrier, then read the mask.
	 *
	 * Note: This is still all racy, but we can deal with it.
	 * Ideally, we only want to look at masks that are set.
	 *
	 * If a mask is not set, then the only thing wrong is that we
	 * did a little more work than necessary.
	 *
	 * If we read a zero count but the mask is set, because of the
	 * memory barriers, that can only happen when the highest prio
	 * task for a run queue has left the run queue, in which case,
	 * it will be followed by a pull. If the task we are processing
	 * fails to find a proper place to go, that pull request will
	 * pull this task if the run queue is running at a lower
	 * priority.
	 */
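	/*
	 * Illustrative sketch of the barrier pairing with cpupri_set()
	 * (a summary of the ordering below, not additional code):
	 *
	 *	cpupri_set() (writer)		__cpupri_find() (reader)
	 *	---------------------		------------------------
	 *	cpumask_set_cpu(cpu, mask)	atomic_read(&vec->count)
	 *	smp_mb__before_atomic()		smp_rmb()
	 *	atomic_inc(&vec->count)		cpumask_any_and(.., vec->mask)
	 *
	 * A reader that observes count != 0 is thus guaranteed to also
	 * observe the mask bit that was set before the increment.
	 */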
	smp_rmb();

	/* Need to do the rmb for every iteration */
	if (skip)
		return 0;

	if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids)
		return 0;

	if (lowest_mask) {
		cpumask_and(lowest_mask, &p->cpus_mask, vec->mask);
		cpumask_and(lowest_mask, lowest_mask, cpu_active_mask);

		/*
		 * We have to ensure that we have at least one bit
		 * still set in the array, since the map could have
		 * been concurrently emptied between the first and
		 * second reads of vec->mask. If we hit this
		 * condition, simply act as though we never hit this
		 * priority level and continue on.
		 */
		if (cpumask_empty(lowest_mask))
			return 0;
	}

	return 1;
}

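/**
 * cpupri_find - find the best (lowest-pri) CPU in the system
 * @cp: The cpupri context
 * @p: The task
 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
 *
 * Convenience wrapper around cpupri_find_fitness() without a fitness
 * callback; see that function for the full semantics.
 *
 * Return: (int)bool - CPUs were found
 */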
int cpupri_find(struct cpupri *cp, struct task_struct *p,
		struct cpumask *lowest_mask)
{
	return cpupri_find_fitness(cp, p, lowest_mask, NULL);
}

/**
 * cpupri_find_fitness - find the best (lowest-pri) CPU in the system
 * @cp: The cpupri context
 * @p: The task
 * @lowest_mask: A mask to fill in with selected CPUs (or NULL)
 * @fitness_fn: A pointer to a function that does custom checks whether the
 *   CPU fits specific criteria, so that we return only those CPUs.
 *
 * Note: This function returns the recommended CPUs as calculated during the
 * current invocation. By the time the call returns, the CPUs may have in
 * fact changed priorities any number of times. While not ideal, it is not
 * an issue of correctness since the normal rebalancer logic will correct
 * any discrepancies created by racing against the uncertainty of the current
 * priority configuration.
 *
 * Return: (int)bool - CPUs were found
 */
int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
			struct cpumask *lowest_mask,
			bool (*fitness_fn)(struct task_struct *p, int cpu))
{
	int task_pri = convert_prio(p->prio);
	int idx, cpu;

	WARN_ON_ONCE(task_pri >= CPUPRI_NR_PRIORITIES);

	for (idx = 0; idx < task_pri; idx++) {

		if (!__cpupri_find(cp, p, lowest_mask, idx))
			continue;

		if (!lowest_mask || !fitness_fn)
			return 1;

		/* Ensure the capacity of the CPUs fits the task */
		for_each_cpu(cpu, lowest_mask) {
			if (!fitness_fn(p, cpu))
				cpumask_clear_cpu(cpu, lowest_mask);
		}

		/*
		 * If no CPU at the current priority can fit the task,
		 * continue looking
		 */
		if (cpumask_empty(lowest_mask))
			continue;

		return 1;
	}

	/*
	 * If we failed to find a fitting lowest_mask, kick off a new search
	 * but without taking into account any fitness criteria this time.
	 *
	 * This rule favours honouring priority over fitting the task in the
	 * correct CPU (Capacity Awareness being the only user now).
	 * The idea is that if a higher priority task can run, then it should
	 * run even if this ends up being on an unfitting CPU.
	 *
	 * The cost of this trade-off is not entirely clear and will probably
	 * be good for some workloads and bad for others.
	 *
	 * The main idea here is that if some CPUs were over-committed, we try
	 * to spread, which is what the scheduler traditionally did. Sysadmins
	 * must do proper RT planning to avoid overloading the system if they
	 * really care.
	 */
	if (fitness_fn)
		return cpupri_find(cp, p, lowest_mask);

	return 0;
}
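
/*
 * A hedged usage sketch: this is roughly how the RT scheduler's
 * find_lowest_rq() in kernel/sched/rt.c drives this API. The helper
 * name example_pick_cpu() is hypothetical and the logic is simplified;
 * it only illustrates the calling convention, not the real policy:
 *
 *	static int example_pick_cpu(struct root_domain *rd,
 *				    struct task_struct *p,
 *				    struct cpumask *lowest_mask)
 *	{
 *		// Fill lowest_mask with CPUs running below p's priority.
 *		if (!cpupri_find(&rd->cpupri, p, lowest_mask))
 *			return -1;	// no lower-priority CPU found
 *
 *		// Prefer the task's current CPU for cache locality.
 *		if (cpumask_test_cpu(task_cpu(p), lowest_mask))
 *			return task_cpu(p);
 *
 *		return cpumask_any(lowest_mask);
 *	}
 */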

/**
 * cpupri_set - update the CPU priority setting
 * @cp: The cpupri context
 * @cpu: The target CPU
 * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU
 *
 * Note: Assumes cpu_rq(cpu)->lock is locked
 *
 * Returns: (void)
 */
void cpupri_set(struct cpupri *cp, int cpu, int newpri)
{
	int *currpri = &cp->cpu_to_pri[cpu];
	int oldpri = *currpri;
	int do_mb = 0;

	newpri = convert_prio(newpri);

	BUG_ON(newpri >= CPUPRI_NR_PRIORITIES);

	if (newpri == oldpri)
		return;

	/*
	 * If the CPU was currently mapped to a different value, we
	 * need to map it to the new value then remove the old value.
	 * Note, we must add the new value first, otherwise we risk the
	 * cpu being missed by the priority loop in cpupri_find.
	 */
	if (likely(newpri != CPUPRI_INVALID)) {
		struct cpupri_vec *vec = &cp->pri_to_cpu[newpri];

		cpumask_set_cpu(cpu, vec->mask);
		/*
		 * When adding a new vector, we update the mask first,
		 * do a write memory barrier, and then update the count, to
		 * make sure the vector is visible when count is set.
		 */
		smp_mb__before_atomic();
		atomic_inc(&(vec)->count);
		do_mb = 1;
	}
	if (likely(oldpri != CPUPRI_INVALID)) {
		struct cpupri_vec *vec = &cp->pri_to_cpu[oldpri];

		/*
		 * Because the order of modification of the vec->count
		 * is important, we must make sure that the update
		 * of the new prio is seen before we decrement the
		 * old prio. This makes sure that the loop sees
		 * one or the other when we raise the priority of
		 * the run queue. We don't care about when we lower the
		 * priority, as that will trigger an rt pull anyway.
		 *
		 * We only need to do a memory barrier if we updated
		 * the new priority vec.
		 */
		if (do_mb)
			smp_mb__after_atomic();

		/*
		 * When removing from the vector, we decrement the counter
		 * first, do a memory barrier, and then clear the mask.
		 */
		atomic_dec(&(vec)->count);
		smp_mb__after_atomic();
		cpumask_clear_cpu(cpu, vec->mask);
	}

	*currpri = newpri;
}
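
/*
 * Illustrative walkthrough (an example, not extra semantics): suppose
 * CPU 3 moves from RT priority 50 to RT priority 10, i.e.
 * cpupri_set(cp, 3, 10) while cp->cpu_to_pri[3] is 49 (convert_prio(50)).
 * Then:
 *
 *	1. newpri = convert_prio(10) = 89
 *	2. set bit 3 in pri_to_cpu[89].mask, barrier, count++
 *	3. barrier, pri_to_cpu[49].count--, barrier, clear bit 3 in
 *	   pri_to_cpu[49].mask
 *	4. cpu_to_pri[3] = 89
 *
 * During the window where both vectors contain CPU 3, a racing
 * cpupri_find() may see the CPU twice, which is harmless; the ordering
 * above guarantees it is never missed entirely.
 */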

/**
 * cpupri_init - initialize the cpupri structure
 * @cp: The cpupri context
 *
 * Return: 0 on success, -ENOMEM on memory allocation failure.
 */
int cpupri_init(struct cpupri *cp)
{
	int i;

	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++) {
		struct cpupri_vec *vec = &cp->pri_to_cpu[i];

		atomic_set(&vec->count, 0);
		if (!zalloc_cpumask_var(&vec->mask, GFP_KERNEL))
			goto cleanup;
	}

	cp->cpu_to_pri = kcalloc(nr_cpu_ids, sizeof(int), GFP_KERNEL);
	if (!cp->cpu_to_pri)
		goto cleanup;

	for_each_possible_cpu(i)
		cp->cpu_to_pri[i] = CPUPRI_INVALID;

	return 0;

cleanup:
	for (i--; i >= 0; i--)
		free_cpumask_var(cp->pri_to_cpu[i].mask);
	return -ENOMEM;
}

/**
 * cpupri_cleanup - clean up the cpupri structure
 * @cp: The cpupri context
 */
void cpupri_cleanup(struct cpupri *cp)
{
	int i;

	kfree(cp->cpu_to_pri);
	for (i = 0; i < CPUPRI_NR_PRIORITIES; i++)
		free_cpumask_var(cp->pri_to_cpu[i].mask);
}
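
/*
 * A hedged lifecycle sketch: init/cleanup are paired by the root-domain
 * code (see init_rootdomain() and free_rootdomain() in
 * kernel/sched/topology.c); simplified, the pattern is roughly:
 *
 *	if (cpupri_init(&rd->cpupri))
 *		goto free;			// propagate -ENOMEM
 *	...
 *	cpupri_cleanup(&rd->cpupri);		// on teardown
 */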