// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2016 Thomas Gleixner.
 * Copyright (C) 2016-2017 Christoph Hellwig.
 */
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/cpu.h>
#include <linux/sort.h>
#include <linux/group_cpus.h>

#ifdef CONFIG_SMP

static void grp_spread_init_one(struct cpumask *irqmsk, struct cpumask *nmsk,
				unsigned int cpus_per_grp)
{
	const struct cpumask *siblmsk;
	int cpu, sibl;

	for ( ; cpus_per_grp > 0; ) {
		cpu = cpumask_first(nmsk);

		/* Should not happen, but I'm too lazy to think about it */
		if (cpu >= nr_cpu_ids)
			return;

		cpumask_clear_cpu(cpu, nmsk);
		cpumask_set_cpu(cpu, irqmsk);
		cpus_per_grp--;

		/* If the cpu has siblings, use them first */
		siblmsk = topology_sibling_cpumask(cpu);
		for (sibl = -1; cpus_per_grp > 0; ) {
			sibl = cpumask_next(sibl, siblmsk);
			if (sibl >= nr_cpu_ids)
				break;
			if (!cpumask_test_and_clear_cpu(sibl, nmsk))
				continue;
			cpumask_set_cpu(sibl, irqmsk);
			cpus_per_grp--;
		}
	}
}
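
/*
 * Illustration (hypothetical topology, documentation only): with
 * nmsk = {0,1,2,3}, CPUs 0/1 and 2/3 being sibling pairs, and
 * cpus_per_grp = 3, the loop above picks CPU 0, then drains its
 * sibling CPU 1 before moving on, so irqmsk ends up as {0,1,2}
 * rather than a grouping that needlessly splits both sibling pairs.
 */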

static cpumask_var_t *alloc_node_to_cpumask(void)
{
	cpumask_var_t *masks;
	int node;

	masks = kcalloc(nr_node_ids, sizeof(cpumask_var_t), GFP_KERNEL);
	if (!masks)
		return NULL;

	for (node = 0; node < nr_node_ids; node++) {
		if (!zalloc_cpumask_var(&masks[node], GFP_KERNEL))
			goto out_unwind;
	}

	return masks;

out_unwind:
	while (--node >= 0)
		free_cpumask_var(masks[node]);
	kfree(masks);
	return NULL;
}

static void free_node_to_cpumask(cpumask_var_t *masks)
{
	int node;

	for (node = 0; node < nr_node_ids; node++)
		free_cpumask_var(masks[node]);
	kfree(masks);
}

static void build_node_to_cpumask(cpumask_var_t *masks)
{
	int cpu;

	for_each_possible_cpu(cpu)
		cpumask_set_cpu(cpu, masks[cpu_to_node(cpu)]);
}

static int get_nodes_in_cpumask(cpumask_var_t *node_to_cpumask,
				const struct cpumask *mask, nodemask_t *nodemsk)
{
	int n, nodes = 0;

	/* Calculate the number of nodes in the supplied affinity mask */
	for_each_node(n) {
		if (cpumask_intersects(mask, node_to_cpumask[n])) {
			node_set(n, *nodemsk);
			nodes++;
		}
	}
	return nodes;
}

struct node_groups {
	unsigned id;

	union {
		unsigned ngroups;
		unsigned ncpus;
	};
};
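
/*
 * Note on the union above: ->ncpus is only meaningful while group counts
 * are being computed (it is the sort key in alloc_nodes_groups()), after
 * which the same slot is reused to hold the node's allocated group count
 * in ->ngroups.  A node with no active CPUs keeps the UINT_MAX sentinel
 * in either view.
 */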

static int ncpus_cmp_func(const void *l, const void *r)
{
	const struct node_groups *ln = l;
	const struct node_groups *rn = r;

	return ln->ncpus - rn->ncpus;
}

/*
 * Allocate a group count for each node, so that for each node:
 *
 * 1) the allocated number is >= 1
 *
 * 2) the allocated number is <= the number of active CPUs of this node
 *
 * The total number of allocated groups may be less than @numgrps when
 * the total number of active CPUs is less than @numgrps.
 *
 * Active CPUs means the CPUs in '@cpu_mask AND @node_to_cpumask[]'
 * for each node.
 */
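
/*
 * Worked example (assumed numbers, documentation only): two active nodes
 * with ncpus(A) = 2 and ncpus(B) = 6, and numgrps = 3.  After sorting,
 * node A is handled first: grps(A) = max(1, 3 * 2 / 8) = 1, which leaves
 * remaining_ncpus = 6 and numgrps = 2; node B then gets
 * grps(B) = max(1, 2 * 6 / 6) = 2, so all three groups are assigned.
 */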
static void alloc_nodes_groups(unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       const nodemask_t nodemsk,
			       struct cpumask *nmsk,
			       struct node_groups *node_groups)
{
	unsigned n, remaining_ncpus = 0;

	for (n = 0; n < nr_node_ids; n++) {
		node_groups[n].id = n;
		node_groups[n].ncpus = UINT_MAX;
	}

	for_each_node_mask(n, nodemsk) {
		unsigned ncpus;

		cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
		ncpus = cpumask_weight(nmsk);

		if (!ncpus)
			continue;
		remaining_ncpus += ncpus;
		node_groups[n].ncpus = ncpus;
	}

	numgrps = min_t(unsigned, remaining_ncpus, numgrps);

	sort(node_groups, nr_node_ids, sizeof(node_groups[0]),
	     ncpus_cmp_func, NULL);

	/*
	 * Allocate groups for each node according to the ratio of this
	 * node's nr_cpus to the remaining un-assigned ncpus. 'numgrps'
	 * is bigger than the number of active numa nodes. Always start
	 * the allocation from the node with the smallest nr_cpus.
	 *
	 * This guarantees that each active node gets allocated at least
	 * one group, and the reasoning is simple: over-allocation only
	 * happens when a node would otherwise get zero groups and is
	 * bumped to one, so the other nodes are still allocated >= 1
	 * group each, since 'numgrps' is bigger than the number of numa
	 * nodes.
	 *
	 * One useful invariant is that the number of allocated groups
	 * for each node is <= the CPU count of this node:
	 *
	 * 1) suppose there are two nodes: A and B
	 *	ncpu(X) is the CPU count of node X
	 *	grps(X) is the group count allocated to node X via this
	 *	algorithm
	 *
	 *	ncpu(A) <= ncpu(B)
	 *	ncpu(A) + ncpu(B) = N
	 *	grps(A) + grps(B) = G
	 *
	 *	grps(A) = max(1, round_down(G * ncpu(A) / N))
	 *	grps(B) = G - grps(A)
	 *
	 *	both N and G are integers, and 2 <= G <= N, suppose
	 *	G = N - delta, and 0 <= delta <= N - 2
	 *
	 * 2) obviously grps(A) <= ncpu(A) because:
	 *
	 *	if grps(A) is 1, then grps(A) <= ncpu(A) given
	 *	ncpu(A) >= 1
	 *
	 *	otherwise,
	 *		grps(A) <= G * ncpu(A) / N <= ncpu(A), given G <= N
	 *
	 * 3) prove that grps(B) <= ncpu(B):
	 *
	 *	if round_down(G * ncpu(A) / N) == 0, grps(B) won't be
	 *	over-allocated, so grps(B) <= ncpu(B),
	 *
	 *	otherwise:
	 *
	 *	grps(A) =
	 *		round_down(G * ncpu(A) / N) =
	 *		round_down((N - delta) * ncpu(A) / N) =
	 *		round_down((N * ncpu(A) - delta * ncpu(A)) / N) >=
	 *		round_down((N * ncpu(A) - delta * N) / N) =
	 *		ncpu(A) - delta
	 *
	 *	then:
	 *
	 *	grps(A) - G >= ncpu(A) - delta - G
	 *	=>
	 *	G - grps(A) <= G + delta - ncpu(A)
	 *	=>
	 *	grps(B) <= N - ncpu(A)
	 *	=>
	 *	grps(B) <= ncpu(B)
	 *
	 * For three or more nodes, the proof applies recursively: treat
	 * the smallest node as A and the remaining nodes combined as one
	 * big node B, which matches exactly how this algorithm proceeds,
	 * because 'remaining_ncpus' and 'numgrps' are re-calculated
	 * after each node, so finally for each node X: grps(X) <= ncpu(X).
	 *
	 */
	for (n = 0; n < nr_node_ids; n++) {
		unsigned ngroups, ncpus;

		if (node_groups[n].ncpus == UINT_MAX)
			continue;

		WARN_ON_ONCE(numgrps == 0);

		ncpus = node_groups[n].ncpus;
		ngroups = max_t(unsigned, 1,
				numgrps * ncpus / remaining_ncpus);
		WARN_ON_ONCE(ngroups > ncpus);

		node_groups[n].ngroups = ngroups;

		remaining_ncpus -= ncpus;
		numgrps -= ngroups;
	}
}

static int __group_cpus_evenly(unsigned int startgrp, unsigned int numgrps,
			       cpumask_var_t *node_to_cpumask,
			       const struct cpumask *cpu_mask,
			       struct cpumask *nmsk, struct cpumask *masks)
{
	unsigned int i, n, nodes, cpus_per_grp, extra_grps, done = 0;
	unsigned int last_grp = numgrps;
	unsigned int curgrp = startgrp;
	nodemask_t nodemsk = NODE_MASK_NONE;
	struct node_groups *node_groups;

	if (cpumask_empty(cpu_mask))
		return 0;

	nodes = get_nodes_in_cpumask(node_to_cpumask, cpu_mask, &nodemsk);

	/*
	 * If the number of nodes in the mask is greater than or equal to
	 * the number of groups, we just spread the groups across the
	 * nodes.
	 */
	if (numgrps <= nodes) {
		for_each_node_mask(n, nodemsk) {
			/* Ensure that only CPUs which are in both masks are set */
			cpumask_and(nmsk, cpu_mask, node_to_cpumask[n]);
			cpumask_or(&masks[curgrp], &masks[curgrp], nmsk);
			if (++curgrp == last_grp)
				curgrp = 0;
		}
		return numgrps;
	}

	node_groups = kcalloc(nr_node_ids,
			      sizeof(struct node_groups),
			      GFP_KERNEL);
	if (!node_groups)
		return -ENOMEM;

	/* allocate the number of groups for each node */
	alloc_nodes_groups(numgrps, node_to_cpumask, cpu_mask,
			   nodemsk, nmsk, node_groups);
	for (i = 0; i < nr_node_ids; i++) {
		unsigned int ncpus, v;
		struct node_groups *nv = &node_groups[i];

		if (nv->ngroups == UINT_MAX)
			continue;

		/* Get the cpus on this node which are in the mask */
		cpumask_and(nmsk, cpu_mask, node_to_cpumask[nv->id]);
		ncpus = cpumask_weight(nmsk);
		if (!ncpus)
			continue;

		WARN_ON_ONCE(nv->ngroups > ncpus);

		/* Account for rounding errors */
		extra_grps = ncpus - nv->ngroups * (ncpus / nv->ngroups);
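		/*
		 * e.g. (illustrative numbers): ncpus = 5 and
		 * nv->ngroups = 2 give extra_grps = 1, so the first
		 * group below gets 3 CPUs and the second gets 2.
		 */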

		/* Spread allocated groups on CPUs of the current node */
		for (v = 0; v < nv->ngroups; v++, curgrp++) {
			cpus_per_grp = ncpus / nv->ngroups;

			/* Account for extra groups to compensate rounding errors */
			if (extra_grps) {
				cpus_per_grp++;
				--extra_grps;
			}

			/*
			 * wrapping has to be considered given 'startgrp'
			 * may start anywhere
			 */
			if (curgrp >= last_grp)
				curgrp = 0;
			grp_spread_init_one(&masks[curgrp], nmsk,
					    cpus_per_grp);
		}
		done += nv->ngroups;
	}
	kfree(node_groups);
	return done;
}

/**
 * group_cpus_evenly - Group all CPUs evenly per NUMA/CPU locality
 * @numgrps: number of groups
 *
 * Return: cpumask array if successful, NULL otherwise. Each array
 * element contains the CPUs assigned to that group.
 *
 * Try to put CPUs that are close in terms of CPU and NUMA locality into
 * the same group, and run a two-stage grouping:
 *	1) allocate present CPUs to these groups evenly first
 *	2) allocate other possible CPUs to these groups evenly
 *
 * The resulting grouping guarantees that all CPUs are covered and that
 * no CPU is assigned to more than one group.
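 *
 * Example (illustrative sketch only; setup_queue() is a hypothetical
 * driver helper, and nr_queues/i are assumed to be declared by the
 * caller):
 *
 *	struct cpumask *masks = group_cpus_evenly(nr_queues);
 *
 *	if (masks) {
 *		for (i = 0; i < nr_queues; i++)
 *			setup_queue(i, &masks[i]);
 *		kfree(masks);
 *	}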
 */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	unsigned int curgrp = 0, nr_present = 0, nr_others = 0;
	cpumask_var_t *node_to_cpumask;
	cpumask_var_t nmsk, npresmsk;
	int ret = -ENOMEM;
	struct cpumask *masks = NULL;

	if (!zalloc_cpumask_var(&nmsk, GFP_KERNEL))
		return NULL;

	if (!zalloc_cpumask_var(&npresmsk, GFP_KERNEL))
		goto fail_nmsk;

	node_to_cpumask = alloc_node_to_cpumask();
	if (!node_to_cpumask)
		goto fail_npresmsk;

	masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);
	if (!masks)
		goto fail_node_to_cpumask;

	build_node_to_cpumask(node_to_cpumask);

	/*
	 * Make a local cache of 'cpu_present_mask' so that the two-stage
	 * spread can observe a consistent 'cpu_present_mask' without
	 * holding the CPU hotplug lock; this reduces the deadlock risk
	 * with the CPU hotplug code.
	 *
	 * A CPU hotplug event may race with this read of
	 * 'cpu_present_mask'. We can live with that because it only
	 * affects whether the hotplugged CPU is handled in the 1st or
	 * the 2nd stage, and either way is correct from the API user's
	 * viewpoint, since the two-stage spread is merely an
	 * optimization.
	 */
	cpumask_copy(npresmsk, data_race(cpu_present_mask));

	/* grouping present CPUs first */
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret < 0)
		goto fail_build_affinity;
	nr_present = ret;

	/*
	 * Allocate non present CPUs starting from the next group to be
	 * handled. If the grouping of present CPUs already exhausted the
	 * group space, assign the non present CPUs to the already
	 * allocated groups.
	 */
	if (nr_present >= numgrps)
		curgrp = 0;
	else
		curgrp = nr_present;
	cpumask_andnot(npresmsk, cpu_possible_mask, npresmsk);
	ret = __group_cpus_evenly(curgrp, numgrps, node_to_cpumask,
				  npresmsk, nmsk, masks);
	if (ret >= 0)
		nr_others = ret;

 fail_build_affinity:
	if (ret >= 0)
		WARN_ON(nr_present + nr_others < numgrps);

 fail_node_to_cpumask:
	free_node_to_cpumask(node_to_cpumask);

 fail_npresmsk:
	free_cpumask_var(npresmsk);

 fail_nmsk:
	free_cpumask_var(nmsk);
	if (ret < 0) {
		kfree(masks);
		return NULL;
	}
	return masks;
}
#else /* CONFIG_SMP */
struct cpumask *group_cpus_evenly(unsigned int numgrps)
{
	struct cpumask *masks = kcalloc(numgrps, sizeof(*masks), GFP_KERNEL);

	if (!masks)
		return NULL;

	/* assign all CPUs (cpu 0) to the 1st group only */
	cpumask_copy(&masks[0], cpu_possible_mask);
	return masks;
}
#endif /* CONFIG_SMP */
EXPORT_SYMBOL_GPL(group_cpus_evenly);