1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Scheduler topology setup/handling methods |
4 | */ |
5 | |
6 | #include <linux/bsearch.h> |
7 | |
8 | DEFINE_MUTEX(sched_domains_mutex); |
9 | |
10 | /* Protected by sched_domains_mutex: */ |
11 | static cpumask_var_t sched_domains_tmpmask; |
12 | static cpumask_var_t sched_domains_tmpmask2; |
13 | |
14 | #ifdef CONFIG_SCHED_DEBUG |
15 | |
16 | static int __init sched_debug_setup(char *str) |
17 | { |
18 | sched_debug_verbose = true; |
19 | |
20 | return 0; |
21 | } |
22 | early_param("sched_verbose" , sched_debug_setup); |
23 | |
24 | static inline bool sched_debug(void) |
25 | { |
26 | return sched_debug_verbose; |
27 | } |
28 | |
29 | #define SD_FLAG(_name, mflags) [__##_name] = { .meta_flags = mflags, .name = #_name }, |
30 | const struct sd_flag_debug sd_flag_debug[] = { |
31 | #include <linux/sched/sd_flags.h> |
32 | }; |
33 | #undef SD_FLAG |
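
/*
 * The SD_FLAG() x-macro above turns each entry of <linux/sched/sd_flags.h>
 * into one row of the table. For example, an entry such as
 * SD_FLAG(SD_BALANCE_NEWIDLE, SDF_SHARED_CHILD) expands to:
 *
 *   [__SD_BALANCE_NEWIDLE] = {
 *           .meta_flags = SDF_SHARED_CHILD,
 *           .name       = "SD_BALANCE_NEWIDLE",
 *   },
 *
 * so sd_flag_debug[] is indexed by the __SD_* bit position and carries the
 * flag name plus its SDF_* meta flags for the sanity checks below.
 */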
34 | |
35 | static int sched_domain_debug_one(struct sched_domain *sd, int cpu, int level, |
36 | struct cpumask *groupmask) |
37 | { |
38 | struct sched_group *group = sd->groups; |
39 | unsigned long flags = sd->flags; |
40 | unsigned int idx; |
41 | |
cpumask_clear(groupmask);

printk(KERN_DEBUG "%*s domain-%d: ", level, "", level);
printk(KERN_CONT "span=%*pbl level=%s\n",
cpumask_pr_args(sched_domain_span(sd)), sd->name);

if (!cpumask_test_cpu(cpu, sched_domain_span(sd))) {
printk(KERN_ERR "ERROR: domain->span does not contain CPU%d\n", cpu);
}
if (group && !cpumask_test_cpu(cpu, sched_group_span(group))) {
printk(KERN_ERR "ERROR: domain->groups does not contain CPU%d\n", cpu);
53 | } |
54 | |
55 | for_each_set_bit(idx, &flags, __SD_FLAG_CNT) { |
56 | unsigned int flag = BIT(idx); |
57 | unsigned int meta_flags = sd_flag_debug[idx].meta_flags; |
58 | |
59 | if ((meta_flags & SDF_SHARED_CHILD) && sd->child && |
60 | !(sd->child->flags & flag)) |
61 | printk(KERN_ERR "ERROR: flag %s set here but not in child\n" , |
62 | sd_flag_debug[idx].name); |
63 | |
64 | if ((meta_flags & SDF_SHARED_PARENT) && sd->parent && |
65 | !(sd->parent->flags & flag)) |
66 | printk(KERN_ERR "ERROR: flag %s set here but not in parent\n" , |
67 | sd_flag_debug[idx].name); |
68 | } |
69 | |
70 | printk(KERN_DEBUG "%*s groups:" , level + 1, "" ); |
71 | do { |
72 | if (!group) { |
73 | printk("\n" ); |
74 | printk(KERN_ERR "ERROR: group is NULL\n" ); |
75 | break; |
76 | } |
77 | |
78 | if (cpumask_empty(srcp: sched_group_span(sg: group))) { |
79 | printk(KERN_CONT "\n" ); |
80 | printk(KERN_ERR "ERROR: empty group\n" ); |
81 | break; |
82 | } |
83 | |
84 | if (!(sd->flags & SD_OVERLAP) && |
85 | cpumask_intersects(src1p: groupmask, src2p: sched_group_span(sg: group))) { |
86 | printk(KERN_CONT "\n" ); |
87 | printk(KERN_ERR "ERROR: repeated CPUs\n" ); |
88 | break; |
89 | } |
90 | |
91 | cpumask_or(dstp: groupmask, src1p: groupmask, src2p: sched_group_span(sg: group)); |
92 | |
93 | printk(KERN_CONT " %d:{ span=%*pbl" , |
94 | group->sgc->id, |
95 | cpumask_pr_args(sched_group_span(group))); |
96 | |
97 | if ((sd->flags & SD_OVERLAP) && |
98 | !cpumask_equal(src1p: group_balance_mask(sg: group), src2p: sched_group_span(sg: group))) { |
99 | printk(KERN_CONT " mask=%*pbl" , |
100 | cpumask_pr_args(group_balance_mask(group))); |
101 | } |
102 | |
103 | if (group->sgc->capacity != SCHED_CAPACITY_SCALE) |
104 | printk(KERN_CONT " cap=%lu" , group->sgc->capacity); |
105 | |
106 | if (group == sd->groups && sd->child && |
107 | !cpumask_equal(src1p: sched_domain_span(sd: sd->child), |
108 | src2p: sched_group_span(sg: group))) { |
109 | printk(KERN_ERR "ERROR: domain->groups does not match domain->child\n" ); |
110 | } |
111 | |
112 | printk(KERN_CONT " }" ); |
113 | |
114 | group = group->next; |
115 | |
116 | if (group != sd->groups) |
117 | printk(KERN_CONT "," ); |
118 | |
119 | } while (group != sd->groups); |
120 | printk(KERN_CONT "\n" ); |
121 | |
122 | if (!cpumask_equal(src1p: sched_domain_span(sd), src2p: groupmask)) |
123 | printk(KERN_ERR "ERROR: groups don't span domain->span\n" ); |
124 | |
125 | if (sd->parent && |
126 | !cpumask_subset(src1p: groupmask, src2p: sched_domain_span(sd: sd->parent))) |
127 | printk(KERN_ERR "ERROR: parent span is not a superset of domain->span\n" ); |
128 | return 0; |
129 | } |
130 | |
131 | static void sched_domain_debug(struct sched_domain *sd, int cpu) |
132 | { |
133 | int level = 0; |
134 | |
135 | if (!sched_debug_verbose) |
136 | return; |
137 | |
138 | if (!sd) { |
139 | printk(KERN_DEBUG "CPU%d attaching NULL sched-domain.\n" , cpu); |
140 | return; |
141 | } |
142 | |
143 | printk(KERN_DEBUG "CPU%d attaching sched-domain(s):\n" , cpu); |
144 | |
145 | for (;;) { |
if (sched_domain_debug_one(sd, cpu, level, sched_domains_tmpmask))
147 | break; |
148 | level++; |
149 | sd = sd->parent; |
150 | if (!sd) |
151 | break; |
152 | } |
153 | } |
154 | #else /* !CONFIG_SCHED_DEBUG */ |
155 | |
156 | # define sched_debug_verbose 0 |
157 | # define sched_domain_debug(sd, cpu) do { } while (0) |
158 | static inline bool sched_debug(void) |
159 | { |
160 | return false; |
161 | } |
162 | #endif /* CONFIG_SCHED_DEBUG */ |
163 | |
164 | /* Generate a mask of SD flags with the SDF_NEEDS_GROUPS metaflag */ |
165 | #define SD_FLAG(name, mflags) (name * !!((mflags) & SDF_NEEDS_GROUPS)) | |
166 | static const unsigned int SD_DEGENERATE_GROUPS_MASK = |
167 | #include <linux/sched/sd_flags.h> |
168 | 0; |
169 | #undef SD_FLAG |
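
/*
 * Each SD_FLAG() entry above expands to
 * "(SD_name * !!((mflags) & SDF_NEEDS_GROUPS)) |", so a flag tagged with
 * SDF_NEEDS_GROUPS (e.g. SD_PREFER_SIBLING) contributes its bit to the mask,
 * while one without it (e.g. SD_WAKE_AFFINE) contributes "SD_WAKE_AFFINE * 0",
 * i.e. nothing; the trailing 0 terminates the OR chain.
 */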
170 | |
171 | static int sd_degenerate(struct sched_domain *sd) |
172 | { |
if (cpumask_weight(sched_domain_span(sd)) == 1)
174 | return 1; |
175 | |
176 | /* Following flags need at least 2 groups */ |
177 | if ((sd->flags & SD_DEGENERATE_GROUPS_MASK) && |
178 | (sd->groups != sd->groups->next)) |
179 | return 0; |
180 | |
181 | /* Following flags don't use groups */ |
182 | if (sd->flags & (SD_WAKE_AFFINE)) |
183 | return 0; |
184 | |
185 | return 1; |
186 | } |
187 | |
188 | static int |
189 | sd_parent_degenerate(struct sched_domain *sd, struct sched_domain *parent) |
190 | { |
191 | unsigned long cflags = sd->flags, pflags = parent->flags; |
192 | |
if (sd_degenerate(parent))
194 | return 1; |
195 | |
if (!cpumask_equal(sched_domain_span(sd), sched_domain_span(parent)))
197 | return 0; |
198 | |
199 | /* Flags needing groups don't count if only 1 group in parent */ |
200 | if (parent->groups == parent->groups->next) |
201 | pflags &= ~SD_DEGENERATE_GROUPS_MASK; |
202 | |
203 | if (~cflags & pflags) |
204 | return 0; |
205 | |
206 | return 1; |
207 | } |
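
/*
 * The "~cflags & pflags" test above is a subset check: the parent is
 * redundant when it sets no flag that the child does not already set.
 * For example, with cflags = SD_BALANCE_NEWIDLE | SD_WAKE_AFFINE and
 * pflags = SD_BALANCE_NEWIDLE (after the single-group masking above),
 * ~cflags & pflags == 0 and the parent can be collapsed.
 */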
208 | |
209 | #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) |
210 | DEFINE_STATIC_KEY_FALSE(sched_energy_present); |
211 | static unsigned int sysctl_sched_energy_aware = 1; |
212 | static DEFINE_MUTEX(sched_energy_mutex); |
213 | static bool sched_energy_update; |
214 | |
215 | static bool sched_is_eas_possible(const struct cpumask *cpu_mask) |
216 | { |
217 | bool any_asym_capacity = false; |
218 | struct cpufreq_policy *policy; |
219 | struct cpufreq_governor *gov; |
220 | int i; |
221 | |
222 | /* EAS is enabled for asymmetric CPU capacity topologies. */ |
223 | for_each_cpu(i, cpu_mask) { |
224 | if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, i))) { |
225 | any_asym_capacity = true; |
226 | break; |
227 | } |
228 | } |
229 | if (!any_asym_capacity) { |
230 | if (sched_debug()) { |
231 | pr_info("rd %*pbl: Checking EAS, CPUs do not have asymmetric capacities\n" , |
232 | cpumask_pr_args(cpu_mask)); |
233 | } |
234 | return false; |
235 | } |
236 | |
237 | /* EAS definitely does *not* handle SMT */ |
238 | if (sched_smt_active()) { |
239 | if (sched_debug()) { |
240 | pr_info("rd %*pbl: Checking EAS, SMT is not supported\n" , |
241 | cpumask_pr_args(cpu_mask)); |
242 | } |
243 | return false; |
244 | } |
245 | |
246 | if (!arch_scale_freq_invariant()) { |
247 | if (sched_debug()) { |
248 | pr_info("rd %*pbl: Checking EAS: frequency-invariant load tracking not yet supported" , |
249 | cpumask_pr_args(cpu_mask)); |
250 | } |
251 | return false; |
252 | } |
253 | |
254 | /* Do not attempt EAS if schedutil is not being used. */ |
255 | for_each_cpu(i, cpu_mask) { |
policy = cpufreq_cpu_get(i);
if (!policy) {
if (sched_debug()) {
pr_info("rd %*pbl: Checking EAS, cpufreq policy not set for CPU: %d",
260 | cpumask_pr_args(cpu_mask), i); |
261 | } |
262 | return false; |
263 | } |
264 | gov = policy->governor; |
265 | cpufreq_cpu_put(policy); |
266 | if (gov != &schedutil_gov) { |
267 | if (sched_debug()) { |
268 | pr_info("rd %*pbl: Checking EAS, schedutil is mandatory\n" , |
269 | cpumask_pr_args(cpu_mask)); |
270 | } |
271 | return false; |
272 | } |
273 | } |
274 | |
275 | return true; |
276 | } |
277 | |
278 | void rebuild_sched_domains_energy(void) |
279 | { |
280 | mutex_lock(&sched_energy_mutex); |
281 | sched_energy_update = true; |
282 | rebuild_sched_domains(); |
283 | sched_energy_update = false; |
mutex_unlock(&sched_energy_mutex);
285 | } |
286 | |
287 | #ifdef CONFIG_PROC_SYSCTL |
288 | static int sched_energy_aware_handler(struct ctl_table *table, int write, |
289 | void *buffer, size_t *lenp, loff_t *ppos) |
290 | { |
291 | int ret, state; |
292 | |
293 | if (write && !capable(CAP_SYS_ADMIN)) |
294 | return -EPERM; |
295 | |
296 | if (!sched_is_eas_possible(cpu_active_mask)) { |
297 | if (write) { |
298 | return -EOPNOTSUPP; |
299 | } else { |
300 | *lenp = 0; |
301 | return 0; |
302 | } |
303 | } |
304 | |
305 | ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); |
306 | if (!ret && write) { |
307 | state = static_branch_unlikely(&sched_energy_present); |
308 | if (state != sysctl_sched_energy_aware) |
309 | rebuild_sched_domains_energy(); |
310 | } |
311 | |
312 | return ret; |
313 | } |
314 | |
315 | static struct ctl_table sched_energy_aware_sysctls[] = { |
316 | { |
317 | .procname = "sched_energy_aware" , |
318 | .data = &sysctl_sched_energy_aware, |
319 | .maxlen = sizeof(unsigned int), |
320 | .mode = 0644, |
321 | .proc_handler = sched_energy_aware_handler, |
322 | .extra1 = SYSCTL_ZERO, |
323 | .extra2 = SYSCTL_ONE, |
324 | }, |
325 | {} |
326 | }; |
327 | |
328 | static int __init sched_energy_aware_sysctl_init(void) |
329 | { |
330 | register_sysctl_init("kernel" , sched_energy_aware_sysctls); |
331 | return 0; |
332 | } |
333 | |
334 | late_initcall(sched_energy_aware_sysctl_init); |
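
/*
 * With the above, the knob is exposed as /proc/sys/kernel/sched_energy_aware
 * and only accepts 0 or 1 (enforced by extra1/extra2), with writes requiring
 * CAP_SYS_ADMIN. For example, "echo 0 > /proc/sys/kernel/sched_energy_aware"
 * clears the sysctl and, because the static key state no longer matches it,
 * sched_energy_aware_handler() calls rebuild_sched_domains_energy() so the
 * perf domains and the sched_energy_present key are torn down.
 */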
335 | #endif |
336 | |
337 | static void free_pd(struct perf_domain *pd) |
338 | { |
339 | struct perf_domain *tmp; |
340 | |
341 | while (pd) { |
342 | tmp = pd->next; |
kfree(pd);
344 | pd = tmp; |
345 | } |
346 | } |
347 | |
348 | static struct perf_domain *find_pd(struct perf_domain *pd, int cpu) |
349 | { |
350 | while (pd) { |
351 | if (cpumask_test_cpu(cpu, perf_domain_span(pd))) |
352 | return pd; |
353 | pd = pd->next; |
354 | } |
355 | |
356 | return NULL; |
357 | } |
358 | |
359 | static struct perf_domain *pd_init(int cpu) |
360 | { |
361 | struct em_perf_domain *obj = em_cpu_get(cpu); |
362 | struct perf_domain *pd; |
363 | |
364 | if (!obj) { |
365 | if (sched_debug()) |
366 | pr_info("%s: no EM found for CPU%d\n" , __func__, cpu); |
367 | return NULL; |
368 | } |
369 | |
pd = kzalloc(sizeof(*pd), GFP_KERNEL);
371 | if (!pd) |
372 | return NULL; |
373 | pd->em_pd = obj; |
374 | |
375 | return pd; |
376 | } |
377 | |
378 | static void perf_domain_debug(const struct cpumask *cpu_map, |
379 | struct perf_domain *pd) |
380 | { |
381 | if (!sched_debug() || !pd) |
382 | return; |
383 | |
384 | printk(KERN_DEBUG "root_domain %*pbl:" , cpumask_pr_args(cpu_map)); |
385 | |
386 | while (pd) { |
387 | printk(KERN_CONT " pd%d:{ cpus=%*pbl nr_pstate=%d }" , |
388 | cpumask_first(perf_domain_span(pd)), |
389 | cpumask_pr_args(perf_domain_span(pd)), |
390 | em_pd_nr_perf_states(pd->em_pd)); |
391 | pd = pd->next; |
392 | } |
393 | |
394 | printk(KERN_CONT "\n" ); |
395 | } |
396 | |
397 | static void destroy_perf_domain_rcu(struct rcu_head *rp) |
398 | { |
399 | struct perf_domain *pd; |
400 | |
401 | pd = container_of(rp, struct perf_domain, rcu); |
402 | free_pd(pd); |
403 | } |
404 | |
405 | static void sched_energy_set(bool has_eas) |
406 | { |
407 | if (!has_eas && static_branch_unlikely(&sched_energy_present)) { |
408 | if (sched_debug()) |
409 | pr_info("%s: stopping EAS\n" , __func__); |
410 | static_branch_disable_cpuslocked(&sched_energy_present); |
411 | } else if (has_eas && !static_branch_unlikely(&sched_energy_present)) { |
412 | if (sched_debug()) |
413 | pr_info("%s: starting EAS\n" , __func__); |
414 | static_branch_enable_cpuslocked(&sched_energy_present); |
415 | } |
416 | } |
417 | |
418 | /* |
419 | * EAS can be used on a root domain if it meets all the following conditions: |
420 | * 1. an Energy Model (EM) is available; |
421 | * 2. the SD_ASYM_CPUCAPACITY flag is set in the sched_domain hierarchy. |
422 | * 3. no SMT is detected. |
423 | * 4. schedutil is driving the frequency of all CPUs of the rd; |
424 | * 5. frequency invariance support is present; |
425 | */ |
426 | static bool build_perf_domains(const struct cpumask *cpu_map) |
427 | { |
428 | int i; |
429 | struct perf_domain *pd = NULL, *tmp; |
int cpu = cpumask_first(cpu_map);
431 | struct root_domain *rd = cpu_rq(cpu)->rd; |
432 | |
433 | if (!sysctl_sched_energy_aware) |
434 | goto free; |
435 | |
if (!sched_is_eas_possible(cpu_map))
437 | goto free; |
438 | |
439 | for_each_cpu(i, cpu_map) { |
440 | /* Skip already covered CPUs. */ |
if (find_pd(pd, i))
442 | continue; |
443 | |
444 | /* Create the new pd and add it to the local list. */ |
tmp = pd_init(i);
446 | if (!tmp) |
447 | goto free; |
448 | tmp->next = pd; |
449 | pd = tmp; |
450 | } |
451 | |
452 | perf_domain_debug(cpu_map, pd); |
453 | |
454 | /* Attach the new list of performance domains to the root domain. */ |
455 | tmp = rd->pd; |
456 | rcu_assign_pointer(rd->pd, pd); |
457 | if (tmp) |
call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
459 | |
460 | return !!pd; |
461 | |
462 | free: |
463 | free_pd(pd); |
464 | tmp = rd->pd; |
465 | rcu_assign_pointer(rd->pd, NULL); |
466 | if (tmp) |
call_rcu(&tmp->rcu, destroy_perf_domain_rcu);
468 | |
469 | return false; |
470 | } |
471 | #else |
472 | static void free_pd(struct perf_domain *pd) { } |
473 | #endif /* CONFIG_ENERGY_MODEL && CONFIG_CPU_FREQ_GOV_SCHEDUTIL*/ |
474 | |
475 | static void free_rootdomain(struct rcu_head *rcu) |
476 | { |
477 | struct root_domain *rd = container_of(rcu, struct root_domain, rcu); |
478 | |
cpupri_cleanup(&rd->cpupri);
cpudl_cleanup(&rd->cpudl);
free_cpumask_var(rd->dlo_mask);
free_cpumask_var(rd->rto_mask);
free_cpumask_var(rd->online);
free_cpumask_var(rd->span);
free_pd(rd->pd);
kfree(rd);
487 | } |
488 | |
489 | void rq_attach_root(struct rq *rq, struct root_domain *rd) |
490 | { |
491 | struct root_domain *old_rd = NULL; |
492 | struct rq_flags rf; |
493 | |
rq_lock_irqsave(rq, &rf);

if (rq->rd) {
old_rd = rq->rd;

if (cpumask_test_cpu(rq->cpu, old_rd->online))
set_rq_offline(rq);

cpumask_clear_cpu(rq->cpu, old_rd->span);
503 | |
504 | /* |
* If we don't want to free the old_rd yet then
506 | * set old_rd to NULL to skip the freeing later |
507 | * in this function: |
508 | */ |
if (!atomic_dec_and_test(&old_rd->refcount))
510 | old_rd = NULL; |
511 | } |
512 | |
atomic_inc(&rd->refcount);
rq->rd = rd;

cpumask_set_cpu(rq->cpu, rd->span);
if (cpumask_test_cpu(rq->cpu, cpu_active_mask))
set_rq_online(rq);

rq_unlock_irqrestore(rq, &rf);
521 | |
522 | if (old_rd) |
call_rcu(&old_rd->rcu, free_rootdomain);
524 | } |
525 | |
526 | void sched_get_rd(struct root_domain *rd) |
527 | { |
atomic_inc(&rd->refcount);
529 | } |
530 | |
531 | void sched_put_rd(struct root_domain *rd) |
532 | { |
if (!atomic_dec_and_test(&rd->refcount))
return;

call_rcu(&rd->rcu, free_rootdomain);
537 | } |
538 | |
539 | static int init_rootdomain(struct root_domain *rd) |
540 | { |
if (!zalloc_cpumask_var(&rd->span, GFP_KERNEL))
goto out;
if (!zalloc_cpumask_var(&rd->online, GFP_KERNEL))
goto free_span;
if (!zalloc_cpumask_var(&rd->dlo_mask, GFP_KERNEL))
goto free_online;
if (!zalloc_cpumask_var(&rd->rto_mask, GFP_KERNEL))
goto free_dlo_mask;
549 | |
550 | #ifdef HAVE_RT_PUSH_IPI |
551 | rd->rto_cpu = -1; |
552 | raw_spin_lock_init(&rd->rto_lock); |
553 | rd->rto_push_work = IRQ_WORK_INIT_HARD(rto_push_irq_work_func); |
554 | #endif |
555 | |
556 | rd->visit_gen = 0; |
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;

if (cpupri_init(&rd->cpupri) != 0)
562 | goto free_cpudl; |
563 | return 0; |
564 | |
565 | free_cpudl: |
cpudl_cleanup(&rd->cpudl);
free_rto_mask:
free_cpumask_var(rd->rto_mask);
free_dlo_mask:
free_cpumask_var(rd->dlo_mask);
free_online:
free_cpumask_var(rd->online);
free_span:
free_cpumask_var(rd->span);
575 | out: |
576 | return -ENOMEM; |
577 | } |
578 | |
579 | /* |
580 | * By default the system creates a single root-domain with all CPUs as |
581 | * members (mimicking the global state we have today). |
582 | */ |
583 | struct root_domain def_root_domain; |
584 | |
585 | void __init init_defrootdomain(void) |
586 | { |
init_rootdomain(&def_root_domain);

atomic_set(&def_root_domain.refcount, 1);
590 | } |
591 | |
592 | static struct root_domain *alloc_rootdomain(void) |
593 | { |
594 | struct root_domain *rd; |
595 | |
rd = kzalloc(sizeof(*rd), GFP_KERNEL);
597 | if (!rd) |
598 | return NULL; |
599 | |
600 | if (init_rootdomain(rd) != 0) { |
kfree(rd);
602 | return NULL; |
603 | } |
604 | |
605 | return rd; |
606 | } |
607 | |
608 | static void free_sched_groups(struct sched_group *sg, int free_sgc) |
609 | { |
610 | struct sched_group *tmp, *first; |
611 | |
612 | if (!sg) |
613 | return; |
614 | |
615 | first = sg; |
616 | do { |
617 | tmp = sg->next; |
618 | |
if (free_sgc && atomic_dec_and_test(&sg->sgc->ref))
kfree(sg->sgc);

if (atomic_dec_and_test(&sg->ref))
kfree(sg);
624 | sg = tmp; |
625 | } while (sg != first); |
626 | } |
627 | |
628 | static void destroy_sched_domain(struct sched_domain *sd) |
629 | { |
630 | /* |
* A normal sched domain may have multiple group references; an
* overlapping domain, having private groups, has only one. Iterate,
* dropping group/capacity references and freeing where none remain.
634 | */ |
free_sched_groups(sd->groups, 1);

if (sd->shared && atomic_dec_and_test(&sd->shared->ref))
kfree(sd->shared);
kfree(sd);
640 | } |
641 | |
642 | static void destroy_sched_domains_rcu(struct rcu_head *rcu) |
643 | { |
644 | struct sched_domain *sd = container_of(rcu, struct sched_domain, rcu); |
645 | |
646 | while (sd) { |
647 | struct sched_domain *parent = sd->parent; |
648 | destroy_sched_domain(sd); |
649 | sd = parent; |
650 | } |
651 | } |
652 | |
653 | static void destroy_sched_domains(struct sched_domain *sd) |
654 | { |
655 | if (sd) |
call_rcu(&sd->rcu, destroy_sched_domains_rcu);
657 | } |
658 | |
659 | /* |
660 | * Keep a special pointer to the highest sched_domain that has |
* SD_SHARE_PKG_RESOURCES set (Last Level Cache Domain); this
* allows us to avoid some pointer chasing in select_idle_sibling().
663 | * |
664 | * Also keep a unique ID per domain (we use the first CPU number in |
665 | * the cpumask of the domain), this allows us to quickly tell if |
666 | * two CPUs are in the same cache domain, see cpus_share_cache(). |
667 | */ |
668 | DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); |
669 | DEFINE_PER_CPU(int, sd_llc_size); |
670 | DEFINE_PER_CPU(int, sd_llc_id); |
671 | DEFINE_PER_CPU(int, sd_share_id); |
672 | DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); |
673 | DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); |
674 | DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_packing); |
675 | DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); |
676 | |
677 | DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); |
678 | DEFINE_STATIC_KEY_FALSE(sched_cluster_active); |
679 | |
680 | static void update_top_cache_domain(int cpu) |
681 | { |
682 | struct sched_domain_shared *sds = NULL; |
683 | struct sched_domain *sd; |
684 | int id = cpu; |
685 | int size = 1; |
686 | |
sd = highest_flag_domain(cpu, SD_SHARE_PKG_RESOURCES);
if (sd) {
id = cpumask_first(sched_domain_span(sd));
size = cpumask_weight(sched_domain_span(sd));
691 | sds = sd->shared; |
692 | } |
693 | |
694 | rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); |
695 | per_cpu(sd_llc_size, cpu) = size; |
696 | per_cpu(sd_llc_id, cpu) = id; |
697 | rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); |
698 | |
sd = lowest_flag_domain(cpu, SD_CLUSTER);
if (sd)
id = cpumask_first(sched_domain_span(sd));
702 | |
703 | /* |
704 | * This assignment should be placed after the sd_llc_id as |
* we want this id to equal the cluster id on cluster machines
* but the LLC id on non-cluster machines.
707 | */ |
708 | per_cpu(sd_share_id, cpu) = id; |
709 | |
sd = lowest_flag_domain(cpu, SD_NUMA);
711 | rcu_assign_pointer(per_cpu(sd_numa, cpu), sd); |
712 | |
sd = highest_flag_domain(cpu, SD_ASYM_PACKING);
714 | rcu_assign_pointer(per_cpu(sd_asym_packing, cpu), sd); |
715 | |
sd = lowest_flag_domain(cpu, SD_ASYM_CPUCAPACITY_FULL);
717 | rcu_assign_pointer(per_cpu(sd_asym_cpucapacity, cpu), sd); |
718 | } |
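
/*
 * Example: on a machine where CPUs 0-3 share their LLC, each of those CPUs
 * ends up with sd_llc_id == 0 (the first CPU of the LLC span) and
 * sd_llc_size == 4, so cpus_share_cache() can answer "same cache domain?"
 * by comparing two per-CPU integers instead of walking the domain tree.
 */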
719 | |
720 | /* |
721 | * Attach the domain 'sd' to 'cpu' as its base domain. Callers must |
722 | * hold the hotplug lock. |
723 | */ |
724 | static void |
725 | cpu_attach_domain(struct sched_domain *sd, struct root_domain *rd, int cpu) |
726 | { |
727 | struct rq *rq = cpu_rq(cpu); |
728 | struct sched_domain *tmp; |
729 | |
730 | /* Remove the sched domains which do not contribute to scheduling. */ |
731 | for (tmp = sd; tmp; ) { |
732 | struct sched_domain *parent = tmp->parent; |
733 | if (!parent) |
734 | break; |
735 | |
if (sd_parent_degenerate(tmp, parent)) {
737 | tmp->parent = parent->parent; |
738 | |
739 | if (parent->parent) { |
740 | parent->parent->child = tmp; |
741 | parent->parent->groups->flags = tmp->flags; |
742 | } |
743 | |
744 | /* |
745 | * Transfer SD_PREFER_SIBLING down in case of a |
746 | * degenerate parent; the spans match for this |
747 | * so the property transfers. |
748 | */ |
749 | if (parent->flags & SD_PREFER_SIBLING) |
750 | tmp->flags |= SD_PREFER_SIBLING; |
destroy_sched_domain(parent);
752 | } else |
753 | tmp = tmp->parent; |
754 | } |
755 | |
756 | if (sd && sd_degenerate(sd)) { |
757 | tmp = sd; |
758 | sd = sd->parent; |
destroy_sched_domain(tmp);
760 | if (sd) { |
761 | struct sched_group *sg = sd->groups; |
762 | |
763 | /* |
764 | * sched groups hold the flags of the child sched |
765 | * domain for convenience. Clear such flags since |
766 | * the child is being destroyed. |
767 | */ |
do {
sg->flags = 0;
sg = sg->next;
} while (sg != sd->groups);
771 | |
772 | sd->child = NULL; |
773 | } |
774 | } |
775 | |
776 | sched_domain_debug(sd, cpu); |
777 | |
778 | rq_attach_root(rq, rd); |
779 | tmp = rq->sd; |
780 | rcu_assign_pointer(rq->sd, sd); |
781 | dirty_sched_domain_sysctl(cpu); |
destroy_sched_domains(tmp);
783 | |
784 | update_top_cache_domain(cpu); |
785 | } |
786 | |
787 | struct s_data { |
788 | struct sched_domain * __percpu *sd; |
789 | struct root_domain *rd; |
790 | }; |
791 | |
792 | enum s_alloc { |
793 | sa_rootdomain, |
794 | sa_sd, |
795 | sa_sd_storage, |
796 | sa_none, |
797 | }; |
798 | |
799 | /* |
800 | * Return the canonical balance CPU for this group, this is the first CPU |
801 | * of this group that's also in the balance mask. |
802 | * |
* The balance mask is all those CPUs that could actually end up at this
804 | * group. See build_balance_mask(). |
805 | * |
806 | * Also see should_we_balance(). |
807 | */ |
808 | int group_balance_cpu(struct sched_group *sg) |
809 | { |
return cpumask_first(group_balance_mask(sg));
811 | } |
812 | |
813 | |
814 | /* |
815 | * NUMA topology (first read the regular topology blurb below) |
816 | * |
817 | * Given a node-distance table, for example: |
818 | * |
819 | * node 0 1 2 3 |
820 | * 0: 10 20 30 20 |
821 | * 1: 20 10 20 30 |
822 | * 2: 30 20 10 20 |
823 | * 3: 20 30 20 10 |
824 | * |
825 | * which represents a 4 node ring topology like: |
826 | * |
827 | * 0 ----- 1 |
828 | * | | |
829 | * | | |
830 | * | | |
831 | * 3 ----- 2 |
832 | * |
833 | * We want to construct domains and groups to represent this. The way we go |
834 | * about doing this is to build the domains on 'hops'. For each NUMA level we |
835 | * construct the mask of all nodes reachable in @level hops. |
836 | * |
837 | * For the above NUMA topology that gives 3 levels: |
838 | * |
839 | * NUMA-2 0-3 0-3 0-3 0-3 |
840 | * groups: {0-1,3},{1-3} {0-2},{0,2-3} {1-3},{0-1,3} {0,2-3},{0-2} |
841 | * |
842 | * NUMA-1 0-1,3 0-2 1-3 0,2-3 |
843 | * groups: {0},{1},{3} {0},{1},{2} {1},{2},{3} {0},{2},{3} |
844 | * |
845 | * NUMA-0 0 1 2 3 |
846 | * |
847 | * |
848 | * As can be seen; things don't nicely line up as with the regular topology. |
849 | * When we iterate a domain in child domain chunks some nodes can be |
850 | * represented multiple times -- hence the "overlap" naming for this part of |
851 | * the topology. |
852 | * |
853 | * In order to minimize this overlap, we only build enough groups to cover the |
854 | * domain. For instance Node-0 NUMA-2 would only get groups: 0-1,3 and 1-3. |
855 | * |
856 | * Because: |
857 | * |
858 | * - the first group of each domain is its child domain; this |
859 | * gets us the first 0-1,3 |
* - the only uncovered node is 2, whose child domain is 1-3.
861 | * |
862 | * However, because of the overlap, computing a unique CPU for each group is |
863 | * more complicated. Consider for instance the groups of NODE-1 NUMA-2, both |
864 | * groups include the CPUs of Node-0, while those CPUs would not in fact ever |
865 | * end up at those groups (they would end up in group: 0-1,3). |
866 | * |
867 | * To correct this we have to introduce the group balance mask. This mask |
868 | * will contain those CPUs in the group that can reach this group given the |
869 | * (child) domain tree. |
870 | * |
871 | * With this we can once again compute balance_cpu and sched_group_capacity |
872 | * relations. |
873 | * |
874 | * XXX include words on how balance_cpu is unique and therefore can be |
875 | * used for sched_group_capacity links. |
876 | * |
877 | * |
878 | * Another 'interesting' topology is: |
879 | * |
880 | * node 0 1 2 3 |
881 | * 0: 10 20 20 30 |
882 | * 1: 20 10 20 20 |
883 | * 2: 20 20 10 20 |
884 | * 3: 30 20 20 10 |
885 | * |
886 | * Which looks a little like: |
887 | * |
888 | * 0 ----- 1 |
889 | * | / | |
890 | * | / | |
891 | * | / | |
892 | * 2 ----- 3 |
893 | * |
894 | * This topology is asymmetric, nodes 1,2 are fully connected, but nodes 0,3 |
895 | * are not. |
896 | * |
897 | * This leads to a few particularly weird cases where the sched_domain's are |
898 | * not of the same number for each CPU. Consider: |
899 | * |
900 | * NUMA-2 0-3 0-3 |
901 | * groups: {0-2},{1-3} {1-3},{0-2} |
902 | * |
903 | * NUMA-1 0-2 0-3 0-3 1-3 |
904 | * |
905 | * NUMA-0 0 1 2 3 |
906 | * |
907 | */ |
908 | |
909 | |
910 | /* |
911 | * Build the balance mask; it contains only those CPUs that can arrive at this |
912 | * group and should be considered to continue balancing. |
913 | * |
914 | * We do this during the group creation pass, therefore the group information |
915 | * isn't complete yet, however since each group represents a (child) domain we |
916 | * can fully construct this using the sched_domain bits (which are already |
917 | * complete). |
918 | */ |
919 | static void |
920 | build_balance_mask(struct sched_domain *sd, struct sched_group *sg, struct cpumask *mask) |
921 | { |
922 | const struct cpumask *sg_span = sched_group_span(sg); |
923 | struct sd_data *sdd = sd->private; |
924 | struct sched_domain *sibling; |
925 | int i; |
926 | |
cpumask_clear(mask);
928 | |
929 | for_each_cpu(i, sg_span) { |
930 | sibling = *per_cpu_ptr(sdd->sd, i); |
931 | |
932 | /* |
933 | * Can happen in the asymmetric case, where these siblings are |
934 | * unused. The mask will not be empty because those CPUs that |
935 | * do have the top domain _should_ span the domain. |
936 | */ |
937 | if (!sibling->child) |
938 | continue; |
939 | |
940 | /* If we would not end up here, we can't continue from here */ |
if (!cpumask_equal(sg_span, sched_domain_span(sibling->child)))
942 | continue; |
943 | |
cpumask_set_cpu(i, mask);
945 | } |
946 | |
947 | /* We must not have empty masks here */ |
948 | WARN_ON_ONCE(cpumask_empty(mask)); |
949 | } |
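
/*
 * Example, using the 4 node ring topology described further above: node 1's
 * NUMA-2 group {0-2} also contains CPUs of nodes 0 and 2, but only node 1's
 * CPUs have a child (NUMA-1) domain spanning exactly 0-2, so only they end
 * up in the balance mask; CPUs of nodes 0 and 2 will balance through their
 * own groups instead.
 */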
950 | |
951 | /* |
952 | * XXX: This creates per-node group entries; since the load-balancer will |
953 | * immediately access remote memory to construct this group's load-balance |
954 | * statistics having the groups node local is of dubious benefit. |
955 | */ |
956 | static struct sched_group * |
957 | build_group_from_child_sched_domain(struct sched_domain *sd, int cpu) |
958 | { |
959 | struct sched_group *sg; |
960 | struct cpumask *sg_span; |
961 | |
sg = kzalloc_node(sizeof(struct sched_group) + cpumask_size(),
963 | GFP_KERNEL, cpu_to_node(cpu)); |
964 | |
965 | if (!sg) |
966 | return NULL; |
967 | |
968 | sg_span = sched_group_span(sg); |
969 | if (sd->child) { |
cpumask_copy(sg_span, sched_domain_span(sd->child));
sg->flags = sd->child->flags;
} else {
cpumask_copy(sg_span, sched_domain_span(sd));
974 | } |
975 | |
atomic_inc(&sg->ref);
977 | return sg; |
978 | } |
979 | |
980 | static void init_overlap_sched_group(struct sched_domain *sd, |
981 | struct sched_group *sg) |
982 | { |
983 | struct cpumask *mask = sched_domains_tmpmask2; |
984 | struct sd_data *sdd = sd->private; |
985 | struct cpumask *sg_span; |
986 | int cpu; |
987 | |
988 | build_balance_mask(sd, sg, mask); |
cpu = cpumask_first(mask);

sg->sgc = *per_cpu_ptr(sdd->sgc, cpu);
if (atomic_inc_return(&sg->sgc->ref) == 1)
cpumask_copy(group_balance_mask(sg), mask);
994 | else |
995 | WARN_ON_ONCE(!cpumask_equal(group_balance_mask(sg), mask)); |
996 | |
997 | /* |
998 | * Initialize sgc->capacity such that even if we mess up the |
999 | * domains and no possible iteration will get us here, we won't |
1000 | * die on a /0 trap. |
1001 | */ |
1002 | sg_span = sched_group_span(sg); |
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
1004 | sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; |
1005 | sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; |
1006 | } |
1007 | |
1008 | static struct sched_domain * |
1009 | find_descended_sibling(struct sched_domain *sd, struct sched_domain *sibling) |
1010 | { |
1011 | /* |
1012 | * The proper descendant would be the one whose child won't span out |
1013 | * of sd |
1014 | */ |
1015 | while (sibling->child && |
!cpumask_subset(sched_domain_span(sibling->child),
sched_domain_span(sd)))
1018 | sibling = sibling->child; |
1019 | |
1020 | /* |
* As we are referencing sgc across different topology levels, we need
* to go down to skip those sched_domains which don't contribute to
* scheduling because they will be degenerated in cpu_attach_domain().
1024 | */ |
1025 | while (sibling->child && |
cpumask_equal(sched_domain_span(sibling->child),
sched_domain_span(sibling)))
1028 | sibling = sibling->child; |
1029 | |
1030 | return sibling; |
1031 | } |
1032 | |
1033 | static int |
1034 | build_overlap_sched_groups(struct sched_domain *sd, int cpu) |
1035 | { |
1036 | struct sched_group *first = NULL, *last = NULL, *sg; |
1037 | const struct cpumask *span = sched_domain_span(sd); |
1038 | struct cpumask *covered = sched_domains_tmpmask; |
1039 | struct sd_data *sdd = sd->private; |
1040 | struct sched_domain *sibling; |
1041 | int i; |
1042 | |
cpumask_clear(covered);
1044 | |
1045 | for_each_cpu_wrap(i, span, cpu) { |
1046 | struct cpumask *sg_span; |
1047 | |
if (cpumask_test_cpu(i, covered))
1049 | continue; |
1050 | |
1051 | sibling = *per_cpu_ptr(sdd->sd, i); |
1052 | |
1053 | /* |
1054 | * Asymmetric node setups can result in situations where the |
1055 | * domain tree is of unequal depth, make sure to skip domains |
1056 | * that already cover the entire range. |
1057 | * |
1058 | * In that case build_sched_domains() will have terminated the |
1059 | * iteration early and our sibling sd spans will be empty. |
1060 | * Domains should always include the CPU they're built on, so |
1061 | * check that. |
1062 | */ |
if (!cpumask_test_cpu(i, sched_domain_span(sibling)))
1064 | continue; |
1065 | |
1066 | /* |
1067 | * Usually we build sched_group by sibling's child sched_domain |
* But for machines whose NUMA diameter is 3 or above, we move
1069 | * to build sched_group by sibling's proper descendant's child |
1070 | * domain because sibling's child sched_domain will span out of |
1071 | * the sched_domain being built as below. |
1072 | * |
1073 | * Smallest diameter=3 topology is: |
1074 | * |
1075 | * node 0 1 2 3 |
1076 | * 0: 10 20 30 40 |
1077 | * 1: 20 10 20 30 |
1078 | * 2: 30 20 10 20 |
1079 | * 3: 40 30 20 10 |
1080 | * |
1081 | * 0 --- 1 --- 2 --- 3 |
1082 | * |
1083 | * NUMA-3 0-3 N/A N/A 0-3 |
1084 | * groups: {0-2},{1-3} {1-3},{0-2} |
1085 | * |
1086 | * NUMA-2 0-2 0-3 0-3 1-3 |
1087 | * groups: {0-1},{1-3} {0-2},{2-3} {1-3},{0-1} {2-3},{0-2} |
1088 | * |
1089 | * NUMA-1 0-1 0-2 1-3 2-3 |
1090 | * groups: {0},{1} {1},{2},{0} {2},{3},{1} {3},{2} |
1091 | * |
1092 | * NUMA-0 0 1 2 3 |
1093 | * |
1094 | * The NUMA-2 groups for nodes 0 and 3 are obviously buggered, as the |
1095 | * group span isn't a subset of the domain span. |
1096 | */ |
1097 | if (sibling->child && |
!cpumask_subset(sched_domain_span(sibling->child), span))
1099 | sibling = find_descended_sibling(sd, sibling); |
1100 | |
sg = build_group_from_child_sched_domain(sibling, cpu);
1102 | if (!sg) |
1103 | goto fail; |
1104 | |
1105 | sg_span = sched_group_span(sg); |
cpumask_or(covered, covered, sg_span);

init_overlap_sched_group(sibling, sg);
1109 | |
1110 | if (!first) |
1111 | first = sg; |
1112 | if (last) |
1113 | last->next = sg; |
1114 | last = sg; |
1115 | last->next = first; |
1116 | } |
1117 | sd->groups = first; |
1118 | |
1119 | return 0; |
1120 | |
1121 | fail: |
free_sched_groups(first, 0);
1123 | |
1124 | return -ENOMEM; |
1125 | } |
1126 | |
1127 | |
1128 | /* |
1129 | * Package topology (also see the load-balance blurb in fair.c) |
1130 | * |
1131 | * The scheduler builds a tree structure to represent a number of important |
1132 | * topology features. By default (default_topology[]) these include: |
1133 | * |
1134 | * - Simultaneous multithreading (SMT) |
1135 | * - Multi-Core Cache (MC) |
1136 | * - Package (PKG) |
1137 | * |
1138 | * Where the last one more or less denotes everything up to a NUMA node. |
1139 | * |
1140 | * The tree consists of 3 primary data structures: |
1141 | * |
1142 | * sched_domain -> sched_group -> sched_group_capacity |
1143 | * ^ ^ ^ ^ |
1144 | * `-' `-' |
1145 | * |
1146 | * The sched_domains are per-CPU and have a two way link (parent & child) and |
1147 | * denote the ever growing mask of CPUs belonging to that level of topology. |
1148 | * |
1149 | * Each sched_domain has a circular (double) linked list of sched_group's, each |
1150 | * denoting the domains of the level below (or individual CPUs in case of the |
1151 | * first domain level). The sched_group linked by a sched_domain includes the |
1152 | * CPU of that sched_domain [*]. |
1153 | * |
1154 | * Take for instance a 2 threaded, 2 core, 2 cache cluster part: |
1155 | * |
1156 | * CPU 0 1 2 3 4 5 6 7 |
1157 | * |
1158 | * PKG [ ] |
1159 | * MC [ ] [ ] |
1160 | * SMT [ ] [ ] [ ] [ ] |
1161 | * |
1162 | * - or - |
1163 | * |
1164 | * PKG 0-7 0-7 0-7 0-7 0-7 0-7 0-7 0-7 |
1165 | * MC 0-3 0-3 0-3 0-3 4-7 4-7 4-7 4-7 |
1166 | * SMT 0-1 0-1 2-3 2-3 4-5 4-5 6-7 6-7 |
1167 | * |
1168 | * CPU 0 1 2 3 4 5 6 7 |
1169 | * |
1170 | * One way to think about it is: sched_domain moves you up and down among these |
1171 | * topology levels, while sched_group moves you sideways through it, at child |
1172 | * domain granularity. |
1173 | * |
1174 | * sched_group_capacity ensures each unique sched_group has shared storage. |
1175 | * |
1176 | * There are two related construction problems, both require a CPU that |
1177 | * uniquely identify each group (for a given domain): |
1178 | * |
1179 | * - The first is the balance_cpu (see should_we_balance() and the |
* load-balance blurb in fair.c); for each group we only want 1 CPU to
1181 | * continue balancing at a higher domain. |
1182 | * |
1183 | * - The second is the sched_group_capacity; we want all identical groups |
1184 | * to share a single sched_group_capacity. |
1185 | * |
* These topologies are exclusive by construction: it is impossible for an
* SMT thread to belong to multiple cores, and for cores to be part of
* multiple caches. There is a very clear and unique location for each CPU
* in the hierarchy.
*
* Therefore computing a unique CPU for each group is trivial (the iteration
* mask is redundant and set to all 1s; all CPUs in a group will end up at
* _that_ group): we can simply pick the first CPU in each group.
1194 | * |
1195 | * |
1196 | * [*] in other words, the first group of each domain is its child domain. |
1197 | */ |
1198 | |
1199 | static struct sched_group *get_group(int cpu, struct sd_data *sdd) |
1200 | { |
1201 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
1202 | struct sched_domain *child = sd->child; |
1203 | struct sched_group *sg; |
1204 | bool already_visited; |
1205 | |
1206 | if (child) |
cpu = cpumask_first(sched_domain_span(child));
1208 | |
1209 | sg = *per_cpu_ptr(sdd->sg, cpu); |
1210 | sg->sgc = *per_cpu_ptr(sdd->sgc, cpu); |
1211 | |
1212 | /* Increase refcounts for claim_allocations: */ |
already_visited = atomic_inc_return(&sg->ref) > 1;
1214 | /* sgc visits should follow a similar trend as sg */ |
1215 | WARN_ON(already_visited != (atomic_inc_return(&sg->sgc->ref) > 1)); |
1216 | |
1217 | /* If we have already visited that group, it's already initialized. */ |
1218 | if (already_visited) |
1219 | return sg; |
1220 | |
1221 | if (child) { |
cpumask_copy(sched_group_span(sg), sched_domain_span(child));
cpumask_copy(group_balance_mask(sg), sched_group_span(sg));
1224 | sg->flags = child->flags; |
1225 | } else { |
cpumask_set_cpu(cpu, sched_group_span(sg));
cpumask_set_cpu(cpu, group_balance_mask(sg));
1228 | } |
1229 | |
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
1231 | sg->sgc->min_capacity = SCHED_CAPACITY_SCALE; |
1232 | sg->sgc->max_capacity = SCHED_CAPACITY_SCALE; |
1233 | |
1234 | return sg; |
1235 | } |
1236 | |
1237 | /* |
1238 | * build_sched_groups will build a circular linked list of the groups |
1239 | * covered by the given span, will set each group's ->cpumask correctly, |
1240 | * and will initialize their ->sgc. |
1241 | * |
1242 | * Assumes the sched_domain tree is fully constructed |
1243 | */ |
1244 | static int |
1245 | build_sched_groups(struct sched_domain *sd, int cpu) |
1246 | { |
1247 | struct sched_group *first = NULL, *last = NULL; |
1248 | struct sd_data *sdd = sd->private; |
1249 | const struct cpumask *span = sched_domain_span(sd); |
1250 | struct cpumask *covered; |
1251 | int i; |
1252 | |
1253 | lockdep_assert_held(&sched_domains_mutex); |
1254 | covered = sched_domains_tmpmask; |
1255 | |
cpumask_clear(covered);
1257 | |
1258 | for_each_cpu_wrap(i, span, cpu) { |
1259 | struct sched_group *sg; |
1260 | |
if (cpumask_test_cpu(i, covered))
1262 | continue; |
1263 | |
sg = get_group(i, sdd);

cpumask_or(covered, covered, sched_group_span(sg));
1267 | |
1268 | if (!first) |
1269 | first = sg; |
1270 | if (last) |
1271 | last->next = sg; |
1272 | last = sg; |
1273 | } |
1274 | last->next = first; |
1275 | sd->groups = first; |
1276 | |
1277 | return 0; |
1278 | } |
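
/*
 * Example: for CPU0's MC domain in the "Package topology" comment above
 * (span 0-3 with SMT children 0-1 and 2-3), the walk visits CPU0 and CPU2,
 * get_group() returns one group per child span, and the result is the
 * circular list {0-1} -> {2-3} -> {0-1}; CPUs 1 and 3 are skipped because
 * they are already covered.
 */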
1279 | |
1280 | /* |
1281 | * Initialize sched groups cpu_capacity. |
1282 | * |
* cpu_capacity indicates the capacity of a sched group, which is used while
* distributing the load between different sched groups in a sched domain.
* Typically cpu_capacity for all the groups in a sched domain will be the
* same unless there are asymmetries in the topology. If there are
* asymmetries, the group having more cpu_capacity will pick up more load
* compared to the group having less cpu_capacity.
1289 | */ |
1290 | static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) |
1291 | { |
1292 | struct sched_group *sg = sd->groups; |
1293 | struct cpumask *mask = sched_domains_tmpmask2; |
1294 | |
1295 | WARN_ON(!sg); |
1296 | |
1297 | do { |
1298 | int cpu, cores = 0, max_cpu = -1; |
1299 | |
sg->group_weight = cpumask_weight(sched_group_span(sg));

cpumask_copy(mask, sched_group_span(sg));
for_each_cpu(cpu, mask) {
cores++;
#ifdef CONFIG_SCHED_SMT
cpumask_andnot(mask, mask, cpu_smt_mask(cpu));
1307 | #endif |
1308 | } |
1309 | sg->cores = cores; |
1310 | |
1311 | if (!(sd->flags & SD_ASYM_PACKING)) |
1312 | goto next; |
1313 | |
1314 | for_each_cpu(cpu, sched_group_span(sg)) { |
1315 | if (max_cpu < 0) |
1316 | max_cpu = cpu; |
else if (sched_asym_prefer(cpu, max_cpu))
1318 | max_cpu = cpu; |
1319 | } |
1320 | sg->asym_prefer_cpu = max_cpu; |
1321 | |
1322 | next: |
1323 | sg = sg->next; |
1324 | } while (sg != sd->groups); |
1325 | |
1326 | if (cpu != group_balance_cpu(sg)) |
1327 | return; |
1328 | |
1329 | update_group_capacity(sd, cpu); |
1330 | } |
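
/*
 * The cores count above works by repeatedly deleting a CPU's SMT siblings
 * from the scratch mask: e.g. for a group spanning CPUs 0-3 where {0,1} and
 * {2,3} are SMT pairs (CONFIG_SCHED_SMT), the loop visits CPU0, drops 0-1,
 * then visits CPU2 and drops 2-3, giving sg->cores == 2.
 */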
1331 | |
1332 | /* |
1333 | * Asymmetric CPU capacity bits |
1334 | */ |
1335 | struct asym_cap_data { |
1336 | struct list_head link; |
1337 | unsigned long capacity; |
1338 | unsigned long cpus[]; |
1339 | }; |
1340 | |
1341 | /* |
* Set of available CPUs grouped by their corresponding capacities.
1343 | * Each list entry contains a CPU mask reflecting CPUs that share the same |
1344 | * capacity. |
1345 | * The lifespan of data is unlimited. |
1346 | */ |
1347 | static LIST_HEAD(asym_cap_list); |
1348 | |
1349 | #define cpu_capacity_span(asym_data) to_cpumask((asym_data)->cpus) |
1350 | |
1351 | /* |
1352 | * Verify whether there is any CPU capacity asymmetry in a given sched domain. |
1353 | * Provides sd_flags reflecting the asymmetry scope. |
1354 | */ |
1355 | static inline int |
1356 | asym_cpu_capacity_classify(const struct cpumask *sd_span, |
1357 | const struct cpumask *cpu_map) |
1358 | { |
1359 | struct asym_cap_data *entry; |
1360 | int count = 0, miss = 0; |
1361 | |
1362 | /* |
1363 | * Count how many unique CPU capacities this domain spans across |
1364 | * (compare sched_domain CPUs mask with ones representing available |
1365 | * CPUs capacities). Take into account CPUs that might be offline: |
1366 | * skip those. |
1367 | */ |
1368 | list_for_each_entry(entry, &asym_cap_list, link) { |
if (cpumask_intersects(sd_span, cpu_capacity_span(entry)))
++count;
else if (cpumask_intersects(cpu_map, cpu_capacity_span(entry)))
1372 | ++miss; |
1373 | } |
1374 | |
1375 | WARN_ON_ONCE(!count && !list_empty(&asym_cap_list)); |
1376 | |
1377 | /* No asymmetry detected */ |
1378 | if (count < 2) |
1379 | return 0; |
1380 | /* Some of the available CPU capacity values have not been detected */ |
1381 | if (miss) |
1382 | return SD_ASYM_CPUCAPACITY; |
1383 | |
1384 | /* Full asymmetry */ |
1385 | return SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL; |
1386 | |
1387 | } |
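
/*
 * Worked example, assuming three capacity entries 512 (CPUs 0-1),
 * 768 (CPUs 2-3) and 1024 (CPUs 4-5) with cpu_map = 0-5:
 *  - a domain spanning 0-1 sees count == 1 -> no asymmetry flags;
 *  - a domain spanning 0-3 sees count == 2, miss == 1 -> SD_ASYM_CPUCAPACITY;
 *  - a domain spanning 0-5 sees count == 3, miss == 0 ->
 *    SD_ASYM_CPUCAPACITY | SD_ASYM_CPUCAPACITY_FULL.
 */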
1388 | |
1389 | static inline void asym_cpu_capacity_update_data(int cpu) |
1390 | { |
1391 | unsigned long capacity = arch_scale_cpu_capacity(cpu); |
1392 | struct asym_cap_data *entry = NULL; |
1393 | |
1394 | list_for_each_entry(entry, &asym_cap_list, link) { |
1395 | if (capacity == entry->capacity) |
1396 | goto done; |
1397 | } |
1398 | |
entry = kzalloc(sizeof(*entry) + cpumask_size(), GFP_KERNEL);
if (WARN_ONCE(!entry, "Failed to allocate memory for asymmetry data\n"))
1401 | return; |
1402 | entry->capacity = capacity; |
list_add(&entry->link, &asym_cap_list);
1404 | done: |
1405 | __cpumask_set_cpu(cpu, cpu_capacity_span(entry)); |
1406 | } |
1407 | |
1408 | /* |
1409 | * Build-up/update list of CPUs grouped by their capacities |
1410 | * An update requires explicit request to rebuild sched domains |
1411 | * with state indicating CPU topology changes. |
1412 | */ |
1413 | static void asym_cpu_capacity_scan(void) |
1414 | { |
1415 | struct asym_cap_data *entry, *next; |
1416 | int cpu; |
1417 | |
1418 | list_for_each_entry(entry, &asym_cap_list, link) |
1419 | cpumask_clear(cpu_capacity_span(entry)); |
1420 | |
1421 | for_each_cpu_and(cpu, cpu_possible_mask, housekeeping_cpumask(HK_TYPE_DOMAIN)) |
1422 | asym_cpu_capacity_update_data(cpu); |
1423 | |
1424 | list_for_each_entry_safe(entry, next, &asym_cap_list, link) { |
1425 | if (cpumask_empty(cpu_capacity_span(entry))) { |
list_del(&entry->link);
kfree(entry);
1428 | } |
1429 | } |
1430 | |
1431 | /* |
1432 | * Only one capacity value has been detected i.e. this system is symmetric. |
1433 | * No need to keep this data around. |
1434 | */ |
if (list_is_singular(&asym_cap_list)) {
entry = list_first_entry(&asym_cap_list, typeof(*entry), link);
list_del(&entry->link);
kfree(entry);
1439 | } |
1440 | } |
1441 | |
1442 | /* |
1443 | * Initializers for schedule domains |
1444 | * Non-inlined to reduce accumulated stack pressure in build_sched_domains() |
1445 | */ |
1446 | |
1447 | static int default_relax_domain_level = -1; |
1448 | int sched_domain_level_max; |
1449 | |
1450 | static int __init setup_relax_domain_level(char *str) |
1451 | { |
if (kstrtoint(str, 0, &default_relax_domain_level))
pr_warn("Unable to set relax_domain_level\n");
1454 | |
1455 | return 1; |
1456 | } |
1457 | __setup("relax_domain_level=" , setup_relax_domain_level); |
1458 | |
1459 | static void set_domain_attribute(struct sched_domain *sd, |
1460 | struct sched_domain_attr *attr) |
1461 | { |
1462 | int request; |
1463 | |
1464 | if (!attr || attr->relax_domain_level < 0) { |
1465 | if (default_relax_domain_level < 0) |
1466 | return; |
1467 | request = default_relax_domain_level; |
1468 | } else |
1469 | request = attr->relax_domain_level; |
1470 | |
1471 | if (sd->level > request) { |
1472 | /* Turn off idle balance on this domain: */ |
1473 | sd->flags &= ~(SD_BALANCE_WAKE|SD_BALANCE_NEWIDLE); |
1474 | } |
1475 | } |
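
/*
 * Example: booting with "relax_domain_level=1" makes set_domain_attribute()
 * clear SD_BALANCE_WAKE and SD_BALANCE_NEWIDLE on every domain whose level
 * is greater than 1 (unless a cpuset attribute overrides the default),
 * confining newidle/wake balancing to the lowest domain levels.
 */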
1476 | |
1477 | static void __sdt_free(const struct cpumask *cpu_map); |
1478 | static int __sdt_alloc(const struct cpumask *cpu_map); |
1479 | |
1480 | static void __free_domain_allocs(struct s_data *d, enum s_alloc what, |
1481 | const struct cpumask *cpu_map) |
1482 | { |
1483 | switch (what) { |
1484 | case sa_rootdomain: |
if (!atomic_read(&d->rd->refcount))
free_rootdomain(&d->rd->rcu);
fallthrough;
case sa_sd:
free_percpu(d->sd);
1490 | fallthrough; |
1491 | case sa_sd_storage: |
1492 | __sdt_free(cpu_map); |
1493 | fallthrough; |
1494 | case sa_none: |
1495 | break; |
1496 | } |
1497 | } |
1498 | |
1499 | static enum s_alloc |
1500 | __visit_domain_allocation_hell(struct s_data *d, const struct cpumask *cpu_map) |
1501 | { |
1502 | memset(d, 0, sizeof(*d)); |
1503 | |
1504 | if (__sdt_alloc(cpu_map)) |
1505 | return sa_sd_storage; |
1506 | d->sd = alloc_percpu(struct sched_domain *); |
1507 | if (!d->sd) |
1508 | return sa_sd_storage; |
1509 | d->rd = alloc_rootdomain(); |
1510 | if (!d->rd) |
1511 | return sa_sd; |
1512 | |
1513 | return sa_rootdomain; |
1514 | } |
1515 | |
1516 | /* |
1517 | * NULL the sd_data elements we've used to build the sched_domain and |
1518 | * sched_group structure so that the subsequent __free_domain_allocs() |
1519 | * will not free the data we're using. |
1520 | */ |
1521 | static void claim_allocations(int cpu, struct sched_domain *sd) |
1522 | { |
1523 | struct sd_data *sdd = sd->private; |
1524 | |
1525 | WARN_ON_ONCE(*per_cpu_ptr(sdd->sd, cpu) != sd); |
1526 | *per_cpu_ptr(sdd->sd, cpu) = NULL; |
1527 | |
if (atomic_read(&(*per_cpu_ptr(sdd->sds, cpu))->ref))
*per_cpu_ptr(sdd->sds, cpu) = NULL;

if (atomic_read(&(*per_cpu_ptr(sdd->sg, cpu))->ref))
*per_cpu_ptr(sdd->sg, cpu) = NULL;

if (atomic_read(&(*per_cpu_ptr(sdd->sgc, cpu))->ref))
*per_cpu_ptr(sdd->sgc, cpu) = NULL;
1536 | } |
1537 | |
1538 | #ifdef CONFIG_NUMA |
1539 | enum numa_topology_type sched_numa_topology_type; |
1540 | |
1541 | static int sched_domains_numa_levels; |
1542 | static int sched_domains_curr_level; |
1543 | |
1544 | int sched_max_numa_distance; |
1545 | static int *sched_domains_numa_distance; |
1546 | static struct cpumask ***sched_domains_numa_masks; |
1547 | #endif |
1548 | |
1549 | /* |
1550 | * SD_flags allowed in topology descriptions. |
1551 | * |
1552 | * These flags are purely descriptive of the topology and do not prescribe |
1553 | * behaviour. Behaviour is artificial and mapped in the below sd_init() |
1554 | * function: |
1555 | * |
* SD_SHARE_CPUCAPACITY - describes SMT topologies
* SD_CLUSTER - describes CPU cluster topologies
* SD_SHARE_PKG_RESOURCES - describes shared caches
* SD_NUMA - describes NUMA topologies
1559 | * |
1560 | * Odd one out, which beside describing the topology has a quirk also |
1561 | * prescribes the desired behaviour that goes along with it: |
1562 | * |
1563 | * SD_ASYM_PACKING - describes SMT quirks |
1564 | */ |
1565 | #define TOPOLOGY_SD_FLAGS \ |
1566 | (SD_SHARE_CPUCAPACITY | \ |
1567 | SD_CLUSTER | \ |
1568 | SD_SHARE_PKG_RESOURCES | \ |
1569 | SD_NUMA | \ |
1570 | SD_ASYM_PACKING) |
1571 | |
1572 | static struct sched_domain * |
1573 | sd_init(struct sched_domain_topology_level *tl, |
1574 | const struct cpumask *cpu_map, |
1575 | struct sched_domain *child, int cpu) |
1576 | { |
1577 | struct sd_data *sdd = &tl->data; |
1578 | struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu); |
1579 | int sd_id, sd_weight, sd_flags = 0; |
1580 | struct cpumask *sd_span; |
1581 | |
1582 | #ifdef CONFIG_NUMA |
1583 | /* |
1584 | * Ugly hack to pass state to sd_numa_mask()... |
1585 | */ |
1586 | sched_domains_curr_level = tl->numa_level; |
1587 | #endif |
1588 | |
sd_weight = cpumask_weight(tl->mask(cpu));
1590 | |
1591 | if (tl->sd_flags) |
1592 | sd_flags = (*tl->sd_flags)(); |
1593 | if (WARN_ONCE(sd_flags & ~TOPOLOGY_SD_FLAGS, |
1594 | "wrong sd_flags in topology description\n" )) |
1595 | sd_flags &= TOPOLOGY_SD_FLAGS; |
1596 | |
1597 | *sd = (struct sched_domain){ |
1598 | .min_interval = sd_weight, |
1599 | .max_interval = 2*sd_weight, |
1600 | .busy_factor = 16, |
1601 | .imbalance_pct = 117, |
1602 | |
1603 | .cache_nice_tries = 0, |
1604 | |
1605 | .flags = 1*SD_BALANCE_NEWIDLE |
1606 | | 1*SD_BALANCE_EXEC |
1607 | | 1*SD_BALANCE_FORK |
1608 | | 0*SD_BALANCE_WAKE |
1609 | | 1*SD_WAKE_AFFINE |
1610 | | 0*SD_SHARE_CPUCAPACITY |
1611 | | 0*SD_SHARE_PKG_RESOURCES |
1612 | | 0*SD_SERIALIZE |
1613 | | 1*SD_PREFER_SIBLING |
1614 | | 0*SD_NUMA |
1615 | | sd_flags |
1616 | , |
1617 | |
1618 | .last_balance = jiffies, |
1619 | .balance_interval = sd_weight, |
1620 | .max_newidle_lb_cost = 0, |
1621 | .last_decay_max_lb_cost = jiffies, |
1622 | .child = child, |
1623 | #ifdef CONFIG_SCHED_DEBUG |
1624 | .name = tl->name, |
1625 | #endif |
1626 | }; |
1627 | |
1628 | sd_span = sched_domain_span(sd); |
cpumask_and(sd_span, cpu_map, tl->mask(cpu));
sd_id = cpumask_first(sd_span);
1631 | |
1632 | sd->flags |= asym_cpu_capacity_classify(sd_span, cpu_map); |
1633 | |
1634 | WARN_ONCE((sd->flags & (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY)) == |
1635 | (SD_SHARE_CPUCAPACITY | SD_ASYM_CPUCAPACITY), |
1636 | "CPU capacity asymmetry not supported on SMT\n" ); |
1637 | |
1638 | /* |
1639 | * Convert topological properties into behaviour. |
1640 | */ |
1641 | /* Don't attempt to spread across CPUs of different capacities. */ |
1642 | if ((sd->flags & SD_ASYM_CPUCAPACITY) && sd->child) |
1643 | sd->child->flags &= ~SD_PREFER_SIBLING; |
1644 | |
1645 | if (sd->flags & SD_SHARE_CPUCAPACITY) { |
1646 | sd->imbalance_pct = 110; |
1647 | |
1648 | } else if (sd->flags & SD_SHARE_PKG_RESOURCES) { |
1649 | sd->imbalance_pct = 117; |
1650 | sd->cache_nice_tries = 1; |
1651 | |
1652 | #ifdef CONFIG_NUMA |
1653 | } else if (sd->flags & SD_NUMA) { |
1654 | sd->cache_nice_tries = 2; |
1655 | |
1656 | sd->flags &= ~SD_PREFER_SIBLING; |
1657 | sd->flags |= SD_SERIALIZE; |
1658 | if (sched_domains_numa_distance[tl->numa_level] > node_reclaim_distance) { |
1659 | sd->flags &= ~(SD_BALANCE_EXEC | |
1660 | SD_BALANCE_FORK | |
1661 | SD_WAKE_AFFINE); |
1662 | } |
1663 | |
1664 | #endif |
1665 | } else { |
1666 | sd->cache_nice_tries = 1; |
1667 | } |
1668 | |
1669 | /* |
1670 | * For all levels sharing cache; connect a sched_domain_shared |
1671 | * instance. |
1672 | */ |
1673 | if (sd->flags & SD_SHARE_PKG_RESOURCES) { |
1674 | sd->shared = *per_cpu_ptr(sdd->sds, sd_id); |
atomic_inc(&sd->shared->ref);
atomic_set(&sd->shared->nr_busy_cpus, sd_weight);
1677 | } |
1678 | |
1679 | sd->private = sdd; |
1680 | |
1681 | return sd; |
1682 | } |
1683 | |
1684 | /* |
1685 | * Topology list, bottom-up. |
1686 | */ |
1687 | static struct sched_domain_topology_level default_topology[] = { |
1688 | #ifdef CONFIG_SCHED_SMT |
1689 | { cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) }, |
1690 | #endif |
1691 | |
1692 | #ifdef CONFIG_SCHED_CLUSTER |
1693 | { cpu_clustergroup_mask, cpu_cluster_flags, SD_INIT_NAME(CLS) }, |
1694 | #endif |
1695 | |
1696 | #ifdef CONFIG_SCHED_MC |
1697 | { cpu_coregroup_mask, cpu_core_flags, SD_INIT_NAME(MC) }, |
1698 | #endif |
1699 | { cpu_cpu_mask, SD_INIT_NAME(PKG) }, |
1700 | { NULL, }, |
1701 | }; |
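
/*
 * For the 2 threaded, 2 core, 2 cache cluster example in the "Package
 * topology" comment above, this table builds SMT, MC and PKG levels with
 * the spans shown there; levels that turn out redundant on a given machine
 * (e.g. CLS where there is no separate cluster level) are later removed by
 * the degeneration logic in cpu_attach_domain().
 */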
1702 | |
1703 | static struct sched_domain_topology_level *sched_domain_topology = |
1704 | default_topology; |
1705 | static struct sched_domain_topology_level *sched_domain_topology_saved; |
1706 | |
1707 | #define for_each_sd_topology(tl) \ |
1708 | for (tl = sched_domain_topology; tl->mask; tl++) |
1709 | |
1710 | void __init set_sched_topology(struct sched_domain_topology_level *tl) |
1711 | { |
1712 | if (WARN_ON_ONCE(sched_smp_initialized)) |
1713 | return; |
1714 | |
1715 | sched_domain_topology = tl; |
1716 | sched_domain_topology_saved = NULL; |
1717 | } |
1718 | |
1719 | #ifdef CONFIG_NUMA |
1720 | |
1721 | static const struct cpumask *sd_numa_mask(int cpu) |
1722 | { |
1723 | return sched_domains_numa_masks[sched_domains_curr_level][cpu_to_node(cpu)]; |
1724 | } |
1725 | |
1726 | static void sched_numa_warn(const char *str) |
1727 | { |
1728 | static int done = false; |
int i, j;
1730 | |
1731 | if (done) |
1732 | return; |
1733 | |
1734 | done = true; |
1735 | |
1736 | printk(KERN_WARNING "ERROR: %s\n\n" , str); |
1737 | |
1738 | for (i = 0; i < nr_node_ids; i++) { |
1739 | printk(KERN_WARNING " " ); |
1740 | for (j = 0; j < nr_node_ids; j++) { |
1741 | if (!node_state(node: i, state: N_CPU) || !node_state(node: j, state: N_CPU)) |
1742 | printk(KERN_CONT "(%02d) " , node_distance(i,j)); |
1743 | else |
1744 | printk(KERN_CONT " %02d " , node_distance(i,j)); |
1745 | } |
1746 | printk(KERN_CONT "\n" ); |
1747 | } |
1748 | printk(KERN_WARNING "\n" ); |
1749 | } |
1750 | |
1751 | bool find_numa_distance(int distance) |
1752 | { |
1753 | bool found = false; |
1754 | int i, *distances; |
1755 | |
1756 | if (distance == node_distance(0, 0)) |
1757 | return true; |
1758 | |
1759 | rcu_read_lock(); |
1760 | distances = rcu_dereference(sched_domains_numa_distance); |
1761 | if (!distances) |
1762 | goto unlock; |
1763 | for (i = 0; i < sched_domains_numa_levels; i++) { |
1764 | if (distances[i] == distance) { |
1765 | found = true; |
1766 | break; |
1767 | } |
1768 | } |
1769 | unlock: |
1770 | rcu_read_unlock(); |
1771 | |
1772 | return found; |
1773 | } |
1774 | |
1775 | #define for_each_cpu_node_but(n, nbut) \ |
1776 | for_each_node_state(n, N_CPU) \ |
1777 | if (n == nbut) \ |
1778 | continue; \ |
1779 | else |
1780 | |
1781 | /* |
1782 | * A system can have three types of NUMA topology: |
1783 | * NUMA_DIRECT: all nodes are directly connected, or not a NUMA system |
1784 | * NUMA_GLUELESS_MESH: some nodes reachable through intermediary nodes |
1785 | * NUMA_BACKPLANE: nodes can reach other nodes through a backplane |
1786 | * |
1787 | * The difference between a glueless mesh topology and a backplane |
1788 | * topology lies in whether communication between not directly |
1789 | * connected nodes goes through intermediary nodes (where programs |
1790 | * could run), or through backplane controllers. This affects |
1791 | * placement of programs. |
1792 | * |
1793 | * The type of topology can be discerned with the following tests: |
1794 | * - If the maximum distance between any nodes is 1 hop, the system |
1795 | * is directly connected. |
1796 | * - If for two nodes A and B, located N > 1 hops away from each other, |
1797 | * there is an intermediary node C, which is < N hops away from both |
1798 | * nodes A and B, the system is a glueless mesh. |
1799 | */ |
1800 | static void init_numa_topology_type(int offline_node) |
1801 | { |
1802 | int a, b, c, n; |
1803 | |
1804 | n = sched_max_numa_distance; |
1805 | |
1806 | if (sched_domains_numa_levels <= 2) { |
1807 | sched_numa_topology_type = NUMA_DIRECT; |
1808 | return; |
1809 | } |
1810 | |
1811 | for_each_cpu_node_but(a, offline_node) { |
1812 | for_each_cpu_node_but(b, offline_node) { |
1813 | /* Find two nodes furthest removed from each other. */ |
1814 | if (node_distance(a, b) < n) |
1815 | continue; |
1816 | |
1817 | /* Is there an intermediary node between a and b? */ |
1818 | for_each_cpu_node_but(c, offline_node) { |
1819 | if (node_distance(a, c) < n && |
1820 | node_distance(b, c) < n) { |
1821 | sched_numa_topology_type = |
1822 | NUMA_GLUELESS_MESH; |
1823 | return; |
1824 | } |
1825 | } |
1826 | |
1827 | sched_numa_topology_type = NUMA_BACKPLANE; |
1828 | return; |
1829 | } |
1830 | } |
1831 | |
1832 | pr_err("Failed to find a NUMA topology type, defaulting to DIRECT\n" ); |
1833 | sched_numa_topology_type = NUMA_DIRECT; |
1834 | } |
1835 | |
1836 | |
1837 | #define NR_DISTANCE_VALUES (1 << DISTANCE_BITS) |
1838 | |
1839 | void sched_init_numa(int offline_node) |
1840 | { |
1841 | struct sched_domain_topology_level *tl; |
1842 | unsigned long *distance_map; |
1843 | int nr_levels = 0; |
1844 | int i, j; |
1845 | int *distances; |
1846 | struct cpumask ***masks; |
1847 | |
1848 | /* |
1849 | * O(nr_nodes^2) deduplicating selection sort -- in order to find the |
1850 | * unique distances in the node_distance() table. |
1851 | */ |
1852 | distance_map = bitmap_alloc(NR_DISTANCE_VALUES, GFP_KERNEL); |
1853 | if (!distance_map) |
1854 | return; |
1855 | |
1856 | bitmap_zero(dst: distance_map, NR_DISTANCE_VALUES); |
1857 | for_each_cpu_node_but(i, offline_node) { |
1858 | for_each_cpu_node_but(j, offline_node) { |
1859 | int distance = node_distance(i, j); |
1860 | |
1861 | if (distance < LOCAL_DISTANCE || distance >= NR_DISTANCE_VALUES) { |
1862 | sched_numa_warn(str: "Invalid distance value range" ); |
1863 | bitmap_free(bitmap: distance_map); |
1864 | return; |
1865 | } |
1866 | |
1867 | bitmap_set(map: distance_map, start: distance, nbits: 1); |
1868 | } |
1869 | } |
1870 | /* |
1871 | * We can now figure out how many unique distance values there are and |
1872 | * allocate memory accordingly. |
1873 | */ |
1874 | nr_levels = bitmap_weight(src: distance_map, NR_DISTANCE_VALUES); |
1875 | |
1876 | distances = kcalloc(n: nr_levels, size: sizeof(int), GFP_KERNEL); |
1877 | if (!distances) { |
1878 | bitmap_free(bitmap: distance_map); |
1879 | return; |
1880 | } |
1881 | |
1882 | for (i = 0, j = 0; i < nr_levels; i++, j++) { |
1883 | j = find_next_bit(addr: distance_map, NR_DISTANCE_VALUES, offset: j); |
1884 | distances[i] = j; |
1885 | } |
1886 | rcu_assign_pointer(sched_domains_numa_distance, distances); |
1887 | |
1888 | bitmap_free(bitmap: distance_map); |
1889 | |
1890 | /* |
1891 | * 'nr_levels' contains the number of unique distances |
1892 | * |
1893 | * The sched_domains_numa_distance[] array includes the actual distance |
1894 | * numbers. |
1895 | */ |
1896 | |
1897 | /* |
1898 | * Here, we should temporarily reset sched_domains_numa_levels to 0. |
1899 | * If it fails to allocate memory for array sched_domains_numa_masks[][], |
* the array will contain fewer than 'nr_levels' members. This could be
1901 | * dangerous when we use it to iterate array sched_domains_numa_masks[][] |
1902 | * in other functions. |
1903 | * |
1904 | * We reset it to 'nr_levels' at the end of this function. |
1905 | */ |
1906 | sched_domains_numa_levels = 0; |
1907 | |
1908 | masks = kzalloc(size: sizeof(void *) * nr_levels, GFP_KERNEL); |
1909 | if (!masks) |
1910 | return; |
1911 | |
1912 | /* |
1913 | * Now for each level, construct a mask per node which contains all |
1914 | * CPUs of nodes that are that many hops away from us. |
1915 | */ |
1916 | for (i = 0; i < nr_levels; i++) { |
1917 | masks[i] = kzalloc(size: nr_node_ids * sizeof(void *), GFP_KERNEL); |
1918 | if (!masks[i]) |
1919 | return; |
1920 | |
1921 | for_each_cpu_node_but(j, offline_node) { |
1922 | struct cpumask *mask = kzalloc(size: cpumask_size(), GFP_KERNEL); |
1923 | int k; |
1924 | |
1925 | if (!mask) |
1926 | return; |
1927 | |
1928 | masks[i][j] = mask; |
1929 | |
1930 | for_each_cpu_node_but(k, offline_node) { |
1931 | if (sched_debug() && (node_distance(j, k) != node_distance(k, j))) |
1932 | sched_numa_warn(str: "Node-distance not symmetric" ); |
1933 | |
1934 | if (node_distance(j, k) > sched_domains_numa_distance[i]) |
1935 | continue; |
1936 | |
1937 | cpumask_or(dstp: mask, src1p: mask, src2p: cpumask_of_node(node: k)); |
1938 | } |
1939 | } |
1940 | } |
1941 | rcu_assign_pointer(sched_domains_numa_masks, masks); |
1942 | |
1943 | /* Compute default topology size */ |
1944 | for (i = 0; sched_domain_topology[i].mask; i++); |
1945 | |
1946 | tl = kzalloc(size: (i + nr_levels + 1) * |
1947 | sizeof(struct sched_domain_topology_level), GFP_KERNEL); |
1948 | if (!tl) |
1949 | return; |
1950 | |
1951 | /* |
1952 | * Copy the default topology bits.. |
1953 | */ |
1954 | for (i = 0; sched_domain_topology[i].mask; i++) |
1955 | tl[i] = sched_domain_topology[i]; |
1956 | |
1957 | /* |
1958 | * Add the NUMA identity distance, aka single NODE. |
1959 | */ |
1960 | tl[i++] = (struct sched_domain_topology_level){ |
1961 | .mask = sd_numa_mask, |
1962 | .numa_level = 0, |
1963 | SD_INIT_NAME(NODE) |
1964 | }; |
1965 | |
1966 | /* |
1967 | * .. and append 'j' levels of NUMA goodness. |
1968 | */ |
1969 | for (j = 1; j < nr_levels; i++, j++) { |
1970 | tl[i] = (struct sched_domain_topology_level){ |
1971 | .mask = sd_numa_mask, |
1972 | .sd_flags = cpu_numa_flags, |
1973 | .flags = SDTL_OVERLAP, |
1974 | .numa_level = j, |
1975 | SD_INIT_NAME(NUMA) |
1976 | }; |
1977 | } |
1978 | |
1979 | sched_domain_topology_saved = sched_domain_topology; |
1980 | sched_domain_topology = tl; |
1981 | |
1982 | sched_domains_numa_levels = nr_levels; |
1983 | WRITE_ONCE(sched_max_numa_distance, sched_domains_numa_distance[nr_levels - 1]); |
1984 | |
1985 | init_numa_topology_type(offline_node); |
1986 | } |
1987 | |
1988 | |
1989 | static void sched_reset_numa(void) |
1990 | { |
1991 | int nr_levels, *distances; |
1992 | struct cpumask ***masks; |
1993 | |
1994 | nr_levels = sched_domains_numa_levels; |
1995 | sched_domains_numa_levels = 0; |
1996 | sched_max_numa_distance = 0; |
1997 | sched_numa_topology_type = NUMA_DIRECT; |
1998 | distances = sched_domains_numa_distance; |
1999 | rcu_assign_pointer(sched_domains_numa_distance, NULL); |
2000 | masks = sched_domains_numa_masks; |
2001 | rcu_assign_pointer(sched_domains_numa_masks, NULL); |
2002 | if (distances || masks) { |
2003 | int i, j; |
2004 | |
2005 | synchronize_rcu(); |
2006 | kfree(objp: distances); |
2007 | for (i = 0; i < nr_levels && masks; i++) { |
2008 | if (!masks[i]) |
2009 | continue; |
2010 | for_each_node(j) |
2011 | kfree(objp: masks[i][j]); |
2012 | kfree(objp: masks[i]); |
2013 | } |
2014 | kfree(objp: masks); |
2015 | } |
2016 | if (sched_domain_topology_saved) { |
2017 | kfree(objp: sched_domain_topology); |
2018 | sched_domain_topology = sched_domain_topology_saved; |
2019 | sched_domain_topology_saved = NULL; |
2020 | } |
2021 | } |
2022 | |
2023 | /* |
2024 | * Call with hotplug lock held |
2025 | */ |
2026 | void sched_update_numa(int cpu, bool online) |
2027 | { |
2028 | int node; |
2029 | |
2030 | node = cpu_to_node(cpu); |
2031 | /* |
2032 | * Scheduler NUMA topology is updated when the first CPU of a |
2033 | * node is onlined or the last CPU of a node is offlined. |
2034 | */ |
2035 | if (cpumask_weight(srcp: cpumask_of_node(node)) != 1) |
2036 | return; |
2037 | |
2038 | sched_reset_numa(); |
2039 | sched_init_numa(offline_node: online ? NUMA_NO_NODE : node); |
2040 | } |
2041 | |
2042 | void sched_domains_numa_masks_set(unsigned int cpu) |
2043 | { |
2044 | int node = cpu_to_node(cpu); |
2045 | int i, j; |
2046 | |
2047 | for (i = 0; i < sched_domains_numa_levels; i++) { |
2048 | for (j = 0; j < nr_node_ids; j++) { |
2049 | if (!node_state(node: j, state: N_CPU)) |
2050 | continue; |
2051 | |
2052 | /* Set ourselves in the remote node's masks */ |
2053 | if (node_distance(j, node) <= sched_domains_numa_distance[i]) |
2054 | cpumask_set_cpu(cpu, dstp: sched_domains_numa_masks[i][j]); |
2055 | } |
2056 | } |
2057 | } |
2058 | |
2059 | void sched_domains_numa_masks_clear(unsigned int cpu) |
2060 | { |
2061 | int i, j; |
2062 | |
2063 | for (i = 0; i < sched_domains_numa_levels; i++) { |
2064 | for (j = 0; j < nr_node_ids; j++) { |
2065 | if (sched_domains_numa_masks[i][j]) |
2066 | cpumask_clear_cpu(cpu, dstp: sched_domains_numa_masks[i][j]); |
2067 | } |
2068 | } |
2069 | } |
2070 | |
2071 | /* |
2072 | * sched_numa_find_closest() - given the NUMA topology, find the cpu |
* closest to @cpu from @cpus.
* @cpus: cpumask to find a cpu from
* @cpu: cpu to be close to
*
* Returns: cpu, or nr_cpu_ids when nothing found.
2078 | */ |
2079 | int sched_numa_find_closest(const struct cpumask *cpus, int cpu) |
2080 | { |
2081 | int i, j = cpu_to_node(cpu), found = nr_cpu_ids; |
2082 | struct cpumask ***masks; |
2083 | |
2084 | rcu_read_lock(); |
2085 | masks = rcu_dereference(sched_domains_numa_masks); |
2086 | if (!masks) |
2087 | goto unlock; |
2088 | for (i = 0; i < sched_domains_numa_levels; i++) { |
2089 | if (!masks[i][j]) |
2090 | break; |
2091 | cpu = cpumask_any_and(cpus, masks[i][j]); |
2092 | if (cpu < nr_cpu_ids) { |
2093 | found = cpu; |
2094 | break; |
2095 | } |
2096 | } |
2097 | unlock: |
2098 | rcu_read_unlock(); |
2099 | |
2100 | return found; |
2101 | } |
2102 | |
2103 | struct __cmp_key { |
2104 | const struct cpumask *cpus; |
2105 | struct cpumask ***masks; |
2106 | int node; |
2107 | int cpu; |
2108 | int w; |
2109 | }; |
2110 | |
2111 | static int hop_cmp(const void *a, const void *b) |
2112 | { |
2113 | struct cpumask **prev_hop, **cur_hop = *(struct cpumask ***)b; |
2114 | struct __cmp_key *k = (struct __cmp_key *)a; |
2115 | |
2116 | if (cpumask_weight_and(srcp1: k->cpus, srcp2: cur_hop[k->node]) <= k->cpu) |
2117 | return 1; |
2118 | |
2119 | if (b == k->masks) { |
2120 | k->w = 0; |
2121 | return 0; |
2122 | } |
2123 | |
2124 | prev_hop = *((struct cpumask ***)b - 1); |
2125 | k->w = cpumask_weight_and(srcp1: k->cpus, srcp2: prev_hop[k->node]); |
2126 | if (k->w <= k->cpu) |
2127 | return 0; |
2128 | |
2129 | return -1; |
2130 | } |
2131 | |
2132 | /** |
2133 | * sched_numa_find_nth_cpu() - given the NUMA topology, find the Nth closest CPU |
2134 | * from @cpus to @cpu, taking into account distance |
2135 | * from a given @node. |
2136 | * @cpus: cpumask to find a cpu from |
2137 | * @cpu: CPU to start searching |
2138 | * @node: NUMA node to order CPUs by distance |
2139 | * |
2140 | * Return: cpu, or nr_cpu_ids when nothing found. |
2141 | */ |
2142 | int sched_numa_find_nth_cpu(const struct cpumask *cpus, int cpu, int node) |
2143 | { |
2144 | struct __cmp_key k = { .cpus = cpus, .cpu = cpu }; |
2145 | struct cpumask ***hop_masks; |
2146 | int hop, ret = nr_cpu_ids; |
2147 | |
2148 | if (node == NUMA_NO_NODE) |
2149 | return cpumask_nth_and(cpu, srcp1: cpus, cpu_online_mask); |
2150 | |
2151 | rcu_read_lock(); |
2152 | |
2153 | /* CPU-less node entries are uninitialized in sched_domains_numa_masks */ |
2154 | node = numa_nearest_node(node, state: N_CPU); |
2155 | k.node = node; |
2156 | |
2157 | k.masks = rcu_dereference(sched_domains_numa_masks); |
2158 | if (!k.masks) |
2159 | goto unlock; |
2160 | |
2161 | hop_masks = bsearch(key: &k, base: k.masks, num: sched_domains_numa_levels, size: sizeof(k.masks[0]), cmp: hop_cmp); |
2162 | hop = hop_masks - k.masks; |
2163 | |
2164 | ret = hop ? |
2165 | cpumask_nth_and_andnot(cpu: cpu - k.w, srcp1: cpus, srcp2: k.masks[hop][node], srcp3: k.masks[hop-1][node]) : |
2166 | cpumask_nth_and(cpu, srcp1: cpus, srcp2: k.masks[0][node]); |
2167 | unlock: |
2168 | rcu_read_unlock(); |
2169 | return ret; |
2170 | } |
2171 | EXPORT_SYMBOL_GPL(sched_numa_find_nth_cpu); |
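
/*
 * Usage sketch (illustrative assumption, not lifted from an in-tree
 * caller): spreading per-queue work so that queue k lands on the k-th
 * closest online CPU to a device's node:
 *
 *	cpu = sched_numa_find_nth_cpu(cpu_online_mask, k, dev_to_node(dev));
 *
 * The function takes the RCU read lock internally, so no extra locking
 * is needed around a single lookup.
 */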
2172 | |
2173 | /** |
2174 | * sched_numa_hop_mask() - Get the cpumask of CPUs at most @hops hops away from |
2175 | * @node |
2176 | * @node: The node to count hops from. |
2177 | * @hops: Include CPUs up to that many hops away. 0 means local node. |
2178 | * |
2179 | * Return: On success, a pointer to a cpumask of CPUs at most @hops away from |
2180 | * @node, an error value otherwise. |
2181 | * |
* Requires rcu_read_lock() to be held. Returned cpumask is only valid within that
2183 | * read-side section, copy it if required beyond that. |
2184 | * |
2185 | * Note that not all hops are equal in distance; see sched_init_numa() for how |
2186 | * distances and masks are handled. |
2187 | * Also note that this is a reflection of sched_domains_numa_masks, which may change |
2188 | * during the lifetime of the system (offline nodes are taken out of the masks). |
2189 | */ |
2190 | const struct cpumask *sched_numa_hop_mask(unsigned int node, unsigned int hops) |
2191 | { |
2192 | struct cpumask ***masks; |
2193 | |
2194 | if (node >= nr_node_ids || hops >= sched_domains_numa_levels) |
2195 | return ERR_PTR(error: -EINVAL); |
2196 | |
2197 | masks = rcu_dereference(sched_domains_numa_masks); |
2198 | if (!masks) |
2199 | return ERR_PTR(error: -EBUSY); |
2200 | |
2201 | return masks[hops][node]; |
2202 | } |
2203 | EXPORT_SYMBOL_GPL(sched_numa_hop_mask); |
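
/*
 * Sketch of intended use, relying on the for_each_numa_hop_mask() helper
 * from <linux/topology.h>; names like do_something() and the variables
 * here are placeholders:
 *
 *	rcu_read_lock();
 *	for_each_numa_hop_mask(mask, node) {
 *		for_each_cpu_andnot(cpu, mask, prev)
 *			do_something(cpu);
 *		prev = mask;
 *	}
 *	rcu_read_unlock();
 *
 * Each iteration widens the mask by one hop; the andnot keeps only the
 * CPUs newly added at that hop.
 */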
2204 | |
2205 | #endif /* CONFIG_NUMA */ |
2206 | |
2207 | static int __sdt_alloc(const struct cpumask *cpu_map) |
2208 | { |
2209 | struct sched_domain_topology_level *tl; |
2210 | int j; |
2211 | |
2212 | for_each_sd_topology(tl) { |
2213 | struct sd_data *sdd = &tl->data; |
2214 | |
2215 | sdd->sd = alloc_percpu(struct sched_domain *); |
2216 | if (!sdd->sd) |
2217 | return -ENOMEM; |
2218 | |
2219 | sdd->sds = alloc_percpu(struct sched_domain_shared *); |
2220 | if (!sdd->sds) |
2221 | return -ENOMEM; |
2222 | |
2223 | sdd->sg = alloc_percpu(struct sched_group *); |
2224 | if (!sdd->sg) |
2225 | return -ENOMEM; |
2226 | |
2227 | sdd->sgc = alloc_percpu(struct sched_group_capacity *); |
2228 | if (!sdd->sgc) |
2229 | return -ENOMEM; |
2230 | |
2231 | for_each_cpu(j, cpu_map) { |
2232 | struct sched_domain *sd; |
2233 | struct sched_domain_shared *sds; |
2234 | struct sched_group *sg; |
2235 | struct sched_group_capacity *sgc; |
2236 | |
2237 | sd = kzalloc_node(size: sizeof(struct sched_domain) + cpumask_size(), |
2238 | GFP_KERNEL, cpu_to_node(cpu: j)); |
2239 | if (!sd) |
2240 | return -ENOMEM; |
2241 | |
2242 | *per_cpu_ptr(sdd->sd, j) = sd; |
2243 | |
2244 | sds = kzalloc_node(size: sizeof(struct sched_domain_shared), |
2245 | GFP_KERNEL, cpu_to_node(cpu: j)); |
2246 | if (!sds) |
2247 | return -ENOMEM; |
2248 | |
2249 | *per_cpu_ptr(sdd->sds, j) = sds; |
2250 | |
2251 | sg = kzalloc_node(size: sizeof(struct sched_group) + cpumask_size(), |
2252 | GFP_KERNEL, cpu_to_node(cpu: j)); |
2253 | if (!sg) |
2254 | return -ENOMEM; |
2255 | |
2256 | sg->next = sg; |
2257 | |
2258 | *per_cpu_ptr(sdd->sg, j) = sg; |
2259 | |
2260 | sgc = kzalloc_node(size: sizeof(struct sched_group_capacity) + cpumask_size(), |
2261 | GFP_KERNEL, cpu_to_node(cpu: j)); |
2262 | if (!sgc) |
2263 | return -ENOMEM; |
2264 | |
2265 | #ifdef CONFIG_SCHED_DEBUG |
2266 | sgc->id = j; |
2267 | #endif |
2268 | |
2269 | *per_cpu_ptr(sdd->sgc, j) = sgc; |
2270 | } |
2271 | } |
2272 | |
2273 | return 0; |
2274 | } |
2275 | |
2276 | static void __sdt_free(const struct cpumask *cpu_map) |
2277 | { |
2278 | struct sched_domain_topology_level *tl; |
2279 | int j; |
2280 | |
2281 | for_each_sd_topology(tl) { |
2282 | struct sd_data *sdd = &tl->data; |
2283 | |
2284 | for_each_cpu(j, cpu_map) { |
2285 | struct sched_domain *sd; |
2286 | |
2287 | if (sdd->sd) { |
2288 | sd = *per_cpu_ptr(sdd->sd, j); |
2289 | if (sd && (sd->flags & SD_OVERLAP)) |
2290 | free_sched_groups(sg: sd->groups, free_sgc: 0); |
2291 | kfree(objp: *per_cpu_ptr(sdd->sd, j)); |
2292 | } |
2293 | |
2294 | if (sdd->sds) |
2295 | kfree(objp: *per_cpu_ptr(sdd->sds, j)); |
2296 | if (sdd->sg) |
2297 | kfree(objp: *per_cpu_ptr(sdd->sg, j)); |
2298 | if (sdd->sgc) |
2299 | kfree(objp: *per_cpu_ptr(sdd->sgc, j)); |
2300 | } |
2301 | free_percpu(pdata: sdd->sd); |
2302 | sdd->sd = NULL; |
2303 | free_percpu(pdata: sdd->sds); |
2304 | sdd->sds = NULL; |
2305 | free_percpu(pdata: sdd->sg); |
2306 | sdd->sg = NULL; |
2307 | free_percpu(pdata: sdd->sgc); |
2308 | sdd->sgc = NULL; |
2309 | } |
2310 | } |
2311 | |
2312 | static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl, |
2313 | const struct cpumask *cpu_map, struct sched_domain_attr *attr, |
2314 | struct sched_domain *child, int cpu) |
2315 | { |
2316 | struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu); |
2317 | |
2318 | if (child) { |
2319 | sd->level = child->level + 1; |
2320 | sched_domain_level_max = max(sched_domain_level_max, sd->level); |
2321 | child->parent = sd; |
2322 | |
2323 | if (!cpumask_subset(src1p: sched_domain_span(sd: child), |
2324 | src2p: sched_domain_span(sd))) { |
2325 | pr_err("BUG: arch topology borken\n" ); |
2326 | #ifdef CONFIG_SCHED_DEBUG |
2327 | pr_err(" the %s domain not a subset of the %s domain\n" , |
2328 | child->name, sd->name); |
2329 | #endif |
2330 | /* Fixup, ensure @sd has at least @child CPUs. */ |
2331 | cpumask_or(dstp: sched_domain_span(sd), |
2332 | src1p: sched_domain_span(sd), |
2333 | src2p: sched_domain_span(sd: child)); |
2334 | } |
2335 | |
2336 | } |
2337 | set_domain_attribute(sd, attr); |
2338 | |
2339 | return sd; |
2340 | } |
2341 | |
2342 | /* |
2343 | * Ensure topology masks are sane, i.e. there are no conflicts (overlaps) for |
2344 | * any two given CPUs at this (non-NUMA) topology level. |
2345 | */ |
2346 | static bool topology_span_sane(struct sched_domain_topology_level *tl, |
2347 | const struct cpumask *cpu_map, int cpu) |
2348 | { |
2349 | int i; |
2350 | |
2351 | /* NUMA levels are allowed to overlap */ |
2352 | if (tl->flags & SDTL_OVERLAP) |
2353 | return true; |
2354 | |
2355 | /* |
2356 | * Non-NUMA levels cannot partially overlap - they must be either |
2357 | * completely equal or completely disjoint. Otherwise we can end up |
2358 | * breaking the sched_group lists - i.e. a later get_group() pass |
2359 | * breaks the linking done for an earlier span. |
2360 | */ |
2361 | for_each_cpu(i, cpu_map) { |
2362 | if (i == cpu) |
2363 | continue; |
2364 | /* |
2365 | * We should 'and' all those masks with 'cpu_map' to exactly |
2366 | * match the topology we're about to build, but that can only |
2367 | * remove CPUs, which only lessens our ability to detect |
* overlaps.
2369 | */ |
2370 | if (!cpumask_equal(src1p: tl->mask(cpu), src2p: tl->mask(i)) && |
2371 | cpumask_intersects(src1p: tl->mask(cpu), src2p: tl->mask(i))) |
2372 | return false; |
2373 | } |
2374 | |
2375 | return true; |
2376 | } |
2377 | |
2378 | /* |
2379 | * Build sched domains for a given set of CPUs and attach the sched domains |
2380 | * to the individual CPUs |
2381 | */ |
2382 | static int |
2383 | build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) |
2384 | { |
2385 | enum s_alloc alloc_state = sa_none; |
2386 | struct sched_domain *sd; |
2387 | struct s_data d; |
2388 | struct rq *rq = NULL; |
2389 | int i, ret = -ENOMEM; |
2390 | bool has_asym = false; |
2391 | bool has_cluster = false; |
2392 | |
2393 | if (WARN_ON(cpumask_empty(cpu_map))) |
2394 | goto error; |
2395 | |
2396 | alloc_state = __visit_domain_allocation_hell(d: &d, cpu_map); |
2397 | if (alloc_state != sa_rootdomain) |
2398 | goto error; |
2399 | |
2400 | /* Set up domains for CPUs specified by the cpu_map: */ |
2401 | for_each_cpu(i, cpu_map) { |
2402 | struct sched_domain_topology_level *tl; |
2403 | |
2404 | sd = NULL; |
2405 | for_each_sd_topology(tl) { |
2406 | |
2407 | if (WARN_ON(!topology_span_sane(tl, cpu_map, i))) |
2408 | goto error; |
2409 | |
2410 | sd = build_sched_domain(tl, cpu_map, attr, child: sd, cpu: i); |
2411 | |
2412 | has_asym |= sd->flags & SD_ASYM_CPUCAPACITY; |
2413 | |
2414 | if (tl == sched_domain_topology) |
2415 | *per_cpu_ptr(d.sd, i) = sd; |
2416 | if (tl->flags & SDTL_OVERLAP) |
2417 | sd->flags |= SD_OVERLAP; |
2418 | if (cpumask_equal(src1p: cpu_map, src2p: sched_domain_span(sd))) |
2419 | break; |
2420 | } |
2421 | } |
2422 | |
2423 | /* Build the groups for the domains */ |
2424 | for_each_cpu(i, cpu_map) { |
2425 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
2426 | sd->span_weight = cpumask_weight(srcp: sched_domain_span(sd)); |
2427 | if (sd->flags & SD_OVERLAP) { |
2428 | if (build_overlap_sched_groups(sd, cpu: i)) |
2429 | goto error; |
2430 | } else { |
2431 | if (build_sched_groups(sd, cpu: i)) |
2432 | goto error; |
2433 | } |
2434 | } |
2435 | } |
2436 | |
2437 | /* |
2438 | * Calculate an allowed NUMA imbalance such that LLCs do not get |
2439 | * imbalanced. |
2440 | */ |
2441 | for_each_cpu(i, cpu_map) { |
2442 | unsigned int imb = 0; |
2443 | unsigned int imb_span = 1; |
2444 | |
2445 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
2446 | struct sched_domain *child = sd->child; |
2447 | |
2448 | if (!(sd->flags & SD_SHARE_PKG_RESOURCES) && child && |
2449 | (child->flags & SD_SHARE_PKG_RESOURCES)) { |
2450 | struct sched_domain __rcu *top_p; |
2451 | unsigned int nr_llcs; |
2452 | |
2453 | /* |
2454 | * For a single LLC per node, allow an |
* imbalance up to 12.5% of the node. This is an
* arbitrary cutoff based on two factors -- SMT and
2457 | * memory channels. For SMT-2, the intent is to |
2458 | * avoid premature sharing of HT resources but |
2459 | * SMT-4 or SMT-8 *may* benefit from a different |
2460 | * cutoff. For memory channels, this is a very |
2461 | * rough estimate of how many channels may be |
2462 | * active and is based on recent CPUs with |
2463 | * many cores. |
2464 | * |
2465 | * For multiple LLCs, allow an imbalance |
2466 | * until multiple tasks would share an LLC |
2467 | * on one node while LLCs on another node |
2468 | * remain idle. This assumes that there are |
2469 | * enough logical CPUs per LLC to avoid SMT |
2470 | * factors and that there is a correlation |
2471 | * between LLCs and memory channels. |
2472 | */ |
2473 | nr_llcs = sd->span_weight / child->span_weight; |
2474 | if (nr_llcs == 1) |
2475 | imb = sd->span_weight >> 3; |
2476 | else |
2477 | imb = nr_llcs; |
2478 | imb = max(1U, imb); |
2479 | sd->imb_numa_nr = imb; |
2480 | |
2481 | /* Set span based on the first NUMA domain. */ |
2482 | top_p = sd->parent; |
2483 | while (top_p && !(top_p->flags & SD_NUMA)) { |
2484 | top_p = top_p->parent; |
2485 | } |
2486 | imb_span = top_p ? top_p->span_weight : sd->span_weight; |
2487 | } else { |
2488 | int factor = max(1U, (sd->span_weight / imb_span)); |
2489 | |
2490 | sd->imb_numa_nr = imb * factor; |
2491 | } |
2492 | } |
2493 | } |
2494 | |
2495 | /* Calculate CPU capacity for physical packages and nodes */ |
2496 | for (i = nr_cpumask_bits-1; i >= 0; i--) { |
2497 | if (!cpumask_test_cpu(cpu: i, cpumask: cpu_map)) |
2498 | continue; |
2499 | |
2500 | for (sd = *per_cpu_ptr(d.sd, i); sd; sd = sd->parent) { |
2501 | claim_allocations(cpu: i, sd); |
2502 | init_sched_groups_capacity(cpu: i, sd); |
2503 | } |
2504 | } |
2505 | |
2506 | /* Attach the domains */ |
2507 | rcu_read_lock(); |
2508 | for_each_cpu(i, cpu_map) { |
2509 | unsigned long capacity; |
2510 | |
2511 | rq = cpu_rq(i); |
2512 | sd = *per_cpu_ptr(d.sd, i); |
2513 | |
2514 | capacity = arch_scale_cpu_capacity(cpu: i); |
2515 | /* Use READ_ONCE()/WRITE_ONCE() to avoid load/store tearing: */ |
2516 | if (capacity > READ_ONCE(d.rd->max_cpu_capacity)) |
2517 | WRITE_ONCE(d.rd->max_cpu_capacity, capacity); |
2518 | |
2519 | cpu_attach_domain(sd, rd: d.rd, cpu: i); |
2520 | |
2521 | if (lowest_flag_domain(cpu: i, flag: SD_CLUSTER)) |
2522 | has_cluster = true; |
2523 | } |
2524 | rcu_read_unlock(); |
2525 | |
2526 | if (has_asym) |
2527 | static_branch_inc_cpuslocked(&sched_asym_cpucapacity); |
2528 | |
2529 | if (has_cluster) |
2530 | static_branch_inc_cpuslocked(&sched_cluster_active); |
2531 | |
2532 | if (rq && sched_debug_verbose) { |
2533 | pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n" , |
2534 | cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); |
2535 | } |
2536 | |
2537 | ret = 0; |
2538 | error: |
2539 | __free_domain_allocs(d: &d, what: alloc_state, cpu_map); |
2540 | |
2541 | return ret; |
2542 | } |
2543 | |
2544 | /* Current sched domains: */ |
2545 | static cpumask_var_t *doms_cur; |
2546 | |
2547 | /* Number of sched domains in 'doms_cur': */ |
2548 | static int ndoms_cur; |
2549 | |
2550 | /* Attributes of custom domains in 'doms_cur' */ |
2551 | static struct sched_domain_attr *dattr_cur; |
2552 | |
2553 | /* |
2554 | * Special case: If a kmalloc() of a doms_cur partition (array of |
* cpumask) fails, then fall back to a single sched domain,
2556 | * as determined by the single cpumask fallback_doms. |
2557 | */ |
2558 | static cpumask_var_t fallback_doms; |
2559 | |
2560 | /* |
2561 | * arch_update_cpu_topology lets virtualized architectures update the |
2562 | * CPU core maps. It is supposed to return 1 if the topology changed |
2563 | * or 0 if it stayed the same. |
2564 | */ |
2565 | int __weak arch_update_cpu_topology(void) |
2566 | { |
2567 | return 0; |
2568 | } |
2569 | |
2570 | cpumask_var_t *alloc_sched_domains(unsigned int ndoms) |
2571 | { |
2572 | int i; |
2573 | cpumask_var_t *doms; |
2574 | |
2575 | doms = kmalloc_array(n: ndoms, size: sizeof(*doms), GFP_KERNEL); |
2576 | if (!doms) |
2577 | return NULL; |
2578 | for (i = 0; i < ndoms; i++) { |
2579 | if (!alloc_cpumask_var(mask: &doms[i], GFP_KERNEL)) { |
2580 | free_sched_domains(doms, ndoms: i); |
2581 | return NULL; |
2582 | } |
2583 | } |
2584 | return doms; |
2585 | } |
2586 | |
2587 | void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms) |
2588 | { |
2589 | unsigned int i; |
2590 | for (i = 0; i < ndoms; i++) |
2591 | free_cpumask_var(mask: doms[i]); |
2592 | kfree(objp: doms); |
2593 | } |
2594 | |
2595 | /* |
2596 | * Set up scheduler domains and groups. For now this just excludes isolated |
2597 | * CPUs, but could be used to exclude other special cases in the future. |
2598 | */ |
2599 | int __init sched_init_domains(const struct cpumask *cpu_map) |
2600 | { |
2601 | int err; |
2602 | |
2603 | zalloc_cpumask_var(mask: &sched_domains_tmpmask, GFP_KERNEL); |
2604 | zalloc_cpumask_var(mask: &sched_domains_tmpmask2, GFP_KERNEL); |
2605 | zalloc_cpumask_var(mask: &fallback_doms, GFP_KERNEL); |
2606 | |
2607 | arch_update_cpu_topology(); |
2608 | asym_cpu_capacity_scan(); |
2609 | ndoms_cur = 1; |
2610 | doms_cur = alloc_sched_domains(ndoms: ndoms_cur); |
2611 | if (!doms_cur) |
2612 | doms_cur = &fallback_doms; |
2613 | cpumask_and(dstp: doms_cur[0], src1p: cpu_map, src2p: housekeeping_cpumask(type: HK_TYPE_DOMAIN)); |
2614 | err = build_sched_domains(cpu_map: doms_cur[0], NULL); |
2615 | |
2616 | return err; |
2617 | } |
2618 | |
2619 | /* |
* Detach sched domains from a group of CPUs specified in cpu_map.
* These CPUs will now be attached to the NULL domain.
2622 | */ |
2623 | static void detach_destroy_domains(const struct cpumask *cpu_map) |
2624 | { |
2625 | unsigned int cpu = cpumask_any(cpu_map); |
2626 | int i; |
2627 | |
2628 | if (rcu_access_pointer(per_cpu(sd_asym_cpucapacity, cpu))) |
2629 | static_branch_dec_cpuslocked(&sched_asym_cpucapacity); |
2630 | |
2631 | if (static_branch_unlikely(&sched_cluster_active)) |
2632 | static_branch_dec_cpuslocked(&sched_cluster_active); |
2633 | |
2634 | rcu_read_lock(); |
2635 | for_each_cpu(i, cpu_map) |
2636 | cpu_attach_domain(NULL, rd: &def_root_domain, cpu: i); |
2637 | rcu_read_unlock(); |
2638 | } |
2639 | |
2640 | /* handle null as "default" */ |
2641 | static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, |
2642 | struct sched_domain_attr *new, int idx_new) |
2643 | { |
2644 | struct sched_domain_attr tmp; |
2645 | |
2646 | /* Fast path: */ |
2647 | if (!new && !cur) |
2648 | return 1; |
2649 | |
2650 | tmp = SD_ATTR_INIT; |
2651 | |
2652 | return !memcmp(p: cur ? (cur + idx_cur) : &tmp, |
2653 | q: new ? (new + idx_new) : &tmp, |
2654 | size: sizeof(struct sched_domain_attr)); |
2655 | } |
2656 | |
2657 | /* |
2658 | * Partition sched domains as specified by the 'ndoms_new' |
2659 | * cpumasks in the array doms_new[] of cpumasks. This compares |
2660 | * doms_new[] to the current sched domain partitioning, doms_cur[]. |
2661 | * It destroys each deleted domain and builds each new domain. |
2662 | * |
2663 | * 'doms_new' is an array of cpumask_var_t's of length 'ndoms_new'. |
* The masks don't intersect (don't overlap). We should set up one
2665 | * sched domain for each mask. CPUs not in any of the cpumasks will |
2666 | * not be load balanced. If the same cpumask appears both in the |
2667 | * current 'doms_cur' domains and in the new 'doms_new', we can leave |
2668 | * it as it is. |
2669 | * |
2670 | * The passed in 'doms_new' should be allocated using |
2671 | * alloc_sched_domains. This routine takes ownership of it and will |
2672 | * free_sched_domains it when done with it. If the caller failed the |
2673 | * alloc call, then it can pass in doms_new == NULL && ndoms_new == 1, |
2674 | * and partition_sched_domains() will fallback to the single partition |
2675 | * 'fallback_doms', it also forces the domains to be rebuilt. |
2676 | * |
2677 | * If doms_new == NULL it will be replaced with cpu_online_mask. |
2678 | * ndoms_new == 0 is a special case for destroying existing domains, |
2679 | * and it will not create the default domain. |
2680 | * |
2681 | * Call with hotplug lock and sched_domains_mutex held |
2682 | */ |
2683 | void partition_sched_domains_locked(int ndoms_new, cpumask_var_t doms_new[], |
2684 | struct sched_domain_attr *dattr_new) |
2685 | { |
2686 | bool __maybe_unused has_eas = false; |
2687 | int i, j, n; |
2688 | int new_topology; |
2689 | |
2690 | lockdep_assert_held(&sched_domains_mutex); |
2691 | |
2692 | /* Let the architecture update CPU core mappings: */ |
2693 | new_topology = arch_update_cpu_topology(); |
2694 | /* Trigger rebuilding CPU capacity asymmetry data */ |
2695 | if (new_topology) |
2696 | asym_cpu_capacity_scan(); |
2697 | |
2698 | if (!doms_new) { |
2699 | WARN_ON_ONCE(dattr_new); |
2700 | n = 0; |
2701 | doms_new = alloc_sched_domains(ndoms: 1); |
2702 | if (doms_new) { |
2703 | n = 1; |
2704 | cpumask_and(dstp: doms_new[0], cpu_active_mask, |
2705 | src2p: housekeeping_cpumask(type: HK_TYPE_DOMAIN)); |
2706 | } |
2707 | } else { |
2708 | n = ndoms_new; |
2709 | } |
2710 | |
2711 | /* Destroy deleted domains: */ |
2712 | for (i = 0; i < ndoms_cur; i++) { |
2713 | for (j = 0; j < n && !new_topology; j++) { |
2714 | if (cpumask_equal(src1p: doms_cur[i], src2p: doms_new[j]) && |
2715 | dattrs_equal(cur: dattr_cur, idx_cur: i, new: dattr_new, idx_new: j)) { |
2716 | struct root_domain *rd; |
2717 | |
2718 | /* |
2719 | * This domain won't be destroyed and as such |
2720 | * its dl_bw->total_bw needs to be cleared. It |
* will be recomputed by
2722 | * update_tasks_root_domain(). |
2723 | */ |
2724 | rd = cpu_rq(cpumask_any(doms_cur[i]))->rd; |
2725 | dl_clear_root_domain(rd); |
2726 | goto match1; |
2727 | } |
2728 | } |
2729 | /* No match - a current sched domain not in new doms_new[] */ |
2730 | detach_destroy_domains(cpu_map: doms_cur[i]); |
2731 | match1: |
2732 | ; |
2733 | } |
2734 | |
2735 | n = ndoms_cur; |
2736 | if (!doms_new) { |
2737 | n = 0; |
2738 | doms_new = &fallback_doms; |
2739 | cpumask_and(dstp: doms_new[0], cpu_active_mask, |
2740 | src2p: housekeeping_cpumask(type: HK_TYPE_DOMAIN)); |
2741 | } |
2742 | |
2743 | /* Build new domains: */ |
2744 | for (i = 0; i < ndoms_new; i++) { |
2745 | for (j = 0; j < n && !new_topology; j++) { |
2746 | if (cpumask_equal(src1p: doms_new[i], src2p: doms_cur[j]) && |
2747 | dattrs_equal(cur: dattr_new, idx_cur: i, new: dattr_cur, idx_new: j)) |
2748 | goto match2; |
2749 | } |
2750 | /* No match - add a new doms_new */ |
2751 | build_sched_domains(cpu_map: doms_new[i], attr: dattr_new ? dattr_new + i : NULL); |
2752 | match2: |
2753 | ; |
2754 | } |
2755 | |
2756 | #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) |
2757 | /* Build perf. domains: */ |
2758 | for (i = 0; i < ndoms_new; i++) { |
2759 | for (j = 0; j < n && !sched_energy_update; j++) { |
2760 | if (cpumask_equal(src1p: doms_new[i], src2p: doms_cur[j]) && |
2761 | cpu_rq(cpumask_first(doms_cur[j]))->rd->pd) { |
2762 | has_eas = true; |
2763 | goto match3; |
2764 | } |
2765 | } |
2766 | /* No match - add perf. domains for a new rd */ |
2767 | has_eas |= build_perf_domains(cpu_map: doms_new[i]); |
2768 | match3: |
2769 | ; |
2770 | } |
2771 | sched_energy_set(has_eas); |
2772 | #endif |
2773 | |
2774 | /* Remember the new sched domains: */ |
2775 | if (doms_cur != &fallback_doms) |
2776 | free_sched_domains(doms: doms_cur, ndoms: ndoms_cur); |
2777 | |
2778 | kfree(objp: dattr_cur); |
2779 | doms_cur = doms_new; |
2780 | dattr_cur = dattr_new; |
2781 | ndoms_cur = ndoms_new; |
2782 | |
2783 | update_sched_domain_debugfs(); |
2784 | } |
2785 | |
2786 | /* |
2787 | * Call with hotplug lock held |
2788 | */ |
2789 | void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[], |
2790 | struct sched_domain_attr *dattr_new) |
2791 | { |
2792 | mutex_lock(&sched_domains_mutex); |
2793 | partition_sched_domains_locked(ndoms_new, doms_new, dattr_new); |
2794 | mutex_unlock(lock: &sched_domains_mutex); |
2795 | } |
2796 | |