// SPDX-License-Identifier: GPL-2.0-only
/*
 * acpi_pad.c ACPI Processor Aggregator Driver
 *
 * Copyright (c) 2009, Intel Corporation.
 */

#include <linux/kernel.h>
#include <linux/cpumask.h>
#include <linux/module.h>
#include <linux/init.h>
#include <linux/types.h>
#include <linux/kthread.h>
#include <uapi/linux/sched/types.h>
#include <linux/freezer.h>
#include <linux/cpu.h>
#include <linux/tick.h>
#include <linux/slab.h>
#include <linux/acpi.h>
#include <linux/perf_event.h>
#include <linux/platform_device.h>
#include <asm/mwait.h>
#include <xen/xen.h>

#define ACPI_PROCESSOR_AGGREGATOR_CLASS	"acpi_pad"
#define ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME "Processor Aggregator"
#define ACPI_PROCESSOR_AGGREGATOR_NOTIFY 0x80
static DEFINE_MUTEX(isolated_cpus_lock);
static DEFINE_MUTEX(round_robin_lock);

static unsigned long power_saving_mwait_eax;

static unsigned char tsc_detected_unstable;
static unsigned char tsc_marked_unstable;
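/*
 * Probe CPUID for MWAIT support and record the hint (EAX value) for the
 * deepest C-state the CPU advertises, so the power-saving threads can
 * request it via mwait_idle_with_hints().
 */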
static void power_saving_mwait_init(void)
{
	unsigned int eax, ebx, ecx, edx;
	unsigned int highest_cstate = 0;
	unsigned int highest_subcstate = 0;
	int i;

	if (!boot_cpu_has(X86_FEATURE_MWAIT))
		return;
	if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF)
		return;

	cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);

	if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
	    !(ecx & CPUID5_ECX_INTERRUPT_BREAK))
		return;

	edx >>= MWAIT_SUBSTATE_SIZE;
	for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) {
		if (edx & MWAIT_SUBSTATE_MASK) {
			highest_cstate = i;
			highest_subcstate = edx & MWAIT_SUBSTATE_MASK;
		}
	}
	power_saving_mwait_eax = (highest_cstate << MWAIT_SUBSTATE_SIZE) |
		(highest_subcstate - 1);

#if defined(CONFIG_X86)
	switch (boot_cpu_data.x86_vendor) {
	case X86_VENDOR_HYGON:
	case X86_VENDOR_AMD:
	case X86_VENDOR_INTEL:
	case X86_VENDOR_ZHAOXIN:
	case X86_VENDOR_CENTAUR:
		/*
		 * AMD Fam10h TSC will tick in all
		 * C/P/S0/S1 states when this bit is set.
		 */
		if (!boot_cpu_has(X86_FEATURE_NONSTOP_TSC))
			tsc_detected_unstable = 1;
		break;
	default:
		/* TSC could halt in idle */
		tsc_detected_unstable = 1;
	}
#endif
}

static unsigned long cpu_weight[NR_CPUS];
static int tsk_in_cpu[NR_CPUS] = {[0 ... NR_CPUS-1] = -1};
static DECLARE_BITMAP(pad_busy_cpus_bits, NR_CPUS);
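/*
 * Pick the least-used online CPU for power-saving thread @tsk_index and
 * migrate the current thread there.  HT siblings of CPUs that already host
 * an injection thread are avoided when possible.
 */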
static void round_robin_cpu(unsigned int tsk_index)
{
	struct cpumask *pad_busy_cpus = to_cpumask(pad_busy_cpus_bits);
	cpumask_var_t tmp;
	int cpu;
	unsigned long min_weight = -1;
	unsigned long preferred_cpu;

	if (!alloc_cpumask_var(&tmp, GFP_KERNEL))
		return;

	mutex_lock(&round_robin_lock);
	cpumask_clear(tmp);
	for_each_cpu(cpu, pad_busy_cpus)
		cpumask_or(tmp, tmp, topology_sibling_cpumask(cpu));
	cpumask_andnot(tmp, cpu_online_mask, tmp);
	/* avoid HT siblings if possible */
	if (cpumask_empty(tmp))
		cpumask_andnot(tmp, cpu_online_mask, pad_busy_cpus);
	if (cpumask_empty(tmp)) {
		mutex_unlock(&round_robin_lock);
		free_cpumask_var(tmp);
		return;
	}
	for_each_cpu(cpu, tmp) {
		if (cpu_weight[cpu] < min_weight) {
			min_weight = cpu_weight[cpu];
			preferred_cpu = cpu;
		}
	}

	if (tsk_in_cpu[tsk_index] != -1)
		cpumask_clear_cpu(tsk_in_cpu[tsk_index], pad_busy_cpus);
	tsk_in_cpu[tsk_index] = preferred_cpu;
	cpumask_set_cpu(preferred_cpu, pad_busy_cpus);
	cpu_weight[preferred_cpu]++;
	mutex_unlock(&round_robin_lock);

	set_cpus_allowed_ptr(current, cpumask_of(preferred_cpu));

	free_cpumask_var(tmp);
}

static void exit_round_robin(unsigned int tsk_index)
{
	struct cpumask *pad_busy_cpus = to_cpumask(pad_busy_cpus_bits);

	cpumask_clear_cpu(tsk_in_cpu[tsk_index], pad_busy_cpus);
	tsk_in_cpu[tsk_index] = -1;
}

static unsigned int idle_pct = 5; /* percentage */
static unsigned int round_robin_time = 1; /* second */
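/*
 * Main loop of each idle-injection kthread: run as a low-priority FIFO
 * task, hop to a fresh CPU every round_robin_time seconds, and spend
 * (100 - idle_pct)% of each interval in MWAIT with the tick suspended,
 * yielding the remaining idle_pct% so other tasks are not starved.
 */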
static int power_saving_thread(void *data)
{
	int do_sleep;
	unsigned int tsk_index = (unsigned long)data;
	u64 last_jiffies = 0;

	sched_set_fifo_low(current);

	while (!kthread_should_stop()) {
		unsigned long expire_time;

		/* round robin to cpus */
		expire_time = last_jiffies + round_robin_time * HZ;
		if (time_before(expire_time, jiffies)) {
			last_jiffies = jiffies;
			round_robin_cpu(tsk_index);
		}

		do_sleep = 0;

		expire_time = jiffies + HZ * (100 - idle_pct) / 100;

		while (!need_resched()) {
			if (tsc_detected_unstable && !tsc_marked_unstable) {
				/* TSC could halt in idle, so notify users */
				mark_tsc_unstable("TSC halts in idle");
				tsc_marked_unstable = 1;
			}
			local_irq_disable();

			perf_lopwr_cb(true);

			tick_broadcast_enable();
			tick_broadcast_enter();
			stop_critical_timings();

			mwait_idle_with_hints(power_saving_mwait_eax, 1);

			start_critical_timings();
			tick_broadcast_exit();

			perf_lopwr_cb(false);

			local_irq_enable();

			if (time_before(expire_time, jiffies)) {
				do_sleep = 1;
				break;
			}
		}

		/*
		 * The scheduler caps RT task runtime: once an RT task has used
		 * 95% of the CPU time, it is throttled for the remaining 5% so
		 * that other tasks are not starved.  That mechanism only works
		 * when every CPU runs an RT task; if one CPU has none, RT tasks
		 * from other CPUs borrow time from it and can exceed the 95%
		 * limit.  Take an explicit nap here so the anti-starvation
		 * logic still holds.
		 */
		if (unlikely(do_sleep))
			schedule_timeout_killable(HZ * idle_pct / 100);

		/*
		 * If an external event has set the need_resched flag, then
		 * we need to deal with it, or this loop will continue to
		 * spin without calling __mwait().
		 */
		if (unlikely(need_resched()))
			schedule();
	}

	exit_round_robin(tsk_index);
	return 0;
}

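/*
 * One kthread per injected-idle CPU.  ps_tsk_num tracks how many are
 * currently running; the array is filled densely from index 0.
 */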
static struct task_struct *ps_tsks[NR_CPUS];
static unsigned int ps_tsk_num;
static int create_power_saving_task(void)
{
	int rc;

	ps_tsks[ps_tsk_num] = kthread_run(power_saving_thread,
			(void *)(unsigned long)ps_tsk_num,
			"acpi_pad/%d", ps_tsk_num);

	if (IS_ERR(ps_tsks[ps_tsk_num])) {
		rc = PTR_ERR(ps_tsks[ps_tsk_num]);
		ps_tsks[ps_tsk_num] = NULL;
	} else {
		rc = 0;
		ps_tsk_num++;
	}

	return rc;
}

static void destroy_power_saving_task(void)
{
	if (ps_tsk_num > 0) {
		ps_tsk_num--;
		kthread_stop(ps_tsks[ps_tsk_num]);
		ps_tsks[ps_tsk_num] = NULL;
	}
}

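/* Grow or shrink the pool of power-saving threads to exactly @num. */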
static void set_power_saving_task_num(unsigned int num)
{
	if (num > ps_tsk_num) {
		while (ps_tsk_num < num) {
			if (create_power_saving_task())
				return;
		}
	} else if (num < ps_tsk_num) {
		while (ps_tsk_num > num)
			destroy_power_saving_task();
	}
}

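/*
 * Clamp the request to the number of online CPUs and resize the thread
 * pool under cpus_read_lock() so CPUs cannot go away underneath us.
 */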
static void acpi_pad_idle_cpus(unsigned int num_cpus)
{
	cpus_read_lock();

	num_cpus = min_t(unsigned int, num_cpus, num_online_cpus());
	set_power_saving_task_num(num_cpus);

	cpus_read_unlock();
}

static uint32_t acpi_pad_idle_cpus_num(void)
{
	return ps_tsk_num;
}

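/*
 * sysfs knobs: rrtime (seconds between CPU hops), idlepct (percentage of
 * each interval the injection thread yields back to other tasks) and
 * idlecpus (read the busy-CPU mask, or write a count to idle that many
 * CPUs directly).
 */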
static ssize_t rrtime_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	unsigned long num;

	if (kstrtoul(buf, 0, &num))
		return -EINVAL;
	if (num < 1 || num >= 100)
		return -EINVAL;
	mutex_lock(&isolated_cpus_lock);
	round_robin_time = num;
	mutex_unlock(&isolated_cpus_lock);
	return count;
}

static ssize_t rrtime_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", round_robin_time);
}
static DEVICE_ATTR_RW(rrtime);

static ssize_t idlepct_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	unsigned long num;

	if (kstrtoul(buf, 0, &num))
		return -EINVAL;
	if (num < 1 || num >= 100)
		return -EINVAL;
	mutex_lock(&isolated_cpus_lock);
	idle_pct = num;
	mutex_unlock(&isolated_cpus_lock);
	return count;
}

static ssize_t idlepct_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	return sysfs_emit(buf, "%d\n", idle_pct);
}
static DEVICE_ATTR_RW(idlepct);

static ssize_t idlecpus_store(struct device *dev,
		struct device_attribute *attr, const char *buf, size_t count)
{
	unsigned long num;

	if (kstrtoul(buf, 0, &num))
		return -EINVAL;
	mutex_lock(&isolated_cpus_lock);
	acpi_pad_idle_cpus(num);
	mutex_unlock(&isolated_cpus_lock);
	return count;
}

static ssize_t idlecpus_show(struct device *dev,
		struct device_attribute *attr, char *buf)
{
	return cpumap_print_to_pagebuf(false, buf,
		to_cpumask(pad_busy_cpus_bits));
}

static DEVICE_ATTR_RW(idlecpus);

static struct attribute *acpi_pad_attrs[] = {
	&dev_attr_idlecpus.attr,
	&dev_attr_idlepct.attr,
	&dev_attr_rrtime.attr,
	NULL
};

ATTRIBUTE_GROUPS(acpi_pad);
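
/*
 * Example usage from user space (the exact sysfs path depends on how
 * firmware enumerates the device, e.g. /sys/devices/platform/ACPI000C:00
 * on a typical system):
 *
 *   echo 2 > .../idlecpus	# force two CPUs into injected idle
 *   cat .../idlecpus		# list the CPUs currently kept idle
 *   echo 10 > .../idlepct	# yield 10% of each interval to other tasks
 */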

/*
 * Query firmware for the number of CPUs that should be idle.
 * Returns -1 on failure.
 */
static int acpi_pad_pur(acpi_handle handle)
{
	struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL};
	union acpi_object *package;
	int num = -1;

	if (ACPI_FAILURE(acpi_evaluate_object(handle, "_PUR", NULL, &buffer)))
		return num;

	if (!buffer.length || !buffer.pointer)
		return num;

	package = buffer.pointer;

	if (package->type == ACPI_TYPE_PACKAGE &&
		package->package.count == 2 &&
		package->package.elements[0].integer.value == 1) /* rev 1 */
		num = package->package.elements[1].integer.value;

	kfree(buffer.pointer);
	return num;
}

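/*
 * Handle a firmware request: evaluate _PUR, idle the requested number of
 * CPUs, then report the count actually idled back to firmware via _OST.
 */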
static void acpi_pad_handle_notify(acpi_handle handle)
{
	int num_cpus;
	uint32_t idle_cpus;
	struct acpi_buffer param = {
		.length = 4,
		.pointer = (void *)&idle_cpus,
	};

	mutex_lock(&isolated_cpus_lock);
	num_cpus = acpi_pad_pur(handle);
	if (num_cpus < 0) {
		mutex_unlock(&isolated_cpus_lock);
		return;
	}
	acpi_pad_idle_cpus(num_cpus);
	idle_cpus = acpi_pad_idle_cpus_num();
	acpi_evaluate_ost(handle, ACPI_PROCESSOR_AGGREGATOR_NOTIFY, 0, &param);
	mutex_unlock(&isolated_cpus_lock);
}

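/*
 * ACPI notify callback: dispatch the 0x80 "reevaluate _PUR" event and
 * forward it to user space as a netlink event.
 */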
static void acpi_pad_notify(acpi_handle handle, u32 event,
	void *data)
{
	struct acpi_device *adev = data;

	switch (event) {
	case ACPI_PROCESSOR_AGGREGATOR_NOTIFY:
		acpi_pad_handle_notify(handle);
		acpi_bus_generate_netlink_event(adev->pnp.device_class,
			dev_name(&adev->dev), event, 0);
		break;
	default:
		pr_warn("Unsupported event [0x%x]\n", event);
		break;
	}
}

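/* Bind to the ACPI000C companion device and install the notify handler. */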
static int acpi_pad_probe(struct platform_device *pdev)
{
	struct acpi_device *adev = ACPI_COMPANION(&pdev->dev);
	acpi_status status;

	strcpy(acpi_device_name(adev), ACPI_PROCESSOR_AGGREGATOR_DEVICE_NAME);
	strcpy(acpi_device_class(adev), ACPI_PROCESSOR_AGGREGATOR_CLASS);

	status = acpi_install_notify_handler(adev->handle,
		ACPI_DEVICE_NOTIFY, acpi_pad_notify, adev);

	if (ACPI_FAILURE(status))
		return -ENODEV;

	return 0;
}

static void acpi_pad_remove(struct platform_device *pdev)
{
	struct acpi_device *adev = ACPI_COMPANION(&pdev->dev);

	mutex_lock(&isolated_cpus_lock);
	acpi_pad_idle_cpus(0);
	mutex_unlock(&isolated_cpus_lock);

	acpi_remove_notify_handler(adev->handle,
		ACPI_DEVICE_NOTIFY, acpi_pad_notify);
}

static const struct acpi_device_id pad_device_ids[] = {
	{"ACPI000C", 0},
	{"", 0},
};
MODULE_DEVICE_TABLE(acpi, pad_device_ids);

static struct platform_driver acpi_pad_driver = {
	.probe = acpi_pad_probe,
	.remove_new = acpi_pad_remove,
	.driver = {
		.dev_groups = acpi_pad_groups,
		.name = "processor_aggregator",
		.acpi_match_table = pad_device_ids,
	},
};

static int __init acpi_pad_init(void)
{
	/* Xen ACPI PAD is used when running as Xen Dom0. */
	if (xen_initial_domain())
		return -ENODEV;

	power_saving_mwait_init();
	if (power_saving_mwait_eax == 0)
		return -EINVAL;

	return platform_driver_register(&acpi_pad_driver);
}

static void __exit acpi_pad_exit(void)
{
	platform_driver_unregister(&acpi_pad_driver);
}

module_init(acpi_pad_init);
module_exit(acpi_pad_exit);
MODULE_AUTHOR("Shaohua Li<shaohua.li@intel.com>");
MODULE_DESCRIPTION("ACPI Processor Aggregator Driver");
MODULE_LICENSE("GPL");