1 | // SPDX-License-Identifier: GPL-2.0-only |
/*
 * intel_powerclamp.c - package c-state idle injection
 *
 * Copyright (c) 2012-2023, Intel Corporation.
 *
 * Authors:
 *     Arjan van de Ven <arjan@linux.intel.com>
 *     Jacob Pan <jacob.jun.pan@linux.intel.com>
 *
 * TODO:
 * 1. Handle wakeups from external interrupts better. Currently a fixed
 *    compensation is added to the clamping duration when an excessive
 *    number of wakeups is observed during idle time. The reason is that
 *    for external interrupts that need no ack, clamping down the CPU in
 *    non-irq context does not reduce the irq rate. In the majority of
 *    cases clamping down the CPU does help reduce irqs as well, so we
 *    should be able to differentiate the two cases and give a
 *    quantitative solution for the irqs that we can control, perhaps
 *    based on get_cpu_iowait_time_us().
 *
 * 2. Synchronization with other hw blocks.
 */
24 | |
25 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
26 | |
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/delay.h>
#include <linux/cpu.h>
#include <linux/math64.h>
#include <linux/thermal.h>
#include <linux/debugfs.h>
#include <linux/seq_file.h>
#include <linux/idle_inject.h>

#include <asm/msr.h>
#include <asm/mwait.h>
#include <asm/cpu_device_id.h>
39 | |
/* Top of the idle-ratio scale; ratios in this driver are percentages. */
#define MAX_TARGET_RATIO (100U)

/*
 * Every clamping period that completes undisturbed (no extra wakeups
 * during the idle time) bumps the confidence counter of its target
 * ratio. Runtime calibration results are considered valid once that
 * counter reaches CONFIDENCE_OK.
 */
#define CONFIDENCE_OK (3)

/*
 * Default idle injection duration, in jiffies. The driver adjusts the
 * sleep time around it to meet the target idle ratio, similar to
 * frequency modulation.
 */
#define DEFAULT_DURATION_JIFFIES (6)
51 | |
52 | static unsigned int target_mwait; |
53 | static struct dentry *debug_dir; |
54 | static bool poll_pkg_cstate_enable; |
55 | |
56 | /* Idle ratio observed using package C-state counters */ |
57 | static unsigned int current_ratio; |
58 | |
59 | /* Skip the idle injection till set to true */ |
60 | static bool should_skip; |
61 | |
62 | struct powerclamp_data { |
63 | unsigned int cpu; |
64 | unsigned int count; |
65 | unsigned int guard; |
66 | unsigned int window_size_now; |
67 | unsigned int target_ratio; |
68 | bool clamping; |
69 | }; |
70 | |
71 | static struct powerclamp_data powerclamp_data; |
72 | |
/* Cooling device handle returned by the thermal core at init time */
static struct thermal_cooling_device *cooling_dev;

/* Serialises parameter updates and clamping start/stop/adjust */
static DEFINE_MUTEX(powerclamp_lock);

/* Forced idle time per injection cycle, in microseconds */
static unsigned int duration;
/* Package C-state residency ratio last computed by poll_pkg_cstate() */
static unsigned int pkg_cstate_ratio_cur;
/* Sliding window length, in clamping cycles */
static unsigned int window_size;
81 | |
82 | static int duration_set(const char *arg, const struct kernel_param *kp) |
83 | { |
84 | int ret = 0; |
85 | unsigned long new_duration; |
86 | |
87 | ret = kstrtoul(s: arg, base: 10, res: &new_duration); |
88 | if (ret) |
89 | goto exit; |
90 | if (new_duration > 25 || new_duration < 6) { |
91 | pr_err("Out of recommended range %lu, between 6-25ms\n" , |
92 | new_duration); |
93 | ret = -EINVAL; |
94 | goto exit; |
95 | } |
96 | |
97 | mutex_lock(&powerclamp_lock); |
98 | duration = clamp(new_duration, 6ul, 25ul) * 1000; |
99 | mutex_unlock(lock: &powerclamp_lock); |
100 | exit: |
101 | |
102 | return ret; |
103 | } |
104 | |
105 | static int duration_get(char *buf, const struct kernel_param *kp) |
106 | { |
107 | int ret; |
108 | |
109 | mutex_lock(&powerclamp_lock); |
110 | ret = sysfs_emit(buf, fmt: "%d\n" , duration / 1000); |
111 | mutex_unlock(lock: &powerclamp_lock); |
112 | |
113 | return ret; |
114 | } |
115 | |
116 | static const struct kernel_param_ops duration_ops = { |
117 | .set = duration_set, |
118 | .get = duration_get, |
119 | }; |
120 | |
121 | module_param_cb(duration, &duration_ops, NULL, 0644); |
122 | MODULE_PARM_DESC(duration, "forced idle time for each attempt in msec." ); |
123 | |
124 | #define DEFAULT_MAX_IDLE 50 |
125 | #define MAX_ALL_CPU_IDLE 75 |
126 | |
127 | static u8 max_idle = DEFAULT_MAX_IDLE; |
128 | |
129 | static cpumask_var_t idle_injection_cpu_mask; |
130 | |
131 | static int allocate_copy_idle_injection_mask(const struct cpumask *copy_mask) |
132 | { |
133 | if (cpumask_available(mask: idle_injection_cpu_mask)) |
134 | goto copy_mask; |
135 | |
136 | /* This mask is allocated only one time and freed during module exit */ |
137 | if (!alloc_cpumask_var(mask: &idle_injection_cpu_mask, GFP_KERNEL)) |
138 | return -ENOMEM; |
139 | |
140 | copy_mask: |
141 | cpumask_copy(dstp: idle_injection_cpu_mask, srcp: copy_mask); |
142 | |
143 | return 0; |
144 | } |
145 | |
146 | /* Return true if the cpumask and idle percent combination is invalid */ |
147 | static bool check_invalid(cpumask_var_t mask, u8 idle) |
148 | { |
149 | if (cpumask_equal(cpu_present_mask, src2p: mask) && idle > MAX_ALL_CPU_IDLE) |
150 | return true; |
151 | |
152 | return false; |
153 | } |
154 | |
155 | static int cpumask_set(const char *arg, const struct kernel_param *kp) |
156 | { |
157 | cpumask_var_t new_mask; |
158 | int ret; |
159 | |
160 | mutex_lock(&powerclamp_lock); |
161 | |
162 | /* Can't set mask when cooling device is in use */ |
163 | if (powerclamp_data.clamping) { |
164 | ret = -EAGAIN; |
165 | goto skip_cpumask_set; |
166 | } |
167 | |
168 | ret = alloc_cpumask_var(mask: &new_mask, GFP_KERNEL); |
169 | if (!ret) |
170 | goto skip_cpumask_set; |
171 | |
172 | ret = bitmap_parse(buf: arg, strlen(arg), cpumask_bits(new_mask), |
173 | nr_cpumask_bits); |
174 | if (ret) |
175 | goto free_cpumask_set; |
176 | |
177 | if (cpumask_empty(srcp: new_mask) || check_invalid(mask: new_mask, idle: max_idle)) { |
178 | ret = -EINVAL; |
179 | goto free_cpumask_set; |
180 | } |
181 | |
182 | /* |
183 | * When module parameters are passed from kernel command line |
184 | * during insmod, the module parameter callback is called |
185 | * before powerclamp_init(), so we can't assume that some |
186 | * cpumask can be allocated and copied before here. Also |
187 | * in this case this cpumask is used as the default mask. |
188 | */ |
189 | ret = allocate_copy_idle_injection_mask(copy_mask: new_mask); |
190 | |
191 | free_cpumask_set: |
192 | free_cpumask_var(mask: new_mask); |
193 | skip_cpumask_set: |
194 | mutex_unlock(lock: &powerclamp_lock); |
195 | |
196 | return ret; |
197 | } |
198 | |
199 | static int cpumask_get(char *buf, const struct kernel_param *kp) |
200 | { |
201 | if (!cpumask_available(mask: idle_injection_cpu_mask)) |
202 | return -ENODEV; |
203 | |
204 | return bitmap_print_to_pagebuf(list: false, buf, cpumask_bits(idle_injection_cpu_mask), |
205 | nr_cpumask_bits); |
206 | } |
207 | |
208 | static const struct kernel_param_ops cpumask_ops = { |
209 | .set = cpumask_set, |
210 | .get = cpumask_get, |
211 | }; |
212 | |
213 | module_param_cb(cpumask, &cpumask_ops, NULL, 0644); |
214 | MODULE_PARM_DESC(cpumask, "Mask of CPUs to use for idle injection." ); |
215 | |
216 | static int max_idle_set(const char *arg, const struct kernel_param *kp) |
217 | { |
218 | u8 new_max_idle; |
219 | int ret = 0; |
220 | |
221 | mutex_lock(&powerclamp_lock); |
222 | |
223 | /* Can't set mask when cooling device is in use */ |
224 | if (powerclamp_data.clamping) { |
225 | ret = -EAGAIN; |
226 | goto skip_limit_set; |
227 | } |
228 | |
229 | ret = kstrtou8(s: arg, base: 10, res: &new_max_idle); |
230 | if (ret) |
231 | goto skip_limit_set; |
232 | |
233 | if (new_max_idle > MAX_TARGET_RATIO) { |
234 | ret = -EINVAL; |
235 | goto skip_limit_set; |
236 | } |
237 | |
238 | if (!cpumask_available(mask: idle_injection_cpu_mask)) { |
239 | ret = allocate_copy_idle_injection_mask(cpu_present_mask); |
240 | if (ret) |
241 | goto skip_limit_set; |
242 | } |
243 | |
244 | if (check_invalid(mask: idle_injection_cpu_mask, idle: new_max_idle)) { |
245 | ret = -EINVAL; |
246 | goto skip_limit_set; |
247 | } |
248 | |
249 | max_idle = new_max_idle; |
250 | |
251 | skip_limit_set: |
252 | mutex_unlock(lock: &powerclamp_lock); |
253 | |
254 | return ret; |
255 | } |
256 | |
257 | static const struct kernel_param_ops max_idle_ops = { |
258 | .set = max_idle_set, |
259 | .get = param_get_byte, |
260 | }; |
261 | |
262 | module_param_cb(max_idle, &max_idle_ops, &max_idle, 0644); |
263 | MODULE_PARM_DESC(max_idle, "maximum injected idle time to the total CPU time ratio in percent range:1-100" ); |
264 | |
265 | struct powerclamp_calibration_data { |
266 | unsigned long confidence; /* used for calibration, basically a counter |
267 | * gets incremented each time a clamping |
268 | * period is completed without extra wakeups |
269 | * once that counter is reached given level, |
270 | * compensation is deemed usable. |
271 | */ |
272 | unsigned long steady_comp; /* steady state compensation used when |
273 | * no extra wakeups occurred. |
274 | */ |
275 | unsigned long dynamic_comp; /* compensate excessive wakeup from idle |
276 | * mostly from external interrupts. |
277 | */ |
278 | }; |
279 | |
280 | static struct powerclamp_calibration_data cal_data[MAX_TARGET_RATIO]; |
281 | |
282 | static int window_size_set(const char *arg, const struct kernel_param *kp) |
283 | { |
284 | int ret = 0; |
285 | unsigned long new_window_size; |
286 | |
287 | ret = kstrtoul(s: arg, base: 10, res: &new_window_size); |
288 | if (ret) |
289 | goto exit_win; |
290 | if (new_window_size > 10 || new_window_size < 2) { |
291 | pr_err("Out of recommended window size %lu, between 2-10\n" , |
292 | new_window_size); |
293 | ret = -EINVAL; |
294 | } |
295 | |
296 | window_size = clamp(new_window_size, 2ul, 10ul); |
297 | smp_mb(); |
298 | |
299 | exit_win: |
300 | |
301 | return ret; |
302 | } |
303 | |
304 | static const struct kernel_param_ops window_size_ops = { |
305 | .set = window_size_set, |
306 | .get = param_get_int, |
307 | }; |
308 | |
309 | module_param_cb(window_size, &window_size_ops, &window_size, 0644); |
310 | MODULE_PARM_DESC(window_size, "sliding window in number of clamping cycles\n" |
311 | "\tpowerclamp controls idle ratio within this window. larger\n" |
312 | "\twindow size results in slower response time but more smooth\n" |
313 | "\tclamping results. default to 2." ); |
314 | |
315 | static void find_target_mwait(void) |
316 | { |
317 | unsigned int eax, ebx, ecx, edx; |
318 | unsigned int highest_cstate = 0; |
319 | unsigned int highest_subcstate = 0; |
320 | int i; |
321 | |
322 | if (boot_cpu_data.cpuid_level < CPUID_MWAIT_LEAF) |
323 | return; |
324 | |
325 | cpuid(CPUID_MWAIT_LEAF, eax: &eax, ebx: &ebx, ecx: &ecx, edx: &edx); |
326 | |
327 | if (!(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) || |
328 | !(ecx & CPUID5_ECX_INTERRUPT_BREAK)) |
329 | return; |
330 | |
331 | edx >>= MWAIT_SUBSTATE_SIZE; |
332 | for (i = 0; i < 7 && edx; i++, edx >>= MWAIT_SUBSTATE_SIZE) { |
333 | if (edx & MWAIT_SUBSTATE_MASK) { |
334 | highest_cstate = i; |
335 | highest_subcstate = edx & MWAIT_SUBSTATE_MASK; |
336 | } |
337 | } |
338 | target_mwait = (highest_cstate << MWAIT_SUBSTATE_SIZE) | |
339 | (highest_subcstate - 1); |
340 | |
341 | } |
342 | |
343 | struct pkg_cstate_info { |
344 | bool skip; |
345 | int msr_index; |
346 | int cstate_id; |
347 | }; |
348 | |
349 | #define PKG_CSTATE_INIT(id) { \ |
350 | .msr_index = MSR_PKG_C##id##_RESIDENCY, \ |
351 | .cstate_id = id \ |
352 | } |
353 | |
354 | static struct pkg_cstate_info pkg_cstates[] = { |
355 | PKG_CSTATE_INIT(2), |
356 | PKG_CSTATE_INIT(3), |
357 | PKG_CSTATE_INIT(6), |
358 | PKG_CSTATE_INIT(7), |
359 | PKG_CSTATE_INIT(8), |
360 | PKG_CSTATE_INIT(9), |
361 | PKG_CSTATE_INIT(10), |
362 | {NULL}, |
363 | }; |
364 | |
365 | static bool has_pkg_state_counter(void) |
366 | { |
367 | u64 val; |
368 | struct pkg_cstate_info *info = pkg_cstates; |
369 | |
370 | /* check if any one of the counter msrs exists */ |
371 | while (info->msr_index) { |
372 | if (!rdmsrl_safe(msr: info->msr_index, p: &val)) |
373 | return true; |
374 | info++; |
375 | } |
376 | |
377 | return false; |
378 | } |
379 | |
380 | static u64 pkg_state_counter(void) |
381 | { |
382 | u64 val; |
383 | u64 count = 0; |
384 | struct pkg_cstate_info *info = pkg_cstates; |
385 | |
386 | while (info->msr_index) { |
387 | if (!info->skip) { |
388 | if (!rdmsrl_safe(msr: info->msr_index, p: &val)) |
389 | count += val; |
390 | else |
391 | info->skip = true; |
392 | } |
393 | info++; |
394 | } |
395 | |
396 | return count; |
397 | } |
398 | |
399 | static unsigned int get_compensation(int ratio) |
400 | { |
401 | unsigned int comp = 0; |
402 | |
403 | if (!poll_pkg_cstate_enable) |
404 | return 0; |
405 | |
406 | /* we only use compensation if all adjacent ones are good */ |
407 | if (ratio == 1 && |
408 | cal_data[ratio].confidence >= CONFIDENCE_OK && |
409 | cal_data[ratio + 1].confidence >= CONFIDENCE_OK && |
410 | cal_data[ratio + 2].confidence >= CONFIDENCE_OK) { |
411 | comp = (cal_data[ratio].steady_comp + |
412 | cal_data[ratio + 1].steady_comp + |
413 | cal_data[ratio + 2].steady_comp) / 3; |
414 | } else if (ratio == MAX_TARGET_RATIO - 1 && |
415 | cal_data[ratio].confidence >= CONFIDENCE_OK && |
416 | cal_data[ratio - 1].confidence >= CONFIDENCE_OK && |
417 | cal_data[ratio - 2].confidence >= CONFIDENCE_OK) { |
418 | comp = (cal_data[ratio].steady_comp + |
419 | cal_data[ratio - 1].steady_comp + |
420 | cal_data[ratio - 2].steady_comp) / 3; |
421 | } else if (cal_data[ratio].confidence >= CONFIDENCE_OK && |
422 | cal_data[ratio - 1].confidence >= CONFIDENCE_OK && |
423 | cal_data[ratio + 1].confidence >= CONFIDENCE_OK) { |
424 | comp = (cal_data[ratio].steady_comp + |
425 | cal_data[ratio - 1].steady_comp + |
426 | cal_data[ratio + 1].steady_comp) / 3; |
427 | } |
428 | |
429 | /* do not exceed limit */ |
430 | if (comp + ratio >= MAX_TARGET_RATIO) |
431 | comp = MAX_TARGET_RATIO - ratio - 1; |
432 | |
433 | return comp; |
434 | } |
435 | |
436 | static void adjust_compensation(int target_ratio, unsigned int win) |
437 | { |
438 | int delta; |
439 | struct powerclamp_calibration_data *d = &cal_data[target_ratio]; |
440 | |
441 | /* |
442 | * adjust compensations if confidence level has not been reached. |
443 | */ |
444 | if (d->confidence >= CONFIDENCE_OK) |
445 | return; |
446 | |
447 | delta = powerclamp_data.target_ratio - current_ratio; |
448 | /* filter out bad data */ |
449 | if (delta >= 0 && delta <= (1+target_ratio/10)) { |
450 | if (d->steady_comp) |
451 | d->steady_comp = |
452 | roundup(delta+d->steady_comp, 2)/2; |
453 | else |
454 | d->steady_comp = delta; |
455 | d->confidence++; |
456 | } |
457 | } |
458 | |
459 | static bool powerclamp_adjust_controls(unsigned int target_ratio, |
460 | unsigned int guard, unsigned int win) |
461 | { |
462 | static u64 msr_last, tsc_last; |
463 | u64 msr_now, tsc_now; |
464 | u64 val64; |
465 | |
466 | /* check result for the last window */ |
467 | msr_now = pkg_state_counter(); |
468 | tsc_now = rdtsc(); |
469 | |
470 | /* calculate pkg cstate vs tsc ratio */ |
471 | if (!msr_last || !tsc_last) |
472 | current_ratio = 1; |
473 | else if (tsc_now-tsc_last) { |
474 | val64 = 100*(msr_now-msr_last); |
475 | do_div(val64, (tsc_now-tsc_last)); |
476 | current_ratio = val64; |
477 | } |
478 | |
479 | /* update record */ |
480 | msr_last = msr_now; |
481 | tsc_last = tsc_now; |
482 | |
483 | adjust_compensation(target_ratio, win); |
484 | |
485 | /* if we are above target+guard, skip */ |
486 | return powerclamp_data.target_ratio + guard <= current_ratio; |
487 | } |
488 | |
489 | /* |
490 | * This function calculates runtime from the current target ratio. |
491 | * This function gets called under powerclamp_lock. |
492 | */ |
493 | static unsigned int get_run_time(void) |
494 | { |
495 | unsigned int compensated_ratio; |
496 | unsigned int runtime; |
497 | |
498 | /* |
499 | * make sure user selected ratio does not take effect until |
500 | * the next round. adjust target_ratio if user has changed |
501 | * target such that we can converge quickly. |
502 | */ |
503 | powerclamp_data.guard = 1 + powerclamp_data.target_ratio / 20; |
504 | powerclamp_data.window_size_now = window_size; |
505 | |
506 | /* |
507 | * systems may have different ability to enter package level |
508 | * c-states, thus we need to compensate the injected idle ratio |
509 | * to achieve the actual target reported by the HW. |
510 | */ |
511 | compensated_ratio = powerclamp_data.target_ratio + |
512 | get_compensation(ratio: powerclamp_data.target_ratio); |
513 | if (compensated_ratio <= 0) |
514 | compensated_ratio = 1; |
515 | |
516 | runtime = duration * 100 / compensated_ratio - duration; |
517 | |
518 | return runtime; |
519 | } |
520 | |
521 | /* |
522 | * 1 HZ polling while clamping is active, useful for userspace |
523 | * to monitor actual idle ratio. |
524 | */ |
525 | static void poll_pkg_cstate(struct work_struct *dummy); |
526 | static DECLARE_DELAYED_WORK(poll_pkg_cstate_work, poll_pkg_cstate); |
527 | static void poll_pkg_cstate(struct work_struct *dummy) |
528 | { |
529 | static u64 msr_last; |
530 | static u64 tsc_last; |
531 | |
532 | u64 msr_now; |
533 | u64 tsc_now; |
534 | u64 val64; |
535 | |
536 | msr_now = pkg_state_counter(); |
537 | tsc_now = rdtsc(); |
538 | |
539 | /* calculate pkg cstate vs tsc ratio */ |
540 | if (!msr_last || !tsc_last) |
541 | pkg_cstate_ratio_cur = 1; |
542 | else { |
543 | if (tsc_now - tsc_last) { |
544 | val64 = 100 * (msr_now - msr_last); |
545 | do_div(val64, (tsc_now - tsc_last)); |
546 | pkg_cstate_ratio_cur = val64; |
547 | } |
548 | } |
549 | |
550 | /* update record */ |
551 | msr_last = msr_now; |
552 | tsc_last = tsc_now; |
553 | |
554 | mutex_lock(&powerclamp_lock); |
555 | if (powerclamp_data.clamping) |
556 | schedule_delayed_work(dwork: &poll_pkg_cstate_work, HZ); |
557 | mutex_unlock(lock: &powerclamp_lock); |
558 | } |
559 | |
560 | static struct idle_inject_device *ii_dev; |
561 | |
562 | /* |
563 | * This function is called from idle injection core on timer expiry |
564 | * for the run duration. This allows powerclamp to readjust or skip |
565 | * injecting idle for this cycle. |
566 | */ |
567 | static bool idle_inject_update(void) |
568 | { |
569 | bool update = false; |
570 | |
571 | /* We can't sleep in this callback */ |
572 | if (!mutex_trylock(lock: &powerclamp_lock)) |
573 | return true; |
574 | |
575 | if (!(powerclamp_data.count % powerclamp_data.window_size_now)) { |
576 | |
577 | should_skip = powerclamp_adjust_controls(target_ratio: powerclamp_data.target_ratio, |
578 | guard: powerclamp_data.guard, |
579 | win: powerclamp_data.window_size_now); |
580 | update = true; |
581 | } |
582 | |
583 | if (update) { |
584 | unsigned int runtime = get_run_time(); |
585 | |
586 | idle_inject_set_duration(ii_dev, run_duration_us: runtime, idle_duration_us: duration); |
587 | } |
588 | |
589 | powerclamp_data.count++; |
590 | |
591 | mutex_unlock(lock: &powerclamp_lock); |
592 | |
593 | if (should_skip) |
594 | return false; |
595 | |
596 | return true; |
597 | } |
598 | |
599 | /* This function starts idle injection by calling idle_inject_start() */ |
600 | static void trigger_idle_injection(void) |
601 | { |
602 | unsigned int runtime = get_run_time(); |
603 | |
604 | idle_inject_set_duration(ii_dev, run_duration_us: runtime, idle_duration_us: duration); |
605 | idle_inject_start(ii_dev); |
606 | powerclamp_data.clamping = true; |
607 | } |
608 | |
609 | /* |
610 | * This function is called from start_power_clamp() to register |
611 | * CPUS with powercap idle injection register and set default |
612 | * idle duration and latency. |
613 | */ |
614 | static int powerclamp_idle_injection_register(void) |
615 | { |
616 | poll_pkg_cstate_enable = false; |
617 | if (cpumask_equal(cpu_present_mask, src2p: idle_injection_cpu_mask)) { |
618 | ii_dev = idle_inject_register_full(cpumask: idle_injection_cpu_mask, update: idle_inject_update); |
619 | if (topology_max_packages() == 1 && topology_max_die_per_package() == 1) |
620 | poll_pkg_cstate_enable = true; |
621 | } else { |
622 | ii_dev = idle_inject_register(cpumask: idle_injection_cpu_mask); |
623 | } |
624 | |
625 | if (!ii_dev) { |
626 | pr_err("powerclamp: idle_inject_register failed\n" ); |
627 | return -EAGAIN; |
628 | } |
629 | |
630 | idle_inject_set_duration(ii_dev, TICK_USEC, idle_duration_us: duration); |
631 | idle_inject_set_latency(ii_dev, UINT_MAX); |
632 | |
633 | return 0; |
634 | } |
635 | |
636 | /* |
637 | * This function is called from end_power_clamp() to stop idle injection |
638 | * and unregister CPUS from powercap idle injection core. |
639 | */ |
640 | static void remove_idle_injection(void) |
641 | { |
642 | if (!powerclamp_data.clamping) |
643 | return; |
644 | |
645 | powerclamp_data.clamping = false; |
646 | idle_inject_stop(ii_dev); |
647 | } |
648 | |
649 | /* |
650 | * This function is called when user change the cooling device |
651 | * state from zero to some other value. |
652 | */ |
653 | static int start_power_clamp(void) |
654 | { |
655 | int ret; |
656 | |
657 | ret = powerclamp_idle_injection_register(); |
658 | if (!ret) { |
659 | trigger_idle_injection(); |
660 | if (poll_pkg_cstate_enable) |
661 | schedule_delayed_work(dwork: &poll_pkg_cstate_work, delay: 0); |
662 | } |
663 | |
664 | return ret; |
665 | } |
666 | |
667 | /* |
668 | * This function is called when user change the cooling device |
669 | * state from non zero value zero. |
670 | */ |
671 | static void end_power_clamp(void) |
672 | { |
673 | if (powerclamp_data.clamping) { |
674 | remove_idle_injection(); |
675 | idle_inject_unregister(ii_dev); |
676 | } |
677 | } |
678 | |
679 | static int powerclamp_get_max_state(struct thermal_cooling_device *cdev, |
680 | unsigned long *state) |
681 | { |
682 | *state = MAX_TARGET_RATIO; |
683 | |
684 | return 0; |
685 | } |
686 | |
687 | static int powerclamp_get_cur_state(struct thermal_cooling_device *cdev, |
688 | unsigned long *state) |
689 | { |
690 | mutex_lock(&powerclamp_lock); |
691 | *state = powerclamp_data.target_ratio; |
692 | mutex_unlock(lock: &powerclamp_lock); |
693 | |
694 | return 0; |
695 | } |
696 | |
697 | static int powerclamp_set_cur_state(struct thermal_cooling_device *cdev, |
698 | unsigned long new_target_ratio) |
699 | { |
700 | int ret = 0; |
701 | |
702 | mutex_lock(&powerclamp_lock); |
703 | |
704 | new_target_ratio = clamp(new_target_ratio, 0UL, |
705 | (unsigned long) (max_idle - 1)); |
706 | |
707 | if (powerclamp_data.target_ratio == new_target_ratio) |
708 | goto exit_set; |
709 | |
710 | if (!powerclamp_data.target_ratio && new_target_ratio > 0) { |
711 | pr_info("Start idle injection to reduce power\n" ); |
712 | powerclamp_data.target_ratio = new_target_ratio; |
713 | ret = start_power_clamp(); |
714 | if (ret) |
715 | powerclamp_data.target_ratio = 0; |
716 | goto exit_set; |
717 | } else if (powerclamp_data.target_ratio > 0 && new_target_ratio == 0) { |
718 | pr_info("Stop forced idle injection\n" ); |
719 | end_power_clamp(); |
720 | powerclamp_data.target_ratio = 0; |
721 | } else /* adjust currently running */ { |
722 | unsigned int runtime; |
723 | |
724 | powerclamp_data.target_ratio = new_target_ratio; |
725 | runtime = get_run_time(); |
726 | idle_inject_set_duration(ii_dev, run_duration_us: runtime, idle_duration_us: duration); |
727 | } |
728 | |
729 | exit_set: |
730 | mutex_unlock(lock: &powerclamp_lock); |
731 | |
732 | return ret; |
733 | } |
734 | |
735 | /* bind to generic thermal layer as cooling device*/ |
736 | static const struct thermal_cooling_device_ops powerclamp_cooling_ops = { |
737 | .get_max_state = powerclamp_get_max_state, |
738 | .get_cur_state = powerclamp_get_cur_state, |
739 | .set_cur_state = powerclamp_set_cur_state, |
740 | }; |
741 | |
742 | static const struct x86_cpu_id __initconst intel_powerclamp_ids[] = { |
743 | X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_MWAIT, NULL), |
744 | {} |
745 | }; |
746 | MODULE_DEVICE_TABLE(x86cpu, intel_powerclamp_ids); |
747 | |
748 | static int __init powerclamp_probe(void) |
749 | { |
750 | |
751 | if (!x86_match_cpu(match: intel_powerclamp_ids)) { |
752 | pr_err("CPU does not support MWAIT\n" ); |
753 | return -ENODEV; |
754 | } |
755 | |
756 | /* The goal for idle time alignment is to achieve package cstate. */ |
757 | if (!has_pkg_state_counter()) { |
758 | pr_info("No package C-state available\n" ); |
759 | return -ENODEV; |
760 | } |
761 | |
762 | /* find the deepest mwait value */ |
763 | find_target_mwait(); |
764 | |
765 | return 0; |
766 | } |
767 | |
768 | static int powerclamp_debug_show(struct seq_file *m, void *unused) |
769 | { |
770 | int i = 0; |
771 | |
772 | seq_printf(m, fmt: "pct confidence steady dynamic (compensation)\n" ); |
773 | for (i = 0; i < MAX_TARGET_RATIO; i++) { |
774 | seq_printf(m, fmt: "%d\t%lu\t%lu\t%lu\n" , |
775 | i, |
776 | cal_data[i].confidence, |
777 | cal_data[i].steady_comp, |
778 | cal_data[i].dynamic_comp); |
779 | } |
780 | |
781 | return 0; |
782 | } |
783 | |
784 | DEFINE_SHOW_ATTRIBUTE(powerclamp_debug); |
785 | |
786 | static inline void powerclamp_create_debug_files(void) |
787 | { |
788 | debug_dir = debugfs_create_dir(name: "intel_powerclamp" , NULL); |
789 | |
790 | debugfs_create_file(name: "powerclamp_calib" , S_IRUGO, parent: debug_dir, data: cal_data, |
791 | fops: &powerclamp_debug_fops); |
792 | } |
793 | |
794 | static int __init powerclamp_init(void) |
795 | { |
796 | int retval; |
797 | |
798 | /* probe cpu features and ids here */ |
799 | retval = powerclamp_probe(); |
800 | if (retval) |
801 | return retval; |
802 | |
803 | mutex_lock(&powerclamp_lock); |
804 | if (!cpumask_available(mask: idle_injection_cpu_mask)) |
805 | retval = allocate_copy_idle_injection_mask(cpu_present_mask); |
806 | mutex_unlock(lock: &powerclamp_lock); |
807 | |
808 | if (retval) |
809 | return retval; |
810 | |
811 | /* set default limit, maybe adjusted during runtime based on feedback */ |
812 | window_size = 2; |
813 | |
814 | cooling_dev = thermal_cooling_device_register("intel_powerclamp" , NULL, |
815 | &powerclamp_cooling_ops); |
816 | if (IS_ERR(ptr: cooling_dev)) |
817 | return -ENODEV; |
818 | |
819 | if (!duration) |
820 | duration = jiffies_to_usecs(DEFAULT_DURATION_JIFFIES); |
821 | |
822 | powerclamp_create_debug_files(); |
823 | |
824 | return 0; |
825 | } |
826 | module_init(powerclamp_init); |
827 | |
828 | static void __exit powerclamp_exit(void) |
829 | { |
830 | mutex_lock(&powerclamp_lock); |
831 | end_power_clamp(); |
832 | mutex_unlock(lock: &powerclamp_lock); |
833 | |
834 | thermal_cooling_device_unregister(cooling_dev); |
835 | |
836 | cancel_delayed_work_sync(dwork: &poll_pkg_cstate_work); |
837 | debugfs_remove_recursive(dentry: debug_dir); |
838 | |
839 | if (cpumask_available(mask: idle_injection_cpu_mask)) |
840 | free_cpumask_var(mask: idle_injection_cpu_mask); |
841 | } |
842 | module_exit(powerclamp_exit); |
843 | |
844 | MODULE_IMPORT_NS(IDLE_INJECT); |
845 | |
846 | MODULE_LICENSE("GPL" ); |
847 | MODULE_AUTHOR("Arjan van de Ven <arjan@linux.intel.com>" ); |
848 | MODULE_AUTHOR("Jacob Pan <jacob.jun.pan@linux.intel.com>" ); |
849 | MODULE_DESCRIPTION("Package Level C-state Idle Injection for Intel CPUs" ); |
850 | |