1 | /* |
2 | * x86_pkg_temp_thermal driver |
3 | * Copyright (c) 2013, Intel Corporation. |
4 | * |
5 | * This program is free software; you can redistribute it and/or modify it |
6 | * under the terms and conditions of the GNU General Public License, |
7 | * version 2, as published by the Free Software Foundation. |
8 | * |
9 | * This program is distributed in the hope it will be useful, but WITHOUT |
10 | * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or |
11 | * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for |
12 | * more details. |
13 | * |
14 | * You should have received a copy of the GNU General Public License along with |
15 | * this program; if not, write to the Free Software Foundation, Inc. |
16 | * |
17 | */ |
18 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
19 | |
20 | #include <linux/module.h> |
21 | #include <linux/init.h> |
22 | #include <linux/err.h> |
23 | #include <linux/param.h> |
24 | #include <linux/device.h> |
25 | #include <linux/platform_device.h> |
26 | #include <linux/cpu.h> |
27 | #include <linux/smp.h> |
28 | #include <linux/slab.h> |
29 | #include <linux/pm.h> |
30 | #include <linux/thermal.h> |
31 | #include <linux/debugfs.h> |
32 | #include <asm/cpu_device_id.h> |
33 | #include <asm/mce.h> |
34 | |
35 | /* |
36 | * Rate control delay: Idea is to introduce denounce effect |
37 | * This should be long enough to avoid reduce events, when |
38 | * threshold is set to a temperature, which is constantly |
39 | * violated, but at the short enough to take any action. |
40 | * The action can be remove threshold or change it to next |
41 | * interesting setting. Based on experiments, in around |
42 | * every 5 seconds under load will give us a significant |
43 | * temperature change. |
44 | */ |
45 | #define PKG_TEMP_THERMAL_NOTIFY_DELAY 5000 |
46 | static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY; |
47 | module_param(notify_delay_ms, int, 0644); |
48 | MODULE_PARM_DESC(notify_delay_ms, |
49 | "User space notification delay in milli seconds." ); |
50 | |
/*
 * Upper bound on the number of trip points in a thermal zone. The MSR
 * only allows setting and getting notifications for two thresholds, so
 * this define caps the count in case cpuid reports a bogus number of
 * thresholds.
 */
#define MAX_NUMBER_OF_TRIPS	2
57 | |
58 | struct pkg_device { |
59 | int cpu; |
60 | bool work_scheduled; |
61 | u32 tj_max; |
62 | u32 msr_pkg_therm_low; |
63 | u32 msr_pkg_therm_high; |
64 | struct delayed_work work; |
65 | struct thermal_zone_device *tzone; |
66 | struct cpumask cpumask; |
67 | }; |
68 | |
69 | static struct thermal_zone_params pkg_temp_tz_params = { |
70 | .no_hwmon = true, |
71 | }; |
72 | |
73 | /* Keep track of how many package pointers we allocated in init() */ |
74 | static int max_packages __read_mostly; |
75 | /* Array of package pointers */ |
76 | static struct pkg_device **packages; |
77 | /* Serializes interrupt notification, work and hotplug */ |
78 | static DEFINE_SPINLOCK(pkg_temp_lock); |
79 | /* Protects zone operation in the work function against hotplug removal */ |
80 | static DEFINE_MUTEX(thermal_zone_mutex); |
81 | |
82 | /* The dynamically assigned cpu hotplug state for module_exit() */ |
83 | static enum cpuhp_state pkg_thermal_hp_state __read_mostly; |
84 | |
85 | /* Debug counters to show using debugfs */ |
86 | static struct dentry *debugfs; |
87 | static unsigned int pkg_interrupt_cnt; |
88 | static unsigned int pkg_work_cnt; |
89 | |
90 | static int pkg_temp_debugfs_init(void) |
91 | { |
92 | struct dentry *d; |
93 | |
94 | debugfs = debugfs_create_dir("pkg_temp_thermal" , NULL); |
95 | if (!debugfs) |
96 | return -ENOENT; |
97 | |
98 | d = debugfs_create_u32("pkg_thres_interrupt" , S_IRUGO, debugfs, |
99 | &pkg_interrupt_cnt); |
100 | if (!d) |
101 | goto err_out; |
102 | |
103 | d = debugfs_create_u32("pkg_thres_work" , S_IRUGO, debugfs, |
104 | &pkg_work_cnt); |
105 | if (!d) |
106 | goto err_out; |
107 | |
108 | return 0; |
109 | |
110 | err_out: |
111 | debugfs_remove_recursive(debugfs); |
112 | return -ENOENT; |
113 | } |
114 | |
115 | /* |
116 | * Protection: |
117 | * |
118 | * - cpu hotplug: Read serialized by cpu hotplug lock |
119 | * Write must hold pkg_temp_lock |
120 | * |
121 | * - Other callsites: Must hold pkg_temp_lock |
122 | */ |
123 | static struct pkg_device *pkg_temp_thermal_get_dev(unsigned int cpu) |
124 | { |
125 | int pkgid = topology_logical_package_id(cpu); |
126 | |
127 | if (pkgid >= 0 && pkgid < max_packages) |
128 | return packages[pkgid]; |
129 | return NULL; |
130 | } |
131 | |
132 | /* |
133 | * tj-max is is interesting because threshold is set relative to this |
134 | * temperature. |
135 | */ |
136 | static int get_tj_max(int cpu, u32 *tj_max) |
137 | { |
138 | u32 eax, edx, val; |
139 | int err; |
140 | |
141 | err = rdmsr_safe_on_cpu(cpu, MSR_IA32_TEMPERATURE_TARGET, &eax, &edx); |
142 | if (err) |
143 | return err; |
144 | |
145 | val = (eax >> 16) & 0xff; |
146 | *tj_max = val * 1000; |
147 | |
148 | return val ? 0 : -EINVAL; |
149 | } |
150 | |
151 | static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp) |
152 | { |
153 | struct pkg_device *pkgdev = tzd->devdata; |
154 | u32 eax, edx; |
155 | |
156 | rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_STATUS, &eax, &edx); |
157 | if (eax & 0x80000000) { |
158 | *temp = pkgdev->tj_max - ((eax >> 16) & 0x7f) * 1000; |
159 | pr_debug("sys_get_curr_temp %d\n" , *temp); |
160 | return 0; |
161 | } |
162 | return -EINVAL; |
163 | } |
164 | |
165 | static int sys_get_trip_temp(struct thermal_zone_device *tzd, |
166 | int trip, int *temp) |
167 | { |
168 | struct pkg_device *pkgdev = tzd->devdata; |
169 | unsigned long thres_reg_value; |
170 | u32 mask, shift, eax, edx; |
171 | int ret; |
172 | |
173 | if (trip >= MAX_NUMBER_OF_TRIPS) |
174 | return -EINVAL; |
175 | |
176 | if (trip) { |
177 | mask = THERM_MASK_THRESHOLD1; |
178 | shift = THERM_SHIFT_THRESHOLD1; |
179 | } else { |
180 | mask = THERM_MASK_THRESHOLD0; |
181 | shift = THERM_SHIFT_THRESHOLD0; |
182 | } |
183 | |
184 | ret = rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, |
185 | &eax, &edx); |
186 | if (ret < 0) |
187 | return ret; |
188 | |
189 | thres_reg_value = (eax & mask) >> shift; |
190 | if (thres_reg_value) |
191 | *temp = pkgdev->tj_max - thres_reg_value * 1000; |
192 | else |
193 | *temp = 0; |
194 | pr_debug("sys_get_trip_temp %d\n" , *temp); |
195 | |
196 | return 0; |
197 | } |
198 | |
199 | static int |
200 | sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp) |
201 | { |
202 | struct pkg_device *pkgdev = tzd->devdata; |
203 | u32 l, h, mask, shift, intr; |
204 | int ret; |
205 | |
206 | if (trip >= MAX_NUMBER_OF_TRIPS || temp >= pkgdev->tj_max) |
207 | return -EINVAL; |
208 | |
209 | ret = rdmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, |
210 | &l, &h); |
211 | if (ret < 0) |
212 | return ret; |
213 | |
214 | if (trip) { |
215 | mask = THERM_MASK_THRESHOLD1; |
216 | shift = THERM_SHIFT_THRESHOLD1; |
217 | intr = THERM_INT_THRESHOLD1_ENABLE; |
218 | } else { |
219 | mask = THERM_MASK_THRESHOLD0; |
220 | shift = THERM_SHIFT_THRESHOLD0; |
221 | intr = THERM_INT_THRESHOLD0_ENABLE; |
222 | } |
223 | l &= ~mask; |
224 | /* |
225 | * When users space sets a trip temperature == 0, which is indication |
226 | * that, it is no longer interested in receiving notifications. |
227 | */ |
228 | if (!temp) { |
229 | l &= ~intr; |
230 | } else { |
231 | l |= (pkgdev->tj_max - temp)/1000 << shift; |
232 | l |= intr; |
233 | } |
234 | |
235 | return wrmsr_on_cpu(pkgdev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); |
236 | } |
237 | |
238 | static int sys_get_trip_type(struct thermal_zone_device *thermal, int trip, |
239 | enum thermal_trip_type *type) |
240 | { |
241 | *type = THERMAL_TRIP_PASSIVE; |
242 | return 0; |
243 | } |
244 | |
245 | /* Thermal zone callback registry */ |
246 | static struct thermal_zone_device_ops tzone_ops = { |
247 | .get_temp = sys_get_curr_temp, |
248 | .get_trip_temp = sys_get_trip_temp, |
249 | .get_trip_type = sys_get_trip_type, |
250 | .set_trip_temp = sys_set_trip_temp, |
251 | }; |
252 | |
/*
 * Installed as platform_thermal_package_rate_control by the module
 * init function; unconditionally reports true.
 */
static bool pkg_thermal_rate_control(void)
{
	return true;
}
257 | |
258 | /* Enable threshold interrupt on local package/cpu */ |
259 | static inline void enable_pkg_thres_interrupt(void) |
260 | { |
261 | u8 thres_0, thres_1; |
262 | u32 l, h; |
263 | |
264 | rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); |
265 | /* only enable/disable if it had valid threshold value */ |
266 | thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0; |
267 | thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1; |
268 | if (thres_0) |
269 | l |= THERM_INT_THRESHOLD0_ENABLE; |
270 | if (thres_1) |
271 | l |= THERM_INT_THRESHOLD1_ENABLE; |
272 | wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); |
273 | } |
274 | |
275 | /* Disable threshold interrupt on local package/cpu */ |
276 | static inline void disable_pkg_thres_interrupt(void) |
277 | { |
278 | u32 l, h; |
279 | |
280 | rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); |
281 | |
282 | l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE); |
283 | wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); |
284 | } |
285 | |
286 | static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work) |
287 | { |
288 | struct thermal_zone_device *tzone = NULL; |
289 | int cpu = smp_processor_id(); |
290 | struct pkg_device *pkgdev; |
291 | u64 msr_val, wr_val; |
292 | |
293 | mutex_lock(&thermal_zone_mutex); |
294 | spin_lock_irq(&pkg_temp_lock); |
295 | ++pkg_work_cnt; |
296 | |
297 | pkgdev = pkg_temp_thermal_get_dev(cpu); |
298 | if (!pkgdev) { |
299 | spin_unlock_irq(&pkg_temp_lock); |
300 | mutex_unlock(&thermal_zone_mutex); |
301 | return; |
302 | } |
303 | pkgdev->work_scheduled = false; |
304 | |
305 | rdmsrl(MSR_IA32_PACKAGE_THERM_STATUS, msr_val); |
306 | wr_val = msr_val & ~(THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1); |
307 | if (wr_val != msr_val) { |
308 | wrmsrl(MSR_IA32_PACKAGE_THERM_STATUS, wr_val); |
309 | tzone = pkgdev->tzone; |
310 | } |
311 | |
312 | enable_pkg_thres_interrupt(); |
313 | spin_unlock_irq(&pkg_temp_lock); |
314 | |
315 | /* |
316 | * If tzone is not NULL, then thermal_zone_mutex will prevent the |
317 | * concurrent removal in the cpu offline callback. |
318 | */ |
319 | if (tzone) |
320 | thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED); |
321 | |
322 | mutex_unlock(&thermal_zone_mutex); |
323 | } |
324 | |
325 | static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work) |
326 | { |
327 | unsigned long ms = msecs_to_jiffies(notify_delay_ms); |
328 | |
329 | schedule_delayed_work_on(cpu, work, ms); |
330 | } |
331 | |
332 | static int pkg_thermal_notify(u64 msr_val) |
333 | { |
334 | int cpu = smp_processor_id(); |
335 | struct pkg_device *pkgdev; |
336 | unsigned long flags; |
337 | |
338 | spin_lock_irqsave(&pkg_temp_lock, flags); |
339 | ++pkg_interrupt_cnt; |
340 | |
341 | disable_pkg_thres_interrupt(); |
342 | |
343 | /* Work is per package, so scheduling it once is enough. */ |
344 | pkgdev = pkg_temp_thermal_get_dev(cpu); |
345 | if (pkgdev && !pkgdev->work_scheduled) { |
346 | pkgdev->work_scheduled = true; |
347 | pkg_thermal_schedule_work(pkgdev->cpu, &pkgdev->work); |
348 | } |
349 | |
350 | spin_unlock_irqrestore(&pkg_temp_lock, flags); |
351 | return 0; |
352 | } |
353 | |
354 | static int pkg_temp_thermal_device_add(unsigned int cpu) |
355 | { |
356 | int pkgid = topology_logical_package_id(cpu); |
357 | u32 tj_max, eax, ebx, ecx, edx; |
358 | struct pkg_device *pkgdev; |
359 | int thres_count, err; |
360 | |
361 | if (pkgid >= max_packages) |
362 | return -ENOMEM; |
363 | |
364 | cpuid(6, &eax, &ebx, &ecx, &edx); |
365 | thres_count = ebx & 0x07; |
366 | if (!thres_count) |
367 | return -ENODEV; |
368 | |
369 | thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS); |
370 | |
371 | err = get_tj_max(cpu, &tj_max); |
372 | if (err) |
373 | return err; |
374 | |
375 | pkgdev = kzalloc(sizeof(*pkgdev), GFP_KERNEL); |
376 | if (!pkgdev) |
377 | return -ENOMEM; |
378 | |
379 | INIT_DELAYED_WORK(&pkgdev->work, pkg_temp_thermal_threshold_work_fn); |
380 | pkgdev->cpu = cpu; |
381 | pkgdev->tj_max = tj_max; |
382 | pkgdev->tzone = thermal_zone_device_register("x86_pkg_temp" , |
383 | thres_count, |
384 | (thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01, |
385 | pkgdev, &tzone_ops, &pkg_temp_tz_params, 0, 0); |
386 | if (IS_ERR(pkgdev->tzone)) { |
387 | err = PTR_ERR(pkgdev->tzone); |
388 | kfree(pkgdev); |
389 | return err; |
390 | } |
391 | /* Store MSR value for package thermal interrupt, to restore at exit */ |
392 | rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, pkgdev->msr_pkg_therm_low, |
393 | pkgdev->msr_pkg_therm_high); |
394 | |
395 | cpumask_set_cpu(cpu, &pkgdev->cpumask); |
396 | spin_lock_irq(&pkg_temp_lock); |
397 | packages[pkgid] = pkgdev; |
398 | spin_unlock_irq(&pkg_temp_lock); |
399 | return 0; |
400 | } |
401 | |
402 | static int pkg_thermal_cpu_offline(unsigned int cpu) |
403 | { |
404 | struct pkg_device *pkgdev = pkg_temp_thermal_get_dev(cpu); |
405 | bool lastcpu, was_target; |
406 | int target; |
407 | |
408 | if (!pkgdev) |
409 | return 0; |
410 | |
411 | target = cpumask_any_but(&pkgdev->cpumask, cpu); |
412 | cpumask_clear_cpu(cpu, &pkgdev->cpumask); |
413 | lastcpu = target >= nr_cpu_ids; |
414 | /* |
415 | * Remove the sysfs files, if this is the last cpu in the package |
416 | * before doing further cleanups. |
417 | */ |
418 | if (lastcpu) { |
419 | struct thermal_zone_device *tzone = pkgdev->tzone; |
420 | |
421 | /* |
422 | * We must protect against a work function calling |
423 | * thermal_zone_update, after/while unregister. We null out |
424 | * the pointer under the zone mutex, so the worker function |
425 | * won't try to call. |
426 | */ |
427 | mutex_lock(&thermal_zone_mutex); |
428 | pkgdev->tzone = NULL; |
429 | mutex_unlock(&thermal_zone_mutex); |
430 | |
431 | thermal_zone_device_unregister(tzone); |
432 | } |
433 | |
434 | /* Protect against work and interrupts */ |
435 | spin_lock_irq(&pkg_temp_lock); |
436 | |
437 | /* |
438 | * Check whether this cpu was the current target and store the new |
439 | * one. When we drop the lock, then the interrupt notify function |
440 | * will see the new target. |
441 | */ |
442 | was_target = pkgdev->cpu == cpu; |
443 | pkgdev->cpu = target; |
444 | |
445 | /* |
446 | * If this is the last CPU in the package remove the package |
447 | * reference from the array and restore the interrupt MSR. When we |
448 | * drop the lock neither the interrupt notify function nor the |
449 | * worker will see the package anymore. |
450 | */ |
451 | if (lastcpu) { |
452 | packages[topology_logical_package_id(cpu)] = NULL; |
453 | /* After this point nothing touches the MSR anymore. */ |
454 | wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, |
455 | pkgdev->msr_pkg_therm_low, pkgdev->msr_pkg_therm_high); |
456 | } |
457 | |
458 | /* |
459 | * Check whether there is work scheduled and whether the work is |
460 | * targeted at the outgoing CPU. |
461 | */ |
462 | if (pkgdev->work_scheduled && was_target) { |
463 | /* |
464 | * To cancel the work we need to drop the lock, otherwise |
465 | * we might deadlock if the work needs to be flushed. |
466 | */ |
467 | spin_unlock_irq(&pkg_temp_lock); |
468 | cancel_delayed_work_sync(&pkgdev->work); |
469 | spin_lock_irq(&pkg_temp_lock); |
470 | /* |
471 | * If this is not the last cpu in the package and the work |
472 | * did not run after we dropped the lock above, then we |
473 | * need to reschedule the work, otherwise the interrupt |
474 | * stays disabled forever. |
475 | */ |
476 | if (!lastcpu && pkgdev->work_scheduled) |
477 | pkg_thermal_schedule_work(target, &pkgdev->work); |
478 | } |
479 | |
480 | spin_unlock_irq(&pkg_temp_lock); |
481 | |
482 | /* Final cleanup if this is the last cpu */ |
483 | if (lastcpu) |
484 | kfree(pkgdev); |
485 | return 0; |
486 | } |
487 | |
488 | static int pkg_thermal_cpu_online(unsigned int cpu) |
489 | { |
490 | struct pkg_device *pkgdev = pkg_temp_thermal_get_dev(cpu); |
491 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
492 | |
493 | /* Paranoia check */ |
494 | if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS)) |
495 | return -ENODEV; |
496 | |
497 | /* If the package exists, nothing to do */ |
498 | if (pkgdev) { |
499 | cpumask_set_cpu(cpu, &pkgdev->cpumask); |
500 | return 0; |
501 | } |
502 | return pkg_temp_thermal_device_add(cpu); |
503 | } |
504 | |
505 | static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = { |
506 | { X86_VENDOR_INTEL, X86_FAMILY_ANY, X86_MODEL_ANY, X86_FEATURE_PTS }, |
507 | {} |
508 | }; |
509 | MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids); |
510 | |
511 | static int __init pkg_temp_thermal_init(void) |
512 | { |
513 | int ret; |
514 | |
515 | if (!x86_match_cpu(pkg_temp_thermal_ids)) |
516 | return -ENODEV; |
517 | |
518 | max_packages = topology_max_packages(); |
519 | packages = kcalloc(max_packages, sizeof(struct pkg_device *), |
520 | GFP_KERNEL); |
521 | if (!packages) |
522 | return -ENOMEM; |
523 | |
524 | ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "thermal/x86_pkg:online" , |
525 | pkg_thermal_cpu_online, pkg_thermal_cpu_offline); |
526 | if (ret < 0) |
527 | goto err; |
528 | |
529 | /* Store the state for module exit */ |
530 | pkg_thermal_hp_state = ret; |
531 | |
532 | platform_thermal_package_notify = pkg_thermal_notify; |
533 | platform_thermal_package_rate_control = pkg_thermal_rate_control; |
534 | |
535 | /* Don't care if it fails */ |
536 | pkg_temp_debugfs_init(); |
537 | return 0; |
538 | |
539 | err: |
540 | kfree(packages); |
541 | return ret; |
542 | } |
543 | module_init(pkg_temp_thermal_init) |
544 | |
545 | static void __exit pkg_temp_thermal_exit(void) |
546 | { |
547 | platform_thermal_package_notify = NULL; |
548 | platform_thermal_package_rate_control = NULL; |
549 | |
550 | cpuhp_remove_state(pkg_thermal_hp_state); |
551 | debugfs_remove_recursive(debugfs); |
552 | kfree(packages); |
553 | } |
554 | module_exit(pkg_temp_thermal_exit) |
555 | |
556 | MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver" ); |
557 | MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>" ); |
558 | MODULE_LICENSE("GPL v2" ); |
559 | |