1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * x86_pkg_temp_thermal driver |
4 | * Copyright (c) 2013, Intel Corporation. |
5 | */ |
6 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
7 | |
8 | #include <linux/module.h> |
9 | #include <linux/init.h> |
10 | #include <linux/intel_tcc.h> |
11 | #include <linux/err.h> |
12 | #include <linux/param.h> |
13 | #include <linux/device.h> |
14 | #include <linux/platform_device.h> |
15 | #include <linux/cpu.h> |
16 | #include <linux/smp.h> |
17 | #include <linux/slab.h> |
18 | #include <linux/pm.h> |
19 | #include <linux/thermal.h> |
20 | #include <linux/debugfs.h> |
21 | |
22 | #include <asm/cpu_device_id.h> |
23 | |
24 | #include "thermal_interrupt.h" |
25 | |
/*
 * Rate control delay: the idea is to introduce a debounce effect.
 * The delay should be long enough to reduce the number of events when
 * a threshold is set to a temperature that is constantly violated, but
 * short enough that action can still be taken in time. The action can
 * be to remove the threshold or to change it to the next interesting
 * setting. Based on experiments, a delay of around 5 seconds under
 * load gives a significant temperature change.
 */
36 | #define PKG_TEMP_THERMAL_NOTIFY_DELAY 5000 |
37 | static int notify_delay_ms = PKG_TEMP_THERMAL_NOTIFY_DELAY; |
38 | module_param(notify_delay_ms, int, 0644); |
39 | MODULE_PARM_DESC(notify_delay_ms, |
40 | "User space notification delay in milli seconds." ); |
41 | |
/*
 * Upper bound on the number of trip points in a thermal zone. The MSR
 * only allows setting and receiving notifications for two thresholds,
 * so this limit is enforced even if CPUID reports a wrong value for the
 * number of thresholds.
 */
#define MAX_NUMBER_OF_TRIPS	2
48 | |
49 | struct zone_device { |
50 | int cpu; |
51 | bool work_scheduled; |
52 | u32 msr_pkg_therm_low; |
53 | u32 msr_pkg_therm_high; |
54 | struct delayed_work work; |
55 | struct thermal_zone_device *tzone; |
56 | struct thermal_trip *trips; |
57 | struct cpumask cpumask; |
58 | }; |
59 | |
60 | static struct thermal_zone_params pkg_temp_tz_params = { |
61 | .no_hwmon = true, |
62 | }; |
63 | |
64 | /* Keep track of how many zone pointers we allocated in init() */ |
65 | static int max_id __read_mostly; |
66 | /* Array of zone pointers */ |
67 | static struct zone_device **zones; |
68 | /* Serializes interrupt notification, work and hotplug */ |
69 | static DEFINE_RAW_SPINLOCK(pkg_temp_lock); |
70 | /* Protects zone operation in the work function against hotplug removal */ |
71 | static DEFINE_MUTEX(thermal_zone_mutex); |
72 | |
73 | /* The dynamically assigned cpu hotplug state for module_exit() */ |
74 | static enum cpuhp_state pkg_thermal_hp_state __read_mostly; |
75 | |
76 | /* Debug counters to show using debugfs */ |
77 | static struct dentry *debugfs; |
78 | static unsigned int pkg_interrupt_cnt; |
79 | static unsigned int pkg_work_cnt; |
80 | |
81 | static void pkg_temp_debugfs_init(void) |
82 | { |
83 | debugfs = debugfs_create_dir(name: "pkg_temp_thermal" , NULL); |
84 | |
85 | debugfs_create_u32(name: "pkg_thres_interrupt" , S_IRUGO, parent: debugfs, |
86 | value: &pkg_interrupt_cnt); |
87 | debugfs_create_u32(name: "pkg_thres_work" , S_IRUGO, parent: debugfs, |
88 | value: &pkg_work_cnt); |
89 | } |
90 | |
91 | /* |
92 | * Protection: |
93 | * |
94 | * - cpu hotplug: Read serialized by cpu hotplug lock |
95 | * Write must hold pkg_temp_lock |
96 | * |
97 | * - Other callsites: Must hold pkg_temp_lock |
98 | */ |
99 | static struct zone_device *pkg_temp_thermal_get_dev(unsigned int cpu) |
100 | { |
101 | int id = topology_logical_die_id(cpu); |
102 | |
103 | if (id >= 0 && id < max_id) |
104 | return zones[id]; |
105 | return NULL; |
106 | } |
107 | |
108 | static int sys_get_curr_temp(struct thermal_zone_device *tzd, int *temp) |
109 | { |
110 | struct zone_device *zonedev = thermal_zone_device_priv(tzd); |
111 | int val; |
112 | |
113 | val = intel_tcc_get_temp(cpu: zonedev->cpu, pkg: true); |
114 | if (val < 0) |
115 | return val; |
116 | |
117 | *temp = val * 1000; |
118 | pr_debug("sys_get_curr_temp %d\n" , *temp); |
119 | return 0; |
120 | } |
121 | |
122 | static int |
123 | sys_set_trip_temp(struct thermal_zone_device *tzd, int trip, int temp) |
124 | { |
125 | struct zone_device *zonedev = thermal_zone_device_priv(tzd); |
126 | u32 l, h, mask, shift, intr; |
127 | int tj_max, val, ret; |
128 | |
129 | tj_max = intel_tcc_get_tjmax(cpu: zonedev->cpu); |
130 | if (tj_max < 0) |
131 | return tj_max; |
132 | tj_max *= 1000; |
133 | |
134 | val = (tj_max - temp)/1000; |
135 | |
136 | if (trip >= MAX_NUMBER_OF_TRIPS || val < 0 || val > 0x7f) |
137 | return -EINVAL; |
138 | |
139 | ret = rdmsr_on_cpu(cpu: zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, |
140 | l: &l, h: &h); |
141 | if (ret < 0) |
142 | return ret; |
143 | |
144 | if (trip) { |
145 | mask = THERM_MASK_THRESHOLD1; |
146 | shift = THERM_SHIFT_THRESHOLD1; |
147 | intr = THERM_INT_THRESHOLD1_ENABLE; |
148 | } else { |
149 | mask = THERM_MASK_THRESHOLD0; |
150 | shift = THERM_SHIFT_THRESHOLD0; |
151 | intr = THERM_INT_THRESHOLD0_ENABLE; |
152 | } |
153 | l &= ~mask; |
154 | /* |
155 | * When users space sets a trip temperature == 0, which is indication |
156 | * that, it is no longer interested in receiving notifications. |
157 | */ |
158 | if (!temp) { |
159 | l &= ~intr; |
160 | } else { |
161 | l |= val << shift; |
162 | l |= intr; |
163 | } |
164 | |
165 | return wrmsr_on_cpu(cpu: zonedev->cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, |
166 | l, h); |
167 | } |
168 | |
169 | /* Thermal zone callback registry */ |
170 | static struct thermal_zone_device_ops tzone_ops = { |
171 | .get_temp = sys_get_curr_temp, |
172 | .set_trip_temp = sys_set_trip_temp, |
173 | }; |
174 | |
/*
 * Installed as platform_thermal_package_rate_control by the module init
 * function; unconditionally reports true.
 */
static bool pkg_thermal_rate_control(void)
{
	return true;
}
179 | |
180 | /* Enable threshold interrupt on local package/cpu */ |
181 | static inline void enable_pkg_thres_interrupt(void) |
182 | { |
183 | u8 thres_0, thres_1; |
184 | u32 l, h; |
185 | |
186 | rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); |
187 | /* only enable/disable if it had valid threshold value */ |
188 | thres_0 = (l & THERM_MASK_THRESHOLD0) >> THERM_SHIFT_THRESHOLD0; |
189 | thres_1 = (l & THERM_MASK_THRESHOLD1) >> THERM_SHIFT_THRESHOLD1; |
190 | if (thres_0) |
191 | l |= THERM_INT_THRESHOLD0_ENABLE; |
192 | if (thres_1) |
193 | l |= THERM_INT_THRESHOLD1_ENABLE; |
194 | wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); |
195 | } |
196 | |
197 | /* Disable threshold interrupt on local package/cpu */ |
198 | static inline void disable_pkg_thres_interrupt(void) |
199 | { |
200 | u32 l, h; |
201 | |
202 | rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); |
203 | |
204 | l &= ~(THERM_INT_THRESHOLD0_ENABLE | THERM_INT_THRESHOLD1_ENABLE); |
205 | wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); |
206 | } |
207 | |
208 | static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work) |
209 | { |
210 | struct thermal_zone_device *tzone = NULL; |
211 | int cpu = smp_processor_id(); |
212 | struct zone_device *zonedev; |
213 | |
214 | mutex_lock(&thermal_zone_mutex); |
215 | raw_spin_lock_irq(&pkg_temp_lock); |
216 | ++pkg_work_cnt; |
217 | |
218 | zonedev = pkg_temp_thermal_get_dev(cpu); |
219 | if (!zonedev) { |
220 | raw_spin_unlock_irq(&pkg_temp_lock); |
221 | mutex_unlock(lock: &thermal_zone_mutex); |
222 | return; |
223 | } |
224 | zonedev->work_scheduled = false; |
225 | |
226 | thermal_clear_package_intr_status(PACKAGE_LEVEL, THERM_LOG_THRESHOLD0 | THERM_LOG_THRESHOLD1); |
227 | tzone = zonedev->tzone; |
228 | |
229 | enable_pkg_thres_interrupt(); |
230 | raw_spin_unlock_irq(&pkg_temp_lock); |
231 | |
232 | /* |
233 | * If tzone is not NULL, then thermal_zone_mutex will prevent the |
234 | * concurrent removal in the cpu offline callback. |
235 | */ |
236 | if (tzone) |
237 | thermal_zone_device_update(tzone, THERMAL_EVENT_UNSPECIFIED); |
238 | |
239 | mutex_unlock(lock: &thermal_zone_mutex); |
240 | } |
241 | |
242 | static void pkg_thermal_schedule_work(int cpu, struct delayed_work *work) |
243 | { |
244 | unsigned long ms = msecs_to_jiffies(m: notify_delay_ms); |
245 | |
246 | schedule_delayed_work_on(cpu, dwork: work, delay: ms); |
247 | } |
248 | |
249 | static int pkg_thermal_notify(u64 msr_val) |
250 | { |
251 | int cpu = smp_processor_id(); |
252 | struct zone_device *zonedev; |
253 | unsigned long flags; |
254 | |
255 | raw_spin_lock_irqsave(&pkg_temp_lock, flags); |
256 | ++pkg_interrupt_cnt; |
257 | |
258 | disable_pkg_thres_interrupt(); |
259 | |
260 | /* Work is per package, so scheduling it once is enough. */ |
261 | zonedev = pkg_temp_thermal_get_dev(cpu); |
262 | if (zonedev && !zonedev->work_scheduled) { |
263 | zonedev->work_scheduled = true; |
264 | pkg_thermal_schedule_work(cpu: zonedev->cpu, work: &zonedev->work); |
265 | } |
266 | |
267 | raw_spin_unlock_irqrestore(&pkg_temp_lock, flags); |
268 | return 0; |
269 | } |
270 | |
271 | static struct thermal_trip *pkg_temp_thermal_trips_init(int cpu, int tj_max, int num_trips) |
272 | { |
273 | struct thermal_trip *trips; |
274 | unsigned long thres_reg_value; |
275 | u32 mask, shift, eax, edx; |
276 | int ret, i; |
277 | |
278 | trips = kzalloc(size: sizeof(*trips) * num_trips, GFP_KERNEL); |
279 | if (!trips) |
280 | return ERR_PTR(error: -ENOMEM); |
281 | |
282 | for (i = 0; i < num_trips; i++) { |
283 | |
284 | if (i) { |
285 | mask = THERM_MASK_THRESHOLD1; |
286 | shift = THERM_SHIFT_THRESHOLD1; |
287 | } else { |
288 | mask = THERM_MASK_THRESHOLD0; |
289 | shift = THERM_SHIFT_THRESHOLD0; |
290 | } |
291 | |
292 | ret = rdmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, |
293 | l: &eax, h: &edx); |
294 | if (ret < 0) { |
295 | kfree(objp: trips); |
296 | return ERR_PTR(error: ret); |
297 | } |
298 | |
299 | thres_reg_value = (eax & mask) >> shift; |
300 | |
301 | trips[i].temperature = thres_reg_value ? |
302 | tj_max - thres_reg_value * 1000 : THERMAL_TEMP_INVALID; |
303 | |
304 | trips[i].type = THERMAL_TRIP_PASSIVE; |
305 | |
306 | pr_debug("%s: cpu=%d, trip=%d, temp=%d\n" , |
307 | __func__, cpu, i, trips[i].temperature); |
308 | } |
309 | |
310 | return trips; |
311 | } |
312 | |
313 | static int pkg_temp_thermal_device_add(unsigned int cpu) |
314 | { |
315 | int id = topology_logical_die_id(cpu); |
316 | u32 eax, ebx, ecx, edx; |
317 | struct zone_device *zonedev; |
318 | int thres_count, err; |
319 | int tj_max; |
320 | |
321 | if (id >= max_id) |
322 | return -ENOMEM; |
323 | |
324 | cpuid(op: 6, eax: &eax, ebx: &ebx, ecx: &ecx, edx: &edx); |
325 | thres_count = ebx & 0x07; |
326 | if (!thres_count) |
327 | return -ENODEV; |
328 | |
329 | thres_count = clamp_val(thres_count, 0, MAX_NUMBER_OF_TRIPS); |
330 | |
331 | tj_max = intel_tcc_get_tjmax(cpu); |
332 | if (tj_max < 0) |
333 | return tj_max; |
334 | |
335 | zonedev = kzalloc(size: sizeof(*zonedev), GFP_KERNEL); |
336 | if (!zonedev) |
337 | return -ENOMEM; |
338 | |
339 | zonedev->trips = pkg_temp_thermal_trips_init(cpu, tj_max, num_trips: thres_count); |
340 | if (IS_ERR(ptr: zonedev->trips)) { |
341 | err = PTR_ERR(ptr: zonedev->trips); |
342 | goto out_kfree_zonedev; |
343 | } |
344 | |
345 | INIT_DELAYED_WORK(&zonedev->work, pkg_temp_thermal_threshold_work_fn); |
346 | zonedev->cpu = cpu; |
347 | zonedev->tzone = thermal_zone_device_register_with_trips(type: "x86_pkg_temp" , |
348 | trips: zonedev->trips, num_trips: thres_count, |
349 | mask: (thres_count == MAX_NUMBER_OF_TRIPS) ? 0x03 : 0x01, |
350 | devdata: zonedev, ops: &tzone_ops, tzp: &pkg_temp_tz_params, passive_delay: 0, polling_delay: 0); |
351 | if (IS_ERR(ptr: zonedev->tzone)) { |
352 | err = PTR_ERR(ptr: zonedev->tzone); |
353 | goto out_kfree_trips; |
354 | } |
355 | err = thermal_zone_device_enable(tz: zonedev->tzone); |
356 | if (err) |
357 | goto out_unregister_tz; |
358 | |
359 | /* Store MSR value for package thermal interrupt, to restore at exit */ |
360 | rdmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, zonedev->msr_pkg_therm_low, |
361 | zonedev->msr_pkg_therm_high); |
362 | |
363 | cpumask_set_cpu(cpu, dstp: &zonedev->cpumask); |
364 | raw_spin_lock_irq(&pkg_temp_lock); |
365 | zones[id] = zonedev; |
366 | raw_spin_unlock_irq(&pkg_temp_lock); |
367 | |
368 | return 0; |
369 | |
370 | out_unregister_tz: |
371 | thermal_zone_device_unregister(tz: zonedev->tzone); |
372 | out_kfree_trips: |
373 | kfree(objp: zonedev->trips); |
374 | out_kfree_zonedev: |
375 | kfree(objp: zonedev); |
376 | return err; |
377 | } |
378 | |
379 | static int pkg_thermal_cpu_offline(unsigned int cpu) |
380 | { |
381 | struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu); |
382 | bool lastcpu, was_target; |
383 | int target; |
384 | |
385 | if (!zonedev) |
386 | return 0; |
387 | |
388 | target = cpumask_any_but(mask: &zonedev->cpumask, cpu); |
389 | cpumask_clear_cpu(cpu, dstp: &zonedev->cpumask); |
390 | lastcpu = target >= nr_cpu_ids; |
391 | /* |
392 | * Remove the sysfs files, if this is the last cpu in the package |
393 | * before doing further cleanups. |
394 | */ |
395 | if (lastcpu) { |
396 | struct thermal_zone_device *tzone = zonedev->tzone; |
397 | |
398 | /* |
399 | * We must protect against a work function calling |
400 | * thermal_zone_update, after/while unregister. We null out |
401 | * the pointer under the zone mutex, so the worker function |
402 | * won't try to call. |
403 | */ |
404 | mutex_lock(&thermal_zone_mutex); |
405 | zonedev->tzone = NULL; |
406 | mutex_unlock(lock: &thermal_zone_mutex); |
407 | |
408 | thermal_zone_device_unregister(tz: tzone); |
409 | } |
410 | |
411 | /* Protect against work and interrupts */ |
412 | raw_spin_lock_irq(&pkg_temp_lock); |
413 | |
414 | /* |
415 | * Check whether this cpu was the current target and store the new |
416 | * one. When we drop the lock, then the interrupt notify function |
417 | * will see the new target. |
418 | */ |
419 | was_target = zonedev->cpu == cpu; |
420 | zonedev->cpu = target; |
421 | |
422 | /* |
423 | * If this is the last CPU in the package remove the package |
424 | * reference from the array and restore the interrupt MSR. When we |
425 | * drop the lock neither the interrupt notify function nor the |
426 | * worker will see the package anymore. |
427 | */ |
428 | if (lastcpu) { |
429 | zones[topology_logical_die_id(cpu)] = NULL; |
430 | /* After this point nothing touches the MSR anymore. */ |
431 | wrmsr(MSR_IA32_PACKAGE_THERM_INTERRUPT, |
432 | zonedev->msr_pkg_therm_low, zonedev->msr_pkg_therm_high); |
433 | } |
434 | |
435 | /* |
436 | * Check whether there is work scheduled and whether the work is |
437 | * targeted at the outgoing CPU. |
438 | */ |
439 | if (zonedev->work_scheduled && was_target) { |
440 | /* |
441 | * To cancel the work we need to drop the lock, otherwise |
442 | * we might deadlock if the work needs to be flushed. |
443 | */ |
444 | raw_spin_unlock_irq(&pkg_temp_lock); |
445 | cancel_delayed_work_sync(dwork: &zonedev->work); |
446 | raw_spin_lock_irq(&pkg_temp_lock); |
447 | /* |
448 | * If this is not the last cpu in the package and the work |
449 | * did not run after we dropped the lock above, then we |
450 | * need to reschedule the work, otherwise the interrupt |
451 | * stays disabled forever. |
452 | */ |
453 | if (!lastcpu && zonedev->work_scheduled) |
454 | pkg_thermal_schedule_work(cpu: target, work: &zonedev->work); |
455 | } |
456 | |
457 | raw_spin_unlock_irq(&pkg_temp_lock); |
458 | |
459 | /* Final cleanup if this is the last cpu */ |
460 | if (lastcpu) { |
461 | kfree(objp: zonedev->trips); |
462 | kfree(objp: zonedev); |
463 | } |
464 | return 0; |
465 | } |
466 | |
467 | static int pkg_thermal_cpu_online(unsigned int cpu) |
468 | { |
469 | struct zone_device *zonedev = pkg_temp_thermal_get_dev(cpu); |
470 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
471 | |
472 | /* Paranoia check */ |
473 | if (!cpu_has(c, X86_FEATURE_DTHERM) || !cpu_has(c, X86_FEATURE_PTS)) |
474 | return -ENODEV; |
475 | |
476 | /* If the package exists, nothing to do */ |
477 | if (zonedev) { |
478 | cpumask_set_cpu(cpu, dstp: &zonedev->cpumask); |
479 | return 0; |
480 | } |
481 | return pkg_temp_thermal_device_add(cpu); |
482 | } |
483 | |
484 | static const struct x86_cpu_id __initconst pkg_temp_thermal_ids[] = { |
485 | X86_MATCH_VENDOR_FEATURE(INTEL, X86_FEATURE_PTS, NULL), |
486 | {} |
487 | }; |
488 | MODULE_DEVICE_TABLE(x86cpu, pkg_temp_thermal_ids); |
489 | |
490 | static int __init pkg_temp_thermal_init(void) |
491 | { |
492 | int ret; |
493 | |
494 | if (!x86_match_cpu(match: pkg_temp_thermal_ids)) |
495 | return -ENODEV; |
496 | |
497 | max_id = topology_max_packages() * topology_max_die_per_package(); |
498 | zones = kcalloc(n: max_id, size: sizeof(struct zone_device *), |
499 | GFP_KERNEL); |
500 | if (!zones) |
501 | return -ENOMEM; |
502 | |
503 | ret = cpuhp_setup_state(state: CPUHP_AP_ONLINE_DYN, name: "thermal/x86_pkg:online" , |
504 | startup: pkg_thermal_cpu_online, teardown: pkg_thermal_cpu_offline); |
505 | if (ret < 0) |
506 | goto err; |
507 | |
508 | /* Store the state for module exit */ |
509 | pkg_thermal_hp_state = ret; |
510 | |
511 | platform_thermal_package_notify = pkg_thermal_notify; |
512 | platform_thermal_package_rate_control = pkg_thermal_rate_control; |
513 | |
514 | /* Don't care if it fails */ |
515 | pkg_temp_debugfs_init(); |
516 | return 0; |
517 | |
518 | err: |
519 | kfree(objp: zones); |
520 | return ret; |
521 | } |
522 | module_init(pkg_temp_thermal_init) |
523 | |
524 | static void __exit pkg_temp_thermal_exit(void) |
525 | { |
526 | platform_thermal_package_notify = NULL; |
527 | platform_thermal_package_rate_control = NULL; |
528 | |
529 | cpuhp_remove_state(state: pkg_thermal_hp_state); |
530 | debugfs_remove_recursive(dentry: debugfs); |
531 | kfree(objp: zones); |
532 | } |
533 | module_exit(pkg_temp_thermal_exit) |
534 | |
535 | MODULE_IMPORT_NS(INTEL_TCC); |
536 | MODULE_DESCRIPTION("X86 PKG TEMP Thermal Driver" ); |
537 | MODULE_AUTHOR("Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>" ); |
538 | MODULE_LICENSE("GPL v2" ); |
539 | |