// SPDX-License-Identifier: GPL-2.0-only
/*
 * Support Intel/AMD RAPL energy consumption counters
 * Copyright (C) 2013 Google, Inc., Stephane Eranian
 *
 * Intel RAPL interface is specified in the IA-32 Manual Vol3b
 * section 14.7.1 (September 2013)
 *
 * AMD RAPL interface for Fam17h is described in the public PPR:
 * https://bugzilla.kernel.org/show_bug.cgi?id=206537
 *
 * RAPL provides more controls than just reporting energy consumption;
 * however, here we only expose the free running energy consumption
 * counters (pp0, pkg, dram, gpu, psys).
 *
 * Each of those counters increments in a power unit defined by the
 * RAPL_POWER_UNIT MSR. On SandyBridge, this unit is 1/(2^16) Joules
 * but it can vary.
 *
 * Counter to rapl events mappings:
 *
 *  pp0 counter: consumption of all physical cores (power plane 0)
 *        event: rapl_energy_cores
 *    perf code: 0x1
 *
 *  pkg counter: consumption of the whole processor package
 *        event: rapl_energy_pkg
 *    perf code: 0x2
 *
 * dram counter: consumption of the dram domain (servers only)
 *        event: rapl_energy_dram
 *    perf code: 0x3
 *
 *  gpu counter: consumption of the builtin-gpu domain (client only)
 *        event: rapl_energy_gpu
 *    perf code: 0x4
 *
 * psys counter: consumption of the builtin-psys domain (client only)
 *        event: rapl_energy_psys
 *    perf code: 0x5
 *
 * We manage those counters as free running (read-only). They may be
 * used simultaneously by other tools, such as turbostat.
 *
 * The events only support system-wide mode counting. There is no
 * sampling support because it does not make sense and is not
 * supported by the RAPL hardware.
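 *
 * For example, once this PMU is registered as "power" (see
 * rapl_pmu_init() below), the package counter can be read system-wide
 * with:
 *
 *   perf stat -a -e power/energy-pkg/ -- sleep 1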
 *
 * Because we want to avoid floating-point operations in the kernel,
 * the events are all reported in fixed point arithmetic (32.32).
 * Tools must convert the counts to Joules, e.g. with
 * ldexp(raw_count, -32), and then divide by the duration of the
 * measurement to obtain Watts.
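 *
 * Worked example with made-up numbers: a raw fixed-point count of
 * 0x280000000 is ldexp(0x280000000, -32) = 2.5 Joules; over a 1 s
 * measurement window that is 2.5 Watts.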
 */

#define pr_fmt(fmt) "RAPL PMU: " fmt

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/nospec.h>
#include <asm/cpu_device_id.h>
#include <asm/intel-family.h>
#include "perf_event.h"
#include "probe.h"

MODULE_LICENSE("GPL");

/*
 * RAPL energy status counters
 */
enum perf_rapl_events {
        PERF_RAPL_PP0 = 0,              /* all cores */
        PERF_RAPL_PKG,                  /* entire package */
        PERF_RAPL_RAM,                  /* DRAM */
        PERF_RAPL_PP1,                  /* gpu */
        PERF_RAPL_PSYS,                 /* psys */

        PERF_RAPL_MAX,
        NR_RAPL_DOMAINS = PERF_RAPL_MAX,
};

static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = {
        "pp0-core",
        "package",
        "dram",
        "pp1-gpu",
        "psys",
};

/*
 * event code: LSB 8 bits, passed in attr->config
 * any other bit is reserved
 */
#define RAPL_EVENT_MASK 0xFFULL
#define RAPL_CNTR_WIDTH 32

#define RAPL_EVENT_ATTR_STR(_name, v, str)                              \
static struct perf_pmu_events_attr event_attr_##v = {                  \
        .attr           = __ATTR(_name, 0444, perf_event_sysfs_show, NULL), \
        .id             = 0,                                            \
        .event_str      = str,                                          \
};
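
/*
 * Each attribute defined with RAPL_EVENT_ATTR_STR() appears under
 * /sys/bus/event_source/devices/power/events/ once the PMU is
 * registered; e.g. reading "energy-pkg" there yields "event=0x02".
 */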

struct rapl_pmu {
        raw_spinlock_t          lock;
        int                     n_active;
        int                     cpu;
        struct list_head        active_list;
        struct pmu              *pmu;
        ktime_t                 timer_interval;
        struct hrtimer          hrtimer;
};

struct rapl_pmus {
        struct pmu              pmu;
        unsigned int            maxdie;
        struct rapl_pmu         *pmus[] __counted_by(maxdie);
};

enum rapl_unit_quirk {
        RAPL_UNIT_QUIRK_NONE,
        RAPL_UNIT_QUIRK_INTEL_HSW,
        RAPL_UNIT_QUIRK_INTEL_SPR,
};

struct rapl_model {
        struct perf_msr *rapl_msrs;
        unsigned long   events;
        unsigned int    msr_power_unit;
        enum rapl_unit_quirk    unit_quirk;
};

/* 1/2^hw_unit Joule */
static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly;
static struct rapl_pmus *rapl_pmus;
static cpumask_t rapl_cpu_mask;
static unsigned int rapl_cntr_mask;
static u64 rapl_timer_ms;
static struct perf_msr *rapl_msrs;

static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu)
{
        unsigned int dieid = topology_logical_die_id(cpu);

        /*
         * The unsigned check also catches the '-1' return value for non
         * existent mappings in the topology map.
         */
        return dieid < rapl_pmus->maxdie ? rapl_pmus->pmus[dieid] : NULL;
}

static inline u64 rapl_read_counter(struct perf_event *event)
{
        u64 raw;

        rdmsrl(event->hw.event_base, raw);
        return raw;
}

static inline u64 rapl_scale(u64 v, int cfg)
{
        if (cfg > NR_RAPL_DOMAINS) {
                pr_warn("Invalid domain %d, failed to scale data\n", cfg);
                return v;
        }
        /*
         * scale delta to smallest unit (1/2^32)
         * users must then scale back: count * 1/(1e9*2^32) to get Joules
         * or use ldexp(count, -32).
         * Watts = Joules/Time delta
         */
        return v << (32 - rapl_hw_unit[cfg - 1]);
}
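
/*
 * Worked example: with the SandyBridge default unit of 2^-16 J,
 * rapl_scale() shifts a delta of one hardware tick left by
 * 32 - 16 = 16 bits, reporting it as 65536 units of 2^-32 J, which
 * ldexp(65536, -32) turns back into 2^-16 J.
 */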

static u64 rapl_event_update(struct perf_event *event)
{
        struct hw_perf_event *hwc = &event->hw;
        u64 prev_raw_count, new_raw_count;
        s64 delta, sdelta;
        int shift = RAPL_CNTR_WIDTH;

        prev_raw_count = local64_read(&hwc->prev_count);
        do {
                rdmsrl(event->hw.event_base, new_raw_count);
        } while (!local64_try_cmpxchg(&hwc->prev_count,
                                      &prev_raw_count, new_raw_count));

        /*
         * Now we have the new raw value and have updated the prev
         * timestamp already. We can now calculate the elapsed delta
         * (event-)time and add that to the generic event.
         *
         * Careful, not all hw sign-extends above the physical width
         * of the count.
         */
        delta = (new_raw_count << shift) - (prev_raw_count << shift);
        delta >>= shift;

        sdelta = rapl_scale(delta, event->hw.config);

        local64_add(sdelta, &event->count);

        return new_raw_count;
}
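
/*
 * The shift dance above handles 32-bit wraparound: e.g. with
 * prev_raw_count == 0xffffffff and new_raw_count == 0x1, shifting both
 * up by RAPL_CNTR_WIDTH, subtracting and arithmetic-shifting back down
 * yields delta == 2, the number of ticks actually elapsed.
 */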

static void rapl_start_hrtimer(struct rapl_pmu *pmu)
{
        hrtimer_start(&pmu->hrtimer, pmu->timer_interval,
                      HRTIMER_MODE_REL_PINNED);
}

static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer)
{
        struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer);
        struct perf_event *event;
        unsigned long flags;

        if (!pmu->n_active)
                return HRTIMER_NORESTART;

        raw_spin_lock_irqsave(&pmu->lock, flags);

        list_for_each_entry(event, &pmu->active_list, active_entry)
                rapl_event_update(event);

        raw_spin_unlock_irqrestore(&pmu->lock, flags);

        hrtimer_forward_now(hrtimer, pmu->timer_interval);

        return HRTIMER_RESTART;
}
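
/*
 * The hrtimer only exists to keep the software count fresh: the energy
 * MSRs are 32 bits wide and wrap silently, so every active event is
 * re-read well before a wrap can occur (see the rapl_timer_ms
 * computation in rapl_check_hw_unit() below).
 */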

static void rapl_hrtimer_init(struct rapl_pmu *pmu)
{
        struct hrtimer *hr = &pmu->hrtimer;

        hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
        hr->function = rapl_hrtimer_handle;
}

static void __rapl_pmu_event_start(struct rapl_pmu *pmu,
                                   struct perf_event *event)
{
        if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED)))
                return;

        event->hw.state = 0;

        list_add_tail(&event->active_entry, &pmu->active_list);

        local64_set(&event->hw.prev_count, rapl_read_counter(event));

        pmu->n_active++;
        if (pmu->n_active == 1)
                rapl_start_hrtimer(pmu);
}

static void rapl_pmu_event_start(struct perf_event *event, int mode)
{
        struct rapl_pmu *pmu = event->pmu_private;
        unsigned long flags;

        raw_spin_lock_irqsave(&pmu->lock, flags);
        __rapl_pmu_event_start(pmu, event);
        raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static void rapl_pmu_event_stop(struct perf_event *event, int mode)
{
        struct rapl_pmu *pmu = event->pmu_private;
        struct hw_perf_event *hwc = &event->hw;
        unsigned long flags;

        raw_spin_lock_irqsave(&pmu->lock, flags);

        /* mark event as deactivated and stopped */
        if (!(hwc->state & PERF_HES_STOPPED)) {
                WARN_ON_ONCE(pmu->n_active <= 0);
                pmu->n_active--;
                if (pmu->n_active == 0)
                        hrtimer_cancel(&pmu->hrtimer);

                list_del(&event->active_entry);

                WARN_ON_ONCE(hwc->state & PERF_HES_STOPPED);
                hwc->state |= PERF_HES_STOPPED;
        }

        /* check if update of sw counter is necessary */
        if ((mode & PERF_EF_UPDATE) && !(hwc->state & PERF_HES_UPTODATE)) {
                /*
                 * Drain the remaining delta count out of an event
                 * that we are disabling:
                 */
                rapl_event_update(event);
                hwc->state |= PERF_HES_UPTODATE;
        }

        raw_spin_unlock_irqrestore(&pmu->lock, flags);
}

static int rapl_pmu_event_add(struct perf_event *event, int mode)
{
        struct rapl_pmu *pmu = event->pmu_private;
        struct hw_perf_event *hwc = &event->hw;
        unsigned long flags;

        raw_spin_lock_irqsave(&pmu->lock, flags);

        hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED;

        if (mode & PERF_EF_START)
                __rapl_pmu_event_start(pmu, event);

        raw_spin_unlock_irqrestore(&pmu->lock, flags);

        return 0;
}

static void rapl_pmu_event_del(struct perf_event *event, int flags)
{
        rapl_pmu_event_stop(event, PERF_EF_UPDATE);
}

static int rapl_pmu_event_init(struct perf_event *event)
{
        u64 cfg = event->attr.config & RAPL_EVENT_MASK;
        int bit, ret = 0;
        struct rapl_pmu *pmu;

        /* only look at RAPL events */
        if (event->attr.type != rapl_pmus->pmu.type)
                return -ENOENT;

        /* check only supported bits are set */
        if (event->attr.config & ~RAPL_EVENT_MASK)
                return -EINVAL;

        if (event->cpu < 0)
                return -EINVAL;

        event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG;

        if (!cfg || cfg >= NR_RAPL_DOMAINS + 1)
                return -EINVAL;

        cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1);
        bit = cfg - 1;

        /* check event supported */
        if (!(rapl_cntr_mask & (1 << bit)))
                return -EINVAL;

        /* unsupported modes and filters */
        if (event->attr.sample_period) /* no sampling */
                return -EINVAL;

        /* must be done before validate_group */
        pmu = cpu_to_rapl_pmu(event->cpu);
        if (!pmu)
                return -EINVAL;
        event->cpu = pmu->cpu;
        event->pmu_private = pmu;
        event->hw.event_base = rapl_msrs[bit].msr;
        event->hw.config = cfg;
        event->hw.idx = bit;

        return ret;
}
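
/*
 * For instance, an event opened with attr->config == 0x2
 * (rapl_energy_pkg) maps to bit 1 of rapl_cntr_mask and reads
 * MSR_PKG_ENERGY_STATUS (or MSR_AMD_PKG_ENERGY_STATUS) as its
 * event_base.
 */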

static void rapl_pmu_event_read(struct perf_event *event)
{
        rapl_event_update(event);
}

static ssize_t rapl_get_attr_cpumask(struct device *dev,
                                     struct device_attribute *attr, char *buf)
{
        return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask);
}

static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL);

static struct attribute *rapl_pmu_attrs[] = {
        &dev_attr_cpumask.attr,
        NULL,
};

static struct attribute_group rapl_pmu_attr_group = {
        .attrs = rapl_pmu_attrs,
};

RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01");
RAPL_EVENT_ATTR_STR(energy-pkg, rapl_pkg, "event=0x02");
RAPL_EVENT_ATTR_STR(energy-ram, rapl_ram, "event=0x03");
RAPL_EVENT_ATTR_STR(energy-gpu, rapl_gpu, "event=0x04");
RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05");

RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-pkg.unit, rapl_pkg_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-ram.unit, rapl_ram_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-gpu.unit, rapl_gpu_unit, "Joules");
RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules");

/*
 * we compute in 0.23 nJ increments regardless of MSR
 */
RAPL_EVENT_ATTR_STR(energy-cores.scale, rapl_cores_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10");
RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10");

/*
 * There are no default events, but we need to create
 * "events" group (with empty attrs) before updating
 * it with detected events.
 */
static struct attribute *attrs_empty[] = {
        NULL,
};

static struct attribute_group rapl_pmu_events_group = {
        .name = "events",
        .attrs = attrs_empty,
};

PMU_FORMAT_ATTR(event, "config:0-7");
static struct attribute *rapl_formats_attr[] = {
        &format_attr_event.attr,
        NULL,
};

static struct attribute_group rapl_pmu_format_group = {
        .name = "format",
        .attrs = rapl_formats_attr,
};

static const struct attribute_group *rapl_attr_groups[] = {
        &rapl_pmu_attr_group,
        &rapl_pmu_format_group,
        &rapl_pmu_events_group,
        NULL,
};

static struct attribute *rapl_events_cores[] = {
        EVENT_PTR(rapl_cores),
        EVENT_PTR(rapl_cores_unit),
        EVENT_PTR(rapl_cores_scale),
        NULL,
};

static struct attribute_group rapl_events_cores_group = {
        .name  = "events",
        .attrs = rapl_events_cores,
};

static struct attribute *rapl_events_pkg[] = {
        EVENT_PTR(rapl_pkg),
        EVENT_PTR(rapl_pkg_unit),
        EVENT_PTR(rapl_pkg_scale),
        NULL,
};

static struct attribute_group rapl_events_pkg_group = {
        .name  = "events",
        .attrs = rapl_events_pkg,
};

static struct attribute *rapl_events_ram[] = {
        EVENT_PTR(rapl_ram),
        EVENT_PTR(rapl_ram_unit),
        EVENT_PTR(rapl_ram_scale),
        NULL,
};

static struct attribute_group rapl_events_ram_group = {
        .name  = "events",
        .attrs = rapl_events_ram,
};

static struct attribute *rapl_events_gpu[] = {
        EVENT_PTR(rapl_gpu),
        EVENT_PTR(rapl_gpu_unit),
        EVENT_PTR(rapl_gpu_scale),
        NULL,
};

static struct attribute_group rapl_events_gpu_group = {
        .name  = "events",
        .attrs = rapl_events_gpu,
};

static struct attribute *rapl_events_psys[] = {
        EVENT_PTR(rapl_psys),
        EVENT_PTR(rapl_psys_unit),
        EVENT_PTR(rapl_psys_scale),
        NULL,
};

static struct attribute_group rapl_events_psys_group = {
        .name  = "events",
        .attrs = rapl_events_psys,
};

static bool test_msr(int idx, void *data)
{
        return test_bit(idx, (unsigned long *) data);
}

/* Only the lower 32 bits of the MSR represent the energy counter */
#define RAPL_MSR_MASK 0xFFFFFFFF

static struct perf_msr intel_rapl_msrs[] = {
        [PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, false, RAPL_MSR_MASK },
};

static struct perf_msr intel_rapl_spr_msrs[] = {
        [PERF_RAPL_PP0]  = { MSR_PP0_ENERGY_STATUS,      &rapl_events_cores_group, test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_PKG]  = { MSR_PKG_ENERGY_STATUS,      &rapl_events_pkg_group,   test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_RAM]  = { MSR_DRAM_ENERGY_STATUS,     &rapl_events_ram_group,   test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_PP1]  = { MSR_PP1_ENERGY_STATUS,      &rapl_events_gpu_group,   test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_PSYS] = { MSR_PLATFORM_ENERGY_STATUS, &rapl_events_psys_group,  test_msr, true, RAPL_MSR_MASK },
};

/*
 * Force to PERF_RAPL_MAX size due to:
 * - perf_msr_probe(PERF_RAPL_MAX)
 * - want to use same event codes across both architectures
 */
static struct perf_msr amd_rapl_msrs[] = {
        [PERF_RAPL_PP0]  = { 0, &rapl_events_cores_group, NULL, false, 0 },
        [PERF_RAPL_PKG]  = { MSR_AMD_PKG_ENERGY_STATUS,  &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK },
        [PERF_RAPL_RAM]  = { 0, &rapl_events_ram_group,  NULL, false, 0 },
        [PERF_RAPL_PP1]  = { 0, &rapl_events_gpu_group,  NULL, false, 0 },
        [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 },
};

static int rapl_cpu_offline(unsigned int cpu)
{
        struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
        int target;

        /* Check if exiting cpu is used for collecting rapl events */
        if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask))
                return 0;

        pmu->cpu = -1;
        /* Find a new cpu to collect rapl events */
        target = cpumask_any_but(topology_die_cpumask(cpu), cpu);

        /* Migrate rapl events to the new target */
        if (target < nr_cpu_ids) {
                cpumask_set_cpu(target, &rapl_cpu_mask);
                pmu->cpu = target;
                perf_pmu_migrate_context(pmu->pmu, cpu, target);
        }
        return 0;
}

static int rapl_cpu_online(unsigned int cpu)
{
        struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu);
        int target;

        if (!pmu) {
                pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu));
                if (!pmu)
                        return -ENOMEM;

                raw_spin_lock_init(&pmu->lock);
                INIT_LIST_HEAD(&pmu->active_list);
                pmu->pmu = &rapl_pmus->pmu;
                pmu->timer_interval = ms_to_ktime(rapl_timer_ms);
                rapl_hrtimer_init(pmu);

                rapl_pmus->pmus[topology_logical_die_id(cpu)] = pmu;
        }

        /*
         * Check if there is an online cpu in the package which collects rapl
         * events already.
         */
        target = cpumask_any_and(&rapl_cpu_mask, topology_die_cpumask(cpu));
        if (target < nr_cpu_ids)
                return 0;

        cpumask_set_cpu(cpu, &rapl_cpu_mask);
        pmu->cpu = cpu;
        return 0;
}

static int rapl_check_hw_unit(struct rapl_model *rm)
{
        u64 msr_rapl_power_unit_bits;
        int i;

        /* protect rdmsrl() to handle virtualization */
        if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits))
                return -1;
        for (i = 0; i < NR_RAPL_DOMAINS; i++)
                rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL;

        switch (rm->unit_quirk) {
        /*
         * DRAM domain on HSW server and KNL has a fixed energy unit which
         * can be different from the unit in the power unit MSR. See
         * "Intel Xeon Processor E5-1600 and E5-2600 v3 Product Families, V2
         * of 2. Datasheet, September 2014, Reference Number: 330784-001"
         */
        case RAPL_UNIT_QUIRK_INTEL_HSW:
                rapl_hw_unit[PERF_RAPL_RAM] = 16;
                break;
        /* SPR uses a fixed energy unit for Psys domain. */
        case RAPL_UNIT_QUIRK_INTEL_SPR:
                rapl_hw_unit[PERF_RAPL_PSYS] = 0;
                break;
        default:
                break;
        }

        /*
         * Calculate the timer rate:
         * Use reference of 200W for scaling the timeout to avoid counter
         * overflows. 200W = 200 Joules/sec
         * Divide interval by 2 to avoid lockstep (2 * 100)
         * if hw unit is 32, then we use 2 ms (1/200/2)
         */
        rapl_timer_ms = 2;
        if (rapl_hw_unit[0] < 32) {
                rapl_timer_ms = (1000 / (2 * 100));
                rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1));
        }
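
        /*
         * Worked example with the common 2^-16 J unit: at 200 W the
         * counter gains 200 * 2^16 ticks/sec, so the 32-bit MSR wraps
         * after about 327 seconds; the interval computed above is
         * 5 ms * 2^15 = 163840 ms, roughly half the wrap time.
         */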
        return 0;
}

static void __init rapl_advertise(void)
{
        int i;

        pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n",
                hweight32(rapl_cntr_mask), rapl_timer_ms);

        for (i = 0; i < NR_RAPL_DOMAINS; i++) {
                if (rapl_cntr_mask & (1 << i)) {
                        pr_info("hw unit of domain %s 2^-%d Joules\n",
                                rapl_domain_names[i], rapl_hw_unit[i]);
                }
        }
}

static void cleanup_rapl_pmus(void)
{
        int i;

        for (i = 0; i < rapl_pmus->maxdie; i++)
                kfree(rapl_pmus->pmus[i]);
        kfree(rapl_pmus);
}

static const struct attribute_group *rapl_attr_update[] = {
        &rapl_events_cores_group,
        &rapl_events_pkg_group,
        &rapl_events_ram_group,
        &rapl_events_gpu_group,
        &rapl_events_psys_group,
        NULL,
};

static int __init init_rapl_pmus(void)
{
        int maxdie = topology_max_packages() * topology_max_dies_per_package();
        size_t size;

        size = sizeof(*rapl_pmus) + maxdie * sizeof(struct rapl_pmu *);
        rapl_pmus = kzalloc(size, GFP_KERNEL);
        if (!rapl_pmus)
                return -ENOMEM;

        rapl_pmus->maxdie               = maxdie;
        rapl_pmus->pmu.attr_groups      = rapl_attr_groups;
        rapl_pmus->pmu.attr_update      = rapl_attr_update;
        rapl_pmus->pmu.task_ctx_nr      = perf_invalid_context;
        rapl_pmus->pmu.event_init       = rapl_pmu_event_init;
        rapl_pmus->pmu.add              = rapl_pmu_event_add;
        rapl_pmus->pmu.del              = rapl_pmu_event_del;
        rapl_pmus->pmu.start            = rapl_pmu_event_start;
        rapl_pmus->pmu.stop             = rapl_pmu_event_stop;
        rapl_pmus->pmu.read             = rapl_pmu_event_read;
        rapl_pmus->pmu.module           = THIS_MODULE;
        rapl_pmus->pmu.capabilities     = PERF_PMU_CAP_NO_EXCLUDE;
        return 0;
}

static struct rapl_model model_snb = {
        .events         = BIT(PERF_RAPL_PP0) |
                          BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_PP1),
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_snbep = {
        .events         = BIT(PERF_RAPL_PP0) |
                          BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_RAM),
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_hsw = {
        .events         = BIT(PERF_RAPL_PP0) |
                          BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_RAM) |
                          BIT(PERF_RAPL_PP1),
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_hsx = {
        .events         = BIT(PERF_RAPL_PP0) |
                          BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_RAM),
        .unit_quirk     = RAPL_UNIT_QUIRK_INTEL_HSW,
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_knl = {
        .events         = BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_RAM),
        .unit_quirk     = RAPL_UNIT_QUIRK_INTEL_HSW,
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_skl = {
        .events         = BIT(PERF_RAPL_PP0) |
                          BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_RAM) |
                          BIT(PERF_RAPL_PP1) |
                          BIT(PERF_RAPL_PSYS),
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_msrs,
};

static struct rapl_model model_spr = {
        .events         = BIT(PERF_RAPL_PP0) |
                          BIT(PERF_RAPL_PKG) |
                          BIT(PERF_RAPL_RAM) |
                          BIT(PERF_RAPL_PSYS),
        .unit_quirk     = RAPL_UNIT_QUIRK_INTEL_SPR,
        .msr_power_unit = MSR_RAPL_POWER_UNIT,
        .rapl_msrs      = intel_rapl_spr_msrs,
};

static struct rapl_model model_amd_hygon = {
        .events         = BIT(PERF_RAPL_PKG),
        .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT,
        .rapl_msrs      = amd_rapl_msrs,
};

static const struct x86_cpu_id rapl_model_match[] __initconst = {
        X86_MATCH_FEATURE(X86_FEATURE_RAPL,             &model_amd_hygon),
        X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE,         &model_snb),
        X86_MATCH_INTEL_FAM6_MODEL(SANDYBRIDGE_X,       &model_snbep),
        X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE,           &model_snb),
        X86_MATCH_INTEL_FAM6_MODEL(IVYBRIDGE_X,         &model_snbep),
        X86_MATCH_INTEL_FAM6_MODEL(HASWELL,             &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(HASWELL_X,           &model_hsx),
        X86_MATCH_INTEL_FAM6_MODEL(HASWELL_L,           &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(HASWELL_G,           &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(BROADWELL,           &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_G,         &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_X,         &model_hsx),
        X86_MATCH_INTEL_FAM6_MODEL(BROADWELL_D,         &model_hsx),
        X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL,        &model_knl),
        X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM,        &model_knl),
        X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_L,           &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE,             &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X,           &model_hsx),
        X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE_L,          &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE,            &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(CANNONLAKE_L,        &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT,       &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_D,     &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT_PLUS,  &model_hsw),
        X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_L,           &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(ICELAKE,             &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D,           &model_hsx),
        X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X,           &model_hsx),
        X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE_L,         &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(COMETLAKE,           &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE_L,         &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(TIGERLAKE,           &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE,           &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L,         &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(ATOM_GRACEMONT,      &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X,    &model_spr),
        X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X,     &model_spr),
        X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE,          &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_P,        &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(RAPTORLAKE_S,        &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE,          &model_skl),
        X86_MATCH_INTEL_FAM6_MODEL(METEORLAKE_L,        &model_skl),
        {},
};
MODULE_DEVICE_TABLE(x86cpu, rapl_model_match);

static int __init rapl_pmu_init(void)
{
        const struct x86_cpu_id *id;
        struct rapl_model *rm;
        int ret;

        id = x86_match_cpu(rapl_model_match);
        if (!id)
                return -ENODEV;

        rm = (struct rapl_model *) id->driver_data;

        rapl_msrs = rm->rapl_msrs;

        rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX,
                                        false, (void *) &rm->events);

        ret = rapl_check_hw_unit(rm);
        if (ret)
                return ret;

        ret = init_rapl_pmus();
        if (ret)
                return ret;

        /*
         * Install callbacks. Core will call them for each online cpu.
         */
        ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE,
                                "perf/x86/rapl:online",
                                rapl_cpu_online, rapl_cpu_offline);
        if (ret)
                goto out;

        ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1);
        if (ret)
                goto out1;

        rapl_advertise();
        return 0;

out1:
        cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE);
out:
        pr_warn("Initialization failed (%d), disabled\n", ret);
        cleanup_rapl_pmus();
        return ret;
}
module_init(rapl_pmu_init);

static void __exit intel_rapl_exit(void)
{
        cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE);
        perf_pmu_unregister(&rapl_pmus->pmu);
        cleanup_rapl_pmus();
}
module_exit(intel_rapl_exit);