// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
#include <asm/cpu_device_id.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);

struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);

/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
	/* Instruction-Accurate PDIR (PDIR++) */
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
	{}
};

/* Precise Distribution (PDist) */
static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
	{}
};

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However AMD doesn't support fixed counters;
 * - There are three types of index to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn and, for families 15H and later,
 *      MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
 *      aliased to MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This normally is used by the RDPMC instruction.
 *      For instance the AMD RDPMC instruction uses 0000_0003h in ECX to access
 *      C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism, except
 *      that it also supports fixed counters. idx can be used as an index
 *      into the gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed). The mapping between pmc and
 *      perf counters is as follows:
 *      * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *               [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
 *               and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
 */
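
/*
 * Concrete illustration of the three index types (editorial note, stated as
 * a typical Intel example rather than an exhaustive rule): fixed counter 1 is
 * backed by MSR_CORE_PERF_FIXED_CTR1, is selected by RDPMC with bit 30 set in
 * ECX (idx = 0x40000001), and carries a global PMC index of
 * KVM_FIXED_PMC_BASE_IDX + 1 in kvm_pmc.idx.
 */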

static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;

#define KVM_X86_PMU_OP(func)					     \
	DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func,		     \
				*(((struct kvm_pmu_ops *)0)->func));
#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>

void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
{
	memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));

#define __KVM_X86_PMU_OP(func) \
	static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
#define KVM_X86_PMU_OP(func) \
	WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>
#undef __KVM_X86_PMU_OP
}

static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	bool skip_pmi = false;

	if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
		if (!in_pmi) {
			/*
			 * TODO: KVM currently _chooses_ not to generate PEBS
			 * records for emulated instructions, which avoids a
			 * BUFFER_OVF PMI when there are no records.  Strictly
			 * speaking, records should be generated in the right
			 * context as well, to improve sampling accuracy.
			 */
			skip_pmi = true;
		} else {
			/* Indicate PEBS overflow PMI to guest. */
			skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
						      (unsigned long *)&pmu->global_status);
		}
	} else {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
	}

	if (pmc->intr && !skip_pmi)
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;

	/*
	 * Ignore asynchronous overflow events for counters that are scheduled
	 * to be reprogrammed, e.g. if a PMI for the previous event races with
	 * KVM's handling of a related guest WRMSR.
	 */
	if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
		return;

	__kvm_perf_overflow(pmc, true);

	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}

static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
{
	/*
	 * For some model-specific PEBS counters with special capabilities
	 * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise level
	 * to the maximum value (currently 3, backwards compatible) so that
	 * the perf subsystem will assign a hardware counter with that
	 * capability to the vPMC.
	 */
	if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
	    (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
		return 3;

	/*
	 * A non-zero precision level turns an ordinary guest event into a
	 * guest PEBS event, which triggers the host PEBS PMI handler to
	 * determine whether the PEBS overflow PMI comes from the host
	 * counters or the guest.
	 */
	return 1;
}

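/*
 * Editorial note with a worked example (values chosen purely for
 * illustration): the guest programs a counter with the value it wants to
 * count *up* from, so the sample period is the distance to overflow.  For a
 * 48-bit counter written with 0xffffffffff00, the period below is
 * (-0xffffffffff00) & (BIT_ULL(48) - 1) = 0x100, i.e. perf fires after 256
 * events, matching the guest's expected overflow point.
 */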
static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
{
	u64 sample_period = (-counter_value) & pmc_bitmask(pmc);

	if (!sample_period)
		sample_period = pmc_bitmask(pmc) + 1;
	return sample_period;
}

static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
				 bool exclude_user, bool exclude_kernel,
				 bool intr)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};
	bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);

	attr.sample_period = get_sample_period(pmc, pmc->counter);

	if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
	    guest_cpuid_is_intel(pmc->vcpu)) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period. Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
	}
	if (pebs) {
		/*
		 * For most PEBS hardware events, the difference in the software
		 * precision levels of guest and host PEBS events will not affect
		 * the accuracy of the PEBS profiling result, because the "event IP"
		 * in the PEBS record is calibrated on the guest side.
		 */
		attr.precise_ip = pmc_get_pebs_precise_level(pmc);
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
				     PTR_ERR(event), pmc->idx);
		return PTR_ERR(event);
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	pmc->is_paused = false;
	pmc->intr = intr || pebs;
	return 0;
}

static bool pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;
	u64 prev_counter;

	/* update counter, reset event value to avoid redundant accumulation */
	if (pmc->perf_event && !pmc->is_paused)
		counter += perf_event_pause(pmc->perf_event, true);

	/*
	 * Snapshot the previous counter *after* accumulating state from perf.
	 * If overflow already happened, hardware (via perf) is responsible for
	 * generating a PMI.  KVM just needs to detect overflow on emulated
	 * counter events that haven't yet been processed.
	 */
	prev_counter = counter & pmc_bitmask(pmc);

	counter += pmc->emulated_counter;
	pmc->counter = counter & pmc_bitmask(pmc);

	pmc->emulated_counter = 0;
	pmc->is_paused = true;

	return pmc->counter < prev_counter;
}

static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* recalibrate sample period and check if it's accepted by perf core */
	if (is_sampling_event(pmc->perf_event) &&
	    perf_event_period(pmc->perf_event,
			      get_sample_period(pmc, pmc->counter)))
		return false;

	if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
	    (!!pmc->perf_event->attr.precise_ip))
		return false;

	/* reuse perf_event to serve as pmc_reprogram_counter() does */
	perf_event_enable(pmc->perf_event);
	pmc->is_paused = false;

	return true;
}

static void pmc_release_perf_event(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		perf_event_release_kernel(pmc->perf_event);
		pmc->perf_event = NULL;
		pmc->current_config = 0;
		pmc_to_pmu(pmc)->event_count--;
	}
}

static void pmc_stop_counter(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		pmc->counter = pmc_read_counter(pmc);
		pmc_release_perf_event(pmc);
	}
}

static void pmc_update_sample_period(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event || pmc->is_paused ||
	    !is_sampling_event(pmc->perf_event))
		return;

	perf_event_period(pmc->perf_event,
			  get_sample_period(pmc, pmc->counter));
}

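/*
 * Illustrative example (editorial, hypothetical values): if a counter
 * currently reads back as 0x1234 (perf delta plus emulated events) and the
 * guest writes 0x10, the adjustment below shifts pmc->counter by
 * 0x10 - 0x1234 so that the next pmc_read_counter() returns 0x10, all without
 * pausing or reprogramming the perf event.
 */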
void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
{
	/*
	 * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
	 * read-modify-write.  Adjust the counter value so that its value is
	 * relative to the current count, as reading the current count from
	 * perf is faster than pausing and reprogramming the event in order to
	 * reset it to '0'.  Note, this very sneakily offsets the accumulated
	 * emulated count too, by using pmc_read_counter()!
	 */
	pmc->emulated_counter = 0;
	pmc->counter += val - pmc_read_counter(pmc);
	pmc->counter &= pmc_bitmask(pmc);
	pmc_update_sample_period(pmc);
}
EXPORT_SYMBOL_GPL(pmc_write_counter);

static int filter_cmp(const void *pa, const void *pb, u64 mask)
{
	u64 a = *(u64 *)pa & mask;
	u64 b = *(u64 *)pb & mask;

	return (a > b) - (a < b);
}

static int filter_sort_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
				   KVM_PMU_MASKED_ENTRY_EXCLUDE));
}

/*
 * For the event filter, searching is done on the 'includes' list and
 * 'excludes' list separately rather than on the 'events' list (which
 * has both).  As a result the exclude bit can be ignored.
 */
static int filter_event_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
}

static int find_filter_index(u64 *events, u64 nevents, u64 key)
{
	u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
			  filter_event_cmp);

	if (!fe)
		return -1;

	return fe - events;
}

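/*
 * Editorial note on masked filter entries: each entry packs an event select,
 * a umask match value, a umask mask, and an exclude bit (see
 * KVM_PMU_ENCODE_MASKED_ENTRY).  An entry matches a guest event when
 * (guest_umask & umask_mask) == umask_match.  E.g. a hypothetical entry with
 * umask_mask = 0xff and umask_match = 0xc0 matches only unit mask 0xc0,
 * whereas umask_mask = 0x00 with umask_match = 0x00 matches any unit mask
 * for that event select.
 */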
static bool is_filter_entry_match(u64 filter_event, u64 umask)
{
	u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
	u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;

	BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
		     (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
		     ARCH_PERFMON_EVENTSEL_UMASK);

	return (umask & mask) == match;
}

static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
{
	u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
	u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
	int i, index;

	index = find_filter_index(events, nevents, event_select);
	if (index < 0)
		return false;

	/*
	 * Entries are sorted by the event select.  Walk the list in both
	 * directions to process all entries with the targeted event select.
	 */
	for (i = index; i < nevents; i++) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	for (i = index - 1; i >= 0; i--) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	return false;
}

static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
				u64 eventsel)
{
	if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
	    !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
		return f->action == KVM_PMU_EVENT_ALLOW;

	return f->action == KVM_PMU_EVENT_DENY;
}

static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
				   int idx)
{
	int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;

	if (filter->action == KVM_PMU_EVENT_DENY &&
	    test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;
	if (filter->action == KVM_PMU_EVENT_ALLOW &&
	    !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;

	return true;
}

static bool check_pmu_event_filter(struct kvm_pmc *pmc)
{
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (!filter)
		return true;

	if (pmc_is_gp(pmc))
		return is_gp_event_allowed(filter, pmc->eventsel);

	return is_fixed_event_allowed(filter, pmc->idx);
}

static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
{
	return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
	       check_pmu_event_filter(pmc);
}

static int reprogram_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 eventsel = pmc->eventsel;
	u64 new_config = eventsel;
	bool emulate_overflow;
	u8 fixed_ctr_ctrl;

	emulate_overflow = pmc_pause_counter(pmc);

	if (!pmc_event_is_allowed(pmc))
		return 0;

	if (emulate_overflow)
		__kvm_perf_overflow(pmc, false);

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

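	/*
	 * For fixed counters, the controls come from the per-counter field in
	 * the guest's IA32_FIXED_CTR_CTRL: bit 0 enables counting at CPL 0
	 * (OS), bit 1 at CPL > 0 (USR), and bit 3 requests a PMI on overflow;
	 * bit 2 (AnyThread) is not translated here.
	 */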
	if (pmc_is_fixed(pmc)) {
		fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
						  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		if (fixed_ctr_ctrl & 0x1)
			eventsel |= ARCH_PERFMON_EVENTSEL_OS;
		if (fixed_ctr_ctrl & 0x2)
			eventsel |= ARCH_PERFMON_EVENTSEL_USR;
		if (fixed_ctr_ctrl & 0x8)
			eventsel |= ARCH_PERFMON_EVENTSEL_INT;
		new_config = (u64)fixed_ctr_ctrl;
	}

	if (pmc->current_config == new_config && pmc_resume_counter(pmc))
		return 0;

	pmc_release_perf_event(pmc);

	pmc->current_config = new_config;

	return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
				     (eventsel & pmu->raw_event_mask),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
				     eventsel & ARCH_PERFMON_EVENTSEL_INT);
}

void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int bit;

	bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	/*
	 * The reprogramming bitmap can be written asynchronously by something
	 * other than the task that holds vcpu->mutex, take care to clear only
	 * the bits that will actually be processed.
	 */
	BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
	atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);

	kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
		/*
		 * If reprogramming fails, e.g. due to contention, re-set the
		 * reprogram bit, i.e. opportunistically try again on the next
		 * PMU refresh.  Don't make a new request as doing so can stall
		 * the guest if reprogramming repeatedly fails.
		 */
		if (reprogram_counter(pmc))
			set_bit(pmc->idx, pmu->reprogram_pmi);
	}

	/*
	 * Unused perf_events are only released if the corresponding MSRs
	 * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
	 * triggers KVM_REQ_PMU if cleanup is needed.
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);
}

int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
	/*
	 * On Intel, VMX interception has priority over RDPMC exceptions that
	 * aren't already handled by the emulator, i.e. no additional checks
	 * are needed for Intel PMUs.
	 *
	 * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
	 * i.e. an invalid PMC results in a #GP, not #VMEXIT.
	 */
	if (!kvm_pmu_ops.check_rdpmc_early)
		return 0;

	return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
	    (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
	    kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu)) {
		static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
	}
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_CORE_PERF_GLOBAL_CTRL:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
	default:
		break;
	}
	return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
		static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;

	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		msr_info->data = pmu->global_status;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
	case MSR_CORE_PERF_GLOBAL_CTRL:
		msr_info->data = pmu->global_ctrl;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		msr_info->data = 0;
		break;
	default:
		return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
	}

	return 0;
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;
	u64 data = msr_info->data;
	u64 diff;

	/*
	 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
	 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
	 */
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			return 1; /* RO MSR */
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		/* Per PPR, Read-only MSR. Writes are ignored. */
		if (!msr_info->host_initiated)
			break;

		if (data & pmu->global_status_mask)
			return 1;

		pmu->global_status = data;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_mask;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

		if (pmu->global_ctrl != data) {
			diff = pmu->global_ctrl ^ data;
			pmu->global_ctrl = data;
			reprogram_counters(pmu, diff);
		}
		break;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		/*
		 * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in
		 * GLOBAL_STATUS, and so the set of reserved bits is the same.
		 */
		if (data & pmu->global_status_mask)
			return 1;
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
		if (!msr_info->host_initiated)
			pmu->global_status &= ~data;
		break;
	default:
		kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
		return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
	}

	return 0;
}

static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	pmu->need_cleanup = false;

	bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
		pmc_stop_counter(pmc);
		pmc->counter = 0;
		pmc->emulated_counter = 0;

		if (pmc_is_gp(pmc))
			pmc->eventsel = 0;
	}

	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;

	static_call_cond(kvm_x86_pmu_reset)(vcpu);
}

/*
 * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
 * and/or PERF_CAPABILITIES.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
		return;

	/*
	 * Stop/release all existing counters/events before realizing the new
	 * vPMU model.
	 */
	kvm_pmu_reset(vcpu);

	pmu->version = 0;
	pmu->nr_arch_gp_counters = 0;
	pmu->nr_arch_fixed_counters = 0;
	pmu->counter_bitmask[KVM_PMC_GP] = 0;
	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
	pmu->reserved_bits = 0xffffffff00200000ull;
	pmu->raw_event_mask = X86_RAW_EVENT_MASK;
	pmu->global_ctrl_mask = ~0ull;
	pmu->global_status_mask = ~0ull;
	pmu->fixed_ctr_ctrl_mask = ~0ull;
	pmu->pebs_enable_mask = ~0ull;
	pmu->pebs_data_cfg_mask = ~0ull;
	bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);

	if (!vcpu->kvm->arch.enable_pmu)
		return;

	static_call(kvm_x86_pmu_refresh)(vcpu);

	/*
	 * At RESET, both Intel and AMD CPUs set all enable bits for general
	 * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
	 * was written for v1 PMUs doesn't unknowingly leave GP counters
	 * disabled in the global controls).  Emulate that behavior when
	 * refreshing the PMU so that userspace doesn't need to manually set
	 * PERF_GLOBAL_CTRL.
	 */
	if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
		pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	static_call(kvm_x86_pmu_init)(vcpu);
	kvm_pmu_refresh(vcpu);
}

/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, bitmask) {
		if (pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

	static_call_cond(kvm_x86_pmu_cleanup)(vcpu);

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
	pmc->emulated_counter++;
	kvm_pmu_request_counter_reprogram(pmc);
}

static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
	bool select_os, select_user;
	u64 config;

	if (pmc_is_gp(pmc)) {
		config = pmc->eventsel;
		select_os = config & ARCH_PERFMON_EVENTSEL_OS;
		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
	} else {
		config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
					  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		select_os = config & 0x1;
		select_user = config & 0x2;
	}

	/*
	 * Skip the CPL lookup, which isn't free on Intel, if the result will
	 * be the same regardless of the CPL.
	 */
	if (select_os == select_user)
		return select_os;

	return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}

void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);

	if (!kvm_pmu_has_perf_global_ctrl(pmu))
		bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
	else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx,
			     (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
		return;

	kvm_for_each_pmc(pmu, pmc, i, bitmap) {
		/*
		 * Ignore checks for edge detect (all events currently emulated
		 * by KVM are always rising edges), pin control (unsupported
		 * by modern CPUs), and counter mask and its invert flag (KVM
		 * doesn't emulate multiple events in a single clock cycle).
		 *
		 * Note, the uppermost nibble of AMD's mask overlaps Intel's
		 * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
		 * bits (bits 35:34).  Checking the "in HLE/RTM transaction"
		 * flags is correct as the vCPU can't be in a transaction if
		 * KVM is emulating an instruction.  Checking the reserved bits
		 * might be wrong if they are defined in the future, but so
		 * could ignoring them, so do the simple thing for now.
		 */
		if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
		    !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
			continue;

		kvm_pmu_incr_counter(pmc);
	}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);

static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
{
	u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
		   KVM_PMU_MASKED_ENTRY_UMASK_MASK |
		   KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
		   KVM_PMU_MASKED_ENTRY_EXCLUDE;
	int i;

	for (i = 0; i < filter->nevents; i++) {
		if (filter->events[i] & ~mask)
			return false;
	}

	return true;
}

static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
{
	int i, j;

	for (i = 0, j = 0; i < filter->nevents; i++) {
		/*
		 * Skip events that are impossible to match against a guest
		 * event.  When filtering, only the event select + unit mask
		 * of the guest event is used.  To maintain backwards
		 * compatibility, impossible filters can't be rejected :-(
		 */
		if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
					  ARCH_PERFMON_EVENTSEL_UMASK))
			continue;
		/*
		 * Convert userspace events to a common in-kernel event so
		 * only one code path is needed to support both events.  For
		 * the in-kernel events use masked events because they are
		 * flexible enough to handle both cases.  To convert to masked
		 * events all that's needed is to add an "all ones" umask_mask
		 * (unmasked filter events don't support EXCLUDE).
		 */
		filter->events[j++] = filter->events[i] |
				      (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
	}

	filter->nevents = j;
}

static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
{
	int i;

	if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
		convert_to_masked_filter(filter);
	else if (!is_masked_filter_valid(filter))
		return -EINVAL;

	/*
	 * Sort entries by event select and includes vs. excludes so that all
	 * entries for a given event select can be processed efficiently during
	 * filtering.  The EXCLUDE flag uses a more significant bit than the
	 * event select, and so the sorted list is also effectively split into
	 * includes and excludes sub-lists.
	 */
	sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
	     filter_sort_cmp, NULL);

	i = filter->nevents;
	/* Find the first EXCLUDE event (only supported for masked events). */
	if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
		for (i = 0; i < filter->nevents; i++) {
			if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
				break;
		}
	}

	filter->nr_includes = i;
	filter->nr_excludes = filter->nevents - filter->nr_includes;
	filter->includes = filter->events;
	filter->excludes = filter->events + filter->nr_includes;

	return 0;
}

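/*
 * Rough userspace usage sketch (editorial, not part of this file; field names
 * follow the uapi struct kvm_pmu_event_filter and may need adjusting for a
 * given kernel headers version):
 *
 *	struct kvm_pmu_event_filter *f;
 *
 *	f = calloc(1, sizeof(*f) + sizeof(__u64));
 *	f->action = KVM_PMU_EVENT_ALLOW;
 *	f->nevents = 1;
 *	f->events[0] = 0xc0;	// allow only "instructions retired"
 *	ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, f);
 */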
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter __user *user_filter = argp;
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm_pmu_event_filter tmp;
	struct kvm_vcpu *vcpu;
	unsigned long i;
	size_t size;
	int r;

	if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	filter->action = tmp.action;
	filter->nevents = tmp.nevents;
	filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
	filter->flags = tmp.flags;

	r = -EFAULT;
	if (copy_from_user(filter->events, user_filter->events,
			   sizeof(filter->events[0]) * filter->nevents))
		goto cleanup;

	r = prepare_filter_lists(filter);
	if (r)
		goto cleanup;

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	mutex_unlock(&kvm->lock);
	synchronize_srcu_expedited(&kvm->srcu);

	BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
		     sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));

	kvm_for_each_vcpu(i, vcpu, kvm)
		atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);

	kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);

	r = 0;
cleanup:
	kfree(filter);
	return r;
}