// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine -- Performance Monitoring Unit support
 *
 * Copyright 2015 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@redhat.com>
 *   Gleb Natapov <gleb@redhat.com>
 *   Wei Huang    <wei@redhat.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/types.h>
#include <linux/kvm_host.h>
#include <linux/perf_event.h>
#include <linux/bsearch.h>
#include <linux/sort.h>
#include <asm/perf_event.h>
#include <asm/cpu_device_id.h>
#include "x86.h"
#include "cpuid.h"
#include "lapic.h"
#include "pmu.h"

/* This is enough to filter the vast majority of currently defined events. */
#define KVM_PMU_EVENT_FILTER_MAX_EVENTS 300

struct x86_pmu_capability __read_mostly kvm_pmu_cap;
EXPORT_SYMBOL_GPL(kvm_pmu_cap);

struct kvm_pmu_emulated_event_selectors __read_mostly kvm_pmu_eventsel;
EXPORT_SYMBOL_GPL(kvm_pmu_eventsel);

/* Precise Distribution of Instructions Retired (PDIR) */
static const struct x86_cpu_id vmx_pebs_pdir_cpu[] = {
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, NULL),
	X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, NULL),
	/* Instruction-Accurate PDIR (PDIR++) */
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
	{}
};

/* Precise Distribution (PDist) */
static const struct x86_cpu_id vmx_pebs_pdist_cpu[] = {
	X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, NULL),
	{}
};

/* NOTE:
 * - Each perf counter is defined as "struct kvm_pmc";
 * - There are two types of perf counters: general purpose (gp) and fixed.
 *   gp counters are stored in gp_counters[] and fixed counters are stored
 *   in fixed_counters[] respectively. Both of them are part of "struct
 *   kvm_pmu";
 * - pmu.c understands the difference between gp counters and fixed counters.
 *   However AMD doesn't support fixed counters;
 * - There are three types of index to access perf counters (PMC):
 *   1. MSR (named msr): For example Intel has MSR_IA32_PERFCTRn and AMD
 *      has MSR_K7_PERFCTRn and, for families 15H and later,
 *      MSR_F15H_PERF_CTRn, where MSR_F15H_PERF_CTR[0-3] are
 *      aliased to MSR_K7_PERFCTRn.
 *   2. MSR Index (named idx): This normally is used by the RDPMC instruction.
 *      For instance the AMD RDPMC instruction uses 0000_0003h in ECX to access
 *      C001_0007h (MSR_K7_PERFCTR3). Intel has a similar mechanism, except
 *      that it also supports fixed counters. idx can be used as an index
 *      into the gp and fixed counters.
 *   3. Global PMC Index (named pmc): pmc is an index specific to PMU
 *      code. Each pmc, stored in the kvm_pmc.idx field, is unique across
 *      all perf counters (both gp and fixed). The mapping between pmc and
 *      perf counters is as follows:
 *      * Intel: [0 .. KVM_INTEL_PMC_MAX_GENERIC-1] <=> gp counters
 *               [KVM_FIXED_PMC_BASE_IDX .. KVM_FIXED_PMC_BASE_IDX + 2] <=> fixed
 *      * AMD:   [0 .. AMD64_NUM_COUNTERS-1] and, for families 15H
 *               and later, [0 .. AMD64_NUM_COUNTERS_CORE-1] <=> gp counters
 */
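
/*
 * Concrete illustration of the three index types (editorial note, stated as
 * a typical Intel example rather than an exhaustive rule): fixed counter 1 is
 * backed by MSR_CORE_PERF_FIXED_CTR1, is selected by RDPMC with bit 30 set in
 * ECX (idx = 0x40000001), and carries a global PMC index of
 * KVM_FIXED_PMC_BASE_IDX + 1 in kvm_pmc.idx.
 */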

static struct kvm_pmu_ops kvm_pmu_ops __read_mostly;

#define KVM_X86_PMU_OP(func)					     \
	DEFINE_STATIC_CALL_NULL(kvm_x86_pmu_##func,		     \
				*(((struct kvm_pmu_ops *)0)->func));
#define KVM_X86_PMU_OP_OPTIONAL KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>

void kvm_pmu_ops_update(const struct kvm_pmu_ops *pmu_ops)
{
	memcpy(&kvm_pmu_ops, pmu_ops, sizeof(kvm_pmu_ops));

#define __KVM_X86_PMU_OP(func) \
	static_call_update(kvm_x86_pmu_##func, kvm_pmu_ops.func);
#define KVM_X86_PMU_OP(func) \
	WARN_ON(!kvm_pmu_ops.func); __KVM_X86_PMU_OP(func)
#define KVM_X86_PMU_OP_OPTIONAL __KVM_X86_PMU_OP
#include <asm/kvm-x86-pmu-ops.h>
#undef __KVM_X86_PMU_OP
}

static inline void __kvm_perf_overflow(struct kvm_pmc *pmc, bool in_pmi)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	bool skip_pmi = false;

	if (pmc->perf_event && pmc->perf_event->attr.precise_ip) {
		if (!in_pmi) {
			/*
			 * TODO: KVM currently _chooses_ not to generate PEBS
			 * records for emulated instructions, which avoids a
			 * BUFFER_OVF PMI when there are no records.  Strictly
			 * speaking, records should be generated in the right
			 * context as well, to improve sampling accuracy.
			 */
			skip_pmi = true;
		} else {
			/* Indicate PEBS overflow PMI to guest. */
			skip_pmi = __test_and_set_bit(GLOBAL_STATUS_BUFFER_OVF_BIT,
						      (unsigned long *)&pmu->global_status);
		}
	} else {
		__set_bit(pmc->idx, (unsigned long *)&pmu->global_status);
	}

	if (pmc->intr && !skip_pmi)
		kvm_make_request(KVM_REQ_PMI, pmc->vcpu);
}

static void kvm_perf_overflow(struct perf_event *perf_event,
			      struct perf_sample_data *data,
			      struct pt_regs *regs)
{
	struct kvm_pmc *pmc = perf_event->overflow_handler_context;

	/*
	 * Ignore asynchronous overflow events for counters that are scheduled
	 * to be reprogrammed, e.g. if a PMI for the previous event races with
	 * KVM's handling of a related guest WRMSR.
	 */
	if (test_and_set_bit(pmc->idx, pmc_to_pmu(pmc)->reprogram_pmi))
		return;

	__kvm_perf_overflow(pmc, true);

	kvm_make_request(KVM_REQ_PMU, pmc->vcpu);
}

static u64 pmc_get_pebs_precise_level(struct kvm_pmc *pmc)
{
	/*
	 * For some model-specific PEBS counters with special capabilities
	 * (PDIR, PDIR++, PDIST), KVM needs to raise the event precise level
	 * to the maximum value (currently 3, backwards compatible) so that
	 * the perf subsystem will assign a hardware counter with that
	 * capability to the vPMC.
	 */
	if ((pmc->idx == 0 && x86_match_cpu(vmx_pebs_pdist_cpu)) ||
	    (pmc->idx == 32 && x86_match_cpu(vmx_pebs_pdir_cpu)))
		return 3;

	/*
	 * A non-zero precision level turns an ordinary guest event into a
	 * guest PEBS event, which triggers the host PEBS PMI handler to
	 * determine whether the PEBS overflow PMI comes from the host
	 * counters or the guest.
	 */
	return 1;
}

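/*
 * Editorial note with a worked example (values chosen purely for
 * illustration): the guest programs a counter with the value it wants to
 * count *up* from, so the sample period is the distance to overflow.  For a
 * 48-bit counter written with 0xffffffffff00, the period below is
 * (-0xffffffffff00) & (BIT_ULL(48) - 1) = 0x100, i.e. perf fires after 256
 * events, matching the guest's expected overflow point.
 */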
static u64 get_sample_period(struct kvm_pmc *pmc, u64 counter_value)
{
	u64 sample_period = (-counter_value) & pmc_bitmask(pmc);

	if (!sample_period)
		sample_period = pmc_bitmask(pmc) + 1;
	return sample_period;
}

static int pmc_reprogram_counter(struct kvm_pmc *pmc, u32 type, u64 config,
				 bool exclude_user, bool exclude_kernel,
				 bool intr)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	struct perf_event *event;
	struct perf_event_attr attr = {
		.type = type,
		.size = sizeof(attr),
		.pinned = true,
		.exclude_idle = true,
		.exclude_host = 1,
		.exclude_user = exclude_user,
		.exclude_kernel = exclude_kernel,
		.config = config,
	};
	bool pebs = test_bit(pmc->idx, (unsigned long *)&pmu->pebs_enable);

	attr.sample_period = get_sample_period(pmc, pmc->counter);

	if ((attr.config & HSW_IN_TX_CHECKPOINTED) &&
	    guest_cpuid_is_intel(pmc->vcpu)) {
		/*
		 * HSW_IN_TX_CHECKPOINTED is not supported with nonzero
		 * period. Just clear the sample period so at least
		 * allocating the counter doesn't fail.
		 */
		attr.sample_period = 0;
	}
	if (pebs) {
		/*
		 * For most PEBS hardware events, the difference in the software
		 * precision levels of guest and host PEBS events will not affect
		 * the accuracy of the PEBS profiling result, because the "event IP"
		 * in the PEBS record is calibrated on the guest side.
		 */
		attr.precise_ip = pmc_get_pebs_precise_level(pmc);
	}

	event = perf_event_create_kernel_counter(&attr, -1, current,
						 kvm_perf_overflow, pmc);
	if (IS_ERR(event)) {
		pr_debug_ratelimited("kvm_pmu: event creation failed %ld for pmc->idx = %d\n",
				     PTR_ERR(event), pmc->idx);
		return PTR_ERR(event);
	}

	pmc->perf_event = event;
	pmc_to_pmu(pmc)->event_count++;
	pmc->is_paused = false;
	pmc->intr = intr || pebs;
	return 0;
}

static bool pmc_pause_counter(struct kvm_pmc *pmc)
{
	u64 counter = pmc->counter;
	u64 prev_counter;

	/* update counter, reset event value to avoid redundant accumulation */
	if (pmc->perf_event && !pmc->is_paused)
		counter += perf_event_pause(pmc->perf_event, true);

	/*
	 * Snapshot the previous counter *after* accumulating state from perf.
	 * If overflow already happened, hardware (via perf) is responsible for
	 * generating a PMI.  KVM just needs to detect overflow on emulated
	 * counter events that haven't yet been processed.
	 */
	prev_counter = counter & pmc_bitmask(pmc);

	counter += pmc->emulated_counter;
	pmc->counter = counter & pmc_bitmask(pmc);

	pmc->emulated_counter = 0;
	pmc->is_paused = true;

	return pmc->counter < prev_counter;
}

static bool pmc_resume_counter(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event)
		return false;

	/* recalibrate sample period and check if it's accepted by perf core */
	if (is_sampling_event(pmc->perf_event) &&
	    perf_event_period(pmc->perf_event,
			      get_sample_period(pmc, pmc->counter)))
		return false;

	if (test_bit(pmc->idx, (unsigned long *)&pmc_to_pmu(pmc)->pebs_enable) !=
	    (!!pmc->perf_event->attr.precise_ip))
		return false;

	/* reuse perf_event to serve as pmc_reprogram_counter() does */
	perf_event_enable(pmc->perf_event);
	pmc->is_paused = false;

	return true;
}

static void pmc_release_perf_event(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		perf_event_release_kernel(pmc->perf_event);
		pmc->perf_event = NULL;
		pmc->current_config = 0;
		pmc_to_pmu(pmc)->event_count--;
	}
}

static void pmc_stop_counter(struct kvm_pmc *pmc)
{
	if (pmc->perf_event) {
		pmc->counter = pmc_read_counter(pmc);
		pmc_release_perf_event(pmc);
	}
}

static void pmc_update_sample_period(struct kvm_pmc *pmc)
{
	if (!pmc->perf_event || pmc->is_paused ||
	    !is_sampling_event(pmc->perf_event))
		return;

	perf_event_period(pmc->perf_event,
			  get_sample_period(pmc, pmc->counter));
}

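/*
 * Illustrative example (editorial, hypothetical values): if a counter
 * currently reads back as 0x1234 (perf delta plus emulated events) and the
 * guest writes 0x10, the adjustment below shifts pmc->counter by
 * 0x10 - 0x1234 so that the next pmc_read_counter() returns 0x10, all without
 * pausing or reprogramming the perf event.
 */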
void pmc_write_counter(struct kvm_pmc *pmc, u64 val)
{
	/*
	 * Drop any unconsumed accumulated counts, the WRMSR is a write, not a
	 * read-modify-write.  Adjust the counter value so that its value is
	 * relative to the current count, as reading the current count from
	 * perf is faster than pausing and reprogramming the event in order to
	 * reset it to '0'.  Note, this very sneakily offsets the accumulated
	 * emulated count too, by using pmc_read_counter()!
	 */
	pmc->emulated_counter = 0;
	pmc->counter += val - pmc_read_counter(pmc);
	pmc->counter &= pmc_bitmask(pmc);
	pmc_update_sample_period(pmc);
}
EXPORT_SYMBOL_GPL(pmc_write_counter);

static int filter_cmp(const void *pa, const void *pb, u64 mask)
{
	u64 a = *(u64 *)pa & mask;
	u64 b = *(u64 *)pb & mask;

	return (a > b) - (a < b);
}

static int filter_sort_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT |
				   KVM_PMU_MASKED_ENTRY_EXCLUDE));
}

/*
 * For the event filter, searching is done on the 'includes' list and
 * 'excludes' list separately rather than on the 'events' list (which
 * has both).  As a result the exclude bit can be ignored.
 */
static int filter_event_cmp(const void *pa, const void *pb)
{
	return filter_cmp(pa, pb, (KVM_PMU_MASKED_ENTRY_EVENT_SELECT));
}

static int find_filter_index(u64 *events, u64 nevents, u64 key)
{
	u64 *fe = bsearch(&key, events, nevents, sizeof(events[0]),
			  filter_event_cmp);

	if (!fe)
		return -1;

	return fe - events;
}

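/*
 * Editorial note on masked filter entries: each entry packs an event select,
 * a umask match value, a umask mask, and an exclude bit (see
 * KVM_PMU_ENCODE_MASKED_ENTRY).  An entry matches a guest event when
 * (guest_umask & umask_mask) == umask_match.  E.g. a hypothetical entry with
 * umask_mask = 0xff and umask_match = 0xc0 matches only unit mask 0xc0,
 * whereas umask_mask = 0x00 with umask_match = 0x00 matches any unit mask
 * for that event select.
 */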
static bool is_filter_entry_match(u64 filter_event, u64 umask)
{
	u64 mask = filter_event >> (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8);
	u64 match = filter_event & KVM_PMU_MASKED_ENTRY_UMASK_MATCH;

	BUILD_BUG_ON((KVM_PMU_ENCODE_MASKED_ENTRY(0, 0xff, 0, false) >>
		     (KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT - 8)) !=
		     ARCH_PERFMON_EVENTSEL_UMASK);

	return (umask & mask) == match;
}

static bool filter_contains_match(u64 *events, u64 nevents, u64 eventsel)
{
	u64 event_select = eventsel & kvm_pmu_ops.EVENTSEL_EVENT;
	u64 umask = eventsel & ARCH_PERFMON_EVENTSEL_UMASK;
	int i, index;

	index = find_filter_index(events, nevents, event_select);
	if (index < 0)
		return false;

	/*
	 * Entries are sorted by the event select.  Walk the list in both
	 * directions to process all entries with the targeted event select.
	 */
	for (i = index; i < nevents; i++) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	for (i = index - 1; i >= 0; i--) {
		if (filter_event_cmp(&events[i], &event_select))
			break;

		if (is_filter_entry_match(events[i], umask))
			return true;
	}

	return false;
}

static bool is_gp_event_allowed(struct kvm_x86_pmu_event_filter *f,
				u64 eventsel)
{
	if (filter_contains_match(f->includes, f->nr_includes, eventsel) &&
	    !filter_contains_match(f->excludes, f->nr_excludes, eventsel))
		return f->action == KVM_PMU_EVENT_ALLOW;

	return f->action == KVM_PMU_EVENT_DENY;
}

static bool is_fixed_event_allowed(struct kvm_x86_pmu_event_filter *filter,
				   int idx)
{
	int fixed_idx = idx - KVM_FIXED_PMC_BASE_IDX;

	if (filter->action == KVM_PMU_EVENT_DENY &&
	    test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;
	if (filter->action == KVM_PMU_EVENT_ALLOW &&
	    !test_bit(fixed_idx, (ulong *)&filter->fixed_counter_bitmap))
		return false;

	return true;
}

static bool check_pmu_event_filter(struct kvm_pmc *pmc)
{
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm *kvm = pmc->vcpu->kvm;

	filter = srcu_dereference(kvm->arch.pmu_event_filter, &kvm->srcu);
	if (!filter)
		return true;

	if (pmc_is_gp(pmc))
		return is_gp_event_allowed(filter, pmc->eventsel);

	return is_fixed_event_allowed(filter, pmc->idx);
}

static bool pmc_event_is_allowed(struct kvm_pmc *pmc)
{
	return pmc_is_globally_enabled(pmc) && pmc_speculative_in_use(pmc) &&
	       check_pmu_event_filter(pmc);
}

static int reprogram_counter(struct kvm_pmc *pmc)
{
	struct kvm_pmu *pmu = pmc_to_pmu(pmc);
	u64 eventsel = pmc->eventsel;
	u64 new_config = eventsel;
	bool emulate_overflow;
	u8 fixed_ctr_ctrl;

	emulate_overflow = pmc_pause_counter(pmc);

	if (!pmc_event_is_allowed(pmc))
		return 0;

	if (emulate_overflow)
		__kvm_perf_overflow(pmc, false);

	if (eventsel & ARCH_PERFMON_EVENTSEL_PIN_CONTROL)
		printk_once("kvm pmu: pin control bit is ignored\n");

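	/*
	 * For fixed counters, the controls come from the per-counter field in
	 * the guest's IA32_FIXED_CTR_CTRL: bit 0 enables counting at CPL 0
	 * (OS), bit 1 at CPL > 0 (USR), and bit 3 requests a PMI on overflow;
	 * bit 2 (AnyThread) is not translated here.
	 */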
	if (pmc_is_fixed(pmc)) {
		fixed_ctr_ctrl = fixed_ctrl_field(pmu->fixed_ctr_ctrl,
						  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		if (fixed_ctr_ctrl & 0x1)
			eventsel |= ARCH_PERFMON_EVENTSEL_OS;
		if (fixed_ctr_ctrl & 0x2)
			eventsel |= ARCH_PERFMON_EVENTSEL_USR;
		if (fixed_ctr_ctrl & 0x8)
			eventsel |= ARCH_PERFMON_EVENTSEL_INT;
		new_config = (u64)fixed_ctr_ctrl;
	}

	if (pmc->current_config == new_config && pmc_resume_counter(pmc))
		return 0;

	pmc_release_perf_event(pmc);

	pmc->current_config = new_config;

	return pmc_reprogram_counter(pmc, PERF_TYPE_RAW,
				     (eventsel & pmu->raw_event_mask),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_USR),
				     !(eventsel & ARCH_PERFMON_EVENTSEL_OS),
				     eventsel & ARCH_PERFMON_EVENTSEL_INT);
}

void kvm_pmu_handle_event(struct kvm_vcpu *vcpu)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int bit;

	bitmap_copy(bitmap, pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	/*
	 * The reprogramming bitmap can be written asynchronously by something
	 * other than the task that holds vcpu->mutex, take care to clear only
	 * the bits that will actually be processed.
	 */
	BUILD_BUG_ON(sizeof(bitmap) != sizeof(atomic64_t));
	atomic64_andnot(*(s64 *)bitmap, &pmu->__reprogram_pmi);

	kvm_for_each_pmc(pmu, pmc, bit, bitmap) {
		/*
		 * If reprogramming fails, e.g. due to contention, re-set the
		 * reprogram bit, i.e. opportunistically try again on the next
		 * PMU refresh.  Don't make a new request as doing so can stall
		 * the guest if reprogramming repeatedly fails.
		 */
		if (reprogram_counter(pmc))
			set_bit(pmc->idx, pmu->reprogram_pmi);
	}

	/*
	 * Unused perf_events are only released if the corresponding MSRs
	 * weren't accessed during the last vCPU time slice. kvm_arch_sched_in
	 * triggers KVM_REQ_PMU if cleanup is needed.
	 */
	if (unlikely(pmu->need_cleanup))
		kvm_pmu_cleanup(vcpu);
}

int kvm_pmu_check_rdpmc_early(struct kvm_vcpu *vcpu, unsigned int idx)
{
	/*
	 * On Intel, VMX interception has priority over RDPMC exceptions that
	 * aren't already handled by the emulator, i.e. no additional checks
	 * are needed for Intel PMUs.
	 *
	 * On AMD, _all_ exceptions on RDPMC have priority over SVM intercepts,
	 * i.e. an invalid PMC results in a #GP, not #VMEXIT.
	 */
	if (!kvm_pmu_ops.check_rdpmc_early)
		return 0;

	return static_call(kvm_x86_pmu_check_rdpmc_early)(vcpu, idx);
}

bool is_vmware_backdoor_pmc(u32 pmc_idx)
{
	switch (pmc_idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		return true;
	}
	return false;
}

static int kvm_pmu_rdpmc_vmware(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	u64 ctr_val;

	switch (idx) {
	case VMWARE_BACKDOOR_PMC_HOST_TSC:
		ctr_val = rdtsc();
		break;
	case VMWARE_BACKDOOR_PMC_REAL_TIME:
		ctr_val = ktime_get_boottime_ns();
		break;
	case VMWARE_BACKDOOR_PMC_APPARENT_TIME:
		ctr_val = ktime_get_boottime_ns() +
			vcpu->kvm->arch.kvmclock_offset;
		break;
	default:
		return 1;
	}

	*data = ctr_val;
	return 0;
}

int kvm_pmu_rdpmc(struct kvm_vcpu *vcpu, unsigned idx, u64 *data)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	u64 mask = ~0ull;

	if (!pmu->version)
		return 1;

	if (is_vmware_backdoor_pmc(idx))
		return kvm_pmu_rdpmc_vmware(vcpu, idx, data);

	pmc = static_call(kvm_x86_pmu_rdpmc_ecx_to_pmc)(vcpu, idx, &mask);
	if (!pmc)
		return 1;

	if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCE) &&
	    (static_call(kvm_x86_get_cpl)(vcpu) != 0) &&
	    kvm_is_cr0_bit_set(vcpu, X86_CR0_PE))
		return 1;

	*data = pmc_read_counter(pmc) & mask;
	return 0;
}

void kvm_pmu_deliver_pmi(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu)) {
		static_call_cond(kvm_x86_pmu_deliver_pmi)(vcpu);
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTPC);
	}
}

bool kvm_pmu_is_valid_msr(struct kvm_vcpu *vcpu, u32 msr)
{
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_CORE_PERF_GLOBAL_CTRL:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		return kvm_pmu_has_perf_global_ctrl(vcpu_to_pmu(vcpu));
	default:
		break;
	}
	return static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr) ||
		static_call(kvm_x86_pmu_is_valid_msr)(vcpu, msr);
}

static void kvm_pmu_mark_pmc_in_use(struct kvm_vcpu *vcpu, u32 msr)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = static_call(kvm_x86_pmu_msr_idx_to_pmc)(vcpu, msr);

	if (pmc)
		__set_bit(pmc->idx, pmu->pmc_in_use);
}

int kvm_pmu_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;

	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		msr_info->data = pmu->global_status;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
	case MSR_CORE_PERF_GLOBAL_CTRL:
		msr_info->data = pmu->global_ctrl;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		msr_info->data = 0;
		break;
	default:
		return static_call(kvm_x86_pmu_get_msr)(vcpu, msr_info);
	}

	return 0;
}

int kvm_pmu_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	u32 msr = msr_info->index;
	u64 data = msr_info->data;
	u64 diff;

	/*
	 * Note, AMD ignores writes to reserved bits and read-only PMU MSRs,
	 * whereas Intel generates #GP on attempts to write reserved/RO MSRs.
	 */
	switch (msr) {
	case MSR_CORE_PERF_GLOBAL_STATUS:
		if (!msr_info->host_initiated)
			return 1; /* RO MSR */
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS:
		/* Per PPR, Read-only MSR. Writes are ignored. */
		if (!msr_info->host_initiated)
			break;

		if (data & pmu->global_status_mask)
			return 1;

		pmu->global_status = data;
		break;
	case MSR_AMD64_PERF_CNTR_GLOBAL_CTL:
		data &= ~pmu->global_ctrl_mask;
		fallthrough;
	case MSR_CORE_PERF_GLOBAL_CTRL:
		if (!kvm_valid_perf_global_ctrl(pmu, data))
			return 1;

		if (pmu->global_ctrl != data) {
			diff = pmu->global_ctrl ^ data;
			pmu->global_ctrl = data;
			reprogram_counters(pmu, diff);
		}
		break;
	case MSR_CORE_PERF_GLOBAL_OVF_CTRL:
		/*
		 * GLOBAL_OVF_CTRL, a.k.a. GLOBAL STATUS_RESET, clears bits in
		 * GLOBAL_STATUS, and so the set of reserved bits is the same.
		 */
		if (data & pmu->global_status_mask)
			return 1;
		fallthrough;
	case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR:
		if (!msr_info->host_initiated)
			pmu->global_status &= ~data;
		break;
	default:
		kvm_pmu_mark_pmc_in_use(vcpu, msr_info->index);
		return static_call(kvm_x86_pmu_set_msr)(vcpu, msr_info);
	}

	return 0;
}

static void kvm_pmu_reset(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	pmu->need_cleanup = false;

	bitmap_zero(pmu->reprogram_pmi, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, pmu->all_valid_pmc_idx) {
		pmc_stop_counter(pmc);
		pmc->counter = 0;
		pmc->emulated_counter = 0;

		if (pmc_is_gp(pmc))
			pmc->eventsel = 0;
	}

	pmu->fixed_ctr_ctrl = pmu->global_ctrl = pmu->global_status = 0;

	static_call_cond(kvm_x86_pmu_reset)(vcpu);
}

/*
 * Refresh the PMU configuration for the vCPU, e.g. if userspace changes CPUID
 * and/or PERF_CAPABILITIES.
 */
void kvm_pmu_refresh(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	if (KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm))
		return;

	/*
	 * Stop/release all existing counters/events before realizing the new
	 * vPMU model.
	 */
	kvm_pmu_reset(vcpu);

	pmu->version = 0;
	pmu->nr_arch_gp_counters = 0;
	pmu->nr_arch_fixed_counters = 0;
	pmu->counter_bitmask[KVM_PMC_GP] = 0;
	pmu->counter_bitmask[KVM_PMC_FIXED] = 0;
	pmu->reserved_bits = 0xffffffff00200000ull;
	pmu->raw_event_mask = X86_RAW_EVENT_MASK;
	pmu->global_ctrl_mask = ~0ull;
	pmu->global_status_mask = ~0ull;
	pmu->fixed_ctr_ctrl_mask = ~0ull;
	pmu->pebs_enable_mask = ~0ull;
	pmu->pebs_data_cfg_mask = ~0ull;
	bitmap_zero(pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);

	if (!vcpu->kvm->arch.enable_pmu)
		return;

	static_call(kvm_x86_pmu_refresh)(vcpu);

	/*
	 * At RESET, both Intel and AMD CPUs set all enable bits for general
	 * purpose counters in IA32_PERF_GLOBAL_CTRL (so that software that
	 * was written for v1 PMUs doesn't unknowingly leave GP counters
	 * disabled in the global controls).  Emulate that behavior when
	 * refreshing the PMU so that userspace doesn't need to manually set
	 * PERF_GLOBAL_CTRL.
	 */
	if (kvm_pmu_has_perf_global_ctrl(pmu) && pmu->nr_arch_gp_counters)
		pmu->global_ctrl = GENMASK_ULL(pmu->nr_arch_gp_counters - 1, 0);
}

void kvm_pmu_init(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);

	memset(pmu, 0, sizeof(*pmu));
	static_call(kvm_x86_pmu_init)(vcpu);
	kvm_pmu_refresh(vcpu);
}

/* Release perf_events for vPMCs that have been unused for a full time slice. */
void kvm_pmu_cleanup(struct kvm_vcpu *vcpu)
{
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc = NULL;
	DECLARE_BITMAP(bitmask, X86_PMC_IDX_MAX);
	int i;

	pmu->need_cleanup = false;

	bitmap_andnot(bitmask, pmu->all_valid_pmc_idx,
		      pmu->pmc_in_use, X86_PMC_IDX_MAX);

	kvm_for_each_pmc(pmu, pmc, i, bitmask) {
		if (pmc->perf_event && !pmc_speculative_in_use(pmc))
			pmc_stop_counter(pmc);
	}

	static_call_cond(kvm_x86_pmu_cleanup)(vcpu);

	bitmap_zero(pmu->pmc_in_use, X86_PMC_IDX_MAX);
}

void kvm_pmu_destroy(struct kvm_vcpu *vcpu)
{
	kvm_pmu_reset(vcpu);
}

static void kvm_pmu_incr_counter(struct kvm_pmc *pmc)
{
	pmc->emulated_counter++;
	kvm_pmu_request_counter_reprogram(pmc);
}

static inline bool cpl_is_matched(struct kvm_pmc *pmc)
{
	bool select_os, select_user;
	u64 config;

	if (pmc_is_gp(pmc)) {
		config = pmc->eventsel;
		select_os = config & ARCH_PERFMON_EVENTSEL_OS;
		select_user = config & ARCH_PERFMON_EVENTSEL_USR;
	} else {
		config = fixed_ctrl_field(pmc_to_pmu(pmc)->fixed_ctr_ctrl,
					  pmc->idx - KVM_FIXED_PMC_BASE_IDX);
		select_os = config & 0x1;
		select_user = config & 0x2;
	}

	/*
	 * Skip the CPL lookup, which isn't free on Intel, if the result will
	 * be the same regardless of the CPL.
	 */
	if (select_os == select_user)
		return select_os;

	return (static_call(kvm_x86_get_cpl)(pmc->vcpu) == 0) ? select_os : select_user;
}

void kvm_pmu_trigger_event(struct kvm_vcpu *vcpu, u64 eventsel)
{
	DECLARE_BITMAP(bitmap, X86_PMC_IDX_MAX);
	struct kvm_pmu *pmu = vcpu_to_pmu(vcpu);
	struct kvm_pmc *pmc;
	int i;

	BUILD_BUG_ON(sizeof(pmu->global_ctrl) * BITS_PER_BYTE != X86_PMC_IDX_MAX);

	if (!kvm_pmu_has_perf_global_ctrl(pmu))
		bitmap_copy(bitmap, pmu->all_valid_pmc_idx, X86_PMC_IDX_MAX);
	else if (!bitmap_and(bitmap, pmu->all_valid_pmc_idx,
			     (unsigned long *)&pmu->global_ctrl, X86_PMC_IDX_MAX))
		return;

	kvm_for_each_pmc(pmu, pmc, i, bitmap) {
		/*
		 * Ignore checks for edge detect (all events currently emulated
		 * by KVM are always rising edges), pin control (unsupported
		 * by modern CPUs), and counter mask and its invert flag (KVM
		 * doesn't emulate multiple events in a single clock cycle).
		 *
		 * Note, the uppermost nibble of AMD's mask overlaps Intel's
		 * IN_TX (bit 32) and IN_TXCP (bit 33), as well as two reserved
		 * bits (bits 35:34).  Checking the "in HLE/RTM transaction"
		 * flags is correct as the vCPU can't be in a transaction if
		 * KVM is emulating an instruction.  Checking the reserved bits
		 * might be wrong if they are defined in the future, but so
		 * could ignoring them, so do the simple thing for now.
		 */
		if (((pmc->eventsel ^ eventsel) & AMD64_RAW_EVENT_MASK_NB) ||
		    !pmc_event_is_allowed(pmc) || !cpl_is_matched(pmc))
			continue;

		kvm_pmu_incr_counter(pmc);
	}
}
EXPORT_SYMBOL_GPL(kvm_pmu_trigger_event);

static bool is_masked_filter_valid(const struct kvm_x86_pmu_event_filter *filter)
{
	u64 mask = kvm_pmu_ops.EVENTSEL_EVENT |
		   KVM_PMU_MASKED_ENTRY_UMASK_MASK |
		   KVM_PMU_MASKED_ENTRY_UMASK_MATCH |
		   KVM_PMU_MASKED_ENTRY_EXCLUDE;
	int i;

	for (i = 0; i < filter->nevents; i++) {
		if (filter->events[i] & ~mask)
			return false;
	}

	return true;
}

static void convert_to_masked_filter(struct kvm_x86_pmu_event_filter *filter)
{
	int i, j;

	for (i = 0, j = 0; i < filter->nevents; i++) {
		/*
		 * Skip events that are impossible to match against a guest
		 * event.  When filtering, only the event select + unit mask
		 * of the guest event is used.  To maintain backwards
		 * compatibility, impossible filters can't be rejected :-(
		 */
		if (filter->events[i] & ~(kvm_pmu_ops.EVENTSEL_EVENT |
					  ARCH_PERFMON_EVENTSEL_UMASK))
			continue;
		/*
		 * Convert userspace events to a common in-kernel event so
		 * only one code path is needed to support both events.  For
		 * the in-kernel events use masked events because they are
		 * flexible enough to handle both cases.  To convert to masked
		 * events all that's needed is to add an "all ones" umask_mask
		 * (unmasked filter events don't support EXCLUDE).
		 */
		filter->events[j++] = filter->events[i] |
				      (0xFFULL << KVM_PMU_MASKED_ENTRY_UMASK_MASK_SHIFT);
	}

	filter->nevents = j;
}

static int prepare_filter_lists(struct kvm_x86_pmu_event_filter *filter)
{
	int i;

	if (!(filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS))
		convert_to_masked_filter(filter);
	else if (!is_masked_filter_valid(filter))
		return -EINVAL;

	/*
	 * Sort entries by event select and includes vs. excludes so that all
	 * entries for a given event select can be processed efficiently during
	 * filtering.  The EXCLUDE flag uses a more significant bit than the
	 * event select, and so the sorted list is also effectively split into
	 * includes and excludes sub-lists.
	 */
	sort(&filter->events, filter->nevents, sizeof(filter->events[0]),
	     filter_sort_cmp, NULL);

	i = filter->nevents;
	/* Find the first EXCLUDE event (only supported for masked events). */
	if (filter->flags & KVM_PMU_EVENT_FLAG_MASKED_EVENTS) {
		for (i = 0; i < filter->nevents; i++) {
			if (filter->events[i] & KVM_PMU_MASKED_ENTRY_EXCLUDE)
				break;
		}
	}

	filter->nr_includes = i;
	filter->nr_excludes = filter->nevents - filter->nr_includes;
	filter->includes = filter->events;
	filter->excludes = filter->events + filter->nr_includes;

	return 0;
}

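/*
 * Rough userspace usage sketch (editorial, not part of this file; field names
 * follow the uapi struct kvm_pmu_event_filter and may need adjusting for a
 * given kernel headers version):
 *
 *	struct kvm_pmu_event_filter *f;
 *
 *	f = calloc(1, sizeof(*f) + sizeof(__u64));
 *	f->action = KVM_PMU_EVENT_ALLOW;
 *	f->nevents = 1;
 *	f->events[0] = 0xc0;	// allow only "instructions retired"
 *	ioctl(vm_fd, KVM_SET_PMU_EVENT_FILTER, f);
 */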
int kvm_vm_ioctl_set_pmu_event_filter(struct kvm *kvm, void __user *argp)
{
	struct kvm_pmu_event_filter __user *user_filter = argp;
	struct kvm_x86_pmu_event_filter *filter;
	struct kvm_pmu_event_filter tmp;
	struct kvm_vcpu *vcpu;
	unsigned long i;
	size_t size;
	int r;

	if (copy_from_user(&tmp, user_filter, sizeof(tmp)))
		return -EFAULT;

	if (tmp.action != KVM_PMU_EVENT_ALLOW &&
	    tmp.action != KVM_PMU_EVENT_DENY)
		return -EINVAL;

	if (tmp.flags & ~KVM_PMU_EVENT_FLAGS_VALID_MASK)
		return -EINVAL;

	if (tmp.nevents > KVM_PMU_EVENT_FILTER_MAX_EVENTS)
		return -E2BIG;

	size = struct_size(filter, events, tmp.nevents);
	filter = kzalloc(size, GFP_KERNEL_ACCOUNT);
	if (!filter)
		return -ENOMEM;

	filter->action = tmp.action;
	filter->nevents = tmp.nevents;
	filter->fixed_counter_bitmap = tmp.fixed_counter_bitmap;
	filter->flags = tmp.flags;

	r = -EFAULT;
	if (copy_from_user(filter->events, user_filter->events,
			   sizeof(filter->events[0]) * filter->nevents))
		goto cleanup;

	r = prepare_filter_lists(filter);
	if (r)
		goto cleanup;

	mutex_lock(&kvm->lock);
	filter = rcu_replace_pointer(kvm->arch.pmu_event_filter, filter,
				     mutex_is_locked(&kvm->lock));
	mutex_unlock(&kvm->lock);
	synchronize_srcu_expedited(&kvm->srcu);

	BUILD_BUG_ON(sizeof(((struct kvm_pmu *)0)->reprogram_pmi) >
		     sizeof(((struct kvm_pmu *)0)->__reprogram_pmi));

	kvm_for_each_vcpu(i, vcpu, kvm)
		atomic64_set(&vcpu_to_pmu(vcpu)->__reprogram_pmi, -1ull);

	kvm_make_all_cpus_request(kvm, KVM_REQ_PMU);

	r = 0;
cleanup:
	kfree(filter);
	return r;
}