// SPDX-License-Identifier: GPL-2.0-only
/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_host.h>
#include "irq.h"
#include "ioapic.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "kvm_emulate.h"
#include "mmu/page_track.h"
#include "x86.h"
#include "cpuid.h"
#include "pmu.h"
#include "hyperv.h"
#include "lapic.h"
#include "xen.h"
#include "smm.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/export.h>
#include <linux/moduleparam.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/cpufreq.h>
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
#include <linux/kvm_irqfd.h>
#include <linux/irqbypass.h>
#include <linux/sched/stat.h>
#include <linux/sched/isolation.h>
#include <linux/mem_encrypt.h>
#include <linux/entry-kvm.h>
#include <linux/suspend.h>
#include <linux/smp.h>

#include <trace/events/ipi.h>
#include <trace/events/kvm.h>

#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mce.h>
#include <asm/pkru.h>
#include <linux/kernel_stat.h>
#include <asm/fpu/api.h>
#include <asm/fpu/xcr.h>
#include <asm/fpu/xstate.h>
#include <asm/pvclock.h>
#include <asm/div64.h>
#include <asm/irq_remapping.h>
#include <asm/mshyperv.h>
#include <asm/hypervisor.h>
#include <asm/tlbflush.h>
#include <asm/intel_pt.h>
#include <asm/emulate_prefix.h>
#include <asm/sgx.h>
#include <clocksource/hyperv_timer.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32

struct kvm_caps kvm_caps __read_mostly = {
	.supported_mce_cap = MCG_CTL_P | MCG_SER_P,
};
EXPORT_SYMBOL_GPL(kvm_caps);

#define ERR_PTR_USR(e) ((void __user *)ERR_PTR(e))

#define emul_to_vcpu(ctxt) \
	((struct kvm_vcpu *)(ctxt)->vcpu)

/*
 * EFER defaults:
 * - enable syscall per default because it's emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static
u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif

static u64 __read_mostly cr4_reserved_bits = CR4_RESERVED_BITS;

#define KVM_EXIT_HYPERCALL_VALID_MASK (1 << KVM_HC_MAP_GPA_RANGE)

#define KVM_CAP_PMU_VALID_MASK KVM_PMU_CAP_DISABLE

#define KVM_X2APIC_API_VALID_FLAGS (KVM_X2APIC_API_USE_32BIT_IDS | \
				    KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK)

static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);
static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags);
static void store_regs(struct kvm_vcpu *vcpu);
static int sync_regs(struct kvm_vcpu *vcpu);
static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu);

static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);
static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2);

static DEFINE_MUTEX(vendor_module_lock);
struct kvm_x86_ops kvm_x86_ops __read_mostly;

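/*
 * Generate a NULL-initialized static call for every vendor hook listed in
 * asm/kvm-x86-ops.h; the X-macro include below expands KVM_X86_OP() once per
 * hook, and the vendor module (VMX or SVM) patches each call site via
 * static_call_update() when it registers its kvm_x86_ops.
 */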
#define KVM_X86_OP(func)					     \
	DEFINE_STATIC_CALL_NULL(kvm_x86_##func,			     \
				*(((struct kvm_x86_ops *)0)->func));
#define KVM_X86_OP_OPTIONAL KVM_X86_OP
#define KVM_X86_OP_OPTIONAL_RET0 KVM_X86_OP
#include <asm/kvm-x86-ops.h>
EXPORT_STATIC_CALL_GPL(kvm_x86_get_cs_db_l_bits);
EXPORT_STATIC_CALL_GPL(kvm_x86_cache_reg);

static bool __read_mostly ignore_msrs = false;
module_param(ignore_msrs, bool, 0644);

bool __read_mostly report_ignored_msrs = true;
module_param(report_ignored_msrs, bool, 0644);
EXPORT_SYMBOL_GPL(report_ignored_msrs);

unsigned int min_timer_period_us = 200;
module_param(min_timer_period_us, uint, 0644);

static bool __read_mostly kvmclock_periodic_sync = true;
module_param(kvmclock_periodic_sync, bool, 0444);

/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 __read_mostly tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, 0644);

/*
 * lapic timer advance (tscdeadline mode only) in nanoseconds.  '-1' enables
 * adaptive tuning starting from default advancement of 1000ns.  '0' disables
 * advancement entirely.  Any other value is used as-is and disables adaptive
 * tuning, i.e. allows privileged userspace to set an exact advancement time.
 */
static int __read_mostly lapic_timer_advance_ns = -1;
module_param(lapic_timer_advance_ns, int, 0644);

static bool __read_mostly vector_hashing = true;
module_param(vector_hashing, bool, 0444);

bool __read_mostly enable_vmware_backdoor = false;
module_param(enable_vmware_backdoor, bool, 0444);
EXPORT_SYMBOL_GPL(enable_vmware_backdoor);

/*
 * Flags to manipulate forced emulation behavior (any non-zero value will
 * enable forced emulation).
 */
#define KVM_FEP_CLEAR_RFLAGS_RF	BIT(1)
static int __read_mostly force_emulation_prefix;
module_param(force_emulation_prefix, int, 0644);

int __read_mostly pi_inject_timer = -1;
module_param(pi_inject_timer, bint, 0644);

/* Enable/disable PMU virtualization */
bool __read_mostly enable_pmu = true;
EXPORT_SYMBOL_GPL(enable_pmu);
module_param(enable_pmu, bool, 0444);

bool __read_mostly eager_page_split = true;
module_param(eager_page_split, bool, 0644);

/* Enable/disable SMT_RSB bug mitigation */
static bool __read_mostly mitigate_smt_rsb;
module_param(mitigate_smt_rsb, bool, 0444);

/*
 * Restoring the host value for MSRs that are only consumed when running in
 * usermode, e.g. SYSCALL MSRs and TSC_AUX, can be deferred until the CPU
 * returns to userspace, i.e. the kernel can run with the guest's value.
 */
#define KVM_MAX_NR_USER_RETURN_MSRS 16

struct kvm_user_return_msrs {
	struct user_return_notifier urn;
	bool registered;
	struct kvm_user_return_msr_values {
		u64 host;
		u64 curr;
	} values[KVM_MAX_NR_USER_RETURN_MSRS];
};
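/*
 * Typical usage (illustrative): a vendor module claims a slot with
 * kvm_add_user_return_msr() at load time, loads the guest value with
 * kvm_set_user_return_msr() before entering the guest, and the registered
 * notifier restores the host values via kvm_on_user_return() once the CPU
 * heads back to userspace.
 */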

u32 __read_mostly kvm_nr_uret_msrs;
EXPORT_SYMBOL_GPL(kvm_nr_uret_msrs);
static u32 __read_mostly kvm_uret_msrs_list[KVM_MAX_NR_USER_RETURN_MSRS];
static struct kvm_user_return_msrs __percpu *user_return_msrs;

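/*
 * Superset of xfeatures KVM may expose to the guest; the host's own XCR0
 * further bounds what is actually supported (computed outside this section).
 */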
#define KVM_SUPPORTED_XCR0	(XFEATURE_MASK_FP | XFEATURE_MASK_SSE \
				 | XFEATURE_MASK_YMM | XFEATURE_MASK_BNDREGS \
				 | XFEATURE_MASK_BNDCSR | XFEATURE_MASK_AVX512 \
				 | XFEATURE_MASK_PKRU | XFEATURE_MASK_XTILE)

u64 __read_mostly host_efer;
EXPORT_SYMBOL_GPL(host_efer);

bool __read_mostly allow_smaller_maxphyaddr = false;
EXPORT_SYMBOL_GPL(allow_smaller_maxphyaddr);

bool __read_mostly enable_apicv = true;
EXPORT_SYMBOL_GPL(enable_apicv);

u64 __read_mostly host_xss;
EXPORT_SYMBOL_GPL(host_xss);

u64 __read_mostly host_arch_capabilities;
EXPORT_SYMBOL_GPL(host_arch_capabilities);

const struct _kvm_stats_desc kvm_vm_stats_desc[] = {
	KVM_GENERIC_VM_STATS(),
	STATS_DESC_COUNTER(VM, mmu_shadow_zapped),
	STATS_DESC_COUNTER(VM, mmu_pte_write),
	STATS_DESC_COUNTER(VM, mmu_pde_zapped),
	STATS_DESC_COUNTER(VM, mmu_flooded),
	STATS_DESC_COUNTER(VM, mmu_recycled),
	STATS_DESC_COUNTER(VM, mmu_cache_miss),
	STATS_DESC_ICOUNTER(VM, mmu_unsync),
	STATS_DESC_ICOUNTER(VM, pages_4k),
	STATS_DESC_ICOUNTER(VM, pages_2m),
	STATS_DESC_ICOUNTER(VM, pages_1g),
	STATS_DESC_ICOUNTER(VM, nx_lpage_splits),
	STATS_DESC_PCOUNTER(VM, max_mmu_rmap_size),
	STATS_DESC_PCOUNTER(VM, max_mmu_page_hash_collisions)
};

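/*
 * The offsets below describe the binary stats layout exposed to userspace:
 * the header itself, then the id string (KVM_STATS_NAME_SIZE bytes), then
 * the descriptor array, then the stats data.
 */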
const struct kvm_stats_header kvm_vm_stats_header = {
	.name_size = KVM_STATS_NAME_SIZE,
	.num_desc = ARRAY_SIZE(kvm_vm_stats_desc),
	.id_offset = sizeof(struct kvm_stats_header),
	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
		       sizeof(kvm_vm_stats_desc),
};

const struct _kvm_stats_desc kvm_vcpu_stats_desc[] = {
	KVM_GENERIC_VCPU_STATS(),
	STATS_DESC_COUNTER(VCPU, pf_taken),
	STATS_DESC_COUNTER(VCPU, pf_fixed),
	STATS_DESC_COUNTER(VCPU, pf_emulate),
	STATS_DESC_COUNTER(VCPU, pf_spurious),
	STATS_DESC_COUNTER(VCPU, pf_fast),
	STATS_DESC_COUNTER(VCPU, pf_mmio_spte_created),
	STATS_DESC_COUNTER(VCPU, pf_guest),
	STATS_DESC_COUNTER(VCPU, tlb_flush),
	STATS_DESC_COUNTER(VCPU, invlpg),
	STATS_DESC_COUNTER(VCPU, exits),
	STATS_DESC_COUNTER(VCPU, io_exits),
	STATS_DESC_COUNTER(VCPU, mmio_exits),
	STATS_DESC_COUNTER(VCPU, signal_exits),
	STATS_DESC_COUNTER(VCPU, irq_window_exits),
	STATS_DESC_COUNTER(VCPU, nmi_window_exits),
	STATS_DESC_COUNTER(VCPU, l1d_flush),
	STATS_DESC_COUNTER(VCPU, halt_exits),
	STATS_DESC_COUNTER(VCPU, request_irq_exits),
	STATS_DESC_COUNTER(VCPU, irq_exits),
	STATS_DESC_COUNTER(VCPU, host_state_reload),
	STATS_DESC_COUNTER(VCPU, fpu_reload),
	STATS_DESC_COUNTER(VCPU, insn_emulation),
	STATS_DESC_COUNTER(VCPU, insn_emulation_fail),
	STATS_DESC_COUNTER(VCPU, hypercalls),
	STATS_DESC_COUNTER(VCPU, irq_injections),
	STATS_DESC_COUNTER(VCPU, nmi_injections),
	STATS_DESC_COUNTER(VCPU, req_event),
	STATS_DESC_COUNTER(VCPU, nested_run),
	STATS_DESC_COUNTER(VCPU, directed_yield_attempted),
	STATS_DESC_COUNTER(VCPU, directed_yield_successful),
	STATS_DESC_COUNTER(VCPU, preemption_reported),
	STATS_DESC_COUNTER(VCPU, preemption_other),
	STATS_DESC_IBOOLEAN(VCPU, guest_mode),
	STATS_DESC_COUNTER(VCPU, notify_window_exits),
};

const struct kvm_stats_header kvm_vcpu_stats_header = {
	.name_size = KVM_STATS_NAME_SIZE,
	.num_desc = ARRAY_SIZE(kvm_vcpu_stats_desc),
	.id_offset = sizeof(struct kvm_stats_header),
	.desc_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE,
	.data_offset = sizeof(struct kvm_stats_header) + KVM_STATS_NAME_SIZE +
		       sizeof(kvm_vcpu_stats_desc),
};

u64 __read_mostly host_xcr0;

static struct kmem_cache *x86_emulator_cache;

/*
 * Called when a get or set of an MSR hit an invalid MSR.  Return true if the
 * failed access should be ignored/silenced.
 */
static bool kvm_msr_ignored_check(u32 msr, u64 data, bool write)
{
	const char *op = write ? "wrmsr" : "rdmsr";

	if (ignore_msrs) {
		if (report_ignored_msrs)
			kvm_pr_unimpl("ignored %s: 0x%x data 0x%llx\n",
				      op, msr, data);
		/* Mask the error */
		return true;
	} else {
		kvm_debug_ratelimited("unhandled %s: 0x%x data 0x%llx\n",
				      op, msr, data);
		return false;
	}
}

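/*
 * The emulator context gets its own slab cache so that the region from 'src'
 * onward can be whitelisted for hardened usercopy, i.e. copied to/from
 * userspace directly.
 */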
static struct kmem_cache *kvm_alloc_emulator_cache(void)
{
	unsigned int useroffset = offsetof(struct x86_emulate_ctxt, src);
	unsigned int size = sizeof(struct x86_emulate_ctxt);

	return kmem_cache_create_usercopy("x86_emulator", size,
					  __alignof__(struct x86_emulate_ctxt),
					  SLAB_ACCOUNT, useroffset,
					  size - useroffset, NULL);
}

static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);

static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
	int i;

	for (i = 0; i < ASYNC_PF_PER_VCPU; i++)
		vcpu->arch.apf.gfns[i] = ~0;
}

static void kvm_on_user_return(struct user_return_notifier *urn)
{
	unsigned slot;
	struct kvm_user_return_msrs *msrs
		= container_of(urn, struct kvm_user_return_msrs, urn);
	struct kvm_user_return_msr_values *values;
	unsigned long flags;

	/*
	 * Disabling irqs at this point since the following code could be
	 * interrupted and executed through kvm_arch_hardware_disable()
	 */
	local_irq_save(flags);
	if (msrs->registered) {
		msrs->registered = false;
		user_return_notifier_unregister(urn);
	}
	local_irq_restore(flags);
	for (slot = 0; slot < kvm_nr_uret_msrs; ++slot) {
		values = &msrs->values[slot];
		if (values->host != values->curr) {
			wrmsrl(kvm_uret_msrs_list[slot], values->host);
			values->curr = values->host;
		}
	}
}

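/*
 * Probe an MSR with a fault-safe read then write-back of the same value,
 * with preemption disabled so both accesses hit the same CPU; a non-zero
 * return means the MSR isn't fully functional on this CPU.
 */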
static int kvm_probe_user_return_msr(u32 msr)
{
	u64 val;
	int ret;

	preempt_disable();
	ret = rdmsrl_safe(msr, &val);
	if (ret)
		goto out;
	ret = wrmsrl_safe(msr, val);
out:
	preempt_enable();
	return ret;
}

int kvm_add_user_return_msr(u32 msr)
{
	BUG_ON(kvm_nr_uret_msrs >= KVM_MAX_NR_USER_RETURN_MSRS);

	if (kvm_probe_user_return_msr(msr))
		return -1;

	kvm_uret_msrs_list[kvm_nr_uret_msrs] = msr;
	return kvm_nr_uret_msrs++;
}
EXPORT_SYMBOL_GPL(kvm_add_user_return_msr);

int kvm_find_user_return_msr(u32 msr)
{
	int i;

	for (i = 0; i < kvm_nr_uret_msrs; ++i) {
		if (kvm_uret_msrs_list[i] == msr)
			return i;
	}
	return -1;
}
EXPORT_SYMBOL_GPL(kvm_find_user_return_msr);

static void kvm_user_return_msr_cpu_online(void)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
	u64 value;
	int i;

	for (i = 0; i < kvm_nr_uret_msrs; ++i) {
		rdmsrl_safe(kvm_uret_msrs_list[i], &value);
		msrs->values[i].host = value;
		msrs->values[i].curr = value;
	}
}

int kvm_set_user_return_msr(unsigned slot, u64 value, u64 mask)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);
	int err;

	value = (value & mask) | (msrs->values[slot].host & ~mask);
	if (value == msrs->values[slot].curr)
		return 0;
	err = wrmsrl_safe(kvm_uret_msrs_list[slot], value);
	if (err)
		return 1;

	msrs->values[slot].curr = value;
	if (!msrs->registered) {
		msrs->urn.on_user_return = kvm_on_user_return;
		user_return_notifier_register(&msrs->urn);
		msrs->registered = true;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_user_return_msr);

static void drop_user_return_notifiers(void)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_user_return_msrs *msrs = per_cpu_ptr(user_return_msrs, cpu);

	if (msrs->registered)
		kvm_on_user_return(&msrs->urn);
}

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.apic_base;
}

enum lapic_mode kvm_get_apic_mode(struct kvm_vcpu *vcpu)
{
	return kvm_apic_mode(kvm_get_apic_base(vcpu));
}
EXPORT_SYMBOL_GPL(kvm_get_apic_mode);

int kvm_set_apic_base(struct kvm_vcpu *vcpu, struct msr_data *msr_info)
{
	enum lapic_mode old_mode = kvm_get_apic_mode(vcpu);
	enum lapic_mode new_mode = kvm_apic_mode(msr_info->data);
	u64 reserved_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu) | 0x2ff |
		(guest_cpuid_has(vcpu, X86_FEATURE_X2APIC) ? 0 : X2APIC_ENABLE);

	if ((msr_info->data & reserved_bits) != 0 || new_mode == LAPIC_MODE_INVALID)
		return 1;
	if (!msr_info->host_initiated) {
		if (old_mode == LAPIC_MODE_X2APIC && new_mode == LAPIC_MODE_XAPIC)
			return 1;
		if (old_mode == LAPIC_MODE_DISABLED && new_mode == LAPIC_MODE_X2APIC)
			return 1;
	}

	kvm_lapic_set_base(vcpu, msr_info->data);
	kvm_recalculate_apic_map(vcpu->kvm);
	return 0;
}

/*
 * Handle a fault on a hardware virtualization (VMX or SVM) instruction.
 *
 * Hardware virtualization extension instructions may fault if a reboot turns
 * off virtualization while processes are running.  Usually after catching the
 * fault we just panic; during reboot instead the instruction is ignored.
 */
noinstr void kvm_spurious_fault(void)
{
	/* Fault while not rebooting.  We want the trace. */
	BUG_ON(!kvm_rebooting);
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);

#define EXCPT_BENIGN		0
#define EXCPT_CONTRIBUTORY	1
#define EXCPT_PF		2

static int exception_class(int vector)
{
	switch (vector) {
	case PF_VECTOR:
		return EXCPT_PF;
	case DE_VECTOR:
	case TS_VECTOR:
	case NP_VECTOR:
	case SS_VECTOR:
	case GP_VECTOR:
		return EXCPT_CONTRIBUTORY;
	default:
		break;
	}
	return EXCPT_BENIGN;
}

#define EXCPT_FAULT		0
#define EXCPT_TRAP		1
#define EXCPT_ABORT		2
#define EXCPT_INTERRUPT		3
#define EXCPT_DB		4

static int exception_type(int vector)
{
	unsigned int mask;

	if (WARN_ON(vector > 31 || vector == NMI_VECTOR))
		return EXCPT_INTERRUPT;

	mask = 1 << vector;

	/*
	 * #DBs can be trap-like or fault-like, the caller must check other CPU
	 * state, e.g. DR6, to determine whether a #DB is a trap or fault.
	 */
	if (mask & (1 << DB_VECTOR))
		return EXCPT_DB;

	if (mask & ((1 << BP_VECTOR) | (1 << OF_VECTOR)))
		return EXCPT_TRAP;

	if (mask & ((1 << DF_VECTOR) | (1 << MC_VECTOR)))
		return EXCPT_ABORT;

	/* Reserved exceptions will result in fault */
	return EXCPT_FAULT;
}

void kvm_deliver_exception_payload(struct kvm_vcpu *vcpu,
				   struct kvm_queued_exception *ex)
{
	if (!ex->has_payload)
		return;

	switch (ex->vector) {
	case DB_VECTOR:
		/*
		 * "Certain debug exceptions may clear bits 0-3.  The
		 * remaining contents of the DR6 register are never
		 * cleared by the processor".
		 */
		vcpu->arch.dr6 &= ~DR_TRAP_BITS;
		/*
		 * In order to reflect the #DB exception payload in guest
		 * dr6, three components need to be considered: active low
		 * bit, FIXED_1 bits and active high bits (e.g. DR6_BD,
		 * DR6_BS and DR6_BT).
		 * DR6_ACTIVE_LOW contains the FIXED_1 and active low bits.
		 * In the target guest dr6:
		 * FIXED_1 bits should always be set.
		 * Active low bits should be cleared if 1-setting in payload.
		 * Active high bits should be set if 1-setting in payload.
		 *
		 * Note, the payload is compatible with the pending debug
		 * exceptions/exit qualification under VMX, whose active_low
		 * bits are active high in the payload.
		 * So they need to be flipped for DR6.
		 */
		vcpu->arch.dr6 |= DR6_ACTIVE_LOW;
		vcpu->arch.dr6 |= ex->payload;
		vcpu->arch.dr6 ^= ex->payload & DR6_ACTIVE_LOW;
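		/*
		 * Worked example: a payload with only DR6_BS (active high)
		 * set simply ORs in BS, whereas a payload with the active-low
		 * RTM bit set is first ORed in and then XORed back out,
		 * leaving the architecturally required 0 in DR6.
		 */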

		/*
		 * The #DB payload is defined as compatible with the 'pending
		 * debug exceptions' field under VMX, not DR6.  While bit 12 is
		 * defined in the 'pending debug exceptions' field (enabled
		 * breakpoint), it is reserved and must be zero in DR6.
		 */
		vcpu->arch.dr6 &= ~BIT(12);
		break;
	case PF_VECTOR:
		vcpu->arch.cr2 = ex->payload;
		break;
	}

	ex->has_payload = false;
	ex->payload = 0;
}
EXPORT_SYMBOL_GPL(kvm_deliver_exception_payload);

static void kvm_queue_exception_vmexit(struct kvm_vcpu *vcpu, unsigned int vector,
				       bool has_error_code, u32 error_code,
				       bool has_payload, unsigned long payload)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception_vmexit;

	ex->vector = vector;
	ex->injected = false;
	ex->pending = true;
	ex->has_error_code = has_error_code;
	ex->error_code = error_code;
	ex->has_payload = has_payload;
	ex->payload = payload;
}

/* Forcibly leave the nested mode in cases like a vCPU reset */
static void kvm_leave_nested(struct kvm_vcpu *vcpu)
{
	kvm_x86_ops.nested_ops->leave_nested(vcpu);
}

static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
				   unsigned nr, bool has_error, u32 error_code,
				   bool has_payload, unsigned long payload, bool reinject)
{
	u32 prev_nr;
	int class1, class2;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	/*
	 * If the exception is destined for L2 and isn't being reinjected,
	 * morph it to a VM-Exit if L1 wants to intercept the exception.  A
	 * previously injected exception is not checked because it was checked
	 * when it was originally queued, and re-checking is incorrect if _L1_
	 * injected the exception, in which case it's exempt from interception.
	 */
	if (!reinject && is_guest_mode(vcpu) &&
	    kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, nr, error_code)) {
		kvm_queue_exception_vmexit(vcpu, nr, has_error, error_code,
					   has_payload, payload);
		return;
	}

	if (!vcpu->arch.exception.pending && !vcpu->arch.exception.injected) {
	queue:
		if (reinject) {
			/*
			 * On VM-Entry, an exception can be pending if and only
			 * if event injection was blocked by nested_run_pending.
			 * In that case, however, vcpu_enter_guest() requests an
			 * immediate exit, and the guest shouldn't proceed far
			 * enough to need reinjection.
			 */
			WARN_ON_ONCE(kvm_is_exception_pending(vcpu));
			vcpu->arch.exception.injected = true;
			if (WARN_ON_ONCE(has_payload)) {
				/*
				 * A reinjected event has already
				 * delivered its payload.
				 */
				has_payload = false;
				payload = 0;
			}
		} else {
			vcpu->arch.exception.pending = true;
			vcpu->arch.exception.injected = false;
		}
		vcpu->arch.exception.has_error_code = has_error;
		vcpu->arch.exception.vector = nr;
		vcpu->arch.exception.error_code = error_code;
		vcpu->arch.exception.has_payload = has_payload;
		vcpu->arch.exception.payload = payload;
		if (!is_guest_mode(vcpu))
			kvm_deliver_exception_payload(vcpu,
						      &vcpu->arch.exception);
		return;
	}

	/* Check the new exception against the one already queued. */
	prev_nr = vcpu->arch.exception.vector;
	if (prev_nr == DF_VECTOR) {
		/* triple fault -> shutdown */
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}
	class1 = exception_class(prev_nr);
	class2 = exception_class(nr);
	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY) ||
	    (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
		/*
		 * Synthesize #DF.  Clear the previously injected or pending
		 * exception so as not to incorrectly trigger shutdown.
		 */
		vcpu->arch.exception.injected = false;
		vcpu->arch.exception.pending = false;

		kvm_queue_exception_e(vcpu, DF_VECTOR, 0);
	} else {
		/*
		 * Replace the previous exception with the new one in the hope
		 * that instruction re-execution will regenerate the lost
		 * exception.
		 */
		goto queue;
	}
}

void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);

void kvm_queue_exception_p(struct kvm_vcpu *vcpu, unsigned nr,
			   unsigned long payload)
{
	kvm_multiple_exception(vcpu, nr, false, 0, true, payload, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_p);

static void kvm_queue_exception_e_p(struct kvm_vcpu *vcpu, unsigned nr,
				    u32 error_code, unsigned long payload)
{
	kvm_multiple_exception(vcpu, nr, true, error_code,
			       true, payload, false);
}

int kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (err)
		kvm_inject_gp(vcpu, 0);
	else
		return kvm_skip_emulated_instruction(vcpu);

	return 1;
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);

static int complete_emulated_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (err) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE | EMULTYPE_SKIP |
				       EMULTYPE_COMPLETE_USER_EXIT);
}

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	++vcpu->stat.pf_guest;

	/*
	 * Async #PF in L2 is always forwarded to L1 as a VM-Exit regardless of
	 * whether or not L1 wants to intercept "regular" #PF.
	 */
	if (is_guest_mode(vcpu) && fault->async_page_fault)
		kvm_queue_exception_vmexit(vcpu, PF_VECTOR,
					   true, fault->error_code,
					   true, fault->address);
	else
		kvm_queue_exception_e_p(vcpu, PF_VECTOR, fault->error_code,
					fault->address);
}

void kvm_inject_emulated_page_fault(struct kvm_vcpu *vcpu,
				    struct x86_exception *fault)
{
	struct kvm_mmu *fault_mmu;

	WARN_ON_ONCE(fault->vector != PF_VECTOR);

	fault_mmu = fault->nested_page_fault ? vcpu->arch.mmu :
					       vcpu->arch.walk_mmu;

	/*
	 * Invalidate the TLB entry for the faulting address, if it exists,
	 * else the access will fault indefinitely (and to emulate hardware).
	 */
	if ((fault->error_code & PFERR_PRESENT_MASK) &&
	    !(fault->error_code & PFERR_RSVD_MASK))
		kvm_mmu_invalidate_addr(vcpu, fault_mmu, fault->address,
					KVM_MMU_ROOT_CURRENT);

	fault_mmu->inject_page_fault(vcpu, fault);
}
EXPORT_SYMBOL_GPL(kvm_inject_emulated_page_fault);

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	atomic_inc(&vcpu->arch.nmi_queued);
	kvm_make_request(KVM_REQ_NMI, vcpu);
}

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);

/*
 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 * a #GP and return false.
 */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (static_call(kvm_x86_get_cpl)(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}

bool kvm_require_dr(struct kvm_vcpu *vcpu, int dr)
{
	if ((dr != 4 && dr != 5) || !kvm_is_cr4_bit_set(vcpu, X86_CR4_DE))
		return true;

	kvm_queue_exception(vcpu, UD_VECTOR);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_dr);

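/*
 * PAE PDPTEs reserve bits 1-2 and 5-8 in addition to the physical-address
 * bits above the CPU's MAXPHYADDR (per the SDM's PDPTE format for PAE
 * paging).
 */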
static inline u64 pdptr_rsvd_bits(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.reserved_gpa_bits | rsvd_bits(5, 8) | rsvd_bits(1, 2);
}

/*
 * Load the pae pdptrs.  Return 1 if they are all valid, 0 otherwise.
 */
int load_pdptrs(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	gpa_t real_gpa;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];

	/*
	 * If the MMU is nested, CR3 holds an L2 GPA and needs to be translated
	 * to an L1 GPA.
	 */
	real_gpa = kvm_translate_gpa(vcpu, mmu, gfn_to_gpa(pdpt_gfn),
				     PFERR_USER_MASK | PFERR_WRITE_MASK, NULL);
	if (real_gpa == INVALID_GPA)
		return 0;

	/* Note the offset, PDPTRs are 32 byte aligned when using PAE paging. */
	ret = kvm_vcpu_read_guest_page(vcpu, gpa_to_gfn(real_gpa), pdpte,
				       cr3 & GENMASK(11, 5), sizeof(pdpte));
	if (ret < 0)
		return 0;

	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if ((pdpte[i] & PT_PRESENT_MASK) &&
		    (pdpte[i] & pdptr_rsvd_bits(vcpu))) {
			return 0;
		}
	}

	/*
	 * Marking VCPU_EXREG_PDPTR dirty doesn't work for !tdp_enabled.
	 * Shadow page roots need to be reconstructed instead.
	 */
	if (!tdp_enabled && memcmp(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs)))
		kvm_mmu_free_roots(vcpu->kvm, mmu, KVM_MMU_ROOT_CURRENT);

	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
	vcpu->arch.pdptrs_from_userspace = false;

	return 1;
}
EXPORT_SYMBOL_GPL(load_pdptrs);

static bool kvm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
#ifdef CONFIG_X86_64
	if (cr0 & 0xffffffff00000000UL)
		return false;
#endif

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
		return false;

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
		return false;

	return static_call(kvm_x86_is_valid_cr0)(vcpu, cr0);
}

void kvm_post_set_cr0(struct kvm_vcpu *vcpu, unsigned long old_cr0, unsigned long cr0)
{
	/*
	 * CR0.WP is incorporated into the MMU role, but only for non-nested,
	 * indirect shadow MMUs.  If paging is disabled, no updates are needed
	 * as there are no permission bits to emulate.  If TDP is enabled, the
	 * MMU's metadata needs to be updated, e.g. so that emulating guest
	 * translations does the right thing, but there's no need to unload the
	 * root as CR0.WP doesn't affect SPTEs.
	 */
	if ((cr0 ^ old_cr0) == X86_CR0_WP) {
		if (!(cr0 & X86_CR0_PG))
			return;

		if (tdp_enabled) {
			kvm_init_mmu(vcpu);
			return;
		}
	}

	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
		kvm_clear_async_pf_completion_queue(vcpu);
		kvm_async_pf_hash_reset(vcpu);

		/*
		 * Clearing CR0.PG is defined to flush the TLB from the guest's
		 * perspective.
		 */
		if (!(cr0 & X86_CR0_PG))
			kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
	}

	if ((cr0 ^ old_cr0) & KVM_MMU_CR0_ROLE_BITS)
		kvm_mmu_reset_context(vcpu);

	if (((cr0 ^ old_cr0) & X86_CR0_CD) &&
	    kvm_mmu_honors_guest_mtrrs(vcpu->kvm) &&
	    !kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
		kvm_zap_gfn_range(vcpu->kvm, 0, ~0ULL);
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr0);

int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	unsigned long old_cr0 = kvm_read_cr0(vcpu);

	if (!kvm_is_valid_cr0(vcpu, cr0))
		return 1;

	cr0 |= X86_CR0_ET;

	/* Writes to CR0 reserved bits are ignored, even on Intel. */
	cr0 &= ~CR0_RESERVED_BITS;

#ifdef CONFIG_X86_64
	if ((vcpu->arch.efer & EFER_LME) && !is_paging(vcpu) &&
	    (cr0 & X86_CR0_PG)) {
		int cs_db, cs_l;

		if (!is_pae(vcpu))
			return 1;
		static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l);
		if (cs_l)
			return 1;
	}
#endif
	if (!(vcpu->arch.efer & EFER_LME) && (cr0 & X86_CR0_PG) &&
	    is_pae(vcpu) && ((cr0 ^ old_cr0) & X86_CR0_PDPTR_BITS) &&
	    !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
		return 1;

	if (!(cr0 & X86_CR0_PG) &&
	    (is_64_bit_mode(vcpu) || kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)))
		return 1;

	static_call(kvm_x86_set_cr0)(vcpu, cr0);

	kvm_post_set_cr0(vcpu, old_cr0, cr0);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

void kvm_load_guest_xsave_state(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.guest_state_protected)
		return;

	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {

		if (vcpu->arch.xcr0 != host_xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, vcpu->arch.xcr0);

		if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
		    vcpu->arch.ia32_xss != host_xss)
			wrmsrl(MSR_IA32_XSS, vcpu->arch.ia32_xss);
	}

	if (cpu_feature_enabled(X86_FEATURE_PKU) &&
	    vcpu->arch.pkru != vcpu->arch.host_pkru &&
	    ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
	     kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE)))
		write_pkru(vcpu->arch.pkru);
}
EXPORT_SYMBOL_GPL(kvm_load_guest_xsave_state);

void kvm_load_host_xsave_state(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.guest_state_protected)
		return;

	if (cpu_feature_enabled(X86_FEATURE_PKU) &&
	    ((vcpu->arch.xcr0 & XFEATURE_MASK_PKRU) ||
	     kvm_is_cr4_bit_set(vcpu, X86_CR4_PKE))) {
		vcpu->arch.pkru = rdpkru();
		if (vcpu->arch.pkru != vcpu->arch.host_pkru)
			write_pkru(vcpu->arch.host_pkru);
	}

	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_OSXSAVE)) {

		if (vcpu->arch.xcr0 != host_xcr0)
			xsetbv(XCR_XFEATURE_ENABLED_MASK, host_xcr0);

		if (guest_can_use(vcpu, X86_FEATURE_XSAVES) &&
		    vcpu->arch.ia32_xss != host_xss)
			wrmsrl(MSR_IA32_XSS, host_xss);
	}
}
EXPORT_SYMBOL_GPL(kvm_load_host_xsave_state);

#ifdef CONFIG_X86_64
static inline u64 kvm_guest_supported_xfd(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.guest_supported_xcr0 & XFEATURE_MASK_USER_DYNAMIC;
}
#endif

static int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
	u64 xcr0 = xcr;
	u64 old_xcr0 = vcpu->arch.xcr0;
	u64 valid_bits;

	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now */
	if (index != XCR_XFEATURE_ENABLED_MASK)
		return 1;
	if (!(xcr0 & XFEATURE_MASK_FP))
		return 1;
	if ((xcr0 & XFEATURE_MASK_YMM) && !(xcr0 & XFEATURE_MASK_SSE))
		return 1;

	/*
	 * Do not allow the guest to set bits that we do not support
	 * saving.  However, xcr0 bit 0 is always set, even if the
	 * emulated CPU does not support XSAVE (see kvm_vcpu_reset()).
	 */
	valid_bits = vcpu->arch.guest_supported_xcr0 | XFEATURE_MASK_FP;
	if (xcr0 & ~valid_bits)
		return 1;

	if ((!(xcr0 & XFEATURE_MASK_BNDREGS)) !=
	    (!(xcr0 & XFEATURE_MASK_BNDCSR)))
		return 1;

	if (xcr0 & XFEATURE_MASK_AVX512) {
		if (!(xcr0 & XFEATURE_MASK_YMM))
			return 1;
		if ((xcr0 & XFEATURE_MASK_AVX512) != XFEATURE_MASK_AVX512)
			return 1;
	}

	if ((xcr0 & XFEATURE_MASK_XTILE) &&
	    ((xcr0 & XFEATURE_MASK_XTILE) != XFEATURE_MASK_XTILE))
		return 1;

	vcpu->arch.xcr0 = xcr0;

	if ((xcr0 ^ old_xcr0) & XFEATURE_MASK_EXTEND)
		kvm_update_cpuid_runtime(vcpu);
	return 0;
}

int kvm_emulate_xsetbv(struct kvm_vcpu *vcpu)
{
	/* Note, #UD due to CR4.OSXSAVE=0 has priority over the intercept. */
	if (static_call(kvm_x86_get_cpl)(vcpu) != 0 ||
	    __kvm_set_xcr(vcpu, kvm_rcx_read(vcpu), kvm_read_edx_eax(vcpu))) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_xsetbv);

bool __kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	if (cr4 & cr4_reserved_bits)
		return false;

	if (cr4 & vcpu->arch.cr4_guest_rsvd_bits)
		return false;

	return true;
}
EXPORT_SYMBOL_GPL(__kvm_is_valid_cr4);

static bool kvm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	return __kvm_is_valid_cr4(vcpu, cr4) &&
	       static_call(kvm_x86_is_valid_cr4)(vcpu, cr4);
}

void kvm_post_set_cr4(struct kvm_vcpu *vcpu, unsigned long old_cr4, unsigned long cr4)
{
	if ((cr4 ^ old_cr4) & KVM_MMU_CR4_ROLE_BITS)
		kvm_mmu_reset_context(vcpu);

	/*
	 * If CR4.PCIDE is changed 0 -> 1, there is no need to flush the TLB
	 * according to the SDM; however, stale prev_roots could be reused
	 * incorrectly in the future after a MOV to CR3 with NOFLUSH=1, so we
	 * free them all.  This is *not* a superset of KVM_REQ_TLB_FLUSH_GUEST
	 * or KVM_REQ_TLB_FLUSH_CURRENT, because the hardware TLB is not
	 * flushed, so fall through.
	 */
	if (!tdp_enabled &&
	    (cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE))
		kvm_mmu_unload(vcpu);

	/*
	 * The TLB has to be flushed for all PCIDs if any of the following
	 * (architecturally required) changes happen:
	 * - CR4.PCIDE is changed from 1 to 0
	 * - CR4.PGE is toggled
	 *
	 * This is a superset of KVM_REQ_TLB_FLUSH_CURRENT.
	 */
	if (((cr4 ^ old_cr4) & X86_CR4_PGE) ||
	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);

	/*
	 * The TLB has to be flushed for the current PCID if any of the
	 * following (architecturally required) changes happen:
	 * - CR4.SMEP is changed from 0 to 1
	 * - CR4.PAE is toggled
	 */
	else if (((cr4 ^ old_cr4) & X86_CR4_PAE) ||
		 ((cr4 & X86_CR4_SMEP) && !(old_cr4 & X86_CR4_SMEP)))
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_post_set_cr4);

int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);

	if (!kvm_is_valid_cr4(vcpu, cr4))
		return 1;

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE))
			return 1;
		if ((cr4 ^ old_cr4) & X86_CR4_LA57)
			return 1;
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
		   && ((cr4 ^ old_cr4) & X86_CR4_PDPTR_BITS)
		   && !load_pdptrs(vcpu, kvm_read_cr3(vcpu)))
		return 1;

	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
		/* PCID can not be enabled when cr3[11:0] != 000H or EFER.LMA = 0 */
		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
			return 1;
	}

	static_call(kvm_x86_set_cr4)(vcpu, cr4);

	kvm_post_set_cr4(vcpu, old_cr4, cr4);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);

static void kvm_invalidate_pcid(struct kvm_vcpu *vcpu, unsigned long pcid)
{
	struct kvm_mmu *mmu = vcpu->arch.mmu;
	unsigned long roots_to_free = 0;
	int i;

	/*
	 * MOV CR3 and INVPCID are usually not intercepted when using TDP, but
	 * this is reachable when running EPT=1 and unrestricted_guest=0, and
	 * also via the emulator.  KVM's TDP page tables are not in the scope of
	 * the invalidation, but the guest's TLB entries need to be flushed as
	 * the CPU may have cached entries in its TLB for the target PCID.
	 */
	if (unlikely(tdp_enabled)) {
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return;
	}

	/*
	 * If neither the current CR3 nor any of the prev_roots use the given
	 * PCID, then nothing needs to be done here because a resync will
	 * happen anyway before switching to any other CR3.
	 */
	if (kvm_get_active_pcid(vcpu) == pcid) {
		kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
		kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
	}

	/*
	 * If PCID is disabled, there is no need to free prev_roots even if the
	 * PCIDs for them are also 0, because MOV to CR3 always flushes the TLB
	 * with PCIDE=0.
	 */
	if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE))
		return;

	for (i = 0; i < KVM_MMU_NUM_PREV_ROOTS; i++)
		if (kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd) == pcid)
			roots_to_free |= KVM_MMU_ROOT_PREVIOUS(i);

	kvm_mmu_free_roots(vcpu->kvm, mmu, roots_to_free);
}

int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	bool skip_tlb_flush = false;
	unsigned long pcid = 0;
#ifdef CONFIG_X86_64
	if (kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE)) {
		skip_tlb_flush = cr3 & X86_CR3_PCID_NOFLUSH;
		cr3 &= ~X86_CR3_PCID_NOFLUSH;
		pcid = cr3 & X86_CR3_PCID_MASK;
	}
#endif

	/* PDPTRs are always reloaded for PAE paging. */
	if (cr3 == kvm_read_cr3(vcpu) && !is_pae_paging(vcpu))
		goto handle_tlb_flush;

	/*
	 * Do not condition the GPA check on long mode, this helper is used to
	 * stuff CR3, e.g. for RSM emulation, and there is no guarantee that
	 * the current vCPU mode is accurate.
	 */
	if (!kvm_vcpu_is_legal_cr3(vcpu, cr3))
		return 1;

	if (is_pae_paging(vcpu) && !load_pdptrs(vcpu, cr3))
		return 1;

	if (cr3 != kvm_read_cr3(vcpu))
		kvm_mmu_new_pgd(vcpu, cr3);

	vcpu->arch.cr3 = cr3;
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
	/* Do not call post_set_cr3, we do not get here for confidential guests. */

handle_tlb_flush:
	/*
	 * A load of CR3 that flushes the TLB flushes only the current PCID,
	 * even if PCID is disabled, in which case PCID=0 is flushed.  It's a
	 * moot point in the end because _disabling_ PCID will flush all PCIDs,
	 * and it's impossible to use a non-zero PCID when PCID is disabled,
	 * i.e. only PCID=0 can be relevant.
	 */
	if (!skip_tlb_flush)
		kvm_invalidate_pcid(vcpu, pcid);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS)
		return 1;
	if (lapic_in_kernel(vcpu))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (lapic_in_kernel(vcpu))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

static void kvm_update_dr0123(struct kvm_vcpu *vcpu)
{
	int i;

	if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)) {
		for (i = 0; i < KVM_NR_DB_REGS; i++)
			vcpu->arch.eff_db[i] = vcpu->arch.db[i];
	}
}

void kvm_update_dr7(struct kvm_vcpu *vcpu)
{
	unsigned long dr7;

	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
		dr7 = vcpu->arch.guest_debug_dr7;
	else
		dr7 = vcpu->arch.dr7;
	static_call(kvm_x86_set_dr7)(vcpu, dr7);
	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_BP_ENABLED;
	if (dr7 & DR7_BP_EN_MASK)
		vcpu->arch.switch_db_regs |= KVM_DEBUGREG_BP_ENABLED;
}
EXPORT_SYMBOL_GPL(kvm_update_dr7);

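/*
 * DR6 bits 16 (RTM) and 11 (bus lock detect) are active-low flags that must
 * read as 1 when the corresponding feature isn't exposed to the guest.
 */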
static u64 kvm_dr6_fixed(struct kvm_vcpu *vcpu)
{
	u64 fixed = DR6_FIXED_1;

	if (!guest_cpuid_has(vcpu, X86_FEATURE_RTM))
		fixed |= DR6_RTM;

	if (!guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT))
		fixed |= DR6_BUS_LOCK;
	return fixed;
}

int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	size_t size = ARRAY_SIZE(vcpu->arch.db);

	switch (dr) {
	case 0 ... 3:
		vcpu->arch.db[array_index_nospec(dr, size)] = val;
		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
			vcpu->arch.eff_db[dr] = val;
		break;
	case 4:
	case 6:
		if (!kvm_dr6_valid(val))
			return 1; /* #GP */
		vcpu->arch.dr6 = (val & DR6_VOLATILE) | kvm_dr6_fixed(vcpu);
		break;
	case 5:
	default: /* 7 */
		if (!kvm_dr7_valid(val))
			return 1; /* #GP */
		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
		kvm_update_dr7(vcpu);
		break;
	}

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_dr);

unsigned long kvm_get_dr(struct kvm_vcpu *vcpu, int dr)
{
	size_t size = ARRAY_SIZE(vcpu->arch.db);

	switch (dr) {
	case 0 ... 3:
		return vcpu->arch.db[array_index_nospec(dr, size)];
	case 4:
	case 6:
		return vcpu->arch.dr6;
	case 5:
	default: /* 7 */
		return vcpu->arch.dr7;
	}
}
EXPORT_SYMBOL_GPL(kvm_get_dr);

int kvm_emulate_rdpmc(struct kvm_vcpu *vcpu)
{
	u32 ecx = kvm_rcx_read(vcpu);
	u64 data;

	if (kvm_pmu_rdpmc(vcpu, ecx, &data)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	kvm_rax_write(vcpu, (u32)data);
	kvm_rdx_write(vcpu, data >> 32);
	return kvm_skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_emulate_rdpmc);

/*
 * The three MSR lists (msrs_to_save, emulated_msrs, msr_based_features) track
 * the set of MSRs that KVM exposes to userspace through KVM_GET_MSRS,
 * KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.  msrs_to_save holds MSRs that
 * require host support, i.e. should be probed via RDMSR.  emulated_msrs holds
 * MSRs that KVM emulates without strictly requiring host support.
 * msr_based_features holds MSRs that enumerate features, i.e. are effectively
 * CPUID leafs.  Note, msr_based_features isn't mutually exclusive with
 * msrs_to_save and emulated_msrs.
 */

static const u32 msrs_to_save_base[] = {
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA,
	MSR_IA32_FEAT_CTL, MSR_IA32_BNDCFGS, MSR_TSC_AUX,
	MSR_IA32_SPEC_CTRL, MSR_IA32_TSX_CTRL,
	MSR_IA32_RTIT_CTL, MSR_IA32_RTIT_STATUS, MSR_IA32_RTIT_CR3_MATCH,
	MSR_IA32_RTIT_OUTPUT_BASE, MSR_IA32_RTIT_OUTPUT_MASK,
	MSR_IA32_RTIT_ADDR0_A, MSR_IA32_RTIT_ADDR0_B,
	MSR_IA32_RTIT_ADDR1_A, MSR_IA32_RTIT_ADDR1_B,
	MSR_IA32_RTIT_ADDR2_A, MSR_IA32_RTIT_ADDR2_B,
	MSR_IA32_RTIT_ADDR3_A, MSR_IA32_RTIT_ADDR3_B,
	MSR_IA32_UMWAIT_CONTROL,

	MSR_IA32_XFD, MSR_IA32_XFD_ERR,
};

static const u32 msrs_to_save_pmu[] = {
	MSR_ARCH_PERFMON_FIXED_CTR0, MSR_ARCH_PERFMON_FIXED_CTR1,
	MSR_ARCH_PERFMON_FIXED_CTR0 + 2,
	MSR_CORE_PERF_FIXED_CTR_CTRL, MSR_CORE_PERF_GLOBAL_STATUS,
	MSR_CORE_PERF_GLOBAL_CTRL, MSR_CORE_PERF_GLOBAL_OVF_CTRL,
	MSR_IA32_PEBS_ENABLE, MSR_IA32_DS_AREA, MSR_PEBS_DATA_CFG,

	/* This part of MSRs should match KVM_INTEL_PMC_MAX_GENERIC. */
	MSR_ARCH_PERFMON_PERFCTR0, MSR_ARCH_PERFMON_PERFCTR1,
	MSR_ARCH_PERFMON_PERFCTR0 + 2, MSR_ARCH_PERFMON_PERFCTR0 + 3,
	MSR_ARCH_PERFMON_PERFCTR0 + 4, MSR_ARCH_PERFMON_PERFCTR0 + 5,
	MSR_ARCH_PERFMON_PERFCTR0 + 6, MSR_ARCH_PERFMON_PERFCTR0 + 7,
	MSR_ARCH_PERFMON_EVENTSEL0, MSR_ARCH_PERFMON_EVENTSEL1,
	MSR_ARCH_PERFMON_EVENTSEL0 + 2, MSR_ARCH_PERFMON_EVENTSEL0 + 3,
	MSR_ARCH_PERFMON_EVENTSEL0 + 4, MSR_ARCH_PERFMON_EVENTSEL0 + 5,
	MSR_ARCH_PERFMON_EVENTSEL0 + 6, MSR_ARCH_PERFMON_EVENTSEL0 + 7,

	MSR_K7_EVNTSEL0, MSR_K7_EVNTSEL1, MSR_K7_EVNTSEL2, MSR_K7_EVNTSEL3,
	MSR_K7_PERFCTR0, MSR_K7_PERFCTR1, MSR_K7_PERFCTR2, MSR_K7_PERFCTR3,

	/* This part of MSRs should match KVM_AMD_PMC_MAX_GENERIC. */
	MSR_F15H_PERF_CTL0, MSR_F15H_PERF_CTL1, MSR_F15H_PERF_CTL2,
	MSR_F15H_PERF_CTL3, MSR_F15H_PERF_CTL4, MSR_F15H_PERF_CTL5,
	MSR_F15H_PERF_CTR0, MSR_F15H_PERF_CTR1, MSR_F15H_PERF_CTR2,
	MSR_F15H_PERF_CTR3, MSR_F15H_PERF_CTR4, MSR_F15H_PERF_CTR5,

	MSR_AMD64_PERF_CNTR_GLOBAL_CTL,
	MSR_AMD64_PERF_CNTR_GLOBAL_STATUS,
	MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR,
};

static u32 msrs_to_save[ARRAY_SIZE(msrs_to_save_base) +
			ARRAY_SIZE(msrs_to_save_pmu)];
static unsigned num_msrs_to_save;

static const u32 emulated_msrs_all[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,

#ifdef CONFIG_KVM_HYPERV
	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
	HV_X64_MSR_TIME_REF_COUNT, HV_X64_MSR_REFERENCE_TSC,
	HV_X64_MSR_TSC_FREQUENCY, HV_X64_MSR_APIC_FREQUENCY,
	HV_X64_MSR_CRASH_P0, HV_X64_MSR_CRASH_P1, HV_X64_MSR_CRASH_P2,
	HV_X64_MSR_CRASH_P3, HV_X64_MSR_CRASH_P4, HV_X64_MSR_CRASH_CTL,
	HV_X64_MSR_RESET,
	HV_X64_MSR_VP_INDEX,
	HV_X64_MSR_VP_RUNTIME,
	HV_X64_MSR_SCONTROL,
	HV_X64_MSR_STIMER0_CONFIG,
	HV_X64_MSR_VP_ASSIST_PAGE,
	HV_X64_MSR_REENLIGHTENMENT_CONTROL, HV_X64_MSR_TSC_EMULATION_CONTROL,
	HV_X64_MSR_TSC_EMULATION_STATUS, HV_X64_MSR_TSC_INVARIANT_CONTROL,
	HV_X64_MSR_SYNDBG_OPTIONS,
	HV_X64_MSR_SYNDBG_CONTROL, HV_X64_MSR_SYNDBG_STATUS,
	HV_X64_MSR_SYNDBG_SEND_BUFFER, HV_X64_MSR_SYNDBG_RECV_BUFFER,
	HV_X64_MSR_SYNDBG_PENDING_BUFFER,
#endif

	MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
	MSR_KVM_PV_EOI_EN, MSR_KVM_ASYNC_PF_INT, MSR_KVM_ASYNC_PF_ACK,

	MSR_IA32_TSC_ADJUST,
	MSR_IA32_TSC_DEADLINE,
	MSR_IA32_ARCH_CAPABILITIES,
	MSR_IA32_PERF_CAPABILITIES,
	MSR_IA32_MISC_ENABLE,
	MSR_IA32_MCG_STATUS,
	MSR_IA32_MCG_CTL,
	MSR_IA32_MCG_EXT_CTL,
	MSR_IA32_SMBASE,
	MSR_SMI_COUNT,
	MSR_PLATFORM_INFO,
	MSR_MISC_FEATURES_ENABLES,
	MSR_AMD64_VIRT_SPEC_CTRL,
	MSR_AMD64_TSC_RATIO,
	MSR_IA32_POWER_CTL,
	MSR_IA32_UCODE_REV,

	/*
	 * KVM always supports the "true" VMX control MSRs, even if the host
	 * does not.  The VMX MSRs as a whole are considered "emulated" as KVM
	 * doesn't strictly require them to exist in the host (ignoring that
	 * KVM would refuse to load in the first place if the core set of MSRs
	 * aren't supported).
	 */
	MSR_IA32_VMX_BASIC,
	MSR_IA32_VMX_TRUE_PINBASED_CTLS,
	MSR_IA32_VMX_TRUE_PROCBASED_CTLS,
	MSR_IA32_VMX_TRUE_EXIT_CTLS,
	MSR_IA32_VMX_TRUE_ENTRY_CTLS,
	MSR_IA32_VMX_MISC,
	MSR_IA32_VMX_CR0_FIXED0,
	MSR_IA32_VMX_CR4_FIXED0,
	MSR_IA32_VMX_VMCS_ENUM,
	MSR_IA32_VMX_PROCBASED_CTLS2,
	MSR_IA32_VMX_EPT_VPID_CAP,
	MSR_IA32_VMX_VMFUNC,

	MSR_K7_HWCR,
	MSR_KVM_POLL_CONTROL,
};

static u32 emulated_msrs[ARRAY_SIZE(emulated_msrs_all)];
static unsigned num_emulated_msrs;

/*
 * List of MSRs that control the existence of MSR-based features, i.e. MSRs
 * that are effectively CPUID leafs.  VMX MSRs are also included in the set of
 * feature MSRs, but are handled separately to allow expedited lookups.
 */
static const u32 msr_based_features_all_except_vmx[] = {
	MSR_AMD64_DE_CFG,
	MSR_IA32_UCODE_REV,
	MSR_IA32_ARCH_CAPABILITIES,
	MSR_IA32_PERF_CAPABILITIES,
};

static u32 msr_based_features[ARRAY_SIZE(msr_based_features_all_except_vmx) +
			      (KVM_LAST_EMULATED_VMX_MSR - KVM_FIRST_EMULATED_VMX_MSR + 1)];
static unsigned int num_msr_based_features;

/*
 * All feature MSRs except uCode revID, which tracks the currently loaded uCode
 * patch, are immutable once the vCPU model is defined.
 */
static bool kvm_is_immutable_feature_msr(u32 msr)
{
	int i;

	if (msr >= KVM_FIRST_EMULATED_VMX_MSR && msr <= KVM_LAST_EMULATED_VMX_MSR)
		return true;

	for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) {
		if (msr == msr_based_features_all_except_vmx[i])
			return msr != MSR_IA32_UCODE_REV;
	}

	return false;
}

1607 | /* |
1608 | * Some IA32_ARCH_CAPABILITIES bits have dependencies on MSRs that KVM |
1609 | * does not yet virtualize. These include: |
1610 | * 10 - MISC_PACKAGE_CTRLS |
1611 | * 11 - ENERGY_FILTERING_CTL |
1612 | * 12 - DOITM |
1613 | * 18 - FB_CLEAR_CTRL |
1614 | * 21 - XAPIC_DISABLE_STATUS |
1615 | * 23 - OVERCLOCKING_STATUS |
1616 | */ |
1617 | |
1618 | #define KVM_SUPPORTED_ARCH_CAP \ |
1619 | (ARCH_CAP_RDCL_NO | ARCH_CAP_IBRS_ALL | ARCH_CAP_RSBA | \ |
1620 | ARCH_CAP_SKIP_VMENTRY_L1DFLUSH | ARCH_CAP_SSB_NO | ARCH_CAP_MDS_NO | \ |
1621 | ARCH_CAP_PSCHANGE_MC_NO | ARCH_CAP_TSX_CTRL_MSR | ARCH_CAP_TAA_NO | \ |
1622 | ARCH_CAP_SBDR_SSDP_NO | ARCH_CAP_FBSDP_NO | ARCH_CAP_PSDP_NO | \ |
1623 | ARCH_CAP_FB_CLEAR | ARCH_CAP_RRSBA | ARCH_CAP_PBRSB_NO | ARCH_CAP_GDS_NO | \ |
1624 | ARCH_CAP_RFDS_NO | ARCH_CAP_RFDS_CLEAR | ARCH_CAP_BHI_NO) |
1625 | |
1626 | static u64 kvm_get_arch_capabilities(void) |
1627 | { |
1628 | u64 data = host_arch_capabilities & KVM_SUPPORTED_ARCH_CAP; |
1629 | |
1630 | /* |
1631 | * If nx_huge_pages is enabled, KVM's shadow paging will ensure that |
1632 | * the nested hypervisor runs with NX huge pages. If it is not, |
1633 | * L1 is anyway vulnerable to ITLB_MULTIHIT exploits from other |
1634 | * L1 guests, so it need not worry about its own (L2) guests. |
1635 | */ |
1636 | data |= ARCH_CAP_PSCHANGE_MC_NO; |
1637 | |
1638 | /* |
1639 | * If we're doing cache flushes (either "always" or "cond") |
1640 | * we will do one whenever the guest does a vmlaunch/vmresume. |
1641 | * If an outer hypervisor is doing the cache flush for us |
1642 | * (ARCH_CAP_SKIP_VMENTRY_L1DFLUSH), we can safely pass that |
1643 | * capability to the guest too, and if EPT is disabled we're not |
1644 | * vulnerable. Overall, only VMENTER_L1D_FLUSH_NEVER will |
1645 | * require a nested hypervisor to do a flush of its own. |
1646 | */ |
1647 | if (l1tf_vmx_mitigation != VMENTER_L1D_FLUSH_NEVER) |
1648 | data |= ARCH_CAP_SKIP_VMENTRY_L1DFLUSH; |
1649 | |
1650 | if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) |
1651 | data |= ARCH_CAP_RDCL_NO; |
1652 | if (!boot_cpu_has_bug(X86_BUG_SPEC_STORE_BYPASS)) |
1653 | data |= ARCH_CAP_SSB_NO; |
1654 | if (!boot_cpu_has_bug(X86_BUG_MDS)) |
1655 | data |= ARCH_CAP_MDS_NO; |
1656 | if (!boot_cpu_has_bug(X86_BUG_RFDS)) |
1657 | data |= ARCH_CAP_RFDS_NO; |
1658 | |
1659 | if (!boot_cpu_has(X86_FEATURE_RTM)) { |
1660 | /* |
1661 | * If RTM=0 because the kernel has disabled TSX, the host might |
1662 | * have TAA_NO or TSX_CTRL. Clear TAA_NO (the guest sees RTM=0 |
1663 | * and therefore knows that there cannot be TAA) but keep |
1664 | * TSX_CTRL: some buggy userspaces leave it set on tsx=on hosts, |
1665 | * and we want to allow migrating those guests to tsx=off hosts. |
1666 | */ |
1667 | data &= ~ARCH_CAP_TAA_NO; |
1668 | } else if (!boot_cpu_has_bug(X86_BUG_TAA)) { |
1669 | data |= ARCH_CAP_TAA_NO; |
1670 | } else { |
1671 | /* |
1672 | * Nothing to do here; we emulate TSX_CTRL if present on the |
1673 | * host so the guest can choose between disabling TSX or |
1674 | * using VERW to clear CPU buffers. |
1675 | */ |
1676 | } |
1677 | |
1678 | if (!boot_cpu_has_bug(X86_BUG_GDS) || gds_ucode_mitigated()) |
1679 | data |= ARCH_CAP_GDS_NO; |
1680 | |
1681 | return data; |
1682 | } |
1683 | |
1684 | static int kvm_get_msr_feature(struct kvm_msr_entry *msr) |
1685 | { |
1686 | switch (msr->index) { |
1687 | case MSR_IA32_ARCH_CAPABILITIES: |
1688 | msr->data = kvm_get_arch_capabilities(); |
1689 | break; |
1690 | case MSR_IA32_PERF_CAPABILITIES: |
1691 | msr->data = kvm_caps.supported_perf_cap; |
1692 | break; |
1693 | case MSR_IA32_UCODE_REV: |
		rdmsrl_safe(msr->index, &msr->data);
1695 | break; |
1696 | default: |
1697 | return static_call(kvm_x86_get_msr_feature)(msr); |
1698 | } |
1699 | return 0; |
1700 | } |
1701 | |
1702 | static int do_get_msr_feature(struct kvm_vcpu *vcpu, unsigned index, u64 *data) |
1703 | { |
1704 | struct kvm_msr_entry msr; |
1705 | int r; |
1706 | |
1707 | /* Unconditionally clear the output for simplicity */ |
1708 | msr.data = 0; |
1709 | msr.index = index; |
	r = kvm_get_msr_feature(&msr);

	if (r == KVM_MSR_RET_INVALID && kvm_msr_ignored_check(index, 0, false))
1713 | r = 0; |
1714 | |
1715 | *data = msr.data; |
1716 | |
1717 | return r; |
1718 | } |
1719 | |
1720 | static bool __kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer) |
1721 | { |
1722 | if (efer & EFER_AUTOIBRS && !guest_cpuid_has(vcpu, X86_FEATURE_AUTOIBRS)) |
1723 | return false; |
1724 | |
1725 | if (efer & EFER_FFXSR && !guest_cpuid_has(vcpu, X86_FEATURE_FXSR_OPT)) |
1726 | return false; |
1727 | |
1728 | if (efer & EFER_SVME && !guest_cpuid_has(vcpu, X86_FEATURE_SVM)) |
1729 | return false; |
1730 | |
1731 | if (efer & (EFER_LME | EFER_LMA) && |
1732 | !guest_cpuid_has(vcpu, X86_FEATURE_LM)) |
1733 | return false; |
1734 | |
1735 | if (efer & EFER_NX && !guest_cpuid_has(vcpu, X86_FEATURE_NX)) |
1736 | return false; |
1737 | |
	return true;
}

bool kvm_valid_efer(struct kvm_vcpu *vcpu, u64 efer)
1742 | { |
1743 | if (efer & efer_reserved_bits) |
1744 | return false; |
1745 | |
1746 | return __kvm_valid_efer(vcpu, efer); |
1747 | } |
1748 | EXPORT_SYMBOL_GPL(kvm_valid_efer); |
1749 | |
1750 | static int set_efer(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
1751 | { |
1752 | u64 old_efer = vcpu->arch.efer; |
1753 | u64 efer = msr_info->data; |
1754 | int r; |
1755 | |
1756 | if (efer & efer_reserved_bits) |
1757 | return 1; |
1758 | |
1759 | if (!msr_info->host_initiated) { |
1760 | if (!__kvm_valid_efer(vcpu, efer)) |
1761 | return 1; |
1762 | |
1763 | if (is_paging(vcpu) && |
1764 | (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME)) |
1765 | return 1; |
1766 | } |
1767 | |
1768 | efer &= ~EFER_LMA; |
1769 | efer |= vcpu->arch.efer & EFER_LMA; |
1770 | |
1771 | r = static_call(kvm_x86_set_efer)(vcpu, efer); |
1772 | if (r) { |
1773 | WARN_ON(r > 0); |
1774 | return r; |
1775 | } |
1776 | |
1777 | if ((efer ^ old_efer) & KVM_MMU_EFER_ROLE_BITS) |
1778 | kvm_mmu_reset_context(vcpu); |
1779 | |
1780 | if (!static_cpu_has(X86_FEATURE_XSAVES) && |
1781 | (efer & EFER_SVME)) |
1782 | kvm_hv_xsaves_xsavec_maybe_warn(vcpu); |
1783 | |
1784 | return 0; |
1785 | } |
1786 | |
1787 | void kvm_enable_efer_bits(u64 mask) |
1788 | { |
1789 | efer_reserved_bits &= ~mask; |
1790 | } |
1791 | EXPORT_SYMBOL_GPL(kvm_enable_efer_bits); |
1792 | |
1793 | bool kvm_msr_allowed(struct kvm_vcpu *vcpu, u32 index, u32 type) |
1794 | { |
1795 | struct kvm_x86_msr_filter *msr_filter; |
1796 | struct msr_bitmap_range *ranges; |
1797 | struct kvm *kvm = vcpu->kvm; |
1798 | bool allowed; |
1799 | int idx; |
1800 | u32 i; |
1801 | |
1802 | /* x2APIC MSRs do not support filtering. */ |
1803 | if (index >= 0x800 && index <= 0x8ff) |
1804 | return true; |
1805 | |
	idx = srcu_read_lock(&kvm->srcu);
1807 | |
1808 | msr_filter = srcu_dereference(kvm->arch.msr_filter, &kvm->srcu); |
1809 | if (!msr_filter) { |
1810 | allowed = true; |
1811 | goto out; |
1812 | } |
1813 | |
1814 | allowed = msr_filter->default_allow; |
1815 | ranges = msr_filter->ranges; |
1816 | |
1817 | for (i = 0; i < msr_filter->count; i++) { |
1818 | u32 start = ranges[i].base; |
1819 | u32 end = start + ranges[i].nmsrs; |
1820 | u32 flags = ranges[i].flags; |
1821 | unsigned long *bitmap = ranges[i].bitmap; |
1822 | |
1823 | if ((index >= start) && (index < end) && (flags & type)) { |
1824 | allowed = test_bit(index - start, bitmap); |
1825 | break; |
1826 | } |
1827 | } |
1828 | |
1829 | out: |
	srcu_read_unlock(&kvm->srcu, idx);
1831 | |
1832 | return allowed; |
1833 | } |
1834 | EXPORT_SYMBOL_GPL(kvm_msr_allowed); |
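
/*
 * Illustration (example values, not from this file): each filter range
 * covers MSRs [base, base + nmsrs) with one bitmap bit per MSR. A range
 * with base 0xc0000080 and nmsrs 3 matches EFER (0xc0000080),
 * STAR (0xc0000081) and LSTAR (0xc0000082); an access to 0xc0000081 is
 * decided by test_bit(1, bitmap), i.e. bit "index - base".
 */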
1835 | |
1836 | /* |
1837 | * Write @data into the MSR specified by @index. Select MSR specific fault |
1838 | * checks are bypassed if @host_initiated is %true. |
1839 | * Returns 0 on success, non-0 otherwise. |
1840 | * Assumes vcpu_load() was already called. |
1841 | */ |
1842 | static int __kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data, |
1843 | bool host_initiated) |
1844 | { |
1845 | struct msr_data msr; |
1846 | |
1847 | switch (index) { |
1848 | case MSR_FS_BASE: |
1849 | case MSR_GS_BASE: |
1850 | case MSR_KERNEL_GS_BASE: |
1851 | case MSR_CSTAR: |
1852 | case MSR_LSTAR: |
		if (is_noncanonical_address(data, vcpu))
1854 | return 1; |
1855 | break; |
1856 | case MSR_IA32_SYSENTER_EIP: |
1857 | case MSR_IA32_SYSENTER_ESP: |
1858 | /* |
1859 | * IA32_SYSENTER_ESP and IA32_SYSENTER_EIP cause #GP if |
1860 | * non-canonical address is written on Intel but not on |
1861 | * AMD (which ignores the top 32-bits, because it does |
1862 | * not implement 64-bit SYSENTER). |
1863 | * |
1864 | * 64-bit code should hence be able to write a non-canonical |
1865 | * value on AMD. Making the address canonical ensures that |
1866 | * vmentry does not fail on Intel after writing a non-canonical |
1867 | * value, and that something deterministic happens if the guest |
1868 | * invokes 64-bit SYSENTER. |
1869 | */ |
		data = __canonical_address(data, vcpu_virt_addr_bits(vcpu));
1871 | break; |
1872 | case MSR_TSC_AUX: |
1873 | if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) |
1874 | return 1; |
1875 | |
1876 | if (!host_initiated && |
1877 | !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && |
1878 | !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) |
1879 | return 1; |
1880 | |
1881 | /* |
1882 | * Per Intel's SDM, bits 63:32 are reserved, but AMD's APM has |
1883 | * incomplete and conflicting architectural behavior. Current |
1884 | * AMD CPUs completely ignore bits 63:32, i.e. they aren't |
1885 | * reserved and always read as zeros. Enforce Intel's reserved |
1886 | * bits check if and only if the guest CPU is Intel, and clear |
1887 | * the bits in all other cases. This ensures cross-vendor |
1888 | * migration will provide consistent behavior for the guest. |
1889 | */ |
1890 | if (guest_cpuid_is_intel(vcpu) && (data >> 32) != 0) |
1891 | return 1; |
1892 | |
1893 | data = (u32)data; |
1894 | break; |
1895 | } |
1896 | |
1897 | msr.data = data; |
1898 | msr.index = index; |
1899 | msr.host_initiated = host_initiated; |
1900 | |
1901 | return static_call(kvm_x86_set_msr)(vcpu, &msr); |
1902 | } |
1903 | |
1904 | static int kvm_set_msr_ignored_check(struct kvm_vcpu *vcpu, |
1905 | u32 index, u64 data, bool host_initiated) |
1906 | { |
1907 | int ret = __kvm_set_msr(vcpu, index, data, host_initiated); |
1908 | |
1909 | if (ret == KVM_MSR_RET_INVALID) |
		if (kvm_msr_ignored_check(index, data, true))
1911 | ret = 0; |
1912 | |
1913 | return ret; |
1914 | } |
1915 | |
1916 | /* |
1917 | * Read the MSR specified by @index into @data. Select MSR specific fault |
1918 | * checks are bypassed if @host_initiated is %true. |
1919 | * Returns 0 on success, non-0 otherwise. |
1920 | * Assumes vcpu_load() was already called. |
1921 | */ |
1922 | int __kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data, |
1923 | bool host_initiated) |
1924 | { |
1925 | struct msr_data msr; |
1926 | int ret; |
1927 | |
1928 | switch (index) { |
1929 | case MSR_TSC_AUX: |
1930 | if (!kvm_is_supported_user_return_msr(MSR_TSC_AUX)) |
1931 | return 1; |
1932 | |
1933 | if (!host_initiated && |
1934 | !guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) && |
1935 | !guest_cpuid_has(vcpu, X86_FEATURE_RDPID)) |
1936 | return 1; |
1937 | break; |
1938 | } |
1939 | |
1940 | msr.index = index; |
1941 | msr.host_initiated = host_initiated; |
1942 | |
1943 | ret = static_call(kvm_x86_get_msr)(vcpu, &msr); |
1944 | if (!ret) |
1945 | *data = msr.data; |
1946 | return ret; |
1947 | } |
1948 | |
1949 | static int kvm_get_msr_ignored_check(struct kvm_vcpu *vcpu, |
1950 | u32 index, u64 *data, bool host_initiated) |
1951 | { |
1952 | int ret = __kvm_get_msr(vcpu, index, data, host_initiated); |
1953 | |
1954 | if (ret == KVM_MSR_RET_INVALID) { |
1955 | /* Unconditionally clear *data for simplicity */ |
1956 | *data = 0; |
		if (kvm_msr_ignored_check(index, 0, false))
1958 | ret = 0; |
1959 | } |
1960 | |
1961 | return ret; |
1962 | } |
1963 | |
1964 | static int kvm_get_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 *data) |
1965 | { |
1966 | if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_READ)) |
1967 | return KVM_MSR_RET_FILTERED; |
	return kvm_get_msr_ignored_check(vcpu, index, data, false);
1969 | } |
1970 | |
1971 | static int kvm_set_msr_with_filter(struct kvm_vcpu *vcpu, u32 index, u64 data) |
1972 | { |
1973 | if (!kvm_msr_allowed(vcpu, index, KVM_MSR_FILTER_WRITE)) |
1974 | return KVM_MSR_RET_FILTERED; |
	return kvm_set_msr_ignored_check(vcpu, index, data, false);
1976 | } |
1977 | |
1978 | int kvm_get_msr(struct kvm_vcpu *vcpu, u32 index, u64 *data) |
1979 | { |
	return kvm_get_msr_ignored_check(vcpu, index, data, false);
1981 | } |
1982 | EXPORT_SYMBOL_GPL(kvm_get_msr); |
1983 | |
1984 | int kvm_set_msr(struct kvm_vcpu *vcpu, u32 index, u64 data) |
1985 | { |
	return kvm_set_msr_ignored_check(vcpu, index, data, false);
1987 | } |
1988 | EXPORT_SYMBOL_GPL(kvm_set_msr); |
1989 | |
1990 | static void complete_userspace_rdmsr(struct kvm_vcpu *vcpu) |
1991 | { |
1992 | if (!vcpu->run->msr.error) { |
		kvm_rax_write(vcpu, (u32)vcpu->run->msr.data);
		kvm_rdx_write(vcpu, vcpu->run->msr.data >> 32);
1995 | } |
1996 | } |
1997 | |
1998 | static int complete_emulated_msr_access(struct kvm_vcpu *vcpu) |
1999 | { |
	return complete_emulated_insn_gp(vcpu, vcpu->run->msr.error);
2001 | } |
2002 | |
2003 | static int complete_emulated_rdmsr(struct kvm_vcpu *vcpu) |
2004 | { |
2005 | complete_userspace_rdmsr(vcpu); |
2006 | return complete_emulated_msr_access(vcpu); |
2007 | } |
2008 | |
2009 | static int complete_fast_msr_access(struct kvm_vcpu *vcpu) |
2010 | { |
2011 | return static_call(kvm_x86_complete_emulated_msr)(vcpu, vcpu->run->msr.error); |
2012 | } |
2013 | |
2014 | static int complete_fast_rdmsr(struct kvm_vcpu *vcpu) |
2015 | { |
2016 | complete_userspace_rdmsr(vcpu); |
2017 | return complete_fast_msr_access(vcpu); |
2018 | } |
2019 | |
2020 | static u64 kvm_msr_reason(int r) |
2021 | { |
2022 | switch (r) { |
2023 | case KVM_MSR_RET_INVALID: |
2024 | return KVM_MSR_EXIT_REASON_UNKNOWN; |
2025 | case KVM_MSR_RET_FILTERED: |
2026 | return KVM_MSR_EXIT_REASON_FILTER; |
2027 | default: |
2028 | return KVM_MSR_EXIT_REASON_INVAL; |
2029 | } |
2030 | } |
2031 | |
2032 | static int kvm_msr_user_space(struct kvm_vcpu *vcpu, u32 index, |
2033 | u32 exit_reason, u64 data, |
2034 | int (*completion)(struct kvm_vcpu *vcpu), |
2035 | int r) |
2036 | { |
2037 | u64 msr_reason = kvm_msr_reason(r); |
2038 | |
2039 | /* Check if the user wanted to know about this MSR fault */ |
2040 | if (!(vcpu->kvm->arch.user_space_msr_mask & msr_reason)) |
2041 | return 0; |
2042 | |
2043 | vcpu->run->exit_reason = exit_reason; |
2044 | vcpu->run->msr.error = 0; |
2045 | memset(vcpu->run->msr.pad, 0, sizeof(vcpu->run->msr.pad)); |
2046 | vcpu->run->msr.reason = msr_reason; |
2047 | vcpu->run->msr.index = index; |
2048 | vcpu->run->msr.data = data; |
2049 | vcpu->arch.complete_userspace_io = completion; |
2050 | |
2051 | return 1; |
2052 | } |
2053 | |
2054 | int kvm_emulate_rdmsr(struct kvm_vcpu *vcpu) |
2055 | { |
2056 | u32 ecx = kvm_rcx_read(vcpu); |
2057 | u64 data; |
2058 | int r; |
2059 | |
	r = kvm_get_msr_with_filter(vcpu, ecx, &data);
2061 | |
2062 | if (!r) { |
2063 | trace_kvm_msr_read(ecx, data); |
2064 | |
		kvm_rax_write(vcpu, data & -1u);
		kvm_rdx_write(vcpu, (data >> 32) & -1u);
2067 | } else { |
2068 | /* MSR read failed? See if we should ask user space */ |
		if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_RDMSR, 0,
				       complete_fast_rdmsr, r))
2071 | return 0; |
2072 | trace_kvm_msr_read_ex(ecx); |
2073 | } |
2074 | |
2075 | return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); |
2076 | } |
2077 | EXPORT_SYMBOL_GPL(kvm_emulate_rdmsr); |
2078 | |
2079 | int kvm_emulate_wrmsr(struct kvm_vcpu *vcpu) |
2080 | { |
2081 | u32 ecx = kvm_rcx_read(vcpu); |
2082 | u64 data = kvm_read_edx_eax(vcpu); |
2083 | int r; |
2084 | |
	r = kvm_set_msr_with_filter(vcpu, ecx, data);
2086 | |
2087 | if (!r) { |
2088 | trace_kvm_msr_write(ecx, data); |
2089 | } else { |
2090 | /* MSR write failed? See if we should ask user space */ |
		if (kvm_msr_user_space(vcpu, ecx, KVM_EXIT_X86_WRMSR, data,
				       complete_fast_msr_access, r))
2093 | return 0; |
2094 | /* Signal all other negative errors to userspace */ |
2095 | if (r < 0) |
2096 | return r; |
2097 | trace_kvm_msr_write_ex(ecx, data); |
2098 | } |
2099 | |
2100 | return static_call(kvm_x86_complete_emulated_msr)(vcpu, r); |
2101 | } |
2102 | EXPORT_SYMBOL_GPL(kvm_emulate_wrmsr); |
2103 | |
2104 | int kvm_emulate_as_nop(struct kvm_vcpu *vcpu) |
2105 | { |
2106 | return kvm_skip_emulated_instruction(vcpu); |
2107 | } |
2108 | |
2109 | int kvm_emulate_invd(struct kvm_vcpu *vcpu) |
2110 | { |
2111 | /* Treat an INVD instruction as a NOP and just skip it. */ |
2112 | return kvm_emulate_as_nop(vcpu); |
2113 | } |
2114 | EXPORT_SYMBOL_GPL(kvm_emulate_invd); |
2115 | |
2116 | int kvm_handle_invalid_op(struct kvm_vcpu *vcpu) |
2117 | { |
2118 | kvm_queue_exception(vcpu, UD_VECTOR); |
2119 | return 1; |
2120 | } |
2121 | EXPORT_SYMBOL_GPL(kvm_handle_invalid_op); |
2122 | |
2123 | |
2124 | static int kvm_emulate_monitor_mwait(struct kvm_vcpu *vcpu, const char *insn) |
2125 | { |
	if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MWAIT_NEVER_UD_FAULTS) &&
2127 | !guest_cpuid_has(vcpu, X86_FEATURE_MWAIT)) |
2128 | return kvm_handle_invalid_op(vcpu); |
2129 | |
2130 | pr_warn_once("%s instruction emulated as NOP!\n" , insn); |
2131 | return kvm_emulate_as_nop(vcpu); |
2132 | } |
2133 | int kvm_emulate_mwait(struct kvm_vcpu *vcpu) |
2134 | { |
2135 | return kvm_emulate_monitor_mwait(vcpu, insn: "MWAIT" ); |
2136 | } |
2137 | EXPORT_SYMBOL_GPL(kvm_emulate_mwait); |
2138 | |
2139 | int kvm_emulate_monitor(struct kvm_vcpu *vcpu) |
2140 | { |
	return kvm_emulate_monitor_mwait(vcpu, "MONITOR");
2142 | } |
2143 | EXPORT_SYMBOL_GPL(kvm_emulate_monitor); |
2144 | |
2145 | static inline bool kvm_vcpu_exit_request(struct kvm_vcpu *vcpu) |
2146 | { |
2147 | xfer_to_guest_mode_prepare(); |
2148 | return vcpu->mode == EXITING_GUEST_MODE || kvm_request_pending(vcpu) || |
2149 | xfer_to_guest_mode_work_pending(); |
2150 | } |
2151 | |
2152 | /* |
2153 | * The fast path for frequent and performance sensitive wrmsr emulation, |
2154 | * i.e. the sending of IPI, sending IPI early in the VM-Exit flow reduces |
2155 | * the latency of virtual IPI by avoiding the expensive bits of transitioning |
2156 | * from guest to host, e.g. reacquiring KVM's SRCU lock. In contrast to the |
2157 | * other cases which must be called after interrupts are enabled on the host. |
2158 | */ |
2159 | static int handle_fastpath_set_x2apic_icr_irqoff(struct kvm_vcpu *vcpu, u64 data) |
2160 | { |
	if (!lapic_in_kernel(vcpu) || !apic_x2apic_mode(vcpu->arch.apic))
2162 | return 1; |
2163 | |
2164 | if (((data & APIC_SHORT_MASK) == APIC_DEST_NOSHORT) && |
2165 | ((data & APIC_DEST_MASK) == APIC_DEST_PHYSICAL) && |
2166 | ((data & APIC_MODE_MASK) == APIC_DM_FIXED) && |
2167 | ((u32)(data >> 32) != X2APIC_BROADCAST)) |
		return kvm_x2apic_icr_write(vcpu->arch.apic, data);
2169 | |
2170 | return 1; |
2171 | } |
2172 | |
2173 | static int handle_fastpath_set_tscdeadline(struct kvm_vcpu *vcpu, u64 data) |
2174 | { |
2175 | if (!kvm_can_use_hv_timer(vcpu)) |
2176 | return 1; |
2177 | |
2178 | kvm_set_lapic_tscdeadline_msr(vcpu, data); |
2179 | return 0; |
2180 | } |
2181 | |
2182 | fastpath_t handle_fastpath_set_msr_irqoff(struct kvm_vcpu *vcpu) |
2183 | { |
2184 | u32 msr = kvm_rcx_read(vcpu); |
2185 | u64 data; |
2186 | fastpath_t ret = EXIT_FASTPATH_NONE; |
2187 | |
2188 | kvm_vcpu_srcu_read_lock(vcpu); |
2189 | |
2190 | switch (msr) { |
2191 | case APIC_BASE_MSR + (APIC_ICR >> 4): |
2192 | data = kvm_read_edx_eax(vcpu); |
2193 | if (!handle_fastpath_set_x2apic_icr_irqoff(vcpu, data)) { |
2194 | kvm_skip_emulated_instruction(vcpu); |
2195 | ret = EXIT_FASTPATH_EXIT_HANDLED; |
2196 | } |
2197 | break; |
2198 | case MSR_IA32_TSC_DEADLINE: |
2199 | data = kvm_read_edx_eax(vcpu); |
2200 | if (!handle_fastpath_set_tscdeadline(vcpu, data)) { |
2201 | kvm_skip_emulated_instruction(vcpu); |
2202 | ret = EXIT_FASTPATH_REENTER_GUEST; |
2203 | } |
2204 | break; |
2205 | default: |
2206 | break; |
2207 | } |
2208 | |
2209 | if (ret != EXIT_FASTPATH_NONE) |
2210 | trace_kvm_msr_write(msr, data); |
2211 | |
2212 | kvm_vcpu_srcu_read_unlock(vcpu); |
2213 | |
2214 | return ret; |
2215 | } |
2216 | EXPORT_SYMBOL_GPL(handle_fastpath_set_msr_irqoff); |
2217 | |
2218 | /* |
2219 | * Adapt set_msr() to msr_io()'s calling convention |
2220 | */ |
2221 | static int do_get_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) |
2222 | { |
	return kvm_get_msr_ignored_check(vcpu, index, data, true);
2224 | } |
2225 | |
2226 | static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data) |
2227 | { |
2228 | u64 val; |
2229 | |
2230 | /* |
2231 | * Disallow writes to immutable feature MSRs after KVM_RUN. KVM does |
2232 | * not support modifying the guest vCPU model on the fly, e.g. changing |
2233 | * the nVMX capabilities while L2 is running is nonsensical. Ignore |
2234 | * writes of the same value, e.g. to allow userspace to blindly stuff |
2235 | * all MSRs when emulating RESET. |
2236 | */ |
	if (kvm_vcpu_has_run(vcpu) && kvm_is_immutable_feature_msr(index)) {
		if (do_get_msr(vcpu, index, &val) || *data != val)
2239 | return -EINVAL; |
2240 | |
2241 | return 0; |
2242 | } |
2243 | |
	return kvm_set_msr_ignored_check(vcpu, index, *data, true);
2245 | } |
2246 | |
2247 | #ifdef CONFIG_X86_64 |
2248 | struct pvclock_clock { |
2249 | int vclock_mode; |
2250 | u64 cycle_last; |
2251 | u64 mask; |
2252 | u32 mult; |
2253 | u32 shift; |
2254 | u64 base_cycles; |
2255 | u64 offset; |
2256 | }; |
2257 | |
2258 | struct pvclock_gtod_data { |
2259 | seqcount_t seq; |
2260 | |
2261 | struct pvclock_clock clock; /* extract of a clocksource struct */ |
2262 | struct pvclock_clock raw_clock; /* extract of a clocksource struct */ |
2263 | |
2264 | ktime_t offs_boot; |
2265 | u64 wall_time_sec; |
2266 | }; |
2267 | |
2268 | static struct pvclock_gtod_data pvclock_gtod_data; |
2269 | |
2270 | static void update_pvclock_gtod(struct timekeeper *tk) |
2271 | { |
2272 | struct pvclock_gtod_data *vdata = &pvclock_gtod_data; |
2273 | |
2274 | write_seqcount_begin(&vdata->seq); |
2275 | |
2276 | /* copy pvclock gtod data */ |
2277 | vdata->clock.vclock_mode = tk->tkr_mono.clock->vdso_clock_mode; |
2278 | vdata->clock.cycle_last = tk->tkr_mono.cycle_last; |
2279 | vdata->clock.mask = tk->tkr_mono.mask; |
2280 | vdata->clock.mult = tk->tkr_mono.mult; |
2281 | vdata->clock.shift = tk->tkr_mono.shift; |
2282 | vdata->clock.base_cycles = tk->tkr_mono.xtime_nsec; |
2283 | vdata->clock.offset = tk->tkr_mono.base; |
2284 | |
2285 | vdata->raw_clock.vclock_mode = tk->tkr_raw.clock->vdso_clock_mode; |
2286 | vdata->raw_clock.cycle_last = tk->tkr_raw.cycle_last; |
2287 | vdata->raw_clock.mask = tk->tkr_raw.mask; |
2288 | vdata->raw_clock.mult = tk->tkr_raw.mult; |
2289 | vdata->raw_clock.shift = tk->tkr_raw.shift; |
2290 | vdata->raw_clock.base_cycles = tk->tkr_raw.xtime_nsec; |
2291 | vdata->raw_clock.offset = tk->tkr_raw.base; |
2292 | |
2293 | vdata->wall_time_sec = tk->xtime_sec; |
2294 | |
2295 | vdata->offs_boot = tk->offs_boot; |
2296 | |
2297 | write_seqcount_end(&vdata->seq); |
2298 | } |
2299 | |
2300 | static s64 get_kvmclock_base_ns(void) |
2301 | { |
2302 | /* Count up from boot time, but with the frequency of the raw clock. */ |
2303 | return ktime_to_ns(ktime_add(ktime_get_raw(), pvclock_gtod_data.offs_boot)); |
2304 | } |
2305 | #else |
2306 | static s64 get_kvmclock_base_ns(void) |
2307 | { |
2308 | /* Master clock not used, so we can just use CLOCK_BOOTTIME. */ |
2309 | return ktime_get_boottime_ns(); |
2310 | } |
2311 | #endif |
2312 | |
2313 | static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock, int sec_hi_ofs) |
2314 | { |
2315 | int version; |
2316 | int r; |
2317 | struct pvclock_wall_clock wc; |
2318 | u32 wc_sec_hi; |
2319 | u64 wall_nsec; |
2320 | |
2321 | if (!wall_clock) |
2322 | return; |
2323 | |
	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
2325 | if (r) |
2326 | return; |
2327 | |
2328 | if (version & 1) |
2329 | ++version; /* first time write, random junk */ |
2330 | |
2331 | ++version; |
2332 | |
	if (kvm_write_guest(kvm, wall_clock, &version, sizeof(version)))
2334 | return; |
2335 | |
2336 | wall_nsec = kvm_get_wall_clock_epoch(kvm); |
2337 | |
2338 | wc.nsec = do_div(wall_nsec, NSEC_PER_SEC); |
2339 | wc.sec = (u32)wall_nsec; /* overflow in 2106 guest time */ |
2340 | wc.version = version; |
2341 | |
	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));
2343 | |
2344 | if (sec_hi_ofs) { |
2345 | wc_sec_hi = wall_nsec >> 32; |
		kvm_write_guest(kvm, wall_clock + sec_hi_ofs,
				&wc_sec_hi, sizeof(wc_sec_hi));
2348 | } |
2349 | |
2350 | version++; |
	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));
2352 | } |
2353 | |
2354 | static void kvm_write_system_time(struct kvm_vcpu *vcpu, gpa_t system_time, |
2355 | bool old_msr, bool host_initiated) |
2356 | { |
2357 | struct kvm_arch *ka = &vcpu->kvm->arch; |
2358 | |
2359 | if (vcpu->vcpu_id == 0 && !host_initiated) { |
2360 | if (ka->boot_vcpu_runs_old_kvmclock != old_msr) |
2361 | kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); |
2362 | |
2363 | ka->boot_vcpu_runs_old_kvmclock = old_msr; |
2364 | } |
2365 | |
2366 | vcpu->arch.time = system_time; |
2367 | kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); |
2368 | |
2369 | /* we verify if the enable bit is set... */ |
2370 | if (system_time & 1) |
		kvm_gpc_activate(&vcpu->arch.pv_time, system_time & ~1ULL,
				 sizeof(struct pvclock_vcpu_time_info));
	else
		kvm_gpc_deactivate(&vcpu->arch.pv_time);
}
2378 | |
2379 | static uint32_t div_frac(uint32_t dividend, uint32_t divisor) |
2380 | { |
2381 | do_shl32_div32(dividend, divisor); |
2382 | return dividend; |
2383 | } |
2384 | |
2385 | static void kvm_get_time_scale(uint64_t scaled_hz, uint64_t base_hz, |
2386 | s8 *pshift, u32 *pmultiplier) |
2387 | { |
2388 | uint64_t scaled64; |
2389 | int32_t shift = 0; |
2390 | uint64_t tps64; |
2391 | uint32_t tps32; |
2392 | |
2393 | tps64 = base_hz; |
2394 | scaled64 = scaled_hz; |
2395 | while (tps64 > scaled64*2 || tps64 & 0xffffffff00000000ULL) { |
2396 | tps64 >>= 1; |
2397 | shift--; |
2398 | } |
2399 | |
2400 | tps32 = (uint32_t)tps64; |
2401 | while (tps32 <= scaled64 || scaled64 & 0xffffffff00000000ULL) { |
2402 | if (scaled64 & 0xffffffff00000000ULL || tps32 & 0x80000000) |
2403 | scaled64 >>= 1; |
2404 | else |
2405 | tps32 <<= 1; |
2406 | shift++; |
2407 | } |
2408 | |
2409 | *pshift = shift; |
	*pmultiplier = div_frac(scaled64, tps32);
2411 | } |
2412 | |
2413 | #ifdef CONFIG_X86_64 |
2414 | static atomic_t kvm_guest_has_master_clock = ATOMIC_INIT(0); |
2415 | #endif |
2416 | |
2417 | static DEFINE_PER_CPU(unsigned long, cpu_tsc_khz); |
2418 | static unsigned long max_tsc_khz; |
2419 | |
2420 | static u32 adjust_tsc_khz(u32 khz, s32 ppm) |
2421 | { |
2422 | u64 v = (u64)khz * (1000000 + ppm); |
2423 | do_div(v, 1000000); |
2424 | return v; |
2425 | } |
2426 | |
2427 | static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier); |
2428 | |
2429 | static int set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz, bool scale) |
2430 | { |
2431 | u64 ratio; |
2432 | |
2433 | /* Guest TSC same frequency as host TSC? */ |
2434 | if (!scale) { |
		kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
2436 | return 0; |
2437 | } |
2438 | |
2439 | /* TSC scaling supported? */ |
2440 | if (!kvm_caps.has_tsc_control) { |
2441 | if (user_tsc_khz > tsc_khz) { |
2442 | vcpu->arch.tsc_catchup = 1; |
2443 | vcpu->arch.tsc_always_catchup = 1; |
2444 | return 0; |
2445 | } else { |
2446 | pr_warn_ratelimited("user requested TSC rate below hardware speed\n" ); |
2447 | return -1; |
2448 | } |
2449 | } |
2450 | |
2451 | /* TSC scaling required - calculate ratio */ |
	ratio = mul_u64_u32_div(1ULL << kvm_caps.tsc_scaling_ratio_frac_bits,
				user_tsc_khz, tsc_khz);
2454 | |
2455 | if (ratio == 0 || ratio >= kvm_caps.max_tsc_scaling_ratio) { |
2456 | pr_warn_ratelimited("Invalid TSC scaling ratio - virtual-tsc-khz=%u\n" , |
2457 | user_tsc_khz); |
2458 | return -1; |
2459 | } |
2460 | |
	kvm_vcpu_write_tsc_multiplier(vcpu, ratio);
2462 | return 0; |
2463 | } |
2464 | |
2465 | static int kvm_set_tsc_khz(struct kvm_vcpu *vcpu, u32 user_tsc_khz) |
2466 | { |
2467 | u32 thresh_lo, thresh_hi; |
2468 | int use_scaling = 0; |
2469 | |
2470 | /* tsc_khz can be zero if TSC calibration fails */ |
2471 | if (user_tsc_khz == 0) { |
2472 | /* set tsc_scaling_ratio to a safe value */ |
		kvm_vcpu_write_tsc_multiplier(vcpu, kvm_caps.default_tsc_scaling_ratio);
2474 | return -1; |
2475 | } |
2476 | |
2477 | /* Compute a scale to convert nanoseconds in TSC cycles */ |
	kvm_get_time_scale(user_tsc_khz * 1000LL, NSEC_PER_SEC,
			   &vcpu->arch.virtual_tsc_shift,
			   &vcpu->arch.virtual_tsc_mult);
2481 | vcpu->arch.virtual_tsc_khz = user_tsc_khz; |
2482 | |
2483 | /* |
2484 | * Compute the variation in TSC rate which is acceptable |
2485 | * within the range of tolerance and decide if the |
2486 | * rate being applied is within that bounds of the hardware |
2487 | * rate. If so, no scaling or compensation need be done. |
2488 | */ |
	thresh_lo = adjust_tsc_khz(tsc_khz, -tsc_tolerance_ppm);
	thresh_hi = adjust_tsc_khz(tsc_khz, tsc_tolerance_ppm);
2491 | if (user_tsc_khz < thresh_lo || user_tsc_khz > thresh_hi) { |
2492 | pr_debug("requested TSC rate %u falls outside tolerance [%u,%u]\n" , |
2493 | user_tsc_khz, thresh_lo, thresh_hi); |
2494 | use_scaling = 1; |
2495 | } |
	return set_tsc_khz(vcpu, user_tsc_khz, use_scaling);
2497 | } |
2498 | |
2499 | static u64 compute_guest_tsc(struct kvm_vcpu *vcpu, s64 kernel_ns) |
2500 | { |
	u64 tsc = pvclock_scale_delta(kernel_ns - vcpu->arch.this_tsc_nsec,
				      vcpu->arch.virtual_tsc_mult,
				      vcpu->arch.virtual_tsc_shift);
2504 | tsc += vcpu->arch.this_tsc_write; |
2505 | return tsc; |
2506 | } |
2507 | |
2508 | #ifdef CONFIG_X86_64 |
2509 | static inline bool gtod_is_based_on_tsc(int mode) |
2510 | { |
2511 | return mode == VDSO_CLOCKMODE_TSC || mode == VDSO_CLOCKMODE_HVCLOCK; |
2512 | } |
2513 | #endif |
2514 | |
2515 | static void kvm_track_tsc_matching(struct kvm_vcpu *vcpu, bool new_generation) |
2516 | { |
2517 | #ifdef CONFIG_X86_64 |
2518 | struct kvm_arch *ka = &vcpu->kvm->arch; |
2519 | struct pvclock_gtod_data *gtod = &pvclock_gtod_data; |
2520 | |
2521 | /* |
2522 | * To use the masterclock, the host clocksource must be based on TSC |
2523 | * and all vCPUs must have matching TSCs. Note, the count for matching |
2524 | * vCPUs doesn't include the reference vCPU, hence "+1". |
2525 | */ |
	bool use_master_clock = (ka->nr_vcpus_matched_tsc + 1 ==
				 atomic_read(&vcpu->kvm->online_vcpus)) &&
				gtod_is_based_on_tsc(gtod->clock.vclock_mode);
2529 | |
2530 | /* |
2531 | * Request a masterclock update if the masterclock needs to be toggled |
2532 | * on/off, or when starting a new generation and the masterclock is |
2533 | * enabled (compute_guest_tsc() requires the masterclock snapshot to be |
2534 | * taken _after_ the new generation is created). |
2535 | */ |
2536 | if ((ka->use_master_clock && new_generation) || |
2537 | (ka->use_master_clock != use_master_clock)) |
2538 | kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); |
2539 | |
	trace_kvm_track_tsc(vcpu->vcpu_id, ka->nr_vcpus_matched_tsc,
			    atomic_read(&vcpu->kvm->online_vcpus),
			    ka->use_master_clock, gtod->clock.vclock_mode);
2543 | #endif |
2544 | } |
2545 | |
2546 | /* |
2547 | * Multiply tsc by a fixed point number represented by ratio. |
2548 | * |
2549 | * The most significant 64-N bits (mult) of ratio represent the |
2550 | * integral part of the fixed point number; the remaining N bits |
 * (frac) represent the fractional part, i.e. ratio represents a fixed
 * point number (mult + frac * 2^(-N)).
 *
 * N equals kvm_caps.tsc_scaling_ratio_frac_bits.
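 *
 * For example (an illustrative value, not from the source): with N = 48,
 * a ratio of 3ULL << 47 encodes 1.5, so a host TSC delta of 1000 cycles
 * scales to 1500 guest cycles.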
2555 | */ |
2556 | static inline u64 __scale_tsc(u64 ratio, u64 tsc) |
2557 | { |
	return mul_u64_u64_shr(tsc, ratio, kvm_caps.tsc_scaling_ratio_frac_bits);
2559 | } |
2560 | |
2561 | u64 kvm_scale_tsc(u64 tsc, u64 ratio) |
2562 | { |
2563 | u64 _tsc = tsc; |
2564 | |
2565 | if (ratio != kvm_caps.default_tsc_scaling_ratio) |
2566 | _tsc = __scale_tsc(ratio, tsc); |
2567 | |
2568 | return _tsc; |
2569 | } |
2570 | |
2571 | static u64 kvm_compute_l1_tsc_offset(struct kvm_vcpu *vcpu, u64 target_tsc) |
2572 | { |
2573 | u64 tsc; |
2574 | |
	tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio);
2576 | |
2577 | return target_tsc - tsc; |
2578 | } |
2579 | |
2580 | u64 kvm_read_l1_tsc(struct kvm_vcpu *vcpu, u64 host_tsc) |
2581 | { |
2582 | return vcpu->arch.l1_tsc_offset + |
		kvm_scale_tsc(host_tsc, vcpu->arch.l1_tsc_scaling_ratio);
2584 | } |
2585 | EXPORT_SYMBOL_GPL(kvm_read_l1_tsc); |
2586 | |
2587 | u64 kvm_calc_nested_tsc_offset(u64 l1_offset, u64 l2_offset, u64 l2_multiplier) |
2588 | { |
2589 | u64 nested_offset; |
2590 | |
2591 | if (l2_multiplier == kvm_caps.default_tsc_scaling_ratio) |
2592 | nested_offset = l1_offset; |
2593 | else |
		nested_offset = mul_s64_u64_shr((s64) l1_offset, l2_multiplier,
						kvm_caps.tsc_scaling_ratio_frac_bits);
2596 | |
2597 | nested_offset += l2_offset; |
2598 | return nested_offset; |
2599 | } |
2600 | EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_offset); |
2601 | |
2602 | u64 kvm_calc_nested_tsc_multiplier(u64 l1_multiplier, u64 l2_multiplier) |
2603 | { |
2604 | if (l2_multiplier != kvm_caps.default_tsc_scaling_ratio) |
		return mul_u64_u64_shr(l1_multiplier, l2_multiplier,
				       kvm_caps.tsc_scaling_ratio_frac_bits);
2607 | |
2608 | return l1_multiplier; |
2609 | } |
2610 | EXPORT_SYMBOL_GPL(kvm_calc_nested_tsc_multiplier); |
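
/*
 * Composing the two helpers above (with N fractional bits), L2 observes:
 *
 *   L2_tsc = host_tsc * ((mult01 * mult12) >> N) >> N
 *            + ((offset01 * mult12) >> N) + offset12
 *
 * i.e. the host TSC scaled and offset by L1's parameters, then scaled
 * and offset again by L2's.
 */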
2611 | |
2612 | static void kvm_vcpu_write_tsc_offset(struct kvm_vcpu *vcpu, u64 l1_offset) |
2613 | { |
	trace_kvm_write_tsc_offset(vcpu->vcpu_id,
				   vcpu->arch.l1_tsc_offset,
				   l1_offset);
2617 | |
2618 | vcpu->arch.l1_tsc_offset = l1_offset; |
2619 | |
2620 | /* |
2621 | * If we are here because L1 chose not to trap WRMSR to TSC then |
2622 | * according to the spec this should set L1's TSC (as opposed to |
2623 | * setting L1's offset for L2). |
2624 | */ |
2625 | if (is_guest_mode(vcpu)) |
2626 | vcpu->arch.tsc_offset = kvm_calc_nested_tsc_offset( |
2627 | l1_offset, |
2628 | static_call(kvm_x86_get_l2_tsc_offset)(vcpu), |
2629 | static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); |
2630 | else |
2631 | vcpu->arch.tsc_offset = l1_offset; |
2632 | |
2633 | static_call(kvm_x86_write_tsc_offset)(vcpu); |
2634 | } |
2635 | |
2636 | static void kvm_vcpu_write_tsc_multiplier(struct kvm_vcpu *vcpu, u64 l1_multiplier) |
2637 | { |
2638 | vcpu->arch.l1_tsc_scaling_ratio = l1_multiplier; |
2639 | |
2640 | /* Userspace is changing the multiplier while L2 is active */ |
2641 | if (is_guest_mode(vcpu)) |
2642 | vcpu->arch.tsc_scaling_ratio = kvm_calc_nested_tsc_multiplier( |
2643 | l1_multiplier, |
2644 | static_call(kvm_x86_get_l2_tsc_multiplier)(vcpu)); |
2645 | else |
2646 | vcpu->arch.tsc_scaling_ratio = l1_multiplier; |
2647 | |
2648 | if (kvm_caps.has_tsc_control) |
2649 | static_call(kvm_x86_write_tsc_multiplier)(vcpu); |
2650 | } |
2651 | |
2652 | static inline bool kvm_check_tsc_unstable(void) |
2653 | { |
2654 | #ifdef CONFIG_X86_64 |
2655 | /* |
2656 | * TSC is marked unstable when we're running on Hyper-V, |
2657 | * 'TSC page' clocksource is good. |
2658 | */ |
2659 | if (pvclock_gtod_data.clock.vclock_mode == VDSO_CLOCKMODE_HVCLOCK) |
2660 | return false; |
2661 | #endif |
2662 | return check_tsc_unstable(); |
2663 | } |
2664 | |
2665 | /* |
2666 | * Infers attempts to synchronize the guest's tsc from host writes. Sets the |
2667 | * offset for the vcpu and tracks the TSC matching generation that the vcpu |
2668 | * participates in. |
2669 | */ |
2670 | static void __kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 offset, u64 tsc, |
2671 | u64 ns, bool matched) |
2672 | { |
2673 | struct kvm *kvm = vcpu->kvm; |
2674 | |
2675 | lockdep_assert_held(&kvm->arch.tsc_write_lock); |
2676 | |
2677 | /* |
2678 | * We also track th most recent recorded KHZ, write and time to |
2679 | * allow the matching interval to be extended at each write. |
2680 | */ |
2681 | kvm->arch.last_tsc_nsec = ns; |
2682 | kvm->arch.last_tsc_write = tsc; |
2683 | kvm->arch.last_tsc_khz = vcpu->arch.virtual_tsc_khz; |
2684 | kvm->arch.last_tsc_offset = offset; |
2685 | |
2686 | vcpu->arch.last_guest_tsc = tsc; |
2687 | |
	kvm_vcpu_write_tsc_offset(vcpu, offset);
2689 | |
2690 | if (!matched) { |
2691 | /* |
2692 | * We split periods of matched TSC writes into generations. |
2693 | * For each generation, we track the original measured |
2694 | * nanosecond time, offset, and write, so if TSCs are in |
2695 | * sync, we can match exact offset, and if not, we can match |
2696 | * exact software computation in compute_guest_tsc() |
2697 | * |
2698 | * These values are tracked in kvm->arch.cur_xxx variables. |
2699 | */ |
2700 | kvm->arch.cur_tsc_generation++; |
2701 | kvm->arch.cur_tsc_nsec = ns; |
2702 | kvm->arch.cur_tsc_write = tsc; |
2703 | kvm->arch.cur_tsc_offset = offset; |
2704 | kvm->arch.nr_vcpus_matched_tsc = 0; |
2705 | } else if (vcpu->arch.this_tsc_generation != kvm->arch.cur_tsc_generation) { |
2706 | kvm->arch.nr_vcpus_matched_tsc++; |
2707 | } |
2708 | |
2709 | /* Keep track of which generation this VCPU has synchronized to */ |
2710 | vcpu->arch.this_tsc_generation = kvm->arch.cur_tsc_generation; |
2711 | vcpu->arch.this_tsc_nsec = kvm->arch.cur_tsc_nsec; |
2712 | vcpu->arch.this_tsc_write = kvm->arch.cur_tsc_write; |
2713 | |
	kvm_track_tsc_matching(vcpu, !matched);
2715 | } |
2716 | |
2717 | static void kvm_synchronize_tsc(struct kvm_vcpu *vcpu, u64 *user_value) |
2718 | { |
2719 | u64 data = user_value ? *user_value : 0; |
2720 | struct kvm *kvm = vcpu->kvm; |
2721 | u64 offset, ns, elapsed; |
2722 | unsigned long flags; |
2723 | bool matched = false; |
2724 | bool synchronizing = false; |
2725 | |
2726 | raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); |
	offset = kvm_compute_l1_tsc_offset(vcpu, data);
2728 | ns = get_kvmclock_base_ns(); |
2729 | elapsed = ns - kvm->arch.last_tsc_nsec; |
2730 | |
2731 | if (vcpu->arch.virtual_tsc_khz) { |
2732 | if (data == 0) { |
2733 | /* |
2734 | * Force synchronization when creating a vCPU, or when |
2735 | * userspace explicitly writes a zero value. |
2736 | */ |
2737 | synchronizing = true; |
2738 | } else if (kvm->arch.user_set_tsc) { |
2739 | u64 tsc_exp = kvm->arch.last_tsc_write + |
				nsec_to_cycles(vcpu, elapsed);
2741 | u64 tsc_hz = vcpu->arch.virtual_tsc_khz * 1000LL; |
2742 | /* |
2743 | * Here lies UAPI baggage: when a user-initiated TSC write has |
2744 | * a small delta (1 second) of virtual cycle time against the |
2745 | * previously set vCPU, we assume that they were intended to be |
2746 | * in sync and the delta was only due to the racy nature of the |
2747 | * legacy API. |
2748 | * |
2749 | * This trick falls down when restoring a guest which genuinely |
2750 | * has been running for less time than the 1 second of imprecision |
2751 | * which we allow for in the legacy API. In this case, the first |
2752 | * value written by userspace (on any vCPU) should not be subject |
2753 | * to this 'correction' to make it sync up with values that only |
2754 | * come from the kernel's default vCPU creation. Make the 1-second |
2755 | * slop hack only trigger if the user_set_tsc flag is already set. |
2756 | */ |
2757 | synchronizing = data < tsc_exp + tsc_hz && |
2758 | data + tsc_hz > tsc_exp; |
2759 | } |
2760 | } |
2761 | |
2762 | if (user_value) |
2763 | kvm->arch.user_set_tsc = true; |
2764 | |
2765 | /* |
2766 | * For a reliable TSC, we can match TSC offsets, and for an unstable |
2767 | * TSC, we add elapsed time in this computation. We could let the |
2768 | * compensation code attempt to catch up if we fall behind, but |
2769 | * it's better to try to match offsets from the beginning. |
2770 | */ |
2771 | if (synchronizing && |
2772 | vcpu->arch.virtual_tsc_khz == kvm->arch.last_tsc_khz) { |
2773 | if (!kvm_check_tsc_unstable()) { |
2774 | offset = kvm->arch.cur_tsc_offset; |
2775 | } else { |
			u64 delta = nsec_to_cycles(vcpu, elapsed);
			data += delta;
			offset = kvm_compute_l1_tsc_offset(vcpu, data);
2779 | } |
2780 | matched = true; |
2781 | } |
2782 | |
	__kvm_synchronize_tsc(vcpu, offset, data, ns, matched);
2784 | raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); |
2785 | } |
2786 | |
2787 | static inline void adjust_tsc_offset_guest(struct kvm_vcpu *vcpu, |
2788 | s64 adjustment) |
2789 | { |
	u64 tsc_offset = vcpu->arch.l1_tsc_offset;

	kvm_vcpu_write_tsc_offset(vcpu, tsc_offset + adjustment);
2792 | } |
2793 | |
2794 | static inline void adjust_tsc_offset_host(struct kvm_vcpu *vcpu, s64 adjustment) |
2795 | { |
2796 | if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio) |
2797 | WARN_ON(adjustment < 0); |
	adjustment = kvm_scale_tsc((u64) adjustment,
				   vcpu->arch.l1_tsc_scaling_ratio);
2800 | adjust_tsc_offset_guest(vcpu, adjustment); |
2801 | } |
2802 | |
2803 | #ifdef CONFIG_X86_64 |
2804 | |
2805 | static u64 read_tsc(void) |
2806 | { |
2807 | u64 ret = (u64)rdtsc_ordered(); |
2808 | u64 last = pvclock_gtod_data.clock.cycle_last; |
2809 | |
2810 | if (likely(ret >= last)) |
2811 | return ret; |
2812 | |
2813 | /* |
2814 | * GCC likes to generate cmov here, but this branch is extremely |
2815 | * predictable (it's just a function of time and the likely is |
2816 | * very likely) and there's a data dependence, so force GCC |
2817 | * to generate a branch instead. I don't barrier() because |
2818 | * we don't actually need a barrier, and if this function |
2819 | * ever gets inlined it will generate worse code. |
2820 | */ |
	asm volatile ("");
2822 | return last; |
2823 | } |
2824 | |
2825 | static inline u64 vgettsc(struct pvclock_clock *clock, u64 *tsc_timestamp, |
2826 | int *mode) |
2827 | { |
2828 | u64 tsc_pg_val; |
2829 | long v; |
2830 | |
2831 | switch (clock->vclock_mode) { |
2832 | case VDSO_CLOCKMODE_HVCLOCK: |
		if (hv_read_tsc_page_tsc(hv_get_tsc_page(),
					 tsc_timestamp, &tsc_pg_val)) {
2835 | /* TSC page valid */ |
2836 | *mode = VDSO_CLOCKMODE_HVCLOCK; |
2837 | v = (tsc_pg_val - clock->cycle_last) & |
2838 | clock->mask; |
2839 | } else { |
2840 | /* TSC page invalid */ |
2841 | *mode = VDSO_CLOCKMODE_NONE; |
2842 | } |
2843 | break; |
2844 | case VDSO_CLOCKMODE_TSC: |
2845 | *mode = VDSO_CLOCKMODE_TSC; |
2846 | *tsc_timestamp = read_tsc(); |
2847 | v = (*tsc_timestamp - clock->cycle_last) & |
2848 | clock->mask; |
2849 | break; |
2850 | default: |
2851 | *mode = VDSO_CLOCKMODE_NONE; |
2852 | } |
2853 | |
2854 | if (*mode == VDSO_CLOCKMODE_NONE) |
2855 | *tsc_timestamp = v = 0; |
2856 | |
2857 | return v * clock->mult; |
2858 | } |
2859 | |
2860 | /* |
2861 | * As with get_kvmclock_base_ns(), this counts from boot time, at the |
 * frequency of CLOCK_MONOTONIC_RAW (hence adding gtod->offs_boot).
2863 | */ |
2864 | static int do_kvmclock_base(s64 *t, u64 *tsc_timestamp) |
2865 | { |
2866 | struct pvclock_gtod_data *gtod = &pvclock_gtod_data; |
2867 | unsigned long seq; |
2868 | int mode; |
2869 | u64 ns; |
2870 | |
2871 | do { |
		seq = read_seqcount_begin(&gtod->seq);
		ns = gtod->raw_clock.base_cycles;
		ns += vgettsc(&gtod->raw_clock, tsc_timestamp, &mode);
		ns >>= gtod->raw_clock.shift;
		ns += ktime_to_ns(ktime_add(gtod->raw_clock.offset, gtod->offs_boot));
	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2878 | *t = ns; |
2879 | |
2880 | return mode; |
2881 | } |
2882 | |
2883 | /* |
2884 | * This calculates CLOCK_MONOTONIC at the time of the TSC snapshot, with |
2885 | * no boot time offset. |
2886 | */ |
2887 | static int do_monotonic(s64 *t, u64 *tsc_timestamp) |
2888 | { |
2889 | struct pvclock_gtod_data *gtod = &pvclock_gtod_data; |
2890 | unsigned long seq; |
2891 | int mode; |
2892 | u64 ns; |
2893 | |
2894 | do { |
		seq = read_seqcount_begin(&gtod->seq);
		ns = gtod->clock.base_cycles;
		ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
		ns >>= gtod->clock.shift;
		ns += ktime_to_ns(gtod->clock.offset);
	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));
2901 | *t = ns; |
2902 | |
2903 | return mode; |
2904 | } |
2905 | |
2906 | static int do_realtime(struct timespec64 *ts, u64 *tsc_timestamp) |
2907 | { |
2908 | struct pvclock_gtod_data *gtod = &pvclock_gtod_data; |
2909 | unsigned long seq; |
2910 | int mode; |
2911 | u64 ns; |
2912 | |
2913 | do { |
		seq = read_seqcount_begin(&gtod->seq);
		ts->tv_sec = gtod->wall_time_sec;
		ns = gtod->clock.base_cycles;
		ns += vgettsc(&gtod->clock, tsc_timestamp, &mode);
		ns >>= gtod->clock.shift;
	} while (unlikely(read_seqcount_retry(&gtod->seq, seq)));

	ts->tv_sec += __iter_div_u64_rem(ns, NSEC_PER_SEC, &ns);
2922 | ts->tv_nsec = ns; |
2923 | |
2924 | return mode; |
2925 | } |
2926 | |
2927 | /* |
2928 | * Calculates the kvmclock_base_ns (CLOCK_MONOTONIC_RAW + boot time) and |
 * reports the TSC value from which it did so. Returns true if the host
 * is using a TSC-based clocksource.
2931 | */ |
2932 | static bool kvm_get_time_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) |
2933 | { |
2934 | /* checked again under seqlock below */ |
	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2936 | return false; |
2937 | |
	return gtod_is_based_on_tsc(do_kvmclock_base(kernel_ns,
						     tsc_timestamp));
2940 | } |
2941 | |
2942 | /* |
2943 | * Calculates CLOCK_MONOTONIC and reports the TSC value from which it did |
 * so. Returns true if the host is using a TSC-based clocksource.
2945 | */ |
2946 | bool kvm_get_monotonic_and_clockread(s64 *kernel_ns, u64 *tsc_timestamp) |
2947 | { |
2948 | /* checked again under seqlock below */ |
	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2950 | return false; |
2951 | |
	return gtod_is_based_on_tsc(do_monotonic(kernel_ns,
						 tsc_timestamp));
2954 | } |
2955 | |
2956 | /* |
2957 | * Calculates CLOCK_REALTIME and reports the TSC value from which it did |
 * so. Returns true if the host is using a TSC-based clocksource.
2959 | * |
2960 | * DO NOT USE this for anything related to migration. You want CLOCK_TAI |
2961 | * for that. |
2962 | */ |
2963 | static bool kvm_get_walltime_and_clockread(struct timespec64 *ts, |
2964 | u64 *tsc_timestamp) |
2965 | { |
2966 | /* checked again under seqlock below */ |
	if (!gtod_is_based_on_tsc(pvclock_gtod_data.clock.vclock_mode))
2968 | return false; |
2969 | |
	return gtod_is_based_on_tsc(do_realtime(ts, tsc_timestamp));
2971 | } |
2972 | #endif |
2973 | |
2974 | /* |
2975 | * |
2976 | * Assuming a stable TSC across physical CPUS, and a stable TSC |
2977 | * across virtual CPUs, the following condition is possible. |
2978 | * Each numbered line represents an event visible to both |
2979 | * CPUs at the next numbered event. |
2980 | * |
2981 | * "timespecX" represents host monotonic time. "tscX" represents |
2982 | * RDTSC value. |
2983 | * |
2984 | * VCPU0 on CPU0 | VCPU1 on CPU1 |
2985 | * |
2986 | * 1. read timespec0,tsc0 |
2987 | * 2. | timespec1 = timespec0 + N |
2988 | * | tsc1 = tsc0 + M |
2989 | * 3. transition to guest | transition to guest |
2990 | * 4. ret0 = timespec0 + (rdtsc - tsc0) | |
2991 | * 5. | ret1 = timespec1 + (rdtsc - tsc1) |
2992 | * | ret1 = timespec0 + N + (rdtsc - (tsc0 + M)) |
2993 | * |
2994 | * Since ret0 update is visible to VCPU1 at time 5, to obey monotonicity: |
2995 | * |
2996 | * - ret0 < ret1 |
2997 | * - timespec0 + (rdtsc - tsc0) < timespec0 + N + (rdtsc - (tsc0 + M)) |
2998 | * ... |
2999 | * - 0 < N - M => M < N |
3000 | * |
3001 | * That is, when timespec0 != timespec1, M < N. Unfortunately that is not |
3002 | * always the case (the difference between two distinct xtime instances |
 * might be smaller than the difference between corresponding TSC reads,
3004 | * when updating guest vcpus pvclock areas). |
3005 | * |
3006 | * To avoid that problem, do not allow visibility of distinct |
3007 | * system_timestamp/tsc_timestamp values simultaneously: use a master |
3008 | * copy of host monotonic time values. Update that master copy |
3009 | * in lockstep. |
3010 | * |
3011 | * Rely on synchronization of host TSCs and guest TSCs for monotonicity. |
3012 | * |
3013 | */ |
3014 | |
3015 | static void pvclock_update_vm_gtod_copy(struct kvm *kvm) |
3016 | { |
3017 | #ifdef CONFIG_X86_64 |
3018 | struct kvm_arch *ka = &kvm->arch; |
3019 | int vclock_mode; |
3020 | bool host_tsc_clocksource, vcpus_matched; |
3021 | |
3022 | lockdep_assert_held(&kvm->arch.tsc_write_lock); |
	vcpus_matched = (ka->nr_vcpus_matched_tsc + 1 ==
			 atomic_read(&kvm->online_vcpus));
3025 | |
3026 | /* |
3027 | * If the host uses TSC clock, then passthrough TSC as stable |
3028 | * to the guest. |
3029 | */ |
	host_tsc_clocksource = kvm_get_time_and_clockread(
					&ka->master_kernel_ns,
					&ka->master_cycle_now);
3033 | |
3034 | ka->use_master_clock = host_tsc_clocksource && vcpus_matched |
3035 | && !ka->backwards_tsc_observed |
3036 | && !ka->boot_vcpu_runs_old_kvmclock; |
3037 | |
3038 | if (ka->use_master_clock) |
		atomic_set(&kvm_guest_has_master_clock, 1);
3040 | |
3041 | vclock_mode = pvclock_gtod_data.clock.vclock_mode; |
	trace_kvm_update_master_clock(ka->use_master_clock, vclock_mode,
				      vcpus_matched);
3044 | #endif |
3045 | } |
3046 | |
3047 | static void kvm_make_mclock_inprogress_request(struct kvm *kvm) |
3048 | { |
3049 | kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS); |
3050 | } |
3051 | |
3052 | static void __kvm_start_pvclock_update(struct kvm *kvm) |
3053 | { |
3054 | raw_spin_lock_irq(&kvm->arch.tsc_write_lock); |
3055 | write_seqcount_begin(&kvm->arch.pvclock_sc); |
3056 | } |
3057 | |
3058 | static void kvm_start_pvclock_update(struct kvm *kvm) |
3059 | { |
3060 | kvm_make_mclock_inprogress_request(kvm); |
3061 | |
3062 | /* no guest entries from this point */ |
3063 | __kvm_start_pvclock_update(kvm); |
3064 | } |
3065 | |
3066 | static void kvm_end_pvclock_update(struct kvm *kvm) |
3067 | { |
3068 | struct kvm_arch *ka = &kvm->arch; |
3069 | struct kvm_vcpu *vcpu; |
3070 | unsigned long i; |
3071 | |
3072 | write_seqcount_end(&ka->pvclock_sc); |
3073 | raw_spin_unlock_irq(&ka->tsc_write_lock); |
3074 | kvm_for_each_vcpu(i, vcpu, kvm) |
3075 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
3076 | |
3077 | /* guest entries allowed */ |
3078 | kvm_for_each_vcpu(i, vcpu, kvm) |
3079 | kvm_clear_request(KVM_REQ_MCLOCK_INPROGRESS, vcpu); |
3080 | } |
3081 | |
3082 | static void kvm_update_masterclock(struct kvm *kvm) |
3083 | { |
3084 | kvm_hv_request_tsc_page_update(kvm); |
3085 | kvm_start_pvclock_update(kvm); |
3086 | pvclock_update_vm_gtod_copy(kvm); |
3087 | kvm_end_pvclock_update(kvm); |
3088 | } |
3089 | |
3090 | /* |
3091 | * Use the kernel's tsc_khz directly if the TSC is constant, otherwise use KVM's |
3092 | * per-CPU value (which may be zero if a CPU is going offline). Note, tsc_khz |
3093 | * can change during boot even if the TSC is constant, as it's possible for KVM |
3094 | * to be loaded before TSC calibration completes. Ideally, KVM would get a |
3095 | * notification when calibration completes, but practically speaking calibration |
3096 | * will complete before userspace is alive enough to create VMs. |
3097 | */ |
3098 | static unsigned long get_cpu_tsc_khz(void) |
3099 | { |
3100 | if (static_cpu_has(X86_FEATURE_CONSTANT_TSC)) |
3101 | return tsc_khz; |
3102 | else |
3103 | return __this_cpu_read(cpu_tsc_khz); |
3104 | } |
3105 | |
3106 | /* Called within read_seqcount_begin/retry for kvm->pvclock_sc. */ |
3107 | static void __get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) |
3108 | { |
3109 | struct kvm_arch *ka = &kvm->arch; |
3110 | struct pvclock_vcpu_time_info hv_clock; |
3111 | |
3112 | /* both __this_cpu_read() and rdtsc() should be on the same cpu */ |
3113 | get_cpu(); |
3114 | |
3115 | data->flags = 0; |
3116 | if (ka->use_master_clock && |
3117 | (static_cpu_has(X86_FEATURE_CONSTANT_TSC) || __this_cpu_read(cpu_tsc_khz))) { |
3118 | #ifdef CONFIG_X86_64 |
3119 | struct timespec64 ts; |
3120 | |
		if (kvm_get_walltime_and_clockread(&ts, &data->host_tsc)) {
3122 | data->realtime = ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec; |
3123 | data->flags |= KVM_CLOCK_REALTIME | KVM_CLOCK_HOST_TSC; |
3124 | } else |
3125 | #endif |
3126 | data->host_tsc = rdtsc(); |
3127 | |
3128 | data->flags |= KVM_CLOCK_TSC_STABLE; |
3129 | hv_clock.tsc_timestamp = ka->master_cycle_now; |
3130 | hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; |
		kvm_get_time_scale(NSEC_PER_SEC, get_cpu_tsc_khz() * 1000LL,
				   &hv_clock.tsc_shift,
				   &hv_clock.tsc_to_system_mul);
		data->clock = __pvclock_read_cycles(&hv_clock, data->host_tsc);
3135 | } else { |
3136 | data->clock = get_kvmclock_base_ns() + ka->kvmclock_offset; |
3137 | } |
3138 | |
3139 | put_cpu(); |
3140 | } |
3141 | |
3142 | static void get_kvmclock(struct kvm *kvm, struct kvm_clock_data *data) |
3143 | { |
3144 | struct kvm_arch *ka = &kvm->arch; |
3145 | unsigned seq; |
3146 | |
3147 | do { |
3148 | seq = read_seqcount_begin(&ka->pvclock_sc); |
3149 | __get_kvmclock(kvm, data); |
3150 | } while (read_seqcount_retry(&ka->pvclock_sc, seq)); |
3151 | } |
3152 | |
3153 | u64 get_kvmclock_ns(struct kvm *kvm) |
3154 | { |
3155 | struct kvm_clock_data data; |
3156 | |
get_kvmclock(kvm, &data);
3158 | return data.clock; |
3159 | } |
3160 | |
3161 | static void kvm_setup_guest_pvclock(struct kvm_vcpu *v, |
3162 | struct gfn_to_pfn_cache *gpc, |
3163 | unsigned int offset, |
3164 | bool force_tsc_unstable) |
3165 | { |
3166 | struct kvm_vcpu_arch *vcpu = &v->arch; |
3167 | struct pvclock_vcpu_time_info *guest_hv_clock; |
3168 | unsigned long flags; |
3169 | |
3170 | read_lock_irqsave(&gpc->lock, flags); |
while (!kvm_gpc_check(gpc, offset + sizeof(*guest_hv_clock))) {
3172 | read_unlock_irqrestore(&gpc->lock, flags); |
3173 | |
if (kvm_gpc_refresh(gpc, offset + sizeof(*guest_hv_clock)))
3175 | return; |
3176 | |
3177 | read_lock_irqsave(&gpc->lock, flags); |
3178 | } |
3179 | |
3180 | guest_hv_clock = (void *)(gpc->khva + offset); |
3181 | |
3182 | /* |
3183 | * This VCPU is paused, but it's legal for a guest to read another |
3184 | * VCPU's kvmclock, so we really have to follow the specification where |
3185 | * it says that version is odd if data is being modified, and even after |
3186 | * it is consistent. |
3187 | */ |
3188 | |
3189 | guest_hv_clock->version = vcpu->hv_clock.version = (guest_hv_clock->version + 1) | 1; |
3190 | smp_wmb(); |
3191 | |
3192 | /* retain PVCLOCK_GUEST_STOPPED if set in guest copy */ |
3193 | vcpu->hv_clock.flags |= (guest_hv_clock->flags & PVCLOCK_GUEST_STOPPED); |
3194 | |
3195 | if (vcpu->pvclock_set_guest_stopped_request) { |
3196 | vcpu->hv_clock.flags |= PVCLOCK_GUEST_STOPPED; |
3197 | vcpu->pvclock_set_guest_stopped_request = false; |
3198 | } |
3199 | |
3200 | memcpy(guest_hv_clock, &vcpu->hv_clock, sizeof(*guest_hv_clock)); |
3201 | |
3202 | if (force_tsc_unstable) |
3203 | guest_hv_clock->flags &= ~PVCLOCK_TSC_STABLE_BIT; |
3204 | |
3205 | smp_wmb(); |
3206 | |
3207 | guest_hv_clock->version = ++vcpu->hv_clock.version; |
3208 | |
3209 | kvm_gpc_mark_dirty_in_slot(gpc); |
3210 | read_unlock_irqrestore(&gpc->lock, flags); |
3211 | |
trace_kvm_pvclock_update(v->vcpu_id, &vcpu->hv_clock);
3213 | } |
3214 | |
3215 | static int kvm_guest_time_update(struct kvm_vcpu *v) |
3216 | { |
3217 | unsigned long flags, tgt_tsc_khz; |
3218 | unsigned seq; |
3219 | struct kvm_vcpu_arch *vcpu = &v->arch; |
3220 | struct kvm_arch *ka = &v->kvm->arch; |
3221 | s64 kernel_ns; |
3222 | u64 tsc_timestamp, host_tsc; |
3223 | u8 pvclock_flags; |
3224 | bool use_master_clock; |
3225 | #ifdef CONFIG_KVM_XEN |
3226 | /* |
* For Xen guests we may need to override PVCLOCK_TSC_STABLE_BIT, as Xen
* does not set this bit unless it is explicitly told to use the TSC as
* its clocksource. That default behaviour led to bugs in some guest
* kernels, which misbehave if they observe PVCLOCK_TSC_STABLE_BIT in
* the pvclock flags.
3231 | */ |
3232 | bool xen_pvclock_tsc_unstable = |
3233 | ka->xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE; |
3234 | #endif |
3235 | |
3236 | kernel_ns = 0; |
3237 | host_tsc = 0; |
3238 | |
3239 | /* |
3240 | * If the host uses TSC clock, then passthrough TSC as stable |
3241 | * to the guest. |
3242 | */ |
3243 | do { |
3244 | seq = read_seqcount_begin(&ka->pvclock_sc); |
3245 | use_master_clock = ka->use_master_clock; |
3246 | if (use_master_clock) { |
3247 | host_tsc = ka->master_cycle_now; |
3248 | kernel_ns = ka->master_kernel_ns; |
3249 | } |
3250 | } while (read_seqcount_retry(&ka->pvclock_sc, seq)); |
3251 | |
3252 | /* Keep irq disabled to prevent changes to the clock */ |
3253 | local_irq_save(flags); |
3254 | tgt_tsc_khz = get_cpu_tsc_khz(); |
3255 | if (unlikely(tgt_tsc_khz == 0)) { |
3256 | local_irq_restore(flags); |
kvm_make_request(KVM_REQ_CLOCK_UPDATE, v);
3258 | return 1; |
3259 | } |
3260 | if (!use_master_clock) { |
3261 | host_tsc = rdtsc(); |
3262 | kernel_ns = get_kvmclock_base_ns(); |
3263 | } |
3264 | |
3265 | tsc_timestamp = kvm_read_l1_tsc(v, host_tsc); |
3266 | |
3267 | /* |
3268 | * We may have to catch up the TSC to match elapsed wall clock |
3269 | * time for two reasons, even if kvmclock is used. |
3270 | * 1) CPU could have been running below the maximum TSC rate |
3271 | * 2) Broken TSC compensation resets the base at each VCPU |
3272 | * entry to avoid unknown leaps of TSC even when running |
3273 | * again on the same CPU. This may cause apparent elapsed |
3274 | * time to disappear, and the guest to stand still or run |
3275 | * very slowly. |
3276 | */ |
3277 | if (vcpu->tsc_catchup) { |
u64 tsc = compute_guest_tsc(v, kernel_ns);
3279 | if (tsc > tsc_timestamp) { |
adjust_tsc_offset_guest(v, tsc - tsc_timestamp);
3281 | tsc_timestamp = tsc; |
3282 | } |
3283 | } |
3284 | |
3285 | local_irq_restore(flags); |
3286 | |
3287 | /* With all the info we got, fill in the values */ |
3288 | |
3289 | if (kvm_caps.has_tsc_control) |
tgt_tsc_khz = kvm_scale_tsc(tgt_tsc_khz,
v->arch.l1_tsc_scaling_ratio);
3292 | |
3293 | if (unlikely(vcpu->hw_tsc_khz != tgt_tsc_khz)) { |
kvm_get_time_scale(NSEC_PER_SEC, tgt_tsc_khz * 1000LL,
&vcpu->hv_clock.tsc_shift,
&vcpu->hv_clock.tsc_to_system_mul);
3297 | vcpu->hw_tsc_khz = tgt_tsc_khz; |
kvm_xen_update_tsc_info(v);
3299 | } |
3300 | |
3301 | vcpu->hv_clock.tsc_timestamp = tsc_timestamp; |
3302 | vcpu->hv_clock.system_time = kernel_ns + v->kvm->arch.kvmclock_offset; |
3303 | vcpu->last_guest_tsc = tsc_timestamp; |
3304 | |
3305 | /* If the host uses TSC clocksource, then it is stable */ |
3306 | pvclock_flags = 0; |
3307 | if (use_master_clock) |
3308 | pvclock_flags |= PVCLOCK_TSC_STABLE_BIT; |
3309 | |
3310 | vcpu->hv_clock.flags = pvclock_flags; |
3311 | |
3312 | if (vcpu->pv_time.active) |
kvm_setup_guest_pvclock(v, &vcpu->pv_time, 0, false);
3314 | #ifdef CONFIG_KVM_XEN |
3315 | if (vcpu->xen.vcpu_info_cache.active) |
kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_info_cache,
offsetof(struct compat_vcpu_info, time),
xen_pvclock_tsc_unstable);
3319 | if (vcpu->xen.vcpu_time_info_cache.active) |
kvm_setup_guest_pvclock(v, &vcpu->xen.vcpu_time_info_cache, 0,
xen_pvclock_tsc_unstable);
3322 | #endif |
kvm_hv_setup_tsc_page(v->kvm, &vcpu->hv_clock);
3324 | return 0; |
3325 | } |
3326 | |
3327 | /* |
3328 | * The pvclock_wall_clock ABI tells the guest the wall clock time at |
3329 | * which it started (i.e. its epoch, when its kvmclock was zero). |
3330 | * |
3331 | * In fact those clocks are subtly different; wall clock frequency is |
3332 | * adjusted by NTP and has leap seconds, while the kvmclock is a |
3333 | * simple function of the TSC without any such adjustment. |
3334 | * |
3335 | * Perhaps the ABI should have exposed CLOCK_TAI and a ratio between |
3336 | * that and kvmclock, but even that would be subject to change over |
3337 | * time. |
3338 | * |
3339 | * Attempt to calculate the epoch at a given moment using the *same* |
3340 | * TSC reading via kvm_get_walltime_and_clockread() to obtain both |
3341 | * wallclock and kvmclock times, and subtracting one from the other. |
3342 | * |
3343 | * Fall back to using their values at slightly different moments by |
3344 | * calling ktime_get_real_ns() and get_kvmclock_ns() separately. |
3345 | */ |
3346 | uint64_t kvm_get_wall_clock_epoch(struct kvm *kvm) |
3347 | { |
3348 | #ifdef CONFIG_X86_64 |
3349 | struct pvclock_vcpu_time_info hv_clock; |
3350 | struct kvm_arch *ka = &kvm->arch; |
3351 | unsigned long seq, local_tsc_khz; |
3352 | struct timespec64 ts; |
3353 | uint64_t host_tsc; |
3354 | |
3355 | do { |
3356 | seq = read_seqcount_begin(&ka->pvclock_sc); |
3357 | |
3358 | local_tsc_khz = 0; |
3359 | if (!ka->use_master_clock) |
3360 | break; |
3361 | |
3362 | /* |
3363 | * The TSC read and the call to get_cpu_tsc_khz() must happen |
3364 | * on the same CPU. |
3365 | */ |
3366 | get_cpu(); |
3367 | |
3368 | local_tsc_khz = get_cpu_tsc_khz(); |
3369 | |
3370 | if (local_tsc_khz && |
!kvm_get_walltime_and_clockread(&ts, &host_tsc))
3372 | local_tsc_khz = 0; /* Fall back to old method */ |
3373 | |
3374 | put_cpu(); |
3375 | |
3376 | /* |
3377 | * These values must be snapshotted within the seqcount loop. |
3378 | * After that, it's just mathematics which can happen on any |
3379 | * CPU at any time. |
3380 | */ |
3381 | hv_clock.tsc_timestamp = ka->master_cycle_now; |
3382 | hv_clock.system_time = ka->master_kernel_ns + ka->kvmclock_offset; |
3383 | |
3384 | } while (read_seqcount_retry(&ka->pvclock_sc, seq)); |
3385 | |
3386 | /* |
3387 | * If the conditions were right, and obtaining the wallclock+TSC was |
3388 | * successful, calculate the KVM clock at the corresponding time and |
3389 | * subtract one from the other to get the guest's epoch in nanoseconds |
3390 | * since 1970-01-01. |
3391 | */ |
3392 | if (local_tsc_khz) { |
kvm_get_time_scale(NSEC_PER_SEC, local_tsc_khz * NSEC_PER_USEC,
&hv_clock.tsc_shift,
&hv_clock.tsc_to_system_mul);
3396 | return ts.tv_nsec + NSEC_PER_SEC * ts.tv_sec - |
__pvclock_read_cycles(&hv_clock, host_tsc);
3398 | } |
3399 | #endif |
3400 | return ktime_get_real_ns() - get_kvmclock_ns(kvm); |
3401 | } |
3402 | |
3403 | /* |
3404 | * kvmclock updates which are isolated to a given vcpu, such as |
3405 | * vcpu->cpu migration, should not allow system_timestamp from |
3406 | * the rest of the vcpus to remain static. Otherwise ntp frequency |
3407 | * correction applies to one vcpu's system_timestamp but not |
3408 | * the others. |
3409 | * |
3410 | * So in those cases, request a kvmclock update for all vcpus. |
3411 | * We need to rate-limit these requests though, as they can |
3412 | * considerably slow guests that have a large number of vcpus. |
3413 | * The time for a remote vcpu to update its kvmclock is bound |
3414 | * by the delay we use to rate-limit the updates. |
3415 | */ |
3416 | |
3417 | #define KVMCLOCK_UPDATE_DELAY msecs_to_jiffies(100) |
3418 | |
3419 | static void kvmclock_update_fn(struct work_struct *work) |
3420 | { |
3421 | unsigned long i; |
3422 | struct delayed_work *dwork = to_delayed_work(work); |
3423 | struct kvm_arch *ka = container_of(dwork, struct kvm_arch, |
3424 | kvmclock_update_work); |
3425 | struct kvm *kvm = container_of(ka, struct kvm, arch); |
3426 | struct kvm_vcpu *vcpu; |
3427 | |
3428 | kvm_for_each_vcpu(i, vcpu, kvm) { |
3429 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
3430 | kvm_vcpu_kick(vcpu); |
3431 | } |
3432 | } |
3433 | |
3434 | static void kvm_gen_kvmclock_update(struct kvm_vcpu *v) |
3435 | { |
3436 | struct kvm *kvm = v->kvm; |
3437 | |
3438 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu: v); |
schedule_delayed_work(&kvm->arch.kvmclock_update_work,
3440 | KVMCLOCK_UPDATE_DELAY); |
3441 | } |
3442 | |
3443 | #define KVMCLOCK_SYNC_PERIOD (300 * HZ) |
3444 | |
3445 | static void kvmclock_sync_fn(struct work_struct *work) |
3446 | { |
3447 | struct delayed_work *dwork = to_delayed_work(work); |
3448 | struct kvm_arch *ka = container_of(dwork, struct kvm_arch, |
3449 | kvmclock_sync_work); |
3450 | struct kvm *kvm = container_of(ka, struct kvm, arch); |
3451 | |
schedule_delayed_work(&kvm->arch.kvmclock_update_work, 0);
schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
3454 | KVMCLOCK_SYNC_PERIOD); |
3455 | } |
3456 | |
3457 | /* These helpers are safe iff @msr is known to be an MCx bank MSR. */ |
3458 | static bool is_mci_control_msr(u32 msr) |
3459 | { |
3460 | return (msr & 3) == 0; |
3461 | } |
3462 | static bool is_mci_status_msr(u32 msr) |
3463 | { |
3464 | return (msr & 3) == 1; |
3465 | } |
3466 | |
3467 | /* |
3468 | * On AMD, HWCR[McStatusWrEn] controls whether setting MCi_STATUS results in #GP. |
3469 | */ |
3470 | static bool can_set_mci_status(struct kvm_vcpu *vcpu) |
3471 | { |
3472 | /* McStatusWrEn enabled? */ |
3473 | if (guest_cpuid_is_amd_compatible(vcpu)) |
3474 | return !!(vcpu->arch.msr_hwcr & BIT_ULL(18)); |
3475 | |
3476 | return false; |
3477 | } |
3478 | |
3479 | static int set_msr_mce(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
3480 | { |
3481 | u64 mcg_cap = vcpu->arch.mcg_cap; |
3482 | unsigned bank_num = mcg_cap & 0xff; |
3483 | u32 msr = msr_info->index; |
3484 | u64 data = msr_info->data; |
3485 | u32 offset, last_msr; |
3486 | |
3487 | switch (msr) { |
3488 | case MSR_IA32_MCG_STATUS: |
3489 | vcpu->arch.mcg_status = data; |
3490 | break; |
3491 | case MSR_IA32_MCG_CTL: |
3492 | if (!(mcg_cap & MCG_CTL_P) && |
3493 | (data || !msr_info->host_initiated)) |
3494 | return 1; |
3495 | if (data != 0 && data != ~(u64)0) |
3496 | return 1; |
3497 | vcpu->arch.mcg_ctl = data; |
3498 | break; |
3499 | case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: |
3500 | last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; |
3501 | if (msr > last_msr) |
3502 | return 1; |
3503 | |
3504 | if (!(mcg_cap & MCG_CMCI_P) && (data || !msr_info->host_initiated)) |
3505 | return 1; |
3506 | /* An attempt to write a 1 to a reserved bit raises #GP */ |
3507 | if (data & ~(MCI_CTL2_CMCI_EN | MCI_CTL2_CMCI_THRESHOLD_MASK)) |
3508 | return 1; |
3509 | offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, |
3510 | last_msr + 1 - MSR_IA32_MC0_CTL2); |
3511 | vcpu->arch.mci_ctl2_banks[offset] = data; |
3512 | break; |
3513 | case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: |
3514 | last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; |
3515 | if (msr > last_msr) |
3516 | return 1; |
3517 | |
3518 | /* |
3519 | * Only 0 or all 1s can be written to IA32_MCi_CTL, all other |
3520 | * values are architecturally undefined. But, some Linux |
3521 | * kernels clear bit 10 in bank 4 to workaround a BIOS/GART TLB |
3522 | * issue on AMD K8s, allow bit 10 to be clear when setting all |
3523 | * other bits in order to avoid an uncaught #GP in the guest. |
3524 | * |
3525 | * UNIXWARE clears bit 0 of MC1_CTL to ignore correctable, |
3526 | * single-bit ECC data errors. |
3527 | */ |
3528 | if (is_mci_control_msr(msr) && |
3529 | data != 0 && (data | (1 << 10) | 1) != ~(u64)0) |
3530 | return 1; |
3531 | |
3532 | /* |
3533 | * All CPUs allow writing 0 to MCi_STATUS MSRs to clear the MSR. |
3534 | * AMD-based CPUs allow non-zero values, but if and only if |
3535 | * HWCR[McStatusWrEn] is set. |
3536 | */ |
3537 | if (!msr_info->host_initiated && is_mci_status_msr(msr) && |
3538 | data != 0 && !can_set_mci_status(vcpu)) |
3539 | return 1; |
3540 | |
3541 | offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, |
3542 | last_msr + 1 - MSR_IA32_MC0_CTL); |
3543 | vcpu->arch.mce_banks[offset] = data; |
3544 | break; |
3545 | default: |
3546 | return 1; |
3547 | } |
3548 | return 0; |
3549 | } |
3550 | |
3551 | static inline bool kvm_pv_async_pf_enabled(struct kvm_vcpu *vcpu) |
3552 | { |
3553 | u64 mask = KVM_ASYNC_PF_ENABLED | KVM_ASYNC_PF_DELIVERY_AS_INT; |
3554 | |
3555 | return (vcpu->arch.apf.msr_en_val & mask) == mask; |
3556 | } |
3557 | |
3558 | static int kvm_pv_enable_async_pf(struct kvm_vcpu *vcpu, u64 data) |
3559 | { |
3560 | gpa_t gpa = data & ~0x3f; |
3561 | |
/* Bits 4:5 are reserved, should be zero */
3563 | if (data & 0x30) |
3564 | return 1; |
3565 | |
3566 | if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_VMEXIT) && |
3567 | (data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT)) |
3568 | return 1; |
3569 | |
3570 | if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT) && |
3571 | (data & KVM_ASYNC_PF_DELIVERY_AS_INT)) |
3572 | return 1; |
3573 | |
3574 | if (!lapic_in_kernel(vcpu)) |
3575 | return data ? 1 : 0; |
3576 | |
3577 | vcpu->arch.apf.msr_en_val = data; |
3578 | |
3579 | if (!kvm_pv_async_pf_enabled(vcpu)) { |
3580 | kvm_clear_async_pf_completion_queue(vcpu); |
3581 | kvm_async_pf_hash_reset(vcpu); |
3582 | return 0; |
3583 | } |
3584 | |
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, &vcpu->arch.apf.data, gpa,
sizeof(u64)))
3587 | return 1; |
3588 | |
3589 | vcpu->arch.apf.send_user_only = !(data & KVM_ASYNC_PF_SEND_ALWAYS); |
3590 | vcpu->arch.apf.delivery_as_pf_vmexit = data & KVM_ASYNC_PF_DELIVERY_AS_PF_VMEXIT; |
3591 | |
3592 | kvm_async_pf_wakeup_all(vcpu); |
3593 | |
3594 | return 0; |
3595 | } |
3596 | |
3597 | static int kvm_pv_enable_async_pf_int(struct kvm_vcpu *vcpu, u64 data) |
3598 | { |
3599 | /* Bits 8-63 are reserved */ |
3600 | if (data >> 8) |
3601 | return 1; |
3602 | |
3603 | if (!lapic_in_kernel(vcpu)) |
3604 | return 1; |
3605 | |
3606 | vcpu->arch.apf.msr_int_val = data; |
3607 | |
3608 | vcpu->arch.apf.vec = data & KVM_ASYNC_PF_VEC_MASK; |
3609 | |
3610 | return 0; |
3611 | } |
3612 | |
3613 | static void kvmclock_reset(struct kvm_vcpu *vcpu) |
3614 | { |
kvm_gpc_deactivate(&vcpu->arch.pv_time);
3616 | vcpu->arch.time = 0; |
3617 | } |
3618 | |
3619 | static void kvm_vcpu_flush_tlb_all(struct kvm_vcpu *vcpu) |
3620 | { |
3621 | ++vcpu->stat.tlb_flush; |
3622 | static_call(kvm_x86_flush_tlb_all)(vcpu); |
3623 | |
3624 | /* Flushing all ASIDs flushes the current ASID... */ |
3625 | kvm_clear_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu); |
3626 | } |
3627 | |
3628 | static void kvm_vcpu_flush_tlb_guest(struct kvm_vcpu *vcpu) |
3629 | { |
3630 | ++vcpu->stat.tlb_flush; |
3631 | |
3632 | if (!tdp_enabled) { |
3633 | /* |
3634 | * A TLB flush on behalf of the guest is equivalent to |
3635 | * INVPCID(all), toggling CR4.PGE, etc., which requires |
3636 | * a forced sync of the shadow page tables. Ensure all the |
3637 | * roots are synced and the guest TLB in hardware is clean. |
3638 | */ |
3639 | kvm_mmu_sync_roots(vcpu); |
3640 | kvm_mmu_sync_prev_roots(vcpu); |
3641 | } |
3642 | |
3643 | static_call(kvm_x86_flush_tlb_guest)(vcpu); |
3644 | |
3645 | /* |
3646 | * Flushing all "guest" TLB is always a superset of Hyper-V's fine |
3647 | * grained flushing. |
3648 | */ |
3649 | kvm_hv_vcpu_purge_flush_tlb(vcpu); |
3650 | } |
3651 | |
3652 | |
3653 | static inline void kvm_vcpu_flush_tlb_current(struct kvm_vcpu *vcpu) |
3654 | { |
3655 | ++vcpu->stat.tlb_flush; |
3656 | static_call(kvm_x86_flush_tlb_current)(vcpu); |
3657 | } |
3658 | |
3659 | /* |
3660 | * Service "local" TLB flush requests, which are specific to the current MMU |
3661 | * context. In addition to the generic event handling in vcpu_enter_guest(), |
3662 | * TLB flushes that are targeted at an MMU context also need to be serviced |
* prior to nested VM-Enter/VM-Exit.
3664 | */ |
3665 | void kvm_service_local_tlb_flush_requests(struct kvm_vcpu *vcpu) |
3666 | { |
3667 | if (kvm_check_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu)) |
3668 | kvm_vcpu_flush_tlb_current(vcpu); |
3669 | |
3670 | if (kvm_check_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu)) |
3671 | kvm_vcpu_flush_tlb_guest(vcpu); |
3672 | } |
3673 | EXPORT_SYMBOL_GPL(kvm_service_local_tlb_flush_requests); |
3674 | |
3675 | static void record_steal_time(struct kvm_vcpu *vcpu) |
3676 | { |
3677 | struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; |
3678 | struct kvm_steal_time __user *st; |
3679 | struct kvm_memslots *slots; |
3680 | gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; |
3681 | u64 steal; |
3682 | u32 version; |
3683 | |
if (kvm_xen_msr_enabled(vcpu->kvm)) {
3685 | kvm_xen_runstate_set_running(vcpu); |
3686 | return; |
3687 | } |
3688 | |
3689 | if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) |
3690 | return; |
3691 | |
3692 | if (WARN_ON_ONCE(current->mm != vcpu->kvm->mm)) |
3693 | return; |
3694 | |
slots = kvm_memslots(vcpu->kvm);
3696 | |
3697 | if (unlikely(slots->generation != ghc->generation || |
3698 | gpa != ghc->gpa || |
3699 | kvm_is_error_hva(ghc->hva) || !ghc->memslot)) { |
3700 | /* We rely on the fact that it fits in a single page. */ |
3701 | BUILD_BUG_ON((sizeof(*st) - 1) & KVM_STEAL_VALID_BITS); |
3702 | |
if (kvm_gfn_to_hva_cache_init(vcpu->kvm, ghc, gpa, sizeof(*st)) ||
kvm_is_error_hva(ghc->hva) || !ghc->memslot)
3705 | return; |
3706 | } |
3707 | |
3708 | st = (struct kvm_steal_time __user *)ghc->hva; |
3709 | /* |
3710 | * Doing a TLB flush here, on the guest's behalf, can avoid |
3711 | * expensive IPIs. |
3712 | */ |
3713 | if (guest_pv_has(vcpu, KVM_FEATURE_PV_TLB_FLUSH)) { |
3714 | u8 st_preempted = 0; |
3715 | int err = -EFAULT; |
3716 | |
3717 | if (!user_access_begin(st, sizeof(*st))) |
3718 | return; |
3719 | |
3720 | asm volatile("1: xchgb %0, %2\n" |
3721 | "xor %1, %1\n" |
3722 | "2:\n" |
3723 | _ASM_EXTABLE_UA(1b, 2b) |
3724 | : "+q" (st_preempted), |
3725 | "+&r" (err), |
3726 | "+m" (st->preempted)); |
3727 | if (err) |
3728 | goto out; |
3729 | |
3730 | user_access_end(); |
3731 | |
3732 | vcpu->arch.st.preempted = 0; |
3733 | |
trace_kvm_pv_tlb_flush(vcpu->vcpu_id,
st_preempted & KVM_VCPU_FLUSH_TLB);
3736 | if (st_preempted & KVM_VCPU_FLUSH_TLB) |
3737 | kvm_vcpu_flush_tlb_guest(vcpu); |
3738 | |
3739 | if (!user_access_begin(st, sizeof(*st))) |
3740 | goto dirty; |
3741 | } else { |
3742 | if (!user_access_begin(st, sizeof(*st))) |
3743 | return; |
3744 | |
3745 | unsafe_put_user(0, &st->preempted, out); |
3746 | vcpu->arch.st.preempted = 0; |
3747 | } |
3748 | |
3749 | unsafe_get_user(version, &st->version, out); |
3750 | if (version & 1) |
3751 | version += 1; /* first time write, random junk */ |
3752 | |
3753 | version += 1; |
3754 | unsafe_put_user(version, &st->version, out); |
3755 | |
3756 | smp_wmb(); |
3757 | |
3758 | unsafe_get_user(steal, &st->steal, out); |
3759 | steal += current->sched_info.run_delay - |
3760 | vcpu->arch.st.last_steal; |
3761 | vcpu->arch.st.last_steal = current->sched_info.run_delay; |
3762 | unsafe_put_user(steal, &st->steal, out); |
3763 | |
3764 | version += 1; |
3765 | unsafe_put_user(version, &st->version, out); |
3766 | |
3767 | out: |
3768 | user_access_end(); |
3769 | dirty: |
mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
3771 | } |
3772 | |
3773 | static bool kvm_is_msr_to_save(u32 msr_index) |
3774 | { |
3775 | unsigned int i; |
3776 | |
3777 | for (i = 0; i < num_msrs_to_save; i++) { |
3778 | if (msrs_to_save[i] == msr_index) |
3779 | return true; |
3780 | } |
3781 | |
3782 | return false; |
3783 | } |
3784 | |
3785 | int kvm_set_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
3786 | { |
3787 | u32 msr = msr_info->index; |
3788 | u64 data = msr_info->data; |
3789 | |
3790 | if (msr && msr == vcpu->kvm->arch.xen_hvm_config.msr) |
3791 | return kvm_xen_write_hypercall_page(vcpu, data); |
3792 | |
3793 | switch (msr) { |
3794 | case MSR_AMD64_NB_CFG: |
3795 | case MSR_IA32_UCODE_WRITE: |
3796 | case MSR_VM_HSAVE_PA: |
3797 | case MSR_AMD64_PATCH_LOADER: |
3798 | case MSR_AMD64_BU_CFG2: |
3799 | case MSR_AMD64_DC_CFG: |
3800 | case MSR_AMD64_TW_CFG: |
3801 | case MSR_F15H_EX_CFG: |
3802 | break; |
3803 | |
3804 | case MSR_IA32_UCODE_REV: |
3805 | if (msr_info->host_initiated) |
3806 | vcpu->arch.microcode_version = data; |
3807 | break; |
3808 | case MSR_IA32_ARCH_CAPABILITIES: |
3809 | if (!msr_info->host_initiated) |
3810 | return 1; |
3811 | vcpu->arch.arch_capabilities = data; |
3812 | break; |
3813 | case MSR_IA32_PERF_CAPABILITIES: |
3814 | if (!msr_info->host_initiated) |
3815 | return 1; |
3816 | if (data & ~kvm_caps.supported_perf_cap) |
3817 | return 1; |
3818 | |
3819 | /* |
3820 | * Note, this is not just a performance optimization! KVM |
3821 | * disallows changing feature MSRs after the vCPU has run; PMU |
3822 | * refresh will bug the VM if called after the vCPU has run. |
3823 | */ |
3824 | if (vcpu->arch.perf_capabilities == data) |
3825 | break; |
3826 | |
3827 | vcpu->arch.perf_capabilities = data; |
3828 | kvm_pmu_refresh(vcpu); |
3829 | break; |
3830 | case MSR_IA32_PRED_CMD: { |
3831 | u64 reserved_bits = ~(PRED_CMD_IBPB | PRED_CMD_SBPB); |
3832 | |
3833 | if (!msr_info->host_initiated) { |
3834 | if ((!guest_has_pred_cmd_msr(vcpu))) |
3835 | return 1; |
3836 | |
3837 | if (!guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && |
3838 | !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) |
3839 | reserved_bits |= PRED_CMD_IBPB; |
3840 | |
3841 | if (!guest_cpuid_has(vcpu, X86_FEATURE_SBPB)) |
3842 | reserved_bits |= PRED_CMD_SBPB; |
3843 | } |
3844 | |
3845 | if (!boot_cpu_has(X86_FEATURE_IBPB)) |
3846 | reserved_bits |= PRED_CMD_IBPB; |
3847 | |
3848 | if (!boot_cpu_has(X86_FEATURE_SBPB)) |
3849 | reserved_bits |= PRED_CMD_SBPB; |
3850 | |
3851 | if (data & reserved_bits) |
3852 | return 1; |
3853 | |
3854 | if (!data) |
3855 | break; |
3856 | |
wrmsrl(MSR_IA32_PRED_CMD, data);
3858 | break; |
3859 | } |
3860 | case MSR_IA32_FLUSH_CMD: |
3861 | if (!msr_info->host_initiated && |
3862 | !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D)) |
3863 | return 1; |
3864 | |
3865 | if (!boot_cpu_has(X86_FEATURE_FLUSH_L1D) || (data & ~L1D_FLUSH)) |
3866 | return 1; |
3867 | if (!data) |
3868 | break; |
3869 | |
3870 | wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH); |
3871 | break; |
3872 | case MSR_EFER: |
3873 | return set_efer(vcpu, msr_info); |
3874 | case MSR_K7_HWCR: |
3875 | data &= ~(u64)0x40; /* ignore flush filter disable */ |
3876 | data &= ~(u64)0x100; /* ignore ignne emulation enable */ |
3877 | data &= ~(u64)0x8; /* ignore TLB cache disable */ |
3878 | |
3879 | /* |
3880 | * Allow McStatusWrEn and TscFreqSel. (Linux guests from v3.2 |
3881 | * through at least v6.6 whine if TscFreqSel is clear, |
* depending on F/M/S.)
3883 | */ |
3884 | if (data & ~(BIT_ULL(18) | BIT_ULL(24))) { |
3885 | kvm_pr_unimpl_wrmsr(vcpu, msr, data); |
3886 | return 1; |
3887 | } |
3888 | vcpu->arch.msr_hwcr = data; |
3889 | break; |
3890 | case MSR_FAM10H_MMIO_CONF_BASE: |
3891 | if (data != 0) { |
3892 | kvm_pr_unimpl_wrmsr(vcpu, msr, data); |
3893 | return 1; |
3894 | } |
3895 | break; |
3896 | case MSR_IA32_CR_PAT: |
3897 | if (!kvm_pat_valid(data)) |
3898 | return 1; |
3899 | |
3900 | vcpu->arch.pat = data; |
3901 | break; |
3902 | case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: |
3903 | case MSR_MTRRdefType: |
3904 | return kvm_mtrr_set_msr(vcpu, msr, data); |
3905 | case MSR_IA32_APICBASE: |
3906 | return kvm_set_apic_base(vcpu, msr_info); |
3907 | case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: |
3908 | return kvm_x2apic_msr_write(vcpu, msr, data); |
3909 | case MSR_IA32_TSC_DEADLINE: |
3910 | kvm_set_lapic_tscdeadline_msr(vcpu, data); |
3911 | break; |
3912 | case MSR_IA32_TSC_ADJUST: |
3913 | if (guest_cpuid_has(vcpu, X86_FEATURE_TSC_ADJUST)) { |
3914 | if (!msr_info->host_initiated) { |
3915 | s64 adj = data - vcpu->arch.ia32_tsc_adjust_msr; |
adjust_tsc_offset_guest(vcpu, adj);
/*
* Before returning to the guest, tsc_timestamp must be
* adjusted as well, otherwise the guest's per-CPU pvclock
* time could jump.
*/
3920 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
3921 | } |
3922 | vcpu->arch.ia32_tsc_adjust_msr = data; |
3923 | } |
3924 | break; |
3925 | case MSR_IA32_MISC_ENABLE: { |
3926 | u64 old_val = vcpu->arch.ia32_misc_enable_msr; |
3927 | |
3928 | if (!msr_info->host_initiated) { |
3929 | /* RO bits */ |
3930 | if ((old_val ^ data) & MSR_IA32_MISC_ENABLE_PMU_RO_MASK) |
3931 | return 1; |
3932 | |
3933 | /* R bits, i.e. writes are ignored, but don't fault. */ |
3934 | data = data & ~MSR_IA32_MISC_ENABLE_EMON; |
3935 | data |= old_val & MSR_IA32_MISC_ENABLE_EMON; |
3936 | } |
3937 | |
if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_MISC_ENABLE_NO_MWAIT) &&
3939 | ((old_val ^ data) & MSR_IA32_MISC_ENABLE_MWAIT)) { |
3940 | if (!guest_cpuid_has(vcpu, X86_FEATURE_XMM3)) |
3941 | return 1; |
3942 | vcpu->arch.ia32_misc_enable_msr = data; |
3943 | kvm_update_cpuid_runtime(vcpu); |
3944 | } else { |
3945 | vcpu->arch.ia32_misc_enable_msr = data; |
3946 | } |
3947 | break; |
3948 | } |
3949 | case MSR_IA32_SMBASE: |
3950 | if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) |
3951 | return 1; |
3952 | vcpu->arch.smbase = data; |
3953 | break; |
3954 | case MSR_IA32_POWER_CTL: |
3955 | vcpu->arch.msr_ia32_power_ctl = data; |
3956 | break; |
3957 | case MSR_IA32_TSC: |
3958 | if (msr_info->host_initiated) { |
kvm_synchronize_tsc(vcpu, &data);
3960 | } else { |
u64 adj = kvm_compute_l1_tsc_offset(vcpu, data) - vcpu->arch.l1_tsc_offset;
adjust_tsc_offset_guest(vcpu, adj);
3963 | vcpu->arch.ia32_tsc_adjust_msr += adj; |
3964 | } |
3965 | break; |
3966 | case MSR_IA32_XSS: |
3967 | if (!msr_info->host_initiated && |
3968 | !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) |
3969 | return 1; |
3970 | /* |
3971 | * KVM supports exposing PT to the guest, but does not support |
3972 | * IA32_XSS[bit 8]. Guests have to use RDMSR/WRMSR rather than |
3973 | * XSAVES/XRSTORS to save/restore PT MSRs. |
3974 | */ |
3975 | if (data & ~kvm_caps.supported_xss) |
3976 | return 1; |
3977 | vcpu->arch.ia32_xss = data; |
3978 | kvm_update_cpuid_runtime(vcpu); |
3979 | break; |
3980 | case MSR_SMI_COUNT: |
3981 | if (!msr_info->host_initiated) |
3982 | return 1; |
3983 | vcpu->arch.smi_count = data; |
3984 | break; |
3985 | case MSR_KVM_WALL_CLOCK_NEW: |
3986 | if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) |
3987 | return 1; |
3988 | |
3989 | vcpu->kvm->arch.wall_clock = data; |
kvm_write_wall_clock(vcpu->kvm, data, 0);
3991 | break; |
3992 | case MSR_KVM_WALL_CLOCK: |
3993 | if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) |
3994 | return 1; |
3995 | |
3996 | vcpu->kvm->arch.wall_clock = data; |
kvm_write_wall_clock(vcpu->kvm, data, 0);
3998 | break; |
3999 | case MSR_KVM_SYSTEM_TIME_NEW: |
4000 | if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) |
4001 | return 1; |
4002 | |
kvm_write_system_time(vcpu, data, false, msr_info->host_initiated);
4004 | break; |
4005 | case MSR_KVM_SYSTEM_TIME: |
4006 | if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) |
4007 | return 1; |
4008 | |
kvm_write_system_time(vcpu, data, true, msr_info->host_initiated);
4010 | break; |
4011 | case MSR_KVM_ASYNC_PF_EN: |
4012 | if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) |
4013 | return 1; |
4014 | |
4015 | if (kvm_pv_enable_async_pf(vcpu, data)) |
4016 | return 1; |
4017 | break; |
4018 | case MSR_KVM_ASYNC_PF_INT: |
4019 | if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) |
4020 | return 1; |
4021 | |
4022 | if (kvm_pv_enable_async_pf_int(vcpu, data)) |
4023 | return 1; |
4024 | break; |
4025 | case MSR_KVM_ASYNC_PF_ACK: |
4026 | if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) |
4027 | return 1; |
4028 | if (data & 0x1) { |
4029 | vcpu->arch.apf.pageready_pending = false; |
4030 | kvm_check_async_pf_completion(vcpu); |
4031 | } |
4032 | break; |
4033 | case MSR_KVM_STEAL_TIME: |
4034 | if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) |
4035 | return 1; |
4036 | |
4037 | if (unlikely(!sched_info_on())) |
4038 | return 1; |
4039 | |
4040 | if (data & KVM_STEAL_RESERVED_MASK) |
4041 | return 1; |
4042 | |
4043 | vcpu->arch.st.msr_val = data; |
4044 | |
4045 | if (!(data & KVM_MSR_ENABLED)) |
4046 | break; |
4047 | |
4048 | kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); |
4049 | |
4050 | break; |
4051 | case MSR_KVM_PV_EOI_EN: |
4052 | if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) |
4053 | return 1; |
4054 | |
if (kvm_lapic_set_pv_eoi(vcpu, data, sizeof(u8)))
4056 | return 1; |
4057 | break; |
4058 | |
4059 | case MSR_KVM_POLL_CONTROL: |
4060 | if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) |
4061 | return 1; |
4062 | |
4063 | /* only enable bit supported */ |
4064 | if (data & (-1ULL << 1)) |
4065 | return 1; |
4066 | |
4067 | vcpu->arch.msr_kvm_poll_control = data; |
4068 | break; |
4069 | |
4070 | case MSR_IA32_MCG_CTL: |
4071 | case MSR_IA32_MCG_STATUS: |
4072 | case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: |
4073 | case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: |
4074 | return set_msr_mce(vcpu, msr_info); |
4075 | |
4076 | case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: |
4077 | case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: |
4078 | case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: |
4079 | case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: |
4080 | if (kvm_pmu_is_valid_msr(vcpu, msr)) |
4081 | return kvm_pmu_set_msr(vcpu, msr_info); |
4082 | |
4083 | if (data) |
4084 | kvm_pr_unimpl_wrmsr(vcpu, msr, data); |
4085 | break; |
4086 | case MSR_K7_CLK_CTL: |
4087 | /* |
4088 | * Ignore all writes to this no longer documented MSR. |
4089 | * Writes are only relevant for old K7 processors, |
4090 | * all pre-dating SVM, but a recommended workaround from |
4091 | * AMD for these chips. It is possible to specify the |
4092 | * affected processor models on the command line, hence |
4093 | * the need to ignore the workaround. |
4094 | */ |
4095 | break; |
4096 | #ifdef CONFIG_KVM_HYPERV |
4097 | case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: |
4098 | case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: |
4099 | case HV_X64_MSR_SYNDBG_OPTIONS: |
4100 | case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: |
4101 | case HV_X64_MSR_CRASH_CTL: |
4102 | case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: |
4103 | case HV_X64_MSR_REENLIGHTENMENT_CONTROL: |
4104 | case HV_X64_MSR_TSC_EMULATION_CONTROL: |
4105 | case HV_X64_MSR_TSC_EMULATION_STATUS: |
4106 | case HV_X64_MSR_TSC_INVARIANT_CONTROL: |
4107 | return kvm_hv_set_msr_common(vcpu, msr, data, |
msr_info->host_initiated);
4109 | #endif |
4110 | case MSR_IA32_BBL_CR_CTL3: |
4111 | /* Drop writes to this legacy MSR -- see rdmsr |
4112 | * counterpart for further detail. |
4113 | */ |
4114 | kvm_pr_unimpl_wrmsr(vcpu, msr, data); |
4115 | break; |
4116 | case MSR_AMD64_OSVW_ID_LENGTH: |
4117 | if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) |
4118 | return 1; |
4119 | vcpu->arch.osvw.length = data; |
4120 | break; |
4121 | case MSR_AMD64_OSVW_STATUS: |
4122 | if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) |
4123 | return 1; |
4124 | vcpu->arch.osvw.status = data; |
4125 | break; |
4126 | case MSR_PLATFORM_INFO: |
4127 | if (!msr_info->host_initiated || |
4128 | (!(data & MSR_PLATFORM_INFO_CPUID_FAULT) && |
4129 | cpuid_fault_enabled(vcpu))) |
4130 | return 1; |
4131 | vcpu->arch.msr_platform_info = data; |
4132 | break; |
4133 | case MSR_MISC_FEATURES_ENABLES: |
4134 | if (data & ~MSR_MISC_FEATURES_ENABLES_CPUID_FAULT || |
4135 | (data & MSR_MISC_FEATURES_ENABLES_CPUID_FAULT && |
4136 | !supports_cpuid_fault(vcpu))) |
4137 | return 1; |
4138 | vcpu->arch.msr_misc_features_enables = data; |
4139 | break; |
4140 | #ifdef CONFIG_X86_64 |
4141 | case MSR_IA32_XFD: |
4142 | if (!msr_info->host_initiated && |
4143 | !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) |
4144 | return 1; |
4145 | |
4146 | if (data & ~kvm_guest_supported_xfd(vcpu)) |
4147 | return 1; |
4148 | |
fpu_update_guest_xfd(&vcpu->arch.guest_fpu, data);
4150 | break; |
4151 | case MSR_IA32_XFD_ERR: |
4152 | if (!msr_info->host_initiated && |
4153 | !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) |
4154 | return 1; |
4155 | |
4156 | if (data & ~kvm_guest_supported_xfd(vcpu)) |
4157 | return 1; |
4158 | |
4159 | vcpu->arch.guest_fpu.xfd_err = data; |
4160 | break; |
4161 | #endif |
4162 | default: |
4163 | if (kvm_pmu_is_valid_msr(vcpu, msr)) |
4164 | return kvm_pmu_set_msr(vcpu, msr_info); |
4165 | |
4166 | /* |
4167 | * Userspace is allowed to write '0' to MSRs that KVM reports |
* as to-be-saved, even if an MSR isn't fully supported.
4169 | */ |
4170 | if (msr_info->host_initiated && !data && |
kvm_is_msr_to_save(msr))
4172 | break; |
4173 | |
4174 | return KVM_MSR_RET_INVALID; |
4175 | } |
4176 | return 0; |
4177 | } |
4178 | EXPORT_SYMBOL_GPL(kvm_set_msr_common); |
4179 | |
4180 | static int get_msr_mce(struct kvm_vcpu *vcpu, u32 msr, u64 *pdata, bool host) |
4181 | { |
4182 | u64 data; |
4183 | u64 mcg_cap = vcpu->arch.mcg_cap; |
4184 | unsigned bank_num = mcg_cap & 0xff; |
4185 | u32 offset, last_msr; |
4186 | |
4187 | switch (msr) { |
4188 | case MSR_IA32_P5_MC_ADDR: |
4189 | case MSR_IA32_P5_MC_TYPE: |
4190 | data = 0; |
4191 | break; |
4192 | case MSR_IA32_MCG_CAP: |
4193 | data = vcpu->arch.mcg_cap; |
4194 | break; |
4195 | case MSR_IA32_MCG_CTL: |
4196 | if (!(mcg_cap & MCG_CTL_P) && !host) |
4197 | return 1; |
4198 | data = vcpu->arch.mcg_ctl; |
4199 | break; |
4200 | case MSR_IA32_MCG_STATUS: |
4201 | data = vcpu->arch.mcg_status; |
4202 | break; |
4203 | case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: |
4204 | last_msr = MSR_IA32_MCx_CTL2(bank_num) - 1; |
4205 | if (msr > last_msr) |
4206 | return 1; |
4207 | |
4208 | if (!(mcg_cap & MCG_CMCI_P) && !host) |
4209 | return 1; |
4210 | offset = array_index_nospec(msr - MSR_IA32_MC0_CTL2, |
4211 | last_msr + 1 - MSR_IA32_MC0_CTL2); |
4212 | data = vcpu->arch.mci_ctl2_banks[offset]; |
4213 | break; |
4214 | case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: |
4215 | last_msr = MSR_IA32_MCx_CTL(bank_num) - 1; |
4216 | if (msr > last_msr) |
4217 | return 1; |
4218 | |
4219 | offset = array_index_nospec(msr - MSR_IA32_MC0_CTL, |
4220 | last_msr + 1 - MSR_IA32_MC0_CTL); |
4221 | data = vcpu->arch.mce_banks[offset]; |
4222 | break; |
4223 | default: |
4224 | return 1; |
4225 | } |
4226 | *pdata = data; |
4227 | return 0; |
4228 | } |
4229 | |
4230 | int kvm_get_msr_common(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
4231 | { |
4232 | switch (msr_info->index) { |
4233 | case MSR_IA32_PLATFORM_ID: |
4234 | case MSR_IA32_EBL_CR_POWERON: |
4235 | case MSR_IA32_LASTBRANCHFROMIP: |
4236 | case MSR_IA32_LASTBRANCHTOIP: |
4237 | case MSR_IA32_LASTINTFROMIP: |
4238 | case MSR_IA32_LASTINTTOIP: |
4239 | case MSR_AMD64_SYSCFG: |
4240 | case MSR_K8_TSEG_ADDR: |
4241 | case MSR_K8_TSEG_MASK: |
4242 | case MSR_VM_HSAVE_PA: |
4243 | case MSR_K8_INT_PENDING_MSG: |
4244 | case MSR_AMD64_NB_CFG: |
4245 | case MSR_FAM10H_MMIO_CONF_BASE: |
4246 | case MSR_AMD64_BU_CFG2: |
4247 | case MSR_IA32_PERF_CTL: |
4248 | case MSR_AMD64_DC_CFG: |
4249 | case MSR_AMD64_TW_CFG: |
4250 | case MSR_F15H_EX_CFG: |
4251 | /* |
4252 | * Intel Sandy Bridge CPUs must support the RAPL (running average power |
4253 | * limit) MSRs. Just return 0, as we do not want to expose the host |
4254 | * data here. Do not conditionalize this on CPUID, as KVM does not do |
4255 | * so for existing CPU-specific MSRs. |
4256 | */ |
4257 | case MSR_RAPL_POWER_UNIT: |
4258 | case MSR_PP0_ENERGY_STATUS: /* Power plane 0 (core) */ |
4259 | case MSR_PP1_ENERGY_STATUS: /* Power plane 1 (graphics uncore) */ |
4260 | case MSR_PKG_ENERGY_STATUS: /* Total package */ |
4261 | case MSR_DRAM_ENERGY_STATUS: /* DRAM controller */ |
4262 | msr_info->data = 0; |
4263 | break; |
4264 | case MSR_K7_EVNTSEL0 ... MSR_K7_EVNTSEL3: |
4265 | case MSR_K7_PERFCTR0 ... MSR_K7_PERFCTR3: |
4266 | case MSR_P6_PERFCTR0 ... MSR_P6_PERFCTR1: |
4267 | case MSR_P6_EVNTSEL0 ... MSR_P6_EVNTSEL1: |
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4269 | return kvm_pmu_get_msr(vcpu, msr_info); |
4270 | msr_info->data = 0; |
4271 | break; |
4272 | case MSR_IA32_UCODE_REV: |
4273 | msr_info->data = vcpu->arch.microcode_version; |
4274 | break; |
4275 | case MSR_IA32_ARCH_CAPABILITIES: |
4276 | if (!msr_info->host_initiated && |
4277 | !guest_cpuid_has(vcpu, X86_FEATURE_ARCH_CAPABILITIES)) |
4278 | return 1; |
4279 | msr_info->data = vcpu->arch.arch_capabilities; |
4280 | break; |
4281 | case MSR_IA32_PERF_CAPABILITIES: |
4282 | if (!msr_info->host_initiated && |
4283 | !guest_cpuid_has(vcpu, X86_FEATURE_PDCM)) |
4284 | return 1; |
4285 | msr_info->data = vcpu->arch.perf_capabilities; |
4286 | break; |
4287 | case MSR_IA32_POWER_CTL: |
4288 | msr_info->data = vcpu->arch.msr_ia32_power_ctl; |
4289 | break; |
4290 | case MSR_IA32_TSC: { |
4291 | /* |
* The Intel SDM states that reading MSR_IA32_TSC adds the TSC
* offset even when the read is not intercepted. The AMD manual
* doesn't explicitly state this, but AMD CPUs appear to behave
* the same way.
4295 | * |
4296 | * On userspace reads and writes, however, we unconditionally |
4297 | * return L1's TSC value to ensure backwards-compatible |
4298 | * behavior for migration. |
4299 | */ |
4300 | u64 offset, ratio; |
4301 | |
4302 | if (msr_info->host_initiated) { |
4303 | offset = vcpu->arch.l1_tsc_offset; |
4304 | ratio = vcpu->arch.l1_tsc_scaling_ratio; |
4305 | } else { |
4306 | offset = vcpu->arch.tsc_offset; |
4307 | ratio = vcpu->arch.tsc_scaling_ratio; |
4308 | } |
4309 | |
msr_info->data = kvm_scale_tsc(rdtsc(), ratio) + offset;
4311 | break; |
4312 | } |
4313 | case MSR_IA32_CR_PAT: |
4314 | msr_info->data = vcpu->arch.pat; |
4315 | break; |
4316 | case MSR_MTRRcap: |
4317 | case MTRRphysBase_MSR(0) ... MSR_MTRRfix4K_F8000: |
4318 | case MSR_MTRRdefType: |
return kvm_mtrr_get_msr(vcpu, msr_info->index, &msr_info->data);
4320 | case 0xcd: /* fsb frequency */ |
4321 | msr_info->data = 3; |
4322 | break; |
4323 | /* |
4324 | * MSR_EBC_FREQUENCY_ID |
4325 | * Conservative value valid for even the basic CPU models. |
4326 | * Models 0,1: 000 in bits 23:21 indicating a bus speed of |
4327 | * 100MHz, model 2 000 in bits 18:16 indicating 100MHz, |
4328 | * and 266MHz for model 3, or 4. Set Core Clock |
4329 | * Frequency to System Bus Frequency Ratio to 1 (bits |
4330 | * 31:24) even though these are only valid for CPU |
* models > 2; otherwise guests may end up dividing or
* multiplying by zero.
4333 | */ |
4334 | case MSR_EBC_FREQUENCY_ID: |
4335 | msr_info->data = 1 << 24; |
4336 | break; |
4337 | case MSR_IA32_APICBASE: |
4338 | msr_info->data = kvm_get_apic_base(vcpu); |
4339 | break; |
4340 | case APIC_BASE_MSR ... APIC_BASE_MSR + 0xff: |
return kvm_x2apic_msr_read(vcpu, msr_info->index, &msr_info->data);
4342 | case MSR_IA32_TSC_DEADLINE: |
4343 | msr_info->data = kvm_get_lapic_tscdeadline_msr(vcpu); |
4344 | break; |
4345 | case MSR_IA32_TSC_ADJUST: |
4346 | msr_info->data = (u64)vcpu->arch.ia32_tsc_adjust_msr; |
4347 | break; |
4348 | case MSR_IA32_MISC_ENABLE: |
4349 | msr_info->data = vcpu->arch.ia32_misc_enable_msr; |
4350 | break; |
4351 | case MSR_IA32_SMBASE: |
4352 | if (!IS_ENABLED(CONFIG_KVM_SMM) || !msr_info->host_initiated) |
4353 | return 1; |
4354 | msr_info->data = vcpu->arch.smbase; |
4355 | break; |
4356 | case MSR_SMI_COUNT: |
4357 | msr_info->data = vcpu->arch.smi_count; |
4358 | break; |
4359 | case MSR_IA32_PERF_STATUS: |
4360 | /* TSC increment by tick */ |
4361 | msr_info->data = 1000ULL; |
4362 | /* CPU multiplier */ |
4363 | msr_info->data |= (((uint64_t)4ULL) << 40); |
4364 | break; |
4365 | case MSR_EFER: |
4366 | msr_info->data = vcpu->arch.efer; |
4367 | break; |
4368 | case MSR_KVM_WALL_CLOCK: |
4369 | if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) |
4370 | return 1; |
4371 | |
4372 | msr_info->data = vcpu->kvm->arch.wall_clock; |
4373 | break; |
4374 | case MSR_KVM_WALL_CLOCK_NEW: |
4375 | if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) |
4376 | return 1; |
4377 | |
4378 | msr_info->data = vcpu->kvm->arch.wall_clock; |
4379 | break; |
4380 | case MSR_KVM_SYSTEM_TIME: |
4381 | if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE)) |
4382 | return 1; |
4383 | |
4384 | msr_info->data = vcpu->arch.time; |
4385 | break; |
4386 | case MSR_KVM_SYSTEM_TIME_NEW: |
4387 | if (!guest_pv_has(vcpu, KVM_FEATURE_CLOCKSOURCE2)) |
4388 | return 1; |
4389 | |
4390 | msr_info->data = vcpu->arch.time; |
4391 | break; |
4392 | case MSR_KVM_ASYNC_PF_EN: |
4393 | if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF)) |
4394 | return 1; |
4395 | |
4396 | msr_info->data = vcpu->arch.apf.msr_en_val; |
4397 | break; |
4398 | case MSR_KVM_ASYNC_PF_INT: |
4399 | if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) |
4400 | return 1; |
4401 | |
4402 | msr_info->data = vcpu->arch.apf.msr_int_val; |
4403 | break; |
4404 | case MSR_KVM_ASYNC_PF_ACK: |
4405 | if (!guest_pv_has(vcpu, KVM_FEATURE_ASYNC_PF_INT)) |
4406 | return 1; |
4407 | |
4408 | msr_info->data = 0; |
4409 | break; |
4410 | case MSR_KVM_STEAL_TIME: |
4411 | if (!guest_pv_has(vcpu, KVM_FEATURE_STEAL_TIME)) |
4412 | return 1; |
4413 | |
4414 | msr_info->data = vcpu->arch.st.msr_val; |
4415 | break; |
4416 | case MSR_KVM_PV_EOI_EN: |
4417 | if (!guest_pv_has(vcpu, KVM_FEATURE_PV_EOI)) |
4418 | return 1; |
4419 | |
4420 | msr_info->data = vcpu->arch.pv_eoi.msr_val; |
4421 | break; |
4422 | case MSR_KVM_POLL_CONTROL: |
4423 | if (!guest_pv_has(vcpu, KVM_FEATURE_POLL_CONTROL)) |
4424 | return 1; |
4425 | |
4426 | msr_info->data = vcpu->arch.msr_kvm_poll_control; |
4427 | break; |
4428 | case MSR_IA32_P5_MC_ADDR: |
4429 | case MSR_IA32_P5_MC_TYPE: |
4430 | case MSR_IA32_MCG_CAP: |
4431 | case MSR_IA32_MCG_CTL: |
4432 | case MSR_IA32_MCG_STATUS: |
4433 | case MSR_IA32_MC0_CTL ... MSR_IA32_MCx_CTL(KVM_MAX_MCE_BANKS) - 1: |
4434 | case MSR_IA32_MC0_CTL2 ... MSR_IA32_MCx_CTL2(KVM_MAX_MCE_BANKS) - 1: |
return get_msr_mce(vcpu, msr_info->index, &msr_info->data,
msr_info->host_initiated);
4437 | case MSR_IA32_XSS: |
4438 | if (!msr_info->host_initiated && |
4439 | !guest_cpuid_has(vcpu, X86_FEATURE_XSAVES)) |
4440 | return 1; |
4441 | msr_info->data = vcpu->arch.ia32_xss; |
4442 | break; |
4443 | case MSR_K7_CLK_CTL: |
4444 | /* |
* Provide the expected ramp-up count for K7. All other
* fields are set to zero, indicating minimum divisors for
4447 | * every field. |
4448 | * |
4449 | * This prevents guest kernels on AMD host with CPU |
4450 | * type 6, model 8 and higher from exploding due to |
4451 | * the rdmsr failing. |
4452 | */ |
4453 | msr_info->data = 0x20000000; |
4454 | break; |
4455 | #ifdef CONFIG_KVM_HYPERV |
4456 | case HV_X64_MSR_GUEST_OS_ID ... HV_X64_MSR_SINT15: |
4457 | case HV_X64_MSR_SYNDBG_CONTROL ... HV_X64_MSR_SYNDBG_PENDING_BUFFER: |
4458 | case HV_X64_MSR_SYNDBG_OPTIONS: |
4459 | case HV_X64_MSR_CRASH_P0 ... HV_X64_MSR_CRASH_P4: |
4460 | case HV_X64_MSR_CRASH_CTL: |
4461 | case HV_X64_MSR_STIMER0_CONFIG ... HV_X64_MSR_STIMER3_COUNT: |
4462 | case HV_X64_MSR_REENLIGHTENMENT_CONTROL: |
4463 | case HV_X64_MSR_TSC_EMULATION_CONTROL: |
4464 | case HV_X64_MSR_TSC_EMULATION_STATUS: |
4465 | case HV_X64_MSR_TSC_INVARIANT_CONTROL: |
4466 | return kvm_hv_get_msr_common(vcpu, |
msr_info->index, &msr_info->data,
msr_info->host_initiated);
4469 | #endif |
4470 | case MSR_IA32_BBL_CR_CTL3: |
4471 | /* This legacy MSR exists but isn't fully documented in current |
4472 | * silicon. It is however accessed by winxp in very narrow |
4473 | * scenarios where it sets bit #19, itself documented as |
4474 | * a "reserved" bit. Best effort attempt to source coherent |
4475 | * read data here should the balance of the register be |
4476 | * interpreted by the guest: |
4477 | * |
4478 | * L2 cache control register 3: 64GB range, 256KB size, |
4479 | * enabled, latency 0x1, configured |
4480 | */ |
4481 | msr_info->data = 0xbe702111; |
4482 | break; |
4483 | case MSR_AMD64_OSVW_ID_LENGTH: |
4484 | if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) |
4485 | return 1; |
4486 | msr_info->data = vcpu->arch.osvw.length; |
4487 | break; |
4488 | case MSR_AMD64_OSVW_STATUS: |
4489 | if (!guest_cpuid_has(vcpu, X86_FEATURE_OSVW)) |
4490 | return 1; |
4491 | msr_info->data = vcpu->arch.osvw.status; |
4492 | break; |
4493 | case MSR_PLATFORM_INFO: |
4494 | if (!msr_info->host_initiated && |
4495 | !vcpu->kvm->arch.guest_can_read_msr_platform_info) |
4496 | return 1; |
4497 | msr_info->data = vcpu->arch.msr_platform_info; |
4498 | break; |
4499 | case MSR_MISC_FEATURES_ENABLES: |
4500 | msr_info->data = vcpu->arch.msr_misc_features_enables; |
4501 | break; |
4502 | case MSR_K7_HWCR: |
4503 | msr_info->data = vcpu->arch.msr_hwcr; |
4504 | break; |
4505 | #ifdef CONFIG_X86_64 |
4506 | case MSR_IA32_XFD: |
4507 | if (!msr_info->host_initiated && |
4508 | !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) |
4509 | return 1; |
4510 | |
4511 | msr_info->data = vcpu->arch.guest_fpu.fpstate->xfd; |
4512 | break; |
4513 | case MSR_IA32_XFD_ERR: |
4514 | if (!msr_info->host_initiated && |
4515 | !guest_cpuid_has(vcpu, X86_FEATURE_XFD)) |
4516 | return 1; |
4517 | |
4518 | msr_info->data = vcpu->arch.guest_fpu.xfd_err; |
4519 | break; |
4520 | #endif |
4521 | default: |
if (kvm_pmu_is_valid_msr(vcpu, msr_info->index))
4523 | return kvm_pmu_get_msr(vcpu, msr_info); |
4524 | |
4525 | /* |
4526 | * Userspace is allowed to read MSRs that KVM reports as |
4527 | * to-be-saved, even if an MSR isn't fully supported. |
4528 | */ |
4529 | if (msr_info->host_initiated && |
kvm_is_msr_to_save(msr_info->index)) {
4531 | msr_info->data = 0; |
4532 | break; |
4533 | } |
4534 | |
4535 | return KVM_MSR_RET_INVALID; |
4536 | } |
4537 | return 0; |
4538 | } |
4539 | EXPORT_SYMBOL_GPL(kvm_get_msr_common); |
4540 | |
4541 | /* |
4542 | * Read or write a bunch of msrs. All parameters are kernel addresses. |
4543 | * |
4544 | * @return number of msrs set successfully. |
4545 | */ |
4546 | static int __msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs *msrs, |
4547 | struct kvm_msr_entry *entries, |
4548 | int (*do_msr)(struct kvm_vcpu *vcpu, |
4549 | unsigned index, u64 *data)) |
4550 | { |
4551 | int i; |
4552 | |
4553 | for (i = 0; i < msrs->nmsrs; ++i) |
4554 | if (do_msr(vcpu, entries[i].index, &entries[i].data)) |
4555 | break; |
4556 | |
4557 | return i; |
4558 | } |
4559 | |
4560 | /* |
4561 | * Read or write a bunch of msrs. Parameters are user addresses. |
4562 | * |
4563 | * @return number of msrs set successfully. |
4564 | */ |
4565 | static int msr_io(struct kvm_vcpu *vcpu, struct kvm_msrs __user *user_msrs, |
4566 | int (*do_msr)(struct kvm_vcpu *vcpu, |
4567 | unsigned index, u64 *data), |
4568 | int writeback) |
4569 | { |
4570 | struct kvm_msrs msrs; |
4571 | struct kvm_msr_entry *entries; |
4572 | unsigned size; |
4573 | int r; |
4574 | |
4575 | r = -EFAULT; |
if (copy_from_user(&msrs, user_msrs, sizeof(msrs)))
4577 | goto out; |
4578 | |
4579 | r = -E2BIG; |
4580 | if (msrs.nmsrs >= MAX_IO_MSRS) |
4581 | goto out; |
4582 | |
4583 | size = sizeof(struct kvm_msr_entry) * msrs.nmsrs; |
4584 | entries = memdup_user(user_msrs->entries, size); |
if (IS_ERR(entries)) {
r = PTR_ERR(entries);
4587 | goto out; |
4588 | } |
4589 | |
r = __msr_io(vcpu, &msrs, entries, do_msr);
4591 | |
if (writeback && copy_to_user(user_msrs->entries, entries, size))
4593 | r = -EFAULT; |
4594 | |
kfree(entries);
4596 | out: |
4597 | return r; |
4598 | } |
4599 | |
4600 | static inline bool kvm_can_mwait_in_guest(void) |
4601 | { |
4602 | return boot_cpu_has(X86_FEATURE_MWAIT) && |
4603 | !boot_cpu_has_bug(X86_BUG_MONITOR) && |
4604 | boot_cpu_has(X86_FEATURE_ARAT); |
4605 | } |
4606 | |
4607 | #ifdef CONFIG_KVM_HYPERV |
4608 | static int kvm_ioctl_get_supported_hv_cpuid(struct kvm_vcpu *vcpu, |
4609 | struct kvm_cpuid2 __user *cpuid_arg) |
4610 | { |
4611 | struct kvm_cpuid2 cpuid; |
4612 | int r; |
4613 | |
4614 | r = -EFAULT; |
if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
4616 | return r; |
4617 | |
r = kvm_get_hv_cpuid(vcpu, &cpuid, cpuid_arg->entries);
4619 | if (r) |
4620 | return r; |
4621 | |
4622 | r = -EFAULT; |
if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4624 | return r; |
4625 | |
4626 | return 0; |
4627 | } |
4628 | #endif |
4629 | |
4630 | static bool kvm_is_vm_type_supported(unsigned long type) |
4631 | { |
4632 | return type == KVM_X86_DEFAULT_VM || |
4633 | (type == KVM_X86_SW_PROTECTED_VM && |
4634 | IS_ENABLED(CONFIG_KVM_SW_PROTECTED_VM) && tdp_mmu_enabled); |
4635 | } |
4636 | |
4637 | int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) |
4638 | { |
4639 | int r = 0; |
4640 | |
4641 | switch (ext) { |
4642 | case KVM_CAP_IRQCHIP: |
4643 | case KVM_CAP_HLT: |
4644 | case KVM_CAP_MMU_SHADOW_CACHE_CONTROL: |
4645 | case KVM_CAP_SET_TSS_ADDR: |
4646 | case KVM_CAP_EXT_CPUID: |
4647 | case KVM_CAP_EXT_EMUL_CPUID: |
4648 | case KVM_CAP_CLOCKSOURCE: |
4649 | case KVM_CAP_PIT: |
4650 | case KVM_CAP_NOP_IO_DELAY: |
4651 | case KVM_CAP_MP_STATE: |
4652 | case KVM_CAP_SYNC_MMU: |
4653 | case KVM_CAP_USER_NMI: |
4654 | case KVM_CAP_REINJECT_CONTROL: |
4655 | case KVM_CAP_IRQ_INJECT_STATUS: |
4656 | case KVM_CAP_IOEVENTFD: |
4657 | case KVM_CAP_IOEVENTFD_NO_LENGTH: |
4658 | case KVM_CAP_PIT2: |
4659 | case KVM_CAP_PIT_STATE2: |
4660 | case KVM_CAP_SET_IDENTITY_MAP_ADDR: |
4661 | case KVM_CAP_VCPU_EVENTS: |
4662 | #ifdef CONFIG_KVM_HYPERV |
4663 | case KVM_CAP_HYPERV: |
4664 | case KVM_CAP_HYPERV_VAPIC: |
4665 | case KVM_CAP_HYPERV_SPIN: |
4666 | case KVM_CAP_HYPERV_TIME: |
4667 | case KVM_CAP_HYPERV_SYNIC: |
4668 | case KVM_CAP_HYPERV_SYNIC2: |
4669 | case KVM_CAP_HYPERV_VP_INDEX: |
4670 | case KVM_CAP_HYPERV_EVENTFD: |
4671 | case KVM_CAP_HYPERV_TLBFLUSH: |
4672 | case KVM_CAP_HYPERV_SEND_IPI: |
4673 | case KVM_CAP_HYPERV_CPUID: |
4674 | case KVM_CAP_HYPERV_ENFORCE_CPUID: |
4675 | case KVM_CAP_SYS_HYPERV_CPUID: |
4676 | #endif |
4677 | case KVM_CAP_PCI_SEGMENT: |
4678 | case KVM_CAP_DEBUGREGS: |
4679 | case KVM_CAP_X86_ROBUST_SINGLESTEP: |
4680 | case KVM_CAP_XSAVE: |
4681 | case KVM_CAP_ASYNC_PF: |
4682 | case KVM_CAP_ASYNC_PF_INT: |
4683 | case KVM_CAP_GET_TSC_KHZ: |
4684 | case KVM_CAP_KVMCLOCK_CTRL: |
4685 | case KVM_CAP_READONLY_MEM: |
4686 | case KVM_CAP_IOAPIC_POLARITY_IGNORED: |
4687 | case KVM_CAP_TSC_DEADLINE_TIMER: |
4688 | case KVM_CAP_DISABLE_QUIRKS: |
4689 | case KVM_CAP_SET_BOOT_CPU_ID: |
4690 | case KVM_CAP_SPLIT_IRQCHIP: |
4691 | case KVM_CAP_IMMEDIATE_EXIT: |
4692 | case KVM_CAP_PMU_EVENT_FILTER: |
4693 | case KVM_CAP_PMU_EVENT_MASKED_EVENTS: |
4694 | case KVM_CAP_GET_MSR_FEATURES: |
4695 | case KVM_CAP_MSR_PLATFORM_INFO: |
4696 | case KVM_CAP_EXCEPTION_PAYLOAD: |
4697 | case KVM_CAP_X86_TRIPLE_FAULT_EVENT: |
4698 | case KVM_CAP_SET_GUEST_DEBUG: |
4699 | case KVM_CAP_LAST_CPU: |
4700 | case KVM_CAP_X86_USER_SPACE_MSR: |
4701 | case KVM_CAP_X86_MSR_FILTER: |
4702 | case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: |
4703 | #ifdef CONFIG_X86_SGX_KVM |
4704 | case KVM_CAP_SGX_ATTRIBUTE: |
4705 | #endif |
4706 | case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: |
4707 | case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: |
4708 | case KVM_CAP_SREGS2: |
4709 | case KVM_CAP_EXIT_ON_EMULATION_FAILURE: |
4710 | case KVM_CAP_VCPU_ATTRIBUTES: |
4711 | case KVM_CAP_SYS_ATTRIBUTES: |
4712 | case KVM_CAP_VAPIC: |
4713 | case KVM_CAP_ENABLE_CAP: |
4714 | case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: |
4715 | case KVM_CAP_IRQFD_RESAMPLE: |
4716 | case KVM_CAP_MEMORY_FAULT_INFO: |
4717 | r = 1; |
4718 | break; |
4719 | case KVM_CAP_EXIT_HYPERCALL: |
4720 | r = KVM_EXIT_HYPERCALL_VALID_MASK; |
4721 | break; |
4722 | case KVM_CAP_SET_GUEST_DEBUG2: |
4723 | return KVM_GUESTDBG_VALID_MASK; |
4724 | #ifdef CONFIG_KVM_XEN |
4725 | case KVM_CAP_XEN_HVM: |
4726 | r = KVM_XEN_HVM_CONFIG_HYPERCALL_MSR | |
4727 | KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL | |
4728 | KVM_XEN_HVM_CONFIG_SHARED_INFO | |
4729 | KVM_XEN_HVM_CONFIG_EVTCHN_2LEVEL | |
4730 | KVM_XEN_HVM_CONFIG_EVTCHN_SEND | |
4731 | KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE | |
4732 | KVM_XEN_HVM_CONFIG_SHARED_INFO_HVA; |
4733 | if (sched_info_on()) |
4734 | r |= KVM_XEN_HVM_CONFIG_RUNSTATE | |
4735 | KVM_XEN_HVM_CONFIG_RUNSTATE_UPDATE_FLAG; |
4736 | break; |
4737 | #endif |
4738 | case KVM_CAP_SYNC_REGS: |
4739 | r = KVM_SYNC_X86_VALID_FIELDS; |
4740 | break; |
4741 | case KVM_CAP_ADJUST_CLOCK: |
4742 | r = KVM_CLOCK_VALID_FLAGS; |
4743 | break; |
4744 | case KVM_CAP_X86_DISABLE_EXITS: |
4745 | r = KVM_X86_DISABLE_EXITS_PAUSE; |
4746 | |
4747 | if (!mitigate_smt_rsb) { |
4748 | r |= KVM_X86_DISABLE_EXITS_HLT | |
4749 | KVM_X86_DISABLE_EXITS_CSTATE; |
4750 | |
4751 | if (kvm_can_mwait_in_guest()) |
4752 | r |= KVM_X86_DISABLE_EXITS_MWAIT; |
4753 | } |
4754 | break; |
4755 | case KVM_CAP_X86_SMM: |
4756 | if (!IS_ENABLED(CONFIG_KVM_SMM)) |
4757 | break; |
4758 | |
		/*
		 * SMBASE is usually relocated above 1M on modern chipsets,
		 * and SMM handlers might indeed rely on 4G segment limits,
		 * so do not report SMM to be available if real mode is
		 * emulated via vm86 mode. Still, do not go to great lengths
		 * to avoid userspace's usage of the feature, because it is a
		 * fringe case that is not enabled except via specific settings
		 * of the module parameters.
		 */
4767 | r = static_call(kvm_x86_has_emulated_msr)(kvm, MSR_IA32_SMBASE); |
4768 | break; |
4769 | case KVM_CAP_NR_VCPUS: |
4770 | r = min_t(unsigned int, num_online_cpus(), KVM_MAX_VCPUS); |
4771 | break; |
4772 | case KVM_CAP_MAX_VCPUS: |
4773 | r = KVM_MAX_VCPUS; |
4774 | break; |
4775 | case KVM_CAP_MAX_VCPU_ID: |
4776 | r = KVM_MAX_VCPU_IDS; |
4777 | break; |
4778 | case KVM_CAP_PV_MMU: /* obsolete */ |
4779 | r = 0; |
4780 | break; |
4781 | case KVM_CAP_MCE: |
4782 | r = KVM_MAX_MCE_BANKS; |
4783 | break; |
4784 | case KVM_CAP_XCRS: |
4785 | r = boot_cpu_has(X86_FEATURE_XSAVE); |
4786 | break; |
4787 | case KVM_CAP_TSC_CONTROL: |
4788 | case KVM_CAP_VM_TSC_CONTROL: |
4789 | r = kvm_caps.has_tsc_control; |
4790 | break; |
4791 | case KVM_CAP_X2APIC_API: |
4792 | r = KVM_X2APIC_API_VALID_FLAGS; |
4793 | break; |
4794 | case KVM_CAP_NESTED_STATE: |
4795 | r = kvm_x86_ops.nested_ops->get_state ? |
4796 | kvm_x86_ops.nested_ops->get_state(NULL, NULL, 0) : 0; |
4797 | break; |
4798 | #ifdef CONFIG_KVM_HYPERV |
4799 | case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: |
4800 | r = kvm_x86_ops.enable_l2_tlb_flush != NULL; |
4801 | break; |
4802 | case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: |
4803 | r = kvm_x86_ops.nested_ops->enable_evmcs != NULL; |
4804 | break; |
4805 | #endif |
4806 | case KVM_CAP_SMALLER_MAXPHYADDR: |
4807 | r = (int) allow_smaller_maxphyaddr; |
4808 | break; |
4809 | case KVM_CAP_STEAL_TIME: |
4810 | r = sched_info_on(); |
4811 | break; |
4812 | case KVM_CAP_X86_BUS_LOCK_EXIT: |
4813 | if (kvm_caps.has_bus_lock_exit) |
4814 | r = KVM_BUS_LOCK_DETECTION_OFF | |
4815 | KVM_BUS_LOCK_DETECTION_EXIT; |
4816 | else |
4817 | r = 0; |
4818 | break; |
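	/*
	 * KVM_CAP_XSAVE2 reports the buffer size userspace must allocate for
	 * KVM_GET_XSAVE2, never smaller than the legacy struct kvm_xsave so
	 * that existing userspace keeps working.
	 */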
4819 | case KVM_CAP_XSAVE2: { |
		r = xstate_required_size(kvm_get_filtered_xcr0(), false);
4821 | if (r < sizeof(struct kvm_xsave)) |
4822 | r = sizeof(struct kvm_xsave); |
4823 | break; |
4824 | } |
4825 | case KVM_CAP_PMU_CAPABILITY: |
4826 | r = enable_pmu ? KVM_CAP_PMU_VALID_MASK : 0; |
4827 | break; |
4828 | case KVM_CAP_DISABLE_QUIRKS2: |
4829 | r = KVM_X86_VALID_QUIRKS; |
4830 | break; |
4831 | case KVM_CAP_X86_NOTIFY_VMEXIT: |
4832 | r = kvm_caps.has_notify_vmexit; |
4833 | break; |
4834 | case KVM_CAP_VM_TYPES: |
4835 | r = BIT(KVM_X86_DEFAULT_VM); |
4836 | if (kvm_is_vm_type_supported(KVM_X86_SW_PROTECTED_VM)) |
4837 | r |= BIT(KVM_X86_SW_PROTECTED_VM); |
4838 | break; |
4839 | default: |
4840 | break; |
4841 | } |
4842 | return r; |
4843 | } |
4844 | |
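/*
 * Convert the userspace-provided attr->addr (a u64) to a __user pointer,
 * rejecting values that cannot be represented in an unsigned long, e.g. a
 * 64-bit address handed to a 32-bit kernel.
 */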
4845 | static inline void __user *kvm_get_attr_addr(struct kvm_device_attr *attr) |
4846 | { |
4847 | void __user *uaddr = (void __user*)(unsigned long)attr->addr; |
4848 | |
4849 | if ((u64)(unsigned long)uaddr != attr->addr) |
4850 | return ERR_PTR_USR(-EFAULT); |
4851 | return uaddr; |
4852 | } |
4853 | |
4854 | static int kvm_x86_dev_get_attr(struct kvm_device_attr *attr) |
4855 | { |
4856 | u64 __user *uaddr = kvm_get_attr_addr(attr); |
4857 | |
4858 | if (attr->group) |
4859 | return -ENXIO; |
4860 | |
	if (IS_ERR(uaddr))
		return PTR_ERR(uaddr);
4863 | |
4864 | switch (attr->attr) { |
4865 | case KVM_X86_XCOMP_GUEST_SUPP: |
4866 | if (put_user(kvm_caps.supported_xcr0, uaddr)) |
4867 | return -EFAULT; |
4868 | return 0; |
4869 | default: |
4870 | return -ENXIO; |
4871 | } |
4872 | } |
4873 | |
4874 | static int kvm_x86_dev_has_attr(struct kvm_device_attr *attr) |
4875 | { |
4876 | if (attr->group) |
4877 | return -ENXIO; |
4878 | |
4879 | switch (attr->attr) { |
4880 | case KVM_X86_XCOMP_GUEST_SUPP: |
4881 | return 0; |
4882 | default: |
4883 | return -ENXIO; |
4884 | } |
4885 | } |
4886 | |
4887 | long kvm_arch_dev_ioctl(struct file *filp, |
4888 | unsigned int ioctl, unsigned long arg) |
4889 | { |
4890 | void __user *argp = (void __user *)arg; |
4891 | long r; |
4892 | |
4893 | switch (ioctl) { |
4894 | case KVM_GET_MSR_INDEX_LIST: { |
4895 | struct kvm_msr_list __user *user_msr_list = argp; |
4896 | struct kvm_msr_list msr_list; |
		unsigned int n;
4898 | |
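		/*
		 * Two-call protocol: userspace advertises the capacity of its
		 * indices array in nmsrs, and KVM writes back the required
		 * count; -E2BIG tells userspace to retry with a larger buffer.
		 */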
4899 | r = -EFAULT; |
		if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
4901 | goto out; |
4902 | n = msr_list.nmsrs; |
4903 | msr_list.nmsrs = num_msrs_to_save + num_emulated_msrs; |
		if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
4905 | goto out; |
4906 | r = -E2BIG; |
4907 | if (n < msr_list.nmsrs) |
4908 | goto out; |
4909 | r = -EFAULT; |
		if (copy_to_user(user_msr_list->indices, &msrs_to_save,
				 num_msrs_to_save * sizeof(u32)))
			goto out;
		if (copy_to_user(user_msr_list->indices + num_msrs_to_save,
				 &emulated_msrs,
				 num_emulated_msrs * sizeof(u32)))
4916 | goto out; |
4917 | r = 0; |
4918 | break; |
4919 | } |
4920 | case KVM_GET_SUPPORTED_CPUID: |
4921 | case KVM_GET_EMULATED_CPUID: { |
4922 | struct kvm_cpuid2 __user *cpuid_arg = argp; |
4923 | struct kvm_cpuid2 cpuid; |
4924 | |
4925 | r = -EFAULT; |
		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
			goto out;

		r = kvm_dev_ioctl_get_cpuid(&cpuid, cpuid_arg->entries,
					    ioctl);
4931 | if (r) |
4932 | goto out; |
4933 | |
4934 | r = -EFAULT; |
		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
4936 | goto out; |
4937 | r = 0; |
4938 | break; |
4939 | } |
4940 | case KVM_X86_GET_MCE_CAP_SUPPORTED: |
4941 | r = -EFAULT; |
		if (copy_to_user(argp, &kvm_caps.supported_mce_cap,
				 sizeof(kvm_caps.supported_mce_cap)))
4944 | goto out; |
4945 | r = 0; |
4946 | break; |
4947 | case KVM_GET_MSR_FEATURE_INDEX_LIST: { |
4948 | struct kvm_msr_list __user *user_msr_list = argp; |
4949 | struct kvm_msr_list msr_list; |
4950 | unsigned int n; |
4951 | |
4952 | r = -EFAULT; |
		if (copy_from_user(&msr_list, user_msr_list, sizeof(msr_list)))
4954 | goto out; |
4955 | n = msr_list.nmsrs; |
4956 | msr_list.nmsrs = num_msr_based_features; |
		if (copy_to_user(user_msr_list, &msr_list, sizeof(msr_list)))
4958 | goto out; |
4959 | r = -E2BIG; |
4960 | if (n < msr_list.nmsrs) |
4961 | goto out; |
4962 | r = -EFAULT; |
		if (copy_to_user(user_msr_list->indices, &msr_based_features,
				 num_msr_based_features * sizeof(u32)))
4965 | goto out; |
4966 | r = 0; |
4967 | break; |
4968 | } |
4969 | case KVM_GET_MSRS: |
		r = msr_io(NULL, argp, do_get_msr_feature, 1);
4971 | break; |
4972 | #ifdef CONFIG_KVM_HYPERV |
4973 | case KVM_GET_SUPPORTED_HV_CPUID: |
		r = kvm_ioctl_get_supported_hv_cpuid(NULL, argp);
4975 | break; |
4976 | #endif |
4977 | case KVM_GET_DEVICE_ATTR: { |
4978 | struct kvm_device_attr attr; |
4979 | r = -EFAULT; |
		if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
			break;
		r = kvm_x86_dev_get_attr(&attr);
4983 | break; |
4984 | } |
4985 | case KVM_HAS_DEVICE_ATTR: { |
4986 | struct kvm_device_attr attr; |
4987 | r = -EFAULT; |
		if (copy_from_user(&attr, (void __user *)arg, sizeof(attr)))
			break;
		r = kvm_x86_dev_has_attr(&attr);
4991 | break; |
4992 | } |
4993 | default: |
4994 | r = -EINVAL; |
4995 | break; |
4996 | } |
4997 | out: |
4998 | return r; |
4999 | } |
5000 | |
5001 | static void wbinvd_ipi(void *garbage) |
5002 | { |
5003 | wbinvd(); |
5004 | } |
5005 | |
5006 | static bool need_emulate_wbinvd(struct kvm_vcpu *vcpu) |
5007 | { |
	return kvm_arch_has_noncoherent_dma(vcpu->kvm);
5009 | } |
5010 | |
5011 | void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu) |
5012 | { |
	/* Handle WBINVD that may have been executed by the guest. */
	if (need_emulate_wbinvd(vcpu)) {
		if (static_call(kvm_x86_has_wbinvd_exit)())
			cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
		else if (vcpu->cpu != -1 && vcpu->cpu != cpu)
			smp_call_function_single(vcpu->cpu,
					wbinvd_ipi, NULL, 1);
	}
5020 | } |
5021 | |
5022 | static_call(kvm_x86_vcpu_load)(vcpu, cpu); |
5023 | |
5024 | /* Save host pkru register if supported */ |
5025 | vcpu->arch.host_pkru = read_pkru(); |
5026 | |
5027 | /* Apply any externally detected TSC adjustments (due to suspend) */ |
5028 | if (unlikely(vcpu->arch.tsc_offset_adjustment)) { |
		adjust_tsc_offset_host(vcpu, vcpu->arch.tsc_offset_adjustment);
5030 | vcpu->arch.tsc_offset_adjustment = 0; |
5031 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
5032 | } |
5033 | |
5034 | if (unlikely(vcpu->cpu != cpu) || kvm_check_tsc_unstable()) { |
5035 | s64 tsc_delta = !vcpu->arch.last_host_tsc ? 0 : |
5036 | rdtsc() - vcpu->arch.last_host_tsc; |
5037 | if (tsc_delta < 0) |
			mark_tsc_unstable("KVM discovered backwards TSC");
5039 | |
5040 | if (kvm_check_tsc_unstable()) { |
			u64 offset = kvm_compute_l1_tsc_offset(vcpu,
						vcpu->arch.last_guest_tsc);
			kvm_vcpu_write_tsc_offset(vcpu, offset);
5044 | vcpu->arch.tsc_catchup = 1; |
5045 | } |
5046 | |
5047 | if (kvm_lapic_hv_timer_in_use(vcpu)) |
5048 | kvm_lapic_restart_hv_timer(vcpu); |
5049 | |
5050 | /* |
5051 | * On a host with synchronized TSC, there is no need to update |
5052 | * kvmclock on vcpu->cpu migration |
5053 | */ |
5054 | if (!vcpu->kvm->arch.use_master_clock || vcpu->cpu == -1) |
5055 | kvm_make_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu); |
5056 | if (vcpu->cpu != cpu) |
5057 | kvm_make_request(KVM_REQ_MIGRATE_TIMER, vcpu); |
5058 | vcpu->cpu = cpu; |
5059 | } |
5060 | |
5061 | kvm_make_request(KVM_REQ_STEAL_UPDATE, vcpu); |
5062 | } |
5063 | |
5064 | static void kvm_steal_time_set_preempted(struct kvm_vcpu *vcpu) |
5065 | { |
5066 | struct gfn_to_hva_cache *ghc = &vcpu->arch.st.cache; |
5067 | struct kvm_steal_time __user *st; |
5068 | struct kvm_memslots *slots; |
5069 | static const u8 preempted = KVM_VCPU_PREEMPTED; |
5070 | gpa_t gpa = vcpu->arch.st.msr_val & KVM_STEAL_VALID_BITS; |
5071 | |
5072 | /* |
5073 | * The vCPU can be marked preempted if and only if the VM-Exit was on |
5074 | * an instruction boundary and will not trigger guest emulation of any |
5075 | * kind (see vcpu_run). Vendor specific code controls (conservatively) |
5076 | * when this is true, for example allowing the vCPU to be marked |
5077 | * preempted if and only if the VM-Exit was due to a host interrupt. |
5078 | */ |
5079 | if (!vcpu->arch.at_instruction_boundary) { |
5080 | vcpu->stat.preemption_other++; |
5081 | return; |
5082 | } |
5083 | |
5084 | vcpu->stat.preemption_reported++; |
5085 | if (!(vcpu->arch.st.msr_val & KVM_MSR_ENABLED)) |
5086 | return; |
5087 | |
5088 | if (vcpu->arch.st.preempted) |
5089 | return; |
5090 | |
5091 | /* This happens on process exit */ |
5092 | if (unlikely(current->mm != vcpu->kvm->mm)) |
5093 | return; |
5094 | |
	slots = kvm_memslots(vcpu->kvm);
5096 | |
5097 | if (unlikely(slots->generation != ghc->generation || |
5098 | gpa != ghc->gpa || |
5099 | kvm_is_error_hva(ghc->hva) || !ghc->memslot)) |
5100 | return; |
5101 | |
5102 | st = (struct kvm_steal_time __user *)ghc->hva; |
5103 | BUILD_BUG_ON(sizeof(st->preempted) != sizeof(preempted)); |
5104 | |
	if (!copy_to_user_nofault(&st->preempted, &preempted, sizeof(preempted)))
5106 | vcpu->arch.st.preempted = KVM_VCPU_PREEMPTED; |
5107 | |
	mark_page_dirty_in_slot(vcpu->kvm, ghc->memslot, gpa_to_gfn(ghc->gpa));
5109 | } |
5110 | |
5111 | void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) |
5112 | { |
5113 | int idx; |
5114 | |
5115 | if (vcpu->preempted) { |
5116 | vcpu->arch.preempted_in_kernel = kvm_arch_vcpu_in_kernel(vcpu); |
5117 | |
5118 | /* |
5119 | * Take the srcu lock as memslots will be accessed to check the gfn |
5120 | * cache generation against the memslots generation. |
5121 | */ |
		idx = srcu_read_lock(&vcpu->kvm->srcu);
		if (kvm_xen_msr_enabled(vcpu->kvm))
			kvm_xen_runstate_set_preempted(vcpu);
		else
			kvm_steal_time_set_preempted(vcpu);
		srcu_read_unlock(&vcpu->kvm->srcu, idx);
5128 | } |
5129 | |
5130 | static_call(kvm_x86_vcpu_put)(vcpu); |
5131 | vcpu->arch.last_host_tsc = rdtsc(); |
5132 | } |
5133 | |
5134 | static int kvm_vcpu_ioctl_get_lapic(struct kvm_vcpu *vcpu, |
5135 | struct kvm_lapic_state *s) |
5136 | { |
5137 | static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); |
5138 | |
5139 | return kvm_apic_get_state(vcpu, s); |
5140 | } |
5141 | |
5142 | static int kvm_vcpu_ioctl_set_lapic(struct kvm_vcpu *vcpu, |
5143 | struct kvm_lapic_state *s) |
5144 | { |
5145 | int r; |
5146 | |
5147 | r = kvm_apic_set_state(vcpu, s); |
5148 | if (r) |
5149 | return r; |
5150 | update_cr8_intercept(vcpu); |
5151 | |
5152 | return 0; |
5153 | } |
5154 | |
5155 | static int kvm_cpu_accept_dm_intr(struct kvm_vcpu *vcpu) |
5156 | { |
5157 | /* |
5158 | * We can accept userspace's request for interrupt injection |
5159 | * as long as we have a place to store the interrupt number. |
5160 | * The actual injection will happen when the CPU is able to |
5161 | * deliver the interrupt. |
5162 | */ |
	if (kvm_cpu_has_extint(vcpu))
5164 | return false; |
5165 | |
5166 | /* Acknowledging ExtINT does not happen if LINT0 is masked. */ |
5167 | return (!lapic_in_kernel(vcpu) || |
5168 | kvm_apic_accept_pic_intr(vcpu)); |
5169 | } |
5170 | |
5171 | static int kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu) |
5172 | { |
5173 | /* |
5174 | * Do not cause an interrupt window exit if an exception |
5175 | * is pending or an event needs reinjection; userspace |
5176 | * might want to inject the interrupt manually using KVM_SET_REGS |
5177 | * or KVM_SET_SREGS. For that to work, we must be at an |
5178 | * instruction boundary and with no events half-injected. |
5179 | */ |
5180 | return (kvm_arch_interrupt_allowed(vcpu) && |
5181 | kvm_cpu_accept_dm_intr(vcpu) && |
5182 | !kvm_event_needs_reinjection(vcpu) && |
5183 | !kvm_is_exception_pending(vcpu)); |
5184 | } |
5185 | |
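/*
 * Inject a userspace-provided interrupt: queued directly when the irqchip is
 * fully in userspace, or latched as the pending ExtINT vector under a split
 * irqchip; fails with -ENXIO when the PIC is emulated in the kernel.
 */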
5186 | static int kvm_vcpu_ioctl_interrupt(struct kvm_vcpu *vcpu, |
5187 | struct kvm_interrupt *irq) |
5188 | { |
5189 | if (irq->irq >= KVM_NR_INTERRUPTS) |
5190 | return -EINVAL; |
5191 | |
	if (!irqchip_in_kernel(vcpu->kvm)) {
		kvm_queue_interrupt(vcpu, irq->irq, false);
5194 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
5195 | return 0; |
5196 | } |
5197 | |
5198 | /* |
5199 | * With in-kernel LAPIC, we only use this to inject EXTINT, so |
5200 | * fail for in-kernel 8259. |
5201 | */ |
	if (pic_in_kernel(vcpu->kvm))
5203 | return -ENXIO; |
5204 | |
5205 | if (vcpu->arch.pending_external_vector != -1) |
5206 | return -EEXIST; |
5207 | |
5208 | vcpu->arch.pending_external_vector = irq->irq; |
5209 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
5210 | return 0; |
5211 | } |
5212 | |
5213 | static int kvm_vcpu_ioctl_nmi(struct kvm_vcpu *vcpu) |
5214 | { |
5215 | kvm_inject_nmi(vcpu); |
5216 | |
5217 | return 0; |
5218 | } |
5219 | |
5220 | static int vcpu_ioctl_tpr_access_reporting(struct kvm_vcpu *vcpu, |
5221 | struct kvm_tpr_access_ctl *tac) |
5222 | { |
5223 | if (tac->flags) |
5224 | return -EINVAL; |
5225 | vcpu->arch.tpr_access_reporting = !!tac->enabled; |
5226 | return 0; |
5227 | } |
5228 | |
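/*
 * MCG_CAP layout: bits 7:0 hold the bank count and bits 23:16 the extended
 * register count; all other bits must be capabilities that KVM advertises
 * in kvm_caps.supported_mce_cap.
 */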
5229 | static int kvm_vcpu_ioctl_x86_setup_mce(struct kvm_vcpu *vcpu, |
5230 | u64 mcg_cap) |
5231 | { |
5232 | int r; |
5233 | unsigned bank_num = mcg_cap & 0xff, bank; |
5234 | |
5235 | r = -EINVAL; |
5236 | if (!bank_num || bank_num > KVM_MAX_MCE_BANKS) |
5237 | goto out; |
5238 | if (mcg_cap & ~(kvm_caps.supported_mce_cap | 0xff | 0xff0000)) |
5239 | goto out; |
5240 | r = 0; |
5241 | vcpu->arch.mcg_cap = mcg_cap; |
5242 | /* Init IA32_MCG_CTL to all 1s */ |
5243 | if (mcg_cap & MCG_CTL_P) |
5244 | vcpu->arch.mcg_ctl = ~(u64)0; |
5245 | /* Init IA32_MCi_CTL to all 1s, IA32_MCi_CTL2 to all 0s */ |
5246 | for (bank = 0; bank < bank_num; bank++) { |
5247 | vcpu->arch.mce_banks[bank*4] = ~(u64)0; |
5248 | if (mcg_cap & MCG_CMCI_P) |
5249 | vcpu->arch.mci_ctl2_banks[bank] = 0; |
5250 | } |
5251 | |
5252 | kvm_apic_after_set_mcg_cap(vcpu); |
5253 | |
5254 | static_call(kvm_x86_setup_mce)(vcpu); |
5255 | out: |
5256 | return r; |
5257 | } |
5258 | |
5259 | /* |
 * Validate this is a UCNA (uncorrectable no action) error by checking the
5261 | * MCG_STATUS and MCi_STATUS registers: |
5262 | * - none of the bits for Machine Check Exceptions are set |
5263 | * - both the VAL (valid) and UC (uncorrectable) bits are set |
5264 | * MCI_STATUS_PCC - Processor Context Corrupted |
5265 | * MCI_STATUS_S - Signaled as a Machine Check Exception |
5266 | * MCI_STATUS_AR - Software recoverable Action Required |
5267 | */ |
5268 | static bool is_ucna(struct kvm_x86_mce *mce) |
5269 | { |
5270 | return !mce->mcg_status && |
5271 | !(mce->status & (MCI_STATUS_PCC | MCI_STATUS_S | MCI_STATUS_AR)) && |
5272 | (mce->status & MCI_STATUS_VAL) && |
5273 | (mce->status & MCI_STATUS_UC); |
5274 | } |
5275 | |
5276 | static int kvm_vcpu_x86_set_ucna(struct kvm_vcpu *vcpu, struct kvm_x86_mce *mce, u64* banks) |
5277 | { |
5278 | u64 mcg_cap = vcpu->arch.mcg_cap; |
5279 | |
5280 | banks[1] = mce->status; |
5281 | banks[2] = mce->addr; |
5282 | banks[3] = mce->misc; |
5283 | vcpu->arch.mcg_status = mce->mcg_status; |
5284 | |
5285 | if (!(mcg_cap & MCG_CMCI_P) || |
5286 | !(vcpu->arch.mci_ctl2_banks[mce->bank] & MCI_CTL2_CMCI_EN)) |
5287 | return 0; |
5288 | |
5289 | if (lapic_in_kernel(vcpu)) |
		kvm_apic_local_deliver(vcpu->arch.apic, APIC_LVTCMCI);
5291 | |
5292 | return 0; |
5293 | } |
5294 | |
5295 | static int kvm_vcpu_ioctl_x86_set_mce(struct kvm_vcpu *vcpu, |
5296 | struct kvm_x86_mce *mce) |
5297 | { |
5298 | u64 mcg_cap = vcpu->arch.mcg_cap; |
5299 | unsigned bank_num = mcg_cap & 0xff; |
5300 | u64 *banks = vcpu->arch.mce_banks; |
5301 | |
5302 | if (mce->bank >= bank_num || !(mce->status & MCI_STATUS_VAL)) |
5303 | return -EINVAL; |
5304 | |
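	/* The bank index is userspace-controlled; clamp it under speculation. */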
5305 | banks += array_index_nospec(4 * mce->bank, 4 * bank_num); |
5306 | |
5307 | if (is_ucna(mce)) |
5308 | return kvm_vcpu_x86_set_ucna(vcpu, mce, banks); |
5309 | |
5310 | /* |
5311 | * if IA32_MCG_CTL is not all 1s, the uncorrected error |
5312 | * reporting is disabled |
5313 | */ |
5314 | if ((mce->status & MCI_STATUS_UC) && (mcg_cap & MCG_CTL_P) && |
5315 | vcpu->arch.mcg_ctl != ~(u64)0) |
5316 | return 0; |
5317 | /* |
5318 | * if IA32_MCi_CTL is not all 1s, the uncorrected error |
5319 | * reporting is disabled for the bank |
5320 | */ |
5321 | if ((mce->status & MCI_STATUS_UC) && banks[0] != ~(u64)0) |
5322 | return 0; |
5323 | if (mce->status & MCI_STATUS_UC) { |
5324 | if ((vcpu->arch.mcg_status & MCG_STATUS_MCIP) || |
5325 | !kvm_is_cr4_bit_set(vcpu, X86_CR4_MCE)) { |
5326 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
5327 | return 0; |
5328 | } |
5329 | if (banks[1] & MCI_STATUS_VAL) |
5330 | mce->status |= MCI_STATUS_OVER; |
5331 | banks[2] = mce->addr; |
5332 | banks[3] = mce->misc; |
5333 | vcpu->arch.mcg_status = mce->mcg_status; |
5334 | banks[1] = mce->status; |
5335 | kvm_queue_exception(vcpu, MC_VECTOR); |
5336 | } else if (!(banks[1] & MCI_STATUS_VAL) |
5337 | || !(banks[1] & MCI_STATUS_UC)) { |
5338 | if (banks[1] & MCI_STATUS_VAL) |
5339 | mce->status |= MCI_STATUS_OVER; |
5340 | banks[2] = mce->addr; |
5341 | banks[3] = mce->misc; |
5342 | banks[1] = mce->status; |
5343 | } else |
5344 | banks[1] |= MCI_STATUS_OVER; |
5345 | return 0; |
5346 | } |
5347 | |
5348 | static void kvm_vcpu_ioctl_x86_get_vcpu_events(struct kvm_vcpu *vcpu, |
5349 | struct kvm_vcpu_events *events) |
5350 | { |
5351 | struct kvm_queued_exception *ex; |
5352 | |
5353 | process_nmi(vcpu); |
5354 | |
5355 | #ifdef CONFIG_KVM_SMM |
5356 | if (kvm_check_request(KVM_REQ_SMI, vcpu)) |
5357 | process_smi(vcpu); |
5358 | #endif |
5359 | |
5360 | /* |
5361 | * KVM's ABI only allows for one exception to be migrated. Luckily, |
5362 | * the only time there can be two queued exceptions is if there's a |
5363 | * non-exiting _injected_ exception, and a pending exiting exception. |
5364 | * In that case, ignore the VM-Exiting exception as it's an extension |
5365 | * of the injected exception. |
5366 | */ |
5367 | if (vcpu->arch.exception_vmexit.pending && |
5368 | !vcpu->arch.exception.pending && |
5369 | !vcpu->arch.exception.injected) |
5370 | ex = &vcpu->arch.exception_vmexit; |
5371 | else |
5372 | ex = &vcpu->arch.exception; |
5373 | |
5374 | /* |
5375 | * In guest mode, payload delivery should be deferred if the exception |
	 * will be intercepted by L1, e.g. KVM should not modify CR2 if L1
5377 | * intercepts #PF, ditto for DR6 and #DBs. If the per-VM capability, |
5378 | * KVM_CAP_EXCEPTION_PAYLOAD, is not set, userspace may or may not |
5379 | * propagate the payload and so it cannot be safely deferred. Deliver |
5380 | * the payload if the capability hasn't been requested. |
5381 | */ |
5382 | if (!vcpu->kvm->arch.exception_payload_enabled && |
5383 | ex->pending && ex->has_payload) |
5384 | kvm_deliver_exception_payload(vcpu, ex); |
5385 | |
5386 | memset(events, 0, sizeof(*events)); |
5387 | |
5388 | /* |
5389 | * The API doesn't provide the instruction length for software |
5390 | * exceptions, so don't report them. As long as the guest RIP |
5391 | * isn't advanced, we should expect to encounter the exception |
5392 | * again. |
5393 | */ |
	if (!kvm_exception_is_soft(ex->vector)) {
5395 | events->exception.injected = ex->injected; |
5396 | events->exception.pending = ex->pending; |
5397 | /* |
5398 | * For ABI compatibility, deliberately conflate |
5399 | * pending and injected exceptions when |
5400 | * KVM_CAP_EXCEPTION_PAYLOAD isn't enabled. |
5401 | */ |
5402 | if (!vcpu->kvm->arch.exception_payload_enabled) |
5403 | events->exception.injected |= ex->pending; |
5404 | } |
5405 | events->exception.nr = ex->vector; |
5406 | events->exception.has_error_code = ex->has_error_code; |
5407 | events->exception.error_code = ex->error_code; |
5408 | events->exception_has_payload = ex->has_payload; |
5409 | events->exception_payload = ex->payload; |
5410 | |
5411 | events->interrupt.injected = |
5412 | vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft; |
5413 | events->interrupt.nr = vcpu->arch.interrupt.nr; |
5414 | events->interrupt.shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); |
5415 | |
5416 | events->nmi.injected = vcpu->arch.nmi_injected; |
5417 | events->nmi.pending = kvm_get_nr_pending_nmis(vcpu); |
5418 | events->nmi.masked = static_call(kvm_x86_get_nmi_mask)(vcpu); |
5419 | |
5420 | /* events->sipi_vector is never valid when reporting to user space */ |
5421 | |
5422 | #ifdef CONFIG_KVM_SMM |
5423 | events->smi.smm = is_smm(vcpu); |
5424 | events->smi.pending = vcpu->arch.smi_pending; |
5425 | events->smi.smm_inside_nmi = |
5426 | !!(vcpu->arch.hflags & HF_SMM_INSIDE_NMI_MASK); |
5427 | #endif |
5428 | events->smi.latched_init = kvm_lapic_latched_init(vcpu); |
5429 | |
5430 | events->flags = (KVM_VCPUEVENT_VALID_NMI_PENDING |
5431 | | KVM_VCPUEVENT_VALID_SHADOW |
5432 | | KVM_VCPUEVENT_VALID_SMM); |
5433 | if (vcpu->kvm->arch.exception_payload_enabled) |
5434 | events->flags |= KVM_VCPUEVENT_VALID_PAYLOAD; |
5435 | if (vcpu->kvm->arch.triple_fault_event) { |
5436 | events->triple_fault.pending = kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
5437 | events->flags |= KVM_VCPUEVENT_VALID_TRIPLE_FAULT; |
5438 | } |
5439 | } |
5440 | |
5441 | static int kvm_vcpu_ioctl_x86_set_vcpu_events(struct kvm_vcpu *vcpu, |
5442 | struct kvm_vcpu_events *events) |
5443 | { |
5444 | if (events->flags & ~(KVM_VCPUEVENT_VALID_NMI_PENDING |
5445 | | KVM_VCPUEVENT_VALID_SIPI_VECTOR |
5446 | | KVM_VCPUEVENT_VALID_SHADOW |
5447 | | KVM_VCPUEVENT_VALID_SMM |
5448 | | KVM_VCPUEVENT_VALID_PAYLOAD |
5449 | | KVM_VCPUEVENT_VALID_TRIPLE_FAULT)) |
5450 | return -EINVAL; |
5451 | |
5452 | if (events->flags & KVM_VCPUEVENT_VALID_PAYLOAD) { |
5453 | if (!vcpu->kvm->arch.exception_payload_enabled) |
5454 | return -EINVAL; |
5455 | if (events->exception.pending) |
5456 | events->exception.injected = 0; |
5457 | else |
5458 | events->exception_has_payload = 0; |
5459 | } else { |
5460 | events->exception.pending = 0; |
5461 | events->exception_has_payload = 0; |
5462 | } |
5463 | |
5464 | if ((events->exception.injected || events->exception.pending) && |
5465 | (events->exception.nr > 31 || events->exception.nr == NMI_VECTOR)) |
5466 | return -EINVAL; |
5467 | |
5468 | /* INITs are latched while in SMM */ |
5469 | if (events->flags & KVM_VCPUEVENT_VALID_SMM && |
5470 | (events->smi.smm || events->smi.pending) && |
5471 | vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) |
5472 | return -EINVAL; |
5473 | |
5474 | process_nmi(vcpu); |
5475 | |
5476 | /* |
5477 | * Flag that userspace is stuffing an exception, the next KVM_RUN will |
5478 | * morph the exception to a VM-Exit if appropriate. Do this only for |
5479 | * pending exceptions, already-injected exceptions are not subject to |
	 * interception. Note, userspace that conflates pending and injected
5481 | * is hosed, and will incorrectly convert an injected exception into a |
5482 | * pending exception, which in turn may cause a spurious VM-Exit. |
5483 | */ |
5484 | vcpu->arch.exception_from_userspace = events->exception.pending; |
5485 | |
5486 | vcpu->arch.exception_vmexit.pending = false; |
5487 | |
5488 | vcpu->arch.exception.injected = events->exception.injected; |
5489 | vcpu->arch.exception.pending = events->exception.pending; |
5490 | vcpu->arch.exception.vector = events->exception.nr; |
5491 | vcpu->arch.exception.has_error_code = events->exception.has_error_code; |
5492 | vcpu->arch.exception.error_code = events->exception.error_code; |
5493 | vcpu->arch.exception.has_payload = events->exception_has_payload; |
5494 | vcpu->arch.exception.payload = events->exception_payload; |
5495 | |
5496 | vcpu->arch.interrupt.injected = events->interrupt.injected; |
5497 | vcpu->arch.interrupt.nr = events->interrupt.nr; |
5498 | vcpu->arch.interrupt.soft = events->interrupt.soft; |
5499 | if (events->flags & KVM_VCPUEVENT_VALID_SHADOW) |
5500 | static_call(kvm_x86_set_interrupt_shadow)(vcpu, |
5501 | events->interrupt.shadow); |
5502 | |
5503 | vcpu->arch.nmi_injected = events->nmi.injected; |
5504 | if (events->flags & KVM_VCPUEVENT_VALID_NMI_PENDING) { |
5505 | vcpu->arch.nmi_pending = 0; |
		atomic_set(&vcpu->arch.nmi_queued, events->nmi.pending);
5507 | if (events->nmi.pending) |
5508 | kvm_make_request(KVM_REQ_NMI, vcpu); |
5509 | } |
5510 | static_call(kvm_x86_set_nmi_mask)(vcpu, events->nmi.masked); |
5511 | |
5512 | if (events->flags & KVM_VCPUEVENT_VALID_SIPI_VECTOR && |
5513 | lapic_in_kernel(vcpu)) |
5514 | vcpu->arch.apic->sipi_vector = events->sipi_vector; |
5515 | |
5516 | if (events->flags & KVM_VCPUEVENT_VALID_SMM) { |
5517 | #ifdef CONFIG_KVM_SMM |
5518 | if (!!(vcpu->arch.hflags & HF_SMM_MASK) != events->smi.smm) { |
5519 | kvm_leave_nested(vcpu); |
			kvm_smm_changed(vcpu, events->smi.smm);
5521 | } |
5522 | |
5523 | vcpu->arch.smi_pending = events->smi.pending; |
5524 | |
5525 | if (events->smi.smm) { |
5526 | if (events->smi.smm_inside_nmi) |
5527 | vcpu->arch.hflags |= HF_SMM_INSIDE_NMI_MASK; |
5528 | else |
5529 | vcpu->arch.hflags &= ~HF_SMM_INSIDE_NMI_MASK; |
5530 | } |
5531 | |
5532 | #else |
5533 | if (events->smi.smm || events->smi.pending || |
5534 | events->smi.smm_inside_nmi) |
5535 | return -EINVAL; |
5536 | #endif |
5537 | |
5538 | if (lapic_in_kernel(vcpu)) { |
5539 | if (events->smi.latched_init) |
				set_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
			else
				clear_bit(KVM_APIC_INIT, &vcpu->arch.apic->pending_events);
5543 | } |
5544 | } |
5545 | |
5546 | if (events->flags & KVM_VCPUEVENT_VALID_TRIPLE_FAULT) { |
5547 | if (!vcpu->kvm->arch.triple_fault_event) |
5548 | return -EINVAL; |
5549 | if (events->triple_fault.pending) |
5550 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
5551 | else |
5552 | kvm_clear_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
5553 | } |
5554 | |
5555 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
5556 | |
5557 | return 0; |
5558 | } |
5559 | |
5560 | static void kvm_vcpu_ioctl_x86_get_debugregs(struct kvm_vcpu *vcpu, |
5561 | struct kvm_debugregs *dbgregs) |
5562 | { |
5563 | unsigned int i; |
5564 | |
5565 | memset(dbgregs, 0, sizeof(*dbgregs)); |
5566 | |
5567 | BUILD_BUG_ON(ARRAY_SIZE(vcpu->arch.db) != ARRAY_SIZE(dbgregs->db)); |
5568 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++) |
5569 | dbgregs->db[i] = vcpu->arch.db[i]; |
5570 | |
5571 | dbgregs->dr6 = vcpu->arch.dr6; |
5572 | dbgregs->dr7 = vcpu->arch.dr7; |
5573 | } |
5574 | |
5575 | static int kvm_vcpu_ioctl_x86_set_debugregs(struct kvm_vcpu *vcpu, |
5576 | struct kvm_debugregs *dbgregs) |
5577 | { |
5578 | unsigned int i; |
5579 | |
5580 | if (dbgregs->flags) |
5581 | return -EINVAL; |
5582 | |
	if (!kvm_dr6_valid(dbgregs->dr6))
		return -EINVAL;
	if (!kvm_dr7_valid(dbgregs->dr7))
5586 | return -EINVAL; |
5587 | |
5588 | for (i = 0; i < ARRAY_SIZE(vcpu->arch.db); i++) |
5589 | vcpu->arch.db[i] = dbgregs->db[i]; |
5590 | |
5591 | kvm_update_dr0123(vcpu); |
5592 | vcpu->arch.dr6 = dbgregs->dr6; |
5593 | vcpu->arch.dr7 = dbgregs->dr7; |
5594 | kvm_update_dr7(vcpu); |
5595 | |
5596 | return 0; |
5597 | } |
5598 | |
5599 | |
5600 | static void kvm_vcpu_ioctl_x86_get_xsave2(struct kvm_vcpu *vcpu, |
5601 | u8 *state, unsigned int size) |
5602 | { |
5603 | /* |
5604 | * Only copy state for features that are enabled for the guest. The |
5605 | * state itself isn't problematic, but setting bits in the header for |
5606 | * features that are supported in *this* host but not exposed to the |
5607 | * guest can result in KVM_SET_XSAVE failing when live migrating to a |
5608 | * compatible host without the features that are NOT exposed to the |
5609 | * guest. |
5610 | * |
5611 | * FP+SSE can always be saved/restored via KVM_{G,S}ET_XSAVE, even if |
	 * XSAVE/XCR0 are not exposed to the guest, and even if XSAVE isn't
5613 | * supported by the host. |
5614 | */ |
5615 | u64 supported_xcr0 = vcpu->arch.guest_supported_xcr0 | |
5616 | XFEATURE_MASK_FPSSE; |
5617 | |
	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5619 | return; |
5620 | |
	fpu_copy_guest_fpstate_to_uabi(&vcpu->arch.guest_fpu, state, size,
				       supported_xcr0, vcpu->arch.pkru);
5623 | } |
5624 | |
5625 | static void kvm_vcpu_ioctl_x86_get_xsave(struct kvm_vcpu *vcpu, |
5626 | struct kvm_xsave *guest_xsave) |
5627 | { |
	kvm_vcpu_ioctl_x86_get_xsave2(vcpu, (void *)guest_xsave->region,
				      sizeof(guest_xsave->region));
5630 | } |
5631 | |
5632 | static int kvm_vcpu_ioctl_x86_set_xsave(struct kvm_vcpu *vcpu, |
5633 | struct kvm_xsave *guest_xsave) |
5634 | { |
	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
5636 | return 0; |
5637 | |
	return fpu_copy_uabi_to_guest_fpstate(&vcpu->arch.guest_fpu,
					      guest_xsave->region,
					      kvm_caps.supported_xcr0,
					      &vcpu->arch.pkru);
5642 | } |
5643 | |
5644 | static void kvm_vcpu_ioctl_x86_get_xcrs(struct kvm_vcpu *vcpu, |
5645 | struct kvm_xcrs *guest_xcrs) |
5646 | { |
5647 | if (!boot_cpu_has(X86_FEATURE_XSAVE)) { |
5648 | guest_xcrs->nr_xcrs = 0; |
5649 | return; |
5650 | } |
5651 | |
5652 | guest_xcrs->nr_xcrs = 1; |
5653 | guest_xcrs->flags = 0; |
5654 | guest_xcrs->xcrs[0].xcr = XCR_XFEATURE_ENABLED_MASK; |
5655 | guest_xcrs->xcrs[0].value = vcpu->arch.xcr0; |
5656 | } |
5657 | |
5658 | static int kvm_vcpu_ioctl_x86_set_xcrs(struct kvm_vcpu *vcpu, |
5659 | struct kvm_xcrs *guest_xcrs) |
5660 | { |
5661 | int i, r = 0; |
5662 | |
5663 | if (!boot_cpu_has(X86_FEATURE_XSAVE)) |
5664 | return -EINVAL; |
5665 | |
5666 | if (guest_xcrs->nr_xcrs > KVM_MAX_XCRS || guest_xcrs->flags) |
5667 | return -EINVAL; |
5668 | |
5669 | for (i = 0; i < guest_xcrs->nr_xcrs; i++) |
5670 | /* Only support XCR0 currently */ |
5671 | if (guest_xcrs->xcrs[i].xcr == XCR_XFEATURE_ENABLED_MASK) { |
			r = __kvm_set_xcr(vcpu, XCR_XFEATURE_ENABLED_MASK,
					  guest_xcrs->xcrs[i].value);
5674 | break; |
5675 | } |
5676 | if (r) |
5677 | r = -EINVAL; |
5678 | return r; |
5679 | } |
5680 | |
5681 | /* |
5682 | * kvm_set_guest_paused() indicates to the guest kernel that it has been |
5683 | * stopped by the hypervisor. This function will be called from the host only. |
5684 | * EINVAL is returned when the host attempts to set the flag for a guest that |
5685 | * does not support pv clocks. |
5686 | */ |
5687 | static int kvm_set_guest_paused(struct kvm_vcpu *vcpu) |
5688 | { |
5689 | if (!vcpu->arch.pv_time.active) |
5690 | return -EINVAL; |
5691 | vcpu->arch.pvclock_set_guest_stopped_request = true; |
5692 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
5693 | return 0; |
5694 | } |
5695 | |
5696 | static int kvm_arch_tsc_has_attr(struct kvm_vcpu *vcpu, |
5697 | struct kvm_device_attr *attr) |
5698 | { |
5699 | int r; |
5700 | |
5701 | switch (attr->attr) { |
5702 | case KVM_VCPU_TSC_OFFSET: |
5703 | r = 0; |
5704 | break; |
5705 | default: |
5706 | r = -ENXIO; |
5707 | } |
5708 | |
5709 | return r; |
5710 | } |
5711 | |
5712 | static int kvm_arch_tsc_get_attr(struct kvm_vcpu *vcpu, |
5713 | struct kvm_device_attr *attr) |
5714 | { |
5715 | u64 __user *uaddr = kvm_get_attr_addr(attr); |
5716 | int r; |
5717 | |
	if (IS_ERR(uaddr))
		return PTR_ERR(uaddr);
5720 | |
5721 | switch (attr->attr) { |
5722 | case KVM_VCPU_TSC_OFFSET: |
5723 | r = -EFAULT; |
5724 | if (put_user(vcpu->arch.l1_tsc_offset, uaddr)) |
5725 | break; |
5726 | r = 0; |
5727 | break; |
5728 | default: |
5729 | r = -ENXIO; |
5730 | } |
5731 | |
5732 | return r; |
5733 | } |
5734 | |
5735 | static int kvm_arch_tsc_set_attr(struct kvm_vcpu *vcpu, |
5736 | struct kvm_device_attr *attr) |
5737 | { |
5738 | u64 __user *uaddr = kvm_get_attr_addr(attr); |
5739 | struct kvm *kvm = vcpu->kvm; |
5740 | int r; |
5741 | |
	if (IS_ERR(uaddr))
		return PTR_ERR(uaddr);
5744 | |
5745 | switch (attr->attr) { |
5746 | case KVM_VCPU_TSC_OFFSET: { |
5747 | u64 offset, tsc, ns; |
5748 | unsigned long flags; |
5749 | bool matched; |
5750 | |
5751 | r = -EFAULT; |
5752 | if (get_user(offset, uaddr)) |
5753 | break; |
5754 | |
5755 | raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); |
5756 | |
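		/*
		 * A write that repeats the last sync for the same virtual TSC
		 * frequency is treated as an attempt to synchronize this vCPU
		 * with the other vCPUs' TSCs.
		 */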
5757 | matched = (vcpu->arch.virtual_tsc_khz && |
5758 | kvm->arch.last_tsc_khz == vcpu->arch.virtual_tsc_khz && |
5759 | kvm->arch.last_tsc_offset == offset); |
5760 | |
		tsc = kvm_scale_tsc(rdtsc(), vcpu->arch.l1_tsc_scaling_ratio) + offset;
5762 | ns = get_kvmclock_base_ns(); |
5763 | |
5764 | kvm->arch.user_set_tsc = true; |
5765 | __kvm_synchronize_tsc(vcpu, offset, tsc, ns, matched); |
5766 | raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); |
5767 | |
5768 | r = 0; |
5769 | break; |
5770 | } |
5771 | default: |
5772 | r = -ENXIO; |
5773 | } |
5774 | |
5775 | return r; |
5776 | } |
5777 | |
5778 | static int kvm_vcpu_ioctl_device_attr(struct kvm_vcpu *vcpu, |
5779 | unsigned int ioctl, |
5780 | void __user *argp) |
5781 | { |
5782 | struct kvm_device_attr attr; |
5783 | int r; |
5784 | |
	if (copy_from_user(&attr, argp, sizeof(attr)))
5786 | return -EFAULT; |
5787 | |
5788 | if (attr.group != KVM_VCPU_TSC_CTRL) |
5789 | return -ENXIO; |
5790 | |
5791 | switch (ioctl) { |
5792 | case KVM_HAS_DEVICE_ATTR: |
		r = kvm_arch_tsc_has_attr(vcpu, &attr);
		break;
	case KVM_GET_DEVICE_ATTR:
		r = kvm_arch_tsc_get_attr(vcpu, &attr);
		break;
	case KVM_SET_DEVICE_ATTR:
		r = kvm_arch_tsc_set_attr(vcpu, &attr);
5800 | break; |
5801 | } |
5802 | |
5803 | return r; |
5804 | } |
5805 | |
5806 | static int kvm_vcpu_ioctl_enable_cap(struct kvm_vcpu *vcpu, |
5807 | struct kvm_enable_cap *cap) |
5808 | { |
5809 | if (cap->flags) |
5810 | return -EINVAL; |
5811 | |
5812 | switch (cap->cap) { |
5813 | #ifdef CONFIG_KVM_HYPERV |
5814 | case KVM_CAP_HYPERV_SYNIC2: |
5815 | if (cap->args[0]) |
5816 | return -EINVAL; |
5817 | fallthrough; |
5818 | |
5819 | case KVM_CAP_HYPERV_SYNIC: |
		if (!irqchip_in_kernel(vcpu->kvm))
5821 | return -EINVAL; |
		return kvm_hv_activate_synic(vcpu, cap->cap ==
					     KVM_CAP_HYPERV_SYNIC2);
5824 | case KVM_CAP_HYPERV_ENLIGHTENED_VMCS: |
5825 | { |
5826 | int r; |
5827 | uint16_t vmcs_version; |
5828 | void __user *user_ptr; |
5829 | |
5830 | if (!kvm_x86_ops.nested_ops->enable_evmcs) |
5831 | return -ENOTTY; |
5832 | r = kvm_x86_ops.nested_ops->enable_evmcs(vcpu, &vmcs_version); |
5833 | if (!r) { |
5834 | user_ptr = (void __user *)(uintptr_t)cap->args[0]; |
			if (copy_to_user(user_ptr, &vmcs_version,
					 sizeof(vmcs_version)))
5837 | r = -EFAULT; |
5838 | } |
5839 | return r; |
5840 | } |
5841 | case KVM_CAP_HYPERV_DIRECT_TLBFLUSH: |
5842 | if (!kvm_x86_ops.enable_l2_tlb_flush) |
5843 | return -ENOTTY; |
5844 | |
5845 | return static_call(kvm_x86_enable_l2_tlb_flush)(vcpu); |
5846 | |
5847 | case KVM_CAP_HYPERV_ENFORCE_CPUID: |
		return kvm_hv_set_enforce_cpuid(vcpu, cap->args[0]);
5849 | #endif |
5850 | |
5851 | case KVM_CAP_ENFORCE_PV_FEATURE_CPUID: |
5852 | vcpu->arch.pv_cpuid.enforce = cap->args[0]; |
5853 | if (vcpu->arch.pv_cpuid.enforce) |
5854 | kvm_update_pv_runtime(vcpu); |
5855 | |
5856 | return 0; |
5857 | default: |
5858 | return -EINVAL; |
5859 | } |
5860 | } |
5861 | |
5862 | long kvm_arch_vcpu_ioctl(struct file *filp, |
5863 | unsigned int ioctl, unsigned long arg) |
5864 | { |
5865 | struct kvm_vcpu *vcpu = filp->private_data; |
5866 | void __user *argp = (void __user *)arg; |
5867 | int r; |
5868 | union { |
5869 | struct kvm_sregs2 *sregs2; |
5870 | struct kvm_lapic_state *lapic; |
5871 | struct kvm_xsave *xsave; |
5872 | struct kvm_xcrs *xcrs; |
5873 | void *buffer; |
5874 | } u; |
5875 | |
5876 | vcpu_load(vcpu); |
5877 | |
5878 | u.buffer = NULL; |
5879 | switch (ioctl) { |
5880 | case KVM_GET_LAPIC: { |
5881 | r = -EINVAL; |
5882 | if (!lapic_in_kernel(vcpu)) |
5883 | goto out; |
		u.lapic = kzalloc(sizeof(struct kvm_lapic_state),
				  GFP_KERNEL_ACCOUNT);
5886 | |
5887 | r = -ENOMEM; |
5888 | if (!u.lapic) |
5889 | goto out; |
		r = kvm_vcpu_ioctl_get_lapic(vcpu, u.lapic);
5891 | if (r) |
5892 | goto out; |
5893 | r = -EFAULT; |
		if (copy_to_user(argp, u.lapic, sizeof(struct kvm_lapic_state)))
5895 | goto out; |
5896 | r = 0; |
5897 | break; |
5898 | } |
5899 | case KVM_SET_LAPIC: { |
5900 | r = -EINVAL; |
5901 | if (!lapic_in_kernel(vcpu)) |
5902 | goto out; |
5903 | u.lapic = memdup_user(argp, sizeof(*u.lapic)); |
		if (IS_ERR(u.lapic)) {
			r = PTR_ERR(u.lapic);
5906 | goto out_nofree; |
5907 | } |
5908 | |
		r = kvm_vcpu_ioctl_set_lapic(vcpu, u.lapic);
5910 | break; |
5911 | } |
5912 | case KVM_INTERRUPT: { |
5913 | struct kvm_interrupt irq; |
5914 | |
5915 | r = -EFAULT; |
		if (copy_from_user(&irq, argp, sizeof(irq)))
			goto out;
		r = kvm_vcpu_ioctl_interrupt(vcpu, &irq);
5919 | break; |
5920 | } |
5921 | case KVM_NMI: { |
5922 | r = kvm_vcpu_ioctl_nmi(vcpu); |
5923 | break; |
5924 | } |
5925 | case KVM_SMI: { |
5926 | r = kvm_inject_smi(vcpu); |
5927 | break; |
5928 | } |
5929 | case KVM_SET_CPUID: { |
5930 | struct kvm_cpuid __user *cpuid_arg = argp; |
5931 | struct kvm_cpuid cpuid; |
5932 | |
5933 | r = -EFAULT; |
		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
			goto out;
		r = kvm_vcpu_ioctl_set_cpuid(vcpu, &cpuid, cpuid_arg->entries);
5937 | break; |
5938 | } |
5939 | case KVM_SET_CPUID2: { |
5940 | struct kvm_cpuid2 __user *cpuid_arg = argp; |
5941 | struct kvm_cpuid2 cpuid; |
5942 | |
5943 | r = -EFAULT; |
		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
			goto out;
		r = kvm_vcpu_ioctl_set_cpuid2(vcpu, &cpuid,
					      cpuid_arg->entries);
5948 | break; |
5949 | } |
5950 | case KVM_GET_CPUID2: { |
5951 | struct kvm_cpuid2 __user *cpuid_arg = argp; |
5952 | struct kvm_cpuid2 cpuid; |
5953 | |
5954 | r = -EFAULT; |
		if (copy_from_user(&cpuid, cpuid_arg, sizeof(cpuid)))
			goto out;
		r = kvm_vcpu_ioctl_get_cpuid2(vcpu, &cpuid,
					      cpuid_arg->entries);
5959 | if (r) |
5960 | goto out; |
5961 | r = -EFAULT; |
		if (copy_to_user(cpuid_arg, &cpuid, sizeof(cpuid)))
5963 | goto out; |
5964 | r = 0; |
5965 | break; |
5966 | } |
5967 | case KVM_GET_MSRS: { |
		int idx = srcu_read_lock(&vcpu->kvm->srcu);
		r = msr_io(vcpu, argp, do_get_msr, 1);
		srcu_read_unlock(&vcpu->kvm->srcu, idx);
5971 | break; |
5972 | } |
5973 | case KVM_SET_MSRS: { |
		int idx = srcu_read_lock(&vcpu->kvm->srcu);
		r = msr_io(vcpu, argp, do_set_msr, 0);
		srcu_read_unlock(&vcpu->kvm->srcu, idx);
5977 | break; |
5978 | } |
5979 | case KVM_TPR_ACCESS_REPORTING: { |
5980 | struct kvm_tpr_access_ctl tac; |
5981 | |
5982 | r = -EFAULT; |
		if (copy_from_user(&tac, argp, sizeof(tac)))
			goto out;
		r = vcpu_ioctl_tpr_access_reporting(vcpu, &tac);
5986 | if (r) |
5987 | goto out; |
5988 | r = -EFAULT; |
		if (copy_to_user(argp, &tac, sizeof(tac)))
5990 | goto out; |
5991 | r = 0; |
5992 | break; |
5993 | }; |
5994 | case KVM_SET_VAPIC_ADDR: { |
5995 | struct kvm_vapic_addr va; |
5996 | int idx; |
5997 | |
5998 | r = -EINVAL; |
5999 | if (!lapic_in_kernel(vcpu)) |
6000 | goto out; |
6001 | r = -EFAULT; |
		if (copy_from_user(&va, argp, sizeof(va)))
			goto out;
		idx = srcu_read_lock(&vcpu->kvm->srcu);
		r = kvm_lapic_set_vapic_addr(vcpu, va.vapic_addr);
		srcu_read_unlock(&vcpu->kvm->srcu, idx);
6007 | break; |
6008 | } |
6009 | case KVM_X86_SETUP_MCE: { |
6010 | u64 mcg_cap; |
6011 | |
6012 | r = -EFAULT; |
		if (copy_from_user(&mcg_cap, argp, sizeof(mcg_cap)))
6014 | goto out; |
6015 | r = kvm_vcpu_ioctl_x86_setup_mce(vcpu, mcg_cap); |
6016 | break; |
6017 | } |
6018 | case KVM_X86_SET_MCE: { |
6019 | struct kvm_x86_mce mce; |
6020 | |
6021 | r = -EFAULT; |
		if (copy_from_user(&mce, argp, sizeof(mce)))
			goto out;
		r = kvm_vcpu_ioctl_x86_set_mce(vcpu, &mce);
6025 | break; |
6026 | } |
6027 | case KVM_GET_VCPU_EVENTS: { |
6028 | struct kvm_vcpu_events events; |
6029 | |
		kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu, &events);

		r = -EFAULT;
		if (copy_to_user(argp, &events, sizeof(struct kvm_vcpu_events)))
6034 | break; |
6035 | r = 0; |
6036 | break; |
6037 | } |
6038 | case KVM_SET_VCPU_EVENTS: { |
6039 | struct kvm_vcpu_events events; |
6040 | |
6041 | r = -EFAULT; |
		if (copy_from_user(&events, argp, sizeof(struct kvm_vcpu_events)))
6043 | break; |
6044 | |
		r = kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events);
6046 | break; |
6047 | } |
6048 | case KVM_GET_DEBUGREGS: { |
6049 | struct kvm_debugregs dbgregs; |
6050 | |
		kvm_vcpu_ioctl_x86_get_debugregs(vcpu, &dbgregs);

		r = -EFAULT;
		if (copy_to_user(argp, &dbgregs,
				 sizeof(struct kvm_debugregs)))
6056 | break; |
6057 | r = 0; |
6058 | break; |
6059 | } |
6060 | case KVM_SET_DEBUGREGS: { |
6061 | struct kvm_debugregs dbgregs; |
6062 | |
6063 | r = -EFAULT; |
		if (copy_from_user(&dbgregs, argp,
				   sizeof(struct kvm_debugregs)))
			break;

		r = kvm_vcpu_ioctl_x86_set_debugregs(vcpu, &dbgregs);
6069 | break; |
6070 | } |
6071 | case KVM_GET_XSAVE: { |
6072 | r = -EINVAL; |
6073 | if (vcpu->arch.guest_fpu.uabi_size > sizeof(struct kvm_xsave)) |
6074 | break; |
6075 | |
		u.xsave = kzalloc(sizeof(struct kvm_xsave), GFP_KERNEL_ACCOUNT);
6077 | r = -ENOMEM; |
6078 | if (!u.xsave) |
6079 | break; |
6080 | |
		kvm_vcpu_ioctl_x86_get_xsave(vcpu, u.xsave);

		r = -EFAULT;
		if (copy_to_user(argp, u.xsave, sizeof(struct kvm_xsave)))
6085 | break; |
6086 | r = 0; |
6087 | break; |
6088 | } |
6089 | case KVM_SET_XSAVE: { |
6090 | int size = vcpu->arch.guest_fpu.uabi_size; |
6091 | |
6092 | u.xsave = memdup_user(argp, size); |
		if (IS_ERR(u.xsave)) {
			r = PTR_ERR(u.xsave);
6095 | goto out_nofree; |
6096 | } |
6097 | |
		r = kvm_vcpu_ioctl_x86_set_xsave(vcpu, u.xsave);
6099 | break; |
6100 | } |
6101 | |
6102 | case KVM_GET_XSAVE2: { |
6103 | int size = vcpu->arch.guest_fpu.uabi_size; |
6104 | |
6105 | u.xsave = kzalloc(size, GFP_KERNEL_ACCOUNT); |
6106 | r = -ENOMEM; |
6107 | if (!u.xsave) |
6108 | break; |
6109 | |
		kvm_vcpu_ioctl_x86_get_xsave2(vcpu, u.buffer, size);

		r = -EFAULT;
		if (copy_to_user(argp, u.xsave, size))
6114 | break; |
6115 | |
6116 | r = 0; |
6117 | break; |
6118 | } |
6119 | |
6120 | case KVM_GET_XCRS: { |
		u.xcrs = kzalloc(sizeof(struct kvm_xcrs), GFP_KERNEL_ACCOUNT);
6122 | r = -ENOMEM; |
6123 | if (!u.xcrs) |
6124 | break; |
6125 | |
		kvm_vcpu_ioctl_x86_get_xcrs(vcpu, u.xcrs);

		r = -EFAULT;
		if (copy_to_user(argp, u.xcrs,
				 sizeof(struct kvm_xcrs)))
6131 | break; |
6132 | r = 0; |
6133 | break; |
6134 | } |
6135 | case KVM_SET_XCRS: { |
6136 | u.xcrs = memdup_user(argp, sizeof(*u.xcrs)); |
		if (IS_ERR(u.xcrs)) {
			r = PTR_ERR(u.xcrs);
6139 | goto out_nofree; |
6140 | } |
6141 | |
		r = kvm_vcpu_ioctl_x86_set_xcrs(vcpu, u.xcrs);
6143 | break; |
6144 | } |
6145 | case KVM_SET_TSC_KHZ: { |
6146 | u32 user_tsc_khz; |
6147 | |
6148 | r = -EINVAL; |
6149 | user_tsc_khz = (u32)arg; |
6150 | |
6151 | if (kvm_caps.has_tsc_control && |
6152 | user_tsc_khz >= kvm_caps.max_guest_tsc_khz) |
6153 | goto out; |
6154 | |
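		/* A value of 0 selects the host's TSC frequency. */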
6155 | if (user_tsc_khz == 0) |
6156 | user_tsc_khz = tsc_khz; |
6157 | |
6158 | if (!kvm_set_tsc_khz(vcpu, user_tsc_khz)) |
6159 | r = 0; |
6160 | |
6161 | goto out; |
6162 | } |
6163 | case KVM_GET_TSC_KHZ: { |
6164 | r = vcpu->arch.virtual_tsc_khz; |
6165 | goto out; |
6166 | } |
6167 | case KVM_KVMCLOCK_CTRL: { |
6168 | r = kvm_set_guest_paused(vcpu); |
6169 | goto out; |
6170 | } |
6171 | case KVM_ENABLE_CAP: { |
6172 | struct kvm_enable_cap cap; |
6173 | |
6174 | r = -EFAULT; |
		if (copy_from_user(&cap, argp, sizeof(cap)))
			goto out;
		r = kvm_vcpu_ioctl_enable_cap(vcpu, &cap);
6178 | break; |
6179 | } |
6180 | case KVM_GET_NESTED_STATE: { |
6181 | struct kvm_nested_state __user *user_kvm_nested_state = argp; |
6182 | u32 user_data_size; |
6183 | |
6184 | r = -EINVAL; |
6185 | if (!kvm_x86_ops.nested_ops->get_state) |
6186 | break; |
6187 | |
6188 | BUILD_BUG_ON(sizeof(user_data_size) != sizeof(user_kvm_nested_state->size)); |
6189 | r = -EFAULT; |
6190 | if (get_user(user_data_size, &user_kvm_nested_state->size)) |
6191 | break; |
6192 | |
6193 | r = kvm_x86_ops.nested_ops->get_state(vcpu, user_kvm_nested_state, |
6194 | user_data_size); |
6195 | if (r < 0) |
6196 | break; |
6197 | |
6198 | if (r > user_data_size) { |
6199 | if (put_user(r, &user_kvm_nested_state->size)) |
6200 | r = -EFAULT; |
6201 | else |
6202 | r = -E2BIG; |
6203 | break; |
6204 | } |
6205 | |
6206 | r = 0; |
6207 | break; |
6208 | } |
6209 | case KVM_SET_NESTED_STATE: { |
6210 | struct kvm_nested_state __user *user_kvm_nested_state = argp; |
6211 | struct kvm_nested_state kvm_state; |
6212 | int idx; |
6213 | |
6214 | r = -EINVAL; |
6215 | if (!kvm_x86_ops.nested_ops->set_state) |
6216 | break; |
6217 | |
6218 | r = -EFAULT; |
		if (copy_from_user(&kvm_state, user_kvm_nested_state, sizeof(kvm_state)))
6220 | break; |
6221 | |
6222 | r = -EINVAL; |
6223 | if (kvm_state.size < sizeof(kvm_state)) |
6224 | break; |
6225 | |
6226 | if (kvm_state.flags & |
6227 | ~(KVM_STATE_NESTED_RUN_PENDING | KVM_STATE_NESTED_GUEST_MODE |
6228 | | KVM_STATE_NESTED_EVMCS | KVM_STATE_NESTED_MTF_PENDING |
6229 | | KVM_STATE_NESTED_GIF_SET)) |
6230 | break; |
6231 | |
6232 | /* nested_run_pending implies guest_mode. */ |
6233 | if ((kvm_state.flags & KVM_STATE_NESTED_RUN_PENDING) |
6234 | && !(kvm_state.flags & KVM_STATE_NESTED_GUEST_MODE)) |
6235 | break; |
6236 | |
		idx = srcu_read_lock(&vcpu->kvm->srcu);
		r = kvm_x86_ops.nested_ops->set_state(vcpu, user_kvm_nested_state, &kvm_state);
		srcu_read_unlock(&vcpu->kvm->srcu, idx);
6240 | break; |
6241 | } |
6242 | #ifdef CONFIG_KVM_HYPERV |
6243 | case KVM_GET_SUPPORTED_HV_CPUID: |
		r = kvm_ioctl_get_supported_hv_cpuid(vcpu, argp);
6245 | break; |
6246 | #endif |
6247 | #ifdef CONFIG_KVM_XEN |
6248 | case KVM_XEN_VCPU_GET_ATTR: { |
6249 | struct kvm_xen_vcpu_attr xva; |
6250 | |
6251 | r = -EFAULT; |
		if (copy_from_user(&xva, argp, sizeof(xva)))
			goto out;
		r = kvm_xen_vcpu_get_attr(vcpu, &xva);
		if (!r && copy_to_user(argp, &xva, sizeof(xva)))
6256 | r = -EFAULT; |
6257 | break; |
6258 | } |
6259 | case KVM_XEN_VCPU_SET_ATTR: { |
6260 | struct kvm_xen_vcpu_attr xva; |
6261 | |
6262 | r = -EFAULT; |
		if (copy_from_user(&xva, argp, sizeof(xva)))
			goto out;
		r = kvm_xen_vcpu_set_attr(vcpu, &xva);
6266 | break; |
6267 | } |
6268 | #endif |
6269 | case KVM_GET_SREGS2: { |
		u.sregs2 = kzalloc(sizeof(struct kvm_sregs2), GFP_KERNEL);
		r = -ENOMEM;
		if (!u.sregs2)
			goto out;
		__get_sregs2(vcpu, u.sregs2);
		r = -EFAULT;
		if (copy_to_user(argp, u.sregs2, sizeof(struct kvm_sregs2)))
6277 | goto out; |
6278 | r = 0; |
6279 | break; |
6280 | } |
6281 | case KVM_SET_SREGS2: { |
6282 | u.sregs2 = memdup_user(argp, sizeof(struct kvm_sregs2)); |
		if (IS_ERR(u.sregs2)) {
			r = PTR_ERR(u.sregs2);
6285 | u.sregs2 = NULL; |
6286 | goto out; |
6287 | } |
		r = __set_sregs2(vcpu, u.sregs2);
6289 | break; |
6290 | } |
6291 | case KVM_HAS_DEVICE_ATTR: |
6292 | case KVM_GET_DEVICE_ATTR: |
6293 | case KVM_SET_DEVICE_ATTR: |
6294 | r = kvm_vcpu_ioctl_device_attr(vcpu, ioctl, argp); |
6295 | break; |
6296 | default: |
6297 | r = -EINVAL; |
6298 | } |
6299 | out: |
	kfree(u.buffer);
6301 | out_nofree: |
6302 | vcpu_put(vcpu); |
6303 | return r; |
6304 | } |
6305 | |
6306 | vm_fault_t kvm_arch_vcpu_fault(struct kvm_vcpu *vcpu, struct vm_fault *vmf) |
6307 | { |
6308 | return VM_FAULT_SIGBUS; |
6309 | } |
6310 | |
6311 | static int kvm_vm_ioctl_set_tss_addr(struct kvm *kvm, unsigned long addr) |
6312 | { |
6313 | int ret; |
6314 | |
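	/*
	 * The TSS spans three pages; reject addresses whose final page would
	 * wrap past the 4GiB boundary.
	 */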
6315 | if (addr > (unsigned int)(-3 * PAGE_SIZE)) |
6316 | return -EINVAL; |
6317 | ret = static_call(kvm_x86_set_tss_addr)(kvm, addr); |
6318 | return ret; |
6319 | } |
6320 | |
6321 | static int kvm_vm_ioctl_set_identity_map_addr(struct kvm *kvm, |
6322 | u64 ident_addr) |
6323 | { |
6324 | return static_call(kvm_x86_set_identity_map_addr)(kvm, ident_addr); |
6325 | } |
6326 | |
6327 | static int kvm_vm_ioctl_set_nr_mmu_pages(struct kvm *kvm, |
6328 | unsigned long kvm_nr_mmu_pages) |
6329 | { |
6330 | if (kvm_nr_mmu_pages < KVM_MIN_ALLOC_MMU_PAGES) |
6331 | return -EINVAL; |
6332 | |
6333 | mutex_lock(&kvm->slots_lock); |
6334 | |
6335 | kvm_mmu_change_mmu_pages(kvm, kvm_nr_mmu_pages); |
6336 | kvm->arch.n_requested_mmu_pages = kvm_nr_mmu_pages; |
6337 | |
	mutex_unlock(&kvm->slots_lock);
6339 | return 0; |
6340 | } |
6341 | |
6342 | static int kvm_vm_ioctl_get_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) |
6343 | { |
6344 | struct kvm_pic *pic = kvm->arch.vpic; |
6345 | int r; |
6346 | |
6347 | r = 0; |
6348 | switch (chip->chip_id) { |
6349 | case KVM_IRQCHIP_PIC_MASTER: |
6350 | memcpy(&chip->chip.pic, &pic->pics[0], |
6351 | sizeof(struct kvm_pic_state)); |
6352 | break; |
6353 | case KVM_IRQCHIP_PIC_SLAVE: |
6354 | memcpy(&chip->chip.pic, &pic->pics[1], |
6355 | sizeof(struct kvm_pic_state)); |
6356 | break; |
6357 | case KVM_IRQCHIP_IOAPIC: |
		kvm_get_ioapic(kvm, &chip->chip.ioapic);
6359 | break; |
6360 | default: |
6361 | r = -EINVAL; |
6362 | break; |
6363 | } |
6364 | return r; |
6365 | } |
6366 | |
6367 | static int kvm_vm_ioctl_set_irqchip(struct kvm *kvm, struct kvm_irqchip *chip) |
6368 | { |
6369 | struct kvm_pic *pic = kvm->arch.vpic; |
6370 | int r; |
6371 | |
6372 | r = 0; |
6373 | switch (chip->chip_id) { |
6374 | case KVM_IRQCHIP_PIC_MASTER: |
		spin_lock(&pic->lock);
		memcpy(&pic->pics[0], &chip->chip.pic,
			sizeof(struct kvm_pic_state));
		spin_unlock(&pic->lock);
6379 | break; |
6380 | case KVM_IRQCHIP_PIC_SLAVE: |
		spin_lock(&pic->lock);
		memcpy(&pic->pics[1], &chip->chip.pic,
			sizeof(struct kvm_pic_state));
		spin_unlock(&pic->lock);
6385 | break; |
6386 | case KVM_IRQCHIP_IOAPIC: |
		kvm_set_ioapic(kvm, &chip->chip.ioapic);
6388 | break; |
6389 | default: |
6390 | r = -EINVAL; |
6391 | break; |
6392 | } |
	kvm_pic_update_irq(pic);
6394 | return r; |
6395 | } |
6396 | |
6397 | static int kvm_vm_ioctl_get_pit(struct kvm *kvm, struct kvm_pit_state *ps) |
6398 | { |
6399 | struct kvm_kpit_state *kps = &kvm->arch.vpit->pit_state; |
6400 | |
6401 | BUILD_BUG_ON(sizeof(*ps) != sizeof(kps->channels)); |
6402 | |
6403 | mutex_lock(&kps->lock); |
6404 | memcpy(ps, &kps->channels, sizeof(*ps)); |
	mutex_unlock(&kps->lock);
6406 | return 0; |
6407 | } |
6408 | |
6409 | static int kvm_vm_ioctl_set_pit(struct kvm *kvm, struct kvm_pit_state *ps) |
6410 | { |
6411 | int i; |
6412 | struct kvm_pit *pit = kvm->arch.vpit; |
6413 | |
6414 | mutex_lock(&pit->pit_state.lock); |
6415 | memcpy(&pit->pit_state.channels, ps, sizeof(*ps)); |
6416 | for (i = 0; i < 3; i++) |
		kvm_pit_load_count(pit, i, ps->channels[i].count, 0);
	mutex_unlock(&pit->pit_state.lock);
6419 | return 0; |
6420 | } |
6421 | |
6422 | static int kvm_vm_ioctl_get_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) |
6423 | { |
6424 | mutex_lock(&kvm->arch.vpit->pit_state.lock); |
6425 | memcpy(ps->channels, &kvm->arch.vpit->pit_state.channels, |
6426 | sizeof(ps->channels)); |
6427 | ps->flags = kvm->arch.vpit->pit_state.flags; |
	mutex_unlock(&kvm->arch.vpit->pit_state.lock);
6429 | memset(&ps->reserved, 0, sizeof(ps->reserved)); |
6430 | return 0; |
6431 | } |
6432 | |
6433 | static int kvm_vm_ioctl_set_pit2(struct kvm *kvm, struct kvm_pit_state2 *ps) |
6434 | { |
6435 | int start = 0; |
6436 | int i; |
6437 | u32 prev_legacy, cur_legacy; |
6438 | struct kvm_pit *pit = kvm->arch.vpit; |
6439 | |
6440 | mutex_lock(&pit->pit_state.lock); |
6441 | prev_legacy = pit->pit_state.flags & KVM_PIT_FLAGS_HPET_LEGACY; |
6442 | cur_legacy = ps->flags & KVM_PIT_FLAGS_HPET_LEGACY; |
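	/* Newly entering HPET legacy mode restarts channel 0's counter. */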
6443 | if (!prev_legacy && cur_legacy) |
6444 | start = 1; |
6445 | memcpy(&pit->pit_state.channels, &ps->channels, |
6446 | sizeof(pit->pit_state.channels)); |
6447 | pit->pit_state.flags = ps->flags; |
6448 | for (i = 0; i < 3; i++) |
		kvm_pit_load_count(pit, i, pit->pit_state.channels[i].count,
				   start && i == 0);
	mutex_unlock(&pit->pit_state.lock);
6452 | return 0; |
6453 | } |
6454 | |
6455 | static int kvm_vm_ioctl_reinject(struct kvm *kvm, |
6456 | struct kvm_reinject_control *control) |
6457 | { |
6458 | struct kvm_pit *pit = kvm->arch.vpit; |
6459 | |
6460 | /* pit->pit_state.lock was overloaded to prevent userspace from getting |
6461 | * an inconsistent state after running multiple KVM_REINJECT_CONTROL |
6462 | * ioctls in parallel. Use a separate lock if that ioctl isn't rare. |
6463 | */ |
6464 | mutex_lock(&pit->pit_state.lock); |
	kvm_pit_set_reinject(pit, control->pit_reinject);
	mutex_unlock(&pit->pit_state.lock);
6467 | |
6468 | return 0; |
6469 | } |
6470 | |
6471 | void kvm_arch_sync_dirty_log(struct kvm *kvm, struct kvm_memory_slot *memslot) |
6472 | { |
6474 | /* |
6475 | * Flush all CPUs' dirty log buffers to the dirty_bitmap. Called |
6476 | * before reporting dirty_bitmap to userspace. KVM flushes the buffers |
6477 | * on all VM-Exits, thus we only need to kick running vCPUs to force a |
6478 | * VM-Exit. |
6479 | */ |
6480 | struct kvm_vcpu *vcpu; |
6481 | unsigned long i; |
6482 | |
6483 | if (!kvm_x86_ops.cpu_dirty_log_size) |
6484 | return; |
6485 | |
6486 | kvm_for_each_vcpu(i, vcpu, kvm) |
6487 | kvm_vcpu_kick(vcpu); |
6488 | } |
6489 | |
6490 | int kvm_vm_ioctl_irq_line(struct kvm *kvm, struct kvm_irq_level *irq_event, |
6491 | bool line_status) |
6492 | { |
6493 | if (!irqchip_in_kernel(kvm)) |
6494 | return -ENXIO; |
6495 | |
6496 | irq_event->status = kvm_set_irq(kvm, KVM_USERSPACE_IRQ_SOURCE_ID, |
					irq_event->irq, irq_event->level,
6498 | line_status); |
6499 | return 0; |
6500 | } |
6501 | |
6502 | int kvm_vm_ioctl_enable_cap(struct kvm *kvm, |
6503 | struct kvm_enable_cap *cap) |
6504 | { |
6505 | int r; |
6506 | |
6507 | if (cap->flags) |
6508 | return -EINVAL; |
6509 | |
6510 | switch (cap->cap) { |
6511 | case KVM_CAP_DISABLE_QUIRKS2: |
6512 | r = -EINVAL; |
6513 | if (cap->args[0] & ~KVM_X86_VALID_QUIRKS) |
6514 | break; |
6515 | fallthrough; |
6516 | case KVM_CAP_DISABLE_QUIRKS: |
6517 | kvm->arch.disabled_quirks = cap->args[0]; |
6518 | r = 0; |
6519 | break; |
6520 | case KVM_CAP_SPLIT_IRQCHIP: { |
6521 | mutex_lock(&kvm->lock); |
6522 | r = -EINVAL; |
6523 | if (cap->args[0] > MAX_NR_RESERVED_IOAPIC_PINS) |
6524 | goto split_irqchip_unlock; |
6525 | r = -EEXIST; |
6526 | if (irqchip_in_kernel(kvm)) |
6527 | goto split_irqchip_unlock; |
6528 | if (kvm->created_vcpus) |
6529 | goto split_irqchip_unlock; |
6530 | r = kvm_setup_empty_irq_routing(kvm); |
6531 | if (r) |
6532 | goto split_irqchip_unlock; |
6533 | /* Pairs with irqchip_in_kernel. */ |
6534 | smp_wmb(); |
6535 | kvm->arch.irqchip_mode = KVM_IRQCHIP_SPLIT; |
6536 | kvm->arch.nr_reserved_ioapic_pins = cap->args[0]; |
		kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
		r = 0;
split_irqchip_unlock:
		mutex_unlock(&kvm->lock);
6541 | break; |
6542 | } |
6543 | case KVM_CAP_X2APIC_API: |
6544 | r = -EINVAL; |
6545 | if (cap->args[0] & ~KVM_X2APIC_API_VALID_FLAGS) |
6546 | break; |
6547 | |
6548 | if (cap->args[0] & KVM_X2APIC_API_USE_32BIT_IDS) |
6549 | kvm->arch.x2apic_format = true; |
6550 | if (cap->args[0] & KVM_X2APIC_API_DISABLE_BROADCAST_QUIRK) |
6551 | kvm->arch.x2apic_broadcast_quirk_disabled = true; |
6552 | |
6553 | r = 0; |
6554 | break; |
6555 | case KVM_CAP_X86_DISABLE_EXITS: |
6556 | r = -EINVAL; |
6557 | if (cap->args[0] & ~KVM_X86_DISABLE_VALID_EXITS) |
6558 | break; |
6559 | |
6560 | if (cap->args[0] & KVM_X86_DISABLE_EXITS_PAUSE) |
6561 | kvm->arch.pause_in_guest = true; |
6562 | |
6563 | #define SMT_RSB_MSG "This processor is affected by the Cross-Thread Return Predictions vulnerability. " \ |
6564 | "KVM_CAP_X86_DISABLE_EXITS should only be used with SMT disabled or trusted guests." |
6565 | |
6566 | if (!mitigate_smt_rsb) { |
6567 | if (boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible() && |
6568 | (cap->args[0] & ~KVM_X86_DISABLE_EXITS_PAUSE)) |
6569 | pr_warn_once(SMT_RSB_MSG); |
6570 | |
6571 | if ((cap->args[0] & KVM_X86_DISABLE_EXITS_MWAIT) && |
6572 | kvm_can_mwait_in_guest()) |
6573 | kvm->arch.mwait_in_guest = true; |
6574 | if (cap->args[0] & KVM_X86_DISABLE_EXITS_HLT) |
6575 | kvm->arch.hlt_in_guest = true; |
6576 | if (cap->args[0] & KVM_X86_DISABLE_EXITS_CSTATE) |
6577 | kvm->arch.cstate_in_guest = true; |
6578 | } |
6579 | |
6580 | r = 0; |
6581 | break; |
6582 | case KVM_CAP_MSR_PLATFORM_INFO: |
6583 | kvm->arch.guest_can_read_msr_platform_info = cap->args[0]; |
6584 | r = 0; |
6585 | break; |
6586 | case KVM_CAP_EXCEPTION_PAYLOAD: |
6587 | kvm->arch.exception_payload_enabled = cap->args[0]; |
6588 | r = 0; |
6589 | break; |
6590 | case KVM_CAP_X86_TRIPLE_FAULT_EVENT: |
6591 | kvm->arch.triple_fault_event = cap->args[0]; |
6592 | r = 0; |
6593 | break; |
6594 | case KVM_CAP_X86_USER_SPACE_MSR: |
6595 | r = -EINVAL; |
6596 | if (cap->args[0] & ~KVM_MSR_EXIT_REASON_VALID_MASK) |
6597 | break; |
6598 | kvm->arch.user_space_msr_mask = cap->args[0]; |
6599 | r = 0; |
6600 | break; |
6601 | case KVM_CAP_X86_BUS_LOCK_EXIT: |
6602 | r = -EINVAL; |
6603 | if (cap->args[0] & ~KVM_BUS_LOCK_DETECTION_VALID_MODE) |
6604 | break; |
6605 | |
6606 | if ((cap->args[0] & KVM_BUS_LOCK_DETECTION_OFF) && |
6607 | (cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT)) |
6608 | break; |
6609 | |
6610 | if (kvm_caps.has_bus_lock_exit && |
6611 | cap->args[0] & KVM_BUS_LOCK_DETECTION_EXIT) |
6612 | kvm->arch.bus_lock_detection_enabled = true; |
6613 | r = 0; |
6614 | break; |
6615 | #ifdef CONFIG_X86_SGX_KVM |
6616 | case KVM_CAP_SGX_ATTRIBUTE: { |
6617 | unsigned long allowed_attributes = 0; |
6618 | |
		r = sgx_set_attribute(&allowed_attributes, cap->args[0]);
6620 | if (r) |
6621 | break; |
6622 | |
6623 | /* KVM only supports the PROVISIONKEY privileged attribute. */ |
6624 | if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) && |
6625 | !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY)) |
6626 | kvm->arch.sgx_provisioning_allowed = true; |
6627 | else |
6628 | r = -EINVAL; |
6629 | break; |
6630 | } |
6631 | #endif |
6632 | case KVM_CAP_VM_COPY_ENC_CONTEXT_FROM: |
6633 | r = -EINVAL; |
6634 | if (!kvm_x86_ops.vm_copy_enc_context_from) |
6635 | break; |
6636 | |
6637 | r = static_call(kvm_x86_vm_copy_enc_context_from)(kvm, cap->args[0]); |
6638 | break; |
6639 | case KVM_CAP_VM_MOVE_ENC_CONTEXT_FROM: |
6640 | r = -EINVAL; |
6641 | if (!kvm_x86_ops.vm_move_enc_context_from) |
6642 | break; |
6643 | |
6644 | r = static_call(kvm_x86_vm_move_enc_context_from)(kvm, cap->args[0]); |
6645 | break; |
6646 | case KVM_CAP_EXIT_HYPERCALL: |
6647 | if (cap->args[0] & ~KVM_EXIT_HYPERCALL_VALID_MASK) { |
6648 | r = -EINVAL; |
6649 | break; |
6650 | } |
6651 | kvm->arch.hypercall_exit_enabled = cap->args[0]; |
6652 | r = 0; |
6653 | break; |
6654 | case KVM_CAP_EXIT_ON_EMULATION_FAILURE: |
6655 | r = -EINVAL; |
6656 | if (cap->args[0] & ~1) |
6657 | break; |
6658 | kvm->arch.exit_on_emulation_error = cap->args[0]; |
6659 | r = 0; |
6660 | break; |
6661 | case KVM_CAP_PMU_CAPABILITY: |
6662 | r = -EINVAL; |
6663 | if (!enable_pmu || (cap->args[0] & ~KVM_CAP_PMU_VALID_MASK)) |
6664 | break; |
6665 | |
6666 | mutex_lock(&kvm->lock); |
6667 | if (!kvm->created_vcpus) { |
6668 | kvm->arch.enable_pmu = !(cap->args[0] & KVM_PMU_CAP_DISABLE); |
6669 | r = 0; |
6670 | } |
		mutex_unlock(&kvm->lock);
6672 | break; |
6673 | case KVM_CAP_MAX_VCPU_ID: |
6674 | r = -EINVAL; |
6675 | if (cap->args[0] > KVM_MAX_VCPU_IDS) |
6676 | break; |
6677 | |
6678 | mutex_lock(&kvm->lock); |
6679 | if (kvm->arch.max_vcpu_ids == cap->args[0]) { |
6680 | r = 0; |
6681 | } else if (!kvm->arch.max_vcpu_ids) { |
6682 | kvm->arch.max_vcpu_ids = cap->args[0]; |
6683 | r = 0; |
6684 | } |
		mutex_unlock(&kvm->lock);
6686 | break; |
6687 | case KVM_CAP_X86_NOTIFY_VMEXIT: |
6688 | r = -EINVAL; |
6689 | if ((u32)cap->args[0] & ~KVM_X86_NOTIFY_VMEXIT_VALID_BITS) |
6690 | break; |
6691 | if (!kvm_caps.has_notify_vmexit) |
6692 | break; |
6693 | if (!((u32)cap->args[0] & KVM_X86_NOTIFY_VMEXIT_ENABLED)) |
6694 | break; |
6695 | mutex_lock(&kvm->lock); |
6696 | if (!kvm->created_vcpus) { |
6697 | kvm->arch.notify_window = cap->args[0] >> 32; |
6698 | kvm->arch.notify_vmexit_flags = (u32)cap->args[0]; |
6699 | r = 0; |
6700 | } |
		mutex_unlock(&kvm->lock);
6702 | break; |
6703 | case KVM_CAP_VM_DISABLE_NX_HUGE_PAGES: |
6704 | r = -EINVAL; |
6705 | |
6706 | /* |
6707 | * Since the risk of disabling NX hugepages is a guest crashing |
6708 | * the system, ensure the userspace process has permission to |
6709 | * reboot the system. |
6710 | * |
6711 | * Note that unlike the reboot() syscall, the process must have |
6712 | * this capability in the root namespace because exposing |
6713 | * /dev/kvm into a container does not limit the scope of the |
6714 | * iTLB multihit bug to that container. In other words, |
6715 | * this must use capable(), not ns_capable(). |
6716 | */ |
6717 | if (!capable(CAP_SYS_BOOT)) { |
6718 | r = -EPERM; |
6719 | break; |
6720 | } |
6721 | |
6722 | if (cap->args[0]) |
6723 | break; |
6724 | |
6725 | mutex_lock(&kvm->lock); |
6726 | if (!kvm->created_vcpus) { |
6727 | kvm->arch.disable_nx_huge_pages = true; |
6728 | r = 0; |
6729 | } |
		mutex_unlock(&kvm->lock);
6731 | break; |
6732 | default: |
6733 | r = -EINVAL; |
6734 | break; |
6735 | } |
6736 | return r; |
6737 | } |
6738 | |
6739 | static struct kvm_x86_msr_filter *kvm_alloc_msr_filter(bool default_allow) |
6740 | { |
6741 | struct kvm_x86_msr_filter *msr_filter; |
6742 | |
	msr_filter = kzalloc(sizeof(*msr_filter), GFP_KERNEL_ACCOUNT);
6744 | if (!msr_filter) |
6745 | return NULL; |
6746 | |
6747 | msr_filter->default_allow = default_allow; |
6748 | return msr_filter; |
6749 | } |
6750 | |
6751 | static void kvm_free_msr_filter(struct kvm_x86_msr_filter *msr_filter) |
6752 | { |
6753 | u32 i; |
6754 | |
6755 | if (!msr_filter) |
6756 | return; |
6757 | |
6758 | for (i = 0; i < msr_filter->count; i++) |
		kfree(msr_filter->ranges[i].bitmap);

	kfree(msr_filter);
6762 | } |
6763 | |
6764 | static int kvm_add_msr_filter(struct kvm_x86_msr_filter *msr_filter, |
6765 | struct kvm_msr_filter_range *user_range) |
6766 | { |
6767 | unsigned long *bitmap; |
6768 | size_t bitmap_size; |
6769 | |
6770 | if (!user_range->nmsrs) |
6771 | return 0; |
6772 | |
6773 | if (user_range->flags & ~KVM_MSR_FILTER_RANGE_VALID_MASK) |
6774 | return -EINVAL; |
6775 | |
6776 | if (!user_range->flags) |
6777 | return -EINVAL; |
6778 | |
6779 | bitmap_size = BITS_TO_LONGS(user_range->nmsrs) * sizeof(long); |
6780 | if (!bitmap_size || bitmap_size > KVM_MSR_FILTER_MAX_BITMAP_SIZE) |
6781 | return -EINVAL; |
6782 | |
6783 | bitmap = memdup_user((__user u8*)user_range->bitmap, bitmap_size); |
	if (IS_ERR(bitmap))
		return PTR_ERR(bitmap);
6786 | |
6787 | msr_filter->ranges[msr_filter->count] = (struct msr_bitmap_range) { |
6788 | .flags = user_range->flags, |
6789 | .base = user_range->base, |
6790 | .nmsrs = user_range->nmsrs, |
6791 | .bitmap = bitmap, |
6792 | }; |
6793 | |
6794 | msr_filter->count++; |
6795 | return 0; |
6796 | } |
6797 | |
6798 | static int kvm_vm_ioctl_set_msr_filter(struct kvm *kvm, |
6799 | struct kvm_msr_filter *filter) |
6800 | { |
6801 | struct kvm_x86_msr_filter *new_filter, *old_filter; |
6802 | bool default_allow; |
6803 | bool empty = true; |
6804 | int r; |
6805 | u32 i; |
6806 | |
6807 | if (filter->flags & ~KVM_MSR_FILTER_VALID_MASK) |
6808 | return -EINVAL; |
6809 | |
6810 | for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) |
6811 | empty &= !filter->ranges[i].nmsrs; |
6812 | |
6813 | default_allow = !(filter->flags & KVM_MSR_FILTER_DEFAULT_DENY); |
6814 | if (empty && !default_allow) |
6815 | return -EINVAL; |
6816 | |
6817 | new_filter = kvm_alloc_msr_filter(default_allow); |
6818 | if (!new_filter) |
6819 | return -ENOMEM; |
6820 | |
6821 | for (i = 0; i < ARRAY_SIZE(filter->ranges); i++) { |
		r = kvm_add_msr_filter(new_filter, &filter->ranges[i]);
		if (r) {
			kvm_free_msr_filter(new_filter);
6825 | return r; |
6826 | } |
6827 | } |
6828 | |
6829 | mutex_lock(&kvm->lock); |
6830 | old_filter = rcu_replace_pointer(kvm->arch.msr_filter, new_filter, |
6831 | mutex_is_locked(&kvm->lock)); |
	mutex_unlock(&kvm->lock);
	synchronize_srcu(&kvm->srcu);

	kvm_free_msr_filter(old_filter);
6836 | |
6837 | kvm_make_all_cpus_request(kvm, KVM_REQ_MSR_FILTER_CHANGED); |
6838 | |
6839 | return 0; |
6840 | } |
6841 | |
6842 | #ifdef CONFIG_KVM_COMPAT |
6843 | /* for KVM_X86_SET_MSR_FILTER */ |
6844 | struct kvm_msr_filter_range_compat { |
6845 | __u32 flags; |
6846 | __u32 nmsrs; |
6847 | __u32 base; |
6848 | __u32 bitmap; |
6849 | }; |
6850 | |
6851 | struct kvm_msr_filter_compat { |
6852 | __u32 flags; |
6853 | struct kvm_msr_filter_range_compat ranges[KVM_MSR_FILTER_MAX_RANGES]; |
6854 | }; |
6855 | |
6856 | #define KVM_X86_SET_MSR_FILTER_COMPAT _IOW(KVMIO, 0xc6, struct kvm_msr_filter_compat) |
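
/*
 * The compat structs above differ from the native ones only in that
 * ->bitmap is a 32-bit value instead of a 64-bit userspace pointer, which
 * changes the ioctl size for 32-bit binaries on a 64-bit kernel; the
 * handler below widens each range back into a native struct kvm_msr_filter.
 */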
6857 | |
6858 | long kvm_arch_vm_compat_ioctl(struct file *filp, unsigned int ioctl, |
6859 | unsigned long arg) |
6860 | { |
6861 | void __user *argp = (void __user *)arg; |
6862 | struct kvm *kvm = filp->private_data; |
6863 | long r = -ENOTTY; |
6864 | |
6865 | switch (ioctl) { |
6866 | case KVM_X86_SET_MSR_FILTER_COMPAT: { |
6867 | struct kvm_msr_filter __user *user_msr_filter = argp; |
6868 | struct kvm_msr_filter_compat filter_compat; |
6869 | struct kvm_msr_filter filter; |
6870 | int i; |
6871 | |
		if (copy_from_user(&filter_compat, user_msr_filter,
				   sizeof(filter_compat)))
6874 | return -EFAULT; |
6875 | |
6876 | filter.flags = filter_compat.flags; |
6877 | for (i = 0; i < ARRAY_SIZE(filter.ranges); i++) { |
6878 | struct kvm_msr_filter_range_compat *cr; |
6879 | |
6880 | cr = &filter_compat.ranges[i]; |
6881 | filter.ranges[i] = (struct kvm_msr_filter_range) { |
6882 | .flags = cr->flags, |
6883 | .nmsrs = cr->nmsrs, |
6884 | .base = cr->base, |
6885 | .bitmap = (__u8 *)(ulong)cr->bitmap, |
6886 | }; |
6887 | } |
6888 | |
		r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
6890 | break; |
6891 | } |
6892 | } |
6893 | |
6894 | return r; |
6895 | } |
6896 | #endif |
6897 | |
6898 | #ifdef CONFIG_HAVE_KVM_PM_NOTIFIER |
6899 | static int kvm_arch_suspend_notifier(struct kvm *kvm) |
6900 | { |
6901 | struct kvm_vcpu *vcpu; |
6902 | unsigned long i; |
6903 | int ret = 0; |
6904 | |
6905 | mutex_lock(&kvm->lock); |
6906 | kvm_for_each_vcpu(i, vcpu, kvm) { |
6907 | if (!vcpu->arch.pv_time.active) |
6908 | continue; |
6909 | |
6910 | ret = kvm_set_guest_paused(vcpu); |
6911 | if (ret) { |
6912 | kvm_err("Failed to pause guest VCPU%d: %d\n" , |
6913 | vcpu->vcpu_id, ret); |
6914 | break; |
6915 | } |
6916 | } |
	mutex_unlock(&kvm->lock);
6918 | |
6919 | return ret ? NOTIFY_BAD : NOTIFY_DONE; |
6920 | } |
6921 | |
6922 | int kvm_arch_pm_notifier(struct kvm *kvm, unsigned long state) |
6923 | { |
6924 | switch (state) { |
6925 | case PM_HIBERNATION_PREPARE: |
6926 | case PM_SUSPEND_PREPARE: |
6927 | return kvm_arch_suspend_notifier(kvm); |
6928 | } |
6929 | |
6930 | return NOTIFY_DONE; |
6931 | } |
6932 | #endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */ |
6933 | |
6934 | static int kvm_vm_ioctl_get_clock(struct kvm *kvm, void __user *argp) |
6935 | { |
6936 | struct kvm_clock_data data = { 0 }; |
6937 | |
	get_kvmclock(kvm, &data);
	if (copy_to_user(argp, &data, sizeof(data)))
6940 | return -EFAULT; |
6941 | |
6942 | return 0; |
6943 | } |
6944 | |
6945 | static int kvm_vm_ioctl_set_clock(struct kvm *kvm, void __user *argp) |
6946 | { |
6947 | struct kvm_arch *ka = &kvm->arch; |
6948 | struct kvm_clock_data data; |
6949 | u64 now_raw_ns; |
6950 | |
	if (copy_from_user(&data, argp, sizeof(data)))
6952 | return -EFAULT; |
6953 | |
6954 | /* |
6955 | * Only KVM_CLOCK_REALTIME is used, but allow passing the |
6956 | * result of KVM_GET_CLOCK back to KVM_SET_CLOCK. |
6957 | */ |
6958 | if (data.flags & ~KVM_CLOCK_VALID_FLAGS) |
6959 | return -EINVAL; |
6960 | |
6961 | kvm_hv_request_tsc_page_update(kvm); |
6962 | kvm_start_pvclock_update(kvm); |
6963 | pvclock_update_vm_gtod_copy(kvm); |
6964 | |
6965 | /* |
6966 | * This pairs with kvm_guest_time_update(): when masterclock is |
6967 | * in use, we use master_kernel_ns + kvmclock_offset to set |
6968 | * unsigned 'system_time' so if we use get_kvmclock_ns() (which |
6969 | * is slightly ahead) here we risk going negative on unsigned |
6970 | * 'system_time' when 'data.clock' is very small. |
6971 | */ |
6972 | if (data.flags & KVM_CLOCK_REALTIME) { |
6973 | u64 now_real_ns = ktime_get_real_ns(); |
6974 | |
6975 | /* |
6976 | * Avoid stepping the kvmclock backwards. |
6977 | */ |
6978 | if (now_real_ns > data.realtime) |
6979 | data.clock += now_real_ns - data.realtime; |
6980 | } |
6981 | |
6982 | if (ka->use_master_clock) |
6983 | now_raw_ns = ka->master_kernel_ns; |
6984 | else |
6985 | now_raw_ns = get_kvmclock_base_ns(); |
6986 | ka->kvmclock_offset = data.clock - now_raw_ns; |
6987 | kvm_end_pvclock_update(kvm); |
6988 | return 0; |
6989 | } |
6990 | |
6991 | int kvm_arch_vm_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) |
6992 | { |
6993 | struct kvm *kvm = filp->private_data; |
6994 | void __user *argp = (void __user *)arg; |
6995 | int r = -ENOTTY; |
6996 | /* |
6997 | * This union makes it completely explicit to gcc-3.x |
6998 | * that these two variables' stack usage should be |
6999 | * combined, not added together. |
7000 | */ |
7001 | union { |
7002 | struct kvm_pit_state ps; |
7003 | struct kvm_pit_state2 ps2; |
7004 | struct kvm_pit_config pit_config; |
7005 | } u; |
7006 | |
7007 | switch (ioctl) { |
7008 | case KVM_SET_TSS_ADDR: |
		r = kvm_vm_ioctl_set_tss_addr(kvm, arg);
7010 | break; |
7011 | case KVM_SET_IDENTITY_MAP_ADDR: { |
7012 | u64 ident_addr; |
7013 | |
7014 | mutex_lock(&kvm->lock); |
7015 | r = -EINVAL; |
7016 | if (kvm->created_vcpus) |
7017 | goto set_identity_unlock; |
7018 | r = -EFAULT; |
		if (copy_from_user(&ident_addr, argp, sizeof(ident_addr)))
7020 | goto set_identity_unlock; |
7021 | r = kvm_vm_ioctl_set_identity_map_addr(kvm, ident_addr); |
7022 | set_identity_unlock: |
		mutex_unlock(&kvm->lock);
7024 | break; |
7025 | } |
7026 | case KVM_SET_NR_MMU_PAGES: |
		r = kvm_vm_ioctl_set_nr_mmu_pages(kvm, arg);
7028 | break; |
7029 | case KVM_CREATE_IRQCHIP: { |
7030 | mutex_lock(&kvm->lock); |
7031 | |
7032 | r = -EEXIST; |
7033 | if (irqchip_in_kernel(kvm)) |
7034 | goto create_irqchip_unlock; |
7035 | |
7036 | r = -EINVAL; |
7037 | if (kvm->created_vcpus) |
7038 | goto create_irqchip_unlock; |
7039 | |
7040 | r = kvm_pic_init(kvm); |
7041 | if (r) |
7042 | goto create_irqchip_unlock; |
7043 | |
7044 | r = kvm_ioapic_init(kvm); |
7045 | if (r) { |
7046 | kvm_pic_destroy(kvm); |
7047 | goto create_irqchip_unlock; |
7048 | } |
7049 | |
7050 | r = kvm_setup_default_irq_routing(kvm); |
7051 | if (r) { |
7052 | kvm_ioapic_destroy(kvm); |
7053 | kvm_pic_destroy(kvm); |
7054 | goto create_irqchip_unlock; |
7055 | } |
7056 | /* Write kvm->irq_routing before enabling irqchip_in_kernel. */ |
7057 | smp_wmb(); |
7058 | kvm->arch.irqchip_mode = KVM_IRQCHIP_KERNEL; |
		kvm_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_ABSENT);
	create_irqchip_unlock:
		mutex_unlock(&kvm->lock);
7062 | break; |
7063 | } |
7064 | case KVM_CREATE_PIT: |
7065 | u.pit_config.flags = KVM_PIT_SPEAKER_DUMMY; |
7066 | goto create_pit; |
7067 | case KVM_CREATE_PIT2: |
7068 | r = -EFAULT; |
		if (copy_from_user(&u.pit_config, argp,
				   sizeof(struct kvm_pit_config)))
7071 | goto out; |
7072 | create_pit: |
7073 | mutex_lock(&kvm->lock); |
7074 | r = -EEXIST; |
7075 | if (kvm->arch.vpit) |
7076 | goto create_pit_unlock; |
7077 | r = -ENOENT; |
7078 | if (!pic_in_kernel(kvm)) |
7079 | goto create_pit_unlock; |
7080 | r = -ENOMEM; |
		kvm->arch.vpit = kvm_create_pit(kvm, u.pit_config.flags);
7082 | if (kvm->arch.vpit) |
7083 | r = 0; |
7084 | create_pit_unlock: |
		mutex_unlock(&kvm->lock);
7086 | break; |
7087 | case KVM_GET_IRQCHIP: { |
7088 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ |
7089 | struct kvm_irqchip *chip; |
7090 | |
7091 | chip = memdup_user(argp, sizeof(*chip)); |
		if (IS_ERR(chip)) {
			r = PTR_ERR(chip);
7094 | goto out; |
7095 | } |
7096 | |
7097 | r = -ENXIO; |
7098 | if (!irqchip_kernel(kvm)) |
7099 | goto get_irqchip_out; |
7100 | r = kvm_vm_ioctl_get_irqchip(kvm, chip); |
7101 | if (r) |
7102 | goto get_irqchip_out; |
7103 | r = -EFAULT; |
		if (copy_to_user(argp, chip, sizeof(*chip)))
7105 | goto get_irqchip_out; |
7106 | r = 0; |
7107 | get_irqchip_out: |
		kfree(chip);
7109 | break; |
7110 | } |
7111 | case KVM_SET_IRQCHIP: { |
7112 | /* 0: PIC master, 1: PIC slave, 2: IOAPIC */ |
7113 | struct kvm_irqchip *chip; |
7114 | |
7115 | chip = memdup_user(argp, sizeof(*chip)); |
		if (IS_ERR(chip)) {
			r = PTR_ERR(chip);
7118 | goto out; |
7119 | } |
7120 | |
7121 | r = -ENXIO; |
7122 | if (!irqchip_kernel(kvm)) |
7123 | goto set_irqchip_out; |
7124 | r = kvm_vm_ioctl_set_irqchip(kvm, chip); |
7125 | set_irqchip_out: |
		kfree(chip);
7127 | break; |
7128 | } |
7129 | case KVM_GET_PIT: { |
7130 | r = -EFAULT; |
		if (copy_from_user(&u.ps, argp, sizeof(struct kvm_pit_state)))
7132 | goto out; |
7133 | r = -ENXIO; |
7134 | if (!kvm->arch.vpit) |
7135 | goto out; |
		r = kvm_vm_ioctl_get_pit(kvm, &u.ps);
7137 | if (r) |
7138 | goto out; |
7139 | r = -EFAULT; |
		if (copy_to_user(argp, &u.ps, sizeof(struct kvm_pit_state)))
7141 | goto out; |
7142 | r = 0; |
7143 | break; |
7144 | } |
7145 | case KVM_SET_PIT: { |
7146 | r = -EFAULT; |
		if (copy_from_user(&u.ps, argp, sizeof(u.ps)))
7148 | goto out; |
7149 | mutex_lock(&kvm->lock); |
7150 | r = -ENXIO; |
7151 | if (!kvm->arch.vpit) |
7152 | goto set_pit_out; |
		r = kvm_vm_ioctl_set_pit(kvm, &u.ps);
set_pit_out:
		mutex_unlock(&kvm->lock);
7156 | break; |
7157 | } |
7158 | case KVM_GET_PIT2: { |
7159 | r = -ENXIO; |
7160 | if (!kvm->arch.vpit) |
7161 | goto out; |
		r = kvm_vm_ioctl_get_pit2(kvm, &u.ps2);
7163 | if (r) |
7164 | goto out; |
7165 | r = -EFAULT; |
		if (copy_to_user(argp, &u.ps2, sizeof(u.ps2)))
7167 | goto out; |
7168 | r = 0; |
7169 | break; |
7170 | } |
7171 | case KVM_SET_PIT2: { |
7172 | r = -EFAULT; |
		if (copy_from_user(&u.ps2, argp, sizeof(u.ps2)))
7174 | goto out; |
7175 | mutex_lock(&kvm->lock); |
7176 | r = -ENXIO; |
7177 | if (!kvm->arch.vpit) |
7178 | goto set_pit2_out; |
		r = kvm_vm_ioctl_set_pit2(kvm, &u.ps2);
set_pit2_out:
		mutex_unlock(&kvm->lock);
7182 | break; |
7183 | } |
7184 | case KVM_REINJECT_CONTROL: { |
7185 | struct kvm_reinject_control control; |
7186 | r = -EFAULT; |
		if (copy_from_user(&control, argp, sizeof(control)))
7188 | goto out; |
7189 | r = -ENXIO; |
7190 | if (!kvm->arch.vpit) |
7191 | goto out; |
		r = kvm_vm_ioctl_reinject(kvm, &control);
7193 | break; |
7194 | } |
7195 | case KVM_SET_BOOT_CPU_ID: |
7196 | r = 0; |
7197 | mutex_lock(&kvm->lock); |
7198 | if (kvm->created_vcpus) |
7199 | r = -EBUSY; |
7200 | else |
7201 | kvm->arch.bsp_vcpu_id = arg; |
		mutex_unlock(&kvm->lock);
7203 | break; |
7204 | #ifdef CONFIG_KVM_XEN |
7205 | case KVM_XEN_HVM_CONFIG: { |
7206 | struct kvm_xen_hvm_config xhc; |
7207 | r = -EFAULT; |
		if (copy_from_user(&xhc, argp, sizeof(xhc)))
7209 | goto out; |
		r = kvm_xen_hvm_config(kvm, &xhc);
7211 | break; |
7212 | } |
7213 | case KVM_XEN_HVM_GET_ATTR: { |
7214 | struct kvm_xen_hvm_attr xha; |
7215 | |
7216 | r = -EFAULT; |
		if (copy_from_user(&xha, argp, sizeof(xha)))
			goto out;
		r = kvm_xen_hvm_get_attr(kvm, &xha);
		if (!r && copy_to_user(argp, &xha, sizeof(xha)))
7221 | r = -EFAULT; |
7222 | break; |
7223 | } |
7224 | case KVM_XEN_HVM_SET_ATTR: { |
7225 | struct kvm_xen_hvm_attr xha; |
7226 | |
7227 | r = -EFAULT; |
		if (copy_from_user(&xha, argp, sizeof(xha)))
			goto out;
		r = kvm_xen_hvm_set_attr(kvm, &xha);
7231 | break; |
7232 | } |
7233 | case KVM_XEN_HVM_EVTCHN_SEND: { |
7234 | struct kvm_irq_routing_xen_evtchn uxe; |
7235 | |
7236 | r = -EFAULT; |
		if (copy_from_user(&uxe, argp, sizeof(uxe)))
			goto out;
		r = kvm_xen_hvm_evtchn_send(kvm, &uxe);
7240 | break; |
7241 | } |
7242 | #endif |
7243 | case KVM_SET_CLOCK: |
7244 | r = kvm_vm_ioctl_set_clock(kvm, argp); |
7245 | break; |
7246 | case KVM_GET_CLOCK: |
7247 | r = kvm_vm_ioctl_get_clock(kvm, argp); |
7248 | break; |
7249 | case KVM_SET_TSC_KHZ: { |
7250 | u32 user_tsc_khz; |
7251 | |
7252 | r = -EINVAL; |
7253 | user_tsc_khz = (u32)arg; |
7254 | |
7255 | if (kvm_caps.has_tsc_control && |
7256 | user_tsc_khz >= kvm_caps.max_guest_tsc_khz) |
7257 | goto out; |
7258 | |
7259 | if (user_tsc_khz == 0) |
7260 | user_tsc_khz = tsc_khz; |
7261 | |
7262 | WRITE_ONCE(kvm->arch.default_tsc_khz, user_tsc_khz); |
7263 | r = 0; |
7264 | |
7265 | goto out; |
7266 | } |
7267 | case KVM_GET_TSC_KHZ: { |
7268 | r = READ_ONCE(kvm->arch.default_tsc_khz); |
7269 | goto out; |
7270 | } |
7271 | case KVM_MEMORY_ENCRYPT_OP: { |
7272 | r = -ENOTTY; |
7273 | if (!kvm_x86_ops.mem_enc_ioctl) |
7274 | goto out; |
7275 | |
7276 | r = static_call(kvm_x86_mem_enc_ioctl)(kvm, argp); |
7277 | break; |
7278 | } |
7279 | case KVM_MEMORY_ENCRYPT_REG_REGION: { |
7280 | struct kvm_enc_region region; |
7281 | |
7282 | r = -EFAULT; |
		if (copy_from_user(&region, argp, sizeof(region)))
7284 | goto out; |
7285 | |
7286 | r = -ENOTTY; |
7287 | if (!kvm_x86_ops.mem_enc_register_region) |
7288 | goto out; |
7289 | |
7290 | r = static_call(kvm_x86_mem_enc_register_region)(kvm, ®ion); |
7291 | break; |
7292 | } |
7293 | case KVM_MEMORY_ENCRYPT_UNREG_REGION: { |
7294 | struct kvm_enc_region region; |
7295 | |
7296 | r = -EFAULT; |
		if (copy_from_user(&region, argp, sizeof(region)))
7298 | goto out; |
7299 | |
7300 | r = -ENOTTY; |
7301 | if (!kvm_x86_ops.mem_enc_unregister_region) |
7302 | goto out; |
7303 | |
7304 | r = static_call(kvm_x86_mem_enc_unregister_region)(kvm, ®ion); |
7305 | break; |
7306 | } |
7307 | #ifdef CONFIG_KVM_HYPERV |
7308 | case KVM_HYPERV_EVENTFD: { |
7309 | struct kvm_hyperv_eventfd hvevfd; |
7310 | |
7311 | r = -EFAULT; |
		if (copy_from_user(&hvevfd, argp, sizeof(hvevfd)))
			goto out;
		r = kvm_vm_ioctl_hv_eventfd(kvm, &hvevfd);
7315 | break; |
7316 | } |
7317 | #endif |
7318 | case KVM_SET_PMU_EVENT_FILTER: |
7319 | r = kvm_vm_ioctl_set_pmu_event_filter(kvm, argp); |
7320 | break; |
7321 | case KVM_X86_SET_MSR_FILTER: { |
7322 | struct kvm_msr_filter __user *user_msr_filter = argp; |
7323 | struct kvm_msr_filter filter; |
7324 | |
		if (copy_from_user(&filter, user_msr_filter, sizeof(filter)))
7326 | return -EFAULT; |
7327 | |
		r = kvm_vm_ioctl_set_msr_filter(kvm, &filter);
7329 | break; |
7330 | } |
7331 | default: |
7332 | r = -ENOTTY; |
7333 | } |
7334 | out: |
7335 | return r; |
7336 | } |
7337 | |
7338 | static void kvm_probe_feature_msr(u32 msr_index) |
7339 | { |
7340 | struct kvm_msr_entry msr = { |
7341 | .index = msr_index, |
7342 | }; |
7343 | |
	if (kvm_get_msr_feature(&msr))
7345 | return; |
7346 | |
7347 | msr_based_features[num_msr_based_features++] = msr_index; |
7348 | } |
7349 | |
7350 | static void kvm_probe_msr_to_save(u32 msr_index) |
7351 | { |
7352 | u32 dummy[2]; |
7353 | |
7354 | if (rdmsr_safe(msr_index, &dummy[0], &dummy[1])) |
7355 | return; |
7356 | |
7357 | /* |
7358 | * Even MSRs that are valid in the host may not be exposed to guests in |
7359 | * some cases. |
7360 | */ |
7361 | switch (msr_index) { |
7362 | case MSR_IA32_BNDCFGS: |
7363 | if (!kvm_mpx_supported()) |
7364 | return; |
7365 | break; |
7366 | case MSR_TSC_AUX: |
7367 | if (!kvm_cpu_cap_has(X86_FEATURE_RDTSCP) && |
7368 | !kvm_cpu_cap_has(X86_FEATURE_RDPID)) |
7369 | return; |
7370 | break; |
7371 | case MSR_IA32_UMWAIT_CONTROL: |
7372 | if (!kvm_cpu_cap_has(X86_FEATURE_WAITPKG)) |
7373 | return; |
7374 | break; |
7375 | case MSR_IA32_RTIT_CTL: |
7376 | case MSR_IA32_RTIT_STATUS: |
7377 | if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT)) |
7378 | return; |
7379 | break; |
7380 | case MSR_IA32_RTIT_CR3_MATCH: |
7381 | if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || |
		    !intel_pt_validate_hw_cap(PT_CAP_cr3_filtering))
7383 | return; |
7384 | break; |
7385 | case MSR_IA32_RTIT_OUTPUT_BASE: |
7386 | case MSR_IA32_RTIT_OUTPUT_MASK: |
7387 | if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || |
		    (!intel_pt_validate_hw_cap(PT_CAP_topa_output) &&
		     !intel_pt_validate_hw_cap(PT_CAP_single_range_output)))
7390 | return; |
7391 | break; |
7392 | case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B: |
7393 | if (!kvm_cpu_cap_has(X86_FEATURE_INTEL_PT) || |
7394 | (msr_index - MSR_IA32_RTIT_ADDR0_A >= |
		     intel_pt_validate_hw_cap(PT_CAP_num_address_ranges) * 2))
7396 | return; |
7397 | break; |
7398 | case MSR_ARCH_PERFMON_PERFCTR0 ... MSR_ARCH_PERFMON_PERFCTR_MAX: |
7399 | if (msr_index - MSR_ARCH_PERFMON_PERFCTR0 >= |
7400 | kvm_pmu_cap.num_counters_gp) |
7401 | return; |
7402 | break; |
7403 | case MSR_ARCH_PERFMON_EVENTSEL0 ... MSR_ARCH_PERFMON_EVENTSEL_MAX: |
7404 | if (msr_index - MSR_ARCH_PERFMON_EVENTSEL0 >= |
7405 | kvm_pmu_cap.num_counters_gp) |
7406 | return; |
7407 | break; |
7408 | case MSR_ARCH_PERFMON_FIXED_CTR0 ... MSR_ARCH_PERFMON_FIXED_CTR_MAX: |
7409 | if (msr_index - MSR_ARCH_PERFMON_FIXED_CTR0 >= |
7410 | kvm_pmu_cap.num_counters_fixed) |
7411 | return; |
7412 | break; |
7413 | case MSR_AMD64_PERF_CNTR_GLOBAL_CTL: |
7414 | case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS: |
7415 | case MSR_AMD64_PERF_CNTR_GLOBAL_STATUS_CLR: |
7416 | if (!kvm_cpu_cap_has(X86_FEATURE_PERFMON_V2)) |
7417 | return; |
7418 | break; |
7419 | case MSR_IA32_XFD: |
7420 | case MSR_IA32_XFD_ERR: |
7421 | if (!kvm_cpu_cap_has(X86_FEATURE_XFD)) |
7422 | return; |
7423 | break; |
7424 | case MSR_IA32_TSX_CTRL: |
7425 | if (!(kvm_get_arch_capabilities() & ARCH_CAP_TSX_CTRL_MSR)) |
7426 | return; |
7427 | break; |
7428 | default: |
7429 | break; |
7430 | } |
7431 | |
7432 | msrs_to_save[num_msrs_to_save++] = msr_index; |
7433 | } |
7434 | |
7435 | static void kvm_init_msr_lists(void) |
7436 | { |
7437 | unsigned i; |
7438 | |
7439 | BUILD_BUG_ON_MSG(KVM_PMC_MAX_FIXED != 3, |
7440 | "Please update the fixed PMCs in msrs_to_save_pmu[]" ); |
7441 | |
7442 | num_msrs_to_save = 0; |
7443 | num_emulated_msrs = 0; |
7444 | num_msr_based_features = 0; |
7445 | |
7446 | for (i = 0; i < ARRAY_SIZE(msrs_to_save_base); i++) |
		kvm_probe_msr_to_save(msrs_to_save_base[i]);
7448 | |
7449 | if (enable_pmu) { |
7450 | for (i = 0; i < ARRAY_SIZE(msrs_to_save_pmu); i++) |
			kvm_probe_msr_to_save(msrs_to_save_pmu[i]);
7452 | } |
7453 | |
7454 | for (i = 0; i < ARRAY_SIZE(emulated_msrs_all); i++) { |
7455 | if (!static_call(kvm_x86_has_emulated_msr)(NULL, emulated_msrs_all[i])) |
7456 | continue; |
7457 | |
7458 | emulated_msrs[num_emulated_msrs++] = emulated_msrs_all[i]; |
7459 | } |
7460 | |
7461 | for (i = KVM_FIRST_EMULATED_VMX_MSR; i <= KVM_LAST_EMULATED_VMX_MSR; i++) |
		kvm_probe_feature_msr(i);
7463 | |
7464 | for (i = 0; i < ARRAY_SIZE(msr_based_features_all_except_vmx); i++) |
		kvm_probe_feature_msr(msr_based_features_all_except_vmx[i]);
7466 | } |
7467 | |
7468 | static int vcpu_mmio_write(struct kvm_vcpu *vcpu, gpa_t addr, int len, |
7469 | const void *v) |
7470 | { |
7471 | int handled = 0; |
7472 | int n; |
7473 | |
7474 | do { |
7475 | n = min(len, 8); |
7476 | if (!(lapic_in_kernel(vcpu) && |
		      !kvm_iodevice_write(vcpu, &vcpu->arch.apic->dev, addr, n, v))
		    && kvm_io_bus_write(vcpu, KVM_MMIO_BUS, addr, n, v))
7479 | break; |
7480 | handled += n; |
7481 | addr += n; |
7482 | len -= n; |
7483 | v += n; |
7484 | } while (len); |
7485 | |
7486 | return handled; |
7487 | } |
7488 | |
7489 | static int vcpu_mmio_read(struct kvm_vcpu *vcpu, gpa_t addr, int len, void *v) |
7490 | { |
7491 | int handled = 0; |
7492 | int n; |
7493 | |
7494 | do { |
7495 | n = min(len, 8); |
7496 | if (!(lapic_in_kernel(vcpu) && |
		      !kvm_iodevice_read(vcpu, &vcpu->arch.apic->dev,
					 addr, n, v))
		    && kvm_io_bus_read(vcpu, KVM_MMIO_BUS, addr, n, v))
7500 | break; |
		trace_kvm_mmio(KVM_TRACE_MMIO_READ, n, addr, v);
7502 | handled += n; |
7503 | addr += n; |
7504 | len -= n; |
7505 | v += n; |
7506 | } while (len); |
7507 | |
7508 | return handled; |
7509 | } |
7510 | |
7511 | void kvm_set_segment(struct kvm_vcpu *vcpu, |
7512 | struct kvm_segment *var, int seg) |
7513 | { |
7514 | static_call(kvm_x86_set_segment)(vcpu, var, seg); |
7515 | } |
7516 | |
7517 | void kvm_get_segment(struct kvm_vcpu *vcpu, |
7518 | struct kvm_segment *var, int seg) |
7519 | { |
7520 | static_call(kvm_x86_get_segment)(vcpu, var, seg); |
7521 | } |
7522 | |
7523 | gpa_t translate_nested_gpa(struct kvm_vcpu *vcpu, gpa_t gpa, u64 access, |
7524 | struct x86_exception *exception) |
7525 | { |
7526 | struct kvm_mmu *mmu = vcpu->arch.mmu; |
7527 | gpa_t t_gpa; |
7528 | |
7529 | BUG_ON(!mmu_is_nested(vcpu)); |
7530 | |
7531 | /* NPT walks are always user-walks */ |
7532 | access |= PFERR_USER_MASK; |
7533 | t_gpa = mmu->gva_to_gpa(vcpu, mmu, gpa, access, exception); |
7534 | |
7535 | return t_gpa; |
7536 | } |
7537 | |
7538 | gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, |
7539 | struct x86_exception *exception) |
7540 | { |
7541 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; |
7542 | |
7543 | u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; |
7544 | return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); |
7545 | } |
7546 | EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); |
7547 | |
7548 | gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, |
7549 | struct x86_exception *exception) |
7550 | { |
7551 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; |
7552 | |
7553 | u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; |
7554 | access |= PFERR_WRITE_MASK; |
7555 | return mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); |
7556 | } |
7557 | EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); |
7558 | |
/* Used to access any guest's mapped memory without checking CPL. */
7560 | gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, |
7561 | struct x86_exception *exception) |
7562 | { |
7563 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; |
7564 | |
7565 | return mmu->gva_to_gpa(vcpu, mmu, gva, 0, exception); |
7566 | } |
7567 | |
7568 | static int kvm_read_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, |
7569 | struct kvm_vcpu *vcpu, u64 access, |
7570 | struct x86_exception *exception) |
7571 | { |
7572 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; |
7573 | void *data = val; |
7574 | int r = X86EMUL_CONTINUE; |
7575 | |
7576 | while (bytes) { |
7577 | gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception); |
7578 | unsigned offset = addr & (PAGE_SIZE-1); |
7579 | unsigned toread = min(bytes, (unsigned)PAGE_SIZE - offset); |
7580 | int ret; |
7581 | |
7582 | if (gpa == INVALID_GPA) |
7583 | return X86EMUL_PROPAGATE_FAULT; |
		ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, data,
					       offset, toread);
7586 | if (ret < 0) { |
7587 | r = X86EMUL_IO_NEEDED; |
7588 | goto out; |
7589 | } |
7590 | |
7591 | bytes -= toread; |
7592 | data += toread; |
7593 | addr += toread; |
7594 | } |
7595 | out: |
7596 | return r; |
7597 | } |
7598 | |
7599 | /* used for instruction fetching */ |
7600 | static int kvm_fetch_guest_virt(struct x86_emulate_ctxt *ctxt, |
7601 | gva_t addr, void *val, unsigned int bytes, |
7602 | struct x86_exception *exception) |
7603 | { |
7604 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
7605 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; |
7606 | u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; |
7607 | unsigned offset; |
7608 | int ret; |
7609 | |
7610 | /* Inline kvm_read_guest_virt_helper for speed. */ |
7611 | gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access|PFERR_FETCH_MASK, |
7612 | exception); |
7613 | if (unlikely(gpa == INVALID_GPA)) |
7614 | return X86EMUL_PROPAGATE_FAULT; |
7615 | |
7616 | offset = addr & (PAGE_SIZE-1); |
7617 | if (WARN_ON(offset + bytes > PAGE_SIZE)) |
7618 | bytes = (unsigned)PAGE_SIZE - offset; |
	ret = kvm_vcpu_read_guest_page(vcpu, gpa >> PAGE_SHIFT, val,
				       offset, bytes);
7621 | if (unlikely(ret < 0)) |
7622 | return X86EMUL_IO_NEEDED; |
7623 | |
7624 | return X86EMUL_CONTINUE; |
7625 | } |
7626 | |
7627 | int kvm_read_guest_virt(struct kvm_vcpu *vcpu, |
7628 | gva_t addr, void *val, unsigned int bytes, |
7629 | struct x86_exception *exception) |
7630 | { |
7631 | u64 access = (static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0; |
7632 | |
7633 | /* |
7634 | * FIXME: this should call handle_emulation_failure if X86EMUL_IO_NEEDED |
7635 | * is returned, but our callers are not ready for that and they blindly |
7636 | * call kvm_inject_page_fault. Ensure that they at least do not leak |
7637 | * uninitialized kernel stack memory into cr2 and error code. |
7638 | */ |
7639 | memset(exception, 0, sizeof(*exception)); |
7640 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, |
7641 | exception); |
7642 | } |
7643 | EXPORT_SYMBOL_GPL(kvm_read_guest_virt); |
7644 | |
7645 | static int emulator_read_std(struct x86_emulate_ctxt *ctxt, |
7646 | gva_t addr, void *val, unsigned int bytes, |
7647 | struct x86_exception *exception, bool system) |
7648 | { |
7649 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
7650 | u64 access = 0; |
7651 | |
7652 | if (system) |
7653 | access |= PFERR_IMPLICIT_ACCESS; |
7654 | else if (static_call(kvm_x86_get_cpl)(vcpu) == 3) |
7655 | access |= PFERR_USER_MASK; |
7656 | |
7657 | return kvm_read_guest_virt_helper(addr, val, bytes, vcpu, access, exception); |
7658 | } |
7659 | |
7660 | static int kvm_write_guest_virt_helper(gva_t addr, void *val, unsigned int bytes, |
7661 | struct kvm_vcpu *vcpu, u64 access, |
7662 | struct x86_exception *exception) |
7663 | { |
7664 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; |
7665 | void *data = val; |
7666 | int r = X86EMUL_CONTINUE; |
7667 | |
7668 | while (bytes) { |
7669 | gpa_t gpa = mmu->gva_to_gpa(vcpu, mmu, addr, access, exception); |
7670 | unsigned offset = addr & (PAGE_SIZE-1); |
7671 | unsigned towrite = min(bytes, (unsigned)PAGE_SIZE - offset); |
7672 | int ret; |
7673 | |
7674 | if (gpa == INVALID_GPA) |
7675 | return X86EMUL_PROPAGATE_FAULT; |
		ret = kvm_vcpu_write_guest(vcpu, gpa, data, towrite);
7677 | if (ret < 0) { |
7678 | r = X86EMUL_IO_NEEDED; |
7679 | goto out; |
7680 | } |
7681 | |
7682 | bytes -= towrite; |
7683 | data += towrite; |
7684 | addr += towrite; |
7685 | } |
7686 | out: |
7687 | return r; |
7688 | } |
7689 | |
7690 | static int emulator_write_std(struct x86_emulate_ctxt *ctxt, gva_t addr, void *val, |
7691 | unsigned int bytes, struct x86_exception *exception, |
7692 | bool system) |
7693 | { |
7694 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
7695 | u64 access = PFERR_WRITE_MASK; |
7696 | |
7697 | if (system) |
7698 | access |= PFERR_IMPLICIT_ACCESS; |
7699 | else if (static_call(kvm_x86_get_cpl)(vcpu) == 3) |
7700 | access |= PFERR_USER_MASK; |
7701 | |
7702 | return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, |
7703 | access, exception); |
7704 | } |
7705 | |
7706 | int kvm_write_guest_virt_system(struct kvm_vcpu *vcpu, gva_t addr, void *val, |
7707 | unsigned int bytes, struct x86_exception *exception) |
7708 | { |
7709 | /* kvm_write_guest_virt_system can pull in tons of pages. */ |
7710 | vcpu->arch.l1tf_flush_l1d = true; |
7711 | |
7712 | return kvm_write_guest_virt_helper(addr, val, bytes, vcpu, |
7713 | PFERR_WRITE_MASK, exception); |
7714 | } |
7715 | EXPORT_SYMBOL_GPL(kvm_write_guest_virt_system); |
7716 | |
7717 | static int kvm_check_emulate_insn(struct kvm_vcpu *vcpu, int emul_type, |
7718 | void *insn, int insn_len) |
7719 | { |
7720 | return static_call(kvm_x86_check_emulate_instruction)(vcpu, emul_type, |
7721 | insn, insn_len); |
7722 | } |
7723 | |
7724 | int handle_ud(struct kvm_vcpu *vcpu) |
7725 | { |
7726 | static const char kvm_emulate_prefix[] = { __KVM_EMULATE_PREFIX }; |
7727 | int fep_flags = READ_ONCE(force_emulation_prefix); |
7728 | int emul_type = EMULTYPE_TRAP_UD; |
7729 | char sig[5]; /* ud2; .ascii "kvm" */ |
7730 | struct x86_exception e; |
7731 | int r; |
7732 | |
	r = kvm_check_emulate_insn(vcpu, emul_type, NULL, 0);
7734 | if (r != X86EMUL_CONTINUE) |
7735 | return 1; |
7736 | |
7737 | if (fep_flags && |
7738 | kvm_read_guest_virt(vcpu, kvm_get_linear_rip(vcpu), |
7739 | sig, sizeof(sig), &e) == 0 && |
	    memcmp(sig, kvm_emulate_prefix, sizeof(sig)) == 0) {
		if (fep_flags & KVM_FEP_CLEAR_RFLAGS_RF)
			kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) & ~X86_EFLAGS_RF);
		kvm_rip_write(vcpu, kvm_rip_read(vcpu) + sizeof(sig));
7744 | emul_type = EMULTYPE_TRAP_UD_FORCED; |
7745 | } |
7746 | |
	return kvm_emulate_instruction(vcpu, emul_type);
7748 | } |
7749 | EXPORT_SYMBOL_GPL(handle_ud); |
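
/*
 * The signature checked above is the five-byte forced-emulation prefix a
 * guest (with force_emulation_prefix enabled) places immediately before the
 * instruction it wants KVM to emulate:
 *
 *	ud2			# 0x0f 0x0b
 *	.ascii "kvm"		# 0x6b 0x76 0x6d
 *	<instruction to emulate>
 *
 * kvm_rip_write() above skips the signature bytes so that emulation starts
 * at the instruction that follows.
 */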
7750 | |
7751 | static int vcpu_is_mmio_gpa(struct kvm_vcpu *vcpu, unsigned long gva, |
7752 | gpa_t gpa, bool write) |
7753 | { |
7754 | /* For APIC access vmexit */ |
7755 | if ((gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) |
7756 | return 1; |
7757 | |
7758 | if (vcpu_match_mmio_gpa(vcpu, gpa)) { |
		trace_vcpu_match_mmio(gva, gpa, write, true);
7760 | return 1; |
7761 | } |
7762 | |
7763 | return 0; |
7764 | } |
7765 | |
7766 | static int vcpu_mmio_gva_to_gpa(struct kvm_vcpu *vcpu, unsigned long gva, |
7767 | gpa_t *gpa, struct x86_exception *exception, |
7768 | bool write) |
7769 | { |
7770 | struct kvm_mmu *mmu = vcpu->arch.walk_mmu; |
7771 | u64 access = ((static_call(kvm_x86_get_cpl)(vcpu) == 3) ? PFERR_USER_MASK : 0) |
7772 | | (write ? PFERR_WRITE_MASK : 0); |
7773 | |
7774 | /* |
7775 | * currently PKRU is only applied to ept enabled guest so |
7776 | * there is no pkey in EPT page table for L1 guest or EPT |
7777 | * shadow page table for L2 guest. |
7778 | */ |
7779 | if (vcpu_match_mmio_gva(vcpu, gva) && (!is_paging(vcpu) || |
7780 | !permission_fault(vcpu, mmu: vcpu->arch.walk_mmu, |
7781 | pte_access: vcpu->arch.mmio_access, pte_pkey: 0, access))) { |
7782 | *gpa = vcpu->arch.mmio_gfn << PAGE_SHIFT | |
7783 | (gva & (PAGE_SIZE - 1)); |
		trace_vcpu_match_mmio(gva, *gpa, write, false);
7785 | return 1; |
7786 | } |
7787 | |
7788 | *gpa = mmu->gva_to_gpa(vcpu, mmu, gva, access, exception); |
7789 | |
7790 | if (*gpa == INVALID_GPA) |
7791 | return -1; |
7792 | |
7793 | return vcpu_is_mmio_gpa(vcpu, gva, gpa: *gpa, write); |
7794 | } |
7795 | |
7796 | int emulator_write_phys(struct kvm_vcpu *vcpu, gpa_t gpa, |
7797 | const void *val, int bytes) |
7798 | { |
7799 | int ret; |
7800 | |
	ret = kvm_vcpu_write_guest(vcpu, gpa, val, bytes);
7802 | if (ret < 0) |
7803 | return 0; |
	kvm_page_track_write(vcpu, gpa, val, bytes);
7805 | return 1; |
7806 | } |
7807 | |
7808 | struct read_write_emulator_ops { |
7809 | int (*read_write_prepare)(struct kvm_vcpu *vcpu, void *val, |
7810 | int bytes); |
7811 | int (*read_write_emulate)(struct kvm_vcpu *vcpu, gpa_t gpa, |
7812 | void *val, int bytes); |
7813 | int (*read_write_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, |
7814 | int bytes, void *val); |
7815 | int (*read_write_exit_mmio)(struct kvm_vcpu *vcpu, gpa_t gpa, |
7816 | void *val, int bytes); |
7817 | bool write; |
7818 | }; |
7819 | |
7820 | static int read_prepare(struct kvm_vcpu *vcpu, void *val, int bytes) |
7821 | { |
7822 | if (vcpu->mmio_read_completed) { |
		trace_kvm_mmio(KVM_TRACE_MMIO_READ, bytes,
			       vcpu->mmio_fragments[0].gpa, val);
7825 | vcpu->mmio_read_completed = 0; |
7826 | return 1; |
7827 | } |
7828 | |
7829 | return 0; |
7830 | } |
7831 | |
7832 | static int read_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, |
7833 | void *val, int bytes) |
7834 | { |
	return !kvm_vcpu_read_guest(vcpu, gpa, val, bytes);
7836 | } |
7837 | |
7838 | static int write_emulate(struct kvm_vcpu *vcpu, gpa_t gpa, |
7839 | void *val, int bytes) |
7840 | { |
7841 | return emulator_write_phys(vcpu, gpa, val, bytes); |
7842 | } |
7843 | |
7844 | static int write_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, int bytes, void *val) |
7845 | { |
	trace_kvm_mmio(KVM_TRACE_MMIO_WRITE, bytes, gpa, val);
	return vcpu_mmio_write(vcpu, gpa, bytes, val);
7848 | } |
7849 | |
7850 | static int read_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, |
7851 | void *val, int bytes) |
7852 | { |
	trace_kvm_mmio(KVM_TRACE_MMIO_READ_UNSATISFIED, bytes, gpa, NULL);
7854 | return X86EMUL_IO_NEEDED; |
7855 | } |
7856 | |
7857 | static int write_exit_mmio(struct kvm_vcpu *vcpu, gpa_t gpa, |
7858 | void *val, int bytes) |
7859 | { |
7860 | struct kvm_mmio_fragment *frag = &vcpu->mmio_fragments[0]; |
7861 | |
7862 | memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len)); |
7863 | return X86EMUL_CONTINUE; |
7864 | } |
7865 | |
7866 | static const struct read_write_emulator_ops read_emultor = { |
7867 | .read_write_prepare = read_prepare, |
7868 | .read_write_emulate = read_emulate, |
7869 | .read_write_mmio = vcpu_mmio_read, |
7870 | .read_write_exit_mmio = read_exit_mmio, |
7871 | }; |
7872 | |
7873 | static const struct read_write_emulator_ops write_emultor = { |
7874 | .read_write_emulate = write_emulate, |
7875 | .read_write_mmio = write_mmio, |
7876 | .read_write_exit_mmio = write_exit_mmio, |
7877 | .write = true, |
7878 | }; |
7879 | |
7880 | static int emulator_read_write_onepage(unsigned long addr, void *val, |
7881 | unsigned int bytes, |
7882 | struct x86_exception *exception, |
7883 | struct kvm_vcpu *vcpu, |
7884 | const struct read_write_emulator_ops *ops) |
7885 | { |
7886 | gpa_t gpa; |
7887 | int handled, ret; |
7888 | bool write = ops->write; |
7889 | struct kvm_mmio_fragment *frag; |
7890 | struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; |
7891 | |
7892 | /* |
	 * If the exit was due to an NPF we may already have a GPA.
	 * If the GPA is present, use it to avoid the GVA-to-GPA table walk.
	 * Note, this cannot be used on string operations since a string
	 * operation using REP will only have the initial GPA from when the
	 * NPF occurred.
7898 | */ |
7899 | if (ctxt->gpa_available && emulator_can_use_gpa(ctxt) && |
7900 | (addr & ~PAGE_MASK) == (ctxt->gpa_val & ~PAGE_MASK)) { |
7901 | gpa = ctxt->gpa_val; |
		ret = vcpu_is_mmio_gpa(vcpu, addr, gpa, write);
	} else {
		ret = vcpu_mmio_gva_to_gpa(vcpu, addr, &gpa, exception, write);
7905 | if (ret < 0) |
7906 | return X86EMUL_PROPAGATE_FAULT; |
7907 | } |
7908 | |
7909 | if (!ret && ops->read_write_emulate(vcpu, gpa, val, bytes)) |
7910 | return X86EMUL_CONTINUE; |
7911 | |
7912 | /* |
7913 | * Is this MMIO handled locally? |
7914 | */ |
7915 | handled = ops->read_write_mmio(vcpu, gpa, bytes, val); |
7916 | if (handled == bytes) |
7917 | return X86EMUL_CONTINUE; |
7918 | |
7919 | gpa += handled; |
7920 | bytes -= handled; |
7921 | val += handled; |
7922 | |
7923 | WARN_ON(vcpu->mmio_nr_fragments >= KVM_MAX_MMIO_FRAGMENTS); |
7924 | frag = &vcpu->mmio_fragments[vcpu->mmio_nr_fragments++]; |
7925 | frag->gpa = gpa; |
7926 | frag->data = val; |
7927 | frag->len = bytes; |
7928 | return X86EMUL_CONTINUE; |
7929 | } |
7930 | |
7931 | static int emulator_read_write(struct x86_emulate_ctxt *ctxt, |
7932 | unsigned long addr, |
7933 | void *val, unsigned int bytes, |
7934 | struct x86_exception *exception, |
7935 | const struct read_write_emulator_ops *ops) |
7936 | { |
7937 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
7938 | gpa_t gpa; |
7939 | int rc; |
7940 | |
7941 | if (ops->read_write_prepare && |
7942 | ops->read_write_prepare(vcpu, val, bytes)) |
7943 | return X86EMUL_CONTINUE; |
7944 | |
7945 | vcpu->mmio_nr_fragments = 0; |
7946 | |
7947 | /* Crossing a page boundary? */ |
7948 | if (((addr + bytes - 1) ^ addr) & PAGE_MASK) { |
7949 | int now; |
7950 | |
7951 | now = -addr & ~PAGE_MASK; |
		rc = emulator_read_write_onepage(addr, val, now, exception,
7953 | vcpu, ops); |
7954 | |
7955 | if (rc != X86EMUL_CONTINUE) |
7956 | return rc; |
7957 | addr += now; |
7958 | if (ctxt->mode != X86EMUL_MODE_PROT64) |
7959 | addr = (u32)addr; |
7960 | val += now; |
7961 | bytes -= now; |
7962 | } |
7963 | |
7964 | rc = emulator_read_write_onepage(addr, val, bytes, exception, |
7965 | vcpu, ops); |
7966 | if (rc != X86EMUL_CONTINUE) |
7967 | return rc; |
7968 | |
7969 | if (!vcpu->mmio_nr_fragments) |
7970 | return rc; |
7971 | |
7972 | gpa = vcpu->mmio_fragments[0].gpa; |
7973 | |
7974 | vcpu->mmio_needed = 1; |
7975 | vcpu->mmio_cur_fragment = 0; |
7976 | |
7977 | vcpu->run->mmio.len = min(8u, vcpu->mmio_fragments[0].len); |
7978 | vcpu->run->mmio.is_write = vcpu->mmio_is_write = ops->write; |
7979 | vcpu->run->exit_reason = KVM_EXIT_MMIO; |
7980 | vcpu->run->mmio.phys_addr = gpa; |
7981 | |
7982 | return ops->read_write_exit_mmio(vcpu, gpa, val, bytes); |
7983 | } |
7984 | |
7985 | static int emulator_read_emulated(struct x86_emulate_ctxt *ctxt, |
7986 | unsigned long addr, |
7987 | void *val, |
7988 | unsigned int bytes, |
7989 | struct x86_exception *exception) |
7990 | { |
7991 | return emulator_read_write(ctxt, addr, val, bytes, |
				   exception, &read_emultor);
7993 | } |
7994 | |
7995 | static int emulator_write_emulated(struct x86_emulate_ctxt *ctxt, |
7996 | unsigned long addr, |
7997 | const void *val, |
7998 | unsigned int bytes, |
7999 | struct x86_exception *exception) |
8000 | { |
	return emulator_read_write(ctxt, addr, (void *)val, bytes,
				   exception, &write_emultor);
8003 | } |
8004 | |
8005 | #define emulator_try_cmpxchg_user(t, ptr, old, new) \ |
8006 | (__try_cmpxchg_user((t __user *)(ptr), (t *)(old), *(t *)(new), efault ## t)) |
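
/*
 * For example, emulator_try_cmpxchg_user(u32, hva, old, new) expands to
 *
 *	__try_cmpxchg_user((u32 __user *)(hva), (u32 *)(old), *(u32 *)(new),
 *			   efaultu32)
 *
 * i.e. a width-specific cmpxchg on user memory, with the pasted "efaultu32"
 * token naming the fault-handling label used by the underlying uaccess
 * machinery.
 */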
8007 | |
8008 | static int emulator_cmpxchg_emulated(struct x86_emulate_ctxt *ctxt, |
8009 | unsigned long addr, |
8010 | const void *old, |
8011 | const void *new, |
8012 | unsigned int bytes, |
8013 | struct x86_exception *exception) |
8014 | { |
8015 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
8016 | u64 page_line_mask; |
8017 | unsigned long hva; |
8018 | gpa_t gpa; |
8019 | int r; |
8020 | |
	/* A guest cmpxchg, e.g. cmpxchg8b, has to be emulated atomically. */
8022 | if (bytes > 8 || (bytes & (bytes - 1))) |
8023 | goto emul_write; |
8024 | |
8025 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, addr, NULL); |
8026 | |
8027 | if (gpa == INVALID_GPA || |
8028 | (gpa & PAGE_MASK) == APIC_DEFAULT_PHYS_BASE) |
8029 | goto emul_write; |
8030 | |
8031 | /* |
8032 | * Emulate the atomic as a straight write to avoid #AC if SLD is |
8033 | * enabled in the host and the access splits a cache line. |
8034 | */ |
8035 | if (boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT)) |
8036 | page_line_mask = ~(cache_line_size() - 1); |
8037 | else |
8038 | page_line_mask = PAGE_MASK; |
8039 | |
8040 | if (((gpa + bytes - 1) & page_line_mask) != (gpa & page_line_mask)) |
8041 | goto emul_write; |
8042 | |
	hva = kvm_vcpu_gfn_to_hva(vcpu, gpa_to_gfn(gpa));
	if (kvm_is_error_hva(hva))
8045 | goto emul_write; |
8046 | |
8047 | hva += offset_in_page(gpa); |
8048 | |
8049 | switch (bytes) { |
8050 | case 1: |
8051 | r = emulator_try_cmpxchg_user(u8, hva, old, new); |
8052 | break; |
8053 | case 2: |
8054 | r = emulator_try_cmpxchg_user(u16, hva, old, new); |
8055 | break; |
8056 | case 4: |
8057 | r = emulator_try_cmpxchg_user(u32, hva, old, new); |
8058 | break; |
8059 | case 8: |
8060 | r = emulator_try_cmpxchg_user(u64, hva, old, new); |
8061 | break; |
8062 | default: |
8063 | BUG(); |
8064 | } |
8065 | |
8066 | if (r < 0) |
8067 | return X86EMUL_UNHANDLEABLE; |
8068 | |
8069 | /* |
8070 | * Mark the page dirty _before_ checking whether or not the CMPXCHG was |
8071 | * successful, as the old value is written back on failure. Note, for |
8072 | * live migration, this is unnecessarily conservative as CMPXCHG writes |
8073 | * back the original value and the access is atomic, but KVM's ABI is |
8074 | * that all writes are dirty logged, regardless of the value written. |
8075 | */ |
	kvm_vcpu_mark_page_dirty(vcpu, gpa_to_gfn(gpa));
8077 | |
8078 | if (r) |
8079 | return X86EMUL_CMPXCHG_FAILED; |
8080 | |
8081 | kvm_page_track_write(vcpu, gpa, new, bytes); |
8082 | |
8083 | return X86EMUL_CONTINUE; |
8084 | |
8085 | emul_write: |
8086 | pr_warn_once("emulating exchange as write\n" ); |
8087 | |
8088 | return emulator_write_emulated(ctxt, addr, val: new, bytes, exception); |
8089 | } |
8090 | |
8091 | static int emulator_pio_in_out(struct kvm_vcpu *vcpu, int size, |
8092 | unsigned short port, void *data, |
8093 | unsigned int count, bool in) |
8094 | { |
8095 | unsigned i; |
8096 | int r; |
8097 | |
8098 | WARN_ON_ONCE(vcpu->arch.pio.count); |
8099 | for (i = 0; i < count; i++) { |
8100 | if (in) |
			r = kvm_io_bus_read(vcpu, KVM_PIO_BUS, port, size, data);
		else
			r = kvm_io_bus_write(vcpu, KVM_PIO_BUS, port, size, data);
8104 | |
8105 | if (r) { |
8106 | if (i == 0) |
8107 | goto userspace_io; |
8108 | |
8109 | /* |
8110 | * Userspace must have unregistered the device while PIO |
8111 | * was running. Drop writes / read as 0. |
8112 | */ |
8113 | if (in) |
8114 | memset(data, 0, size * (count - i)); |
8115 | break; |
8116 | } |
8117 | |
8118 | data += size; |
8119 | } |
8120 | return 1; |
8121 | |
8122 | userspace_io: |
8123 | vcpu->arch.pio.port = port; |
8124 | vcpu->arch.pio.in = in; |
8125 | vcpu->arch.pio.count = count; |
8126 | vcpu->arch.pio.size = size; |
8127 | |
8128 | if (in) |
8129 | memset(vcpu->arch.pio_data, 0, size * count); |
8130 | else |
8131 | memcpy(vcpu->arch.pio_data, data, size * count); |
8132 | |
8133 | vcpu->run->exit_reason = KVM_EXIT_IO; |
8134 | vcpu->run->io.direction = in ? KVM_EXIT_IO_IN : KVM_EXIT_IO_OUT; |
8135 | vcpu->run->io.size = size; |
8136 | vcpu->run->io.data_offset = KVM_PIO_PAGE_OFFSET * PAGE_SIZE; |
8137 | vcpu->run->io.count = count; |
8138 | vcpu->run->io.port = port; |
8139 | return 0; |
8140 | } |
8141 | |
8142 | static int emulator_pio_in(struct kvm_vcpu *vcpu, int size, |
8143 | unsigned short port, void *val, unsigned int count) |
8144 | { |
	int r = emulator_pio_in_out(vcpu, size, port, val, count, true);

	if (r)
		trace_kvm_pio(KVM_PIO_IN, port, size, count, val);
8148 | |
8149 | return r; |
8150 | } |
8151 | |
8152 | static void complete_emulator_pio_in(struct kvm_vcpu *vcpu, void *val) |
8153 | { |
8154 | int size = vcpu->arch.pio.size; |
	unsigned int count = vcpu->arch.pio.count;

	memcpy(val, vcpu->arch.pio_data, size * count);
	trace_kvm_pio(KVM_PIO_IN, vcpu->arch.pio.port, size, count,
		      vcpu->arch.pio_data);
8158 | vcpu->arch.pio.count = 0; |
8159 | } |
8160 | |
8161 | static int emulator_pio_in_emulated(struct x86_emulate_ctxt *ctxt, |
8162 | int size, unsigned short port, void *val, |
8163 | unsigned int count) |
8164 | { |
	struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt);

	if (vcpu->arch.pio.count) {
8167 | /* |
8168 | * Complete a previous iteration that required userspace I/O. |
8169 | * Note, @count isn't guaranteed to match pio.count as userspace |
8170 | * can modify ECX before rerunning the vCPU. Ignore any such |
8171 | * shenanigans as KVM doesn't support modifying the rep count, |
8172 | * and the emulator ensures @count doesn't overflow the buffer. |
8173 | */ |
8174 | complete_emulator_pio_in(vcpu, val); |
8175 | return 1; |
8176 | } |
8177 | |
8178 | return emulator_pio_in(vcpu, size, port, val, count); |
8179 | } |
8180 | |
8181 | static int emulator_pio_out(struct kvm_vcpu *vcpu, int size, |
8182 | unsigned short port, const void *val, |
8183 | unsigned int count) |
8184 | { |
	trace_kvm_pio(KVM_PIO_OUT, port, size, count, val);
	return emulator_pio_in_out(vcpu, size, port, (void *)val, count, false);
8187 | } |
8188 | |
8189 | static int emulator_pio_out_emulated(struct x86_emulate_ctxt *ctxt, |
8190 | int size, unsigned short port, |
8191 | const void *val, unsigned int count) |
8192 | { |
8193 | return emulator_pio_out(emul_to_vcpu(ctxt), size, port, val, count); |
8194 | } |
8195 | |
8196 | static unsigned long get_segment_base(struct kvm_vcpu *vcpu, int seg) |
8197 | { |
8198 | return static_call(kvm_x86_get_segment_base)(vcpu, seg); |
8199 | } |
8200 | |
8201 | static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address) |
8202 | { |
	kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
8204 | } |
8205 | |
8206 | static int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu) |
8207 | { |
8208 | if (!need_emulate_wbinvd(vcpu)) |
8209 | return X86EMUL_CONTINUE; |
8210 | |
8211 | if (static_call(kvm_x86_has_wbinvd_exit)()) { |
8212 | int cpu = get_cpu(); |
8213 | |
		cpumask_set_cpu(cpu, vcpu->arch.wbinvd_dirty_mask);
		on_each_cpu_mask(vcpu->arch.wbinvd_dirty_mask,
				 wbinvd_ipi, NULL, 1);
		put_cpu();
		cpumask_clear(vcpu->arch.wbinvd_dirty_mask);
8219 | } else |
8220 | wbinvd(); |
8221 | return X86EMUL_CONTINUE; |
8222 | } |
8223 | |
8224 | int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu) |
8225 | { |
8226 | kvm_emulate_wbinvd_noskip(vcpu); |
8227 | return kvm_skip_emulated_instruction(vcpu); |
8228 | } |
8229 | EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd); |
8230 | |
8233 | static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt) |
8234 | { |
8235 | kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt)); |
8236 | } |
8237 | |
8238 | static unsigned long emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr) |
8239 | { |
8240 | return kvm_get_dr(emul_to_vcpu(ctxt), dr); |
8241 | } |
8242 | |
8243 | static int emulator_set_dr(struct x86_emulate_ctxt *ctxt, int dr, |
8244 | unsigned long value) |
8245 | { |
	return kvm_set_dr(emul_to_vcpu(ctxt), dr, value);
8248 | } |
8249 | |
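/*
 * Merge a 32-bit value into the low half of a control register, preserving
 * bits 63:32 of the current value; the emulator's CR0/CR4 writes below only
 * supply the low 32 bits.
 */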
8250 | static u64 mk_cr_64(u64 curr_cr, u32 new_val) |
8251 | { |
8252 | return (curr_cr & ~((1ULL << 32) - 1)) | new_val; |
8253 | } |
8254 | |
8255 | static unsigned long emulator_get_cr(struct x86_emulate_ctxt *ctxt, int cr) |
8256 | { |
8257 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
8258 | unsigned long value; |
8259 | |
8260 | switch (cr) { |
8261 | case 0: |
8262 | value = kvm_read_cr0(vcpu); |
8263 | break; |
8264 | case 2: |
8265 | value = vcpu->arch.cr2; |
8266 | break; |
8267 | case 3: |
8268 | value = kvm_read_cr3(vcpu); |
8269 | break; |
8270 | case 4: |
8271 | value = kvm_read_cr4(vcpu); |
8272 | break; |
8273 | case 8: |
8274 | value = kvm_get_cr8(vcpu); |
8275 | break; |
8276 | default: |
		kvm_err("%s: unexpected cr %u\n", __func__, cr);
8278 | return 0; |
8279 | } |
8280 | |
8281 | return value; |
8282 | } |
8283 | |
8284 | static int emulator_set_cr(struct x86_emulate_ctxt *ctxt, int cr, ulong val) |
8285 | { |
8286 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
8287 | int res = 0; |
8288 | |
8289 | switch (cr) { |
8290 | case 0: |
		res = kvm_set_cr0(vcpu, mk_cr_64(kvm_read_cr0(vcpu), val));
8292 | break; |
8293 | case 2: |
8294 | vcpu->arch.cr2 = val; |
8295 | break; |
8296 | case 3: |
8297 | res = kvm_set_cr3(vcpu, val); |
8298 | break; |
8299 | case 4: |
		res = kvm_set_cr4(vcpu, mk_cr_64(kvm_read_cr4(vcpu), val));
8301 | break; |
8302 | case 8: |
8303 | res = kvm_set_cr8(vcpu, val); |
8304 | break; |
8305 | default: |
		kvm_err("%s: unexpected cr %u\n", __func__, cr);
8307 | res = -1; |
8308 | } |
8309 | |
8310 | return res; |
8311 | } |
8312 | |
8313 | static int emulator_get_cpl(struct x86_emulate_ctxt *ctxt) |
8314 | { |
8315 | return static_call(kvm_x86_get_cpl)(emul_to_vcpu(ctxt)); |
8316 | } |
8317 | |
8318 | static void emulator_get_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) |
8319 | { |
8320 | static_call(kvm_x86_get_gdt)(emul_to_vcpu(ctxt), dt); |
8321 | } |
8322 | |
8323 | static void emulator_get_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) |
8324 | { |
8325 | static_call(kvm_x86_get_idt)(emul_to_vcpu(ctxt), dt); |
8326 | } |
8327 | |
8328 | static void emulator_set_gdt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) |
8329 | { |
8330 | static_call(kvm_x86_set_gdt)(emul_to_vcpu(ctxt), dt); |
8331 | } |
8332 | |
8333 | static void emulator_set_idt(struct x86_emulate_ctxt *ctxt, struct desc_ptr *dt) |
8334 | { |
8335 | static_call(kvm_x86_set_idt)(emul_to_vcpu(ctxt), dt); |
8336 | } |
8337 | |
8338 | static unsigned long emulator_get_cached_segment_base( |
8339 | struct x86_emulate_ctxt *ctxt, int seg) |
8340 | { |
8341 | return get_segment_base(emul_to_vcpu(ctxt), seg); |
8342 | } |
8343 | |
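/*
 * Translate KVM's cached segment state into a descriptor-table entry.  A
 * segment with the granularity bit set stores its limit in 4 KiB units,
 * hence the >> 12 here and the matching << 12 | 0xfff in
 * emulator_set_segment().
 */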
8344 | static bool emulator_get_segment(struct x86_emulate_ctxt *ctxt, u16 *selector, |
8345 | struct desc_struct *desc, u32 *base3, |
8346 | int seg) |
8347 | { |
8348 | struct kvm_segment var; |
8349 | |
	kvm_get_segment(emul_to_vcpu(ctxt), &var, seg);
8351 | *selector = var.selector; |
8352 | |
8353 | if (var.unusable) { |
8354 | memset(desc, 0, sizeof(*desc)); |
8355 | if (base3) |
8356 | *base3 = 0; |
8357 | return false; |
8358 | } |
8359 | |
8360 | if (var.g) |
8361 | var.limit >>= 12; |
	set_desc_limit(desc, var.limit);
	set_desc_base(desc, (unsigned long)var.base);
8364 | #ifdef CONFIG_X86_64 |
8365 | if (base3) |
8366 | *base3 = var.base >> 32; |
8367 | #endif |
8368 | desc->type = var.type; |
8369 | desc->s = var.s; |
8370 | desc->dpl = var.dpl; |
8371 | desc->p = var.present; |
8372 | desc->avl = var.avl; |
8373 | desc->l = var.l; |
8374 | desc->d = var.db; |
8375 | desc->g = var.g; |
8376 | |
8377 | return true; |
8378 | } |
8379 | |
8380 | static void emulator_set_segment(struct x86_emulate_ctxt *ctxt, u16 selector, |
8381 | struct desc_struct *desc, u32 base3, |
8382 | int seg) |
8383 | { |
8384 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
8385 | struct kvm_segment var; |
8386 | |
8387 | var.selector = selector; |
8388 | var.base = get_desc_base(desc); |
8389 | #ifdef CONFIG_X86_64 |
8390 | var.base |= ((u64)base3) << 32; |
8391 | #endif |
8392 | var.limit = get_desc_limit(desc); |
8393 | if (desc->g) |
8394 | var.limit = (var.limit << 12) | 0xfff; |
8395 | var.type = desc->type; |
8396 | var.dpl = desc->dpl; |
8397 | var.db = desc->d; |
8398 | var.s = desc->s; |
8399 | var.l = desc->l; |
8400 | var.g = desc->g; |
8401 | var.avl = desc->avl; |
8402 | var.present = desc->p; |
8403 | var.unusable = !var.present; |
8404 | var.padding = 0; |
8405 | |
	kvm_set_segment(vcpu, &var, seg);
8407 | return; |
8408 | } |
8409 | |
8410 | static int emulator_get_msr_with_filter(struct x86_emulate_ctxt *ctxt, |
8411 | u32 msr_index, u64 *pdata) |
8412 | { |
8413 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
8414 | int r; |
8415 | |
	r = kvm_get_msr_with_filter(vcpu, msr_index, pdata);
8417 | if (r < 0) |
8418 | return X86EMUL_UNHANDLEABLE; |
8419 | |
8420 | if (r) { |
		if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_RDMSR, 0,
				       complete_emulated_rdmsr, r))
8423 | return X86EMUL_IO_NEEDED; |
8424 | |
8425 | trace_kvm_msr_read_ex(msr_index); |
8426 | return X86EMUL_PROPAGATE_FAULT; |
8427 | } |
8428 | |
8429 | trace_kvm_msr_read(msr_index, *pdata); |
8430 | return X86EMUL_CONTINUE; |
8431 | } |
8432 | |
8433 | static int emulator_set_msr_with_filter(struct x86_emulate_ctxt *ctxt, |
8434 | u32 msr_index, u64 data) |
8435 | { |
8436 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
8437 | int r; |
8438 | |
	r = kvm_set_msr_with_filter(vcpu, msr_index, data);
8440 | if (r < 0) |
8441 | return X86EMUL_UNHANDLEABLE; |
8442 | |
8443 | if (r) { |
		if (kvm_msr_user_space(vcpu, msr_index, KVM_EXIT_X86_WRMSR, data,
				       complete_emulated_msr_access, r))
8446 | return X86EMUL_IO_NEEDED; |
8447 | |
8448 | trace_kvm_msr_write_ex(msr_index, data); |
8449 | return X86EMUL_PROPAGATE_FAULT; |
8450 | } |
8451 | |
8452 | trace_kvm_msr_write(msr_index, data); |
8453 | return X86EMUL_CONTINUE; |
8454 | } |
8455 | |
8456 | static int emulator_get_msr(struct x86_emulate_ctxt *ctxt, |
8457 | u32 msr_index, u64 *pdata) |
8458 | { |
8459 | return kvm_get_msr(emul_to_vcpu(ctxt), msr_index, pdata); |
8460 | } |
8461 | |
8462 | static int emulator_check_rdpmc_early(struct x86_emulate_ctxt *ctxt, u32 pmc) |
8463 | { |
	return kvm_pmu_check_rdpmc_early(emul_to_vcpu(ctxt), pmc);
8465 | } |
8466 | |
8467 | static int emulator_read_pmc(struct x86_emulate_ctxt *ctxt, |
8468 | u32 pmc, u64 *pdata) |
8469 | { |
	return kvm_pmu_rdpmc(emul_to_vcpu(ctxt), pmc, pdata);
8471 | } |
8472 | |
8473 | static void emulator_halt(struct x86_emulate_ctxt *ctxt) |
8474 | { |
8475 | emul_to_vcpu(ctxt)->arch.halt_request = 1; |
8476 | } |
8477 | |
8478 | static int emulator_intercept(struct x86_emulate_ctxt *ctxt, |
8479 | struct x86_instruction_info *info, |
8480 | enum x86_intercept_stage stage) |
8481 | { |
8482 | return static_call(kvm_x86_check_intercept)(emul_to_vcpu(ctxt), info, stage, |
8483 | &ctxt->exception); |
8484 | } |
8485 | |
8486 | static bool emulator_get_cpuid(struct x86_emulate_ctxt *ctxt, |
8487 | u32 *eax, u32 *ebx, u32 *ecx, u32 *edx, |
8488 | bool exact_only) |
8489 | { |
8490 | return kvm_cpuid(emul_to_vcpu(ctxt), eax, ebx, ecx, edx, exact_only); |
8491 | } |
8492 | |
8493 | static bool emulator_guest_has_movbe(struct x86_emulate_ctxt *ctxt) |
8494 | { |
8495 | return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_MOVBE); |
8496 | } |
8497 | |
8498 | static bool emulator_guest_has_fxsr(struct x86_emulate_ctxt *ctxt) |
8499 | { |
8500 | return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_FXSR); |
8501 | } |
8502 | |
8503 | static bool emulator_guest_has_rdpid(struct x86_emulate_ctxt *ctxt) |
8504 | { |
8505 | return guest_cpuid_has(emul_to_vcpu(ctxt), X86_FEATURE_RDPID); |
8506 | } |
8507 | |
8508 | static ulong emulator_read_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg) |
8509 | { |
8510 | return kvm_register_read_raw(emul_to_vcpu(ctxt), reg); |
8511 | } |
8512 | |
8513 | static void emulator_write_gpr(struct x86_emulate_ctxt *ctxt, unsigned reg, ulong val) |
8514 | { |
8515 | kvm_register_write_raw(emul_to_vcpu(ctxt), reg, val); |
8516 | } |
8517 | |
8518 | static void emulator_set_nmi_mask(struct x86_emulate_ctxt *ctxt, bool masked) |
8519 | { |
8520 | static_call(kvm_x86_set_nmi_mask)(emul_to_vcpu(ctxt), masked); |
8521 | } |
8522 | |
8523 | static bool emulator_is_smm(struct x86_emulate_ctxt *ctxt) |
8524 | { |
8525 | return is_smm(emul_to_vcpu(ctxt)); |
8526 | } |
8527 | |
8528 | static bool emulator_is_guest_mode(struct x86_emulate_ctxt *ctxt) |
8529 | { |
8530 | return is_guest_mode(emul_to_vcpu(ctxt)); |
8531 | } |
8532 | |
8533 | #ifndef CONFIG_KVM_SMM |
8534 | static int emulator_leave_smm(struct x86_emulate_ctxt *ctxt) |
8535 | { |
8536 | WARN_ON_ONCE(1); |
8537 | return X86EMUL_UNHANDLEABLE; |
8538 | } |
8539 | #endif |
8540 | |
8541 | static void emulator_triple_fault(struct x86_emulate_ctxt *ctxt) |
8542 | { |
8543 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, emul_to_vcpu(ctxt)); |
8544 | } |
8545 | |
8546 | static int emulator_set_xcr(struct x86_emulate_ctxt *ctxt, u32 index, u64 xcr) |
8547 | { |
8548 | return __kvm_set_xcr(emul_to_vcpu(ctxt), index, xcr); |
8549 | } |
8550 | |
8551 | static void emulator_vm_bugged(struct x86_emulate_ctxt *ctxt) |
8552 | { |
8553 | struct kvm *kvm = emul_to_vcpu(ctxt)->kvm; |
8554 | |
8555 | if (!kvm->vm_bugged) |
8556 | kvm_vm_bugged(kvm); |
8557 | } |
8558 | |
8559 | static gva_t emulator_get_untagged_addr(struct x86_emulate_ctxt *ctxt, |
8560 | gva_t addr, unsigned int flags) |
8561 | { |
8562 | if (!kvm_x86_ops.get_untagged_addr) |
8563 | return addr; |
8564 | |
8565 | return static_call(kvm_x86_get_untagged_addr)(emul_to_vcpu(ctxt), addr, flags); |
8566 | } |
8567 | |
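/* Callback table through which the generic x86 emulator drives KVM. */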
8568 | static const struct x86_emulate_ops emulate_ops = { |
8569 | .vm_bugged = emulator_vm_bugged, |
8570 | .read_gpr = emulator_read_gpr, |
8571 | .write_gpr = emulator_write_gpr, |
8572 | .read_std = emulator_read_std, |
8573 | .write_std = emulator_write_std, |
8574 | .fetch = kvm_fetch_guest_virt, |
8575 | .read_emulated = emulator_read_emulated, |
8576 | .write_emulated = emulator_write_emulated, |
8577 | .cmpxchg_emulated = emulator_cmpxchg_emulated, |
8578 | .invlpg = emulator_invlpg, |
8579 | .pio_in_emulated = emulator_pio_in_emulated, |
8580 | .pio_out_emulated = emulator_pio_out_emulated, |
8581 | .get_segment = emulator_get_segment, |
8582 | .set_segment = emulator_set_segment, |
8583 | .get_cached_segment_base = emulator_get_cached_segment_base, |
8584 | .get_gdt = emulator_get_gdt, |
8585 | .get_idt = emulator_get_idt, |
8586 | .set_gdt = emulator_set_gdt, |
8587 | .set_idt = emulator_set_idt, |
8588 | .get_cr = emulator_get_cr, |
8589 | .set_cr = emulator_set_cr, |
8590 | .cpl = emulator_get_cpl, |
8591 | .get_dr = emulator_get_dr, |
8592 | .set_dr = emulator_set_dr, |
8593 | .set_msr_with_filter = emulator_set_msr_with_filter, |
8594 | .get_msr_with_filter = emulator_get_msr_with_filter, |
8595 | .get_msr = emulator_get_msr, |
8596 | .check_rdpmc_early = emulator_check_rdpmc_early, |
8597 | .read_pmc = emulator_read_pmc, |
8598 | .halt = emulator_halt, |
8599 | .wbinvd = emulator_wbinvd, |
8600 | .fix_hypercall = emulator_fix_hypercall, |
8601 | .intercept = emulator_intercept, |
8602 | .get_cpuid = emulator_get_cpuid, |
8603 | .guest_has_movbe = emulator_guest_has_movbe, |
8604 | .guest_has_fxsr = emulator_guest_has_fxsr, |
8605 | .guest_has_rdpid = emulator_guest_has_rdpid, |
8606 | .set_nmi_mask = emulator_set_nmi_mask, |
8607 | .is_smm = emulator_is_smm, |
8608 | .is_guest_mode = emulator_is_guest_mode, |
8609 | .leave_smm = emulator_leave_smm, |
8610 | .triple_fault = emulator_triple_fault, |
8611 | .set_xcr = emulator_set_xcr, |
8612 | .get_untagged_addr = emulator_get_untagged_addr, |
8613 | }; |
8614 | |
8615 | static void toggle_interruptibility(struct kvm_vcpu *vcpu, u32 mask) |
8616 | { |
8617 | u32 int_shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); |
8618 | /* |
8619 | * an sti; sti; sequence only disable interrupts for the first |
8620 | * instruction. So, if the last instruction, be it emulated or |
8621 | * not, left the system with the INT_STI flag enabled, it |
8622 | * means that the last instruction is an sti. We should not |
8623 | * leave the flag on in this case. The same goes for mov ss |
8624 | */ |
8625 | if (int_shadow & mask) |
8626 | mask = 0; |
8627 | if (unlikely(int_shadow || mask)) { |
8628 | static_call(kvm_x86_set_interrupt_shadow)(vcpu, mask); |
8629 | if (!mask) |
8630 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
8631 | } |
8632 | } |
8633 | |
8634 | static void inject_emulated_exception(struct kvm_vcpu *vcpu) |
8635 | { |
8636 | struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; |
8637 | |
8638 | if (ctxt->exception.vector == PF_VECTOR) |
8639 | kvm_inject_emulated_page_fault(vcpu, &ctxt->exception); |
8640 | else if (ctxt->exception.error_code_valid) |
8641 | kvm_queue_exception_e(vcpu, ctxt->exception.vector, |
8642 | ctxt->exception.error_code); |
8643 | else |
8644 | kvm_queue_exception(vcpu, ctxt->exception.vector); |
8645 | } |
8646 | |
8647 | static struct x86_emulate_ctxt *alloc_emulate_ctxt(struct kvm_vcpu *vcpu) |
8648 | { |
8649 | struct x86_emulate_ctxt *ctxt; |
8650 | |
	ctxt = kmem_cache_zalloc(x86_emulator_cache, GFP_KERNEL_ACCOUNT);
8652 | if (!ctxt) { |
		pr_err("failed to allocate vcpu's emulator\n");
8654 | return NULL; |
8655 | } |
8656 | |
8657 | ctxt->vcpu = vcpu; |
8658 | ctxt->ops = &emulate_ops; |
8659 | vcpu->arch.emulate_ctxt = ctxt; |
8660 | |
8661 | return ctxt; |
8662 | } |
8663 | |
8664 | static void init_emulate_ctxt(struct kvm_vcpu *vcpu) |
8665 | { |
8666 | struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; |
8667 | int cs_db, cs_l; |
8668 | |
8669 | static_call(kvm_x86_get_cs_db_l_bits)(vcpu, &cs_db, &cs_l); |
8670 | |
8671 | ctxt->gpa_available = false; |
8672 | ctxt->eflags = kvm_get_rflags(vcpu); |
8673 | ctxt->tf = (ctxt->eflags & X86_EFLAGS_TF) != 0; |
8674 | |
8675 | ctxt->eip = kvm_rip_read(vcpu); |
8676 | ctxt->mode = (!is_protmode(vcpu)) ? X86EMUL_MODE_REAL : |
8677 | (ctxt->eflags & X86_EFLAGS_VM) ? X86EMUL_MODE_VM86 : |
8678 | (cs_l && is_long_mode(vcpu)) ? X86EMUL_MODE_PROT64 : |
8679 | cs_db ? X86EMUL_MODE_PROT32 : |
8680 | X86EMUL_MODE_PROT16; |
8681 | ctxt->interruptibility = 0; |
8682 | ctxt->have_exception = false; |
8683 | ctxt->exception.vector = -1; |
8684 | ctxt->perm_ok = false; |
8685 | |
8686 | init_decode_cache(ctxt); |
8687 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; |
8688 | } |
8689 | |
8690 | void kvm_inject_realmode_interrupt(struct kvm_vcpu *vcpu, int irq, int inc_eip) |
8691 | { |
8692 | struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; |
8693 | int ret; |
8694 | |
8695 | init_emulate_ctxt(vcpu); |
8696 | |
8697 | ctxt->op_bytes = 2; |
8698 | ctxt->ad_bytes = 2; |
8699 | ctxt->_eip = ctxt->eip + inc_eip; |
8700 | ret = emulate_int_real(ctxt, irq); |
8701 | |
8702 | if (ret != X86EMUL_CONTINUE) { |
8703 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
8704 | } else { |
8705 | ctxt->eip = ctxt->_eip; |
		kvm_rip_write(vcpu, ctxt->eip);
		kvm_set_rflags(vcpu, ctxt->eflags);
8708 | } |
8709 | } |
8710 | EXPORT_SYMBOL_GPL(kvm_inject_realmode_interrupt); |
8711 | |
8712 | static void prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, |
8713 | u8 ndata, u8 *insn_bytes, u8 insn_size) |
8714 | { |
8715 | struct kvm_run *run = vcpu->run; |
8716 | u64 info[5]; |
8717 | u8 info_start; |
8718 | |
8719 | /* |
8720 | * Zero the whole array used to retrieve the exit info, as casting to |
8721 | * u32 for select entries will leave some chunks uninitialized. |
8722 | */ |
8723 | memset(&info, 0, sizeof(info)); |
8724 | |
8725 | static_call(kvm_x86_get_exit_info)(vcpu, (u32 *)&info[0], &info[1], |
8726 | &info[2], (u32 *)&info[3], |
8727 | (u32 *)&info[4]); |
8728 | |
8729 | run->exit_reason = KVM_EXIT_INTERNAL_ERROR; |
8730 | run->emulation_failure.suberror = KVM_INTERNAL_ERROR_EMULATION; |
8731 | |
8732 | /* |
8733 | * There's currently space for 13 entries, but 5 are used for the exit |
8734 | * reason and info. Restrict to 4 to reduce the maintenance burden |
8735 | * when expanding kvm_run.emulation_failure in the future. |
8736 | */ |
8737 | if (WARN_ON_ONCE(ndata > 4)) |
8738 | ndata = 4; |
8739 | |
8740 | /* Always include the flags as a 'data' entry. */ |
8741 | info_start = 1; |
8742 | run->emulation_failure.flags = 0; |
8743 | |
8744 | if (insn_size) { |
8745 | BUILD_BUG_ON((sizeof(run->emulation_failure.insn_size) + |
8746 | sizeof(run->emulation_failure.insn_bytes) != 16)); |
8747 | info_start += 2; |
8748 | run->emulation_failure.flags |= |
8749 | KVM_INTERNAL_ERROR_EMULATION_FLAG_INSTRUCTION_BYTES; |
8750 | run->emulation_failure.insn_size = insn_size; |
8751 | memset(run->emulation_failure.insn_bytes, 0x90, |
8752 | sizeof(run->emulation_failure.insn_bytes)); |
8753 | memcpy(run->emulation_failure.insn_bytes, insn_bytes, insn_size); |
8754 | } |
8755 | |
8756 | memcpy(&run->internal.data[info_start], info, sizeof(info)); |
8757 | memcpy(&run->internal.data[info_start + ARRAY_SIZE(info)], data, |
8758 | ndata * sizeof(data[0])); |
8759 | |
8760 | run->emulation_failure.ndata = info_start + ARRAY_SIZE(info) + ndata; |
8761 | } |
8762 | |
8763 | static void prepare_emulation_ctxt_failure_exit(struct kvm_vcpu *vcpu) |
8764 | { |
8765 | struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; |
8766 | |
	prepare_emulation_failure_exit(vcpu, NULL, 0, ctxt->fetch.data,
				       ctxt->fetch.end - ctxt->fetch.data);
8769 | } |
8770 | |
8771 | void __kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu, u64 *data, |
8772 | u8 ndata) |
8773 | { |
	prepare_emulation_failure_exit(vcpu, data, ndata, NULL, 0);
8775 | } |
8776 | EXPORT_SYMBOL_GPL(__kvm_prepare_emulation_failure_exit); |
8777 | |
8778 | void kvm_prepare_emulation_failure_exit(struct kvm_vcpu *vcpu) |
8779 | { |
8780 | __kvm_prepare_emulation_failure_exit(vcpu, NULL, 0); |
8781 | } |
8782 | EXPORT_SYMBOL_GPL(kvm_prepare_emulation_failure_exit); |
8783 | |
8784 | static int handle_emulation_failure(struct kvm_vcpu *vcpu, int emulation_type) |
8785 | { |
8786 | struct kvm *kvm = vcpu->kvm; |
8787 | |
8788 | ++vcpu->stat.insn_emulation_fail; |
8789 | trace_kvm_emulate_insn_failed(vcpu); |
8790 | |
8791 | if (emulation_type & EMULTYPE_VMWARE_GP) { |
8792 | kvm_queue_exception_e(vcpu, GP_VECTOR, 0); |
8793 | return 1; |
8794 | } |
8795 | |
8796 | if (kvm->arch.exit_on_emulation_error || |
8797 | (emulation_type & EMULTYPE_SKIP)) { |
8798 | prepare_emulation_ctxt_failure_exit(vcpu); |
8799 | return 0; |
8800 | } |
8801 | |
8802 | kvm_queue_exception(vcpu, UD_VECTOR); |
8803 | |
8804 | if (!is_guest_mode(vcpu) && static_call(kvm_x86_get_cpl)(vcpu) == 0) { |
8805 | prepare_emulation_ctxt_failure_exit(vcpu); |
8806 | return 0; |
8807 | } |
8808 | |
8809 | return 1; |
8810 | } |
8811 | |
8812 | static bool reexecute_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, |
8813 | int emulation_type) |
8814 | { |
8815 | gpa_t gpa = cr2_or_gpa; |
8816 | kvm_pfn_t pfn; |
8817 | |
8818 | if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) |
8819 | return false; |
8820 | |
8821 | if (WARN_ON_ONCE(is_guest_mode(vcpu)) || |
8822 | WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) |
8823 | return false; |
8824 | |
8825 | if (!vcpu->arch.mmu->root_role.direct) { |
8826 | /* |
		 * Write permission should be allowed since only
		 * write accesses need to be emulated.
8829 | */ |
8830 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); |
8831 | |
8832 | /* |
		 * If the mapping is invalid in the guest, let the CPU
		 * retry it to generate the fault.
8835 | */ |
8836 | if (gpa == INVALID_GPA) |
8837 | return true; |
8838 | } |
8839 | |
8840 | /* |
	 * Do not retry the unhandleable instruction if it faults on
	 * read-only host memory; otherwise it will go into an infinite
	 * loop: retry instruction -> write #PF -> emulation fail ->
	 * retry instruction -> ...
8845 | */ |
	pfn = gfn_to_pfn(vcpu->kvm, gpa_to_gfn(gpa));
8847 | |
8848 | /* |
	 * If the instruction failed on an error pfn, it cannot be fixed;
	 * report the error to userspace.
8851 | */ |
8852 | if (is_error_noslot_pfn(pfn)) |
8853 | return false; |
8854 | |
8855 | kvm_release_pfn_clean(pfn); |
8856 | |
8857 | /* |
8858 | * If emulation may have been triggered by a write to a shadowed page |
8859 | * table, unprotect the gfn (zap any relevant SPTEs) and re-enter the |
8860 | * guest to let the CPU re-execute the instruction in the hope that the |
8861 | * CPU can cleanly execute the instruction that KVM failed to emulate. |
8862 | */ |
8863 | if (vcpu->kvm->arch.indirect_shadow_pages) |
		kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
8865 | |
8866 | /* |
8867 | * If the failed instruction faulted on an access to page tables that |
8868 | * are used to translate any part of the instruction, KVM can't resolve |
8869 | * the issue by unprotecting the gfn, as zapping the shadow page will |
8870 | * result in the instruction taking a !PRESENT page fault and thus put |
8871 | * the vCPU into an infinite loop of page faults. E.g. KVM will create |
8872 | * a SPTE and write-protect the gfn to resolve the !PRESENT fault, and |
8873 | * then zap the SPTE to unprotect the gfn, and then do it all over |
8874 | * again. Report the error to userspace. |
8875 | */ |
8876 | return !(emulation_type & EMULTYPE_WRITE_PF_TO_SP); |
8877 | } |
8878 | |
8879 | static bool retry_instruction(struct x86_emulate_ctxt *ctxt, |
8880 | gpa_t cr2_or_gpa, int emulation_type) |
8881 | { |
8882 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
8883 | unsigned long last_retry_eip, last_retry_addr, gpa = cr2_or_gpa; |
8884 | |
8885 | last_retry_eip = vcpu->arch.last_retry_eip; |
8886 | last_retry_addr = vcpu->arch.last_retry_addr; |
8887 | |
8888 | /* |
	 * If the emulation is caused by #PF and the instruction is not a
	 * page-table-writing instruction, it means the VM-EXIT was caused
	 * by shadow page protection; we can zap the shadow page and retry
	 * the instruction directly.
	 *
	 * Note: if the guest uses a non-page-table-modifying instruction
	 * on the PDE that points to the instruction, then we will unmap
	 * the instruction and go into an infinite loop.  So, we cache the
	 * last retried eip and the last fault address; if we meet the eip
	 * and the address again, we can break out of the potential infinite
	 * loop.
8900 | */ |
8901 | vcpu->arch.last_retry_eip = vcpu->arch.last_retry_addr = 0; |
8902 | |
8903 | if (!(emulation_type & EMULTYPE_ALLOW_RETRY_PF)) |
8904 | return false; |
8905 | |
8906 | if (WARN_ON_ONCE(is_guest_mode(vcpu)) || |
8907 | WARN_ON_ONCE(!(emulation_type & EMULTYPE_PF))) |
8908 | return false; |
8909 | |
8910 | if (x86_page_table_writing_insn(ctxt)) |
8911 | return false; |
8912 | |
8913 | if (ctxt->eip == last_retry_eip && last_retry_addr == cr2_or_gpa) |
8914 | return false; |
8915 | |
8916 | vcpu->arch.last_retry_eip = ctxt->eip; |
8917 | vcpu->arch.last_retry_addr = cr2_or_gpa; |
8918 | |
8919 | if (!vcpu->arch.mmu->root_role.direct) |
8920 | gpa = kvm_mmu_gva_to_gpa_write(vcpu, cr2_or_gpa, NULL); |
8921 | |
	kvm_mmu_unprotect_page(vcpu->kvm, gpa_to_gfn(gpa));
8923 | |
8924 | return true; |
8925 | } |
8926 | |
8927 | static int complete_emulated_mmio(struct kvm_vcpu *vcpu); |
8928 | static int complete_emulated_pio(struct kvm_vcpu *vcpu); |
8929 | |
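/*
 * Scan the four hardware breakpoint slots for a breakpoint at @addr.  DR7
 * packs two enable bits per slot in bits 7:0 and a 4-bit R/W+LEN field per
 * slot in bits 31:16; a slot matches if it is enabled, its R/W+LEN field
 * equals @type and its debug register equals @addr.  Returns the DR6 hit
 * bits for the matching slots.
 */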
8930 | static int kvm_vcpu_check_hw_bp(unsigned long addr, u32 type, u32 dr7, |
8931 | unsigned long *db) |
8932 | { |
8933 | u32 dr6 = 0; |
8934 | int i; |
8935 | u32 enable, rwlen; |
8936 | |
8937 | enable = dr7; |
8938 | rwlen = dr7 >> 16; |
8939 | for (i = 0; i < 4; i++, enable >>= 2, rwlen >>= 4) |
8940 | if ((enable & 3) && (rwlen & 15) == type && db[i] == addr) |
8941 | dr6 |= (1 << i); |
8942 | return dr6; |
8943 | } |
8944 | |
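/*
 * Deliver a single-step #DB: exit to userspace if it requested
 * single-stepping via KVM_GUESTDBG_SINGLESTEP, otherwise inject the trap
 * directly into the guest.
 */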
8945 | static int kvm_vcpu_do_singlestep(struct kvm_vcpu *vcpu) |
8946 | { |
8947 | struct kvm_run *kvm_run = vcpu->run; |
8948 | |
8949 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) { |
8950 | kvm_run->debug.arch.dr6 = DR6_BS | DR6_ACTIVE_LOW; |
8951 | kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu); |
8952 | kvm_run->debug.arch.exception = DB_VECTOR; |
8953 | kvm_run->exit_reason = KVM_EXIT_DEBUG; |
8954 | return 0; |
8955 | } |
8956 | kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BS); |
8957 | return 1; |
8958 | } |
8959 | |
8960 | int kvm_skip_emulated_instruction(struct kvm_vcpu *vcpu) |
8961 | { |
8962 | unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); |
8963 | int r; |
8964 | |
8965 | r = static_call(kvm_x86_skip_emulated_instruction)(vcpu); |
8966 | if (unlikely(!r)) |
8967 | return 0; |
8968 | |
	kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
8970 | |
8971 | /* |
8972 | * rflags is the old, "raw" value of the flags. The new value has |
8973 | * not been saved yet. |
8974 | * |
8975 | * This is correct even for TF set by the guest, because "the |
8976 | * processor will not generate this exception after the instruction |
8977 | * that sets the TF flag". |
8978 | */ |
8979 | if (unlikely(rflags & X86_EFLAGS_TF)) |
8980 | r = kvm_vcpu_do_singlestep(vcpu); |
8981 | return r; |
8982 | } |
8983 | EXPORT_SYMBOL_GPL(kvm_skip_emulated_instruction); |
8984 | |
8985 | static bool kvm_is_code_breakpoint_inhibited(struct kvm_vcpu *vcpu) |
8986 | { |
8987 | u32 shadow; |
8988 | |
8989 | if (kvm_get_rflags(vcpu) & X86_EFLAGS_RF) |
8990 | return true; |
8991 | |
8992 | /* |
8993 | * Intel CPUs inhibit code #DBs when MOV/POP SS blocking is active, |
8994 | * but AMD CPUs do not. MOV/POP SS blocking is rare, check that first |
8995 | * to avoid the relatively expensive CPUID lookup. |
8996 | */ |
8997 | shadow = static_call(kvm_x86_get_interrupt_shadow)(vcpu); |
8998 | return (shadow & KVM_X86_SHADOW_INT_MOV_SS) && |
8999 | guest_cpuid_is_intel(vcpu); |
9000 | } |
9001 | |
9002 | static bool kvm_vcpu_check_code_breakpoint(struct kvm_vcpu *vcpu, |
9003 | int emulation_type, int *r) |
9004 | { |
9005 | WARN_ON_ONCE(emulation_type & EMULTYPE_NO_DECODE); |
9006 | |
9007 | /* |
9008 | * Do not check for code breakpoints if hardware has already done the |
9009 | * checks, as inferred from the emulation type. On NO_DECODE and SKIP, |
9010 | * the instruction has passed all exception checks, and all intercepted |
9011 | * exceptions that trigger emulation have lower priority than code |
9012 | * breakpoints, i.e. the fact that the intercepted exception occurred |
9013 | * means any code breakpoints have already been serviced. |
9014 | * |
9015 | * Note, KVM needs to check for code #DBs on EMULTYPE_TRAP_UD_FORCED as |
9016 | * hardware has checked the RIP of the magic prefix, but not the RIP of |
9017 | * the instruction being emulated. The intent of forced emulation is |
9018 | * to behave as if KVM intercepted the instruction without an exception |
9019 | * and without a prefix. |
9020 | */ |
9021 | if (emulation_type & (EMULTYPE_NO_DECODE | EMULTYPE_SKIP | |
9022 | EMULTYPE_TRAP_UD | EMULTYPE_VMWARE_GP | EMULTYPE_PF)) |
9023 | return false; |
9024 | |
9025 | if (unlikely(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) && |
9026 | (vcpu->arch.guest_debug_dr7 & DR7_BP_EN_MASK)) { |
9027 | struct kvm_run *kvm_run = vcpu->run; |
9028 | unsigned long eip = kvm_get_linear_rip(vcpu); |
		u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
					       vcpu->arch.guest_debug_dr7,
					       vcpu->arch.eff_db);
9032 | |
9033 | if (dr6 != 0) { |
9034 | kvm_run->debug.arch.dr6 = dr6 | DR6_ACTIVE_LOW; |
9035 | kvm_run->debug.arch.pc = eip; |
9036 | kvm_run->debug.arch.exception = DB_VECTOR; |
9037 | kvm_run->exit_reason = KVM_EXIT_DEBUG; |
9038 | *r = 0; |
9039 | return true; |
9040 | } |
9041 | } |
9042 | |
9043 | if (unlikely(vcpu->arch.dr7 & DR7_BP_EN_MASK) && |
9044 | !kvm_is_code_breakpoint_inhibited(vcpu)) { |
9045 | unsigned long eip = kvm_get_linear_rip(vcpu); |
		u32 dr6 = kvm_vcpu_check_hw_bp(eip, 0,
					       vcpu->arch.dr7,
					       vcpu->arch.db);
9049 | |
9050 | if (dr6 != 0) { |
9051 | kvm_queue_exception_p(vcpu, DB_VECTOR, dr6); |
9052 | *r = 1; |
9053 | return true; |
9054 | } |
9055 | } |
9056 | |
9057 | return false; |
9058 | } |
9059 | |
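/*
 * The VMware backdoor only covers port I/O (IN/OUT/INS/OUTS) and RDPMC;
 * any other opcode that arrives with EMULTYPE_VMWARE_GP gets a #GP instead
 * of being emulated.
 */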
9060 | static bool is_vmware_backdoor_opcode(struct x86_emulate_ctxt *ctxt) |
9061 | { |
9062 | switch (ctxt->opcode_len) { |
9063 | case 1: |
9064 | switch (ctxt->b) { |
9065 | case 0xe4: /* IN */ |
9066 | case 0xe5: |
9067 | case 0xec: |
9068 | case 0xed: |
9069 | case 0xe6: /* OUT */ |
9070 | case 0xe7: |
9071 | case 0xee: |
9072 | case 0xef: |
9073 | case 0x6c: /* INS */ |
9074 | case 0x6d: |
9075 | case 0x6e: /* OUTS */ |
9076 | case 0x6f: |
9077 | return true; |
9078 | } |
9079 | break; |
9080 | case 2: |
9081 | switch (ctxt->b) { |
9082 | case 0x33: /* RDPMC */ |
9083 | return true; |
9084 | } |
9085 | break; |
9086 | } |
9087 | |
9088 | return false; |
9089 | } |
9090 | |
9091 | /* |
9092 | * Decode an instruction for emulation. The caller is responsible for handling |
9093 | * code breakpoints. Note, manually detecting code breakpoints is unnecessary |
9094 | * (and wrong) when emulating on an intercepted fault-like exception[*], as |
9095 | * code breakpoints have higher priority and thus have already been done by |
9096 | * hardware. |
9097 | * |
9098 | * [*] Except #MC, which is higher priority, but KVM should never emulate in |
9099 | * response to a machine check. |
9100 | */ |
9101 | int x86_decode_emulated_instruction(struct kvm_vcpu *vcpu, int emulation_type, |
9102 | void *insn, int insn_len) |
9103 | { |
9104 | struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; |
9105 | int r; |
9106 | |
9107 | init_emulate_ctxt(vcpu); |
9108 | |
9109 | r = x86_decode_insn(ctxt, insn, insn_len, emulation_type); |
9110 | |
9111 | trace_kvm_emulate_insn_start(vcpu); |
9112 | ++vcpu->stat.insn_emulation; |
9113 | |
9114 | return r; |
9115 | } |
9116 | EXPORT_SYMBOL_GPL(x86_decode_emulated_instruction); |
9117 | |
9118 | int x86_emulate_instruction(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, |
9119 | int emulation_type, void *insn, int insn_len) |
9120 | { |
9121 | int r; |
9122 | struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; |
9123 | bool writeback = true; |
9124 | |
	r = kvm_check_emulate_insn(vcpu, emulation_type, insn, insn_len);
9126 | if (r != X86EMUL_CONTINUE) { |
9127 | if (r == X86EMUL_RETRY_INSTR || r == X86EMUL_PROPAGATE_FAULT) |
9128 | return 1; |
9129 | |
9130 | WARN_ON_ONCE(r != X86EMUL_UNHANDLEABLE); |
9131 | return handle_emulation_failure(vcpu, emulation_type); |
9132 | } |
9133 | |
9134 | vcpu->arch.l1tf_flush_l1d = true; |
9135 | |
9136 | if (!(emulation_type & EMULTYPE_NO_DECODE)) { |
9137 | kvm_clear_exception_queue(vcpu); |
9138 | |
9139 | /* |
9140 | * Return immediately if RIP hits a code breakpoint, such #DBs |
9141 | * are fault-like and are higher priority than any faults on |
9142 | * the code fetch itself. |
9143 | */ |
		if (kvm_vcpu_check_code_breakpoint(vcpu, emulation_type, &r))
9145 | return r; |
9146 | |
9147 | r = x86_decode_emulated_instruction(vcpu, emulation_type, |
9148 | insn, insn_len); |
9149 | if (r != EMULATION_OK) { |
9150 | if ((emulation_type & EMULTYPE_TRAP_UD) || |
9151 | (emulation_type & EMULTYPE_TRAP_UD_FORCED)) { |
9152 | kvm_queue_exception(vcpu, UD_VECTOR); |
9153 | return 1; |
9154 | } |
9155 | if (reexecute_instruction(vcpu, cr2_or_gpa, |
9156 | emulation_type)) |
9157 | return 1; |
9158 | |
9159 | if (ctxt->have_exception && |
9160 | !(emulation_type & EMULTYPE_SKIP)) { |
9161 | /* |
9162 | * #UD should result in just EMULATION_FAILED, and trap-like |
9163 | * exception should not be encountered during decode. |
9164 | */ |
9165 | WARN_ON_ONCE(ctxt->exception.vector == UD_VECTOR || |
9166 | exception_type(ctxt->exception.vector) == EXCPT_TRAP); |
9167 | inject_emulated_exception(vcpu); |
9168 | return 1; |
9169 | } |
9170 | return handle_emulation_failure(vcpu, emulation_type); |
9171 | } |
9172 | } |
9173 | |
9174 | if ((emulation_type & EMULTYPE_VMWARE_GP) && |
9175 | !is_vmware_backdoor_opcode(ctxt)) { |
9176 | kvm_queue_exception_e(vcpu, GP_VECTOR, 0); |
9177 | return 1; |
9178 | } |
9179 | |
9180 | /* |
9181 | * EMULTYPE_SKIP without EMULTYPE_COMPLETE_USER_EXIT is intended for |
9182 | * use *only* by vendor callbacks for kvm_skip_emulated_instruction(). |
9183 | * The caller is responsible for updating interruptibility state and |
9184 | * injecting single-step #DBs. |
9185 | */ |
9186 | if (emulation_type & EMULTYPE_SKIP) { |
9187 | if (ctxt->mode != X86EMUL_MODE_PROT64) |
9188 | ctxt->eip = (u32)ctxt->_eip; |
9189 | else |
9190 | ctxt->eip = ctxt->_eip; |
9191 | |
9192 | if (emulation_type & EMULTYPE_COMPLETE_USER_EXIT) { |
9193 | r = 1; |
9194 | goto writeback; |
9195 | } |
9196 | |
		kvm_rip_write(vcpu, ctxt->eip);
		if (ctxt->eflags & X86_EFLAGS_RF)
			kvm_set_rflags(vcpu, ctxt->eflags & ~X86_EFLAGS_RF);
9200 | return 1; |
9201 | } |
9202 | |
9203 | if (retry_instruction(ctxt, cr2_or_gpa, emulation_type)) |
9204 | return 1; |
9205 | |
	/*
	 * This is needed for the vmware backdoor interface to work since
	 * it changes register values during the I/O operation.
	 */
9208 | if (vcpu->arch.emulate_regs_need_sync_from_vcpu) { |
9209 | vcpu->arch.emulate_regs_need_sync_from_vcpu = false; |
9210 | emulator_invalidate_register_cache(ctxt); |
9211 | } |
9212 | |
9213 | restart: |
9214 | if (emulation_type & EMULTYPE_PF) { |
9215 | /* Save the faulting GPA (cr2) in the address field */ |
9216 | ctxt->exception.address = cr2_or_gpa; |
9217 | |
9218 | /* With shadow page tables, cr2 contains a GVA or nGPA. */ |
9219 | if (vcpu->arch.mmu->root_role.direct) { |
9220 | ctxt->gpa_available = true; |
9221 | ctxt->gpa_val = cr2_or_gpa; |
9222 | } |
9223 | } else { |
9224 | /* Sanitize the address out of an abundance of paranoia. */ |
9225 | ctxt->exception.address = 0; |
9226 | } |
9227 | |
9228 | r = x86_emulate_insn(ctxt); |
9229 | |
9230 | if (r == EMULATION_INTERCEPTED) |
9231 | return 1; |
9232 | |
9233 | if (r == EMULATION_FAILED) { |
9234 | if (reexecute_instruction(vcpu, cr2_or_gpa, emulation_type)) |
9235 | return 1; |
9236 | |
9237 | return handle_emulation_failure(vcpu, emulation_type); |
9238 | } |
9239 | |
9240 | if (ctxt->have_exception) { |
9241 | WARN_ON_ONCE(vcpu->mmio_needed && !vcpu->mmio_is_write); |
9242 | vcpu->mmio_needed = false; |
9243 | r = 1; |
9244 | inject_emulated_exception(vcpu); |
9245 | } else if (vcpu->arch.pio.count) { |
9246 | if (!vcpu->arch.pio.in) { |
9247 | /* FIXME: return into emulator if single-stepping. */ |
9248 | vcpu->arch.pio.count = 0; |
9249 | } else { |
9250 | writeback = false; |
9251 | vcpu->arch.complete_userspace_io = complete_emulated_pio; |
9252 | } |
9253 | r = 0; |
9254 | } else if (vcpu->mmio_needed) { |
9255 | ++vcpu->stat.mmio_exits; |
9256 | |
9257 | if (!vcpu->mmio_is_write) |
9258 | writeback = false; |
9259 | r = 0; |
9260 | vcpu->arch.complete_userspace_io = complete_emulated_mmio; |
9261 | } else if (vcpu->arch.complete_userspace_io) { |
9262 | writeback = false; |
9263 | r = 0; |
9264 | } else if (r == EMULATION_RESTART) |
9265 | goto restart; |
9266 | else |
9267 | r = 1; |
9268 | |
9269 | writeback: |
9270 | if (writeback) { |
9271 | unsigned long rflags = static_call(kvm_x86_get_rflags)(vcpu); |
		toggle_interruptibility(vcpu, ctxt->interruptibility);
9273 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; |
9274 | |
9275 | /* |
9276 | * Note, EXCPT_DB is assumed to be fault-like as the emulator |
9277 | * only supports code breakpoints and general detect #DB, both |
9278 | * of which are fault-like. |
9279 | */ |
9280 | if (!ctxt->have_exception || |
		    exception_type(ctxt->exception.vector) == EXCPT_TRAP) {
			kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.INSTRUCTIONS_RETIRED);
			if (ctxt->is_branch)
				kvm_pmu_trigger_event(vcpu, kvm_pmu_eventsel.BRANCH_INSTRUCTIONS_RETIRED);
			kvm_rip_write(vcpu, ctxt->eip);
9286 | if (r && (ctxt->tf || (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP))) |
9287 | r = kvm_vcpu_do_singlestep(vcpu); |
9288 | static_call_cond(kvm_x86_update_emulated_instruction)(vcpu); |
			__kvm_set_rflags(vcpu, ctxt->eflags);
9290 | } |
9291 | |
9292 | /* |
9293 | * For STI, interrupts are shadowed; so KVM_REQ_EVENT will |
9294 | * do nothing, and it will be requested again as soon as |
9295 | * the shadow expires. But we still need to check here, |
9296 | * because POPF has no interrupt shadow. |
9297 | */ |
9298 | if (unlikely((ctxt->eflags & ~rflags) & X86_EFLAGS_IF)) |
9299 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
9300 | } else |
9301 | vcpu->arch.emulate_regs_need_sync_to_vcpu = true; |
9302 | |
9303 | return r; |
9304 | } |
9305 | |
9306 | int kvm_emulate_instruction(struct kvm_vcpu *vcpu, int emulation_type) |
9307 | { |
	return x86_emulate_instruction(vcpu, 0, emulation_type, NULL, 0);
9309 | } |
9310 | EXPORT_SYMBOL_GPL(kvm_emulate_instruction); |
9311 | |
9312 | int kvm_emulate_instruction_from_buffer(struct kvm_vcpu *vcpu, |
9313 | void *insn, int insn_len) |
9314 | { |
	return x86_emulate_instruction(vcpu, 0, 0, insn, insn_len);
9316 | } |
9317 | EXPORT_SYMBOL_GPL(kvm_emulate_instruction_from_buffer); |
9318 | |
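/*
 * Completion for the "OUT 0x7e" quirk: RIP was already advanced before
 * exiting to userspace, so all that's left is to clear pio.count.
 */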
9319 | static int complete_fast_pio_out_port_0x7e(struct kvm_vcpu *vcpu) |
9320 | { |
9321 | vcpu->arch.pio.count = 0; |
9322 | return 1; |
9323 | } |
9324 | |
9325 | static int complete_fast_pio_out(struct kvm_vcpu *vcpu) |
9326 | { |
9327 | vcpu->arch.pio.count = 0; |
9328 | |
9329 | if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) |
9330 | return 1; |
9331 | |
9332 | return kvm_skip_emulated_instruction(vcpu); |
9333 | } |
9334 | |
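/*
 * Fast-path a single OUT from RAX.  If the write must go to userspace,
 * record the linear RIP so that completion skips the instruction only if
 * userspace didn't move RIP in the meantime.
 */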
9335 | static int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, |
9336 | unsigned short port) |
9337 | { |
9338 | unsigned long val = kvm_rax_read(vcpu); |
	int ret = emulator_pio_out(vcpu, size, port, &val, 1);
9340 | |
9341 | if (ret) |
9342 | return ret; |
9343 | |
9344 | /* |
9345 | * Workaround userspace that relies on old KVM behavior of %rip being |
9346 | * incremented prior to exiting to userspace to handle "OUT 0x7e". |
9347 | */ |
9348 | if (port == 0x7e && |
	    kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_OUT_7E_INC_RIP)) {
9350 | vcpu->arch.complete_userspace_io = |
9351 | complete_fast_pio_out_port_0x7e; |
9352 | kvm_skip_emulated_instruction(vcpu); |
9353 | } else { |
9354 | vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); |
9355 | vcpu->arch.complete_userspace_io = complete_fast_pio_out; |
9356 | } |
9357 | return 0; |
9358 | } |
9359 | |
9360 | static int complete_fast_pio_in(struct kvm_vcpu *vcpu) |
9361 | { |
9362 | unsigned long val; |
9363 | |
9364 | /* We should only ever be called with arch.pio.count equal to 1 */ |
9365 | BUG_ON(vcpu->arch.pio.count != 1); |
9366 | |
9367 | if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.pio.linear_rip))) { |
9368 | vcpu->arch.pio.count = 0; |
9369 | return 1; |
9370 | } |
9371 | |
9372 | /* For size less than 4 we merge, else we zero extend */ |
9373 | val = (vcpu->arch.pio.size < 4) ? kvm_rax_read(vcpu) : 0; |
9374 | |
	complete_emulator_pio_in(vcpu, &val);
9376 | kvm_rax_write(vcpu, val); |
9377 | |
9378 | return kvm_skip_emulated_instruction(vcpu); |
9379 | } |
9380 | |
9381 | static int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, |
9382 | unsigned short port) |
9383 | { |
9384 | unsigned long val; |
9385 | int ret; |
9386 | |
9387 | /* For size less than 4 we merge, else we zero extend */ |
9388 | val = (size < 4) ? kvm_rax_read(vcpu) : 0; |
9389 | |
	ret = emulator_pio_in(vcpu, size, port, &val, 1);
9391 | if (ret) { |
9392 | kvm_rax_write(vcpu, val); |
9393 | return ret; |
9394 | } |
9395 | |
9396 | vcpu->arch.pio.linear_rip = kvm_get_linear_rip(vcpu); |
9397 | vcpu->arch.complete_userspace_io = complete_fast_pio_in; |
9398 | |
9399 | return 0; |
9400 | } |
9401 | |
9402 | int kvm_fast_pio(struct kvm_vcpu *vcpu, int size, unsigned short port, int in) |
9403 | { |
9404 | int ret; |
9405 | |
9406 | if (in) |
9407 | ret = kvm_fast_pio_in(vcpu, size, port); |
9408 | else |
9409 | ret = kvm_fast_pio_out(vcpu, size, port); |
9410 | return ret && kvm_skip_emulated_instruction(vcpu); |
9411 | } |
9412 | EXPORT_SYMBOL_GPL(kvm_fast_pio); |
9413 | |
9414 | static int kvmclock_cpu_down_prep(unsigned int cpu) |
9415 | { |
9416 | __this_cpu_write(cpu_tsc_khz, 0); |
9417 | return 0; |
9418 | } |
9419 | |
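/*
 * Refresh this CPU's cached TSC frequency, taking the new value from a
 * cpufreq transition (@data) or querying cpufreq directly, and falling
 * back to tsc_khz.  Only used on hosts without a constant TSC.
 */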
9420 | static void tsc_khz_changed(void *data) |
9421 | { |
9422 | struct cpufreq_freqs *freq = data; |
9423 | unsigned long khz; |
9424 | |
9425 | WARN_ON_ONCE(boot_cpu_has(X86_FEATURE_CONSTANT_TSC)); |
9426 | |
9427 | if (data) |
9428 | khz = freq->new; |
9429 | else |
9430 | khz = cpufreq_quick_get(raw_smp_processor_id()); |
9431 | if (!khz) |
9432 | khz = tsc_khz; |
9433 | __this_cpu_write(cpu_tsc_khz, khz); |
9434 | } |
9435 | |
9436 | #ifdef CONFIG_X86_64 |
9437 | static void kvm_hyperv_tsc_notifier(void) |
9438 | { |
9439 | struct kvm *kvm; |
9440 | int cpu; |
9441 | |
9442 | mutex_lock(&kvm_lock); |
9443 | list_for_each_entry(kvm, &vm_list, vm_list) |
9444 | kvm_make_mclock_inprogress_request(kvm); |
9445 | |
9446 | /* no guest entries from this point */ |
9447 | hyperv_stop_tsc_emulation(); |
9448 | |
9449 | /* TSC frequency always matches when on Hyper-V */ |
9450 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { |
9451 | for_each_present_cpu(cpu) |
9452 | per_cpu(cpu_tsc_khz, cpu) = tsc_khz; |
9453 | } |
9454 | kvm_caps.max_guest_tsc_khz = tsc_khz; |
9455 | |
9456 | list_for_each_entry(kvm, &vm_list, vm_list) { |
9457 | __kvm_start_pvclock_update(kvm); |
9458 | pvclock_update_vm_gtod_copy(kvm); |
9459 | kvm_end_pvclock_update(kvm); |
9460 | } |
9461 | |
	mutex_unlock(&kvm_lock);
9463 | } |
9464 | #endif |
9465 | |
9466 | static void __kvmclock_cpufreq_notifier(struct cpufreq_freqs *freq, int cpu) |
9467 | { |
9468 | struct kvm *kvm; |
9469 | struct kvm_vcpu *vcpu; |
9470 | int send_ipi = 0; |
9471 | unsigned long i; |
9472 | |
9473 | /* |
9474 | * We allow guests to temporarily run on slowing clocks, |
9475 | * provided we notify them after, or to run on accelerating |
9476 | * clocks, provided we notify them before. Thus time never |
9477 | * goes backwards. |
9478 | * |
9479 | * However, we have a problem. We can't atomically update |
9480 | * the frequency of a given CPU from this function; it is |
9481 | * merely a notifier, which can be called from any CPU. |
9482 | * Changing the TSC frequency at arbitrary points in time |
9483 | * requires a recomputation of local variables related to |
9484 | * the TSC for each VCPU. We must flag these local variables |
9485 | * to be updated and be sure the update takes place with the |
9486 | * new frequency before any guests proceed. |
9487 | * |
9488 | * Unfortunately, the combination of hotplug CPU and frequency |
9489 | * change creates an intractable locking scenario; the order |
9490 | * of when these callouts happen is undefined with respect to |
9491 | * CPU hotplug, and they can race with each other. As such, |
9492 | * merely setting per_cpu(cpu_tsc_khz) = X during a hotadd is |
9493 | * undefined; you can actually have a CPU frequency change take |
9494 | * place in between the computation of X and the setting of the |
9495 | * variable. To protect against this problem, all updates of |
9496 | * the per_cpu tsc_khz variable are done in an interrupt |
9497 | * protected IPI, and all callers wishing to update the value |
9498 | * must wait for a synchronous IPI to complete (which is trivial |
9499 | * if the caller is on the CPU already). This establishes the |
9500 | * necessary total order on variable updates. |
9501 | * |
9502 | * Note that because a guest time update may take place |
9503 | * anytime after the setting of the VCPU's request bit, the |
9504 | * correct TSC value must be set before the request. However, |
9505 | * to ensure the update actually makes it to any guest which |
9506 | * starts running in hardware virtualization between the set |
9507 | * and the acquisition of the spinlock, we must also ping the |
9508 | * CPU after setting the request bit. |
9509 | * |
9510 | */ |
9511 | |
	smp_call_function_single(cpu, tsc_khz_changed, freq, 1);
9513 | |
9514 | mutex_lock(&kvm_lock); |
9515 | list_for_each_entry(kvm, &vm_list, vm_list) { |
9516 | kvm_for_each_vcpu(i, vcpu, kvm) { |
9517 | if (vcpu->cpu != cpu) |
9518 | continue; |
9519 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
9520 | if (vcpu->cpu != raw_smp_processor_id()) |
9521 | send_ipi = 1; |
9522 | } |
9523 | } |
	mutex_unlock(&kvm_lock);
9525 | |
9526 | if (freq->old < freq->new && send_ipi) { |
9527 | /* |
9528 | * We upscale the frequency. Must make the guest |
9529 | * doesn't see old kvmclock values while running with |
9530 | * the new frequency, otherwise we risk the guest sees |
9531 | * time go backwards. |
9532 | * |
9533 | * In case we update the frequency for another cpu |
9534 | * (which might be in guest context) send an interrupt |
9535 | * to kick the cpu out of guest context. Next time |
9536 | * guest context is entered kvmclock will be updated, |
9537 | * so the guest will not see stale values. |
9538 | */ |
9539 | smp_call_function_single(cpuid: cpu, func: tsc_khz_changed, info: freq, wait: 1); |
9540 | } |
9541 | } |
9542 | |
9543 | static int kvmclock_cpufreq_notifier(struct notifier_block *nb, unsigned long val, |
9544 | void *data) |
9545 | { |
9546 | struct cpufreq_freqs *freq = data; |
9547 | int cpu; |
9548 | |
9549 | if (val == CPUFREQ_PRECHANGE && freq->old > freq->new) |
9550 | return 0; |
9551 | if (val == CPUFREQ_POSTCHANGE && freq->old < freq->new) |
9552 | return 0; |
9553 | |
9554 | for_each_cpu(cpu, freq->policy->cpus) |
9555 | __kvmclock_cpufreq_notifier(freq, cpu); |
9556 | |
9557 | return 0; |
9558 | } |
9559 | |
9560 | static struct notifier_block kvmclock_cpufreq_notifier_block = { |
9561 | .notifier_call = kvmclock_cpufreq_notifier |
9562 | }; |
9563 | |
9564 | static int kvmclock_cpu_online(unsigned int cpu) |
9565 | { |
9566 | tsc_khz_changed(NULL); |
9567 | return 0; |
9568 | } |
9569 | |
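/*
 * On hosts without a constant TSC, snapshot the maximum CPU frequency and
 * register cpufreq and hotplug callbacks so that the per-CPU TSC
 * frequencies stay coherent for kvmclock.
 */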
9570 | static void kvm_timer_init(void) |
9571 | { |
9572 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { |
9573 | max_tsc_khz = tsc_khz; |
9574 | |
9575 | if (IS_ENABLED(CONFIG_CPU_FREQ)) { |
9576 | struct cpufreq_policy *policy; |
9577 | int cpu; |
9578 | |
9579 | cpu = get_cpu(); |
9580 | policy = cpufreq_cpu_get(cpu); |
9581 | if (policy) { |
9582 | if (policy->cpuinfo.max_freq) |
9583 | max_tsc_khz = policy->cpuinfo.max_freq; |
9584 | cpufreq_cpu_put(policy); |
9585 | } |
9586 | put_cpu(); |
9587 | } |
		cpufreq_register_notifier(&kvmclock_cpufreq_notifier_block,
					  CPUFREQ_TRANSITION_NOTIFIER);
9590 | |
		cpuhp_setup_state(CPUHP_AP_X86_KVM_CLK_ONLINE, "x86/kvm/clk:online",
				  kvmclock_cpu_online, kvmclock_cpu_down_prep);
9593 | } |
9594 | } |
9595 | |
9596 | #ifdef CONFIG_X86_64 |
9597 | static void pvclock_gtod_update_fn(struct work_struct *work) |
9598 | { |
9599 | struct kvm *kvm; |
9600 | struct kvm_vcpu *vcpu; |
9601 | unsigned long i; |
9602 | |
9603 | mutex_lock(&kvm_lock); |
9604 | list_for_each_entry(kvm, &vm_list, vm_list) |
9605 | kvm_for_each_vcpu(i, vcpu, kvm) |
9606 | kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); |
	atomic_set(&kvm_guest_has_master_clock, 0);
	mutex_unlock(&kvm_lock);
9609 | } |
9610 | |
9611 | static DECLARE_WORK(pvclock_gtod_work, pvclock_gtod_update_fn); |
9612 | |
9613 | /* |
9614 | * Indirection to move queue_work() out of the tk_core.seq write held |
9615 | * region to prevent possible deadlocks against time accessors which |
9616 | * are invoked with work related locks held. |
9617 | */ |
9618 | static void pvclock_irq_work_fn(struct irq_work *w) |
9619 | { |
	queue_work(system_long_wq, &pvclock_gtod_work);
9621 | } |
9622 | |
9623 | static DEFINE_IRQ_WORK(pvclock_irq_work, pvclock_irq_work_fn); |
9624 | |
9625 | /* |
9626 | * Notification about pvclock gtod data update. |
9627 | */ |
9628 | static int pvclock_gtod_notify(struct notifier_block *nb, unsigned long unused, |
9629 | void *priv) |
9630 | { |
9631 | struct pvclock_gtod_data *gtod = &pvclock_gtod_data; |
9632 | struct timekeeper *tk = priv; |
9633 | |
9634 | update_pvclock_gtod(tk); |
9635 | |
9636 | /* |
9637 | * Disable master clock if host does not trust, or does not use, |
9638 | * TSC based clocksource. Delegate queue_work() to irq_work as |
9639 | * this is invoked with tk_core.seq write held. |
9640 | */ |
	if (!gtod_is_based_on_tsc(gtod->clock.vclock_mode) &&
	    atomic_read(&kvm_guest_has_master_clock) != 0)
		irq_work_queue(&pvclock_irq_work);
9644 | return 0; |
9645 | } |
9646 | |
9647 | static struct notifier_block pvclock_gtod_notifier = { |
9648 | .notifier_call = pvclock_gtod_notify, |
9649 | }; |
9650 | #endif |
9651 | |
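/*
 * Copy the vendor module's runtime ops into kvm_x86_ops and retarget every
 * kvm_x86_* static call at them.  Mandatory ops are sanity-checked for
 * NULL, and OPTIONAL_RET0 ops left NULL by the vendor fall back to a stub
 * that returns 0.
 */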
9652 | static inline void kvm_ops_update(struct kvm_x86_init_ops *ops) |
9653 | { |
9654 | memcpy(&kvm_x86_ops, ops->runtime_ops, sizeof(kvm_x86_ops)); |
9655 | |
9656 | #define __KVM_X86_OP(func) \ |
9657 | static_call_update(kvm_x86_##func, kvm_x86_ops.func); |
9658 | #define KVM_X86_OP(func) \ |
9659 | WARN_ON(!kvm_x86_ops.func); __KVM_X86_OP(func) |
9660 | #define KVM_X86_OP_OPTIONAL __KVM_X86_OP |
9661 | #define KVM_X86_OP_OPTIONAL_RET0(func) \ |
9662 | static_call_update(kvm_x86_##func, (void *)kvm_x86_ops.func ? : \ |
9663 | (void *)__static_call_return0); |
9664 | #include <asm/kvm-x86-ops.h> |
9665 | #undef __KVM_X86_OP |
9666 | |
	kvm_pmu_ops_update(ops->pmu_ops);
9668 | } |
9669 | |
9670 | static int kvm_x86_check_processor_compatibility(void) |
9671 | { |
9672 | int cpu = smp_processor_id(); |
9673 | struct cpuinfo_x86 *c = &cpu_data(cpu); |
9674 | |
9675 | /* |
9676 | * Compatibility checks are done when loading KVM and when enabling |
9677 | * hardware, e.g. during CPU hotplug, to ensure all online CPUs are |
9678 | * compatible, i.e. KVM should never perform a compatibility check on |
9679 | * an offline CPU. |
9680 | */ |
9681 | WARN_ON(!cpu_online(cpu)); |
9682 | |
9683 | if (__cr4_reserved_bits(cpu_has, c) != |
9684 | __cr4_reserved_bits(cpu_has, &boot_cpu_data)) |
9685 | return -EIO; |
9686 | |
9687 | return static_call(kvm_x86_check_processor_compatibility)(); |
9688 | } |
9689 | |
9690 | static void kvm_x86_check_cpu_compat(void *ret) |
9691 | { |
9692 | *(int *)ret = kvm_x86_check_processor_compatibility(); |
9693 | } |
9694 | |
9695 | int kvm_x86_vendor_init(struct kvm_x86_init_ops *ops) |
9696 | { |
9697 | u64 host_pat; |
9698 | int r, cpu; |
9699 | |
	guard(mutex)(&vendor_module_lock);
9701 | |
9702 | if (kvm_x86_ops.hardware_enable) { |
		pr_err("already loaded vendor module '%s'\n", kvm_x86_ops.name);
9704 | return -EEXIST; |
9705 | } |
9706 | |
9707 | /* |
9708 | * KVM explicitly assumes that the guest has an FPU and |
9709 | * FXSAVE/FXRSTOR. For example, the KVM_GET_FPU explicitly casts the |
9710 | * vCPU's FPU state as a fxregs_state struct. |
9711 | */ |
9712 | if (!boot_cpu_has(X86_FEATURE_FPU) || !boot_cpu_has(X86_FEATURE_FXSR)) { |
		pr_err("inadequate fpu\n");
9714 | return -EOPNOTSUPP; |
9715 | } |
9716 | |
9717 | if (IS_ENABLED(CONFIG_PREEMPT_RT) && !boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { |
		pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n");
9719 | return -EOPNOTSUPP; |
9720 | } |
9721 | |
9722 | /* |
9723 | * KVM assumes that PAT entry '0' encodes WB memtype and simply zeroes |
9724 | * the PAT bits in SPTEs. Bail if PAT[0] is programmed to something |
9725 | * other than WB. Note, EPT doesn't utilize the PAT, but don't bother |
9726 | * with an exception. PAT[0] is set to WB on RESET and also by the |
9727 | * kernel, i.e. failure indicates a kernel bug or broken firmware. |
9728 | */ |
	if (rdmsrl_safe(MSR_IA32_CR_PAT, &host_pat) ||
	    (host_pat & GENMASK(2, 0)) != 6) {
		pr_err("host PAT[0] is not WB\n");
9732 | return -EIO; |
9733 | } |
9734 | |
9735 | x86_emulator_cache = kvm_alloc_emulator_cache(); |
9736 | if (!x86_emulator_cache) { |
		pr_err("failed to allocate cache for x86 emulator\n");
9738 | return -ENOMEM; |
9739 | } |
9740 | |
9741 | user_return_msrs = alloc_percpu(struct kvm_user_return_msrs); |
9742 | if (!user_return_msrs) { |
		pr_err("failed to allocate percpu kvm_user_return_msrs\n");
9744 | r = -ENOMEM; |
9745 | goto out_free_x86_emulator_cache; |
9746 | } |
9747 | kvm_nr_uret_msrs = 0; |
9748 | |
9749 | r = kvm_mmu_vendor_module_init(); |
9750 | if (r) |
9751 | goto out_free_percpu; |
9752 | |
9753 | if (boot_cpu_has(X86_FEATURE_XSAVE)) { |
9754 | host_xcr0 = xgetbv(XCR_XFEATURE_ENABLED_MASK); |
9755 | kvm_caps.supported_xcr0 = host_xcr0 & KVM_SUPPORTED_XCR0; |
9756 | } |
9757 | |
	rdmsrl_safe(MSR_EFER, &host_efer);
9759 | |
9760 | if (boot_cpu_has(X86_FEATURE_XSAVES)) |
9761 | rdmsrl(MSR_IA32_XSS, host_xss); |
9762 | |
	kvm_init_pmu_capability(ops->pmu_ops);
9764 | |
9765 | if (boot_cpu_has(X86_FEATURE_ARCH_CAPABILITIES)) |
9766 | rdmsrl(MSR_IA32_ARCH_CAPABILITIES, host_arch_capabilities); |
9767 | |
9768 | r = ops->hardware_setup(); |
9769 | if (r != 0) |
9770 | goto out_mmu_exit; |
9771 | |
9772 | kvm_ops_update(ops); |
9773 | |
9774 | for_each_online_cpu(cpu) { |
		smp_call_function_single(cpu, kvm_x86_check_cpu_compat, &r, 1);
9776 | if (r < 0) |
9777 | goto out_unwind_ops; |
9778 | } |
9779 | |
9780 | /* |
9781 | * Point of no return! DO NOT add error paths below this point unless |
9782 | * absolutely necessary, as most operations from this point forward |
9783 | * require unwinding. |
9784 | */ |
9785 | kvm_timer_init(); |
9786 | |
9787 | if (pi_inject_timer == -1) |
		pi_inject_timer = housekeeping_enabled(HK_TYPE_TIMER);
9789 | #ifdef CONFIG_X86_64 |
	pvclock_gtod_register_notifier(&pvclock_gtod_notifier);
9791 | |
	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
9793 | set_hv_tscchange_cb(kvm_hyperv_tsc_notifier); |
9794 | #endif |
9795 | |
	kvm_register_perf_callbacks(ops->handle_intel_pt_intr);
9797 | |
9798 | if (!kvm_cpu_cap_has(X86_FEATURE_XSAVES)) |
9799 | kvm_caps.supported_xss = 0; |
9800 | |
9801 | #define __kvm_cpu_cap_has(UNUSED_, f) kvm_cpu_cap_has(f) |
9802 | cr4_reserved_bits = __cr4_reserved_bits(__kvm_cpu_cap_has, UNUSED_); |
9803 | #undef __kvm_cpu_cap_has |
9804 | |
9805 | if (kvm_caps.has_tsc_control) { |
9806 | /* |
9807 | * Make sure the user can only configure tsc_khz values that |
9808 | * fit into a signed integer. |
9809 | * A min value is not calculated because it will always |
9810 | * be 1 on all machines. |
9811 | */ |
9812 | u64 max = min(0x7fffffffULL, |
9813 | __scale_tsc(kvm_caps.max_tsc_scaling_ratio, tsc_khz)); |
9814 | kvm_caps.max_guest_tsc_khz = max; |
9815 | } |
9816 | kvm_caps.default_tsc_scaling_ratio = 1ULL << kvm_caps.tsc_scaling_ratio_frac_bits; |
9817 | kvm_init_msr_lists(); |
9818 | return 0; |
9819 | |
9820 | out_unwind_ops: |
9821 | kvm_x86_ops.hardware_enable = NULL; |
9822 | static_call(kvm_x86_hardware_unsetup)(); |
9823 | out_mmu_exit: |
9824 | kvm_mmu_vendor_module_exit(); |
9825 | out_free_percpu: |
	free_percpu(user_return_msrs);
out_free_x86_emulator_cache:
	kmem_cache_destroy(x86_emulator_cache);
9829 | return r; |
9830 | } |
9831 | EXPORT_SYMBOL_GPL(kvm_x86_vendor_init); |
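
/*
 * Illustrative sketch (not part of this file): a vendor module such as
 * kvm_intel or kvm_amd pairs kvm_x86_vendor_init() above with
 * kvm_x86_vendor_exit() below in its module init/exit hooks, roughly as
 * follows, where "my_init_ops" is a hypothetical struct kvm_x86_init_ops
 * instance:
 *
 *	static int __init my_module_init(void)
 *	{
 *		return kvm_x86_vendor_init(&my_init_ops);
 *	}
 *
 *	static void __exit my_module_exit(void)
 *	{
 *		kvm_x86_vendor_exit();
 *	}
 */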
9832 | |
9833 | void kvm_x86_vendor_exit(void) |
9834 | { |
9835 | kvm_unregister_perf_callbacks(); |
9836 | |
9837 | #ifdef CONFIG_X86_64 |
	if (hypervisor_is_type(X86_HYPER_MS_HYPERV))
9839 | clear_hv_tscchange_cb(); |
9840 | #endif |
9841 | kvm_lapic_exit(); |
9842 | |
9843 | if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { |
		cpufreq_unregister_notifier(&kvmclock_cpufreq_notifier_block,
					    CPUFREQ_TRANSITION_NOTIFIER);
		cpuhp_remove_state_nocalls(CPUHP_AP_X86_KVM_CLK_ONLINE);
9847 | } |
9848 | #ifdef CONFIG_X86_64 |
	pvclock_gtod_unregister_notifier(&pvclock_gtod_notifier);
	irq_work_sync(&pvclock_irq_work);
	cancel_work_sync(&pvclock_gtod_work);
9852 | #endif |
9853 | static_call(kvm_x86_hardware_unsetup)(); |
9854 | kvm_mmu_vendor_module_exit(); |
	free_percpu(user_return_msrs);
	kmem_cache_destroy(x86_emulator_cache);
9857 | #ifdef CONFIG_KVM_XEN |
9858 | static_key_deferred_flush(&kvm_xen_enabled); |
9859 | WARN_ON(static_branch_unlikely(&kvm_xen_enabled.key)); |
9860 | #endif |
9861 | mutex_lock(&vendor_module_lock); |
9862 | kvm_x86_ops.hardware_enable = NULL; |
	mutex_unlock(&vendor_module_lock);
9864 | } |
9865 | EXPORT_SYMBOL_GPL(kvm_x86_vendor_exit); |
9866 | |
9867 | static int __kvm_emulate_halt(struct kvm_vcpu *vcpu, int state, int reason) |
9868 | { |
9869 | /* |
9870 | * The vCPU has halted, e.g. executed HLT. Update the run state if the |
9871 | * local APIC is in-kernel, the run loop will detect the non-runnable |
9872 | * state and halt the vCPU. Exit to userspace if the local APIC is |
9873 | * managed by userspace, in which case userspace is responsible for |
9874 | * handling wake events. |
9875 | */ |
9876 | ++vcpu->stat.halt_exits; |
9877 | if (lapic_in_kernel(vcpu)) { |
9878 | vcpu->arch.mp_state = state; |
9879 | return 1; |
9880 | } else { |
9881 | vcpu->run->exit_reason = reason; |
9882 | return 0; |
9883 | } |
9884 | } |
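
/*
 * Note on the return convention shared by the halt helpers below: returning
 * 1 tells the run loop to keep executing in-kernel, while 0 forces an exit
 * to userspace. Callers AND this with the result of
 * kvm_skip_emulated_instruction() so that a userspace exit requested by
 * either path (e.g. a single-step #DB) is preserved.
 */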
9885 | |
9886 | int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu) |
9887 | { |
9888 | return __kvm_emulate_halt(vcpu, KVM_MP_STATE_HALTED, KVM_EXIT_HLT); |
9889 | } |
9890 | EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip); |
9891 | |
9892 | int kvm_emulate_halt(struct kvm_vcpu *vcpu) |
9893 | { |
9894 | int ret = kvm_skip_emulated_instruction(vcpu); |
9895 | /* |
9896 | * TODO: we might be squashing a GUESTDBG_SINGLESTEP-triggered |
9897 | * KVM_EXIT_DEBUG here. |
9898 | */ |
9899 | return kvm_emulate_halt_noskip(vcpu) && ret; |
9900 | } |
9901 | EXPORT_SYMBOL_GPL(kvm_emulate_halt); |
9902 | |
9903 | int kvm_emulate_ap_reset_hold(struct kvm_vcpu *vcpu) |
9904 | { |
9905 | int ret = kvm_skip_emulated_instruction(vcpu); |
9906 | |
9907 | return __kvm_emulate_halt(vcpu, KVM_MP_STATE_AP_RESET_HOLD, |
9908 | KVM_EXIT_AP_RESET_HOLD) && ret; |
9909 | } |
9910 | EXPORT_SYMBOL_GPL(kvm_emulate_ap_reset_hold); |
9911 | |
9912 | #ifdef CONFIG_X86_64 |
9913 | static int kvm_pv_clock_pairing(struct kvm_vcpu *vcpu, gpa_t paddr, |
9914 | unsigned long clock_type) |
9915 | { |
9916 | struct kvm_clock_pairing clock_pairing; |
9917 | struct timespec64 ts; |
9918 | u64 cycle; |
9919 | int ret; |
9920 | |
9921 | if (clock_type != KVM_CLOCK_PAIRING_WALLCLOCK) |
9922 | return -KVM_EOPNOTSUPP; |
9923 | |
9924 | /* |
9925 | * When tsc is in permanent catchup mode guests won't be able to use |
9926 | * pvclock_read_retry loop to get consistent view of pvclock |
9927 | */ |
9928 | if (vcpu->arch.tsc_always_catchup) |
9929 | return -KVM_EOPNOTSUPP; |
9930 | |
	if (!kvm_get_walltime_and_clockread(&ts, &cycle))
9932 | return -KVM_EOPNOTSUPP; |
9933 | |
9934 | clock_pairing.sec = ts.tv_sec; |
9935 | clock_pairing.nsec = ts.tv_nsec; |
9936 | clock_pairing.tsc = kvm_read_l1_tsc(vcpu, cycle); |
9937 | clock_pairing.flags = 0; |
9938 | memset(&clock_pairing.pad, 0, sizeof(clock_pairing.pad)); |
9939 | |
9940 | ret = 0; |
	if (kvm_write_guest(vcpu->kvm, paddr, &clock_pairing,
			    sizeof(struct kvm_clock_pairing)))
9943 | ret = -KVM_EFAULT; |
9944 | |
9945 | return ret; |
9946 | } |
9947 | #endif |
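
/*
 * Illustrative guest-side sketch (not part of this file): a Linux guest
 * reaches the handler above via the kvm_para.h helpers, e.g.:
 *
 *	struct kvm_clock_pairing pairing;
 *	long ret = kvm_hypercall2(KVM_HC_CLOCK_PAIRING,
 *				  slow_virt_to_phys(&pairing),
 *				  KVM_CLOCK_PAIRING_WALLCLOCK);
 *
 * where the first argument is the guest-physical address the host writes
 * the result to. ptp_kvm is the in-tree user of this hypercall.
 */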
9948 | |
9949 | /* |
9950 | * kvm_pv_kick_cpu_op: Kick a vcpu. |
9951 | * |
9952 | * @apicid - apicid of vcpu to be kicked. |
9953 | */ |
9954 | static void kvm_pv_kick_cpu_op(struct kvm *kvm, int apicid) |
9955 | { |
9956 | /* |
9957 | * All other fields are unused for APIC_DM_REMRD, but may be consumed by |
9958 | * common code, e.g. for tracing. Defer initialization to the compiler. |
9959 | */ |
9960 | struct kvm_lapic_irq lapic_irq = { |
9961 | .delivery_mode = APIC_DM_REMRD, |
9962 | .dest_mode = APIC_DEST_PHYSICAL, |
9963 | .shorthand = APIC_DEST_NOSHORT, |
9964 | .dest_id = apicid, |
9965 | }; |
9966 | |
	kvm_irq_delivery_to_apic(kvm, NULL, &lapic_irq, NULL);
9968 | } |
9969 | |
9970 | bool kvm_apicv_activated(struct kvm *kvm) |
9971 | { |
9972 | return (READ_ONCE(kvm->arch.apicv_inhibit_reasons) == 0); |
9973 | } |
9974 | EXPORT_SYMBOL_GPL(kvm_apicv_activated); |
9975 | |
9976 | bool kvm_vcpu_apicv_activated(struct kvm_vcpu *vcpu) |
9977 | { |
9978 | ulong vm_reasons = READ_ONCE(vcpu->kvm->arch.apicv_inhibit_reasons); |
9979 | ulong vcpu_reasons = static_call(kvm_x86_vcpu_get_apicv_inhibit_reasons)(vcpu); |
9980 | |
9981 | return (vm_reasons | vcpu_reasons) == 0; |
9982 | } |
9983 | EXPORT_SYMBOL_GPL(kvm_vcpu_apicv_activated); |
9984 | |
9985 | static void set_or_clear_apicv_inhibit(unsigned long *inhibits, |
9986 | enum kvm_apicv_inhibit reason, bool set) |
9987 | { |
9988 | if (set) |
9989 | __set_bit(reason, inhibits); |
9990 | else |
9991 | __clear_bit(reason, inhibits); |
9992 | |
	trace_kvm_apicv_inhibit_changed(reason, set, *inhibits);
9994 | } |
9995 | |
9996 | static void kvm_apicv_init(struct kvm *kvm) |
9997 | { |
9998 | unsigned long *inhibits = &kvm->arch.apicv_inhibit_reasons; |
9999 | |
10000 | init_rwsem(&kvm->arch.apicv_update_lock); |
10001 | |
	set_or_clear_apicv_inhibit(inhibits, APICV_INHIBIT_REASON_ABSENT, true);

	if (!enable_apicv)
		set_or_clear_apicv_inhibit(inhibits,
					   APICV_INHIBIT_REASON_DISABLE, true);
10007 | } |
10008 | |
10009 | static void kvm_sched_yield(struct kvm_vcpu *vcpu, unsigned long dest_id) |
10010 | { |
10011 | struct kvm_vcpu *target = NULL; |
10012 | struct kvm_apic_map *map; |
10013 | |
10014 | vcpu->stat.directed_yield_attempted++; |
10015 | |
10016 | if (single_task_running()) |
10017 | goto no_yield; |
10018 | |
10019 | rcu_read_lock(); |
10020 | map = rcu_dereference(vcpu->kvm->arch.apic_map); |
10021 | |
10022 | if (likely(map) && dest_id <= map->max_apic_id && map->phys_map[dest_id]) |
10023 | target = map->phys_map[dest_id]->vcpu; |
10024 | |
10025 | rcu_read_unlock(); |
10026 | |
10027 | if (!target || !READ_ONCE(target->ready)) |
10028 | goto no_yield; |
10029 | |
10030 | /* Ignore requests to yield to self */ |
10031 | if (vcpu == target) |
10032 | goto no_yield; |
10033 | |
10034 | if (kvm_vcpu_yield_to(target) <= 0) |
10035 | goto no_yield; |
10036 | |
10037 | vcpu->stat.directed_yield_successful++; |
10038 | |
10039 | no_yield: |
10040 | return; |
10041 | } |
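
/*
 * Illustrative guest-side sketch (not part of this file): the directed
 * yield above is reached when a guest with KVM_FEATURE_PV_SCHED_YIELD
 * issues, e.g. from its IPI slow path:
 *
 *	kvm_hypercall1(KVM_HC_SCHED_YIELD,
 *		       per_cpu(x86_cpu_to_apicid, cpu));
 */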
10042 | |
10043 | static int complete_hypercall_exit(struct kvm_vcpu *vcpu) |
10044 | { |
10045 | u64 ret = vcpu->run->hypercall.ret; |
10046 | |
10047 | if (!is_64_bit_mode(vcpu)) |
10048 | ret = (u32)ret; |
	kvm_rax_write(vcpu, ret);
10050 | ++vcpu->stat.hypercalls; |
10051 | return kvm_skip_emulated_instruction(vcpu); |
10052 | } |
10053 | |
10054 | int kvm_emulate_hypercall(struct kvm_vcpu *vcpu) |
10055 | { |
10056 | unsigned long nr, a0, a1, a2, a3, ret; |
10057 | int op_64_bit; |
10058 | |
	if (kvm_xen_hypercall_enabled(vcpu->kvm))
10060 | return kvm_xen_hypercall(vcpu); |
10061 | |
10062 | if (kvm_hv_hypercall_enabled(vcpu)) |
10063 | return kvm_hv_hypercall(vcpu); |
10064 | |
10065 | nr = kvm_rax_read(vcpu); |
10066 | a0 = kvm_rbx_read(vcpu); |
10067 | a1 = kvm_rcx_read(vcpu); |
10068 | a2 = kvm_rdx_read(vcpu); |
10069 | a3 = kvm_rsi_read(vcpu); |
10070 | |
10071 | trace_kvm_hypercall(nr, a0, a1, a2, a3); |
10072 | |
10073 | op_64_bit = is_64_bit_hypercall(vcpu); |
10074 | if (!op_64_bit) { |
10075 | nr &= 0xFFFFFFFF; |
10076 | a0 &= 0xFFFFFFFF; |
10077 | a1 &= 0xFFFFFFFF; |
10078 | a2 &= 0xFFFFFFFF; |
10079 | a3 &= 0xFFFFFFFF; |
10080 | } |
10081 | |
10082 | if (static_call(kvm_x86_get_cpl)(vcpu) != 0) { |
10083 | ret = -KVM_EPERM; |
10084 | goto out; |
10085 | } |
10086 | |
10087 | ret = -KVM_ENOSYS; |
10088 | |
10089 | switch (nr) { |
10090 | case KVM_HC_VAPIC_POLL_IRQ: |
10091 | ret = 0; |
10092 | break; |
10093 | case KVM_HC_KICK_CPU: |
10094 | if (!guest_pv_has(vcpu, KVM_FEATURE_PV_UNHALT)) |
10095 | break; |
10096 | |
		kvm_pv_kick_cpu_op(vcpu->kvm, a1);
		kvm_sched_yield(vcpu, a1);
10099 | ret = 0; |
10100 | break; |
10101 | #ifdef CONFIG_X86_64 |
10102 | case KVM_HC_CLOCK_PAIRING: |
		ret = kvm_pv_clock_pairing(vcpu, a0, a1);
10104 | break; |
10105 | #endif |
10106 | case KVM_HC_SEND_IPI: |
10107 | if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SEND_IPI)) |
10108 | break; |
10109 | |
		ret = kvm_pv_send_ipi(vcpu->kvm, a0, a1, a2, a3, op_64_bit);
10111 | break; |
10112 | case KVM_HC_SCHED_YIELD: |
10113 | if (!guest_pv_has(vcpu, KVM_FEATURE_PV_SCHED_YIELD)) |
10114 | break; |
10115 | |
		kvm_sched_yield(vcpu, a0);
10117 | ret = 0; |
10118 | break; |
10119 | case KVM_HC_MAP_GPA_RANGE: { |
10120 | u64 gpa = a0, npages = a1, attrs = a2; |
10121 | |
10122 | ret = -KVM_ENOSYS; |
10123 | if (!(vcpu->kvm->arch.hypercall_exit_enabled & (1 << KVM_HC_MAP_GPA_RANGE))) |
10124 | break; |
10125 | |
10126 | if (!PAGE_ALIGNED(gpa) || !npages || |
10127 | gpa_to_gfn(gpa) + npages <= gpa_to_gfn(gpa)) { |
10128 | ret = -KVM_EINVAL; |
10129 | break; |
10130 | } |
10131 | |
10132 | vcpu->run->exit_reason = KVM_EXIT_HYPERCALL; |
10133 | vcpu->run->hypercall.nr = KVM_HC_MAP_GPA_RANGE; |
10134 | vcpu->run->hypercall.args[0] = gpa; |
10135 | vcpu->run->hypercall.args[1] = npages; |
10136 | vcpu->run->hypercall.args[2] = attrs; |
10137 | vcpu->run->hypercall.flags = 0; |
10138 | if (op_64_bit) |
10139 | vcpu->run->hypercall.flags |= KVM_EXIT_HYPERCALL_LONG_MODE; |
10140 | |
10141 | WARN_ON_ONCE(vcpu->run->hypercall.flags & KVM_EXIT_HYPERCALL_MBZ); |
10142 | vcpu->arch.complete_userspace_io = complete_hypercall_exit; |
10143 | return 0; |
10144 | } |
10145 | default: |
10146 | ret = -KVM_ENOSYS; |
10147 | break; |
10148 | } |
10149 | out: |
10150 | if (!op_64_bit) |
10151 | ret = (u32)ret; |
	kvm_rax_write(vcpu, ret);
10153 | |
10154 | ++vcpu->stat.hypercalls; |
10155 | return kvm_skip_emulated_instruction(vcpu); |
10156 | } |
10157 | EXPORT_SYMBOL_GPL(kvm_emulate_hypercall); |
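
/*
 * Illustrative note (not from the original source): the register ABI
 * handled above matches the guest-side kvm_hypercall*() helpers in
 * arch/x86/include/asm/kvm_para.h, i.e. nr in RAX, arguments in RBX, RCX,
 * RDX and RSI, return value in RAX. A guest kicking a halted vCPU does:
 *
 *	kvm_hypercall2(KVM_HC_KICK_CPU, flags, apicid);
 *
 * which lands in the KVM_HC_KICK_CPU case above with a1 == apicid.
 */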
10158 | |
10159 | static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt) |
10160 | { |
10161 | struct kvm_vcpu *vcpu = emul_to_vcpu(ctxt); |
10162 | char instruction[3]; |
10163 | unsigned long rip = kvm_rip_read(vcpu); |
10164 | |
10165 | /* |
10166 | * If the quirk is disabled, synthesize a #UD and let the guest pick up |
10167 | * the pieces. |
10168 | */ |
	if (!kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_FIX_HYPERCALL_INSN)) {
10170 | ctxt->exception.error_code_valid = false; |
10171 | ctxt->exception.vector = UD_VECTOR; |
10172 | ctxt->have_exception = true; |
10173 | return X86EMUL_PROPAGATE_FAULT; |
10174 | } |
10175 | |
10176 | static_call(kvm_x86_patch_hypercall)(vcpu, instruction); |
10177 | |
	return emulator_write_emulated(ctxt, rip, instruction, 3,
				       &ctxt->exception);
10180 | } |
10181 | |
10182 | static int dm_request_for_irq_injection(struct kvm_vcpu *vcpu) |
10183 | { |
10184 | return vcpu->run->request_interrupt_window && |
10185 | likely(!pic_in_kernel(vcpu->kvm)); |
10186 | } |
10187 | |
10188 | /* Called within kvm->srcu read side. */ |
10189 | static void post_kvm_run_save(struct kvm_vcpu *vcpu) |
10190 | { |
10191 | struct kvm_run *kvm_run = vcpu->run; |
10192 | |
10193 | kvm_run->if_flag = static_call(kvm_x86_get_if_flag)(vcpu); |
10194 | kvm_run->cr8 = kvm_get_cr8(vcpu); |
10195 | kvm_run->apic_base = kvm_get_apic_base(vcpu); |
10196 | |
10197 | kvm_run->ready_for_interrupt_injection = |
		pic_in_kernel(vcpu->kvm) ||
10199 | kvm_vcpu_ready_for_interrupt_injection(vcpu); |
10200 | |
10201 | if (is_smm(vcpu)) |
10202 | kvm_run->flags |= KVM_RUN_X86_SMM; |
10203 | } |
10204 | |
10205 | static void update_cr8_intercept(struct kvm_vcpu *vcpu) |
10206 | { |
10207 | int max_irr, tpr; |
10208 | |
10209 | if (!kvm_x86_ops.update_cr8_intercept) |
10210 | return; |
10211 | |
10212 | if (!lapic_in_kernel(vcpu)) |
10213 | return; |
10214 | |
10215 | if (vcpu->arch.apic->apicv_active) |
10216 | return; |
10217 | |
10218 | if (!vcpu->arch.apic->vapic_addr) |
10219 | max_irr = kvm_lapic_find_highest_irr(vcpu); |
10220 | else |
10221 | max_irr = -1; |
10222 | |
10223 | if (max_irr != -1) |
10224 | max_irr >>= 4; |
10225 | |
10226 | tpr = kvm_lapic_get_cr8(vcpu); |
10227 | |
10228 | static_call(kvm_x86_update_cr8_intercept)(vcpu, tpr, max_irr); |
10229 | } |
10230 | |
10231 | |
10232 | int kvm_check_nested_events(struct kvm_vcpu *vcpu) |
10233 | { |
10234 | if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { |
10235 | kvm_x86_ops.nested_ops->triple_fault(vcpu); |
10236 | return 1; |
10237 | } |
10238 | |
10239 | return kvm_x86_ops.nested_ops->check_events(vcpu); |
10240 | } |
10241 | |
10242 | static void kvm_inject_exception(struct kvm_vcpu *vcpu) |
10243 | { |
10244 | /* |
10245 | * Suppress the error code if the vCPU is in Real Mode, as Real Mode |
10246 | * exceptions don't report error codes. The presence of an error code |
10247 | * is carried with the exception and only stripped when the exception |
10248 | * is injected as intercepted #PF VM-Exits for AMD's Paged Real Mode do |
10249 | * report an error code despite the CPU being in Real Mode. |
10250 | */ |
10251 | vcpu->arch.exception.has_error_code &= is_protmode(vcpu); |
10252 | |
	trace_kvm_inj_exception(vcpu->arch.exception.vector,
				vcpu->arch.exception.has_error_code,
				vcpu->arch.exception.error_code,
				vcpu->arch.exception.injected);
10257 | |
10258 | static_call(kvm_x86_inject_exception)(vcpu); |
10259 | } |
10260 | |
10261 | /* |
10262 | * Check for any event (interrupt or exception) that is ready to be injected, |
10263 | * and if there is at least one event, inject the event with the highest |
10264 | * priority. This handles both "pending" events, i.e. events that have never |
10265 | * been injected into the guest, and "injected" events, i.e. events that were |
10266 | * injected as part of a previous VM-Enter, but weren't successfully delivered |
10267 | * and need to be re-injected. |
10268 | * |
10269 | * Note, this is not guaranteed to be invoked on a guest instruction boundary, |
10270 | * i.e. doesn't guarantee that there's an event window in the guest. KVM must |
10271 | * be able to inject exceptions in the "middle" of an instruction, and so must |
10272 | * also be able to re-inject NMIs and IRQs in the middle of an instruction. |
10273 | * I.e. for exceptions and re-injected events, NOT invoking this on instruction |
10274 | * boundaries is necessary and correct. |
10275 | * |
10276 | * For simplicity, KVM uses a single path to inject all events (except events |
10277 | * that are injected directly from L1 to L2) and doesn't explicitly track |
10278 | * instruction boundaries for asynchronous events. However, because VM-Exits |
10279 | * that can occur during instruction execution typically result in KVM skipping |
10280 | * the instruction or injecting an exception, e.g. instruction and exception |
10281 | * intercepts, and because pending exceptions have higher priority than pending |
10282 | * interrupts, KVM still honors instruction boundaries in most scenarios. |
10283 | * |
10284 | * But, if a VM-Exit occurs during instruction execution, and KVM does NOT skip |
 * the instruction or inject an exception, then KVM can incorrectly inject a new
10286 | * asynchronous event if the event became pending after the CPU fetched the |
10287 | * instruction (in the guest). E.g. if a page fault (#PF, #NPF, EPT violation) |
10288 | * occurs and is resolved by KVM, a coincident NMI, SMI, IRQ, etc... can be |
10289 | * injected on the restarted instruction instead of being deferred until the |
10290 | * instruction completes. |
10291 | * |
10292 | * In practice, this virtualization hole is unlikely to be observed by the |
10293 | * guest, and even less likely to cause functional problems. To detect the |
10294 | * hole, the guest would have to trigger an event on a side effect of an early |
10295 | * phase of instruction execution, e.g. on the instruction fetch from memory. |
10296 | * And for it to be a functional problem, the guest would need to depend on the |
10297 | * ordering between that side effect, the instruction completing, _and_ the |
10298 | * delivery of the asynchronous event. |
10299 | */ |
10300 | static int kvm_check_and_inject_events(struct kvm_vcpu *vcpu, |
10301 | bool *req_immediate_exit) |
10302 | { |
10303 | bool can_inject; |
10304 | int r; |
10305 | |
10306 | /* |
10307 | * Process nested events first, as nested VM-Exit supersedes event |
10308 | * re-injection. If there's an event queued for re-injection, it will |
10309 | * be saved into the appropriate vmc{b,s}12 fields on nested VM-Exit. |
10310 | */ |
10311 | if (is_guest_mode(vcpu)) |
10312 | r = kvm_check_nested_events(vcpu); |
10313 | else |
10314 | r = 0; |
10315 | |
10316 | /* |
10317 | * Re-inject exceptions and events *especially* if immediate entry+exit |
10318 | * to/from L2 is needed, as any event that has already been injected |
10319 | * into L2 needs to complete its lifecycle before injecting a new event. |
10320 | * |
10321 | * Don't re-inject an NMI or interrupt if there is a pending exception. |
10322 | * This collision arises if an exception occurred while vectoring the |
10323 | * injected event, KVM intercepted said exception, and KVM ultimately |
10324 | * determined the fault belongs to the guest and queues the exception |
10325 | * for injection back into the guest. |
10326 | * |
10327 | * "Injected" interrupts can also collide with pending exceptions if |
10328 | * userspace ignores the "ready for injection" flag and blindly queues |
10329 | * an interrupt. In that case, prioritizing the exception is correct, |
10330 | * as the exception "occurred" before the exit to userspace. Trap-like |
10331 | * exceptions, e.g. most #DBs, have higher priority than interrupts. |
10332 | * And while fault-like exceptions, e.g. #GP and #PF, are the lowest |
10333 | * priority, they're only generated (pended) during instruction |
10334 | * execution, and interrupts are recognized at instruction boundaries. |
10335 | * Thus a pending fault-like exception means the fault occurred on the |
10336 | * *previous* instruction and must be serviced prior to recognizing any |
10337 | * new events in order to fully complete the previous instruction. |
10338 | */ |
10339 | if (vcpu->arch.exception.injected) |
10340 | kvm_inject_exception(vcpu); |
10341 | else if (kvm_is_exception_pending(vcpu)) |
10342 | ; /* see above */ |
10343 | else if (vcpu->arch.nmi_injected) |
10344 | static_call(kvm_x86_inject_nmi)(vcpu); |
10345 | else if (vcpu->arch.interrupt.injected) |
10346 | static_call(kvm_x86_inject_irq)(vcpu, true); |
10347 | |
10348 | /* |
10349 | * Exceptions that morph to VM-Exits are handled above, and pending |
10350 | * exceptions on top of injected exceptions that do not VM-Exit should |
10351 | * either morph to #DF or, sadly, override the injected exception. |
10352 | */ |
10353 | WARN_ON_ONCE(vcpu->arch.exception.injected && |
10354 | vcpu->arch.exception.pending); |
10355 | |
10356 | /* |
10357 | * Bail if immediate entry+exit to/from the guest is needed to complete |
10358 | * nested VM-Enter or event re-injection so that a different pending |
10359 | * event can be serviced (or if KVM needs to exit to userspace). |
10360 | * |
10361 | * Otherwise, continue processing events even if VM-Exit occurred. The |
10362 | * VM-Exit will have cleared exceptions that were meant for L2, but |
10363 | * there may now be events that can be injected into L1. |
10364 | */ |
10365 | if (r < 0) |
10366 | goto out; |
10367 | |
10368 | /* |
10369 | * A pending exception VM-Exit should either result in nested VM-Exit |
10370 | * or force an immediate re-entry and exit to/from L2, and exception |
10371 | * VM-Exits cannot be injected (flag should _never_ be set). |
10372 | */ |
10373 | WARN_ON_ONCE(vcpu->arch.exception_vmexit.injected || |
10374 | vcpu->arch.exception_vmexit.pending); |
10375 | |
10376 | /* |
10377 | * New events, other than exceptions, cannot be injected if KVM needs |
10378 | * to re-inject a previous event. See above comments on re-injecting |
10379 | * for why pending exceptions get priority. |
10380 | */ |
10381 | can_inject = !kvm_event_needs_reinjection(vcpu); |
10382 | |
10383 | if (vcpu->arch.exception.pending) { |
10384 | /* |
10385 | * Fault-class exceptions, except #DBs, set RF=1 in the RFLAGS |
10386 | * value pushed on the stack. Trap-like exception and all #DBs |
10387 | * leave RF as-is (KVM follows Intel's behavior in this regard; |
		 * AMD states that code breakpoint #DBs explicitly clear RF to 0).
10389 | * |
10390 | * Note, most versions of Intel's SDM and AMD's APM incorrectly |
10391 | * describe the behavior of General Detect #DBs, which are |
10392 | * fault-like. They do _not_ set RF, a la code breakpoints. |
10393 | */ |
		if (exception_type(vcpu->arch.exception.vector) == EXCPT_FAULT)
			__kvm_set_rflags(vcpu, kvm_get_rflags(vcpu) |
					 X86_EFLAGS_RF);
10397 | |
10398 | if (vcpu->arch.exception.vector == DB_VECTOR) { |
10399 | kvm_deliver_exception_payload(vcpu, &vcpu->arch.exception); |
10400 | if (vcpu->arch.dr7 & DR7_GD) { |
10401 | vcpu->arch.dr7 &= ~DR7_GD; |
10402 | kvm_update_dr7(vcpu); |
10403 | } |
10404 | } |
10405 | |
10406 | kvm_inject_exception(vcpu); |
10407 | |
10408 | vcpu->arch.exception.pending = false; |
10409 | vcpu->arch.exception.injected = true; |
10410 | |
10411 | can_inject = false; |
10412 | } |
10413 | |
10414 | /* Don't inject interrupts if the user asked to avoid doing so */ |
10415 | if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) |
10416 | return 0; |
10417 | |
10418 | /* |
10419 | * Finally, inject interrupt events. If an event cannot be injected |
10420 | * due to architectural conditions (e.g. IF=0) a window-open exit |
10421 | * will re-request KVM_REQ_EVENT. Sometimes however an event is pending |
10422 | * and can architecturally be injected, but we cannot do it right now: |
10423 | * an interrupt could have arrived just now and we have to inject it |
	 * as a vmexit, or there could already be an event in the queue, which is
10425 | * indicated by can_inject. In that case we request an immediate exit |
10426 | * in order to make progress and get back here for another iteration. |
10427 | * The kvm_x86_ops hooks communicate this by returning -EBUSY. |
10428 | */ |
10429 | #ifdef CONFIG_KVM_SMM |
10430 | if (vcpu->arch.smi_pending) { |
10431 | r = can_inject ? static_call(kvm_x86_smi_allowed)(vcpu, true) : -EBUSY; |
10432 | if (r < 0) |
10433 | goto out; |
10434 | if (r) { |
10435 | vcpu->arch.smi_pending = false; |
10436 | ++vcpu->arch.smi_count; |
10437 | enter_smm(vcpu); |
10438 | can_inject = false; |
10439 | } else |
10440 | static_call(kvm_x86_enable_smi_window)(vcpu); |
10441 | } |
10442 | #endif |
10443 | |
10444 | if (vcpu->arch.nmi_pending) { |
10445 | r = can_inject ? static_call(kvm_x86_nmi_allowed)(vcpu, true) : -EBUSY; |
10446 | if (r < 0) |
10447 | goto out; |
10448 | if (r) { |
10449 | --vcpu->arch.nmi_pending; |
10450 | vcpu->arch.nmi_injected = true; |
10451 | static_call(kvm_x86_inject_nmi)(vcpu); |
10452 | can_inject = false; |
10453 | WARN_ON(static_call(kvm_x86_nmi_allowed)(vcpu, true) < 0); |
10454 | } |
10455 | if (vcpu->arch.nmi_pending) |
10456 | static_call(kvm_x86_enable_nmi_window)(vcpu); |
10457 | } |
10458 | |
	if (kvm_cpu_has_injectable_intr(vcpu)) {
10460 | r = can_inject ? static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY; |
10461 | if (r < 0) |
10462 | goto out; |
10463 | if (r) { |
			int irq = kvm_cpu_get_interrupt(vcpu);
10465 | |
10466 | if (!WARN_ON_ONCE(irq == -1)) { |
				kvm_queue_interrupt(vcpu, irq, false);
10468 | static_call(kvm_x86_inject_irq)(vcpu, false); |
10469 | WARN_ON(static_call(kvm_x86_interrupt_allowed)(vcpu, true) < 0); |
10470 | } |
10471 | } |
	if (kvm_cpu_has_injectable_intr(vcpu))
10473 | static_call(kvm_x86_enable_irq_window)(vcpu); |
10474 | } |
10475 | |
10476 | if (is_guest_mode(vcpu) && |
10477 | kvm_x86_ops.nested_ops->has_events && |
10478 | kvm_x86_ops.nested_ops->has_events(vcpu)) |
10479 | *req_immediate_exit = true; |
10480 | |
10481 | /* |
10482 | * KVM must never queue a new exception while injecting an event; KVM |
10483 | * is done emulating and should only propagate the to-be-injected event |
10484 | * to the VMCS/VMCB. Queueing a new exception can put the vCPU into an |
10485 | * infinite loop as KVM will bail from VM-Enter to inject the pending |
10486 | * exception and start the cycle all over. |
10487 | * |
10488 | * Exempt triple faults as they have special handling and won't put the |
10489 | * vCPU into an infinite loop. Triple fault can be queued when running |
10490 | * VMX without unrestricted guest, as that requires KVM to emulate Real |
10491 | * Mode events (see kvm_inject_realmode_interrupt()). |
10492 | */ |
10493 | WARN_ON_ONCE(vcpu->arch.exception.pending || |
10494 | vcpu->arch.exception_vmexit.pending); |
10495 | return 0; |
10496 | |
10497 | out: |
10498 | if (r == -EBUSY) { |
10499 | *req_immediate_exit = true; |
10500 | r = 0; |
10501 | } |
10502 | return r; |
10503 | } |
10504 | |
10505 | static void process_nmi(struct kvm_vcpu *vcpu) |
10506 | { |
10507 | unsigned int limit; |
10508 | |
10509 | /* |
10510 | * x86 is limited to one NMI pending, but because KVM can't react to |
10511 | * incoming NMIs as quickly as bare metal, e.g. if the vCPU is |
10512 | * scheduled out, KVM needs to play nice with two queued NMIs showing |
10513 | * up at the same time. To handle this scenario, allow two NMIs to be |
10514 | * (temporarily) pending so long as NMIs are not blocked and KVM is not |
10515 | * waiting for a previous NMI injection to complete (which effectively |
10516 | * blocks NMIs). KVM will immediately inject one of the two NMIs, and |
10517 | * will request an NMI window to handle the second NMI. |
10518 | */ |
10519 | if (static_call(kvm_x86_get_nmi_mask)(vcpu) || vcpu->arch.nmi_injected) |
10520 | limit = 1; |
10521 | else |
10522 | limit = 2; |
10523 | |
10524 | /* |
10525 | * Adjust the limit to account for pending virtual NMIs, which aren't |
10526 | * tracked in vcpu->arch.nmi_pending. |
10527 | */ |
10528 | if (static_call(kvm_x86_is_vnmi_pending)(vcpu)) |
10529 | limit--; |
10530 | |
	vcpu->arch.nmi_pending += atomic_xchg(&vcpu->arch.nmi_queued, 0);
10532 | vcpu->arch.nmi_pending = min(vcpu->arch.nmi_pending, limit); |
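
	/*
	 * Worked example (illustrative): with NMIs unblocked and no NMI
	 * injection in flight, limit is 2, so two NMIs that raced in via
	 * nmi_queued are both kept; one is injected immediately and an NMI
	 * window is requested for the other. If hardware is already
	 * tracking a pending virtual NMI, the limit drops to 1.
	 */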
10533 | |
10534 | if (vcpu->arch.nmi_pending && |
10535 | (static_call(kvm_x86_set_vnmi_pending)(vcpu))) |
10536 | vcpu->arch.nmi_pending--; |
10537 | |
10538 | if (vcpu->arch.nmi_pending) |
10539 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
10540 | } |
10541 | |
10542 | /* Return total number of NMIs pending injection to the VM */ |
10543 | int kvm_get_nr_pending_nmis(struct kvm_vcpu *vcpu) |
10544 | { |
10545 | return vcpu->arch.nmi_pending + |
10546 | static_call(kvm_x86_is_vnmi_pending)(vcpu); |
10547 | } |
10548 | |
10549 | void kvm_make_scan_ioapic_request_mask(struct kvm *kvm, |
10550 | unsigned long *vcpu_bitmap) |
10551 | { |
10552 | kvm_make_vcpus_request_mask(kvm, KVM_REQ_SCAN_IOAPIC, vcpu_bitmap); |
10553 | } |
10554 | |
10555 | void kvm_make_scan_ioapic_request(struct kvm *kvm) |
10556 | { |
10557 | kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC); |
10558 | } |
10559 | |
10560 | void __kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) |
10561 | { |
10562 | struct kvm_lapic *apic = vcpu->arch.apic; |
10563 | bool activate; |
10564 | |
10565 | if (!lapic_in_kernel(vcpu)) |
10566 | return; |
10567 | |
	down_read(&vcpu->kvm->arch.apicv_update_lock);
10569 | preempt_disable(); |
10570 | |
10571 | /* Do not activate APICV when APIC is disabled */ |
10572 | activate = kvm_vcpu_apicv_activated(vcpu) && |
10573 | (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED); |
10574 | |
10575 | if (apic->apicv_active == activate) |
10576 | goto out; |
10577 | |
10578 | apic->apicv_active = activate; |
10579 | kvm_apic_update_apicv(vcpu); |
10580 | static_call(kvm_x86_refresh_apicv_exec_ctrl)(vcpu); |
10581 | |
10582 | /* |
10583 | * When APICv gets disabled, we may still have injected interrupts |
10584 | * pending. At the same time, KVM_REQ_EVENT may not be set as APICv was |
10585 | * still active when the interrupt got accepted. Make sure |
10586 | * kvm_check_and_inject_events() is called to check for that. |
10587 | */ |
10588 | if (!apic->apicv_active) |
10589 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
10590 | |
10591 | out: |
10592 | preempt_enable(); |
	up_read(&vcpu->kvm->arch.apicv_update_lock);
10594 | } |
10595 | EXPORT_SYMBOL_GPL(__kvm_vcpu_update_apicv); |
10596 | |
10597 | static void kvm_vcpu_update_apicv(struct kvm_vcpu *vcpu) |
10598 | { |
10599 | if (!lapic_in_kernel(vcpu)) |
10600 | return; |
10601 | |
10602 | /* |
	 * Due to sharing page tables across vCPUs, the xAPIC memslot must be
	 * deleted if any vCPU has xAPIC virtualization and x2APIC enabled, but
	 * hardware doesn't support x2APIC virtualization. E.g. some AMD CPUs
	 * support AVIC but not x2APIC. KVM still allows enabling AVIC in this
	 * case so that KVM can use the AVIC doorbell to inject interrupts into
	 * running vCPUs, but KVM must not create SPTEs for the APIC base as
	 * the vCPU would incorrectly be able to access the vAPIC page via MMIO
	 * despite being in x2APIC mode. For simplicity, inhibiting the APIC
	 * access page is sticky.
	 */
	if (apic_x2apic_mode(vcpu->arch.apic) &&
	    kvm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization)
		kvm_inhibit_apic_access_page(vcpu);
10616 | |
10617 | __kvm_vcpu_update_apicv(vcpu); |
10618 | } |
10619 | |
10620 | void __kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, |
10621 | enum kvm_apicv_inhibit reason, bool set) |
10622 | { |
10623 | unsigned long old, new; |
10624 | |
10625 | lockdep_assert_held_write(&kvm->arch.apicv_update_lock); |
10626 | |
10627 | if (!(kvm_x86_ops.required_apicv_inhibits & BIT(reason))) |
10628 | return; |
10629 | |
10630 | old = new = kvm->arch.apicv_inhibit_reasons; |
10631 | |
	set_or_clear_apicv_inhibit(&new, reason, set);
10633 | |
10634 | if (!!old != !!new) { |
10635 | /* |
10636 | * Kick all vCPUs before setting apicv_inhibit_reasons to avoid |
10637 | * false positives in the sanity check WARN in svm_vcpu_run(). |
10638 | * This task will wait for all vCPUs to ack the kick IRQ before |
10639 | * updating apicv_inhibit_reasons, and all other vCPUs will |
10640 | * block on acquiring apicv_update_lock so that vCPUs can't |
10641 | * redo svm_vcpu_run() without seeing the new inhibit state. |
10642 | * |
10643 | * Note, holding apicv_update_lock and taking it in the read |
10644 | * side (handling the request) also prevents other vCPUs from |
10645 | * servicing the request with a stale apicv_inhibit_reasons. |
10646 | */ |
10647 | kvm_make_all_cpus_request(kvm, KVM_REQ_APICV_UPDATE); |
10648 | kvm->arch.apicv_inhibit_reasons = new; |
10649 | if (new) { |
			unsigned long gfn = gpa_to_gfn(APIC_DEFAULT_PHYS_BASE);
			int idx = srcu_read_lock(&kvm->srcu);

			kvm_zap_gfn_range(kvm, gfn, gfn + 1);
			srcu_read_unlock(&kvm->srcu, idx);
10655 | } |
10656 | } else { |
10657 | kvm->arch.apicv_inhibit_reasons = new; |
10658 | } |
10659 | } |
10660 | |
10661 | void kvm_set_or_clear_apicv_inhibit(struct kvm *kvm, |
10662 | enum kvm_apicv_inhibit reason, bool set) |
10663 | { |
10664 | if (!enable_apicv) |
10665 | return; |
10666 | |
	down_write(&kvm->arch.apicv_update_lock);
	__kvm_set_or_clear_apicv_inhibit(kvm, reason, set);
	up_write(&kvm->arch.apicv_update_lock);
10670 | } |
10671 | EXPORT_SYMBOL_GPL(kvm_set_or_clear_apicv_inhibit); |
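
/*
 * Illustrative note (not from the original source): in-kernel device
 * emulation toggles inhibits through the helper above, e.g. the i8254
 * inhibits APICv while PIT re-injection is active:
 *
 *	kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_PIT_REINJ,
 *				       reinject);
 */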
10672 | |
10673 | static void vcpu_scan_ioapic(struct kvm_vcpu *vcpu) |
10674 | { |
10675 | if (!kvm_apic_present(vcpu)) |
10676 | return; |
10677 | |
	bitmap_zero(vcpu->arch.ioapic_handled_vectors, 256);

	if (irqchip_split(vcpu->kvm))
		kvm_scan_ioapic_routes(vcpu, vcpu->arch.ioapic_handled_vectors);
	else {
		static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu);
		if (ioapic_in_kernel(vcpu->kvm))
			kvm_ioapic_scan_entry(vcpu, vcpu->arch.ioapic_handled_vectors);
	}
10687 | |
10688 | if (is_guest_mode(vcpu)) |
10689 | vcpu->arch.load_eoi_exitmap_pending = true; |
10690 | else |
10691 | kvm_make_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu); |
10692 | } |
10693 | |
10694 | static void vcpu_load_eoi_exitmap(struct kvm_vcpu *vcpu) |
10695 | { |
	if (!kvm_apic_hw_enabled(vcpu->arch.apic))
10697 | return; |
10698 | |
10699 | #ifdef CONFIG_KVM_HYPERV |
10700 | if (to_hv_vcpu(vcpu)) { |
10701 | u64 eoi_exit_bitmap[4]; |
10702 | |
		bitmap_or((ulong *)eoi_exit_bitmap,
			  vcpu->arch.ioapic_handled_vectors,
			  to_hv_synic(vcpu)->vec_bitmap, 256);
10706 | static_call_cond(kvm_x86_load_eoi_exitmap)(vcpu, eoi_exit_bitmap); |
10707 | return; |
10708 | } |
10709 | #endif |
10710 | static_call_cond(kvm_x86_load_eoi_exitmap)( |
10711 | vcpu, (u64 *)vcpu->arch.ioapic_handled_vectors); |
10712 | } |
10713 | |
10714 | void kvm_arch_guest_memory_reclaimed(struct kvm *kvm) |
10715 | { |
10716 | static_call_cond(kvm_x86_guest_memory_reclaimed)(kvm); |
10717 | } |
10718 | |
10719 | static void kvm_vcpu_reload_apic_access_page(struct kvm_vcpu *vcpu) |
10720 | { |
10721 | if (!lapic_in_kernel(vcpu)) |
10722 | return; |
10723 | |
10724 | static_call_cond(kvm_x86_set_apic_access_page_addr)(vcpu); |
10725 | } |
10726 | |
10727 | /* |
10728 | * Called within kvm->srcu read side. |
10729 | * Returns 1 to let vcpu_run() continue the guest execution loop without |
10730 | * exiting to the userspace. Otherwise, the value will be returned to the |
10731 | * userspace. |
10732 | */ |
10733 | static int vcpu_enter_guest(struct kvm_vcpu *vcpu) |
10734 | { |
10735 | int r; |
10736 | bool req_int_win = |
10737 | dm_request_for_irq_injection(vcpu) && |
10738 | kvm_cpu_accept_dm_intr(vcpu); |
10739 | fastpath_t exit_fastpath; |
10740 | |
10741 | bool req_immediate_exit = false; |
10742 | |
10743 | if (kvm_request_pending(vcpu)) { |
10744 | if (kvm_check_request(KVM_REQ_VM_DEAD, vcpu)) { |
10745 | r = -EIO; |
10746 | goto out; |
10747 | } |
10748 | |
10749 | if (kvm_dirty_ring_check_request(vcpu)) { |
10750 | r = 0; |
10751 | goto out; |
10752 | } |
10753 | |
10754 | if (kvm_check_request(KVM_REQ_GET_NESTED_STATE_PAGES, vcpu)) { |
10755 | if (unlikely(!kvm_x86_ops.nested_ops->get_nested_state_pages(vcpu))) { |
10756 | r = 0; |
10757 | goto out; |
10758 | } |
10759 | } |
10760 | if (kvm_check_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu)) |
10761 | kvm_mmu_free_obsolete_roots(vcpu); |
10762 | if (kvm_check_request(KVM_REQ_MIGRATE_TIMER, vcpu)) |
10763 | __kvm_migrate_timers(vcpu); |
		if (kvm_check_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu))
			kvm_update_masterclock(vcpu->kvm);
		if (kvm_check_request(KVM_REQ_GLOBAL_CLOCK_UPDATE, vcpu))
			kvm_gen_kvmclock_update(vcpu);
		if (kvm_check_request(KVM_REQ_CLOCK_UPDATE, vcpu)) {
			r = kvm_guest_time_update(vcpu);
10770 | if (unlikely(r)) |
10771 | goto out; |
10772 | } |
10773 | if (kvm_check_request(KVM_REQ_MMU_SYNC, vcpu)) |
10774 | kvm_mmu_sync_roots(vcpu); |
10775 | if (kvm_check_request(KVM_REQ_LOAD_MMU_PGD, vcpu)) |
10776 | kvm_mmu_load_pgd(vcpu); |
10777 | |
10778 | /* |
10779 | * Note, the order matters here, as flushing "all" TLB entries |
10780 | * also flushes the "current" TLB entries, i.e. servicing the |
10781 | * flush "all" will clear any request to flush "current". |
10782 | */ |
10783 | if (kvm_check_request(KVM_REQ_TLB_FLUSH, vcpu)) |
10784 | kvm_vcpu_flush_tlb_all(vcpu); |
10785 | |
10786 | kvm_service_local_tlb_flush_requests(vcpu); |
10787 | |
10788 | /* |
10789 | * Fall back to a "full" guest flush if Hyper-V's precise |
10790 | * flushing fails. Note, Hyper-V's flushing is per-vCPU, but |
10791 | * the flushes are considered "remote" and not "local" because |
10792 | * the requests can be initiated from other vCPUs. |
10793 | */ |
10794 | #ifdef CONFIG_KVM_HYPERV |
10795 | if (kvm_check_request(KVM_REQ_HV_TLB_FLUSH, vcpu) && |
10796 | kvm_hv_vcpu_flush_tlb(vcpu)) |
10797 | kvm_vcpu_flush_tlb_guest(vcpu); |
10798 | #endif |
10799 | |
10800 | if (kvm_check_request(KVM_REQ_REPORT_TPR_ACCESS, vcpu)) { |
10801 | vcpu->run->exit_reason = KVM_EXIT_TPR_ACCESS; |
10802 | r = 0; |
10803 | goto out; |
10804 | } |
10805 | if (kvm_test_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { |
10806 | if (is_guest_mode(vcpu)) |
10807 | kvm_x86_ops.nested_ops->triple_fault(vcpu); |
10808 | |
10809 | if (kvm_check_request(KVM_REQ_TRIPLE_FAULT, vcpu)) { |
10810 | vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN; |
10811 | vcpu->mmio_needed = 0; |
10812 | r = 0; |
10813 | goto out; |
10814 | } |
10815 | } |
10816 | if (kvm_check_request(KVM_REQ_APF_HALT, vcpu)) { |
10817 | /* Page is swapped out. Do synthetic halt */ |
10818 | vcpu->arch.apf.halted = true; |
10819 | r = 1; |
10820 | goto out; |
10821 | } |
10822 | if (kvm_check_request(KVM_REQ_STEAL_UPDATE, vcpu)) |
10823 | record_steal_time(vcpu); |
10824 | if (kvm_check_request(KVM_REQ_PMU, vcpu)) |
10825 | kvm_pmu_handle_event(vcpu); |
10826 | if (kvm_check_request(KVM_REQ_PMI, vcpu)) |
10827 | kvm_pmu_deliver_pmi(vcpu); |
10828 | #ifdef CONFIG_KVM_SMM |
10829 | if (kvm_check_request(KVM_REQ_SMI, vcpu)) |
10830 | process_smi(vcpu); |
10831 | #endif |
10832 | if (kvm_check_request(KVM_REQ_NMI, vcpu)) |
10833 | process_nmi(vcpu); |
10834 | if (kvm_check_request(KVM_REQ_IOAPIC_EOI_EXIT, vcpu)) { |
10835 | BUG_ON(vcpu->arch.pending_ioapic_eoi > 255); |
10836 | if (test_bit(vcpu->arch.pending_ioapic_eoi, |
10837 | vcpu->arch.ioapic_handled_vectors)) { |
10838 | vcpu->run->exit_reason = KVM_EXIT_IOAPIC_EOI; |
10839 | vcpu->run->eoi.vector = |
10840 | vcpu->arch.pending_ioapic_eoi; |
10841 | r = 0; |
10842 | goto out; |
10843 | } |
10844 | } |
10845 | if (kvm_check_request(KVM_REQ_SCAN_IOAPIC, vcpu)) |
10846 | vcpu_scan_ioapic(vcpu); |
10847 | if (kvm_check_request(KVM_REQ_LOAD_EOI_EXITMAP, vcpu)) |
10848 | vcpu_load_eoi_exitmap(vcpu); |
10849 | if (kvm_check_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu)) |
10850 | kvm_vcpu_reload_apic_access_page(vcpu); |
10851 | #ifdef CONFIG_KVM_HYPERV |
10852 | if (kvm_check_request(KVM_REQ_HV_CRASH, vcpu)) { |
10853 | vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; |
10854 | vcpu->run->system_event.type = KVM_SYSTEM_EVENT_CRASH; |
10855 | vcpu->run->system_event.ndata = 0; |
10856 | r = 0; |
10857 | goto out; |
10858 | } |
10859 | if (kvm_check_request(KVM_REQ_HV_RESET, vcpu)) { |
10860 | vcpu->run->exit_reason = KVM_EXIT_SYSTEM_EVENT; |
10861 | vcpu->run->system_event.type = KVM_SYSTEM_EVENT_RESET; |
10862 | vcpu->run->system_event.ndata = 0; |
10863 | r = 0; |
10864 | goto out; |
10865 | } |
10866 | if (kvm_check_request(KVM_REQ_HV_EXIT, vcpu)) { |
10867 | struct kvm_vcpu_hv *hv_vcpu = to_hv_vcpu(vcpu); |
10868 | |
10869 | vcpu->run->exit_reason = KVM_EXIT_HYPERV; |
10870 | vcpu->run->hyperv = hv_vcpu->exit; |
10871 | r = 0; |
10872 | goto out; |
10873 | } |
10874 | |
10875 | /* |
10876 | * KVM_REQ_HV_STIMER has to be processed after |
10877 | * KVM_REQ_CLOCK_UPDATE, because Hyper-V SynIC timers |
10878 | * depend on the guest clock being up-to-date |
10879 | */ |
10880 | if (kvm_check_request(KVM_REQ_HV_STIMER, vcpu)) |
10881 | kvm_hv_process_stimers(vcpu); |
10882 | #endif |
10883 | if (kvm_check_request(KVM_REQ_APICV_UPDATE, vcpu)) |
10884 | kvm_vcpu_update_apicv(vcpu); |
10885 | if (kvm_check_request(KVM_REQ_APF_READY, vcpu)) |
10886 | kvm_check_async_pf_completion(vcpu); |
10887 | if (kvm_check_request(KVM_REQ_MSR_FILTER_CHANGED, vcpu)) |
10888 | static_call(kvm_x86_msr_filter_changed)(vcpu); |
10889 | |
10890 | if (kvm_check_request(KVM_REQ_UPDATE_CPU_DIRTY_LOGGING, vcpu)) |
10891 | static_call(kvm_x86_update_cpu_dirty_logging)(vcpu); |
10892 | } |
10893 | |
10894 | if (kvm_check_request(KVM_REQ_EVENT, vcpu) || req_int_win || |
10895 | kvm_xen_has_interrupt(vcpu)) { |
10896 | ++vcpu->stat.req_event; |
10897 | r = kvm_apic_accept_events(vcpu); |
10898 | if (r < 0) { |
10899 | r = 0; |
10900 | goto out; |
10901 | } |
10902 | if (vcpu->arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) { |
10903 | r = 1; |
10904 | goto out; |
10905 | } |
10906 | |
		r = kvm_check_and_inject_events(vcpu, &req_immediate_exit);
10908 | if (r < 0) { |
10909 | r = 0; |
10910 | goto out; |
10911 | } |
10912 | if (req_int_win) |
10913 | static_call(kvm_x86_enable_irq_window)(vcpu); |
10914 | |
10915 | if (kvm_lapic_enabled(vcpu)) { |
10916 | update_cr8_intercept(vcpu); |
10917 | kvm_lapic_sync_to_vapic(vcpu); |
10918 | } |
10919 | } |
10920 | |
	r = kvm_mmu_reload(vcpu);
	if (unlikely(r))
		goto cancel_injection;
10925 | |
10926 | preempt_disable(); |
10927 | |
10928 | static_call(kvm_x86_prepare_switch_to_guest)(vcpu); |
10929 | |
10930 | /* |
	 * Disable IRQs before setting IN_GUEST_MODE. Posted interrupt
	 * IPIs are then delayed until after guest entry, which ensures that
	 * they result in virtual interrupt delivery.
10934 | */ |
10935 | local_irq_disable(); |
10936 | |
10937 | /* Store vcpu->apicv_active before vcpu->mode. */ |
10938 | smp_store_release(&vcpu->mode, IN_GUEST_MODE); |
10939 | |
10940 | kvm_vcpu_srcu_read_unlock(vcpu); |
10941 | |
10942 | /* |
10943 | * 1) We should set ->mode before checking ->requests. Please see |
10944 | * the comment in kvm_vcpu_exiting_guest_mode(). |
10945 | * |
10946 | * 2) For APICv, we should set ->mode before checking PID.ON. This |
10947 | * pairs with the memory barrier implicit in pi_test_and_set_on |
10948 | * (see vmx_deliver_posted_interrupt). |
10949 | * |
10950 | * 3) This also orders the write to mode from any reads to the page |
10951 | * tables done while the VCPU is running. Please see the comment |
10952 | * in kvm_flush_remote_tlbs. |
10953 | */ |
10954 | smp_mb__after_srcu_read_unlock(); |
10955 | |
10956 | /* |
10957 | * Process pending posted interrupts to handle the case where the |
10958 | * notification IRQ arrived in the host, or was never sent (because the |
10959 | * target vCPU wasn't running). Do this regardless of the vCPU's APICv |
10960 | * status, KVM doesn't update assigned devices when APICv is inhibited, |
10961 | * i.e. they can post interrupts even if APICv is temporarily disabled. |
10962 | */ |
10963 | if (kvm_lapic_enabled(vcpu)) |
10964 | static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); |
10965 | |
10966 | if (kvm_vcpu_exit_request(vcpu)) { |
10967 | vcpu->mode = OUTSIDE_GUEST_MODE; |
10968 | smp_wmb(); |
10969 | local_irq_enable(); |
10970 | preempt_enable(); |
10971 | kvm_vcpu_srcu_read_lock(vcpu); |
10972 | r = 1; |
10973 | goto cancel_injection; |
10974 | } |
10975 | |
10976 | if (req_immediate_exit) |
10977 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
10978 | |
10979 | fpregs_assert_state_consistent(); |
10980 | if (test_thread_flag(TIF_NEED_FPU_LOAD)) |
10981 | switch_fpu_return(); |
10982 | |
10983 | if (vcpu->arch.guest_fpu.xfd_err) |
		wrmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
10985 | |
	if (unlikely(vcpu->arch.switch_db_regs)) {
		set_debugreg(0, 7);
		set_debugreg(vcpu->arch.eff_db[0], 0);
		set_debugreg(vcpu->arch.eff_db[1], 1);
		set_debugreg(vcpu->arch.eff_db[2], 2);
		set_debugreg(vcpu->arch.eff_db[3], 3);
	} else if (unlikely(hw_breakpoint_active())) {
		set_debugreg(0, 7);
	}
10995 | |
10996 | guest_timing_enter_irqoff(); |
10997 | |
10998 | for (;;) { |
10999 | /* |
11000 | * Assert that vCPU vs. VM APICv state is consistent. An APICv |
11001 | * update must kick and wait for all vCPUs before toggling the |
11002 | * per-VM state, and responding vCPUs must wait for the update |
11003 | * to complete before servicing KVM_REQ_APICV_UPDATE. |
11004 | */ |
11005 | WARN_ON_ONCE((kvm_vcpu_apicv_activated(vcpu) != kvm_vcpu_apicv_active(vcpu)) && |
11006 | (kvm_get_apic_mode(vcpu) != LAPIC_MODE_DISABLED)); |
11007 | |
11008 | exit_fastpath = static_call(kvm_x86_vcpu_run)(vcpu, req_immediate_exit); |
11009 | if (likely(exit_fastpath != EXIT_FASTPATH_REENTER_GUEST)) |
11010 | break; |
11011 | |
11012 | if (kvm_lapic_enabled(vcpu)) |
11013 | static_call_cond(kvm_x86_sync_pir_to_irr)(vcpu); |
11014 | |
11015 | if (unlikely(kvm_vcpu_exit_request(vcpu))) { |
11016 | exit_fastpath = EXIT_FASTPATH_EXIT_HANDLED; |
11017 | break; |
11018 | } |
11019 | |
11020 | /* Note, VM-Exits that go down the "slow" path are accounted below. */ |
11021 | ++vcpu->stat.exits; |
11022 | } |
11023 | |
11024 | /* |
11025 | * Do this here before restoring debug registers on the host. And |
11026 | * since we do this before handling the vmexit, a DR access vmexit |
11027 | * can (a) read the correct value of the debug registers, (b) set |
11028 | * KVM_DEBUGREG_WONT_EXIT again. |
11029 | */ |
11030 | if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) { |
11031 | WARN_ON(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP); |
11032 | static_call(kvm_x86_sync_dirty_debug_regs)(vcpu); |
11033 | kvm_update_dr0123(vcpu); |
11034 | kvm_update_dr7(vcpu); |
11035 | } |
11036 | |
11037 | /* |
11038 | * If the guest has used debug registers, at least dr7 |
11039 | * will be disabled while returning to the host. |
11040 | * If we don't have active breakpoints in the host, we don't |
11041 | * care about the messed up debug address registers. But if |
11042 | * we have some of them active, restore the old state. |
11043 | */ |
11044 | if (hw_breakpoint_active()) |
11045 | hw_breakpoint_restore(); |
11046 | |
11047 | vcpu->arch.last_vmentry_cpu = vcpu->cpu; |
11048 | vcpu->arch.last_guest_tsc = kvm_read_l1_tsc(vcpu, rdtsc()); |
11049 | |
11050 | vcpu->mode = OUTSIDE_GUEST_MODE; |
11051 | smp_wmb(); |
11052 | |
11053 | /* |
11054 | * Sync xfd before calling handle_exit_irqoff() which may |
11055 | * rely on the fact that guest_fpu::xfd is up-to-date (e.g. |
11056 | * in #NM irqoff handler). |
11057 | */ |
11058 | if (vcpu->arch.xfd_no_write_intercept) |
11059 | fpu_sync_guest_vmexit_xfd_state(); |
11060 | |
11061 | static_call(kvm_x86_handle_exit_irqoff)(vcpu); |
11062 | |
11063 | if (vcpu->arch.guest_fpu.xfd_err) |
		wrmsrl(MSR_IA32_XFD_ERR, 0);
11065 | |
11066 | /* |
11067 | * Consume any pending interrupts, including the possible source of |
11068 | * VM-Exit on SVM and any ticks that occur between VM-Exit and now. |
11069 | * An instruction is required after local_irq_enable() to fully unblock |
11070 | * interrupts on processors that implement an interrupt shadow, the |
11071 | * stat.exits increment will do nicely. |
11072 | */ |
	kvm_before_interrupt(vcpu, KVM_HANDLING_IRQ);
11074 | local_irq_enable(); |
11075 | ++vcpu->stat.exits; |
11076 | local_irq_disable(); |
11077 | kvm_after_interrupt(vcpu); |
11078 | |
11079 | /* |
11080 | * Wait until after servicing IRQs to account guest time so that any |
11081 | * ticks that occurred while running the guest are properly accounted |
11082 | * to the guest. Waiting until IRQs are enabled degrades the accuracy |
11083 | * of accounting via context tracking, but the loss of accuracy is |
11084 | * acceptable for all known use cases. |
11085 | */ |
11086 | guest_timing_exit_irqoff(); |
11087 | |
11088 | local_irq_enable(); |
11089 | preempt_enable(); |
11090 | |
11091 | kvm_vcpu_srcu_read_lock(vcpu); |
11092 | |
11093 | /* |
11094 | * Profile KVM exit RIPs: |
11095 | */ |
11096 | if (unlikely(prof_on == KVM_PROFILING)) { |
11097 | unsigned long rip = kvm_rip_read(vcpu); |
		profile_hit(KVM_PROFILING, (void *)rip);
11099 | } |
11100 | |
11101 | if (unlikely(vcpu->arch.tsc_always_catchup)) |
11102 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
11103 | |
11104 | if (vcpu->arch.apic_attention) |
11105 | kvm_lapic_sync_from_vapic(vcpu); |
11106 | |
11107 | r = static_call(kvm_x86_handle_exit)(vcpu, exit_fastpath); |
11108 | return r; |
11109 | |
11110 | cancel_injection: |
11111 | if (req_immediate_exit) |
11112 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
11113 | static_call(kvm_x86_cancel_injection)(vcpu); |
11114 | if (unlikely(vcpu->arch.apic_attention)) |
11115 | kvm_lapic_sync_from_vapic(vcpu); |
11116 | out: |
11117 | return r; |
11118 | } |
11119 | |
11120 | /* Called within kvm->srcu read side. */ |
11121 | static inline int vcpu_block(struct kvm_vcpu *vcpu) |
11122 | { |
11123 | bool hv_timer; |
11124 | |
11125 | if (!kvm_arch_vcpu_runnable(vcpu)) { |
11126 | /* |
11127 | * Switch to the software timer before halt-polling/blocking as |
11128 | * the guest's timer may be a break event for the vCPU, and the |
11129 | * hypervisor timer runs only when the CPU is in guest mode. |
11130 | * Switch before halt-polling so that KVM recognizes an expired |
11131 | * timer before blocking. |
11132 | */ |
11133 | hv_timer = kvm_lapic_hv_timer_in_use(vcpu); |
11134 | if (hv_timer) |
11135 | kvm_lapic_switch_to_sw_timer(vcpu); |
11136 | |
11137 | kvm_vcpu_srcu_read_unlock(vcpu); |
11138 | if (vcpu->arch.mp_state == KVM_MP_STATE_HALTED) |
11139 | kvm_vcpu_halt(vcpu); |
11140 | else |
11141 | kvm_vcpu_block(vcpu); |
11142 | kvm_vcpu_srcu_read_lock(vcpu); |
11143 | |
11144 | if (hv_timer) |
11145 | kvm_lapic_switch_to_hv_timer(vcpu); |
11146 | |
11147 | /* |
11148 | * If the vCPU is not runnable, a signal or another host event |
11149 | * of some kind is pending; service it without changing the |
11150 | * vCPU's activity state. |
11151 | */ |
11152 | if (!kvm_arch_vcpu_runnable(vcpu)) |
11153 | return 1; |
11154 | } |
11155 | |
11156 | /* |
11157 | * Evaluate nested events before exiting the halted state. This allows |
11158 | * the halt state to be recorded properly in the VMCS12's activity |
11159 | * state field (AMD does not have a similar field and a VM-Exit always |
11160 | * causes a spurious wakeup from HLT). |
11161 | */ |
11162 | if (is_guest_mode(vcpu)) { |
11163 | if (kvm_check_nested_events(vcpu) < 0) |
11164 | return 0; |
11165 | } |
11166 | |
11167 | if (kvm_apic_accept_events(vcpu) < 0) |
11168 | return 0; |
	switch (vcpu->arch.mp_state) {
	case KVM_MP_STATE_HALTED:
	case KVM_MP_STATE_AP_RESET_HOLD:
		vcpu->arch.pv.pv_unhalted = false;
		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
		fallthrough;
11175 | fallthrough; |
11176 | case KVM_MP_STATE_RUNNABLE: |
11177 | vcpu->arch.apf.halted = false; |
11178 | break; |
11179 | case KVM_MP_STATE_INIT_RECEIVED: |
11180 | break; |
11181 | default: |
11182 | WARN_ON_ONCE(1); |
11183 | break; |
11184 | } |
11185 | return 1; |
11186 | } |
11187 | |
11188 | static inline bool kvm_vcpu_running(struct kvm_vcpu *vcpu) |
11189 | { |
11190 | return (vcpu->arch.mp_state == KVM_MP_STATE_RUNNABLE && |
11191 | !vcpu->arch.apf.halted); |
11192 | } |
11193 | |
11194 | /* Called within kvm->srcu read side. */ |
11195 | static int vcpu_run(struct kvm_vcpu *vcpu) |
11196 | { |
11197 | int r; |
11198 | |
11199 | vcpu->run->exit_reason = KVM_EXIT_UNKNOWN; |
11200 | vcpu->arch.l1tf_flush_l1d = true; |
11201 | |
11202 | for (;;) { |
11203 | /* |
11204 | * If another guest vCPU requests a PV TLB flush in the middle |
11205 | * of instruction emulation, the rest of the emulation could |
11206 | * use a stale page translation. Assume that any code after |
11207 | * this point can start executing an instruction. |
11208 | */ |
11209 | vcpu->arch.at_instruction_boundary = false; |
11210 | if (kvm_vcpu_running(vcpu)) { |
11211 | r = vcpu_enter_guest(vcpu); |
11212 | } else { |
11213 | r = vcpu_block(vcpu); |
11214 | } |
11215 | |
11216 | if (r <= 0) |
11217 | break; |
11218 | |
11219 | kvm_clear_request(KVM_REQ_UNBLOCK, vcpu); |
11220 | if (kvm_xen_has_pending_events(vcpu)) |
11221 | kvm_xen_inject_pending_events(vcpu); |
11222 | |
11223 | if (kvm_cpu_has_pending_timer(vcpu)) |
11224 | kvm_inject_pending_timer_irqs(vcpu); |
11225 | |
11226 | if (dm_request_for_irq_injection(vcpu) && |
11227 | kvm_vcpu_ready_for_interrupt_injection(vcpu)) { |
11228 | r = 0; |
11229 | vcpu->run->exit_reason = KVM_EXIT_IRQ_WINDOW_OPEN; |
11230 | ++vcpu->stat.request_irq_exits; |
11231 | break; |
11232 | } |
11233 | |
11234 | if (__xfer_to_guest_mode_work_pending()) { |
11235 | kvm_vcpu_srcu_read_unlock(vcpu); |
11236 | r = xfer_to_guest_mode_handle_work(vcpu); |
11237 | kvm_vcpu_srcu_read_lock(vcpu); |
11238 | if (r) |
11239 | return r; |
11240 | } |
11241 | } |
11242 | |
11243 | return r; |
11244 | } |
11245 | |
11246 | static inline int complete_emulated_io(struct kvm_vcpu *vcpu) |
11247 | { |
11248 | return kvm_emulate_instruction(vcpu, EMULTYPE_NO_DECODE); |
11249 | } |
11250 | |
11251 | static int complete_emulated_pio(struct kvm_vcpu *vcpu) |
11252 | { |
11253 | BUG_ON(!vcpu->arch.pio.count); |
11254 | |
11255 | return complete_emulated_io(vcpu); |
11256 | } |
11257 | |
11258 | /* |
11259 | * Implements the following, as a state machine: |
11260 | * |
11261 | * read: |
11262 | * for each fragment |
11263 | * for each mmio piece in the fragment |
11264 | * write gpa, len |
11265 | * exit |
11266 | * copy data |
11267 | * execute insn |
11268 | * |
11269 | * write: |
11270 | * for each fragment |
11271 | * for each mmio piece in the fragment |
11272 | * write gpa, len |
11273 | * copy data |
11274 | * exit |
11275 | */ |
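/*
 * Worked example (illustrative): run->mmio.data holds at most 8 bytes, so
 * a single 16-byte read fragment is consumed as two 8-byte pieces. Each
 * piece causes one KVM_EXIT_MMIO; userspace fills run->mmio.data, and the
 * completion below copies it into the fragment and advances gpa/len.
 * Once every fragment is done, the emulator is re-entered with the read
 * data available (complete_emulated_io()).
 */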
11276 | static int complete_emulated_mmio(struct kvm_vcpu *vcpu) |
11277 | { |
11278 | struct kvm_run *run = vcpu->run; |
11279 | struct kvm_mmio_fragment *frag; |
11280 | unsigned len; |
11281 | |
11282 | BUG_ON(!vcpu->mmio_needed); |
11283 | |
11284 | /* Complete previous fragment */ |
11285 | frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment]; |
11286 | len = min(8u, frag->len); |
11287 | if (!vcpu->mmio_is_write) |
11288 | memcpy(frag->data, run->mmio.data, len); |
11289 | |
11290 | if (frag->len <= 8) { |
11291 | /* Switch to the next fragment. */ |
11292 | frag++; |
11293 | vcpu->mmio_cur_fragment++; |
11294 | } else { |
11295 | /* Go forward to the next mmio piece. */ |
11296 | frag->data += len; |
11297 | frag->gpa += len; |
11298 | frag->len -= len; |
11299 | } |
11300 | |
11301 | if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) { |
11302 | vcpu->mmio_needed = 0; |
11303 | |
11304 | /* FIXME: return into emulator if single-stepping. */ |
11305 | if (vcpu->mmio_is_write) |
11306 | return 1; |
11307 | vcpu->mmio_read_completed = 1; |
11308 | return complete_emulated_io(vcpu); |
11309 | } |
11310 | |
11311 | run->exit_reason = KVM_EXIT_MMIO; |
11312 | run->mmio.phys_addr = frag->gpa; |
11313 | if (vcpu->mmio_is_write) |
11314 | memcpy(run->mmio.data, frag->data, min(8u, frag->len)); |
11315 | run->mmio.len = min(8u, frag->len); |
11316 | run->mmio.is_write = vcpu->mmio_is_write; |
11317 | vcpu->arch.complete_userspace_io = complete_emulated_mmio; |
11318 | return 0; |
11319 | } |
11320 | |
11321 | /* Swap (qemu) user FPU context for the guest FPU context. */ |
11322 | static void kvm_load_guest_fpu(struct kvm_vcpu *vcpu) |
11323 | { |
11324 | /* Exclude PKRU, it's restored separately immediately after VM-Exit. */ |
	fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, true);
	trace_kvm_fpu(1);
11327 | } |
11328 | |
11329 | /* When vcpu_run ends, restore user space FPU context. */ |
11330 | static void kvm_put_guest_fpu(struct kvm_vcpu *vcpu) |
11331 | { |
	fpu_swap_kvm_fpstate(&vcpu->arch.guest_fpu, false);
	++vcpu->stat.fpu_reload;
	trace_kvm_fpu(0);
11335 | } |
11336 | |
11337 | int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) |
11338 | { |
11339 | struct kvm_queued_exception *ex = &vcpu->arch.exception; |
11340 | struct kvm_run *kvm_run = vcpu->run; |
11341 | int r; |
11342 | |
11343 | vcpu_load(vcpu); |
11344 | kvm_sigset_activate(vcpu); |
11345 | kvm_run->flags = 0; |
11346 | kvm_load_guest_fpu(vcpu); |
11347 | |
11348 | kvm_vcpu_srcu_read_lock(vcpu); |
11349 | if (unlikely(vcpu->arch.mp_state == KVM_MP_STATE_UNINITIALIZED)) { |
11350 | if (kvm_run->immediate_exit) { |
11351 | r = -EINTR; |
11352 | goto out; |
11353 | } |
11354 | |
11355 | /* |
11356 | * Don't bother switching APIC timer emulation from the |
11357 | * hypervisor timer to the software timer, the only way for the |
11358 | * APIC timer to be active is if userspace stuffed vCPU state, |
11359 | * i.e. put the vCPU into a nonsensical state. Only an INIT |
11360 | * will transition the vCPU out of UNINITIALIZED (without more |
11361 | * state stuffing from userspace), which will reset the local |
11362 | * APIC and thus cancel the timer or drop the IRQ (if the timer |
11363 | * already expired). |
11364 | */ |
11365 | kvm_vcpu_srcu_read_unlock(vcpu); |
11366 | kvm_vcpu_block(vcpu); |
11367 | kvm_vcpu_srcu_read_lock(vcpu); |
11368 | |
11369 | if (kvm_apic_accept_events(vcpu) < 0) { |
11370 | r = 0; |
11371 | goto out; |
11372 | } |
11373 | r = -EAGAIN; |
11374 | if (signal_pending(current)) { |
11375 | r = -EINTR; |
11376 | kvm_run->exit_reason = KVM_EXIT_INTR; |
11377 | ++vcpu->stat.signal_exits; |
11378 | } |
11379 | goto out; |
11380 | } |
11381 | |
11382 | if ((kvm_run->kvm_valid_regs & ~KVM_SYNC_X86_VALID_FIELDS) || |
11383 | (kvm_run->kvm_dirty_regs & ~KVM_SYNC_X86_VALID_FIELDS)) { |
11384 | r = -EINVAL; |
11385 | goto out; |
11386 | } |
11387 | |
11388 | if (kvm_run->kvm_dirty_regs) { |
11389 | r = sync_regs(vcpu); |
11390 | if (r != 0) |
11391 | goto out; |
11392 | } |
11393 | |
11394 | /* re-sync apic's tpr */ |
11395 | if (!lapic_in_kernel(vcpu)) { |
11396 | if (kvm_set_cr8(vcpu, kvm_run->cr8) != 0) { |
11397 | r = -EINVAL; |
11398 | goto out; |
11399 | } |
11400 | } |
11401 | |
11402 | /* |
11403 | * If userspace set a pending exception and L2 is active, convert it to |
11404 | * a pending VM-Exit if L1 wants to intercept the exception. |
11405 | */ |
11406 | if (vcpu->arch.exception_from_userspace && is_guest_mode(vcpu) && |
11407 | kvm_x86_ops.nested_ops->is_exception_vmexit(vcpu, ex->vector, |
11408 | ex->error_code)) { |
		kvm_queue_exception_vmexit(vcpu, ex->vector,
					   ex->has_error_code, ex->error_code,
					   ex->has_payload, ex->payload);
11412 | ex->injected = false; |
11413 | ex->pending = false; |
11414 | } |
11415 | vcpu->arch.exception_from_userspace = false; |
11416 | |
11417 | if (unlikely(vcpu->arch.complete_userspace_io)) { |
11418 | int (*cui)(struct kvm_vcpu *) = vcpu->arch.complete_userspace_io; |
11419 | vcpu->arch.complete_userspace_io = NULL; |
11420 | r = cui(vcpu); |
11421 | if (r <= 0) |
11422 | goto out; |
11423 | } else { |
11424 | WARN_ON_ONCE(vcpu->arch.pio.count); |
11425 | WARN_ON_ONCE(vcpu->mmio_needed); |
11426 | } |
11427 | |
11428 | if (kvm_run->immediate_exit) { |
11429 | r = -EINTR; |
11430 | goto out; |
11431 | } |
11432 | |
11433 | r = static_call(kvm_x86_vcpu_pre_run)(vcpu); |
11434 | if (r <= 0) |
11435 | goto out; |
11436 | |
11437 | r = vcpu_run(vcpu); |
11438 | |
11439 | out: |
11440 | kvm_put_guest_fpu(vcpu); |
11441 | if (kvm_run->kvm_valid_regs) |
11442 | store_regs(vcpu); |
11443 | post_kvm_run_save(vcpu); |
11444 | kvm_vcpu_srcu_read_unlock(vcpu); |
11445 | |
11446 | kvm_sigset_deactivate(vcpu); |
11447 | vcpu_put(vcpu); |
11448 | return r; |
11449 | } |
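
/*
 * For reference, the userspace side of the ioctl above is roughly the
 * following loop (a sketch; vcpu_fd and the mmap'd kvm_run page are
 * assumed to have been set up via KVM_CREATE_VCPU and mmap()):
 *
 *	for (;;) {
 *		ioctl(vcpu_fd, KVM_RUN, 0);
 *		switch (run->exit_reason) {
 *		case KVM_EXIT_IO:   ... emulate the port access ...
 *		case KVM_EXIT_MMIO: ... emulate the memory access ...
 *		case KVM_EXIT_INTR: ... a signal interrupted the vCPU ...
 *		}
 *	}
 */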
11450 | |
11451 | static void __get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) |
11452 | { |
11453 | if (vcpu->arch.emulate_regs_need_sync_to_vcpu) { |
		/*
		 * We are here if userspace calls get_regs() in the middle of
		 * instruction emulation.  Register state needs to be copied
		 * back from the emulation context to the vcpu.  Userspace
		 * shouldn't usually do that, but some badly designed PV
		 * devices (the VMware backdoor interface) need this to work.
		 */
		emulator_writeback_register_cache(vcpu->arch.emulate_ctxt);
11462 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; |
11463 | } |
11464 | regs->rax = kvm_rax_read(vcpu); |
11465 | regs->rbx = kvm_rbx_read(vcpu); |
11466 | regs->rcx = kvm_rcx_read(vcpu); |
11467 | regs->rdx = kvm_rdx_read(vcpu); |
11468 | regs->rsi = kvm_rsi_read(vcpu); |
11469 | regs->rdi = kvm_rdi_read(vcpu); |
11470 | regs->rsp = kvm_rsp_read(vcpu); |
11471 | regs->rbp = kvm_rbp_read(vcpu); |
11472 | #ifdef CONFIG_X86_64 |
11473 | regs->r8 = kvm_r8_read(vcpu); |
11474 | regs->r9 = kvm_r9_read(vcpu); |
11475 | regs->r10 = kvm_r10_read(vcpu); |
11476 | regs->r11 = kvm_r11_read(vcpu); |
11477 | regs->r12 = kvm_r12_read(vcpu); |
11478 | regs->r13 = kvm_r13_read(vcpu); |
11479 | regs->r14 = kvm_r14_read(vcpu); |
11480 | regs->r15 = kvm_r15_read(vcpu); |
11481 | #endif |
11482 | |
11483 | regs->rip = kvm_rip_read(vcpu); |
11484 | regs->rflags = kvm_get_rflags(vcpu); |
11485 | } |
11486 | |
11487 | int kvm_arch_vcpu_ioctl_get_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) |
11488 | { |
11489 | vcpu_load(vcpu); |
11490 | __get_regs(vcpu, regs); |
11491 | vcpu_put(vcpu); |
11492 | return 0; |
11493 | } |
11494 | |
11495 | static void __set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) |
11496 | { |
11497 | vcpu->arch.emulate_regs_need_sync_from_vcpu = true; |
11498 | vcpu->arch.emulate_regs_need_sync_to_vcpu = false; |
11499 | |
	kvm_rax_write(vcpu, regs->rax);
	kvm_rbx_write(vcpu, regs->rbx);
	kvm_rcx_write(vcpu, regs->rcx);
	kvm_rdx_write(vcpu, regs->rdx);
	kvm_rsi_write(vcpu, regs->rsi);
	kvm_rdi_write(vcpu, regs->rdi);
	kvm_rsp_write(vcpu, regs->rsp);
	kvm_rbp_write(vcpu, regs->rbp);
#ifdef CONFIG_X86_64
	kvm_r8_write(vcpu, regs->r8);
	kvm_r9_write(vcpu, regs->r9);
	kvm_r10_write(vcpu, regs->r10);
	kvm_r11_write(vcpu, regs->r11);
	kvm_r12_write(vcpu, regs->r12);
	kvm_r13_write(vcpu, regs->r13);
	kvm_r14_write(vcpu, regs->r14);
	kvm_r15_write(vcpu, regs->r15);
#endif

	kvm_rip_write(vcpu, regs->rip);
	kvm_set_rflags(vcpu, regs->rflags | X86_EFLAGS_FIXED);
11521 | |
11522 | vcpu->arch.exception.pending = false; |
11523 | vcpu->arch.exception_vmexit.pending = false; |
11524 | |
11525 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
11526 | } |
11527 | |
11528 | int kvm_arch_vcpu_ioctl_set_regs(struct kvm_vcpu *vcpu, struct kvm_regs *regs) |
11529 | { |
11530 | vcpu_load(vcpu); |
11531 | __set_regs(vcpu, regs); |
11532 | vcpu_put(vcpu); |
11533 | return 0; |
11534 | } |
11535 | |
11536 | static void __get_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) |
11537 | { |
11538 | struct desc_ptr dt; |
11539 | |
11540 | if (vcpu->arch.guest_state_protected) |
11541 | goto skip_protected_regs; |
11542 | |
	kvm_get_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
	kvm_get_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
	kvm_get_segment(vcpu, &sregs->es, VCPU_SREG_ES);
	kvm_get_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
	kvm_get_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
	kvm_get_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

	kvm_get_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
	kvm_get_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
11552 | |
11553 | static_call(kvm_x86_get_idt)(vcpu, &dt); |
11554 | sregs->idt.limit = dt.size; |
11555 | sregs->idt.base = dt.address; |
11556 | static_call(kvm_x86_get_gdt)(vcpu, &dt); |
11557 | sregs->gdt.limit = dt.size; |
11558 | sregs->gdt.base = dt.address; |
11559 | |
11560 | sregs->cr2 = vcpu->arch.cr2; |
11561 | sregs->cr3 = kvm_read_cr3(vcpu); |
11562 | |
11563 | skip_protected_regs: |
11564 | sregs->cr0 = kvm_read_cr0(vcpu); |
11565 | sregs->cr4 = kvm_read_cr4(vcpu); |
11566 | sregs->cr8 = kvm_get_cr8(vcpu); |
11567 | sregs->efer = vcpu->arch.efer; |
11568 | sregs->apic_base = kvm_get_apic_base(vcpu); |
11569 | } |
11570 | |
11571 | static void __get_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) |
11572 | { |
11573 | __get_sregs_common(vcpu, sregs); |
11574 | |
11575 | if (vcpu->arch.guest_state_protected) |
11576 | return; |
11577 | |
11578 | if (vcpu->arch.interrupt.injected && !vcpu->arch.interrupt.soft) |
		set_bit(vcpu->arch.interrupt.nr,
			(unsigned long *)sregs->interrupt_bitmap);
11581 | } |
11582 | |
11583 | static void __get_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) |
11584 | { |
11585 | int i; |
11586 | |
	__get_sregs_common(vcpu, (struct kvm_sregs *)sregs2);
11588 | |
11589 | if (vcpu->arch.guest_state_protected) |
11590 | return; |
11591 | |
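	/*
	 * The PDPTRs are architecturally visible only under PAE paging
	 * (CR0.PG=1, CR4.PAE=1, EFER.LMA=0), so they are reported to
	 * userspace only in that mode.
	 */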
11592 | if (is_pae_paging(vcpu)) { |
		for (i = 0; i < 4; i++)
			sregs2->pdptrs[i] = kvm_pdptr_read(vcpu, i);
11595 | sregs2->flags |= KVM_SREGS2_FLAGS_PDPTRS_VALID; |
11596 | } |
11597 | } |
11598 | |
11599 | int kvm_arch_vcpu_ioctl_get_sregs(struct kvm_vcpu *vcpu, |
11600 | struct kvm_sregs *sregs) |
11601 | { |
11602 | vcpu_load(vcpu); |
11603 | __get_sregs(vcpu, sregs); |
11604 | vcpu_put(vcpu); |
11605 | return 0; |
11606 | } |
11607 | |
11608 | int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu, |
11609 | struct kvm_mp_state *mp_state) |
11610 | { |
11611 | int r; |
11612 | |
11613 | vcpu_load(vcpu); |
11614 | if (kvm_mpx_supported()) |
11615 | kvm_load_guest_fpu(vcpu); |
11616 | |
11617 | r = kvm_apic_accept_events(vcpu); |
11618 | if (r < 0) |
11619 | goto out; |
11620 | r = 0; |
11621 | |
11622 | if ((vcpu->arch.mp_state == KVM_MP_STATE_HALTED || |
11623 | vcpu->arch.mp_state == KVM_MP_STATE_AP_RESET_HOLD) && |
11624 | vcpu->arch.pv.pv_unhalted) |
11625 | mp_state->mp_state = KVM_MP_STATE_RUNNABLE; |
11626 | else |
11627 | mp_state->mp_state = vcpu->arch.mp_state; |
11628 | |
11629 | out: |
11630 | if (kvm_mpx_supported()) |
11631 | kvm_put_guest_fpu(vcpu); |
11632 | vcpu_put(vcpu); |
11633 | return r; |
11634 | } |
11635 | |
11636 | int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu, |
11637 | struct kvm_mp_state *mp_state) |
11638 | { |
11639 | int ret = -EINVAL; |
11640 | |
11641 | vcpu_load(vcpu); |
11642 | |
11643 | switch (mp_state->mp_state) { |
11644 | case KVM_MP_STATE_UNINITIALIZED: |
11645 | case KVM_MP_STATE_HALTED: |
11646 | case KVM_MP_STATE_AP_RESET_HOLD: |
11647 | case KVM_MP_STATE_INIT_RECEIVED: |
11648 | case KVM_MP_STATE_SIPI_RECEIVED: |
11649 | if (!lapic_in_kernel(vcpu)) |
11650 | goto out; |
11651 | break; |
11652 | |
11653 | case KVM_MP_STATE_RUNNABLE: |
11654 | break; |
11655 | |
11656 | default: |
11657 | goto out; |
11658 | } |
11659 | |
11660 | /* |
11661 | * Pending INITs are reported using KVM_SET_VCPU_EVENTS, disallow |
11662 | * forcing the guest into INIT/SIPI if those events are supposed to be |
11663 | * blocked. KVM prioritizes SMI over INIT, so reject INIT/SIPI state |
11664 | * if an SMI is pending as well. |
11665 | */ |
11666 | if ((!kvm_apic_init_sipi_allowed(vcpu) || vcpu->arch.smi_pending) && |
11667 | (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED || |
11668 | mp_state->mp_state == KVM_MP_STATE_INIT_RECEIVED)) |
11669 | goto out; |
11670 | |
11671 | if (mp_state->mp_state == KVM_MP_STATE_SIPI_RECEIVED) { |
11672 | vcpu->arch.mp_state = KVM_MP_STATE_INIT_RECEIVED; |
		set_bit(KVM_APIC_SIPI, &vcpu->arch.apic->pending_events);
11674 | } else |
11675 | vcpu->arch.mp_state = mp_state->mp_state; |
11676 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
11677 | |
11678 | ret = 0; |
11679 | out: |
11680 | vcpu_put(vcpu); |
11681 | return ret; |
11682 | } |
11683 | |
11684 | int kvm_task_switch(struct kvm_vcpu *vcpu, u16 tss_selector, int idt_index, |
11685 | int reason, bool has_error_code, u32 error_code) |
11686 | { |
11687 | struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; |
11688 | int ret; |
11689 | |
11690 | init_emulate_ctxt(vcpu); |
11691 | |
11692 | ret = emulator_task_switch(ctxt, tss_selector, idt_index, reason, |
11693 | has_error_code, error_code); |
11694 | if (ret) { |
11695 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; |
11696 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; |
11697 | vcpu->run->internal.ndata = 0; |
11698 | return 0; |
11699 | } |
11700 | |
	kvm_rip_write(vcpu, ctxt->eip);
	kvm_set_rflags(vcpu, ctxt->eflags);
11703 | return 1; |
11704 | } |
11705 | EXPORT_SYMBOL_GPL(kvm_task_switch); |
11706 | |
11707 | static bool kvm_is_valid_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) |
11708 | { |
11709 | if ((sregs->efer & EFER_LME) && (sregs->cr0 & X86_CR0_PG)) { |
11710 | /* |
11711 | * When EFER.LME and CR0.PG are set, the processor is in |
11712 | * 64-bit mode (though maybe in a 32-bit code segment). |
11713 | * CR4.PAE and EFER.LMA must be set. |
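		 * For example, a state with EFER.LME=1 and CR0.PG=1 but
		 * CR4.PAE=0 is unreachable on real hardware (the CR0 write
		 * enabling long mode would fault) and is rejected here.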
11714 | */ |
11715 | if (!(sregs->cr4 & X86_CR4_PAE) || !(sregs->efer & EFER_LMA)) |
11716 | return false; |
		if (!kvm_vcpu_is_legal_cr3(vcpu, sregs->cr3))
11718 | return false; |
11719 | } else { |
11720 | /* |
11721 | * Not in 64-bit mode: EFER.LMA is clear and the code |
11722 | * segment cannot be 64-bit. |
11723 | */ |
11724 | if (sregs->efer & EFER_LMA || sregs->cs.l) |
11725 | return false; |
11726 | } |
11727 | |
	return kvm_is_valid_cr4(vcpu, sregs->cr4) &&
	       kvm_is_valid_cr0(vcpu, sregs->cr0);
11730 | } |
11731 | |
11732 | static int __set_sregs_common(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs, |
11733 | int *mmu_reset_needed, bool update_pdptrs) |
11734 | { |
11735 | struct msr_data apic_base_msr; |
11736 | int idx; |
11737 | struct desc_ptr dt; |
11738 | |
11739 | if (!kvm_is_valid_sregs(vcpu, sregs)) |
11740 | return -EINVAL; |
11741 | |
11742 | apic_base_msr.data = sregs->apic_base; |
11743 | apic_base_msr.host_initiated = true; |
	if (kvm_set_apic_base(vcpu, &apic_base_msr))
11745 | return -EINVAL; |
11746 | |
11747 | if (vcpu->arch.guest_state_protected) |
11748 | return 0; |
11749 | |
11750 | dt.size = sregs->idt.limit; |
11751 | dt.address = sregs->idt.base; |
11752 | static_call(kvm_x86_set_idt)(vcpu, &dt); |
11753 | dt.size = sregs->gdt.limit; |
11754 | dt.address = sregs->gdt.base; |
11755 | static_call(kvm_x86_set_gdt)(vcpu, &dt); |
11756 | |
11757 | vcpu->arch.cr2 = sregs->cr2; |
11758 | *mmu_reset_needed |= kvm_read_cr3(vcpu) != sregs->cr3; |
11759 | vcpu->arch.cr3 = sregs->cr3; |
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
11761 | static_call_cond(kvm_x86_post_set_cr3)(vcpu, sregs->cr3); |
11762 | |
11763 | kvm_set_cr8(vcpu, sregs->cr8); |
11764 | |
11765 | *mmu_reset_needed |= vcpu->arch.efer != sregs->efer; |
11766 | static_call(kvm_x86_set_efer)(vcpu, sregs->efer); |
11767 | |
11768 | *mmu_reset_needed |= kvm_read_cr0(vcpu) != sregs->cr0; |
11769 | static_call(kvm_x86_set_cr0)(vcpu, sregs->cr0); |
11770 | |
11771 | *mmu_reset_needed |= kvm_read_cr4(vcpu) != sregs->cr4; |
11772 | static_call(kvm_x86_set_cr4)(vcpu, sregs->cr4); |
11773 | |
11774 | if (update_pdptrs) { |
		idx = srcu_read_lock(&vcpu->kvm->srcu);
11776 | if (is_pae_paging(vcpu)) { |
11777 | load_pdptrs(vcpu, kvm_read_cr3(vcpu)); |
11778 | *mmu_reset_needed = 1; |
11779 | } |
		srcu_read_unlock(&vcpu->kvm->srcu, idx);
11781 | } |
11782 | |
	kvm_set_segment(vcpu, &sregs->cs, VCPU_SREG_CS);
	kvm_set_segment(vcpu, &sregs->ds, VCPU_SREG_DS);
	kvm_set_segment(vcpu, &sregs->es, VCPU_SREG_ES);
	kvm_set_segment(vcpu, &sregs->fs, VCPU_SREG_FS);
	kvm_set_segment(vcpu, &sregs->gs, VCPU_SREG_GS);
	kvm_set_segment(vcpu, &sregs->ss, VCPU_SREG_SS);

	kvm_set_segment(vcpu, &sregs->tr, VCPU_SREG_TR);
	kvm_set_segment(vcpu, &sregs->ldt, VCPU_SREG_LDTR);
11792 | |
11793 | update_cr8_intercept(vcpu); |
11794 | |
11795 | /* Older userspace won't unhalt the vcpu on reset. */ |
11796 | if (kvm_vcpu_is_bsp(vcpu) && kvm_rip_read(vcpu) == 0xfff0 && |
11797 | sregs->cs.selector == 0xf000 && sregs->cs.base == 0xffff0000 && |
11798 | !is_protmode(vcpu)) |
11799 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
11800 | |
11801 | return 0; |
11802 | } |
11803 | |
11804 | static int __set_sregs(struct kvm_vcpu *vcpu, struct kvm_sregs *sregs) |
11805 | { |
11806 | int pending_vec, max_bits; |
11807 | int mmu_reset_needed = 0; |
	int ret = __set_sregs_common(vcpu, sregs, &mmu_reset_needed, true);
11809 | |
11810 | if (ret) |
11811 | return ret; |
11812 | |
11813 | if (mmu_reset_needed) { |
11814 | kvm_mmu_reset_context(vcpu); |
11815 | kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); |
11816 | } |
11817 | |
11818 | max_bits = KVM_NR_INTERRUPTS; |
	pending_vec = find_first_bit(
		(const unsigned long *)sregs->interrupt_bitmap, max_bits);

	if (pending_vec < max_bits) {
		kvm_queue_interrupt(vcpu, pending_vec, false);
		pr_debug("Set back pending irq %d\n", pending_vec);
11825 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
11826 | } |
11827 | return 0; |
11828 | } |
11829 | |
11830 | static int __set_sregs2(struct kvm_vcpu *vcpu, struct kvm_sregs2 *sregs2) |
11831 | { |
11832 | int mmu_reset_needed = 0; |
11833 | bool valid_pdptrs = sregs2->flags & KVM_SREGS2_FLAGS_PDPTRS_VALID; |
11834 | bool pae = (sregs2->cr0 & X86_CR0_PG) && (sregs2->cr4 & X86_CR4_PAE) && |
11835 | !(sregs2->efer & EFER_LMA); |
11836 | int i, ret; |
11837 | |
11838 | if (sregs2->flags & ~KVM_SREGS2_FLAGS_PDPTRS_VALID) |
11839 | return -EINVAL; |
11840 | |
11841 | if (valid_pdptrs && (!pae || vcpu->arch.guest_state_protected)) |
11842 | return -EINVAL; |
11843 | |
	ret = __set_sregs_common(vcpu, (struct kvm_sregs *)sregs2,
				 &mmu_reset_needed, !valid_pdptrs);
11846 | if (ret) |
11847 | return ret; |
11848 | |
11849 | if (valid_pdptrs) { |
		for (i = 0; i < 4; i++)
			kvm_pdptr_write(vcpu, i, sregs2->pdptrs[i]);

		kvm_register_mark_dirty(vcpu, VCPU_EXREG_PDPTR);
11854 | mmu_reset_needed = 1; |
11855 | vcpu->arch.pdptrs_from_userspace = true; |
11856 | } |
11857 | if (mmu_reset_needed) { |
11858 | kvm_mmu_reset_context(vcpu); |
11859 | kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); |
11860 | } |
11861 | return 0; |
11862 | } |
11863 | |
11864 | int kvm_arch_vcpu_ioctl_set_sregs(struct kvm_vcpu *vcpu, |
11865 | struct kvm_sregs *sregs) |
11866 | { |
11867 | int ret; |
11868 | |
11869 | vcpu_load(vcpu); |
11870 | ret = __set_sregs(vcpu, sregs); |
11871 | vcpu_put(vcpu); |
11872 | return ret; |
11873 | } |
11874 | |
11875 | static void kvm_arch_vcpu_guestdbg_update_apicv_inhibit(struct kvm *kvm) |
11876 | { |
11877 | bool set = false; |
11878 | struct kvm_vcpu *vcpu; |
11879 | unsigned long i; |
11880 | |
11881 | if (!enable_apicv) |
11882 | return; |
11883 | |
	down_write(&kvm->arch.apicv_update_lock);
11885 | |
11886 | kvm_for_each_vcpu(i, vcpu, kvm) { |
11887 | if (vcpu->guest_debug & KVM_GUESTDBG_BLOCKIRQ) { |
11888 | set = true; |
11889 | break; |
11890 | } |
11891 | } |
	__kvm_set_or_clear_apicv_inhibit(kvm, APICV_INHIBIT_REASON_BLOCKIRQ, set);
	up_write(&kvm->arch.apicv_update_lock);
11894 | } |
11895 | |
11896 | int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu *vcpu, |
11897 | struct kvm_guest_debug *dbg) |
11898 | { |
11899 | unsigned long rflags; |
11900 | int i, r; |
11901 | |
11902 | if (vcpu->arch.guest_state_protected) |
11903 | return -EINVAL; |
11904 | |
11905 | vcpu_load(vcpu); |
11906 | |
11907 | if (dbg->control & (KVM_GUESTDBG_INJECT_DB | KVM_GUESTDBG_INJECT_BP)) { |
11908 | r = -EBUSY; |
11909 | if (kvm_is_exception_pending(vcpu)) |
11910 | goto out; |
11911 | if (dbg->control & KVM_GUESTDBG_INJECT_DB) |
11912 | kvm_queue_exception(vcpu, DB_VECTOR); |
11913 | else |
11914 | kvm_queue_exception(vcpu, BP_VECTOR); |
11915 | } |
11916 | |
11917 | /* |
11918 | * Read rflags as long as potentially injected trace flags are still |
11919 | * filtered out. |
11920 | */ |
11921 | rflags = kvm_get_rflags(vcpu); |
11922 | |
11923 | vcpu->guest_debug = dbg->control; |
11924 | if (!(vcpu->guest_debug & KVM_GUESTDBG_ENABLE)) |
11925 | vcpu->guest_debug = 0; |
11926 | |
11927 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) { |
11928 | for (i = 0; i < KVM_NR_DB_REGS; ++i) |
11929 | vcpu->arch.eff_db[i] = dbg->arch.debugreg[i]; |
11930 | vcpu->arch.guest_debug_dr7 = dbg->arch.debugreg[7]; |
11931 | } else { |
11932 | for (i = 0; i < KVM_NR_DB_REGS; i++) |
11933 | vcpu->arch.eff_db[i] = vcpu->arch.db[i]; |
11934 | } |
11935 | kvm_update_dr7(vcpu); |
11936 | |
11937 | if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP) |
11938 | vcpu->arch.singlestep_rip = kvm_get_linear_rip(vcpu); |
11939 | |
11940 | /* |
11941 | * Trigger an rflags update that will inject or remove the trace |
11942 | * flags. |
11943 | */ |
11944 | kvm_set_rflags(vcpu, rflags); |
11945 | |
11946 | static_call(kvm_x86_update_exception_bitmap)(vcpu); |
11947 | |
	kvm_arch_vcpu_guestdbg_update_apicv_inhibit(vcpu->kvm);
11949 | |
11950 | r = 0; |
11951 | |
11952 | out: |
11953 | vcpu_put(vcpu); |
11954 | return r; |
11955 | } |
11956 | |
11957 | /* |
11958 | * Translate a guest virtual address to a guest physical address. |
11959 | */ |
11960 | int kvm_arch_vcpu_ioctl_translate(struct kvm_vcpu *vcpu, |
11961 | struct kvm_translation *tr) |
11962 | { |
11963 | unsigned long vaddr = tr->linear_address; |
11964 | gpa_t gpa; |
11965 | int idx; |
11966 | |
11967 | vcpu_load(vcpu); |
11968 | |
	idx = srcu_read_lock(&vcpu->kvm->srcu);
	gpa = kvm_mmu_gva_to_gpa_system(vcpu, vaddr, NULL);
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
11972 | tr->physical_address = gpa; |
11973 | tr->valid = gpa != INVALID_GPA; |
11974 | tr->writeable = 1; |
11975 | tr->usermode = 0; |
11976 | |
11977 | vcpu_put(vcpu); |
11978 | return 0; |
11979 | } |
11980 | |
11981 | int kvm_arch_vcpu_ioctl_get_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) |
11982 | { |
11983 | struct fxregs_state *fxsave; |
11984 | |
	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
11986 | return 0; |
11987 | |
11988 | vcpu_load(vcpu); |
11989 | |
11990 | fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; |
11991 | memcpy(fpu->fpr, fxsave->st_space, 128); |
11992 | fpu->fcw = fxsave->cwd; |
11993 | fpu->fsw = fxsave->swd; |
11994 | fpu->ftwx = fxsave->twd; |
11995 | fpu->last_opcode = fxsave->fop; |
11996 | fpu->last_ip = fxsave->rip; |
11997 | fpu->last_dp = fxsave->rdp; |
11998 | memcpy(fpu->xmm, fxsave->xmm_space, sizeof(fxsave->xmm_space)); |
11999 | |
12000 | vcpu_put(vcpu); |
12001 | return 0; |
12002 | } |
12003 | |
12004 | int kvm_arch_vcpu_ioctl_set_fpu(struct kvm_vcpu *vcpu, struct kvm_fpu *fpu) |
12005 | { |
12006 | struct fxregs_state *fxsave; |
12007 | |
	if (fpstate_is_confidential(&vcpu->arch.guest_fpu))
12009 | return 0; |
12010 | |
12011 | vcpu_load(vcpu); |
12012 | |
12013 | fxsave = &vcpu->arch.guest_fpu.fpstate->regs.fxsave; |
12014 | |
12015 | memcpy(fxsave->st_space, fpu->fpr, 128); |
12016 | fxsave->cwd = fpu->fcw; |
12017 | fxsave->swd = fpu->fsw; |
12018 | fxsave->twd = fpu->ftwx; |
12019 | fxsave->fop = fpu->last_opcode; |
12020 | fxsave->rip = fpu->last_ip; |
12021 | fxsave->rdp = fpu->last_dp; |
12022 | memcpy(fxsave->xmm_space, fpu->xmm, sizeof(fxsave->xmm_space)); |
12023 | |
12024 | vcpu_put(vcpu); |
12025 | return 0; |
12026 | } |
12027 | |
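/*
 * store_regs() and sync_regs() implement the KVM_CAP_SYNC_REGS protocol:
 * register sets flagged in kvm_run->kvm_dirty_regs are loaded from the
 * shared kvm_run page before entering the guest, and the sets flagged in
 * kvm_run->kvm_valid_regs are written back on exit, saving userspace a
 * round of KVM_{GET,SET}_{REGS,SREGS,VCPU_EVENTS} ioctls per exit.
 */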
12028 | static void store_regs(struct kvm_vcpu *vcpu) |
12029 | { |
12030 | BUILD_BUG_ON(sizeof(struct kvm_sync_regs) > SYNC_REGS_SIZE_BYTES); |
12031 | |
12032 | if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_REGS) |
		__get_regs(vcpu, &vcpu->run->s.regs.regs);
12034 | |
12035 | if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_SREGS) |
		__get_sregs(vcpu, &vcpu->run->s.regs.sregs);
12037 | |
12038 | if (vcpu->run->kvm_valid_regs & KVM_SYNC_X86_EVENTS) |
		kvm_vcpu_ioctl_x86_get_vcpu_events(vcpu,
						   &vcpu->run->s.regs.events);
12041 | } |
12042 | |
12043 | static int sync_regs(struct kvm_vcpu *vcpu) |
12044 | { |
12045 | if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_REGS) { |
		__set_regs(vcpu, &vcpu->run->s.regs.regs);
12047 | vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_REGS; |
12048 | } |
12049 | |
12050 | if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_SREGS) { |
12051 | struct kvm_sregs sregs = vcpu->run->s.regs.sregs; |
12052 | |
		if (__set_sregs(vcpu, &sregs))
12054 | return -EINVAL; |
12055 | |
12056 | vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_SREGS; |
12057 | } |
12058 | |
12059 | if (vcpu->run->kvm_dirty_regs & KVM_SYNC_X86_EVENTS) { |
12060 | struct kvm_vcpu_events events = vcpu->run->s.regs.events; |
12061 | |
		if (kvm_vcpu_ioctl_x86_set_vcpu_events(vcpu, &events))
12063 | return -EINVAL; |
12064 | |
12065 | vcpu->run->kvm_dirty_regs &= ~KVM_SYNC_X86_EVENTS; |
12066 | } |
12067 | |
12068 | return 0; |
12069 | } |
12070 | |
12071 | int kvm_arch_vcpu_precreate(struct kvm *kvm, unsigned int id) |
12072 | { |
12073 | if (kvm_check_tsc_unstable() && kvm->created_vcpus) |
		pr_warn_once("SMP vm created on host with unstable TSC; "
			     "guest TSC will not be reliable\n");
12076 | |
12077 | if (!kvm->arch.max_vcpu_ids) |
12078 | kvm->arch.max_vcpu_ids = KVM_MAX_VCPU_IDS; |
12079 | |
12080 | if (id >= kvm->arch.max_vcpu_ids) |
12081 | return -EINVAL; |
12082 | |
12083 | return static_call(kvm_x86_vcpu_precreate)(kvm); |
12084 | } |
12085 | |
12086 | int kvm_arch_vcpu_create(struct kvm_vcpu *vcpu) |
12087 | { |
12088 | struct page *page; |
12089 | int r; |
12090 | |
12091 | vcpu->arch.last_vmentry_cpu = -1; |
12092 | vcpu->arch.regs_avail = ~0; |
12093 | vcpu->arch.regs_dirty = ~0; |
12094 | |
	kvm_gpc_init(&vcpu->arch.pv_time, vcpu->kvm);
12096 | |
	if (!irqchip_in_kernel(vcpu->kvm) || kvm_vcpu_is_reset_bsp(vcpu))
12098 | vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE; |
12099 | else |
12100 | vcpu->arch.mp_state = KVM_MP_STATE_UNINITIALIZED; |
12101 | |
12102 | r = kvm_mmu_create(vcpu); |
12103 | if (r < 0) |
12104 | return r; |
12105 | |
	r = kvm_create_lapic(vcpu, lapic_timer_advance_ns);
12107 | if (r < 0) |
12108 | goto fail_mmu_destroy; |
12109 | |
12110 | r = -ENOMEM; |
12111 | |
12112 | page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); |
12113 | if (!page) |
12114 | goto fail_free_lapic; |
12115 | vcpu->arch.pio_data = page_address(page); |
12116 | |
	vcpu->arch.mce_banks = kcalloc(KVM_MAX_MCE_BANKS * 4, sizeof(u64),
				       GFP_KERNEL_ACCOUNT);
	vcpu->arch.mci_ctl2_banks = kcalloc(KVM_MAX_MCE_BANKS, sizeof(u64),
					    GFP_KERNEL_ACCOUNT);
12121 | if (!vcpu->arch.mce_banks || !vcpu->arch.mci_ctl2_banks) |
12122 | goto fail_free_mce_banks; |
12123 | vcpu->arch.mcg_cap = KVM_MAX_MCE_BANKS; |
12124 | |
	if (!zalloc_cpumask_var(&vcpu->arch.wbinvd_dirty_mask,
12126 | GFP_KERNEL_ACCOUNT)) |
12127 | goto fail_free_mce_banks; |
12128 | |
12129 | if (!alloc_emulate_ctxt(vcpu)) |
12130 | goto free_wbinvd_dirty_mask; |
12131 | |
	if (!fpu_alloc_guest_fpstate(&vcpu->arch.guest_fpu)) {
		pr_err("failed to allocate vcpu's fpu\n");
12134 | goto free_emulate_ctxt; |
12135 | } |
12136 | |
12137 | vcpu->arch.maxphyaddr = cpuid_query_maxphyaddr(vcpu); |
12138 | vcpu->arch.reserved_gpa_bits = kvm_vcpu_reserved_gpa_bits_raw(vcpu); |
12139 | |
12140 | vcpu->arch.pat = MSR_IA32_CR_PAT_DEFAULT; |
12141 | |
12142 | kvm_async_pf_hash_reset(vcpu); |
12143 | |
12144 | vcpu->arch.perf_capabilities = kvm_caps.supported_perf_cap; |
12145 | kvm_pmu_init(vcpu); |
12146 | |
12147 | vcpu->arch.pending_external_vector = -1; |
12148 | vcpu->arch.preempted_in_kernel = false; |
12149 | |
12150 | #if IS_ENABLED(CONFIG_HYPERV) |
12151 | vcpu->arch.hv_root_tdp = INVALID_PAGE; |
12152 | #endif |
12153 | |
12154 | r = static_call(kvm_x86_vcpu_create)(vcpu); |
12155 | if (r) |
12156 | goto free_guest_fpu; |
12157 | |
12158 | vcpu->arch.arch_capabilities = kvm_get_arch_capabilities(); |
12159 | vcpu->arch.msr_platform_info = MSR_PLATFORM_INFO_CPUID_FAULT; |
12160 | kvm_xen_init_vcpu(vcpu); |
12161 | kvm_vcpu_mtrr_init(vcpu); |
12162 | vcpu_load(vcpu); |
	kvm_set_tsc_khz(vcpu, vcpu->kvm->arch.default_tsc_khz);
	kvm_vcpu_reset(vcpu, false);
12165 | kvm_init_mmu(vcpu); |
12166 | vcpu_put(vcpu); |
12167 | return 0; |
12168 | |
12169 | free_guest_fpu: |
	fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
free_emulate_ctxt:
	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
free_wbinvd_dirty_mask:
	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
fail_free_mce_banks:
	kfree(vcpu->arch.mce_banks);
	kfree(vcpu->arch.mci_ctl2_banks);
12178 | free_page((unsigned long)vcpu->arch.pio_data); |
12179 | fail_free_lapic: |
12180 | kvm_free_lapic(vcpu); |
12181 | fail_mmu_destroy: |
12182 | kvm_mmu_destroy(vcpu); |
12183 | return r; |
12184 | } |
12185 | |
12186 | void kvm_arch_vcpu_postcreate(struct kvm_vcpu *vcpu) |
12187 | { |
12188 | struct kvm *kvm = vcpu->kvm; |
12189 | |
12190 | if (mutex_lock_killable(&vcpu->mutex)) |
12191 | return; |
12192 | vcpu_load(vcpu); |
12193 | kvm_synchronize_tsc(vcpu, NULL); |
12194 | vcpu_put(vcpu); |
12195 | |
12196 | /* poll control enabled by default */ |
12197 | vcpu->arch.msr_kvm_poll_control = 1; |
12198 | |
	mutex_unlock(&vcpu->mutex);
12200 | |
12201 | if (kvmclock_periodic_sync && vcpu->vcpu_idx == 0) |
		schedule_delayed_work(&kvm->arch.kvmclock_sync_work,
				      KVMCLOCK_SYNC_PERIOD);
12204 | } |
12205 | |
12206 | void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu) |
12207 | { |
12208 | int idx; |
12209 | |
12210 | kvmclock_reset(vcpu); |
12211 | |
12212 | static_call(kvm_x86_vcpu_free)(vcpu); |
12213 | |
	kmem_cache_free(x86_emulator_cache, vcpu->arch.emulate_ctxt);
	free_cpumask_var(vcpu->arch.wbinvd_dirty_mask);
	fpu_free_guest_fpstate(&vcpu->arch.guest_fpu);
12217 | |
12218 | kvm_xen_destroy_vcpu(vcpu); |
12219 | kvm_hv_vcpu_uninit(vcpu); |
12220 | kvm_pmu_destroy(vcpu); |
	kfree(vcpu->arch.mce_banks);
	kfree(vcpu->arch.mci_ctl2_banks);
	kvm_free_lapic(vcpu);
	idx = srcu_read_lock(&vcpu->kvm->srcu);
	kvm_mmu_destroy(vcpu);
	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	free_page((unsigned long)vcpu->arch.pio_data);
	kvfree(vcpu->arch.cpuid_entries);
12229 | } |
12230 | |
12231 | void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event) |
12232 | { |
12233 | struct kvm_cpuid_entry2 *cpuid_0x1; |
12234 | unsigned long old_cr0 = kvm_read_cr0(vcpu); |
12235 | unsigned long new_cr0; |
12236 | |
12237 | /* |
12238 | * Several of the "set" flows, e.g. ->set_cr0(), read other registers |
12239 | * to handle side effects. RESET emulation hits those flows and relies |
12240 | * on emulated/virtualized registers, including those that are loaded |
12241 | * into hardware, to be zeroed at vCPU creation. Use CRs as a sentinel |
12242 | * to detect improper or missing initialization. |
12243 | */ |
12244 | WARN_ON_ONCE(!init_event && |
12245 | (old_cr0 || kvm_read_cr3(vcpu) || kvm_read_cr4(vcpu))); |
12246 | |
12247 | /* |
12248 | * SVM doesn't unconditionally VM-Exit on INIT and SHUTDOWN, thus it's |
12249 | * possible to INIT the vCPU while L2 is active. Force the vCPU back |
12250 | * into L1 as EFER.SVME is cleared on INIT (along with all other EFER |
12251 | * bits), i.e. virtualization is disabled. |
12252 | */ |
12253 | if (is_guest_mode(vcpu)) |
12254 | kvm_leave_nested(vcpu); |
12255 | |
12256 | kvm_lapic_reset(vcpu, init_event); |
12257 | |
12258 | WARN_ON_ONCE(is_guest_mode(vcpu) || is_smm(vcpu)); |
12259 | vcpu->arch.hflags = 0; |
12260 | |
12261 | vcpu->arch.smi_pending = 0; |
12262 | vcpu->arch.smi_count = 0; |
	atomic_set(&vcpu->arch.nmi_queued, 0);
12264 | vcpu->arch.nmi_pending = 0; |
12265 | vcpu->arch.nmi_injected = false; |
12266 | kvm_clear_interrupt_queue(vcpu); |
12267 | kvm_clear_exception_queue(vcpu); |
12268 | |
12269 | memset(vcpu->arch.db, 0, sizeof(vcpu->arch.db)); |
12270 | kvm_update_dr0123(vcpu); |
12271 | vcpu->arch.dr6 = DR6_ACTIVE_LOW; |
12272 | vcpu->arch.dr7 = DR7_FIXED_1; |
12273 | kvm_update_dr7(vcpu); |
12274 | |
12275 | vcpu->arch.cr2 = 0; |
12276 | |
12277 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
12278 | vcpu->arch.apf.msr_en_val = 0; |
12279 | vcpu->arch.apf.msr_int_val = 0; |
12280 | vcpu->arch.st.msr_val = 0; |
12281 | |
12282 | kvmclock_reset(vcpu); |
12283 | |
12284 | kvm_clear_async_pf_completion_queue(vcpu); |
12285 | kvm_async_pf_hash_reset(vcpu); |
12286 | vcpu->arch.apf.halted = false; |
12287 | |
12288 | if (vcpu->arch.guest_fpu.fpstate && kvm_mpx_supported()) { |
12289 | struct fpstate *fpstate = vcpu->arch.guest_fpu.fpstate; |
12290 | |
12291 | /* |
12292 | * All paths that lead to INIT are required to load the guest's |
12293 | * FPU state (because most paths are buried in KVM_RUN). |
12294 | */ |
12295 | if (init_event) |
12296 | kvm_put_guest_fpu(vcpu); |
12297 | |
		fpstate_clear_xstate_component(fpstate, XFEATURE_BNDREGS);
		fpstate_clear_xstate_component(fpstate, XFEATURE_BNDCSR);
12300 | |
12301 | if (init_event) |
12302 | kvm_load_guest_fpu(vcpu); |
12303 | } |
12304 | |
12305 | if (!init_event) { |
12306 | vcpu->arch.smbase = 0x30000; |
12307 | |
12308 | vcpu->arch.msr_misc_features_enables = 0; |
12309 | vcpu->arch.ia32_misc_enable_msr = MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | |
12310 | MSR_IA32_MISC_ENABLE_BTS_UNAVAIL; |
12311 | |
		__kvm_set_xcr(vcpu, 0, XFEATURE_MASK_FP);
		__kvm_set_msr(vcpu, MSR_IA32_XSS, 0, true);
12314 | } |
12315 | |
12316 | /* All GPRs except RDX (handled below) are zeroed on RESET/INIT. */ |
12317 | memset(vcpu->arch.regs, 0, sizeof(vcpu->arch.regs)); |
	kvm_register_mark_dirty(vcpu, VCPU_REGS_RSP);
12319 | |
12320 | /* |
12321 | * Fall back to KVM's default Family/Model/Stepping of 0x600 (P6/Athlon) |
12322 | * if no CPUID match is found. Note, it's impossible to get a match at |
12323 | * RESET since KVM emulates RESET before exposing the vCPU to userspace, |
12324 | * i.e. it's impossible for kvm_find_cpuid_entry() to find a valid entry |
12325 | * on RESET. But, go through the motions in case that's ever remedied. |
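	 * In CPUID.01H:EAX terms, 0x600 decodes as family 6, model 0,
	 * stepping 0.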
12326 | */ |
	cpuid_0x1 = kvm_find_cpuid_entry(vcpu, 1);
	kvm_rdx_write(vcpu, cpuid_0x1 ? cpuid_0x1->eax : 0x600);
12329 | |
12330 | static_call(kvm_x86_vcpu_reset)(vcpu, init_event); |
12331 | |
12332 | kvm_set_rflags(vcpu, X86_EFLAGS_FIXED); |
	kvm_rip_write(vcpu, 0xfff0);
12334 | |
12335 | vcpu->arch.cr3 = 0; |
	kvm_register_mark_dirty(vcpu, VCPU_EXREG_CR3);
12337 | |
12338 | /* |
12339 | * CR0.CD/NW are set on RESET, preserved on INIT. Note, some versions |
12340 | * of Intel's SDM list CD/NW as being set on INIT, but they contradict |
12341 | * (or qualify) that with a footnote stating that CD/NW are preserved. |
12342 | */ |
12343 | new_cr0 = X86_CR0_ET; |
12344 | if (init_event) |
12345 | new_cr0 |= (old_cr0 & (X86_CR0_NW | X86_CR0_CD)); |
12346 | else |
12347 | new_cr0 |= X86_CR0_NW | X86_CR0_CD; |
12348 | |
12349 | static_call(kvm_x86_set_cr0)(vcpu, new_cr0); |
12350 | static_call(kvm_x86_set_cr4)(vcpu, 0); |
12351 | static_call(kvm_x86_set_efer)(vcpu, 0); |
12352 | static_call(kvm_x86_update_exception_bitmap)(vcpu); |
12353 | |
12354 | /* |
12355 | * On the standard CR0/CR4/EFER modification paths, there are several |
12356 | * complex conditions determining whether the MMU has to be reset and/or |
12357 | * which PCIDs have to be flushed. However, CR0.WP and the paging-related |
12358 | * bits in CR4 and EFER are irrelevant if CR0.PG was '0'; and a reset+flush |
12359 | * is needed anyway if CR0.PG was '1' (which can only happen for INIT, as |
12360 | * CR0 will be '0' prior to RESET). So we only need to check CR0.PG here. |
12361 | */ |
12362 | if (old_cr0 & X86_CR0_PG) { |
12363 | kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); |
12364 | kvm_mmu_reset_context(vcpu); |
12365 | } |
12366 | |
12367 | /* |
12368 | * Intel's SDM states that all TLB entries are flushed on INIT. AMD's |
12369 | * APM states the TLBs are untouched by INIT, but it also states that |
12370 | * the TLBs are flushed on "External initialization of the processor." |
12371 | * Flush the guest TLB regardless of vendor, there is no meaningful |
12372 | * benefit in relying on the guest to flush the TLB immediately after |
12373 | * INIT. A spurious TLB flush is benign and likely negligible from a |
12374 | * performance perspective. |
12375 | */ |
12376 | if (init_event) |
12377 | kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu); |
12378 | } |
12379 | EXPORT_SYMBOL_GPL(kvm_vcpu_reset); |
12380 | |
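/*
 * Per the INIT/SIPI protocol, a SIPI with vector V starts the AP in real
 * mode at CS:IP = (V << 8):0000, i.e. at physical address V << 12.  For
 * example, vector 0x10 yields CS.selector 0x1000, CS.base 0x10000 and
 * RIP 0, so the AP begins executing at physical address 0x10000.
 */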
12381 | void kvm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) |
12382 | { |
12383 | struct kvm_segment cs; |
12384 | |
	kvm_get_segment(vcpu, &cs, VCPU_SREG_CS);
12386 | cs.selector = vector << 8; |
12387 | cs.base = vector << 12; |
	kvm_set_segment(vcpu, &cs, VCPU_SREG_CS);
	kvm_rip_write(vcpu, 0);
12390 | } |
12391 | EXPORT_SYMBOL_GPL(kvm_vcpu_deliver_sipi_vector); |
12392 | |
12393 | int kvm_arch_hardware_enable(void) |
12394 | { |
12395 | struct kvm *kvm; |
12396 | struct kvm_vcpu *vcpu; |
12397 | unsigned long i; |
12398 | int ret; |
12399 | u64 local_tsc; |
12400 | u64 max_tsc = 0; |
12401 | bool stable, backwards_tsc = false; |
12402 | |
12403 | kvm_user_return_msr_cpu_online(); |
12404 | |
12405 | ret = kvm_x86_check_processor_compatibility(); |
12406 | if (ret) |
12407 | return ret; |
12408 | |
12409 | ret = static_call(kvm_x86_hardware_enable)(); |
12410 | if (ret != 0) |
12411 | return ret; |
12412 | |
12413 | local_tsc = rdtsc(); |
12414 | stable = !kvm_check_tsc_unstable(); |
12415 | list_for_each_entry(kvm, &vm_list, vm_list) { |
12416 | kvm_for_each_vcpu(i, vcpu, kvm) { |
12417 | if (!stable && vcpu->cpu == smp_processor_id()) |
12418 | kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu); |
12419 | if (stable && vcpu->arch.last_host_tsc > local_tsc) { |
12420 | backwards_tsc = true; |
12421 | if (vcpu->arch.last_host_tsc > max_tsc) |
12422 | max_tsc = vcpu->arch.last_host_tsc; |
12423 | } |
12424 | } |
12425 | } |
12426 | |
12427 | /* |
12428 | * Sometimes, even reliable TSCs go backwards. This happens on |
12429 | * platforms that reset TSC during suspend or hibernate actions, but |
12430 | * maintain synchronization. We must compensate. Fortunately, we can |
12431 | * detect that condition here, which happens early in CPU bringup, |
12432 | * before any KVM threads can be running. Unfortunately, we can't |
12433 | * bring the TSCs fully up to date with real time, as we aren't yet far |
12434 | * enough into CPU bringup that we know how much real time has actually |
12435 | * elapsed; our helper function, ktime_get_boottime_ns() will be using boot |
12436 | * variables that haven't been updated yet. |
12437 | * |
12438 | * So we simply find the maximum observed TSC above, then record the |
12439 | * adjustment to TSC in each VCPU. When the VCPU later gets loaded, |
12440 | * the adjustment will be applied. Note that we accumulate |
12441 | * adjustments, in case multiple suspend cycles happen before some VCPU |
12442 | * gets a chance to run again. In the event that no KVM threads get a |
12443 | * chance to run, we will miss the entire elapsed period, as we'll have |
12444 | * reset last_host_tsc, so VCPUs will not have the TSC adjusted and may |
	 * lose cycle time.  This isn't too big a deal, since the loss will be
12446 | * uniform across all VCPUs (not to mention the scenario is extremely |
12447 | * unlikely). It is possible that a second hibernate recovery happens |
12448 | * much faster than a first, causing the observed TSC here to be |
12449 | * smaller; this would require additional padding adjustment, which is |
12450 | * why we set last_host_tsc to the local tsc observed here. |
12451 | * |
12452 | * N.B. - this code below runs only on platforms with reliable TSC, |
12453 | * as that is the only way backwards_tsc is set above. Also note |
12454 | * that this runs for ALL vcpus, which is not a bug; all VCPUs should |
12455 | * have the same delta_cyc adjustment applied if backwards_tsc |
12456 | * is detected. Note further, this adjustment is only done once, |
12457 | * as we reset last_host_tsc on all VCPUs to stop this from being |
12458 | * called multiple times (one for each physical CPU bringup). |
12459 | * |
12460 | * Platforms with unreliable TSCs don't have to deal with this, they |
12461 | * will be compensated by the logic in vcpu_load, which sets the TSC to |
12462 | * catchup mode. This will catchup all VCPUs to real time, but cannot |
12463 | * guarantee that they stay in perfect synchronization. |
12464 | */ |
12465 | if (backwards_tsc) { |
12466 | u64 delta_cyc = max_tsc - local_tsc; |
12467 | list_for_each_entry(kvm, &vm_list, vm_list) { |
12468 | kvm->arch.backwards_tsc_observed = true; |
12469 | kvm_for_each_vcpu(i, vcpu, kvm) { |
12470 | vcpu->arch.tsc_offset_adjustment += delta_cyc; |
12471 | vcpu->arch.last_host_tsc = local_tsc; |
12472 | kvm_make_request(KVM_REQ_MASTERCLOCK_UPDATE, vcpu); |
12473 | } |
12474 | |
			/*
			 * We have to disable TSC offset matching here: booting
			 * a VM while the host goes through an S4 suspend could
			 * otherwise match new guest TSC writes against stale,
			 * pre-suspend values.  Solving this properly is left
			 * as an exercise to the reader.
			 */
12481 | kvm->arch.last_tsc_nsec = 0; |
12482 | kvm->arch.last_tsc_write = 0; |
12483 | } |
12484 | |
12485 | } |
12486 | return 0; |
12487 | } |
12488 | |
12489 | void kvm_arch_hardware_disable(void) |
12490 | { |
12491 | static_call(kvm_x86_hardware_disable)(); |
12492 | drop_user_return_notifiers(); |
12493 | } |
12494 | |
12495 | bool kvm_vcpu_is_reset_bsp(struct kvm_vcpu *vcpu) |
12496 | { |
12497 | return vcpu->kvm->arch.bsp_vcpu_id == vcpu->vcpu_id; |
12498 | } |
12499 | |
12500 | bool kvm_vcpu_is_bsp(struct kvm_vcpu *vcpu) |
12501 | { |
12502 | return (vcpu->arch.apic_base & MSR_IA32_APICBASE_BSP) != 0; |
12503 | } |
12504 | |
12505 | void kvm_arch_sched_in(struct kvm_vcpu *vcpu, int cpu) |
12506 | { |
12507 | struct kvm_pmu *pmu = vcpu_to_pmu(vcpu); |
12508 | |
12509 | vcpu->arch.l1tf_flush_l1d = true; |
12510 | if (pmu->version && unlikely(pmu->event_count)) { |
12511 | pmu->need_cleanup = true; |
12512 | kvm_make_request(KVM_REQ_PMU, vcpu); |
12513 | } |
12514 | static_call(kvm_x86_sched_in)(vcpu, cpu); |
12515 | } |
12516 | |
12517 | void kvm_arch_free_vm(struct kvm *kvm) |
12518 | { |
12519 | #if IS_ENABLED(CONFIG_HYPERV) |
	kfree(kvm->arch.hv_pa_pg);
12521 | #endif |
12522 | __kvm_arch_free_vm(kvm); |
12523 | } |
12524 | |
12525 | |
12526 | int kvm_arch_init_vm(struct kvm *kvm, unsigned long type) |
12527 | { |
12528 | int ret; |
12529 | unsigned long flags; |
12530 | |
12531 | if (!kvm_is_vm_type_supported(type)) |
12532 | return -EINVAL; |
12533 | |
12534 | kvm->arch.vm_type = type; |
12535 | |
12536 | ret = kvm_page_track_init(kvm); |
12537 | if (ret) |
12538 | goto out; |
12539 | |
12540 | kvm_mmu_init_vm(kvm); |
12541 | |
12542 | ret = static_call(kvm_x86_vm_init)(kvm); |
12543 | if (ret) |
12544 | goto out_uninit_mmu; |
12545 | |
12546 | INIT_HLIST_HEAD(&kvm->arch.mask_notifier_list); |
	atomic_set(&kvm->arch.noncoherent_dma_count, 0);
12548 | |
12549 | /* Reserve bit 0 of irq_sources_bitmap for userspace irq source */ |
	set_bit(KVM_USERSPACE_IRQ_SOURCE_ID, &kvm->arch.irq_sources_bitmap);
	/* Reserve bit 1 of irq_sources_bitmap for irqfd-resampler */
	set_bit(KVM_IRQFD_RESAMPLE_IRQ_SOURCE_ID,
		&kvm->arch.irq_sources_bitmap);
12554 | |
12555 | raw_spin_lock_init(&kvm->arch.tsc_write_lock); |
12556 | mutex_init(&kvm->arch.apic_map_lock); |
12557 | seqcount_raw_spinlock_init(&kvm->arch.pvclock_sc, &kvm->arch.tsc_write_lock); |
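	/* Bias kvmclock so that the guest-visible clock starts near zero. */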
12558 | kvm->arch.kvmclock_offset = -get_kvmclock_base_ns(); |
12559 | |
12560 | raw_spin_lock_irqsave(&kvm->arch.tsc_write_lock, flags); |
12561 | pvclock_update_vm_gtod_copy(kvm); |
12562 | raw_spin_unlock_irqrestore(&kvm->arch.tsc_write_lock, flags); |
12563 | |
12564 | kvm->arch.default_tsc_khz = max_tsc_khz ? : tsc_khz; |
12565 | kvm->arch.guest_can_read_msr_platform_info = true; |
12566 | kvm->arch.enable_pmu = enable_pmu; |
12567 | |
12568 | #if IS_ENABLED(CONFIG_HYPERV) |
12569 | spin_lock_init(&kvm->arch.hv_root_tdp_lock); |
12570 | kvm->arch.hv_root_tdp = INVALID_PAGE; |
12571 | #endif |
12572 | |
12573 | INIT_DELAYED_WORK(&kvm->arch.kvmclock_update_work, kvmclock_update_fn); |
12574 | INIT_DELAYED_WORK(&kvm->arch.kvmclock_sync_work, kvmclock_sync_fn); |
12575 | |
12576 | kvm_apicv_init(kvm); |
12577 | kvm_hv_init_vm(kvm); |
12578 | kvm_xen_init_vm(kvm); |
12579 | |
12580 | return 0; |
12581 | |
12582 | out_uninit_mmu: |
12583 | kvm_mmu_uninit_vm(kvm); |
12584 | kvm_page_track_cleanup(kvm); |
12585 | out: |
12586 | return ret; |
12587 | } |
12588 | |
12589 | int kvm_arch_post_init_vm(struct kvm *kvm) |
12590 | { |
12591 | return kvm_mmu_post_init_vm(kvm); |
12592 | } |
12593 | |
12594 | static void kvm_unload_vcpu_mmu(struct kvm_vcpu *vcpu) |
12595 | { |
12596 | vcpu_load(vcpu); |
12597 | kvm_mmu_unload(vcpu); |
12598 | vcpu_put(vcpu); |
12599 | } |
12600 | |
12601 | static void kvm_unload_vcpu_mmus(struct kvm *kvm) |
12602 | { |
12603 | unsigned long i; |
12604 | struct kvm_vcpu *vcpu; |
12605 | |
12606 | kvm_for_each_vcpu(i, vcpu, kvm) { |
12607 | kvm_clear_async_pf_completion_queue(vcpu); |
12608 | kvm_unload_vcpu_mmu(vcpu); |
12609 | } |
12610 | } |
12611 | |
12612 | void kvm_arch_sync_events(struct kvm *kvm) |
12613 | { |
	cancel_delayed_work_sync(&kvm->arch.kvmclock_sync_work);
	cancel_delayed_work_sync(&kvm->arch.kvmclock_update_work);
12616 | kvm_free_pit(kvm); |
12617 | } |
12618 | |
12619 | /** |
12620 | * __x86_set_memory_region: Setup KVM internal memory slot |
12621 | * |
12622 | * @kvm: the kvm pointer to the VM. |
12623 | * @id: the slot ID to setup. |
12624 | * @gpa: the GPA to install the slot (unused when @size == 0). |
12625 | * @size: the size of the slot. Set to zero to uninstall a slot. |
12626 | * |
12627 | * This function helps to setup a KVM internal memory slot. Specify |
12628 | * @size > 0 to install a new slot, while @size == 0 to uninstall a |
12629 | * slot. The return code can be one of the following: |
12630 | * |
12631 | * HVA: on success (uninstall will return a bogus HVA) |
12632 | * -errno: on error |
12633 | * |
12634 | * The caller should always use IS_ERR() to check the return value |
12635 | * before use. Note, the KVM internal memory slots are guaranteed to |
12636 | * remain valid and unchanged until the VM is destroyed, i.e., the |
12637 | * GPA->HVA translation will not change. However, the HVA is a user |
12638 | * address, i.e. its accessibility is not guaranteed, and must be |
12639 | * accessed via __copy_{to,from}_user(). |
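 *
 * A typical use (sketched from vmx_set_tss_addr()) looks like:
 *
 *	ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, addr,
 *				      PAGE_SIZE * 3);
 *	if (IS_ERR(ret))
 *		return PTR_ERR(ret);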
12640 | */ |
12641 | void __user * __x86_set_memory_region(struct kvm *kvm, int id, gpa_t gpa, |
12642 | u32 size) |
12643 | { |
12644 | int i, r; |
12645 | unsigned long hva, old_npages; |
12646 | struct kvm_memslots *slots = kvm_memslots(kvm); |
12647 | struct kvm_memory_slot *slot; |
12648 | |
12649 | /* Called with kvm->slots_lock held. */ |
12650 | if (WARN_ON(id >= KVM_MEM_SLOTS_NUM)) |
12651 | return ERR_PTR_USR(-EINVAL); |
12652 | |
12653 | slot = id_to_memslot(slots, id); |
12654 | if (size) { |
12655 | if (slot && slot->npages) |
12656 | return ERR_PTR_USR(-EEXIST); |
12657 | |
12658 | /* |
12659 | * MAP_SHARED to prevent internal slot pages from being moved |
12660 | * by fork()/COW. |
12661 | */ |
12662 | hva = vm_mmap(NULL, 0, size, PROT_READ | PROT_WRITE, |
12663 | MAP_SHARED | MAP_ANONYMOUS, 0); |
12664 | if (IS_ERR_VALUE(hva)) |
12665 | return (void __user *)hva; |
12666 | } else { |
12667 | if (!slot || !slot->npages) |
12668 | return NULL; |
12669 | |
12670 | old_npages = slot->npages; |
12671 | hva = slot->userspace_addr; |
12672 | } |
12673 | |
12674 | for (i = 0; i < kvm_arch_nr_memslot_as_ids(kvm); i++) { |
12675 | struct kvm_userspace_memory_region2 m; |
12676 | |
12677 | m.slot = id | (i << 16); |
12678 | m.flags = 0; |
12679 | m.guest_phys_addr = gpa; |
12680 | m.userspace_addr = hva; |
12681 | m.memory_size = size; |
		r = __kvm_set_memory_region(kvm, &m);
12683 | if (r < 0) |
12684 | return ERR_PTR_USR(r); |
12685 | } |
12686 | |
12687 | if (!size) |
12688 | vm_munmap(hva, old_npages * PAGE_SIZE); |
12689 | |
12690 | return (void __user *)hva; |
12691 | } |
12692 | EXPORT_SYMBOL_GPL(__x86_set_memory_region); |
12693 | |
12694 | void kvm_arch_pre_destroy_vm(struct kvm *kvm) |
12695 | { |
12696 | kvm_mmu_pre_destroy_vm(kvm); |
12697 | } |
12698 | |
12699 | void kvm_arch_destroy_vm(struct kvm *kvm) |
12700 | { |
12701 | if (current->mm == kvm->mm) { |
12702 | /* |
12703 | * Free memory regions allocated on behalf of userspace, |
12704 | * unless the memory map has changed due to process exit |
12705 | * or fd copying. |
12706 | */ |
12707 | mutex_lock(&kvm->slots_lock); |
12708 | __x86_set_memory_region(kvm, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT, |
12709 | 0, 0); |
12710 | __x86_set_memory_region(kvm, IDENTITY_PAGETABLE_PRIVATE_MEMSLOT, |
12711 | 0, 0); |
12712 | __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, 0, 0); |
		mutex_unlock(&kvm->slots_lock);
12714 | } |
12715 | kvm_unload_vcpu_mmus(kvm); |
12716 | static_call_cond(kvm_x86_vm_destroy)(kvm); |
12717 | kvm_free_msr_filter(srcu_dereference_check(kvm->arch.msr_filter, &kvm->srcu, 1)); |
12718 | kvm_pic_destroy(kvm); |
12719 | kvm_ioapic_destroy(kvm); |
12720 | kvm_destroy_vcpus(kvm); |
12721 | kvfree(rcu_dereference_check(kvm->arch.apic_map, 1)); |
12722 | kfree(srcu_dereference_check(kvm->arch.pmu_event_filter, &kvm->srcu, 1)); |
12723 | kvm_mmu_uninit_vm(kvm); |
12724 | kvm_page_track_cleanup(kvm); |
12725 | kvm_xen_destroy_vm(kvm); |
12726 | kvm_hv_destroy_vm(kvm); |
12727 | } |
12728 | |
12729 | static void memslot_rmap_free(struct kvm_memory_slot *slot) |
12730 | { |
12731 | int i; |
12732 | |
12733 | for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { |
		kvfree(slot->arch.rmap[i]);
12735 | slot->arch.rmap[i] = NULL; |
12736 | } |
12737 | } |
12738 | |
12739 | void kvm_arch_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot) |
12740 | { |
12741 | int i; |
12742 | |
12743 | memslot_rmap_free(slot); |
12744 | |
12745 | for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { |
		kvfree(slot->arch.lpage_info[i - 1]);
12747 | slot->arch.lpage_info[i - 1] = NULL; |
12748 | } |
12749 | |
12750 | kvm_page_track_free_memslot(slot); |
12751 | } |
12752 | |
12753 | int memslot_rmap_alloc(struct kvm_memory_slot *slot, unsigned long npages) |
12754 | { |
12755 | const int sz = sizeof(*slot->arch.rmap[0]); |
12756 | int i; |
12757 | |
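	/*
	 * Allocate one rmap entry per potential mapping at each supported
	 * page size.  As a worked example, a 1GiB slot (npages == 262144)
	 * needs 262144 entries at the 4KiB level, 512 at 2MiB and 1 at
	 * 1GiB, plus whatever padding __kvm_mmu_slot_lpages() adds for
	 * slots that aren't huge-page aligned.
	 */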
12758 | for (i = 0; i < KVM_NR_PAGE_SIZES; ++i) { |
12759 | int level = i + 1; |
12760 | int lpages = __kvm_mmu_slot_lpages(slot, npages, level); |
12761 | |
12762 | if (slot->arch.rmap[i]) |
12763 | continue; |
12764 | |
		slot->arch.rmap[i] = __vcalloc(lpages, sz, GFP_KERNEL_ACCOUNT);
12766 | if (!slot->arch.rmap[i]) { |
12767 | memslot_rmap_free(slot); |
12768 | return -ENOMEM; |
12769 | } |
12770 | } |
12771 | |
12772 | return 0; |
12773 | } |
12774 | |
12775 | static int kvm_alloc_memslot_metadata(struct kvm *kvm, |
12776 | struct kvm_memory_slot *slot) |
12777 | { |
12778 | unsigned long npages = slot->npages; |
12779 | int i, r; |
12780 | |
12781 | /* |
12782 | * Clear out the previous array pointers for the KVM_MR_MOVE case. The |
12783 | * old arrays will be freed by __kvm_set_memory_region() if installing |
12784 | * the new memslot is successful. |
12785 | */ |
12786 | memset(&slot->arch, 0, sizeof(slot->arch)); |
12787 | |
12788 | if (kvm_memslots_have_rmaps(kvm)) { |
12789 | r = memslot_rmap_alloc(slot, npages); |
12790 | if (r) |
12791 | return r; |
12792 | } |
12793 | |
12794 | for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { |
12795 | struct kvm_lpage_info *linfo; |
12796 | unsigned long ugfn; |
12797 | int lpages; |
12798 | int level = i + 1; |
12799 | |
12800 | lpages = __kvm_mmu_slot_lpages(slot, npages, level); |
12801 | |
		linfo = __vcalloc(lpages, sizeof(*linfo), GFP_KERNEL_ACCOUNT);
12803 | if (!linfo) |
12804 | goto out_free; |
12805 | |
12806 | slot->arch.lpage_info[i - 1] = linfo; |
12807 | |
12808 | if (slot->base_gfn & (KVM_PAGES_PER_HPAGE(level) - 1)) |
12809 | linfo[0].disallow_lpage = 1; |
12810 | if ((slot->base_gfn + npages) & (KVM_PAGES_PER_HPAGE(level) - 1)) |
12811 | linfo[lpages - 1].disallow_lpage = 1; |
12812 | ugfn = slot->userspace_addr >> PAGE_SHIFT; |
12813 | /* |
12814 | * If the gfn and userspace address are not aligned wrt each |
12815 | * other, disable large page support for this slot. |
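		 * E.g. if base_gfn is 2MiB-aligned but userspace_addr is only
		 * 4KiB-aligned, no guest 2MiB page lines up with a host 2MiB
		 * page, so every 2MiB mapping in the slot is disallowed.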
12816 | */ |
12817 | if ((slot->base_gfn ^ ugfn) & (KVM_PAGES_PER_HPAGE(level) - 1)) { |
12818 | unsigned long j; |
12819 | |
12820 | for (j = 0; j < lpages; ++j) |
12821 | linfo[j].disallow_lpage = 1; |
12822 | } |
12823 | } |
12824 | |
12825 | #ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES |
12826 | kvm_mmu_init_memslot_memory_attributes(kvm, slot); |
12827 | #endif |
12828 | |
12829 | if (kvm_page_track_create_memslot(kvm, slot, npages)) |
12830 | goto out_free; |
12831 | |
12832 | return 0; |
12833 | |
12834 | out_free: |
12835 | memslot_rmap_free(slot); |
12836 | |
12837 | for (i = 1; i < KVM_NR_PAGE_SIZES; ++i) { |
		kvfree(slot->arch.lpage_info[i - 1]);
12839 | slot->arch.lpage_info[i - 1] = NULL; |
12840 | } |
12841 | return -ENOMEM; |
12842 | } |

void kvm_arch_memslots_updated(struct kvm *kvm, u64 gen)
{
	struct kvm_vcpu *vcpu;
	unsigned long i;

	/*
	 * memslots->generation has been incremented.
	 * mmio generation may have reached its maximum value.
	 */
	kvm_mmu_invalidate_mmio_sptes(kvm, gen);

	/* Force re-initialization of steal_time cache */
	kvm_for_each_vcpu(i, vcpu, kvm)
		kvm_vcpu_kick(vcpu);
}

int kvm_arch_prepare_memory_region(struct kvm *kvm,
				   const struct kvm_memory_slot *old,
				   struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	/*
	 * KVM doesn't support moving memslots when there are external page
	 * trackers attached to the VM, i.e. if KVMGT is in use.
	 */
	if (change == KVM_MR_MOVE && kvm_page_track_has_external_user(kvm))
		return -EINVAL;

	if (change == KVM_MR_CREATE || change == KVM_MR_MOVE) {
		if ((new->base_gfn + new->npages - 1) > kvm_mmu_max_gfn())
			return -EINVAL;

		return kvm_alloc_memslot_metadata(kvm, new);
	}

	if (change == KVM_MR_FLAGS_ONLY)
		memcpy(&new->arch, &old->arch, sizeof(old->arch));
	else if (WARN_ON_ONCE(change != KVM_MR_DELETE))
		return -EIO;

	return 0;
}

static void kvm_mmu_update_cpu_dirty_logging(struct kvm *kvm, bool enable)
{
	int nr_slots;

	if (!kvm_x86_ops.cpu_dirty_log_size)
		return;

	nr_slots = atomic_read(&kvm->nr_memslots_dirty_logging);
	if ((enable && nr_slots == 1) || !nr_slots)
		kvm_make_all_cpus_request(kvm, KVM_REQ_UPDATE_CPU_DIRTY_LOGGING);
}

static void kvm_mmu_slot_apply_flags(struct kvm *kvm,
				     struct kvm_memory_slot *old,
				     const struct kvm_memory_slot *new,
				     enum kvm_mr_change change)
{
	u32 old_flags = old ? old->flags : 0;
	u32 new_flags = new ? new->flags : 0;
	bool log_dirty_pages = new_flags & KVM_MEM_LOG_DIRTY_PAGES;

	/*
	 * Update CPU dirty logging if dirty logging is being toggled.  This
	 * applies to all operations.
	 */
	if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)
		kvm_mmu_update_cpu_dirty_logging(kvm, log_dirty_pages);

	/*
	 * Nothing more to do for RO slots (which can't be dirtied and can't be
	 * made writable) or CREATE/MOVE/DELETE of a slot.
	 *
	 * For a memslot with dirty logging disabled:
	 * CREATE:      No dirty mappings will already exist.
	 * MOVE/DELETE: The old mappings will already have been cleaned up by
	 *		kvm_arch_flush_shadow_memslot()
	 *
	 * For a memslot with dirty logging enabled:
	 * CREATE:      No shadow pages exist, thus nothing to write-protect
	 *		and no dirty bits to clear.
	 * MOVE/DELETE: The old mappings will already have been cleaned up by
	 *		kvm_arch_flush_shadow_memslot().
	 */
	if ((change != KVM_MR_FLAGS_ONLY) || (new_flags & KVM_MEM_READONLY))
		return;

	/*
	 * READONLY and non-flags changes were filtered out above, and the only
	 * other flag is LOG_DIRTY_PAGES, i.e. something is wrong if dirty
	 * logging isn't being toggled on or off.
	 */
	if (WARN_ON_ONCE(!((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES)))
		return;

	if (!log_dirty_pages) {
		/*
		 * Dirty logging tracks sptes in 4k granularity, meaning that
		 * large sptes have to be split.  If live migration succeeds,
		 * the guest in the source machine will be destroyed and large
		 * sptes will be created in the destination.  However, if the
		 * guest continues to run in the source machine (for example if
		 * live migration fails), small sptes will remain around and
		 * cause bad performance.
		 *
		 * Scan sptes if dirty logging has been stopped, dropping those
		 * which can be collapsed into a single large-page spte.  Later
		 * page faults will create the large-page sptes.
		 */
		kvm_mmu_zap_collapsible_sptes(kvm, new);
	} else {
		/*
		 * Initially-all-set does not require write protecting any page,
		 * because they're all assumed to be dirty.
		 */
		if (kvm_dirty_log_manual_protect_and_init_set(kvm))
			return;

		if (READ_ONCE(eager_page_split))
			kvm_mmu_slot_try_split_huge_pages(kvm, new, PG_LEVEL_4K);

		if (kvm_x86_ops.cpu_dirty_log_size) {
			kvm_mmu_slot_leaf_clear_dirty(kvm, new);
			kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_2M);
		} else {
			kvm_mmu_slot_remove_write_access(kvm, new, PG_LEVEL_4K);
		}

		/*
		 * Unconditionally flush the TLBs after enabling dirty logging.
		 * A flush is almost always going to be necessary (see below),
		 * and unconditionally flushing allows the helpers to omit
		 * the subtly complex checks when removing write access.
		 *
		 * Do the flush outside of mmu_lock to reduce the amount of
		 * time mmu_lock is held.  Flushing after dropping mmu_lock is
		 * safe as KVM only needs to guarantee the slot is fully
		 * write-protected before returning to userspace, i.e. before
		 * userspace can consume the dirty status.
		 *
		 * Flushing outside of mmu_lock requires KVM to be careful when
		 * making decisions based on writable status of an SPTE, e.g. a
		 * !writable SPTE doesn't guarantee a CPU can't perform writes.
		 *
		 * Specifically, KVM also write-protects guest page tables to
		 * monitor changes when using shadow paging, and must guarantee
		 * no CPUs can write to those pages before mmu_lock is dropped.
		 * Because CPUs may have stale TLB entries at this point, a
		 * !writable SPTE doesn't guarantee CPUs can't perform writes.
		 *
		 * KVM also allows making SPTES writable outside of mmu_lock,
		 * e.g. to allow dirty logging without taking mmu_lock.
		 *
		 * To handle these scenarios, KVM uses a separate software-only
		 * bit (MMU-writable) to track if a SPTE is !writable due to
		 * a guest page table being write-protected (KVM clears the
		 * MMU-writable flag when write-protecting for shadow paging).
		 *
		 * The use of MMU-writable is also the primary motivation for
		 * the unconditional flush.  Because KVM must guarantee that a
		 * CPU doesn't contain stale, writable TLB entries for a
		 * !MMU-writable SPTE, KVM must flush if it encounters any
		 * MMU-writable SPTE regardless of whether the actual hardware
		 * writable bit was set.  I.e. KVM is almost guaranteed to need
		 * to flush, while unconditionally flushing allows the "remove
		 * write access" helpers to ignore MMU-writable entirely.
		 *
		 * See is_writable_pte() for more details (the case involving
		 * access-tracked SPTEs is particularly relevant).
		 */
		kvm_flush_remote_tlbs_memslot(kvm, new);
	}
}
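
/*
 * Illustrative sketch (hypothetical helper, for exposition only): the XOR
 * test used by kvm_mmu_slot_apply_flags() reduces "was dirty logging
 * toggled?" to a single bit comparison between the old and new flags.
 */
static inline bool example_dirty_log_toggled(u32 old_flags, u32 new_flags)
{
	/* The bit differs between old and new iff logging was turned on or off. */
	return (old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES;
}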

void kvm_arch_commit_memory_region(struct kvm *kvm,
				   struct kvm_memory_slot *old,
				   const struct kvm_memory_slot *new,
				   enum kvm_mr_change change)
{
	if (change == KVM_MR_DELETE)
		kvm_page_track_delete_slot(kvm, old);

	if (!kvm->arch.n_requested_mmu_pages &&
	    (change == KVM_MR_CREATE || change == KVM_MR_DELETE)) {
		unsigned long nr_mmu_pages;

		nr_mmu_pages = kvm->nr_memslot_pages / KVM_MEMSLOT_PAGES_TO_MMU_PAGES_RATIO;
		nr_mmu_pages = max(nr_mmu_pages, KVM_MIN_ALLOC_MMU_PAGES);
		kvm_mmu_change_mmu_pages(kvm, nr_mmu_pages);
	}

	kvm_mmu_slot_apply_flags(kvm, old, new, change);

	/* Free the arrays associated with the old memslot. */
	if (change == KVM_MR_MOVE)
		kvm_arch_free_memslot(kvm, old);
}

static inline bool kvm_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
{
	return (is_guest_mode(vcpu) &&
		static_call(kvm_x86_guest_apic_has_interrupt)(vcpu));
}

static inline bool kvm_vcpu_has_events(struct kvm_vcpu *vcpu)
{
	if (!list_empty_careful(&vcpu->async_pf.done))
		return true;

	if (kvm_apic_has_pending_init_or_sipi(vcpu) &&
	    kvm_apic_init_sipi_allowed(vcpu))
		return true;

	if (vcpu->arch.pv.pv_unhalted)
		return true;

	if (kvm_is_exception_pending(vcpu))
		return true;

	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
	    (vcpu->arch.nmi_pending &&
	     static_call(kvm_x86_nmi_allowed)(vcpu, false)))
		return true;

#ifdef CONFIG_KVM_SMM
	if (kvm_test_request(KVM_REQ_SMI, vcpu) ||
	    (vcpu->arch.smi_pending &&
	     static_call(kvm_x86_smi_allowed)(vcpu, false)))
		return true;
#endif

	if (kvm_test_request(KVM_REQ_PMI, vcpu))
		return true;

	if (kvm_arch_interrupt_allowed(vcpu) &&
	    (kvm_cpu_has_interrupt(vcpu) ||
	     kvm_guest_apic_has_interrupt(vcpu)))
		return true;

	if (kvm_hv_has_stimer_pending(vcpu))
		return true;

	if (is_guest_mode(vcpu) &&
	    kvm_x86_ops.nested_ops->has_events &&
	    kvm_x86_ops.nested_ops->has_events(vcpu))
		return true;

	if (kvm_xen_has_pending_events(vcpu))
		return true;

	return false;
}

int kvm_arch_vcpu_runnable(struct kvm_vcpu *vcpu)
{
	return kvm_vcpu_running(vcpu) || kvm_vcpu_has_events(vcpu);
}

bool kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
{
	return kvm_vcpu_apicv_active(vcpu) &&
	       static_call(kvm_x86_dy_apicv_has_pending_interrupt)(vcpu);
}

bool kvm_arch_vcpu_preempted_in_kernel(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.preempted_in_kernel;
}

bool kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
{
	if (READ_ONCE(vcpu->arch.pv.pv_unhalted))
		return true;

	if (kvm_test_request(KVM_REQ_NMI, vcpu) ||
#ifdef CONFIG_KVM_SMM
	    kvm_test_request(KVM_REQ_SMI, vcpu) ||
#endif
	    kvm_test_request(KVM_REQ_EVENT, vcpu))
		return true;

	return kvm_arch_dy_has_pending_interrupt(vcpu);
}

bool kvm_arch_vcpu_in_kernel(struct kvm_vcpu *vcpu)
{
	if (vcpu->arch.guest_state_protected)
		return true;

	return static_call(kvm_x86_get_cpl)(vcpu) == 0;
}

unsigned long kvm_arch_vcpu_get_ip(struct kvm_vcpu *vcpu)
{
	return kvm_rip_read(vcpu);
}

int kvm_arch_vcpu_should_kick(struct kvm_vcpu *vcpu)
{
	return kvm_vcpu_exiting_guest_mode(vcpu) == IN_GUEST_MODE;
}

int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu)
{
	return static_call(kvm_x86_interrupt_allowed)(vcpu, false);
}

unsigned long kvm_get_linear_rip(struct kvm_vcpu *vcpu)
{
	/* Can't read the RIP when guest state is protected, just return 0 */
	if (vcpu->arch.guest_state_protected)
		return 0;

	if (is_64_bit_mode(vcpu))
		return kvm_rip_read(vcpu);
	return (u32)(get_segment_base(vcpu, VCPU_SREG_CS) +
		     kvm_rip_read(vcpu));
}
EXPORT_SYMBOL_GPL(kvm_get_linear_rip);
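
/*
 * Worked example (hypothetical values, for illustration only): outside of
 * 64-bit mode the linear RIP is CS.base + RIP truncated to 32 bits, so
 * CS.base = 0xf0000 and RIP = 0xfff0 yield 0xffff0, the classic reset
 * vector.  In 64-bit mode CS.base is architecturally 0, hence the bare
 * kvm_rip_read() above.
 */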

bool kvm_is_linear_rip(struct kvm_vcpu *vcpu, unsigned long linear_rip)
{
	return kvm_get_linear_rip(vcpu) == linear_rip;
}
EXPORT_SYMBOL_GPL(kvm_is_linear_rip);

unsigned long kvm_get_rflags(struct kvm_vcpu *vcpu)
{
	unsigned long rflags;

	rflags = static_call(kvm_x86_get_rflags)(vcpu);
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
		rflags &= ~X86_EFLAGS_TF;
	return rflags;
}
EXPORT_SYMBOL_GPL(kvm_get_rflags);

static void __kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP &&
	    kvm_is_linear_rip(vcpu, vcpu->arch.singlestep_rip))
		rflags |= X86_EFLAGS_TF;
	static_call(kvm_x86_set_rflags)(vcpu, rflags);
}

void kvm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags)
{
	__kvm_set_rflags(vcpu, rflags);
	kvm_make_request(KVM_REQ_EVENT, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_set_rflags);

static inline u32 kvm_async_pf_hash_fn(gfn_t gfn)
{
	BUILD_BUG_ON(!is_power_of_2(ASYNC_PF_PER_VCPU));

	return hash_32(gfn & 0xffffffff, order_base_2(ASYNC_PF_PER_VCPU));
}

static inline u32 kvm_async_pf_next_probe(u32 key)
{
	return (key + 1) & (ASYNC_PF_PER_VCPU - 1);
}

static void kvm_add_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	u32 key = kvm_async_pf_hash_fn(gfn);

	while (vcpu->arch.apf.gfns[key] != ~0)
		key = kvm_async_pf_next_probe(key);

	vcpu->arch.apf.gfns[key] = gfn;
}

static u32 kvm_async_pf_gfn_slot(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	int i;
	u32 key = kvm_async_pf_hash_fn(gfn);

	for (i = 0; i < ASYNC_PF_PER_VCPU &&
	     (vcpu->arch.apf.gfns[key] != gfn &&
	      vcpu->arch.apf.gfns[key] != ~0); i++)
		key = kvm_async_pf_next_probe(key);

	return key;
}

bool kvm_find_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	return vcpu->arch.apf.gfns[kvm_async_pf_gfn_slot(vcpu, gfn)] == gfn;
}

static void kvm_del_async_pf_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
{
	u32 i, j, k;

	i = j = kvm_async_pf_gfn_slot(vcpu, gfn);

	if (WARN_ON_ONCE(vcpu->arch.apf.gfns[i] != gfn))
		return;

	while (true) {
		vcpu->arch.apf.gfns[i] = ~0;
		do {
			j = kvm_async_pf_next_probe(j);
			if (vcpu->arch.apf.gfns[j] == ~0)
				return;
			k = kvm_async_pf_hash_fn(vcpu->arch.apf.gfns[j]);
			/*
			 * k lies cyclically in ]i,j]
			 * |    i.k.j |
			 * |....j i.k.| or  |.k..j i...|
			 */
		} while ((i <= j) ? (i < k && k <= j) : (i < k || k <= j));
		vcpu->arch.apf.gfns[i] = vcpu->arch.apf.gfns[j];
		i = j;
	}
}
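
/*
 * Illustrative helper (factored out here purely for exposition; KVM keeps
 * the predicate inline above): "does slot k's natural hash position lie
 * cyclically in (i, j]?".  If it does, entry j must stay put; if it does
 * not, entry j can be moved back into the hole at i without breaking its
 * probe chain, which is exactly the backward-shift deletion performed by
 * kvm_del_async_pf_gfn().
 */
static inline bool example_hash_in_cyclic_range(u32 i, u32 j, u32 k)
{
	return (i <= j) ? (i < k && k <= j) : (i < k || k <= j);
}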

static inline int apf_put_user_notpresent(struct kvm_vcpu *vcpu)
{
	u32 reason = KVM_PV_REASON_PAGE_NOT_PRESENT;

	return kvm_write_guest_cached(vcpu->kvm, &vcpu->arch.apf.data, &reason,
				      sizeof(reason));
}

static inline int apf_put_user_ready(struct kvm_vcpu *vcpu, u32 token)
{
	unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);

	return kvm_write_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
					     &token, offset, sizeof(token));
}

static inline bool apf_pageready_slot_free(struct kvm_vcpu *vcpu)
{
	unsigned int offset = offsetof(struct kvm_vcpu_pv_apf_data, token);
	u32 val;

	if (kvm_read_guest_offset_cached(vcpu->kvm, &vcpu->arch.apf.data,
					 &val, offset, sizeof(val)))
		return false;

	return !val;
}

static bool kvm_can_deliver_async_pf(struct kvm_vcpu *vcpu)
{
	if (!kvm_pv_async_pf_enabled(vcpu))
		return false;

	if (vcpu->arch.apf.send_user_only &&
	    static_call(kvm_x86_get_cpl)(vcpu) == 0)
		return false;

	if (is_guest_mode(vcpu)) {
		/*
		 * L1 needs to opt into the special #PF vmexits that are
		 * used to deliver async page faults.
		 */
		return vcpu->arch.apf.delivery_as_pf_vmexit;
	} else {
		/*
		 * Play it safe in case the guest temporarily disables paging.
		 * The real mode IDT in particular is unlikely to have a #PF
		 * exception setup.
		 */
		return is_paging(vcpu);
	}
}

bool kvm_can_do_async_pf(struct kvm_vcpu *vcpu)
{
	if (unlikely(!lapic_in_kernel(vcpu) ||
		     kvm_event_needs_reinjection(vcpu) ||
		     kvm_is_exception_pending(vcpu)))
		return false;

	if (kvm_hlt_in_guest(vcpu->kvm) && !kvm_can_deliver_async_pf(vcpu))
		return false;

	/*
	 * If interrupts are off we cannot even use an artificial
	 * halt state.
	 */
	return kvm_arch_interrupt_allowed(vcpu);
}

bool kvm_arch_async_page_not_present(struct kvm_vcpu *vcpu,
				     struct kvm_async_pf *work)
{
	struct x86_exception fault;

	trace_kvm_async_pf_not_present(work->arch.token, work->cr2_or_gpa);
	kvm_add_async_pf_gfn(vcpu, work->arch.gfn);

	if (kvm_can_deliver_async_pf(vcpu) &&
	    !apf_put_user_notpresent(vcpu)) {
		fault.vector = PF_VECTOR;
		fault.error_code_valid = true;
		fault.error_code = 0;
		fault.nested_page_fault = false;
		fault.address = work->arch.token;
		fault.async_page_fault = true;
		kvm_inject_page_fault(vcpu, &fault);
		return true;
	} else {
		/*
		 * It is not possible to deliver a paravirtualized asynchronous
		 * page fault, but putting the guest in an artificial halt state
		 * can be beneficial nevertheless: if an interrupt arrives, we
		 * can deliver it timely and perhaps the guest will schedule
		 * another process.  When the instruction that triggered a page
		 * fault is retried, hopefully the page will be ready in the host.
		 */
		kvm_make_request(KVM_REQ_APF_HALT, vcpu);
		return false;
	}
}

void kvm_arch_async_page_present(struct kvm_vcpu *vcpu,
				 struct kvm_async_pf *work)
{
	struct kvm_lapic_irq irq = {
		.delivery_mode = APIC_DM_FIXED,
		.vector = vcpu->arch.apf.vec
	};

	if (work->wakeup_all)
		work->arch.token = ~0; /* broadcast wakeup */
	else
		kvm_del_async_pf_gfn(vcpu, work->arch.gfn);
	trace_kvm_async_pf_ready(work->arch.token, work->cr2_or_gpa);

	if ((work->wakeup_all || work->notpresent_injected) &&
	    kvm_pv_async_pf_enabled(vcpu) &&
	    !apf_put_user_ready(vcpu, work->arch.token)) {
		vcpu->arch.apf.pageready_pending = true;
		kvm_apic_set_irq(vcpu, &irq, NULL);
	}

	vcpu->arch.apf.halted = false;
	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
}

void kvm_arch_async_page_present_queued(struct kvm_vcpu *vcpu)
{
	kvm_make_request(KVM_REQ_APF_READY, vcpu);
	if (!vcpu->arch.apf.pageready_pending)
		kvm_vcpu_kick(vcpu);
}

bool kvm_arch_can_dequeue_async_page_present(struct kvm_vcpu *vcpu)
{
	if (!kvm_pv_async_pf_enabled(vcpu))
		return true;
	else
		return kvm_lapic_enabled(vcpu) && apf_pageready_slot_free(vcpu);
}

void kvm_arch_start_assignment(struct kvm *kvm)
{
	if (atomic_inc_return(&kvm->arch.assigned_device_count) == 1)
		static_call_cond(kvm_x86_pi_start_assignment)(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_start_assignment);

void kvm_arch_end_assignment(struct kvm *kvm)
{
	atomic_dec(&kvm->arch.assigned_device_count);
}
EXPORT_SYMBOL_GPL(kvm_arch_end_assignment);

bool noinstr kvm_arch_has_assigned_device(struct kvm *kvm)
{
	return raw_atomic_read(&kvm->arch.assigned_device_count);
}
EXPORT_SYMBOL_GPL(kvm_arch_has_assigned_device);

static void kvm_noncoherent_dma_assignment_start_or_stop(struct kvm *kvm)
{
	/*
	 * Non-coherent DMA assignment and de-assignment will affect whether
	 * KVM honors guest MTRRs and causes changes in memtypes in TDP, so
	 * pass %true unconditionally to indicate non-coherent DMA was, or
	 * will be, involved and that zapping SPTEs might be necessary.
	 */
	if (__kvm_mmu_honors_guest_mtrrs(true))
		kvm_zap_gfn_range(kvm, gpa_to_gfn(0), gpa_to_gfn(~0ULL));
}

void kvm_arch_register_noncoherent_dma(struct kvm *kvm)
{
	if (atomic_inc_return(&kvm->arch.noncoherent_dma_count) == 1)
		kvm_noncoherent_dma_assignment_start_or_stop(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_register_noncoherent_dma);

void kvm_arch_unregister_noncoherent_dma(struct kvm *kvm)
{
	if (!atomic_dec_return(&kvm->arch.noncoherent_dma_count))
		kvm_noncoherent_dma_assignment_start_or_stop(kvm);
}
EXPORT_SYMBOL_GPL(kvm_arch_unregister_noncoherent_dma);

bool kvm_arch_has_noncoherent_dma(struct kvm *kvm)
{
	return atomic_read(&kvm->arch.noncoherent_dma_count);
}
EXPORT_SYMBOL_GPL(kvm_arch_has_noncoherent_dma);
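
/*
 * Illustrative sketch (hypothetical helper, not used by KVM): the
 * register/unregister pair above implements an edge-triggered refcount,
 * i.e. the expensive zap in kvm_noncoherent_dma_assignment_start_or_stop()
 * runs only on the 0->1 and 1->0 transitions, not on every registration.
 */
static inline bool example_refcount_edge(atomic_t *count, bool inc)
{
	return inc ? atomic_inc_return(count) == 1
		   : atomic_dec_return(count) == 0;
}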

bool kvm_arch_has_irq_bypass(void)
{
	return enable_apicv && irq_remapping_cap(IRQ_POSTING_CAP);
}

int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
				     struct irq_bypass_producer *prod)
{
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);
	int ret;

	irqfd->producer = prod;
	kvm_arch_start_assignment(irqfd->kvm);
	ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm,
						  prod->irq, irqfd->gsi, 1);
	if (ret)
		kvm_arch_end_assignment(irqfd->kvm);

	return ret;
}

void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
				      struct irq_bypass_producer *prod)
{
	int ret;
	struct kvm_kernel_irqfd *irqfd =
		container_of(cons, struct kvm_kernel_irqfd, consumer);

	WARN_ON(irqfd->producer != prod);
	irqfd->producer = NULL;

	/*
	 * When the producer of a consumer is unregistered, change back to
	 * remapped mode, so the current implementation can be reused when
	 * the irq is masked/disabled or when the consumer side (KVM in this
	 * case) doesn't want to receive the interrupts.
	 */
	ret = static_call(kvm_x86_pi_update_irte)(irqfd->kvm, prod->irq, irqfd->gsi, 0);
	if (ret)
		printk(KERN_INFO "irq bypass consumer (token %p) unregistration"
		       " fails: %d\n", irqfd->consumer.token, ret);

	kvm_arch_end_assignment(irqfd->kvm);
}

int kvm_arch_update_irqfd_routing(struct kvm *kvm, unsigned int host_irq,
				  uint32_t guest_irq, bool set)
{
	return static_call(kvm_x86_pi_update_irte)(kvm, host_irq, guest_irq, set);
}

bool kvm_arch_irqfd_route_changed(struct kvm_kernel_irq_routing_entry *old,
				  struct kvm_kernel_irq_routing_entry *new)
{
	if (new->type != KVM_IRQ_ROUTING_MSI)
		return true;

	return !!memcmp(&old->msi, &new->msi, sizeof(new->msi));
}

bool kvm_vector_hashing_enabled(void)
{
	return vector_hashing;
}

bool kvm_arch_no_poll(struct kvm_vcpu *vcpu)
{
	return (vcpu->arch.msr_kvm_poll_control & 1) == 0;
}
EXPORT_SYMBOL_GPL(kvm_arch_no_poll);
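
/*
 * Illustrative note (a reading of the check above, not new behavior):
 * bit 0 of MSR_KVM_POLL_CONTROL set by the guest means "host-side polling
 * is allowed"; kvm_arch_no_poll() reports the inverse so halt-polling is
 * skipped when the guest has cleared the bit.
 */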

int kvm_spec_ctrl_test_value(u64 value)
{
	/*
	 * Test that setting IA32_SPEC_CTRL to the given value is allowed
	 * by the host processor.
	 */
	u64 saved_value;
	unsigned long flags;
	int ret = 0;

	local_irq_save(flags);

	if (rdmsrl_safe(MSR_IA32_SPEC_CTRL, &saved_value))
		ret = 1;
	else if (wrmsrl_safe(MSR_IA32_SPEC_CTRL, value))
		ret = 1;
	else
		wrmsrl(MSR_IA32_SPEC_CTRL, saved_value);

	local_irq_restore(flags);

	return ret;
}
EXPORT_SYMBOL_GPL(kvm_spec_ctrl_test_value);
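
/*
 * Usage sketch (hypothetical caller, not from this file): vendor-specific
 * WRMSR emulation can vet a guest-requested SPEC_CTRL value before
 * accepting it, e.g.:
 *
 *	if (kvm_spec_ctrl_test_value(data))
 *		return 1;	// reject the WRMSR
 */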

void kvm_fixup_and_inject_pf_error(struct kvm_vcpu *vcpu, gva_t gva, u16 error_code)
{
	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
	struct x86_exception fault;
	u64 access = error_code &
		(PFERR_WRITE_MASK | PFERR_FETCH_MASK | PFERR_USER_MASK);

	if (!(error_code & PFERR_PRESENT_MASK) ||
	    mmu->gva_to_gpa(vcpu, mmu, gva, access, &fault) != INVALID_GPA) {
		/*
		 * If vcpu->arch.walk_mmu->gva_to_gpa succeeded, the page
		 * tables probably do not match the TLB.  Just proceed
		 * with the error code that the processor gave.
		 */
		fault.vector = PF_VECTOR;
		fault.error_code_valid = true;
		fault.error_code = error_code;
		fault.nested_page_fault = false;
		fault.address = gva;
		fault.async_page_fault = false;
	}
	vcpu->arch.walk_mmu->inject_page_fault(vcpu, &fault);
}
EXPORT_SYMBOL_GPL(kvm_fixup_and_inject_pf_error);

/*
 * Handles kvm_read/write_guest_virt*() result and either injects #PF or
 * returns KVM_EXIT_INTERNAL_ERROR for cases not currently handled by KVM.
 * The return value indicates whether exit to userspace is needed.
 */
int kvm_handle_memory_failure(struct kvm_vcpu *vcpu, int r,
			      struct x86_exception *e)
{
	if (r == X86EMUL_PROPAGATE_FAULT) {
		if (KVM_BUG_ON(!e, vcpu->kvm))
			return -EIO;

		kvm_inject_emulated_page_fault(vcpu, e);
		return 1;
	}

	/*
	 * In case kvm_read/write_guest_virt*() failed with X86EMUL_IO_NEEDED
	 * while handling a VMX instruction, KVM could have handled the request
	 * correctly by exiting to userspace and performing I/O, but there
	 * doesn't seem to be a real use-case behind such requests; just return
	 * KVM_EXIT_INTERNAL_ERROR for now.
	 */
	kvm_prepare_emulation_failure_exit(vcpu);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_handle_memory_failure);

int kvm_handle_invpcid(struct kvm_vcpu *vcpu, unsigned long type, gva_t gva)
{
	bool pcid_enabled;
	struct x86_exception e;
	struct {
		u64 pcid;
		u64 gla;
	} operand;
	int r;

	r = kvm_read_guest_virt(vcpu, gva, &operand, sizeof(operand), &e);
	if (r != X86EMUL_CONTINUE)
		return kvm_handle_memory_failure(vcpu, r, &e);

	if (operand.pcid >> 12 != 0) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}

	pcid_enabled = kvm_is_cr4_bit_set(vcpu, X86_CR4_PCIDE);

	switch (type) {
	case INVPCID_TYPE_INDIV_ADDR:
		/*
		 * LAM doesn't apply to addresses that are inputs to TLB
		 * invalidation.
		 */
		if ((!pcid_enabled && (operand.pcid != 0)) ||
		    is_noncanonical_address(operand.gla, vcpu)) {
			kvm_inject_gp(vcpu, 0);
			return 1;
		}
		kvm_mmu_invpcid_gva(vcpu, operand.gla, operand.pcid);
		return kvm_skip_emulated_instruction(vcpu);

	case INVPCID_TYPE_SINGLE_CTXT:
		if (!pcid_enabled && (operand.pcid != 0)) {
			kvm_inject_gp(vcpu, 0);
			return 1;
		}

		kvm_invalidate_pcid(vcpu, operand.pcid);
		return kvm_skip_emulated_instruction(vcpu);

	case INVPCID_TYPE_ALL_NON_GLOBAL:
		/*
		 * Currently, KVM doesn't mark global entries in the shadow
		 * page tables, so a non-global flush just degenerates to a
		 * global flush.  If needed, we could optimize this later by
		 * keeping track of global entries in shadow page tables.
		 */
		fallthrough;
	case INVPCID_TYPE_ALL_INCL_GLOBAL:
		kvm_make_request(KVM_REQ_TLB_FLUSH_GUEST, vcpu);
		return kvm_skip_emulated_instruction(vcpu);

	default:
		kvm_inject_gp(vcpu, 0);
		return 1;
	}
}
EXPORT_SYMBOL_GPL(kvm_handle_invpcid);
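
/*
 * Illustrative sketch (hypothetical helper, for exposition only): the
 * "operand.pcid >> 12" check above enforces that a PCID fits in its
 * architectural 12 bits; any wider value must #GP.
 */
static inline bool example_pcid_valid(u64 pcid)
{
	return !(pcid >> 12);
}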

static int complete_sev_es_emulated_mmio(struct kvm_vcpu *vcpu)
{
	struct kvm_run *run = vcpu->run;
	struct kvm_mmio_fragment *frag;
	unsigned int len;

	BUG_ON(!vcpu->mmio_needed);

	/* Complete previous fragment */
	frag = &vcpu->mmio_fragments[vcpu->mmio_cur_fragment];
	len = min(8u, frag->len);
	if (!vcpu->mmio_is_write)
		memcpy(frag->data, run->mmio.data, len);

	if (frag->len <= 8) {
		/* Switch to the next fragment. */
		frag++;
		vcpu->mmio_cur_fragment++;
	} else {
		/* Go forward to the next mmio piece. */
		frag->data += len;
		frag->gpa += len;
		frag->len -= len;
	}

	if (vcpu->mmio_cur_fragment >= vcpu->mmio_nr_fragments) {
		vcpu->mmio_needed = 0;

		/*
		 * VMG change: at this point we're always done, RIP has
		 * already been advanced.
		 */
		return 1;
	}

	/* More MMIO is needed. */
	run->mmio.phys_addr = frag->gpa;
	run->mmio.len = min(8u, frag->len);
	run->mmio.is_write = vcpu->mmio_is_write;
	if (run->mmio.is_write)
		memcpy(run->mmio.data, frag->data, min(8u, frag->len));
	run->exit_reason = KVM_EXIT_MMIO;

	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;

	return 0;
}
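
/*
 * Illustrative sketch (hypothetical helper): MMIO fragments are consumed
 * in at most 8-byte chunks, the widest payload kvm_run's mmio exit can
 * carry per trip.  A 20-byte fragment therefore takes three round trips
 * to userspace: 8 + 8 + 4 bytes.
 */
static inline unsigned int example_sev_es_mmio_chunk(unsigned int frag_len)
{
	return min(8u, frag_len);
}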

int kvm_sev_es_mmio_write(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
			  void *data)
{
	int handled;
	struct kvm_mmio_fragment *frag;

	if (!data)
		return -EINVAL;

	handled = write_emultor.read_write_mmio(vcpu, gpa, bytes, data);
	if (handled == bytes)
		return 1;

	bytes -= handled;
	gpa += handled;
	data += handled;

	/* TODO: Check if need to increment number of frags */
	frag = vcpu->mmio_fragments;
	vcpu->mmio_nr_fragments = 1;
	frag->len = bytes;
	frag->gpa = gpa;
	frag->data = data;

	vcpu->mmio_needed = 1;
	vcpu->mmio_cur_fragment = 0;

	vcpu->run->mmio.phys_addr = gpa;
	vcpu->run->mmio.len = min(8u, frag->len);
	vcpu->run->mmio.is_write = 1;
	memcpy(vcpu->run->mmio.data, frag->data, min(8u, frag->len));
	vcpu->run->exit_reason = KVM_EXIT_MMIO;

	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_write);

int kvm_sev_es_mmio_read(struct kvm_vcpu *vcpu, gpa_t gpa, unsigned int bytes,
			 void *data)
{
	int handled;
	struct kvm_mmio_fragment *frag;

	if (!data)
		return -EINVAL;

	handled = read_emultor.read_write_mmio(vcpu, gpa, bytes, data);
	if (handled == bytes)
		return 1;

	bytes -= handled;
	gpa += handled;
	data += handled;

	/* TODO: Check if need to increment number of frags */
	frag = vcpu->mmio_fragments;
	vcpu->mmio_nr_fragments = 1;
	frag->len = bytes;
	frag->gpa = gpa;
	frag->data = data;

	vcpu->mmio_needed = 1;
	vcpu->mmio_cur_fragment = 0;

	vcpu->run->mmio.phys_addr = gpa;
	vcpu->run->mmio.len = min(8u, frag->len);
	vcpu->run->mmio.is_write = 0;
	vcpu->run->exit_reason = KVM_EXIT_MMIO;

	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_mmio;

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_sev_es_mmio_read);

static void advance_sev_es_emulated_pio(struct kvm_vcpu *vcpu, unsigned count, int size)
{
	vcpu->arch.sev_pio_count -= count;
	vcpu->arch.sev_pio_data += count * size;
}

static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
			   unsigned int port);

static int complete_sev_es_emulated_outs(struct kvm_vcpu *vcpu)
{
	int size = vcpu->arch.pio.size;
	int port = vcpu->arch.pio.port;

	vcpu->arch.pio.count = 0;
	if (vcpu->arch.sev_pio_count)
		return kvm_sev_es_outs(vcpu, size, port);
	return 1;
}

static int kvm_sev_es_outs(struct kvm_vcpu *vcpu, unsigned int size,
			   unsigned int port)
{
	for (;;) {
		unsigned int count =
			min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
		int ret = emulator_pio_out(vcpu, size, port, vcpu->arch.sev_pio_data, count);

		/* memcpy done already by emulator_pio_out. */
		advance_sev_es_emulated_pio(vcpu, count, size);
		if (!ret)
			break;

		/* Emulation done by the kernel. */
		if (!vcpu->arch.sev_pio_count)
			return 1;
	}

	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_outs;
	return 0;
}

static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
			  unsigned int port);

static int complete_sev_es_emulated_ins(struct kvm_vcpu *vcpu)
{
	unsigned count = vcpu->arch.pio.count;
	int size = vcpu->arch.pio.size;
	int port = vcpu->arch.pio.port;

	complete_emulator_pio_in(vcpu, vcpu->arch.sev_pio_data);
	advance_sev_es_emulated_pio(vcpu, count, size);
	if (vcpu->arch.sev_pio_count)
		return kvm_sev_es_ins(vcpu, size, port);
	return 1;
}

static int kvm_sev_es_ins(struct kvm_vcpu *vcpu, unsigned int size,
			  unsigned int port)
{
	for (;;) {
		unsigned int count =
			min_t(unsigned int, PAGE_SIZE / size, vcpu->arch.sev_pio_count);
		if (!emulator_pio_in(vcpu, size, port, vcpu->arch.sev_pio_data, count))
			break;

		/* Emulation done by the kernel. */
		advance_sev_es_emulated_pio(vcpu, count, size);
		if (!vcpu->arch.sev_pio_count)
			return 1;
	}

	vcpu->arch.complete_userspace_io = complete_sev_es_emulated_ins;
	return 0;
}
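
/*
 * Illustrative sketch (hypothetical helper): both SEV-ES string I/O loops
 * above clamp each batch so a single pass never walks more than one page
 * of the shared scratch buffer.
 */
static inline unsigned int example_sev_pio_batch(unsigned int size,
						 unsigned int remaining)
{
	return min_t(unsigned int, PAGE_SIZE / size, remaining);
}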

int kvm_sev_es_string_io(struct kvm_vcpu *vcpu, unsigned int size,
			 unsigned int port, void *data, unsigned int count,
			 int in)
{
	vcpu->arch.sev_pio_data = data;
	vcpu->arch.sev_pio_count = count;
	return in ? kvm_sev_es_ins(vcpu, size, port)
		  : kvm_sev_es_outs(vcpu, size, port);
}
EXPORT_SYMBOL_GPL(kvm_sev_es_string_io);

EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_entry);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_fast_mmio);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_inj_virq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_page_fault);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_msr);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_cr);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmexit_inject);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intr_vmexit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_vmenter_failed);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_invlpga);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_skinit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_nested_intercepts);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_write_tsc_offset);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_ple_window_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pml_full);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_pi_irte_update);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_unaccelerated_access);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_incomplete_ipi);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_ga_log);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_kick_vcpu_slowpath);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_avic_doorbell);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_apicv_accept_irq);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_exit);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_enter);
EXPORT_TRACEPOINT_SYMBOL_GPL(kvm_vmgexit_msr_protocol_exit);

static int __init kvm_x86_init(void)
{
	kvm_mmu_x86_module_init();
	mitigate_smt_rsb &= boot_cpu_has_bug(X86_BUG_SMT_RSB) && cpu_smt_possible();
	return 0;
}
module_init(kvm_x86_init);

static void __exit kvm_x86_exit(void)
{
	WARN_ON_ONCE(static_branch_unlikely(&kvm_has_noapic_vcpu));
}
module_exit(kvm_x86_exit);