#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/kvm_host.h>

#include "irq.h"
#include "mmu.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "smm.h"
#include "cpuid.h"
#include "pmu.h"

#include <linux/module.h>
#include <linux/mod_devicetable.h>
#include <linux/kernel.h>
#include <linux/vmalloc.h>
#include <linux/highmem.h>
#include <linux/amd-iommu.h>
#include <linux/sched.h>
#include <linux/trace_events.h>
#include <linux/slab.h>
#include <linux/hashtable.h>
#include <linux/objtool.h>
#include <linux/psp-sev.h>
#include <linux/file.h>
#include <linux/pagemap.h>
#include <linux/swap.h>
#include <linux/rwsem.h>
#include <linux/cc_platform.h>
#include <linux/smp.h>

#include <asm/apic.h>
#include <asm/perf_event.h>
#include <asm/tlbflush.h>
#include <asm/desc.h>
#include <asm/debugreg.h>
#include <asm/kvm_para.h>
#include <asm/irq_remapping.h>
#include <asm/spec-ctrl.h>
#include <asm/cpu_device_id.h>
#include <asm/traps.h>
#include <asm/reboot.h>
#include <asm/fpu/api.h>

#include <trace/events/ipi.h>

#include "trace.h"

#include "svm.h"
#include "svm_ops.h"

#include "kvm_onhyperv.h"
#include "svm_onhyperv.h"

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

#ifdef MODULE
static const struct x86_cpu_id svm_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_SVM, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, svm_cpu_id);
#endif

#define SEG_TYPE_LDT 2
#define SEG_TYPE_BUSY_TSS16 3

static bool erratum_383_found __read_mostly;

u32 msrpm_offsets[MSRPM_OFFSETS] __read_mostly;

/*
 * Set osvw_len to a higher value when updated Revision Guides
 * are published and we know what the new status bits are.
 */
static uint64_t osvw_len = 4, osvw_status;

static DEFINE_PER_CPU(u64, current_tsc_ratio);
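
/*
 * Map an xAPIC MMIO register offset to its x2APIC MSR index: x2APIC
 * registers are MSRs 0x800 (APIC_BASE_MSR) + (offset >> 4), e.g.
 * APIC_ID (offset 0x20) lives at MSR 0x802.
 */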
#define X2APIC_MSR(x)	(APIC_BASE_MSR + (x >> 4))

static const struct svm_direct_access_msrs {
	u32 index;	/* Index of the MSR */
	bool always;	/* True if intercept is initially cleared */
} direct_access_msrs[MAX_DIRECT_ACCESS_MSRS] = {
	{ .index = MSR_STAR, .always = true },
	{ .index = MSR_IA32_SYSENTER_CS, .always = true },
	{ .index = MSR_IA32_SYSENTER_EIP, .always = false },
	{ .index = MSR_IA32_SYSENTER_ESP, .always = false },
#ifdef CONFIG_X86_64
	{ .index = MSR_GS_BASE, .always = true },
	{ .index = MSR_FS_BASE, .always = true },
	{ .index = MSR_KERNEL_GS_BASE, .always = true },
	{ .index = MSR_LSTAR, .always = true },
	{ .index = MSR_CSTAR, .always = true },
	{ .index = MSR_SYSCALL_MASK, .always = true },
#endif
	{ .index = MSR_IA32_SPEC_CTRL, .always = false },
	{ .index = MSR_IA32_PRED_CMD, .always = false },
	{ .index = MSR_IA32_FLUSH_CMD, .always = false },
	{ .index = MSR_IA32_LASTBRANCHFROMIP, .always = false },
	{ .index = MSR_IA32_LASTBRANCHTOIP, .always = false },
	{ .index = MSR_IA32_LASTINTFROMIP, .always = false },
	{ .index = MSR_IA32_LASTINTTOIP, .always = false },
	{ .index = MSR_IA32_XSS, .always = false },
	{ .index = MSR_EFER, .always = false },
	{ .index = MSR_IA32_CR_PAT, .always = false },
	{ .index = MSR_AMD64_SEV_ES_GHCB, .always = true },
	{ .index = MSR_TSC_AUX, .always = false },
	{ .index = X2APIC_MSR(APIC_ID), .always = false },
	{ .index = X2APIC_MSR(APIC_LVR), .always = false },
	{ .index = X2APIC_MSR(APIC_TASKPRI), .always = false },
	{ .index = X2APIC_MSR(APIC_ARBPRI), .always = false },
	{ .index = X2APIC_MSR(APIC_PROCPRI), .always = false },
	{ .index = X2APIC_MSR(APIC_EOI), .always = false },
	{ .index = X2APIC_MSR(APIC_RRR), .always = false },
	{ .index = X2APIC_MSR(APIC_LDR), .always = false },
	{ .index = X2APIC_MSR(APIC_DFR), .always = false },
	{ .index = X2APIC_MSR(APIC_SPIV), .always = false },
	{ .index = X2APIC_MSR(APIC_ISR), .always = false },
	{ .index = X2APIC_MSR(APIC_TMR), .always = false },
	{ .index = X2APIC_MSR(APIC_IRR), .always = false },
	{ .index = X2APIC_MSR(APIC_ESR), .always = false },
	{ .index = X2APIC_MSR(APIC_ICR), .always = false },
	{ .index = X2APIC_MSR(APIC_ICR2), .always = false },

	/*
	 * Note:
	 * AMD does not virtualize APIC TSC-deadline timer mode, but it is
	 * emulated by KVM. When setting the APIC LVTT (0x832) register bit 18,
	 * the AVIC hardware would generate a #GP fault. Therefore, always
	 * intercept MSR 0x832, and do not set up a direct_access_msr entry
	 * for it.
	 */
	{ .index = X2APIC_MSR(APIC_LVTTHMR), .always = false },
	{ .index = X2APIC_MSR(APIC_LVTPC), .always = false },
	{ .index = X2APIC_MSR(APIC_LVT0), .always = false },
	{ .index = X2APIC_MSR(APIC_LVT1), .always = false },
	{ .index = X2APIC_MSR(APIC_LVTERR), .always = false },
	{ .index = X2APIC_MSR(APIC_TMICT), .always = false },
	{ .index = X2APIC_MSR(APIC_TMCCT), .always = false },
	{ .index = X2APIC_MSR(APIC_TDCR), .always = false },
	{ .index = MSR_INVALID, .always = false },
};

/*
 * These 2 parameters are used to configure the controls for Pause-Loop Exiting:
 * pause_filter_count: On processors that support Pause filtering (indicated
 *	by CPUID Fn8000_000A_EDX), the VMCB provides a 16 bit pause filter
 *	count value. On VMRUN this value is loaded into an internal counter.
 *	Each time a pause instruction is executed, this counter is decremented
 *	until it reaches zero, at which time a #VMEXIT is generated if pause
 *	intercept is enabled. Refer to AMD APM Vol 2 Section 15.14.4 Pause
 *	Intercept Filtering for more details.
 *	This also indicates whether PLE logic is enabled.
 *
 * pause_filter_thresh: In addition, some processor families support advanced
 *	pause filtering (indicated by CPUID Fn8000_000A_EDX), an upper bound on
 *	the amount of time a guest is allowed to execute in a pause loop.
 *	In this mode, a 16-bit pause filter threshold field is added in the
 *	VMCB. The threshold value is a cycle count that is used to reset the
 *	pause counter. As with simple pause filtering, VMRUN loads the pause
 *	count value from the VMCB into an internal counter. Then, on each pause
 *	instruction the hardware checks the elapsed number of cycles since
 *	the most recent pause instruction against the pause filter threshold.
 *	If the elapsed cycle count is greater than the pause filter threshold,
 *	then the internal pause count is reloaded from the VMCB and execution
 *	continues. If the elapsed cycle count is less than the pause filter
 *	threshold, then the internal pause count is decremented. If the count
 *	value is less than zero and PAUSE intercept is enabled, a #VMEXIT is
 *	triggered. If advanced pause filtering is supported and the pause filter
 *	threshold field is set to zero, the filter will operate in the simpler,
 *	count only mode.
 */

static unsigned short pause_filter_thresh = KVM_DEFAULT_PLE_GAP;
module_param(pause_filter_thresh, ushort, 0444);

static unsigned short pause_filter_count = KVM_SVM_DEFAULT_PLE_WINDOW;
module_param(pause_filter_count, ushort, 0444);

/* Default doubles per-vcpu window every exit. */
static unsigned short pause_filter_count_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
module_param(pause_filter_count_grow, ushort, 0444);

/* Default resets per-vcpu window every exit to pause_filter_count. */
static unsigned short pause_filter_count_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
module_param(pause_filter_count_shrink, ushort, 0444);

/* Default is to compute the maximum so we can never overflow. */
static unsigned short pause_filter_count_max = KVM_SVM_DEFAULT_PLE_WINDOW_MAX;
module_param(pause_filter_count_max, ushort, 0444);

/*
 * Use nested page tables by default. Note, NPT may get forced off by
 * svm_hardware_setup() if it's unsupported by hardware or the host kernel.
 */
bool npt_enabled = true;
module_param_named(npt, npt_enabled, bool, 0444);

/* allow nested virtualization in KVM/SVM */
static int nested = true;
module_param(nested, int, 0444);

/* enable/disable Next RIP Save */
int nrips = true;
module_param(nrips, int, 0444);

/* enable/disable Virtual VMLOAD VMSAVE */
static int vls = true;
module_param(vls, int, 0444);

/* enable/disable Virtual GIF */
int vgif = true;
module_param(vgif, int, 0444);

/* enable/disable LBR virtualization */
static int lbrv = true;
module_param(lbrv, int, 0444);

static int tsc_scaling = true;
module_param(tsc_scaling, int, 0444);

/*
 * enable / disable AVIC. Because the defaults differ for APICv
 * support between VMX and SVM we cannot use module_param_named.
 */
static bool avic;
module_param(avic, bool, 0444);

bool __read_mostly dump_invalid_vmcb;
module_param(dump_invalid_vmcb, bool, 0644);


bool intercept_smi = true;
module_param(intercept_smi, bool, 0444);

bool vnmi = true;
module_param(vnmi, bool, 0444);

static bool svm_gp_erratum_intercept = true;

static u8 rsm_ins_bytes[] = "\x0f\xaa";

static unsigned long iopm_base;

DEFINE_PER_CPU(struct svm_cpu_data, svm_data);

/*
 * Only MSR_TSC_AUX is switched via the user return hook. EFER is switched via
 * the VMCB, and the SYSCALL/SYSENTER MSRs are handled by VMLOAD/VMSAVE.
 *
 * RDTSCP and RDPID are not used in the kernel, specifically to allow KVM to
 * defer the restoration of TSC_AUX until the CPU returns to userspace.
 */
static int tsc_aux_uret_slot __read_mostly = -1;

static const u32 msrpm_ranges[] = {0, 0xc0000000, 0xc0010000};

#define NUM_MSR_MAPS ARRAY_SIZE(msrpm_ranges)
#define MSRS_RANGE_SIZE 2048
#define MSRS_IN_RANGE (MSRS_RANGE_SIZE * 8 / 2)

u32 svm_msrpm_offset(u32 msr)
{
	u32 offset;
	int i;

	for (i = 0; i < NUM_MSR_MAPS; i++) {
		if (msr < msrpm_ranges[i] ||
		    msr >= msrpm_ranges[i] + MSRS_IN_RANGE)
			continue;

		offset = (msr - msrpm_ranges[i]) / 4; /* 4 msrs per u8 */
		offset += (i * MSRS_RANGE_SIZE);      /* add range offset */

		/* Now we have the u8 offset - but need the u32 offset */
		return offset / 4;
	}

	/* MSR not in any range */
	return MSR_INVALID;
}
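
/*
 * Worked example of the offset math above (illustration only, not used
 * by the code): for MSR_EFER (0xc0000080) the second range (base
 * 0xc0000000) matches, so the byte offset is 0x80 / 4 = 32 (each byte
 * holds read/write bits for 4 MSRs), plus one 2048-byte range offset,
 * giving byte 2080, i.e. u32 offset 520 into the MSR permissions map.
 */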

static void svm_flush_tlb_current(struct kvm_vcpu *vcpu);

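/*
 * The host NPT level mirrors the host paging mode: 5-level NPT iff the
 * kernel itself uses 5-level paging, else 4-level (3-level PAE on 32-bit).
 */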
static int get_npt_level(void)
{
#ifdef CONFIG_X86_64
	return pgtable_l5_enabled() ? PT64_ROOT_5LEVEL : PT64_ROOT_4LEVEL;
#else
	return PT32E_ROOT_LEVEL;
#endif
}

int svm_set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u64 old_efer = vcpu->arch.efer;

	vcpu->arch.efer = efer;

	if (!npt_enabled) {
		/* Shadow paging assumes NX to be available.  */
		efer |= EFER_NX;

		if (!(efer & EFER_LMA))
			efer &= ~EFER_LME;
	}

	if ((old_efer & EFER_SVME) != (efer & EFER_SVME)) {
		if (!(efer & EFER_SVME)) {
			svm_leave_nested(vcpu);
			svm_set_gif(svm, true);
			/* #GP intercept is still needed for vmware backdoor */
			if (!enable_vmware_backdoor)
				clr_exception_intercept(svm, GP_VECTOR);

			/*
			 * Free the nested guest state, unless we are in SMM.
			 * In this case we will return to the nested guest
			 * as soon as we leave SMM.
			 */
			if (!is_smm(vcpu))
				svm_free_nested(svm);

		} else {
			int ret = svm_allocate_nested(svm);

			if (ret) {
				vcpu->arch.efer = old_efer;
				return ret;
			}

			/*
			 * Never intercept #GP for SEV guests, KVM can't
			 * decrypt guest memory to work around the erratum.
			 */
			if (svm_gp_erratum_intercept && !sev_guest(vcpu->kvm))
				set_exception_intercept(svm, GP_VECTOR);
		}
	}

	svm->vmcb->save.efer = efer | EFER_SVME;
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
	return 0;
}

static u32 svm_get_interrupt_shadow(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 ret = 0;

	if (svm->vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK)
		ret = KVM_X86_SHADOW_INT_STI | KVM_X86_SHADOW_INT_MOV_SS;
	return ret;
}

static void svm_set_interrupt_shadow(struct kvm_vcpu *vcpu, int mask)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (mask == 0)
		svm->vmcb->control.int_state &= ~SVM_INTERRUPT_SHADOW_MASK;
	else
		svm->vmcb->control.int_state |= SVM_INTERRUPT_SHADOW_MASK;
}

static int __svm_skip_emulated_instruction(struct kvm_vcpu *vcpu,
					   bool commit_side_effects)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	unsigned long old_rflags;

	/*
	 * SEV-ES does not expose the next RIP. The RIP update is controlled by
	 * the type of exit and the #VC handler in the guest.
	 */
	if (sev_es_guest(vcpu->kvm))
		goto done;

	if (nrips && svm->vmcb->control.next_rip != 0) {
		WARN_ON_ONCE(!static_cpu_has(X86_FEATURE_NRIPS));
		svm->next_rip = svm->vmcb->control.next_rip;
	}

	if (!svm->next_rip) {
		if (unlikely(!commit_side_effects))
			old_rflags = svm->vmcb->save.rflags;

		if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
			return 0;

		if (unlikely(!commit_side_effects))
			svm->vmcb->save.rflags = old_rflags;
	} else {
		kvm_rip_write(vcpu, svm->next_rip);
	}

done:
	if (likely(commit_side_effects))
		svm_set_interrupt_shadow(vcpu, 0);

	return 1;
}

static int svm_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	return __svm_skip_emulated_instruction(vcpu, true);
}

static int svm_update_soft_interrupt_rip(struct kvm_vcpu *vcpu)
{
	unsigned long rip, old_rip = kvm_rip_read(vcpu);
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * Due to architectural shortcomings, the CPU doesn't always provide
	 * NextRIP, e.g. if KVM intercepted an exception that occurred while
	 * the CPU was vectoring an INTO/INT3 in the guest. Temporarily skip
	 * the instruction even if NextRIP is supported to acquire the next
	 * RIP so that it can be shoved into the NextRIP field, otherwise
	 * hardware will fail to advance guest RIP during event injection.
	 * Drop the exception/interrupt if emulation fails and effectively
	 * retry the instruction, it's the least awful option. If NRIPS is
	 * in use, the skip must not commit any side effects such as clearing
	 * the interrupt shadow or RFLAGS.RF.
	 */
	if (!__svm_skip_emulated_instruction(vcpu, !nrips))
		return -EIO;

	rip = kvm_rip_read(vcpu);

	/*
	 * Save the injection information, even when using next_rip, as the
	 * VMCB's next_rip will be lost (cleared on VM-Exit) if the injection
	 * doesn't complete due to a VM-Exit occurring while the CPU is
	 * vectoring the event. Decoding the instruction isn't guaranteed to
	 * work as there may be no backing instruction, e.g. if the event is
	 * being injected by L1 for L2, or if the guest is patching INT3 into
	 * a different instruction.
	 */
	svm->soft_int_injected = true;
	svm->soft_int_csbase = svm->vmcb->save.cs.base;
	svm->soft_int_old_rip = old_rip;
	svm->soft_int_next_rip = rip;

	if (nrips)
		kvm_rip_write(vcpu, old_rip);

	if (static_cpu_has(X86_FEATURE_NRIPS))
		svm->vmcb->control.next_rip = rip;

	return 0;
}

static void svm_inject_exception(struct kvm_vcpu *vcpu)
{
	struct kvm_queued_exception *ex = &vcpu->arch.exception;
	struct vcpu_svm *svm = to_svm(vcpu);

	kvm_deliver_exception_payload(vcpu, ex);

	if (kvm_exception_is_soft(ex->vector) &&
	    svm_update_soft_interrupt_rip(vcpu))
		return;

	svm->vmcb->control.event_inj = ex->vector
		| SVM_EVTINJ_VALID
		| (ex->has_error_code ? SVM_EVTINJ_VALID_ERR : 0)
		| SVM_EVTINJ_TYPE_EXEPT;
	svm->vmcb->control.event_inj_err = ex->error_code;
}

static void svm_init_erratum_383(void)
{
	u32 low, high;
	int err;
	u64 val;

	if (!static_cpu_has_bug(X86_BUG_AMD_TLB_MMATCH))
		return;

	/* Use _safe variants to not break nested virtualization */
	val = native_read_msr_safe(MSR_AMD64_DC_CFG, &err);
	if (err)
		return;

	val |= (1ULL << 47);

	low = lower_32_bits(val);
	high = upper_32_bits(val);

	native_write_msr_safe(MSR_AMD64_DC_CFG, low, high);

	erratum_383_found = true;
}

static void svm_init_osvw(struct kvm_vcpu *vcpu)
{
	/*
	 * Guests should see errata 400 and 415 as fixed (assuming that
	 * HLT and IO instructions are intercepted).
	 */
	vcpu->arch.osvw.length = (osvw_len >= 3) ? (osvw_len) : 3;
	vcpu->arch.osvw.status = osvw_status & ~(6ULL);

	/*
	 * By increasing the vCPU's osvw.length to 3 we are telling the guest
	 * that all osvw.status bits inside that length, including bit 0
	 * (which is reserved for erratum 298), are valid. However, if the
	 * host processor's osvw_len is 0 then osvw_status[0] carries no
	 * information. We need to be conservative here and therefore we tell
	 * the guest that erratum 298 is present (because we really don't
	 * know).
	 */
	if (osvw_len == 0 && boot_cpu_data.x86 == 0x10)
		vcpu->arch.osvw.status |= 1;
}

static bool __kvm_is_svm_supported(void)
{
	int cpu = smp_processor_id();
	struct cpuinfo_x86 *c = &cpu_data(cpu);

	if (c->x86_vendor != X86_VENDOR_AMD &&
	    c->x86_vendor != X86_VENDOR_HYGON) {
		pr_err("CPU %d isn't AMD or Hygon\n", cpu);
		return false;
	}

	if (!cpu_has(c, X86_FEATURE_SVM)) {
		pr_err("SVM not supported by CPU %d\n", cpu);
		return false;
	}

	if (cc_platform_has(CC_ATTR_GUEST_MEM_ENCRYPT)) {
		pr_info("KVM is unsupported when running as an SEV guest\n");
		return false;
	}

	return true;
}

static bool kvm_is_svm_supported(void)
{
	bool supported;

	migrate_disable();
	supported = __kvm_is_svm_supported();
	migrate_enable();

	return supported;
}

static int svm_check_processor_compat(void)
{
	if (!__kvm_is_svm_supported())
		return -EIO;

	return 0;
}
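
/*
 * Cache the last value written to MSR_AMD64_TSC_RATIO per CPU so that
 * redundant WRMSRs can be skipped when vCPUs using the same scaling
 * ratio run back-to-back on a CPU.
 */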
static void __svm_write_tsc_multiplier(u64 multiplier)
{
	if (multiplier == __this_cpu_read(current_tsc_ratio))
		return;

	wrmsrl(MSR_AMD64_TSC_RATIO, multiplier);
	__this_cpu_write(current_tsc_ratio, multiplier);
}

static inline void kvm_cpu_svm_disable(void)
{
	uint64_t efer;

	wrmsrl(MSR_VM_HSAVE_PA, 0);
	rdmsrl(MSR_EFER, efer);
	if (efer & EFER_SVME) {
		/*
		 * Force GIF=1 prior to disabling SVM, e.g. to ensure INIT and
		 * NMI aren't blocked.
		 */
		stgi();
		wrmsrl(MSR_EFER, efer & ~EFER_SVME);
	}
}

static void svm_emergency_disable(void)
{
	kvm_rebooting = true;

	kvm_cpu_svm_disable();
}

static void svm_hardware_disable(void)
{
	/* Make sure we clean up behind us */
	if (tsc_scaling)
		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);

	kvm_cpu_svm_disable();

	amd_pmu_disable_virt();
}

static int svm_hardware_enable(void)
{
	struct svm_cpu_data *sd;
	uint64_t efer;
	int me = raw_smp_processor_id();

	rdmsrl(MSR_EFER, efer);
	if (efer & EFER_SVME)
		return -EBUSY;

	sd = per_cpu_ptr(&svm_data, me);
	sd->asid_generation = 1;
	sd->max_asid = cpuid_ebx(SVM_CPUID_FUNC) - 1;
	sd->next_asid = sd->max_asid + 1;
	sd->min_asid = max_sev_asid + 1;

	wrmsrl(MSR_EFER, efer | EFER_SVME);

	wrmsrl(MSR_VM_HSAVE_PA, sd->save_area_pa);

	if (static_cpu_has(X86_FEATURE_TSCRATEMSR)) {
		/*
		 * Set the default value, even if we don't use TSC scaling,
		 * to avoid leaving a stale value in the MSR.
		 */
		__svm_write_tsc_multiplier(SVM_TSC_RATIO_DEFAULT);
	}

	/*
	 * Get OSVW bits.
	 *
	 * Note that it is possible to have a system with mixed processor
	 * revisions and therefore different OSVW bits. If bits are not the same
	 * on different processors then choose the worst case (i.e. if erratum
	 * is present on one processor and not on another then assume that the
	 * erratum is present everywhere).
	 */
	if (cpu_has(&boot_cpu_data, X86_FEATURE_OSVW)) {
		uint64_t len, status = 0;
		int err;

		len = native_read_msr_safe(MSR_AMD64_OSVW_ID_LENGTH, &err);
		if (!err)
			status = native_read_msr_safe(MSR_AMD64_OSVW_STATUS,
						      &err);

		if (err)
			osvw_status = osvw_len = 0;
		else {
			if (len < osvw_len)
				osvw_len = len;
			osvw_status |= status;
			osvw_status &= (1ULL << osvw_len) - 1;
		}
	} else
		osvw_status = osvw_len = 0;

	svm_init_erratum_383();

	amd_pmu_enable_virt();

	/*
	 * If TSC_AUX virtualization is supported, TSC_AUX becomes a swap type
	 * "B" field (see sev_es_prepare_switch_to_guest()) for SEV-ES guests.
	 * Since Linux does not change the value of TSC_AUX once set, prime the
	 * TSC_AUX field now to avoid a RDMSR on every vCPU run.
	 */
	if (boot_cpu_has(X86_FEATURE_V_TSC_AUX)) {
		struct sev_es_save_area *hostsa;
		u32 __maybe_unused msr_hi;

		hostsa = (struct sev_es_save_area *)(page_address(sd->save_area) + 0x400);

		rdmsr(MSR_TSC_AUX, hostsa->tsc_aux, msr_hi);
	}

	return 0;
}

static void svm_cpu_uninit(int cpu)
{
	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);

	if (!sd->save_area)
		return;

	kfree(sd->sev_vmcbs);
	__free_page(sd->save_area);
	sd->save_area_pa = 0;
	sd->save_area = NULL;
}

static int svm_cpu_init(int cpu)
{
	struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu);
	int ret = -ENOMEM;

	memset(sd, 0, sizeof(struct svm_cpu_data));
	sd->save_area = snp_safe_alloc_page(NULL);
	if (!sd->save_area)
		return ret;

	ret = sev_cpu_init(sd);
	if (ret)
		goto free_save_area;

	sd->save_area_pa = __sme_page_pa(sd->save_area);
	return 0;

free_save_area:
	__free_page(sd->save_area);
	sd->save_area = NULL;
	return ret;
}

static void set_dr_intercepts(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR0_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR1_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR2_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR3_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR4_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR5_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR6_WRITE);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_READ);
	vmcb_set_intercept(&vmcb->control, INTERCEPT_DR7_WRITE);

	recalc_intercepts(svm);
}

static void clr_dr_intercepts(struct vcpu_svm *svm)
{
	struct vmcb *vmcb = svm->vmcb01.ptr;

	vmcb->control.intercepts[INTERCEPT_DR] = 0;

	recalc_intercepts(svm);
}

static int direct_access_msr_slot(u32 msr)
{
	u32 i;

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++)
		if (direct_access_msrs[i].index == msr)
			return i;

	return -ENOENT;
}
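
/*
 * The shadow bitmaps record KVM's desired intercept state, independent of
 * any userspace MSR filter, so that svm_msr_filter_changed() can recompute
 * the effective permissions when the filter changes.
 */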
static void set_shadow_msr_intercept(struct kvm_vcpu *vcpu, u32 msr, int read,
				     int write)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	int slot = direct_access_msr_slot(msr);

	if (slot == -ENOENT)
		return;

	/* Set the shadow bitmaps to the desired intercept states */
	if (read)
		set_bit(slot, svm->shadow_msr_intercept.read);
	else
		clear_bit(slot, svm->shadow_msr_intercept.read);

	if (write)
		set_bit(slot, svm->shadow_msr_intercept.write);
	else
		clear_bit(slot, svm->shadow_msr_intercept.write);
}

static bool valid_msr_intercept(u32 index)
{
	return direct_access_msr_slot(index) != -ENOENT;
}

static bool msr_write_intercepted(struct kvm_vcpu *vcpu, u32 msr)
{
	u8 bit_write;
	unsigned long tmp;
	u32 offset;
	u32 *msrpm;

	/*
	 * For the non-nested case:
	 * If the L01 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 *
	 * For the nested case:
	 * If the L02 MSR bitmap does not intercept the MSR, then we need to
	 * save it.
	 */
	msrpm = is_guest_mode(vcpu) ? to_svm(vcpu)->nested.msrpm :
				      to_svm(vcpu)->msrpm;

	offset = svm_msrpm_offset(msr);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	return test_bit(bit_write, &tmp);
}

static void set_msr_interception_bitmap(struct kvm_vcpu *vcpu, u32 *msrpm,
					u32 msr, int read, int write)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u8 bit_read, bit_write;
	unsigned long tmp;
	u32 offset;

	/*
	 * If this warning triggers, extend the direct_access_msrs list at the
	 * beginning of the file.
	 */
	WARN_ON(!valid_msr_intercept(msr));

	/* Force MSRs that are denied by the MSR filter to be trapped */
	if (read && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_READ))
		read = 0;

	if (write && !kvm_msr_allowed(vcpu, msr, KVM_MSR_FILTER_WRITE))
		write = 0;

	offset = svm_msrpm_offset(msr);
	bit_read = 2 * (msr & 0x0f);
	bit_write = 2 * (msr & 0x0f) + 1;
	tmp = msrpm[offset];

	BUG_ON(offset == MSR_INVALID);

	read ? clear_bit(bit_read, &tmp) : set_bit(bit_read, &tmp);
	write ? clear_bit(bit_write, &tmp) : set_bit(bit_write, &tmp);

	msrpm[offset] = tmp;

	svm_hv_vmcb_dirty_nested_enlightenments(vcpu);
	svm->nested.force_msr_bitmap_recalc = true;
}

void set_msr_interception(struct kvm_vcpu *vcpu, u32 *msrpm, u32 msr,
			  int read, int write)
{
	set_shadow_msr_intercept(vcpu, msr, read, write);
	set_msr_interception_bitmap(vcpu, msrpm, msr, read, write);
}

u32 *svm_vcpu_alloc_msrpm(void)
{
	unsigned int order = get_order(MSRPM_SIZE);
	struct page *pages = alloc_pages(GFP_KERNEL_ACCOUNT, order);
	u32 *msrpm;

	if (!pages)
		return NULL;

	msrpm = page_address(pages);
	memset(msrpm, 0xff, PAGE_SIZE * (1 << order));

	return msrpm;
}

void svm_vcpu_init_msrpm(struct kvm_vcpu *vcpu, u32 *msrpm)
{
	int i;

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		if (!direct_access_msrs[i].always)
			continue;
		set_msr_interception(vcpu, msrpm, direct_access_msrs[i].index, 1, 1);
	}
}

void svm_set_x2apic_msr_interception(struct vcpu_svm *svm, bool intercept)
{
	int i;

	if (intercept == svm->x2avic_msrs_intercepted)
		return;

	if (!x2avic_enabled)
		return;

	for (i = 0; i < MAX_DIRECT_ACCESS_MSRS; i++) {
		int index = direct_access_msrs[i].index;

		if ((index < APIC_BASE_MSR) ||
		    (index > APIC_BASE_MSR + 0xff))
			continue;
		set_msr_interception(&svm->vcpu, svm->msrpm, index,
				     !intercept, !intercept);
	}

	svm->x2avic_msrs_intercepted = intercept;
}

void svm_vcpu_free_msrpm(u32 *msrpm)
{
	__free_pages(virt_to_page(msrpm), get_order(MSRPM_SIZE));
}

static void svm_msr_filter_changed(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	u32 i;

	/*
	 * Set intercept permissions for all direct access MSRs again. They
	 * will automatically get filtered through the MSR filter, so we are
	 * back in sync after this.
	 */
	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		u32 msr = direct_access_msrs[i].index;
		u32 read = test_bit(i, svm->shadow_msr_intercept.read);
		u32 write = test_bit(i, svm->shadow_msr_intercept.write);

		set_msr_interception_bitmap(vcpu, svm->msrpm, msr, read, write);
	}
}

static void add_msr_offset(u32 offset)
{
	int i;

	for (i = 0; i < MSRPM_OFFSETS; ++i) {

		/* Offset already in list? */
		if (msrpm_offsets[i] == offset)
			return;

		/* Slot used by another offset? */
		if (msrpm_offsets[i] != MSR_INVALID)
			continue;

		/* Add offset to list */
		msrpm_offsets[i] = offset;

		return;
	}

	/*
	 * If this BUG triggers, the msrpm_offsets table has overflowed. Just
	 * increase MSRPM_OFFSETS in this case.
	 */
	BUG();
}

static void init_msrpm_offsets(void)
{
	int i;

	memset(msrpm_offsets, 0xff, sizeof(msrpm_offsets));

	for (i = 0; direct_access_msrs[i].index != MSR_INVALID; i++) {
		u32 offset;

		offset = svm_msrpm_offset(direct_access_msrs[i].index);
		BUG_ON(offset == MSR_INVALID);

		add_msr_offset(offset);
	}
}

void svm_copy_lbrs(struct vmcb *to_vmcb, struct vmcb *from_vmcb)
{
	to_vmcb->save.dbgctl = from_vmcb->save.dbgctl;
	to_vmcb->save.br_from = from_vmcb->save.br_from;
	to_vmcb->save.br_to = from_vmcb->save.br_to;
	to_vmcb->save.last_excp_from = from_vmcb->save.last_excp_from;
	to_vmcb->save.last_excp_to = from_vmcb->save.last_excp_to;

	vmcb_mark_dirty(to_vmcb, VMCB_LBR);
}

static void svm_enable_lbrv(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->control.virt_ext |= LBR_CTL_ENABLE_MASK;
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 1, 1);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 1, 1);

	/* Move the LBR msrs to the vmcb02 so that the guest can see them. */
	if (is_guest_mode(vcpu))
		svm_copy_lbrs(svm->vmcb, svm->vmcb01.ptr);
}

static void svm_disable_lbrv(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb->control.virt_ext &= ~LBR_CTL_ENABLE_MASK;
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHFROMIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTBRANCHTOIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTFROMIP, 0, 0);
	set_msr_interception(vcpu, svm->msrpm, MSR_IA32_LASTINTTOIP, 0, 0);

	/*
	 * Move the LBR msrs back to the vmcb01 to avoid copying them
	 * on nested guest entries.
	 */
	if (is_guest_mode(vcpu))
		svm_copy_lbrs(svm->vmcb01.ptr, svm->vmcb);
}

static struct vmcb *svm_get_lbr_vmcb(struct vcpu_svm *svm)
{
	/*
	 * If LBR virtualization is disabled, the LBR MSRs are always kept in
	 * vmcb01. If LBR virtualization is enabled and L1 is running VMs of
	 * its own, the MSRs are moved between vmcb01 and vmcb02 as needed.
	 */
	return svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK ? svm->vmcb :
								   svm->vmcb01.ptr;
}

void svm_update_lbrv(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	bool current_enable_lbrv = svm->vmcb->control.virt_ext & LBR_CTL_ENABLE_MASK;
	bool enable_lbrv = (svm_get_lbr_vmcb(svm)->save.dbgctl & DEBUGCTLMSR_LBR) ||
			   (is_guest_mode(vcpu) && guest_can_use(vcpu, X86_FEATURE_LBRV) &&
			    (svm->nested.ctl.virt_ext & LBR_CTL_ENABLE_MASK));

	if (enable_lbrv == current_enable_lbrv)
		return;

	if (enable_lbrv)
		svm_enable_lbrv(vcpu);
	else
		svm_disable_lbrv(vcpu);
}

void disable_nmi_singlestep(struct vcpu_svm *svm)
{
	svm->nmi_singlestep = false;

	if (!(svm->vcpu.guest_debug & KVM_GUESTDBG_SINGLESTEP)) {
		/* Clear our flags if they were not set by the guest */
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_TF;
		if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF))
			svm->vmcb->save.rflags &= ~X86_EFLAGS_RF;
	}
}

static void grow_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	if (kvm_pause_in_guest(vcpu->kvm))
		return;

	control->pause_filter_count = __grow_ple_window(old,
							pause_filter_count,
							pause_filter_count_grow,
							pause_filter_count_max);

	if (control->pause_filter_count != old) {
		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    control->pause_filter_count, old);
	}
}

static void shrink_ple_window(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb_control_area *control = &svm->vmcb->control;
	int old = control->pause_filter_count;

	if (kvm_pause_in_guest(vcpu->kvm))
		return;

	control->pause_filter_count =
			__shrink_ple_window(old,
					    pause_filter_count,
					    pause_filter_count_shrink,
					    pause_filter_count);
	if (control->pause_filter_count != old) {
		vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
		trace_kvm_ple_window_update(vcpu->vcpu_id,
					    control->pause_filter_count, old);
	}
}

static void svm_hardware_unsetup(void)
{
	int cpu;

	sev_hardware_unsetup();

	for_each_possible_cpu(cpu)
		svm_cpu_uninit(cpu);

	__free_pages(pfn_to_page(iopm_base >> PAGE_SHIFT),
		     get_order(IOPM_SIZE));
	iopm_base = 0;
}

static void init_seg(struct vmcb_seg *seg)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | SVM_SELECTOR_S_MASK |
		      SVM_SELECTOR_WRITE_MASK; /* Read/Write Data Segment */
	seg->limit = 0xffff;
	seg->base = 0;
}

static void init_sys_seg(struct vmcb_seg *seg, uint32_t type)
{
	seg->selector = 0;
	seg->attrib = SVM_SELECTOR_P_MASK | type;
	seg->limit = 0xffff;
	seg->base = 0;
}

static u64 svm_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return svm->nested.ctl.tsc_offset;
}

static u64 svm_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	return svm->tsc_ratio_msr;
}

static void svm_write_tsc_offset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->vmcb01.ptr->control.tsc_offset = vcpu->arch.l1_tsc_offset;
	svm->vmcb->control.tsc_offset = vcpu->arch.tsc_offset;
	vmcb_mark_dirty(svm->vmcb, VMCB_INTERCEPTS);
}

void svm_write_tsc_multiplier(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	if (to_svm(vcpu)->guest_state_loaded)
		__svm_write_tsc_multiplier(vcpu->arch.tsc_scaling_ratio);
	preempt_enable();
}

/* Evaluate instruction intercepts that depend on guest CPUID features. */
static void svm_recalc_instruction_intercepts(struct kvm_vcpu *vcpu,
					      struct vcpu_svm *svm)
{
	/*
	 * Intercept INVPCID if shadow paging is enabled to sync/free shadow
	 * roots, or if INVPCID is disabled in the guest to inject #UD.
	 */
	if (kvm_cpu_cap_has(X86_FEATURE_INVPCID)) {
		if (!npt_enabled ||
		    !guest_cpuid_has(&svm->vcpu, X86_FEATURE_INVPCID))
			svm_set_intercept(svm, INTERCEPT_INVPCID);
		else
			svm_clr_intercept(svm, INTERCEPT_INVPCID);
	}

	if (kvm_cpu_cap_has(X86_FEATURE_RDTSCP)) {
		if (guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP))
			svm_clr_intercept(svm, INTERCEPT_RDTSCP);
		else
			svm_set_intercept(svm, INTERCEPT_RDTSCP);
	}
}

static inline void init_vmcb_after_set_cpuid(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	if (guest_cpuid_is_intel(vcpu)) {
		/*
		 * We must intercept SYSENTER_EIP and SYSENTER_ESP
		 * accesses because the processor only stores 32 bits.
		 * For the same reason we cannot use virtual VMLOAD/VMSAVE.
		 */
		svm_set_intercept(svm, INTERCEPT_VMLOAD);
		svm_set_intercept(svm, INTERCEPT_VMSAVE);
		svm->vmcb->control.virt_ext &= ~VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;

		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 0, 0);
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 0, 0);
	} else {
		/*
		 * If hardware supports Virtual VMLOAD VMSAVE then enable it
		 * in the VMCB and clear the intercepts to avoid #VMEXIT.
		 */
		if (vls) {
			svm_clr_intercept(svm, INTERCEPT_VMLOAD);
			svm_clr_intercept(svm, INTERCEPT_VMSAVE);
			svm->vmcb->control.virt_ext |= VIRTUAL_VMLOAD_VMSAVE_ENABLE_MASK;
		}
		/* No need to intercept these MSRs */
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_EIP, 1, 1);
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SYSENTER_ESP, 1, 1);
	}
}

static void init_vmcb(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);
	struct vmcb *vmcb = svm->vmcb01.ptr;
	struct vmcb_control_area *control = &vmcb->control;
	struct vmcb_save_area *save = &vmcb->save;

	svm_set_intercept(svm, INTERCEPT_CR0_READ);
	svm_set_intercept(svm, INTERCEPT_CR3_READ);
	svm_set_intercept(svm, INTERCEPT_CR4_READ);
	svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
	svm_set_intercept(svm, INTERCEPT_CR3_WRITE);
	svm_set_intercept(svm, INTERCEPT_CR4_WRITE);
	if (!kvm_vcpu_apicv_active(vcpu))
		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);

	set_dr_intercepts(svm);

	set_exception_intercept(svm, PF_VECTOR);
	set_exception_intercept(svm, UD_VECTOR);
	set_exception_intercept(svm, MC_VECTOR);
	set_exception_intercept(svm, AC_VECTOR);
	set_exception_intercept(svm, DB_VECTOR);
	/*
	 * Guest access to VMware backdoor ports could legitimately
	 * trigger #GP because of the TSS I/O permission bitmap.
	 * We intercept those #GPs and allow access to the ports anyway,
	 * as VMware does.
	 */
	if (enable_vmware_backdoor)
		set_exception_intercept(svm, GP_VECTOR);

	svm_set_intercept(svm, INTERCEPT_INTR);
	svm_set_intercept(svm, INTERCEPT_NMI);

	if (intercept_smi)
		svm_set_intercept(svm, INTERCEPT_SMI);

	svm_set_intercept(svm, INTERCEPT_SELECTIVE_CR0);
	svm_set_intercept(svm, INTERCEPT_RDPMC);
	svm_set_intercept(svm, INTERCEPT_CPUID);
	svm_set_intercept(svm, INTERCEPT_INVD);
	svm_set_intercept(svm, INTERCEPT_INVLPG);
	svm_set_intercept(svm, INTERCEPT_INVLPGA);
	svm_set_intercept(svm, INTERCEPT_IOIO_PROT);
	svm_set_intercept(svm, INTERCEPT_MSR_PROT);
	svm_set_intercept(svm, INTERCEPT_TASK_SWITCH);
	svm_set_intercept(svm, INTERCEPT_SHUTDOWN);
	svm_set_intercept(svm, INTERCEPT_VMRUN);
	svm_set_intercept(svm, INTERCEPT_VMMCALL);
	svm_set_intercept(svm, INTERCEPT_VMLOAD);
	svm_set_intercept(svm, INTERCEPT_VMSAVE);
	svm_set_intercept(svm, INTERCEPT_STGI);
	svm_set_intercept(svm, INTERCEPT_CLGI);
	svm_set_intercept(svm, INTERCEPT_SKINIT);
	svm_set_intercept(svm, INTERCEPT_WBINVD);
	svm_set_intercept(svm, INTERCEPT_XSETBV);
	svm_set_intercept(svm, INTERCEPT_RDPRU);
	svm_set_intercept(svm, INTERCEPT_RSM);

	if (!kvm_mwait_in_guest(vcpu->kvm)) {
		svm_set_intercept(svm, INTERCEPT_MONITOR);
		svm_set_intercept(svm, INTERCEPT_MWAIT);
	}

	if (!kvm_hlt_in_guest(vcpu->kvm))
		svm_set_intercept(svm, INTERCEPT_HLT);

	control->iopm_base_pa = __sme_set(iopm_base);
	control->msrpm_base_pa = __sme_set(__pa(svm->msrpm));
	control->int_ctl = V_INTR_MASKING_MASK;

	init_seg(&save->es);
	init_seg(&save->ss);
	init_seg(&save->ds);
	init_seg(&save->fs);
	init_seg(&save->gs);

	save->cs.selector = 0xf000;
	save->cs.base = 0xffff0000;
	/* Executable/Readable Code Segment */
	save->cs.attrib = SVM_SELECTOR_READ_MASK | SVM_SELECTOR_P_MASK |
			  SVM_SELECTOR_S_MASK | SVM_SELECTOR_CODE_MASK;
	save->cs.limit = 0xffff;

	save->gdtr.base = 0;
	save->gdtr.limit = 0xffff;
	save->idtr.base = 0;
	save->idtr.limit = 0xffff;

	init_sys_seg(&save->ldtr, SEG_TYPE_LDT);
	init_sys_seg(&save->tr, SEG_TYPE_BUSY_TSS16);

	if (npt_enabled) {
		/* Set up the VMCB for Nested Paging */
		control->nested_ctl |= SVM_NESTED_CTL_NP_ENABLE;
		svm_clr_intercept(svm, INTERCEPT_INVLPG);
		clr_exception_intercept(svm, PF_VECTOR);
		svm_clr_intercept(svm, INTERCEPT_CR3_READ);
		svm_clr_intercept(svm, INTERCEPT_CR3_WRITE);
		save->g_pat = vcpu->arch.pat;
		save->cr3 = 0;
	}
	svm->current_vmcb->asid_generation = 0;
	svm->asid = 0;

	svm->nested.vmcb12_gpa = INVALID_GPA;
	svm->nested.last_vmcb12_gpa = INVALID_GPA;

	if (!kvm_pause_in_guest(vcpu->kvm)) {
		control->pause_filter_count = pause_filter_count;
		if (pause_filter_thresh)
			control->pause_filter_thresh = pause_filter_thresh;
		svm_set_intercept(svm, INTERCEPT_PAUSE);
	} else {
		svm_clr_intercept(svm, INTERCEPT_PAUSE);
	}

	svm_recalc_instruction_intercepts(vcpu, svm);

	/*
	 * If the host supports V_SPEC_CTRL then disable the interception
	 * of MSR_IA32_SPEC_CTRL.
	 */
	if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL))
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);

	if (kvm_vcpu_apicv_active(vcpu))
		avic_init_vmcb(svm, vmcb);

	if (vnmi)
		svm->vmcb->control.int_ctl |= V_NMI_ENABLE_MASK;

	if (vgif) {
		svm_clr_intercept(svm, INTERCEPT_STGI);
		svm_clr_intercept(svm, INTERCEPT_CLGI);
		svm->vmcb->control.int_ctl |= V_GIF_ENABLE_MASK;
	}

	if (sev_guest(vcpu->kvm))
		sev_init_vmcb(svm);

	svm_hv_init_vmcb(vmcb);
	init_vmcb_after_set_cpuid(vcpu);

	vmcb_mark_all_dirty(vmcb);

	enable_gif(svm);
}

static void __svm_vcpu_reset(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm_vcpu_init_msrpm(vcpu, svm->msrpm);

	svm_init_osvw(vcpu);
	vcpu->arch.microcode_version = 0x01000065;
	svm->tsc_ratio_msr = kvm_caps.default_tsc_scaling_ratio;

	svm->nmi_masked = false;
	svm->awaiting_iret_completion = false;

	if (sev_es_guest(vcpu->kvm))
		sev_es_vcpu_reset(svm);
}

static void svm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	svm->spec_ctrl = 0;
	svm->virt_spec_ctrl = 0;

	init_vmcb(vcpu);

	if (!init_event)
		__svm_vcpu_reset(vcpu);
}
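
/*
 * Point both the cached vmcb info and the raw VMCB pointer at the target
 * (vmcb01 for L1, vmcb02 while running a nested guest).
 */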
void svm_switch_vmcb(struct vcpu_svm *svm, struct kvm_vmcb_info *target_vmcb)
{
	svm->current_vmcb = target_vmcb;
	svm->vmcb = target_vmcb->ptr;
}

static int svm_vcpu_create(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm;
	struct page *vmcb01_page;
	struct page *vmsa_page = NULL;
	int err;

	BUILD_BUG_ON(offsetof(struct vcpu_svm, vcpu) != 0);
	svm = to_svm(vcpu);

	err = -ENOMEM;
	vmcb01_page = snp_safe_alloc_page(vcpu);
	if (!vmcb01_page)
		goto out;

	if (sev_es_guest(vcpu->kvm)) {
		/*
		 * SEV-ES guests require a separate VMSA page used to contain
		 * the encrypted register state of the guest.
		 */
		vmsa_page = snp_safe_alloc_page(vcpu);
		if (!vmsa_page)
			goto error_free_vmcb_page;

		/*
		 * SEV-ES guests maintain an encrypted version of their FPU
		 * state which is restored and saved on VMRUN and VMEXIT.
		 * Mark vcpu->arch.guest_fpu->fpstate as scratch so it won't
		 * do xsave/xrstor on it.
		 */
		fpstate_set_confidential(&vcpu->arch.guest_fpu);
	}

	err = avic_init_vcpu(svm);
	if (err)
		goto error_free_vmsa_page;

	svm->msrpm = svm_vcpu_alloc_msrpm();
	if (!svm->msrpm) {
		err = -ENOMEM;
		goto error_free_vmsa_page;
	}

	svm->x2avic_msrs_intercepted = true;

	svm->vmcb01.ptr = page_address(vmcb01_page);
	svm->vmcb01.pa = __sme_set(page_to_pfn(vmcb01_page) << PAGE_SHIFT);
	svm_switch_vmcb(svm, &svm->vmcb01);

	if (vmsa_page)
		svm->sev_es.vmsa = page_address(vmsa_page);

	svm->guest_state_loaded = false;

	return 0;

error_free_vmsa_page:
	if (vmsa_page)
		__free_page(vmsa_page);
error_free_vmcb_page:
	__free_page(vmcb01_page);
out:
	return err;
}

static void svm_clear_current_vmcb(struct vmcb *vmcb)
{
	int i;

	for_each_online_cpu(i)
		cmpxchg(per_cpu_ptr(&svm_data.current_vmcb, i), vmcb, NULL);
}

static void svm_vcpu_free(struct kvm_vcpu *vcpu)
{
	struct vcpu_svm *svm = to_svm(vcpu);

	/*
	 * The vmcb page can be recycled, causing a false negative in
	 * svm_vcpu_load(). So, ensure that no logical CPU has this
	 * vmcb page recorded as its current vmcb.
	 */
	svm_clear_current_vmcb(svm->vmcb);

	svm_leave_nested(vcpu);
	svm_free_nested(svm);

	sev_free_vcpu(vcpu);

	__free_page(pfn_to_page(__sme_clr(svm->vmcb01.pa) >> PAGE_SHIFT));
	__free_pages(virt_to_page(svm->msrpm), get_order(MSRPM_SIZE));
}
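
/*
 * The SEV-ES host save state starts at offset 0x400 within the per-CPU
 * save area page, following the VMSAVE state; the same offset is used
 * when priming TSC_AUX in svm_hardware_enable().
 */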
1506 | static struct sev_es_save_area *sev_es_host_save_area(struct svm_cpu_data *sd) |
1507 | { |
1508 | return page_address(sd->save_area) + 0x400; |
1509 | } |
1510 | |
1511 | static void svm_prepare_switch_to_guest(struct kvm_vcpu *vcpu) |
1512 | { |
1513 | struct vcpu_svm *svm = to_svm(vcpu); |
1514 | struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); |
1515 | |
1516 | if (sev_es_guest(kvm: vcpu->kvm)) |
1517 | sev_es_unmap_ghcb(svm); |
1518 | |
1519 | if (svm->guest_state_loaded) |
1520 | return; |
1521 | |
1522 | /* |
1523 | * Save additional host state that will be restored on VMEXIT (sev-es) |
1524 | * or subsequent vmload of host save area. |
1525 | */ |
1526 | vmsave(pa: sd->save_area_pa); |
1527 | if (sev_es_guest(kvm: vcpu->kvm)) |
1528 | sev_es_prepare_switch_to_guest(hostsa: sev_es_host_save_area(sd)); |
1529 | |
1530 | if (tsc_scaling) |
1531 | __svm_write_tsc_multiplier(multiplier: vcpu->arch.tsc_scaling_ratio); |
1532 | |
1533 | /* |
1534 | * TSC_AUX is always virtualized for SEV-ES guests when the feature is |
1535 | * available. The user return MSR support is not required in this case |
1536 | * because TSC_AUX is restored on #VMEXIT from the host save area |
1537 | * (which has been initialized in svm_hardware_enable()). |
1538 | */ |
1539 | if (likely(tsc_aux_uret_slot >= 0) && |
1540 | (!boot_cpu_has(X86_FEATURE_V_TSC_AUX) || !sev_es_guest(kvm: vcpu->kvm))) |
1541 | kvm_set_user_return_msr(index: tsc_aux_uret_slot, val: svm->tsc_aux, mask: -1ull); |
1542 | |
1543 | svm->guest_state_loaded = true; |
1544 | } |
1545 | |
1546 | static void svm_prepare_host_switch(struct kvm_vcpu *vcpu) |
1547 | { |
1548 | to_svm(vcpu)->guest_state_loaded = false; |
1549 | } |
1550 | |
1551 | static void svm_vcpu_load(struct kvm_vcpu *vcpu, int cpu) |
1552 | { |
1553 | struct vcpu_svm *svm = to_svm(vcpu); |
1554 | struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, cpu); |
1555 | |
1556 | if (sd->current_vmcb != svm->vmcb) { |
1557 | sd->current_vmcb = svm->vmcb; |
1558 | |
1559 | if (!cpu_feature_enabled(X86_FEATURE_IBPB_ON_VMEXIT)) |
1560 | indirect_branch_prediction_barrier(); |
1561 | } |
1562 | if (kvm_vcpu_apicv_active(vcpu)) |
1563 | avic_vcpu_load(vcpu, cpu); |
1564 | } |
1565 | |
1566 | static void svm_vcpu_put(struct kvm_vcpu *vcpu) |
1567 | { |
1568 | if (kvm_vcpu_apicv_active(vcpu)) |
1569 | avic_vcpu_put(vcpu); |
1570 | |
1571 | svm_prepare_host_switch(vcpu); |
1572 | |
1573 | ++vcpu->stat.host_state_reload; |
1574 | } |
1575 | |
1576 | static unsigned long svm_get_rflags(struct kvm_vcpu *vcpu) |
1577 | { |
1578 | struct vcpu_svm *svm = to_svm(vcpu); |
1579 | unsigned long rflags = svm->vmcb->save.rflags; |
1580 | |
1581 | if (svm->nmi_singlestep) { |
1582 | /* Hide our flags if they were not set by the guest */ |
1583 | if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_TF)) |
1584 | rflags &= ~X86_EFLAGS_TF; |
1585 | if (!(svm->nmi_singlestep_guest_rflags & X86_EFLAGS_RF)) |
1586 | rflags &= ~X86_EFLAGS_RF; |
1587 | } |
1588 | return rflags; |
1589 | } |
1590 | |
1591 | static void svm_set_rflags(struct kvm_vcpu *vcpu, unsigned long rflags) |
1592 | { |
1593 | if (to_svm(vcpu)->nmi_singlestep) |
1594 | rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); |
1595 | |
1596 | /* |
1597 | * Any change of EFLAGS.VM is accompanied by a reload of SS |
1598 | * (caused by either a task switch or an inter-privilege IRET), |
1599 | * so we do not need to update the CPL here. |
1600 | */ |
1601 | to_svm(vcpu)->vmcb->save.rflags = rflags; |
1602 | } |
1603 | |
1604 | static bool svm_get_if_flag(struct kvm_vcpu *vcpu) |
1605 | { |
1606 | struct vmcb *vmcb = to_svm(vcpu)->vmcb; |
1607 | |
1608 | return sev_es_guest(kvm: vcpu->kvm) |
1609 | ? vmcb->control.int_state & SVM_GUEST_INTERRUPT_MASK |
1610 | : kvm_get_rflags(vcpu) & X86_EFLAGS_IF; |
1611 | } |
1612 | |
1613 | static void svm_cache_reg(struct kvm_vcpu *vcpu, enum kvm_reg reg) |
1614 | { |
1615 | kvm_register_mark_available(vcpu, reg); |
1616 | |
1617 | switch (reg) { |
1618 | case VCPU_EXREG_PDPTR: |
1619 | /* |
1620 | * When !npt_enabled, mmu->pdptrs[] is already available since |
1621 | * it is always updated per SDM when moving to CRs. |
1622 | */ |
1623 | if (npt_enabled) |
			load_pdptrs(vcpu, kvm_read_cr3(vcpu));
1625 | break; |
1626 | default: |
1627 | KVM_BUG_ON(1, vcpu->kvm); |
1628 | } |
1629 | } |
1630 | |
1631 | static void svm_set_vintr(struct vcpu_svm *svm) |
1632 | { |
1633 | struct vmcb_control_area *control; |
1634 | |
1635 | /* |
1636 | * The following fields are ignored when AVIC is enabled |
1637 | */ |
1638 | WARN_ON(kvm_vcpu_apicv_activated(&svm->vcpu)); |
1639 | |
	svm_set_intercept(svm, INTERCEPT_VINTR);
1641 | |
1642 | /* |
1643 | * Recalculating intercepts may have cleared the VINTR intercept. If |
1644 | * V_INTR_MASKING is enabled in vmcb12, then the effective RFLAGS.IF |
1645 | * for L1 physical interrupts is L1's RFLAGS.IF at the time of VMRUN. |
1646 | * Requesting an interrupt window if save.RFLAGS.IF=0 is pointless as |
1647 | * interrupts will never be unblocked while L2 is running. |
1648 | */ |
	if (!svm_is_intercept(svm, INTERCEPT_VINTR))
1650 | return; |
1651 | |
1652 | /* |
1653 | * This is just a dummy VINTR to actually cause a vmexit to happen. |
1654 | * Actual injection of virtual interrupts happens through EVENTINJ. |
1655 | */ |
1656 | control = &svm->vmcb->control; |
1657 | control->int_vector = 0x0; |
1658 | control->int_ctl &= ~V_INTR_PRIO_MASK; |
1659 | control->int_ctl |= V_IRQ_MASK | |
1660 | ((/*control->int_vector >> 4*/ 0xf) << V_INTR_PRIO_SHIFT); |
	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1662 | } |
1663 | |
1664 | static void svm_clear_vintr(struct vcpu_svm *svm) |
1665 | { |
	svm_clr_intercept(svm, INTERCEPT_VINTR);
1667 | |
1668 | /* Drop int_ctl fields related to VINTR injection. */ |
1669 | svm->vmcb->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; |
1670 | if (is_guest_mode(&svm->vcpu)) { |
1671 | svm->vmcb01.ptr->control.int_ctl &= ~V_IRQ_INJECTION_BITS_MASK; |
1672 | |
1673 | WARN_ON((svm->vmcb->control.int_ctl & V_TPR_MASK) != |
1674 | (svm->nested.ctl.int_ctl & V_TPR_MASK)); |
1675 | |
1676 | svm->vmcb->control.int_ctl |= svm->nested.ctl.int_ctl & |
1677 | V_IRQ_INJECTION_BITS_MASK; |
1678 | |
1679 | svm->vmcb->control.int_vector = svm->nested.ctl.int_vector; |
1680 | } |
1681 | |
	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
1683 | } |
1684 | |
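/*
 * Return the VMCB segment register backing @seg.  FS, GS, TR and LDTR are
 * always taken from vmcb01, as they are the segment registers managed by
 * VMLOAD/VMSAVE; the remaining segments come from the currently active
 * VMCB (vmcb02 when running a nested guest).
 */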
1685 | static struct vmcb_seg *svm_seg(struct kvm_vcpu *vcpu, int seg) |
1686 | { |
1687 | struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; |
1688 | struct vmcb_save_area *save01 = &to_svm(vcpu)->vmcb01.ptr->save; |
1689 | |
1690 | switch (seg) { |
1691 | case VCPU_SREG_CS: return &save->cs; |
1692 | case VCPU_SREG_DS: return &save->ds; |
1693 | case VCPU_SREG_ES: return &save->es; |
1694 | case VCPU_SREG_FS: return &save01->fs; |
1695 | case VCPU_SREG_GS: return &save01->gs; |
1696 | case VCPU_SREG_SS: return &save->ss; |
1697 | case VCPU_SREG_TR: return &save01->tr; |
1698 | case VCPU_SREG_LDTR: return &save01->ldtr; |
1699 | } |
1700 | BUG(); |
1701 | return NULL; |
1702 | } |
1703 | |
1704 | static u64 svm_get_segment_base(struct kvm_vcpu *vcpu, int seg) |
1705 | { |
1706 | struct vmcb_seg *s = svm_seg(vcpu, seg); |
1707 | |
1708 | return s->base; |
1709 | } |
1710 | |
1711 | static void svm_get_segment(struct kvm_vcpu *vcpu, |
1712 | struct kvm_segment *var, int seg) |
1713 | { |
1714 | struct vmcb_seg *s = svm_seg(vcpu, seg); |
1715 | |
1716 | var->base = s->base; |
1717 | var->limit = s->limit; |
1718 | var->selector = s->selector; |
1719 | var->type = s->attrib & SVM_SELECTOR_TYPE_MASK; |
1720 | var->s = (s->attrib >> SVM_SELECTOR_S_SHIFT) & 1; |
1721 | var->dpl = (s->attrib >> SVM_SELECTOR_DPL_SHIFT) & 3; |
1722 | var->present = (s->attrib >> SVM_SELECTOR_P_SHIFT) & 1; |
1723 | var->avl = (s->attrib >> SVM_SELECTOR_AVL_SHIFT) & 1; |
1724 | var->l = (s->attrib >> SVM_SELECTOR_L_SHIFT) & 1; |
1725 | var->db = (s->attrib >> SVM_SELECTOR_DB_SHIFT) & 1; |
1726 | |
1727 | /* |
1728 | * AMD CPUs circa 2014 track the G bit for all segments except CS. |
1729 | * However, the SVM spec states that the G bit is not observed by the |
1730 | * CPU, and some VMware virtual CPUs drop the G bit for all segments. |
1731 | * So let's synthesize a legal G bit for all segments, this helps |
1732 | * running KVM nested. It also helps cross-vendor migration, because |
1733 | * Intel's vmentry has a check on the 'G' bit. |
1734 | */ |
1735 | var->g = s->limit > 0xfffff; |
1736 | |
1737 | /* |
1738 | * AMD's VMCB does not have an explicit unusable field, so emulate it |
1739 | * for cross vendor migration purposes by "not present" |
1740 | */ |
1741 | var->unusable = !var->present; |
1742 | |
1743 | switch (seg) { |
1744 | case VCPU_SREG_TR: |
1745 | /* |
1746 | * Work around a bug where the busy flag in the tr selector |
1747 | * isn't exposed |
1748 | */ |
1749 | var->type |= 0x2; |
1750 | break; |
1751 | case VCPU_SREG_DS: |
1752 | case VCPU_SREG_ES: |
1753 | case VCPU_SREG_FS: |
1754 | case VCPU_SREG_GS: |
1755 | /* |
1756 | * The accessed bit must always be set in the segment |
1757 | * descriptor cache, although it can be cleared in the |
1758 | * descriptor, the cached bit always remains at 1. Since |
1759 | * Intel has a check on this, set it here to support |
1760 | * cross-vendor migration. |
1761 | */ |
1762 | if (!var->unusable) |
1763 | var->type |= 0x1; |
1764 | break; |
1765 | case VCPU_SREG_SS: |
1766 | /* |
1767 | * On AMD CPUs sometimes the DB bit in the segment |
1768 | * descriptor is left as 1, although the whole segment has |
1769 | * been made unusable. Clear it here to pass an Intel VMX |
1770 | * entry check when cross vendor migrating. |
1771 | */ |
1772 | if (var->unusable) |
1773 | var->db = 0; |
1774 | /* This is symmetric with svm_set_segment() */ |
1775 | var->dpl = to_svm(vcpu)->vmcb->save.cpl; |
1776 | break; |
1777 | } |
1778 | } |
1779 | |
1780 | static int svm_get_cpl(struct kvm_vcpu *vcpu) |
1781 | { |
1782 | struct vmcb_save_area *save = &to_svm(vcpu)->vmcb->save; |
1783 | |
1784 | return save->cpl; |
1785 | } |
1786 | |
1787 | static void svm_get_cs_db_l_bits(struct kvm_vcpu *vcpu, int *db, int *l) |
1788 | { |
1789 | struct kvm_segment cs; |
1790 | |
	svm_get_segment(vcpu, &cs, VCPU_SREG_CS);
1792 | *db = cs.db; |
1793 | *l = cs.l; |
1794 | } |
1795 | |
1796 | static void svm_get_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) |
1797 | { |
1798 | struct vcpu_svm *svm = to_svm(vcpu); |
1799 | |
1800 | dt->size = svm->vmcb->save.idtr.limit; |
1801 | dt->address = svm->vmcb->save.idtr.base; |
1802 | } |
1803 | |
1804 | static void svm_set_idt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) |
1805 | { |
1806 | struct vcpu_svm *svm = to_svm(vcpu); |
1807 | |
1808 | svm->vmcb->save.idtr.limit = dt->size; |
	svm->vmcb->save.idtr.base = dt->address;
	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1811 | } |
1812 | |
1813 | static void svm_get_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) |
1814 | { |
1815 | struct vcpu_svm *svm = to_svm(vcpu); |
1816 | |
1817 | dt->size = svm->vmcb->save.gdtr.limit; |
1818 | dt->address = svm->vmcb->save.gdtr.base; |
1819 | } |
1820 | |
1821 | static void svm_set_gdt(struct kvm_vcpu *vcpu, struct desc_ptr *dt) |
1822 | { |
1823 | struct vcpu_svm *svm = to_svm(vcpu); |
1824 | |
1825 | svm->vmcb->save.gdtr.limit = dt->size; |
	svm->vmcb->save.gdtr.base = dt->address;
	vmcb_mark_dirty(svm->vmcb, VMCB_DT);
1828 | } |
1829 | |
1830 | static void sev_post_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3) |
1831 | { |
1832 | struct vcpu_svm *svm = to_svm(vcpu); |
1833 | |
1834 | /* |
1835 | * For guests that don't set guest_state_protected, the cr3 update is |
1836 | * handled via kvm_mmu_load() while entering the guest. For guests |
1837 | * that do (SEV-ES/SEV-SNP), the cr3 update needs to be written to |
1838 | * VMCB save area now, since the save area will become the initial |
1839 | * contents of the VMSA, and future VMCB save area updates won't be |
1840 | * seen. |
1841 | */ |
	if (sev_es_guest(vcpu->kvm)) {
		svm->vmcb->save.cr3 = cr3;
		vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1845 | } |
1846 | } |
1847 | |
1848 | static bool svm_is_valid_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
1849 | { |
1850 | return true; |
1851 | } |
1852 | |
1853 | void svm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0) |
1854 | { |
1855 | struct vcpu_svm *svm = to_svm(vcpu); |
1856 | u64 hcr0 = cr0; |
1857 | bool old_paging = is_paging(vcpu); |
1858 | |
1859 | #ifdef CONFIG_X86_64 |
1860 | if (vcpu->arch.efer & EFER_LME) { |
1861 | if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) { |
1862 | vcpu->arch.efer |= EFER_LMA; |
1863 | if (!vcpu->arch.guest_state_protected) |
1864 | svm->vmcb->save.efer |= EFER_LMA | EFER_LME; |
1865 | } |
1866 | |
1867 | if (is_paging(vcpu) && !(cr0 & X86_CR0_PG)) { |
1868 | vcpu->arch.efer &= ~EFER_LMA; |
1869 | if (!vcpu->arch.guest_state_protected) |
1870 | svm->vmcb->save.efer &= ~(EFER_LMA | EFER_LME); |
1871 | } |
1872 | } |
1873 | #endif |
1874 | vcpu->arch.cr0 = cr0; |
1875 | |
1876 | if (!npt_enabled) { |
1877 | hcr0 |= X86_CR0_PG | X86_CR0_WP; |
1878 | if (old_paging != is_paging(vcpu)) |
			svm_set_cr4(vcpu, kvm_read_cr4(vcpu));
1880 | } |
1881 | |
1882 | /* |
1883 | * re-enable caching here because the QEMU bios |
1884 | * does not do it - this results in some delay at |
1885 | * reboot |
1886 | */ |
1887 | if (kvm_check_has_quirk(vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED)) |
1888 | hcr0 &= ~(X86_CR0_CD | X86_CR0_NW); |
1889 | |
1890 | svm->vmcb->save.cr0 = hcr0; |
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
1892 | |
1893 | /* |
1894 | * SEV-ES guests must always keep the CR intercepts cleared. CR |
1895 | * tracking is done using the CR write traps. |
1896 | */ |
	if (sev_es_guest(vcpu->kvm))
1898 | return; |
1899 | |
1900 | if (hcr0 == cr0) { |
1901 | /* Selective CR0 write remains on. */ |
		svm_clr_intercept(svm, INTERCEPT_CR0_READ);
		svm_clr_intercept(svm, INTERCEPT_CR0_WRITE);
	} else {
		svm_set_intercept(svm, INTERCEPT_CR0_READ);
		svm_set_intercept(svm, INTERCEPT_CR0_WRITE);
1907 | } |
1908 | } |
1909 | |
1910 | static bool svm_is_valid_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
1911 | { |
1912 | return true; |
1913 | } |
1914 | |
1915 | void svm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4) |
1916 | { |
1917 | unsigned long host_cr4_mce = cr4_read_shadow() & X86_CR4_MCE; |
1918 | unsigned long old_cr4 = vcpu->arch.cr4; |
1919 | |
1920 | if (npt_enabled && ((old_cr4 ^ cr4) & X86_CR4_PGE)) |
1921 | svm_flush_tlb_current(vcpu); |
1922 | |
1923 | vcpu->arch.cr4 = cr4; |
1924 | if (!npt_enabled) { |
1925 | cr4 |= X86_CR4_PAE; |
1926 | |
1927 | if (!is_paging(vcpu)) |
1928 | cr4 &= ~(X86_CR4_SMEP | X86_CR4_SMAP | X86_CR4_PKE); |
1929 | } |
1930 | cr4 |= host_cr4_mce; |
1931 | to_svm(vcpu)->vmcb->save.cr4 = cr4; |
	vmcb_mark_dirty(to_svm(vcpu)->vmcb, VMCB_CR);
1933 | |
1934 | if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE | X86_CR4_PKE)) |
1935 | kvm_update_cpuid_runtime(vcpu); |
1936 | } |
1937 | |
1938 | static void svm_set_segment(struct kvm_vcpu *vcpu, |
1939 | struct kvm_segment *var, int seg) |
1940 | { |
1941 | struct vcpu_svm *svm = to_svm(vcpu); |
1942 | struct vmcb_seg *s = svm_seg(vcpu, seg); |
1943 | |
1944 | s->base = var->base; |
1945 | s->limit = var->limit; |
1946 | s->selector = var->selector; |
1947 | s->attrib = (var->type & SVM_SELECTOR_TYPE_MASK); |
1948 | s->attrib |= (var->s & 1) << SVM_SELECTOR_S_SHIFT; |
1949 | s->attrib |= (var->dpl & 3) << SVM_SELECTOR_DPL_SHIFT; |
1950 | s->attrib |= ((var->present & 1) && !var->unusable) << SVM_SELECTOR_P_SHIFT; |
1951 | s->attrib |= (var->avl & 1) << SVM_SELECTOR_AVL_SHIFT; |
1952 | s->attrib |= (var->l & 1) << SVM_SELECTOR_L_SHIFT; |
1953 | s->attrib |= (var->db & 1) << SVM_SELECTOR_DB_SHIFT; |
1954 | s->attrib |= (var->g & 1) << SVM_SELECTOR_G_SHIFT; |
1955 | |
1956 | /* |
1957 | * This is always accurate, except if SYSRET returned to a segment |
1958 | * with SS.DPL != 3. Intel does not have this quirk, and always |
1959 | * forces SS.DPL to 3 on sysret, so we ignore that case; fixing it |
1960 | * would entail passing the CPL to userspace and back. |
1961 | */ |
1962 | if (seg == VCPU_SREG_SS) |
1963 | /* This is symmetric with svm_get_segment() */ |
1964 | svm->vmcb->save.cpl = (var->dpl & 3); |
1965 | |
	vmcb_mark_dirty(svm->vmcb, VMCB_SEG);
1967 | } |
1968 | |
1969 | static void svm_update_exception_bitmap(struct kvm_vcpu *vcpu) |
1970 | { |
1971 | struct vcpu_svm *svm = to_svm(vcpu); |
1972 | |
1973 | clr_exception_intercept(svm, BP_VECTOR); |
1974 | |
1975 | if (vcpu->guest_debug & KVM_GUESTDBG_ENABLE) { |
1976 | if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP) |
1977 | set_exception_intercept(svm, BP_VECTOR); |
1978 | } |
1979 | } |
1980 | |
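/*
 * Allocate the next free ASID on this CPU.  When the pool is exhausted,
 * bump the generation, restart at min_asid and request a full TLB flush
 * on the next VMRUN so that stale translations of recycled ASIDs are
 * evicted.
 */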
1981 | static void new_asid(struct vcpu_svm *svm, struct svm_cpu_data *sd) |
1982 | { |
1983 | if (sd->next_asid > sd->max_asid) { |
1984 | ++sd->asid_generation; |
1985 | sd->next_asid = sd->min_asid; |
1986 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ALL_ASID; |
		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
1988 | } |
1989 | |
1990 | svm->current_vmcb->asid_generation = sd->asid_generation; |
1991 | svm->asid = sd->next_asid++; |
1992 | } |
1993 | |
1994 | static void svm_set_dr6(struct vcpu_svm *svm, unsigned long value) |
1995 | { |
1996 | struct vmcb *vmcb = svm->vmcb; |
1997 | |
1998 | if (svm->vcpu.arch.guest_state_protected) |
1999 | return; |
2000 | |
2001 | if (unlikely(value != vmcb->save.dr6)) { |
2002 | vmcb->save.dr6 = value; |
		vmcb_mark_dirty(vmcb, VMCB_DR);
2004 | } |
2005 | } |
2006 | |
2007 | static void svm_sync_dirty_debug_regs(struct kvm_vcpu *vcpu) |
2008 | { |
2009 | struct vcpu_svm *svm = to_svm(vcpu); |
2010 | |
2011 | if (WARN_ON_ONCE(sev_es_guest(vcpu->kvm))) |
2012 | return; |
2013 | |
2014 | get_debugreg(vcpu->arch.db[0], 0); |
2015 | get_debugreg(vcpu->arch.db[1], 1); |
2016 | get_debugreg(vcpu->arch.db[2], 2); |
2017 | get_debugreg(vcpu->arch.db[3], 3); |
2018 | /* |
2019 | * We cannot reset svm->vmcb->save.dr6 to DR6_ACTIVE_LOW here, |
2020 | * because db_interception might need it. We can do it before vmentry. |
2021 | */ |
2022 | vcpu->arch.dr6 = svm->vmcb->save.dr6; |
2023 | vcpu->arch.dr7 = svm->vmcb->save.dr7; |
2024 | vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT; |
2025 | set_dr_intercepts(svm); |
2026 | } |
2027 | |
2028 | static void svm_set_dr7(struct kvm_vcpu *vcpu, unsigned long value) |
2029 | { |
2030 | struct vcpu_svm *svm = to_svm(vcpu); |
2031 | |
2032 | if (vcpu->arch.guest_state_protected) |
2033 | return; |
2034 | |
2035 | svm->vmcb->save.dr7 = value; |
	vmcb_mark_dirty(svm->vmcb, VMCB_DR);
2037 | } |
2038 | |
2039 | static int pf_interception(struct kvm_vcpu *vcpu) |
2040 | { |
2041 | struct vcpu_svm *svm = to_svm(vcpu); |
2042 | |
2043 | u64 fault_address = svm->vmcb->control.exit_info_2; |
2044 | u64 error_code = svm->vmcb->control.exit_info_1; |
2045 | |
2046 | return kvm_handle_page_fault(vcpu, error_code, fault_address, |
2047 | static_cpu_has(X86_FEATURE_DECODEASSISTS) ? |
2048 | svm->vmcb->control.insn_bytes : NULL, |
2049 | svm->vmcb->control.insn_len); |
2050 | } |
2051 | |
2052 | static int npf_interception(struct kvm_vcpu *vcpu) |
2053 | { |
2054 | struct vcpu_svm *svm = to_svm(vcpu); |
2055 | |
2056 | u64 fault_address = svm->vmcb->control.exit_info_2; |
2057 | u64 error_code = svm->vmcb->control.exit_info_1; |
2058 | |
2059 | trace_kvm_page_fault(vcpu, fault_address, error_code); |
	return kvm_mmu_page_fault(vcpu, fault_address, error_code,
			static_cpu_has(X86_FEATURE_DECODEASSISTS) ?
			svm->vmcb->control.insn_bytes : NULL,
			svm->vmcb->control.insn_len);
2064 | } |
2065 | |
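/*
 * Handle an intercepted #DB.  If neither the host is debugging the guest
 * nor KVM is single-stepping over an NMI window, the exception is simply
 * reflected into the guest with the DR6 payload; otherwise the exit is
 * forwarded to userspace as KVM_EXIT_DEBUG.
 */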
2066 | static int db_interception(struct kvm_vcpu *vcpu) |
2067 | { |
2068 | struct kvm_run *kvm_run = vcpu->run; |
2069 | struct vcpu_svm *svm = to_svm(vcpu); |
2070 | |
2071 | if (!(vcpu->guest_debug & |
2072 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && |
2073 | !svm->nmi_singlestep) { |
2074 | u32 payload = svm->vmcb->save.dr6 ^ DR6_ACTIVE_LOW; |
2075 | kvm_queue_exception_p(vcpu, DB_VECTOR, payload); |
2076 | return 1; |
2077 | } |
2078 | |
2079 | if (svm->nmi_singlestep) { |
2080 | disable_nmi_singlestep(svm); |
2081 | /* Make sure we check for pending NMIs upon entry */ |
2082 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
2083 | } |
2084 | |
2085 | if (vcpu->guest_debug & |
2086 | (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) { |
2087 | kvm_run->exit_reason = KVM_EXIT_DEBUG; |
2088 | kvm_run->debug.arch.dr6 = svm->vmcb->save.dr6; |
2089 | kvm_run->debug.arch.dr7 = svm->vmcb->save.dr7; |
2090 | kvm_run->debug.arch.pc = |
2091 | svm->vmcb->save.cs.base + svm->vmcb->save.rip; |
2092 | kvm_run->debug.arch.exception = DB_VECTOR; |
2093 | return 0; |
2094 | } |
2095 | |
2096 | return 1; |
2097 | } |
2098 | |
2099 | static int bp_interception(struct kvm_vcpu *vcpu) |
2100 | { |
2101 | struct vcpu_svm *svm = to_svm(vcpu); |
2102 | struct kvm_run *kvm_run = vcpu->run; |
2103 | |
2104 | kvm_run->exit_reason = KVM_EXIT_DEBUG; |
2105 | kvm_run->debug.arch.pc = svm->vmcb->save.cs.base + svm->vmcb->save.rip; |
2106 | kvm_run->debug.arch.exception = BP_VECTOR; |
2107 | return 0; |
2108 | } |
2109 | |
2110 | static int ud_interception(struct kvm_vcpu *vcpu) |
2111 | { |
2112 | return handle_ud(vcpu); |
2113 | } |
2114 | |
2115 | static int ac_interception(struct kvm_vcpu *vcpu) |
2116 | { |
	kvm_queue_exception_e(vcpu, AC_VECTOR, 0);
2118 | return 1; |
2119 | } |
2120 | |
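/*
 * Check whether an intercepted #MC matches the signature of AMD erratum
 * 383 (MC0_STATUS == 0xb600000000010015, ignoring bit 62).  If so, clear
 * the MCi_STATUS banks and MCG_STATUS.MCIP, and flush the TLB to evict
 * the erroneous multi-match entries, before reporting the erratum to the
 * caller.
 */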
2121 | static bool is_erratum_383(void) |
2122 | { |
2123 | int err, i; |
2124 | u64 value; |
2125 | |
2126 | if (!erratum_383_found) |
2127 | return false; |
2128 | |
	value = native_read_msr_safe(MSR_IA32_MC0_STATUS, &err);
2130 | if (err) |
2131 | return false; |
2132 | |
2133 | /* Bit 62 may or may not be set for this mce */ |
2134 | value &= ~(1ULL << 62); |
2135 | |
2136 | if (value != 0xb600000000010015ULL) |
2137 | return false; |
2138 | |
2139 | /* Clear MCi_STATUS registers */ |
2140 | for (i = 0; i < 6; ++i) |
		native_write_msr_safe(MSR_IA32_MCx_STATUS(i), 0, 0);
2142 | |
	value = native_read_msr_safe(MSR_IA32_MCG_STATUS, &err);
2144 | if (!err) { |
2145 | u32 low, high; |
2146 | |
2147 | value &= ~(1ULL << 2); |
2148 | low = lower_32_bits(value); |
2149 | high = upper_32_bits(value); |
2150 | |
2151 | native_write_msr_safe(MSR_IA32_MCG_STATUS, low, high); |
2152 | } |
2153 | |
2154 | /* Flush tlb to evict multi-match entries */ |
2155 | __flush_tlb_all(); |
2156 | |
2157 | return true; |
2158 | } |
2159 | |
2160 | static void svm_handle_mce(struct kvm_vcpu *vcpu) |
2161 | { |
2162 | if (is_erratum_383()) { |
2163 | /* |
2164 | * Erratum 383 triggered. Guest state is corrupt so kill the |
2165 | * guest. |
2166 | */ |
		pr_err("Guest triggered AMD Erratum 383\n");
2168 | |
2169 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
2170 | |
2171 | return; |
2172 | } |
2173 | |
2174 | /* |
2175 | * On an #MC intercept the MCE handler is not called automatically in |
2176 | * the host. So do it by hand here. |
2177 | */ |
2178 | kvm_machine_check(); |
2179 | } |
2180 | |
2181 | static int mc_interception(struct kvm_vcpu *vcpu) |
2182 | { |
2183 | return 1; |
2184 | } |
2185 | |
2186 | static int shutdown_interception(struct kvm_vcpu *vcpu) |
2187 | { |
2188 | struct kvm_run *kvm_run = vcpu->run; |
	struct vcpu_svm *svm = to_svm(vcpu);

2192 | /* |
2193 | * VMCB is undefined after a SHUTDOWN intercept. INIT the vCPU to put |
	 * the VMCB in a known good state.  Unfortunately, KVM doesn't have
	 * KVM_MP_STATE_SHUTDOWN and can't add it without potentially breaking
	 * userspace.  At the platform level, INIT is acceptable behavior as
2197 | * there exist bare metal platforms that automatically INIT the CPU |
2198 | * in response to shutdown. |
2199 | * |
2200 | * The VM save area for SEV-ES guests has already been encrypted so it |
2201 | * cannot be reinitialized, i.e. synthesizing INIT is futile. |
2202 | */ |
	if (!sev_es_guest(vcpu->kvm)) {
		clear_page(svm->vmcb);
		kvm_vcpu_reset(vcpu, true);
2206 | } |
2207 | |
2208 | kvm_run->exit_reason = KVM_EXIT_SHUTDOWN; |
2209 | return 0; |
2210 | } |
2211 | |
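/*
 * Handle an intercepted IN/OUT.  String I/O goes through the emulator (or
 * the dedicated SEV-ES helper, since the emulator cannot read encrypted
 * guest memory); everything else takes the fast PIO path, with next_rip
 * supplied by exit_info_2.
 */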
2212 | static int io_interception(struct kvm_vcpu *vcpu) |
2213 | { |
2214 | struct vcpu_svm *svm = to_svm(vcpu); |
2215 | u32 io_info = svm->vmcb->control.exit_info_1; /* address size bug? */ |
2216 | int size, in, string; |
2217 | unsigned port; |
2218 | |
2219 | ++vcpu->stat.io_exits; |
2220 | string = (io_info & SVM_IOIO_STR_MASK) != 0; |
2221 | in = (io_info & SVM_IOIO_TYPE_MASK) != 0; |
2222 | port = io_info >> 16; |
2223 | size = (io_info & SVM_IOIO_SIZE_MASK) >> SVM_IOIO_SIZE_SHIFT; |
2224 | |
2225 | if (string) { |
		if (sev_es_guest(vcpu->kvm))
			return sev_es_string_io(svm, size, port, in);
		else
			return kvm_emulate_instruction(vcpu, 0);
2230 | } |
2231 | |
2232 | svm->next_rip = svm->vmcb->control.exit_info_2; |
2233 | |
2234 | return kvm_fast_pio(vcpu, size, port, in); |
2235 | } |
2236 | |
2237 | static int nmi_interception(struct kvm_vcpu *vcpu) |
2238 | { |
2239 | return 1; |
2240 | } |
2241 | |
2242 | static int smi_interception(struct kvm_vcpu *vcpu) |
2243 | { |
2244 | return 1; |
2245 | } |
2246 | |
2247 | static int intr_interception(struct kvm_vcpu *vcpu) |
2248 | { |
2249 | ++vcpu->stat.irq_exits; |
2250 | return 1; |
2251 | } |
2252 | |
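/*
 * Common handler for VMLOAD and VMSAVE.  The guest physical address of
 * vmcb12 is taken from RAX; depending on the direction, the
 * VMLOAD/VMSAVE-managed state is copied either from vmcb12 into the
 * current VMCB or back out to vmcb12.
 */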
2253 | static int vmload_vmsave_interception(struct kvm_vcpu *vcpu, bool vmload) |
2254 | { |
2255 | struct vcpu_svm *svm = to_svm(vcpu); |
2256 | struct vmcb *vmcb12; |
2257 | struct kvm_host_map map; |
2258 | int ret; |
2259 | |
2260 | if (nested_svm_check_permissions(vcpu)) |
2261 | return 1; |
2262 | |
	ret = kvm_vcpu_map(vcpu, gpa_to_gfn(svm->vmcb->save.rax), &map);
	if (ret) {
		if (ret == -EINVAL)
			kvm_inject_gp(vcpu, 0);
2267 | return 1; |
2268 | } |
2269 | |
2270 | vmcb12 = map.hva; |
2271 | |
2272 | ret = kvm_skip_emulated_instruction(vcpu); |
2273 | |
2274 | if (vmload) { |
		svm_copy_vmloadsave_state(svm->vmcb, vmcb12);
		svm->sysenter_eip_hi = 0;
		svm->sysenter_esp_hi = 0;
	} else {
		svm_copy_vmloadsave_state(vmcb12, svm->vmcb);
	}

	kvm_vcpu_unmap(vcpu, &map, true);
2283 | |
2284 | return ret; |
2285 | } |
2286 | |
2287 | static int vmload_interception(struct kvm_vcpu *vcpu) |
2288 | { |
	return vmload_vmsave_interception(vcpu, true);
2290 | } |
2291 | |
2292 | static int vmsave_interception(struct kvm_vcpu *vcpu) |
2293 | { |
	return vmload_vmsave_interception(vcpu, false);
2295 | } |
2296 | |
2297 | static int vmrun_interception(struct kvm_vcpu *vcpu) |
2298 | { |
2299 | if (nested_svm_check_permissions(vcpu)) |
2300 | return 1; |
2301 | |
2302 | return nested_svm_vmrun(vcpu); |
2303 | } |
2304 | |
2305 | enum { |
2306 | NONE_SVM_INSTR, |
2307 | SVM_INSTR_VMRUN, |
2308 | SVM_INSTR_VMLOAD, |
2309 | SVM_INSTR_VMSAVE, |
2310 | }; |
2311 | |
2312 | /* Return NONE_SVM_INSTR if not SVM instrs, otherwise return decode result */ |
2313 | static int svm_instr_opcode(struct kvm_vcpu *vcpu) |
2314 | { |
2315 | struct x86_emulate_ctxt *ctxt = vcpu->arch.emulate_ctxt; |
2316 | |
2317 | if (ctxt->b != 0x1 || ctxt->opcode_len != 2) |
2318 | return NONE_SVM_INSTR; |
2319 | |
2320 | switch (ctxt->modrm) { |
2321 | case 0xd8: /* VMRUN */ |
2322 | return SVM_INSTR_VMRUN; |
2323 | case 0xda: /* VMLOAD */ |
2324 | return SVM_INSTR_VMLOAD; |
2325 | case 0xdb: /* VMSAVE */ |
2326 | return SVM_INSTR_VMSAVE; |
2327 | default: |
2328 | break; |
2329 | } |
2330 | |
2331 | return NONE_SVM_INSTR; |
2332 | } |
2333 | |
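/*
 * Emulate a VMRUN/VMLOAD/VMSAVE that raised #GP (see gp_interception()).
 * If L2 executed the instruction, synthesize the corresponding nested
 * VM-Exit to L1 instead of handling the instruction in L0.
 */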
2334 | static int emulate_svm_instr(struct kvm_vcpu *vcpu, int opcode) |
2335 | { |
2336 | const int guest_mode_exit_codes[] = { |
2337 | [SVM_INSTR_VMRUN] = SVM_EXIT_VMRUN, |
2338 | [SVM_INSTR_VMLOAD] = SVM_EXIT_VMLOAD, |
2339 | [SVM_INSTR_VMSAVE] = SVM_EXIT_VMSAVE, |
2340 | }; |
2341 | int (*const svm_instr_handlers[])(struct kvm_vcpu *vcpu) = { |
2342 | [SVM_INSTR_VMRUN] = vmrun_interception, |
2343 | [SVM_INSTR_VMLOAD] = vmload_interception, |
2344 | [SVM_INSTR_VMSAVE] = vmsave_interception, |
2345 | }; |
2346 | struct vcpu_svm *svm = to_svm(vcpu); |
2347 | int ret; |
2348 | |
2349 | if (is_guest_mode(vcpu)) { |
2350 | /* Returns '1' or -errno on failure, '0' on success. */ |
		ret = nested_svm_simple_vmexit(svm, guest_mode_exit_codes[opcode]);
2352 | if (ret) |
2353 | return ret; |
2354 | return 1; |
2355 | } |
2356 | return svm_instr_handlers[opcode](vcpu); |
2357 | } |
2358 | |
2359 | /* |
2360 | * #GP handling code. Note that #GP can be triggered under the following two |
2361 | * cases: |
2362 | * 1) SVM VM-related instructions (VMRUN/VMSAVE/VMLOAD) that trigger #GP on |
 * some AMD CPUs when the EAX of these instructions is in a reserved
 * memory region (e.g. SMM memory on host).
2365 | * 2) VMware backdoor |
2366 | */ |
2367 | static int gp_interception(struct kvm_vcpu *vcpu) |
2368 | { |
2369 | struct vcpu_svm *svm = to_svm(vcpu); |
2370 | u32 error_code = svm->vmcb->control.exit_info_1; |
2371 | int opcode; |
2372 | |
2373 | /* Both #GP cases have zero error_code */ |
2374 | if (error_code) |
2375 | goto reinject; |
2376 | |
2377 | /* Decode the instruction for usage later */ |
2378 | if (x86_decode_emulated_instruction(vcpu, 0, NULL, 0) != EMULATION_OK) |
2379 | goto reinject; |
2380 | |
2381 | opcode = svm_instr_opcode(vcpu); |
2382 | |
2383 | if (opcode == NONE_SVM_INSTR) { |
2384 | if (!enable_vmware_backdoor) |
2385 | goto reinject; |
2386 | |
2387 | /* |
2388 | * VMware backdoor emulation on #GP interception only handles |
2389 | * IN{S}, OUT{S}, and RDPMC. |
2390 | */ |
2391 | if (!is_guest_mode(vcpu)) |
2392 | return kvm_emulate_instruction(vcpu, |
2393 | EMULTYPE_VMWARE_GP | EMULTYPE_NO_DECODE); |
2394 | } else { |
2395 | /* All SVM instructions expect page aligned RAX */ |
2396 | if (svm->vmcb->save.rax & ~PAGE_MASK) |
2397 | goto reinject; |
2398 | |
2399 | return emulate_svm_instr(vcpu, opcode); |
2400 | } |
2401 | |
2402 | reinject: |
2403 | kvm_queue_exception_e(vcpu, GP_VECTOR, error_code); |
2404 | return 1; |
2405 | } |
2406 | |
2407 | void svm_set_gif(struct vcpu_svm *svm, bool value) |
2408 | { |
2409 | if (value) { |
2410 | /* |
2411 | * If VGIF is enabled, the STGI intercept is only added to |
2412 | * detect the opening of the SMI/NMI window; remove it now. |
2413 | * Likewise, clear the VINTR intercept, we will set it |
2414 | * again while processing KVM_REQ_EVENT if needed. |
2415 | */ |
2416 | if (vgif) |
			svm_clr_intercept(svm, INTERCEPT_STGI);
		if (svm_is_intercept(svm, INTERCEPT_VINTR))
			svm_clear_vintr(svm);
2420 | |
2421 | enable_gif(svm); |
2422 | if (svm->vcpu.arch.smi_pending || |
2423 | svm->vcpu.arch.nmi_pending || |
		    kvm_cpu_has_injectable_intr(&svm->vcpu) ||
		    kvm_apic_has_pending_init_or_sipi(&svm->vcpu))
			kvm_make_request(KVM_REQ_EVENT, &svm->vcpu);
2427 | } else { |
2428 | disable_gif(svm); |
2429 | |
2430 | /* |
2431 | * After a CLGI no interrupts should come. But if vGIF is |
2432 | * in use, we still rely on the VINTR intercept (rather than |
2433 | * STGI) to detect an open interrupt window. |
2434 | */ |
2435 | if (!vgif) |
2436 | svm_clear_vintr(svm); |
2437 | } |
2438 | } |
2439 | |
2440 | static int stgi_interception(struct kvm_vcpu *vcpu) |
2441 | { |
2442 | int ret; |
2443 | |
2444 | if (nested_svm_check_permissions(vcpu)) |
2445 | return 1; |
2446 | |
2447 | ret = kvm_skip_emulated_instruction(vcpu); |
	svm_set_gif(to_svm(vcpu), true);
2449 | return ret; |
2450 | } |
2451 | |
2452 | static int clgi_interception(struct kvm_vcpu *vcpu) |
2453 | { |
2454 | int ret; |
2455 | |
2456 | if (nested_svm_check_permissions(vcpu)) |
2457 | return 1; |
2458 | |
2459 | ret = kvm_skip_emulated_instruction(vcpu); |
	svm_set_gif(to_svm(vcpu), false);
2461 | return ret; |
2462 | } |
2463 | |
2464 | static int invlpga_interception(struct kvm_vcpu *vcpu) |
2465 | { |
2466 | gva_t gva = kvm_rax_read(vcpu); |
2467 | u32 asid = kvm_rcx_read(vcpu); |
2468 | |
2469 | /* FIXME: Handle an address size prefix. */ |
2470 | if (!is_long_mode(vcpu)) |
2471 | gva = (u32)gva; |
2472 | |
2473 | trace_kvm_invlpga(to_svm(vcpu)->vmcb->save.rip, asid, gva); |
2474 | |
2475 | /* Let's treat INVLPGA the same as INVLPG (can be optimized!) */ |
2476 | kvm_mmu_invlpg(vcpu, gva); |
2477 | |
2478 | return kvm_skip_emulated_instruction(vcpu); |
2479 | } |
2480 | |
2481 | static int skinit_interception(struct kvm_vcpu *vcpu) |
2482 | { |
2483 | trace_kvm_skinit(to_svm(vcpu)->vmcb->save.rip, kvm_rax_read(vcpu)); |
2484 | |
2485 | kvm_queue_exception(vcpu, UD_VECTOR); |
2486 | return 1; |
2487 | } |
2488 | |
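/*
 * Handle a task switch intercept: recover the TSS selector and switch
 * reason from exit_info_1/exit_info_2, drop any event that was being
 * delivered through a task gate, and let the common emulator perform the
 * actual task switch.
 */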
2489 | static int task_switch_interception(struct kvm_vcpu *vcpu) |
2490 | { |
2491 | struct vcpu_svm *svm = to_svm(vcpu); |
2492 | u16 tss_selector; |
2493 | int reason; |
2494 | int int_type = svm->vmcb->control.exit_int_info & |
2495 | SVM_EXITINTINFO_TYPE_MASK; |
2496 | int int_vec = svm->vmcb->control.exit_int_info & SVM_EVTINJ_VEC_MASK; |
2497 | uint32_t type = |
2498 | svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_TYPE_MASK; |
2499 | uint32_t idt_v = |
2500 | svm->vmcb->control.exit_int_info & SVM_EXITINTINFO_VALID; |
2501 | bool has_error_code = false; |
2502 | u32 error_code = 0; |
2503 | |
2504 | tss_selector = (u16)svm->vmcb->control.exit_info_1; |
2505 | |
2506 | if (svm->vmcb->control.exit_info_2 & |
2507 | (1ULL << SVM_EXITINFOSHIFT_TS_REASON_IRET)) |
2508 | reason = TASK_SWITCH_IRET; |
2509 | else if (svm->vmcb->control.exit_info_2 & |
2510 | (1ULL << SVM_EXITINFOSHIFT_TS_REASON_JMP)) |
2511 | reason = TASK_SWITCH_JMP; |
2512 | else if (idt_v) |
2513 | reason = TASK_SWITCH_GATE; |
2514 | else |
2515 | reason = TASK_SWITCH_CALL; |
2516 | |
2517 | if (reason == TASK_SWITCH_GATE) { |
2518 | switch (type) { |
2519 | case SVM_EXITINTINFO_TYPE_NMI: |
2520 | vcpu->arch.nmi_injected = false; |
2521 | break; |
2522 | case SVM_EXITINTINFO_TYPE_EXEPT: |
2523 | if (svm->vmcb->control.exit_info_2 & |
2524 | (1ULL << SVM_EXITINFOSHIFT_TS_HAS_ERROR_CODE)) { |
2525 | has_error_code = true; |
2526 | error_code = |
2527 | (u32)svm->vmcb->control.exit_info_2; |
2528 | } |
2529 | kvm_clear_exception_queue(vcpu); |
2530 | break; |
2531 | case SVM_EXITINTINFO_TYPE_INTR: |
2532 | case SVM_EXITINTINFO_TYPE_SOFT: |
2533 | kvm_clear_interrupt_queue(vcpu); |
2534 | break; |
2535 | default: |
2536 | break; |
2537 | } |
2538 | } |
2539 | |
2540 | if (reason != TASK_SWITCH_GATE || |
2541 | int_type == SVM_EXITINTINFO_TYPE_SOFT || |
2542 | (int_type == SVM_EXITINTINFO_TYPE_EXEPT && |
2543 | (int_vec == OF_VECTOR || int_vec == BP_VECTOR))) { |
2544 | if (!svm_skip_emulated_instruction(vcpu)) |
2545 | return 0; |
2546 | } |
2547 | |
2548 | if (int_type != SVM_EXITINTINFO_TYPE_SOFT) |
2549 | int_vec = -1; |
2550 | |
	return kvm_task_switch(vcpu, tss_selector, int_vec, reason,
			       has_error_code, error_code);
2553 | } |
2554 | |
2555 | static void svm_clr_iret_intercept(struct vcpu_svm *svm) |
2556 | { |
	if (!sev_es_guest(svm->vcpu.kvm))
		svm_clr_intercept(svm, INTERCEPT_IRET);
2559 | } |
2560 | |
2561 | static void svm_set_iret_intercept(struct vcpu_svm *svm) |
2562 | { |
	if (!sev_es_guest(svm->vcpu.kvm))
		svm_set_intercept(svm, INTERCEPT_IRET);
2565 | } |
2566 | |
2567 | static int iret_interception(struct kvm_vcpu *vcpu) |
2568 | { |
2569 | struct vcpu_svm *svm = to_svm(vcpu); |
2570 | |
2571 | WARN_ON_ONCE(sev_es_guest(vcpu->kvm)); |
2572 | |
2573 | ++vcpu->stat.nmi_window_exits; |
2574 | svm->awaiting_iret_completion = true; |
2575 | |
2576 | svm_clr_iret_intercept(svm); |
2577 | svm->nmi_iret_rip = kvm_rip_read(vcpu); |
2578 | |
2579 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
2580 | return 1; |
2581 | } |
2582 | |
2583 | static int invlpg_interception(struct kvm_vcpu *vcpu) |
2584 | { |
2585 | if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) |
		return kvm_emulate_instruction(vcpu, 0);

	kvm_mmu_invlpg(vcpu, to_svm(vcpu)->vmcb->control.exit_info_1);
2589 | return kvm_skip_emulated_instruction(vcpu); |
2590 | } |
2591 | |
2592 | static int emulate_on_interception(struct kvm_vcpu *vcpu) |
2593 | { |
	return kvm_emulate_instruction(vcpu, 0);
2595 | } |
2596 | |
2597 | static int rsm_interception(struct kvm_vcpu *vcpu) |
2598 | { |
	return kvm_emulate_instruction_from_buffer(vcpu, rsm_ins_bytes, 2);
2600 | } |
2601 | |
2602 | static bool check_selective_cr0_intercepted(struct kvm_vcpu *vcpu, |
2603 | unsigned long val) |
2604 | { |
2605 | struct vcpu_svm *svm = to_svm(vcpu); |
2606 | unsigned long cr0 = vcpu->arch.cr0; |
2607 | bool ret = false; |
2608 | |
2609 | if (!is_guest_mode(vcpu) || |
	    (!(vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_SELECTIVE_CR0))))
2611 | return false; |
2612 | |
2613 | cr0 &= ~SVM_CR0_SELECTIVE_MASK; |
2614 | val &= ~SVM_CR0_SELECTIVE_MASK; |
2615 | |
2616 | if (cr0 ^ val) { |
2617 | svm->vmcb->control.exit_code = SVM_EXIT_CR0_SEL_WRITE; |
2618 | ret = (nested_svm_exit_handled(svm) == NESTED_EXIT_DONE); |
2619 | } |
2620 | |
2621 | return ret; |
2622 | } |
2623 | |
2624 | #define CR_VALID (1ULL << 63) |
2625 | |
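/*
 * Handle intercepted CR accesses.  With decode assists, the CR number and
 * GPR are taken straight from the exit information; without them (or if
 * the exit info is not valid) the instruction is emulated instead.
 */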
2626 | static int cr_interception(struct kvm_vcpu *vcpu) |
2627 | { |
2628 | struct vcpu_svm *svm = to_svm(vcpu); |
2629 | int reg, cr; |
2630 | unsigned long val; |
2631 | int err; |
2632 | |
2633 | if (!static_cpu_has(X86_FEATURE_DECODEASSISTS)) |
2634 | return emulate_on_interception(vcpu); |
2635 | |
2636 | if (unlikely((svm->vmcb->control.exit_info_1 & CR_VALID) == 0)) |
2637 | return emulate_on_interception(vcpu); |
2638 | |
2639 | reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; |
2640 | if (svm->vmcb->control.exit_code == SVM_EXIT_CR0_SEL_WRITE) |
2641 | cr = SVM_EXIT_WRITE_CR0 - SVM_EXIT_READ_CR0; |
2642 | else |
2643 | cr = svm->vmcb->control.exit_code - SVM_EXIT_READ_CR0; |
2644 | |
2645 | err = 0; |
2646 | if (cr >= 16) { /* mov to cr */ |
2647 | cr -= 16; |
2648 | val = kvm_register_read(vcpu, reg); |
2649 | trace_kvm_cr_write(cr, val); |
2650 | switch (cr) { |
2651 | case 0: |
			if (!check_selective_cr0_intercepted(vcpu, val))
				err = kvm_set_cr0(vcpu, val);
			else
				return 1;

			break;
		case 3:
			err = kvm_set_cr3(vcpu, val);
			break;
		case 4:
			err = kvm_set_cr4(vcpu, val);
			break;
		case 8:
			err = kvm_set_cr8(vcpu, val);
			break;
		default:
			WARN(1, "unhandled write to CR%d", cr);
2669 | kvm_queue_exception(vcpu, UD_VECTOR); |
2670 | return 1; |
2671 | } |
2672 | } else { /* mov from cr */ |
2673 | switch (cr) { |
2674 | case 0: |
2675 | val = kvm_read_cr0(vcpu); |
2676 | break; |
2677 | case 2: |
2678 | val = vcpu->arch.cr2; |
2679 | break; |
2680 | case 3: |
2681 | val = kvm_read_cr3(vcpu); |
2682 | break; |
2683 | case 4: |
2684 | val = kvm_read_cr4(vcpu); |
2685 | break; |
2686 | case 8: |
2687 | val = kvm_get_cr8(vcpu); |
2688 | break; |
2689 | default: |
			WARN(1, "unhandled read from CR%d", cr);
2691 | kvm_queue_exception(vcpu, UD_VECTOR); |
2692 | return 1; |
2693 | } |
2694 | kvm_register_write(vcpu, reg, val); |
2695 | trace_kvm_cr_read(cr, val); |
2696 | } |
2697 | return kvm_complete_insn_gp(vcpu, err); |
2698 | } |
2699 | |
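/*
 * Handle a CR write trap (SEV-ES).  Unlike a CR write intercept, the trap
 * fires after the hardware has already updated the register, so only
 * KVM's shadow state and the post-write side effects need to be
 * processed; the new value is supplied in exit_info_1.
 */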
2700 | static int cr_trap(struct kvm_vcpu *vcpu) |
2701 | { |
2702 | struct vcpu_svm *svm = to_svm(vcpu); |
2703 | unsigned long old_value, new_value; |
2704 | unsigned int cr; |
2705 | int ret = 0; |
2706 | |
2707 | new_value = (unsigned long)svm->vmcb->control.exit_info_1; |
2708 | |
2709 | cr = svm->vmcb->control.exit_code - SVM_EXIT_CR0_WRITE_TRAP; |
2710 | switch (cr) { |
2711 | case 0: |
2712 | old_value = kvm_read_cr0(vcpu); |
		svm_set_cr0(vcpu, new_value);

		kvm_post_set_cr0(vcpu, old_value, new_value);
		break;
	case 4:
		old_value = kvm_read_cr4(vcpu);
		svm_set_cr4(vcpu, new_value);

		kvm_post_set_cr4(vcpu, old_value, new_value);
		break;
	case 8:
		ret = kvm_set_cr8(vcpu, new_value);
		break;
	default:
		WARN(1, "unhandled CR%d write trap", cr);
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}

	return kvm_complete_insn_gp(vcpu, ret);
2733 | } |
2734 | |
2735 | static int dr_interception(struct kvm_vcpu *vcpu) |
2736 | { |
2737 | struct vcpu_svm *svm = to_svm(vcpu); |
2738 | int reg, dr; |
2739 | int err = 0; |
2740 | |
2741 | /* |
2742 | * SEV-ES intercepts DR7 only to disable guest debugging and the guest issues a VMGEXIT |
2743 | * for DR7 write only. KVM cannot change DR7 (always swapped as type 'A') so return early. |
2744 | */ |
	if (sev_es_guest(vcpu->kvm))
2746 | return 1; |
2747 | |
2748 | if (vcpu->guest_debug == 0) { |
2749 | /* |
2750 | * No more DR vmexits; force a reload of the debug registers |
2751 | * and reenter on this instruction. The next vmexit will |
2752 | * retrieve the full state of the debug registers. |
2753 | */ |
2754 | clr_dr_intercepts(svm); |
2755 | vcpu->arch.switch_db_regs |= KVM_DEBUGREG_WONT_EXIT; |
2756 | return 1; |
2757 | } |
2758 | |
2759 | if (!boot_cpu_has(X86_FEATURE_DECODEASSISTS)) |
2760 | return emulate_on_interception(vcpu); |
2761 | |
2762 | reg = svm->vmcb->control.exit_info_1 & SVM_EXITINFO_REG_MASK; |
2763 | dr = svm->vmcb->control.exit_code - SVM_EXIT_READ_DR0; |
2764 | if (dr >= 16) { /* mov to DRn */ |
2765 | dr -= 16; |
		err = kvm_set_dr(vcpu, dr, kvm_register_read(vcpu, reg));
2767 | } else { |
2768 | kvm_register_write(vcpu, reg, kvm_get_dr(vcpu, dr)); |
2769 | } |
2770 | |
2771 | return kvm_complete_insn_gp(vcpu, err); |
2772 | } |
2773 | |
2774 | static int cr8_write_interception(struct kvm_vcpu *vcpu) |
2775 | { |
2776 | int r; |
2777 | |
2778 | u8 cr8_prev = kvm_get_cr8(vcpu); |
2779 | /* instruction emulation calls kvm_set_cr8() */ |
2780 | r = cr_interception(vcpu); |
2781 | if (lapic_in_kernel(vcpu)) |
2782 | return r; |
2783 | if (cr8_prev <= kvm_get_cr8(vcpu)) |
2784 | return r; |
2785 | vcpu->run->exit_reason = KVM_EXIT_SET_TPR; |
2786 | return 0; |
2787 | } |
2788 | |
2789 | static int efer_trap(struct kvm_vcpu *vcpu) |
2790 | { |
2791 | struct msr_data msr_info; |
2792 | int ret; |
2793 | |
2794 | /* |
2795 | * Clear the EFER_SVME bit from EFER. The SVM code always sets this |
2796 | * bit in svm_set_efer(), but __kvm_valid_efer() checks it against |
2797 | * whether the guest has X86_FEATURE_SVM - this avoids a failure if |
2798 | * the guest doesn't have X86_FEATURE_SVM. |
2799 | */ |
2800 | msr_info.host_initiated = false; |
2801 | msr_info.index = MSR_EFER; |
2802 | msr_info.data = to_svm(vcpu)->vmcb->control.exit_info_1 & ~EFER_SVME; |
	ret = kvm_set_msr_common(vcpu, &msr_info);

	return kvm_complete_insn_gp(vcpu, ret);
2806 | } |
2807 | |
2808 | static int svm_get_msr_feature(struct kvm_msr_entry *msr) |
2809 | { |
2810 | msr->data = 0; |
2811 | |
2812 | switch (msr->index) { |
2813 | case MSR_AMD64_DE_CFG: |
2814 | if (cpu_feature_enabled(X86_FEATURE_LFENCE_RDTSC)) |
2815 | msr->data |= MSR_AMD64_DE_CFG_LFENCE_SERIALIZE; |
2816 | break; |
2817 | default: |
2818 | return KVM_MSR_RET_INVALID; |
2819 | } |
2820 | |
2821 | return 0; |
2822 | } |
2823 | |
2824 | static int svm_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) |
2825 | { |
2826 | struct vcpu_svm *svm = to_svm(vcpu); |
2827 | |
2828 | switch (msr_info->index) { |
2829 | case MSR_AMD64_TSC_RATIO: |
2830 | if (!msr_info->host_initiated && |
2831 | !guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) |
2832 | return 1; |
2833 | msr_info->data = svm->tsc_ratio_msr; |
2834 | break; |
2835 | case MSR_STAR: |
2836 | msr_info->data = svm->vmcb01.ptr->save.star; |
2837 | break; |
2838 | #ifdef CONFIG_X86_64 |
2839 | case MSR_LSTAR: |
2840 | msr_info->data = svm->vmcb01.ptr->save.lstar; |
2841 | break; |
2842 | case MSR_CSTAR: |
2843 | msr_info->data = svm->vmcb01.ptr->save.cstar; |
2844 | break; |
2845 | case MSR_KERNEL_GS_BASE: |
2846 | msr_info->data = svm->vmcb01.ptr->save.kernel_gs_base; |
2847 | break; |
2848 | case MSR_SYSCALL_MASK: |
2849 | msr_info->data = svm->vmcb01.ptr->save.sfmask; |
2850 | break; |
2851 | #endif |
2852 | case MSR_IA32_SYSENTER_CS: |
2853 | msr_info->data = svm->vmcb01.ptr->save.sysenter_cs; |
2854 | break; |
2855 | case MSR_IA32_SYSENTER_EIP: |
2856 | msr_info->data = (u32)svm->vmcb01.ptr->save.sysenter_eip; |
2857 | if (guest_cpuid_is_intel(vcpu)) |
2858 | msr_info->data |= (u64)svm->sysenter_eip_hi << 32; |
2859 | break; |
2860 | case MSR_IA32_SYSENTER_ESP: |
2861 | msr_info->data = svm->vmcb01.ptr->save.sysenter_esp; |
2862 | if (guest_cpuid_is_intel(vcpu)) |
2863 | msr_info->data |= (u64)svm->sysenter_esp_hi << 32; |
2864 | break; |
2865 | case MSR_TSC_AUX: |
2866 | msr_info->data = svm->tsc_aux; |
2867 | break; |
2868 | case MSR_IA32_DEBUGCTLMSR: |
2869 | msr_info->data = svm_get_lbr_vmcb(svm)->save.dbgctl; |
2870 | break; |
2871 | case MSR_IA32_LASTBRANCHFROMIP: |
2872 | msr_info->data = svm_get_lbr_vmcb(svm)->save.br_from; |
2873 | break; |
2874 | case MSR_IA32_LASTBRANCHTOIP: |
2875 | msr_info->data = svm_get_lbr_vmcb(svm)->save.br_to; |
2876 | break; |
2877 | case MSR_IA32_LASTINTFROMIP: |
2878 | msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_from; |
2879 | break; |
2880 | case MSR_IA32_LASTINTTOIP: |
2881 | msr_info->data = svm_get_lbr_vmcb(svm)->save.last_excp_to; |
2882 | break; |
2883 | case MSR_VM_HSAVE_PA: |
2884 | msr_info->data = svm->nested.hsave_msr; |
2885 | break; |
2886 | case MSR_VM_CR: |
2887 | msr_info->data = svm->nested.vm_cr_msr; |
2888 | break; |
2889 | case MSR_IA32_SPEC_CTRL: |
2890 | if (!msr_info->host_initiated && |
2891 | !guest_has_spec_ctrl_msr(vcpu)) |
2892 | return 1; |
2893 | |
2894 | if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) |
2895 | msr_info->data = svm->vmcb->save.spec_ctrl; |
2896 | else |
2897 | msr_info->data = svm->spec_ctrl; |
2898 | break; |
2899 | case MSR_AMD64_VIRT_SPEC_CTRL: |
2900 | if (!msr_info->host_initiated && |
2901 | !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) |
2902 | return 1; |
2903 | |
2904 | msr_info->data = svm->virt_spec_ctrl; |
2905 | break; |
2906 | case MSR_F15H_IC_CFG: { |
2907 | |
2908 | int family, model; |
2909 | |
2910 | family = guest_cpuid_family(vcpu); |
2911 | model = guest_cpuid_model(vcpu); |
2912 | |
2913 | if (family < 0 || model < 0) |
			return kvm_get_msr_common(vcpu, msr_info);
2915 | |
2916 | msr_info->data = 0; |
2917 | |
2918 | if (family == 0x15 && |
2919 | (model >= 0x2 && model < 0x20)) |
2920 | msr_info->data = 0x1E; |
2921 | } |
2922 | break; |
2923 | case MSR_AMD64_DE_CFG: |
2924 | msr_info->data = svm->msr_decfg; |
2925 | break; |
2926 | default: |
		return kvm_get_msr_common(vcpu, msr_info);
2928 | } |
2929 | return 0; |
2930 | } |
2931 | |
2932 | static int svm_complete_emulated_msr(struct kvm_vcpu *vcpu, int err) |
2933 | { |
	struct vcpu_svm *svm = to_svm(vcpu);

	if (!err || !sev_es_guest(vcpu->kvm) || WARN_ON_ONCE(!svm->sev_es.ghcb))
2936 | return kvm_complete_insn_gp(vcpu, err); |
2937 | |
	ghcb_set_sw_exit_info_1(svm->sev_es.ghcb, 1);
	ghcb_set_sw_exit_info_2(svm->sev_es.ghcb,
2940 | X86_TRAP_GP | |
2941 | SVM_EVTINJ_TYPE_EXEPT | |
2942 | SVM_EVTINJ_VALID); |
2943 | return 1; |
2944 | } |
2945 | |
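/*
 * Emulate a guest write to MSR_VM_CR.  Once SVMDIS is set, neither it nor
 * the lock bit can be changed, and setting SVMDIS while EFER.SVME is on
 * faults, as it would on real hardware.
 */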
2946 | static int svm_set_vm_cr(struct kvm_vcpu *vcpu, u64 data) |
2947 | { |
2948 | struct vcpu_svm *svm = to_svm(vcpu); |
2949 | int svm_dis, chg_mask; |
2950 | |
2951 | if (data & ~SVM_VM_CR_VALID_MASK) |
2952 | return 1; |
2953 | |
2954 | chg_mask = SVM_VM_CR_VALID_MASK; |
2955 | |
2956 | if (svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK) |
2957 | chg_mask &= ~(SVM_VM_CR_SVM_LOCK_MASK | SVM_VM_CR_SVM_DIS_MASK); |
2958 | |
2959 | svm->nested.vm_cr_msr &= ~chg_mask; |
2960 | svm->nested.vm_cr_msr |= (data & chg_mask); |
2961 | |
2962 | svm_dis = svm->nested.vm_cr_msr & SVM_VM_CR_SVM_DIS_MASK; |
2963 | |
2964 | /* check for svm_disable while efer.svme is set */ |
2965 | if (svm_dis && (vcpu->arch.efer & EFER_SVME)) |
2966 | return 1; |
2967 | |
2968 | return 0; |
2969 | } |
2970 | |
2971 | static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) |
2972 | { |
2973 | struct vcpu_svm *svm = to_svm(vcpu); |
2974 | int ret = 0; |
2975 | |
2976 | u32 ecx = msr->index; |
	u64 data = msr->data;

2978 | switch (ecx) { |
2979 | case MSR_AMD64_TSC_RATIO: |
2980 | |
2981 | if (!guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR)) { |
2982 | |
2983 | if (!msr->host_initiated) |
2984 | return 1; |
2985 | /* |
2986 | * In case TSC scaling is not enabled, always |
2987 | * leave this MSR at the default value. |
2988 | * |
		 * Due to a bug in qemu 6.2.0, it would try to set
2990 | * this msr to 0 if tsc scaling is not enabled. |
2991 | * Ignore this value as well. |
2992 | */ |
2993 | if (data != 0 && data != svm->tsc_ratio_msr) |
2994 | return 1; |
2995 | break; |
2996 | } |
2997 | |
2998 | if (data & SVM_TSC_RATIO_RSVD) |
2999 | return 1; |
3000 | |
3001 | svm->tsc_ratio_msr = data; |
3002 | |
3003 | if (guest_can_use(vcpu, X86_FEATURE_TSCRATEMSR) && |
3004 | is_guest_mode(vcpu)) |
3005 | nested_svm_update_tsc_ratio_msr(vcpu); |
3006 | |
3007 | break; |
3008 | case MSR_IA32_CR_PAT: |
3009 | ret = kvm_set_msr_common(vcpu, msr); |
3010 | if (ret) |
3011 | break; |
3012 | |
3013 | svm->vmcb01.ptr->save.g_pat = data; |
3014 | if (is_guest_mode(vcpu)) |
3015 | nested_vmcb02_compute_g_pat(svm); |
		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
3017 | break; |
3018 | case MSR_IA32_SPEC_CTRL: |
3019 | if (!msr->host_initiated && |
3020 | !guest_has_spec_ctrl_msr(vcpu)) |
3021 | return 1; |
3022 | |
3023 | if (kvm_spec_ctrl_test_value(data)) |
3024 | return 1; |
3025 | |
3026 | if (boot_cpu_has(X86_FEATURE_V_SPEC_CTRL)) |
3027 | svm->vmcb->save.spec_ctrl = data; |
3028 | else |
3029 | svm->spec_ctrl = data; |
3030 | if (!data) |
3031 | break; |
3032 | |
3033 | /* |
3034 | * For non-nested: |
3035 | * When it's written (to non-zero) for the first time, pass |
3036 | * it through. |
3037 | * |
3038 | * For nested: |
3039 | * The handling of the MSR bitmap for L2 guests is done in |
3040 | * nested_svm_vmrun_msrpm. |
3041 | * We update the L1 MSR bit as well since it will end up |
3042 | * touching the MSR anyway now. |
3043 | */ |
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_SPEC_CTRL, 1, 1);
3045 | break; |
3046 | case MSR_AMD64_VIRT_SPEC_CTRL: |
3047 | if (!msr->host_initiated && |
3048 | !guest_cpuid_has(vcpu, X86_FEATURE_VIRT_SSBD)) |
3049 | return 1; |
3050 | |
3051 | if (data & ~SPEC_CTRL_SSBD) |
3052 | return 1; |
3053 | |
3054 | svm->virt_spec_ctrl = data; |
3055 | break; |
3056 | case MSR_STAR: |
3057 | svm->vmcb01.ptr->save.star = data; |
3058 | break; |
3059 | #ifdef CONFIG_X86_64 |
3060 | case MSR_LSTAR: |
3061 | svm->vmcb01.ptr->save.lstar = data; |
3062 | break; |
3063 | case MSR_CSTAR: |
3064 | svm->vmcb01.ptr->save.cstar = data; |
3065 | break; |
3066 | case MSR_KERNEL_GS_BASE: |
3067 | svm->vmcb01.ptr->save.kernel_gs_base = data; |
3068 | break; |
3069 | case MSR_SYSCALL_MASK: |
3070 | svm->vmcb01.ptr->save.sfmask = data; |
3071 | break; |
3072 | #endif |
3073 | case MSR_IA32_SYSENTER_CS: |
3074 | svm->vmcb01.ptr->save.sysenter_cs = data; |
3075 | break; |
3076 | case MSR_IA32_SYSENTER_EIP: |
3077 | svm->vmcb01.ptr->save.sysenter_eip = (u32)data; |
3078 | /* |
3079 | * We only intercept the MSR_IA32_SYSENTER_{EIP|ESP} msrs |
3080 | * when we spoof an Intel vendor ID (for cross vendor migration). |
3081 | * In this case we use this intercept to track the high |
3082 | * 32 bit part of these msrs to support Intel's |
3083 | * implementation of SYSENTER/SYSEXIT. |
3084 | */ |
3085 | svm->sysenter_eip_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; |
3086 | break; |
3087 | case MSR_IA32_SYSENTER_ESP: |
3088 | svm->vmcb01.ptr->save.sysenter_esp = (u32)data; |
3089 | svm->sysenter_esp_hi = guest_cpuid_is_intel(vcpu) ? (data >> 32) : 0; |
3090 | break; |
3091 | case MSR_TSC_AUX: |
3092 | /* |
3093 | * TSC_AUX is always virtualized for SEV-ES guests when the |
3094 | * feature is available. The user return MSR support is not |
3095 | * required in this case because TSC_AUX is restored on #VMEXIT |
3096 | * from the host save area (which has been initialized in |
3097 | * svm_hardware_enable()). |
3098 | */ |
		if (boot_cpu_has(X86_FEATURE_V_TSC_AUX) && sev_es_guest(vcpu->kvm))
3100 | break; |
3101 | |
3102 | /* |
3103 | * TSC_AUX is usually changed only during boot and never read |
3104 | * directly. Intercept TSC_AUX instead of exposing it to the |
3105 | * guest via direct_access_msrs, and switch it via user return. |
3106 | */ |
3107 | preempt_disable(); |
		ret = kvm_set_user_return_msr(tsc_aux_uret_slot, data, -1ull);
3109 | preempt_enable(); |
3110 | if (ret) |
3111 | break; |
3112 | |
3113 | svm->tsc_aux = data; |
3114 | break; |
3115 | case MSR_IA32_DEBUGCTLMSR: |
3116 | if (!lbrv) { |
3117 | kvm_pr_unimpl_wrmsr(vcpu, ecx, data); |
3118 | break; |
3119 | } |
3120 | if (data & DEBUGCTL_RESERVED_BITS) |
3121 | return 1; |
3122 | |
3123 | svm_get_lbr_vmcb(svm)->save.dbgctl = data; |
3124 | svm_update_lbrv(vcpu); |
3125 | break; |
3126 | case MSR_VM_HSAVE_PA: |
3127 | /* |
3128 | * Old kernels did not validate the value written to |
3129 | * MSR_VM_HSAVE_PA. Allow KVM_SET_MSR to set an invalid |
3130 | * value to allow live migrating buggy or malicious guests |
3131 | * originating from those kernels. |
3132 | */ |
3133 | if (!msr->host_initiated && !page_address_valid(vcpu, data)) |
3134 | return 1; |
3135 | |
3136 | svm->nested.hsave_msr = data & PAGE_MASK; |
3137 | break; |
3138 | case MSR_VM_CR: |
3139 | return svm_set_vm_cr(vcpu, data); |
3140 | case MSR_VM_IGNNE: |
3141 | kvm_pr_unimpl_wrmsr(vcpu, ecx, data); |
3142 | break; |
3143 | case MSR_AMD64_DE_CFG: { |
3144 | struct kvm_msr_entry msr_entry; |
3145 | |
3146 | msr_entry.index = msr->index; |
		if (svm_get_msr_feature(&msr_entry))
3148 | return 1; |
3149 | |
3150 | /* Check the supported bits */ |
3151 | if (data & ~msr_entry.data) |
3152 | return 1; |
3153 | |
3154 | /* Don't allow the guest to change a bit, #GP */ |
3155 | if (!msr->host_initiated && (data ^ msr_entry.data)) |
3156 | return 1; |
3157 | |
3158 | svm->msr_decfg = data; |
3159 | break; |
3160 | } |
3161 | default: |
3162 | return kvm_set_msr_common(vcpu, msr); |
3163 | } |
3164 | return ret; |
3165 | } |
3166 | |
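/* For MSR intercepts, exit_info_1 distinguishes WRMSR (1) from RDMSR (0). */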
3167 | static int msr_interception(struct kvm_vcpu *vcpu) |
3168 | { |
3169 | if (to_svm(vcpu)->vmcb->control.exit_info_1) |
3170 | return kvm_emulate_wrmsr(vcpu); |
3171 | else |
3172 | return kvm_emulate_rdmsr(vcpu); |
3173 | } |
3174 | |
3175 | static int interrupt_window_interception(struct kvm_vcpu *vcpu) |
3176 | { |
3177 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
	svm_clear_vintr(to_svm(vcpu));
3179 | |
3180 | /* |
3181 | * If not running nested, for AVIC, the only reason to end up here is ExtINTs. |
3182 | * In this case AVIC was temporarily disabled for |
3183 | * requesting the IRQ window and we have to re-enable it. |
3184 | * |
	 * If running nested, still remove the VM wide AVIC inhibit to
	 * support the case in which the interrupt window was requested when
	 * the vCPU was not running nested.
	 *
	 * All vCPUs which are still running nested will keep their AVIC
	 * inhibited due to the per-CPU AVIC inhibition.
	 */
	kvm_clear_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3193 | |
3194 | ++vcpu->stat.irq_window_exits; |
3195 | return 1; |
3196 | } |
3197 | |
3198 | static int pause_interception(struct kvm_vcpu *vcpu) |
3199 | { |
3200 | bool in_kernel; |
3201 | /* |
3202 | * CPL is not made available for an SEV-ES guest, therefore |
3203 | * vcpu->arch.preempted_in_kernel can never be true. Just |
3204 | * set in_kernel to false as well. |
3205 | */ |
	in_kernel = !sev_es_guest(vcpu->kvm) && svm_get_cpl(vcpu) == 0;
3207 | |
3208 | grow_ple_window(vcpu); |
3209 | |
	kvm_vcpu_on_spin(vcpu, in_kernel);
3211 | return kvm_skip_emulated_instruction(vcpu); |
3212 | } |
3213 | |
3214 | static int invpcid_interception(struct kvm_vcpu *vcpu) |
3215 | { |
3216 | struct vcpu_svm *svm = to_svm(vcpu); |
3217 | unsigned long type; |
3218 | gva_t gva; |
3219 | |
3220 | if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) { |
3221 | kvm_queue_exception(vcpu, UD_VECTOR); |
3222 | return 1; |
3223 | } |
3224 | |
3225 | /* |
3226 | * For an INVPCID intercept: |
3227 | * EXITINFO1 provides the linear address of the memory operand. |
3228 | * EXITINFO2 provides the contents of the register operand. |
3229 | */ |
3230 | type = svm->vmcb->control.exit_info_2; |
3231 | gva = svm->vmcb->control.exit_info_1; |
3232 | |
3233 | return kvm_handle_invpcid(vcpu, type, gva); |
3234 | } |
3235 | |
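/*
 * Dispatch table mapping SVM exit codes to their handlers; exit codes
 * without an entry here are handled as unexpected exits.
 */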
3236 | static int (*const svm_exit_handlers[])(struct kvm_vcpu *vcpu) = { |
3237 | [SVM_EXIT_READ_CR0] = cr_interception, |
3238 | [SVM_EXIT_READ_CR3] = cr_interception, |
3239 | [SVM_EXIT_READ_CR4] = cr_interception, |
3240 | [SVM_EXIT_READ_CR8] = cr_interception, |
3241 | [SVM_EXIT_CR0_SEL_WRITE] = cr_interception, |
3242 | [SVM_EXIT_WRITE_CR0] = cr_interception, |
3243 | [SVM_EXIT_WRITE_CR3] = cr_interception, |
3244 | [SVM_EXIT_WRITE_CR4] = cr_interception, |
3245 | [SVM_EXIT_WRITE_CR8] = cr8_write_interception, |
3246 | [SVM_EXIT_READ_DR0] = dr_interception, |
3247 | [SVM_EXIT_READ_DR1] = dr_interception, |
3248 | [SVM_EXIT_READ_DR2] = dr_interception, |
3249 | [SVM_EXIT_READ_DR3] = dr_interception, |
3250 | [SVM_EXIT_READ_DR4] = dr_interception, |
3251 | [SVM_EXIT_READ_DR5] = dr_interception, |
3252 | [SVM_EXIT_READ_DR6] = dr_interception, |
3253 | [SVM_EXIT_READ_DR7] = dr_interception, |
3254 | [SVM_EXIT_WRITE_DR0] = dr_interception, |
3255 | [SVM_EXIT_WRITE_DR1] = dr_interception, |
3256 | [SVM_EXIT_WRITE_DR2] = dr_interception, |
3257 | [SVM_EXIT_WRITE_DR3] = dr_interception, |
3258 | [SVM_EXIT_WRITE_DR4] = dr_interception, |
3259 | [SVM_EXIT_WRITE_DR5] = dr_interception, |
3260 | [SVM_EXIT_WRITE_DR6] = dr_interception, |
3261 | [SVM_EXIT_WRITE_DR7] = dr_interception, |
3262 | [SVM_EXIT_EXCP_BASE + DB_VECTOR] = db_interception, |
3263 | [SVM_EXIT_EXCP_BASE + BP_VECTOR] = bp_interception, |
3264 | [SVM_EXIT_EXCP_BASE + UD_VECTOR] = ud_interception, |
3265 | [SVM_EXIT_EXCP_BASE + PF_VECTOR] = pf_interception, |
3266 | [SVM_EXIT_EXCP_BASE + MC_VECTOR] = mc_interception, |
3267 | [SVM_EXIT_EXCP_BASE + AC_VECTOR] = ac_interception, |
3268 | [SVM_EXIT_EXCP_BASE + GP_VECTOR] = gp_interception, |
3269 | [SVM_EXIT_INTR] = intr_interception, |
3270 | [SVM_EXIT_NMI] = nmi_interception, |
3271 | [SVM_EXIT_SMI] = smi_interception, |
3272 | [SVM_EXIT_VINTR] = interrupt_window_interception, |
3273 | [SVM_EXIT_RDPMC] = kvm_emulate_rdpmc, |
3274 | [SVM_EXIT_CPUID] = kvm_emulate_cpuid, |
3275 | [SVM_EXIT_IRET] = iret_interception, |
3276 | [SVM_EXIT_INVD] = kvm_emulate_invd, |
3277 | [SVM_EXIT_PAUSE] = pause_interception, |
3278 | [SVM_EXIT_HLT] = kvm_emulate_halt, |
3279 | [SVM_EXIT_INVLPG] = invlpg_interception, |
3280 | [SVM_EXIT_INVLPGA] = invlpga_interception, |
3281 | [SVM_EXIT_IOIO] = io_interception, |
3282 | [SVM_EXIT_MSR] = msr_interception, |
3283 | [SVM_EXIT_TASK_SWITCH] = task_switch_interception, |
3284 | [SVM_EXIT_SHUTDOWN] = shutdown_interception, |
3285 | [SVM_EXIT_VMRUN] = vmrun_interception, |
3286 | [SVM_EXIT_VMMCALL] = kvm_emulate_hypercall, |
3287 | [SVM_EXIT_VMLOAD] = vmload_interception, |
3288 | [SVM_EXIT_VMSAVE] = vmsave_interception, |
3289 | [SVM_EXIT_STGI] = stgi_interception, |
3290 | [SVM_EXIT_CLGI] = clgi_interception, |
3291 | [SVM_EXIT_SKINIT] = skinit_interception, |
3292 | [SVM_EXIT_RDTSCP] = kvm_handle_invalid_op, |
3293 | [SVM_EXIT_WBINVD] = kvm_emulate_wbinvd, |
3294 | [SVM_EXIT_MONITOR] = kvm_emulate_monitor, |
3295 | [SVM_EXIT_MWAIT] = kvm_emulate_mwait, |
3296 | [SVM_EXIT_XSETBV] = kvm_emulate_xsetbv, |
3297 | [SVM_EXIT_RDPRU] = kvm_handle_invalid_op, |
3298 | [SVM_EXIT_EFER_WRITE_TRAP] = efer_trap, |
3299 | [SVM_EXIT_CR0_WRITE_TRAP] = cr_trap, |
3300 | [SVM_EXIT_CR4_WRITE_TRAP] = cr_trap, |
3301 | [SVM_EXIT_CR8_WRITE_TRAP] = cr_trap, |
3302 | [SVM_EXIT_INVPCID] = invpcid_interception, |
3303 | [SVM_EXIT_NPF] = npf_interception, |
3304 | [SVM_EXIT_RSM] = rsm_interception, |
3305 | [SVM_EXIT_AVIC_INCOMPLETE_IPI] = avic_incomplete_ipi_interception, |
3306 | [SVM_EXIT_AVIC_UNACCELERATED_ACCESS] = avic_unaccelerated_access_interception, |
3307 | [SVM_EXIT_VMGEXIT] = sev_handle_vmgexit, |
3308 | }; |
3309 | |
3310 | static void dump_vmcb(struct kvm_vcpu *vcpu) |
3311 | { |
3312 | struct vcpu_svm *svm = to_svm(vcpu); |
3313 | struct vmcb_control_area *control = &svm->vmcb->control; |
3314 | struct vmcb_save_area *save = &svm->vmcb->save; |
3315 | struct vmcb_save_area *save01 = &svm->vmcb01.ptr->save; |
3316 | |
3317 | if (!dump_invalid_vmcb) { |
		pr_warn_ratelimited("set kvm_amd.dump_invalid_vmcb=1 to dump internal KVM state.\n");
3319 | return; |
3320 | } |
3321 | |
3322 | pr_err("VMCB %p, last attempted VMRUN on CPU %d\n" , |
3323 | svm->current_vmcb->ptr, vcpu->arch.last_vmentry_cpu); |
3324 | pr_err("VMCB Control Area:\n" ); |
3325 | pr_err("%-20s%04x\n" , "cr_read:" , control->intercepts[INTERCEPT_CR] & 0xffff); |
3326 | pr_err("%-20s%04x\n" , "cr_write:" , control->intercepts[INTERCEPT_CR] >> 16); |
3327 | pr_err("%-20s%04x\n" , "dr_read:" , control->intercepts[INTERCEPT_DR] & 0xffff); |
3328 | pr_err("%-20s%04x\n" , "dr_write:" , control->intercepts[INTERCEPT_DR] >> 16); |
3329 | pr_err("%-20s%08x\n" , "exceptions:" , control->intercepts[INTERCEPT_EXCEPTION]); |
3330 | pr_err("%-20s%08x %08x\n" , "intercepts:" , |
3331 | control->intercepts[INTERCEPT_WORD3], |
3332 | control->intercepts[INTERCEPT_WORD4]); |
3333 | pr_err("%-20s%d\n" , "pause filter count:" , control->pause_filter_count); |
3334 | pr_err("%-20s%d\n" , "pause filter threshold:" , |
3335 | control->pause_filter_thresh); |
3336 | pr_err("%-20s%016llx\n" , "iopm_base_pa:" , control->iopm_base_pa); |
3337 | pr_err("%-20s%016llx\n" , "msrpm_base_pa:" , control->msrpm_base_pa); |
3338 | pr_err("%-20s%016llx\n" , "tsc_offset:" , control->tsc_offset); |
3339 | pr_err("%-20s%d\n" , "asid:" , control->asid); |
3340 | pr_err("%-20s%d\n" , "tlb_ctl:" , control->tlb_ctl); |
3341 | pr_err("%-20s%08x\n" , "int_ctl:" , control->int_ctl); |
3342 | pr_err("%-20s%08x\n" , "int_vector:" , control->int_vector); |
3343 | pr_err("%-20s%08x\n" , "int_state:" , control->int_state); |
3344 | pr_err("%-20s%08x\n" , "exit_code:" , control->exit_code); |
3345 | pr_err("%-20s%016llx\n" , "exit_info1:" , control->exit_info_1); |
3346 | pr_err("%-20s%016llx\n" , "exit_info2:" , control->exit_info_2); |
3347 | pr_err("%-20s%08x\n" , "exit_int_info:" , control->exit_int_info); |
3348 | pr_err("%-20s%08x\n" , "exit_int_info_err:" , control->exit_int_info_err); |
3349 | pr_err("%-20s%lld\n" , "nested_ctl:" , control->nested_ctl); |
3350 | pr_err("%-20s%016llx\n" , "nested_cr3:" , control->nested_cr3); |
3351 | pr_err("%-20s%016llx\n" , "avic_vapic_bar:" , control->avic_vapic_bar); |
3352 | pr_err("%-20s%016llx\n" , "ghcb:" , control->ghcb_gpa); |
3353 | pr_err("%-20s%08x\n" , "event_inj:" , control->event_inj); |
3354 | pr_err("%-20s%08x\n" , "event_inj_err:" , control->event_inj_err); |
3355 | pr_err("%-20s%lld\n" , "virt_ext:" , control->virt_ext); |
3356 | pr_err("%-20s%016llx\n" , "next_rip:" , control->next_rip); |
3357 | pr_err("%-20s%016llx\n" , "avic_backing_page:" , control->avic_backing_page); |
3358 | pr_err("%-20s%016llx\n" , "avic_logical_id:" , control->avic_logical_id); |
3359 | pr_err("%-20s%016llx\n" , "avic_physical_id:" , control->avic_physical_id); |
3360 | pr_err("%-20s%016llx\n" , "vmsa_pa:" , control->vmsa_pa); |
3361 | pr_err("VMCB State Save Area:\n" ); |
3362 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n" , |
3363 | "es:" , |
3364 | save->es.selector, save->es.attrib, |
3365 | save->es.limit, save->es.base); |
3366 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n" , |
3367 | "cs:" , |
3368 | save->cs.selector, save->cs.attrib, |
3369 | save->cs.limit, save->cs.base); |
3370 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n" , |
3371 | "ss:" , |
3372 | save->ss.selector, save->ss.attrib, |
3373 | save->ss.limit, save->ss.base); |
3374 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n" , |
3375 | "ds:" , |
3376 | save->ds.selector, save->ds.attrib, |
3377 | save->ds.limit, save->ds.base); |
3378 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n" , |
3379 | "fs:" , |
3380 | save01->fs.selector, save01->fs.attrib, |
3381 | save01->fs.limit, save01->fs.base); |
3382 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n" , |
3383 | "gs:" , |
3384 | save01->gs.selector, save01->gs.attrib, |
3385 | save01->gs.limit, save01->gs.base); |
3386 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n" , |
3387 | "gdtr:" , |
3388 | save->gdtr.selector, save->gdtr.attrib, |
3389 | save->gdtr.limit, save->gdtr.base); |
3390 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n" , |
3391 | "ldtr:" , |
3392 | save01->ldtr.selector, save01->ldtr.attrib, |
3393 | save01->ldtr.limit, save01->ldtr.base); |
3394 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n" , |
3395 | "idtr:" , |
3396 | save->idtr.selector, save->idtr.attrib, |
3397 | save->idtr.limit, save->idtr.base); |
3398 | pr_err("%-5s s: %04x a: %04x l: %08x b: %016llx\n" , |
3399 | "tr:" , |
3400 | save01->tr.selector, save01->tr.attrib, |
3401 | save01->tr.limit, save01->tr.base); |
3402 | pr_err("vmpl: %d cpl: %d efer: %016llx\n" , |
3403 | save->vmpl, save->cpl, save->efer); |
3404 | pr_err("%-15s %016llx %-13s %016llx\n" , |
3405 | "cr0:" , save->cr0, "cr2:" , save->cr2); |
3406 | pr_err("%-15s %016llx %-13s %016llx\n" , |
3407 | "cr3:" , save->cr3, "cr4:" , save->cr4); |
3408 | pr_err("%-15s %016llx %-13s %016llx\n" , |
3409 | "dr6:" , save->dr6, "dr7:" , save->dr7); |
3410 | pr_err("%-15s %016llx %-13s %016llx\n" , |
3411 | "rip:" , save->rip, "rflags:" , save->rflags); |
3412 | pr_err("%-15s %016llx %-13s %016llx\n" , |
3413 | "rsp:" , save->rsp, "rax:" , save->rax); |
3414 | pr_err("%-15s %016llx %-13s %016llx\n" , |
3415 | "star:" , save01->star, "lstar:" , save01->lstar); |
3416 | pr_err("%-15s %016llx %-13s %016llx\n" , |
3417 | "cstar:" , save01->cstar, "sfmask:" , save01->sfmask); |
3418 | pr_err("%-15s %016llx %-13s %016llx\n" , |
3419 | "kernel_gs_base:" , save01->kernel_gs_base, |
3420 | "sysenter_cs:" , save01->sysenter_cs); |
3421 | pr_err("%-15s %016llx %-13s %016llx\n" , |
3422 | "sysenter_esp:" , save01->sysenter_esp, |
3423 | "sysenter_eip:" , save01->sysenter_eip); |
3424 | pr_err("%-15s %016llx %-13s %016llx\n" , |
3425 | "gpat:" , save->g_pat, "dbgctl:" , save->dbgctl); |
3426 | pr_err("%-15s %016llx %-13s %016llx\n" , |
3427 | "br_from:" , save->br_from, "br_to:" , save->br_to); |
3428 | pr_err("%-15s %016llx %-13s %016llx\n" , |
3429 | "excp_from:" , save->last_excp_from, |
3430 | "excp_to:" , save->last_excp_to); |
3431 | } |
3432 | |
3433 | static bool svm_check_exit_valid(u64 exit_code) |
3434 | { |
3435 | return (exit_code < ARRAY_SIZE(svm_exit_handlers) && |
3436 | svm_exit_handlers[exit_code]); |
3437 | } |
3438 | |
3439 | static int svm_handle_invalid_exit(struct kvm_vcpu *vcpu, u64 exit_code) |
3440 | { |
3441 | vcpu_unimpl(vcpu, "svm: unexpected exit reason 0x%llx\n" , exit_code); |
3442 | dump_vmcb(vcpu); |
3443 | vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; |
3444 | vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON; |
3445 | vcpu->run->internal.ndata = 2; |
3446 | vcpu->run->internal.data[0] = exit_code; |
3447 | vcpu->run->internal.data[1] = vcpu->arch.last_vmentry_cpu; |
3448 | return 0; |
3449 | } |
3450 | |
3451 | int svm_invoke_exit_handler(struct kvm_vcpu *vcpu, u64 exit_code) |
3452 | { |
3453 | if (!svm_check_exit_valid(exit_code)) |
3454 | return svm_handle_invalid_exit(vcpu, exit_code); |
3455 | |
3456 | #ifdef CONFIG_MITIGATION_RETPOLINE |
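	/*
	 * With retpolines enabled, the indirect call through
	 * svm_exit_handlers[] goes via a retpoline thunk; open-code direct
	 * calls for the hottest exit reasons to avoid that overhead.
	 */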
3457 | if (exit_code == SVM_EXIT_MSR) |
3458 | return msr_interception(vcpu); |
3459 | else if (exit_code == SVM_EXIT_VINTR) |
3460 | return interrupt_window_interception(vcpu); |
3461 | else if (exit_code == SVM_EXIT_INTR) |
3462 | return intr_interception(vcpu); |
3463 | else if (exit_code == SVM_EXIT_HLT) |
3464 | return kvm_emulate_halt(vcpu); |
3465 | else if (exit_code == SVM_EXIT_NPF) |
3466 | return npf_interception(vcpu); |
3467 | #endif |
3468 | return svm_exit_handlers[exit_code](vcpu); |
3469 | } |
3470 | |
3471 | static void svm_get_exit_info(struct kvm_vcpu *vcpu, u32 *reason, |
3472 | u64 *info1, u64 *info2, |
3473 | u32 *intr_info, u32 *error_code) |
3474 | { |
3475 | struct vmcb_control_area *control = &to_svm(vcpu)->vmcb->control; |
3476 | |
3477 | *reason = control->exit_code; |
3478 | *info1 = control->exit_info_1; |
3479 | *info2 = control->exit_info_2; |
3480 | *intr_info = control->exit_int_info; |
3481 | if ((*intr_info & SVM_EXITINTINFO_VALID) && |
3482 | (*intr_info & SVM_EXITINTINFO_VALID_ERR)) |
3483 | *error_code = control->exit_int_info_err; |
3484 | else |
3485 | *error_code = 0; |
3486 | } |
3487 | |
3488 | static int svm_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath) |
3489 | { |
3490 | struct vcpu_svm *svm = to_svm(vcpu); |
3491 | struct kvm_run *kvm_run = vcpu->run; |
3492 | u32 exit_code = svm->vmcb->control.exit_code; |
3493 | |
3494 | /* SEV-ES guests must use the CR write traps to track CR registers. */ |
	if (!sev_es_guest(vcpu->kvm)) {
		if (!svm_is_intercept(svm, INTERCEPT_CR0_WRITE))
3497 | vcpu->arch.cr0 = svm->vmcb->save.cr0; |
3498 | if (npt_enabled) |
3499 | vcpu->arch.cr3 = svm->vmcb->save.cr3; |
3500 | } |
3501 | |
3502 | if (is_guest_mode(vcpu)) { |
3503 | int vmexit; |
3504 | |
3505 | trace_kvm_nested_vmexit(vcpu, KVM_ISA_SVM); |
3506 | |
3507 | vmexit = nested_svm_exit_special(svm); |
3508 | |
3509 | if (vmexit == NESTED_EXIT_CONTINUE) |
3510 | vmexit = nested_svm_exit_handled(svm); |
3511 | |
3512 | if (vmexit == NESTED_EXIT_DONE) |
3513 | return 1; |
3514 | } |
3515 | |
3516 | if (svm->vmcb->control.exit_code == SVM_EXIT_ERR) { |
3517 | kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; |
3518 | kvm_run->fail_entry.hardware_entry_failure_reason |
3519 | = svm->vmcb->control.exit_code; |
3520 | kvm_run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu; |
3521 | dump_vmcb(vcpu); |
3522 | return 0; |
3523 | } |
3524 | |
3525 | if (exit_fastpath != EXIT_FASTPATH_NONE) |
3526 | return 1; |
3527 | |
3528 | return svm_invoke_exit_handler(vcpu, exit_code); |
3529 | } |
3530 | |
3531 | static void pre_svm_run(struct kvm_vcpu *vcpu) |
3532 | { |
3533 | struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); |
3534 | struct vcpu_svm *svm = to_svm(vcpu); |
3535 | |
3536 | /* |
3537 | * If the previous vmrun of the vmcb occurred on a different physical |
3538 | * cpu, then mark the vmcb dirty and assign a new asid. Hardware's |
3539 | * vmcb clean bits are per logical CPU, as are KVM's asid assignments. |
3540 | */ |
3541 | if (unlikely(svm->current_vmcb->cpu != vcpu->cpu)) { |
3542 | svm->current_vmcb->asid_generation = 0; |
		vmcb_mark_all_dirty(svm->vmcb);
3544 | svm->current_vmcb->cpu = vcpu->cpu; |
3545 | } |
3546 | |
	if (sev_guest(vcpu->kvm))
		return pre_sev_run(svm, vcpu->cpu);
3549 | |
3550 | /* FIXME: handle wraparound of asid_generation */ |
3551 | if (svm->current_vmcb->asid_generation != sd->asid_generation) |
3552 | new_asid(svm, sd); |
3553 | } |
3554 | |
3555 | static void svm_inject_nmi(struct kvm_vcpu *vcpu) |
3556 | { |
3557 | struct vcpu_svm *svm = to_svm(vcpu); |
3558 | |
3559 | svm->vmcb->control.event_inj = SVM_EVTINJ_VALID | SVM_EVTINJ_TYPE_NMI; |
3560 | |
3561 | if (svm->nmi_l1_to_l2) |
3562 | return; |
3563 | |
3564 | /* |
3565 | * No need to manually track NMI masking when vNMI is enabled, hardware |
3566 | * automatically sets V_NMI_BLOCKING_MASK as appropriate, including the |
3567 | * case where software directly injects an NMI. |
3568 | */ |
3569 | if (!is_vnmi_enabled(svm)) { |
3570 | svm->nmi_masked = true; |
3571 | svm_set_iret_intercept(svm); |
3572 | } |
3573 | ++vcpu->stat.nmi_injections; |
3574 | } |
3575 | |
3576 | static bool svm_is_vnmi_pending(struct kvm_vcpu *vcpu) |
3577 | { |
3578 | struct vcpu_svm *svm = to_svm(vcpu); |
3579 | |
3580 | if (!is_vnmi_enabled(svm)) |
3581 | return false; |
3582 | |
3583 | return !!(svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK); |
3584 | } |
3585 | |
3586 | static bool svm_set_vnmi_pending(struct kvm_vcpu *vcpu) |
3587 | { |
3588 | struct vcpu_svm *svm = to_svm(vcpu); |
3589 | |
3590 | if (!is_vnmi_enabled(svm)) |
3591 | return false; |
3592 | |
3593 | if (svm->vmcb->control.int_ctl & V_NMI_PENDING_MASK) |
3594 | return false; |
3595 | |
3596 | svm->vmcb->control.int_ctl |= V_NMI_PENDING_MASK; |
	vmcb_mark_dirty(svm->vmcb, VMCB_INTR);
3598 | |
3599 | /* |
3600 | * Because the pending NMI is serviced by hardware, KVM can't know when |
3601 | * the NMI is "injected", but for all intents and purposes, passing the |
3602 | * NMI off to hardware counts as injection. |
3603 | */ |
3604 | ++vcpu->stat.nmi_injections; |
3605 | |
3606 | return true; |
3607 | } |
3608 | |
3609 | static void svm_inject_irq(struct kvm_vcpu *vcpu, bool reinjected) |
3610 | { |
3611 | struct vcpu_svm *svm = to_svm(vcpu); |
3612 | u32 type; |
3613 | |
3614 | if (vcpu->arch.interrupt.soft) { |
3615 | if (svm_update_soft_interrupt_rip(vcpu)) |
3616 | return; |
3617 | |
3618 | type = SVM_EVTINJ_TYPE_SOFT; |
3619 | } else { |
3620 | type = SVM_EVTINJ_TYPE_INTR; |
3621 | } |
3622 | |
3623 | trace_kvm_inj_virq(vcpu->arch.interrupt.nr, |
3624 | vcpu->arch.interrupt.soft, reinjected); |
3625 | ++vcpu->stat.irq_injections; |
3626 | |
3627 | svm->vmcb->control.event_inj = vcpu->arch.interrupt.nr | |
3628 | SVM_EVTINJ_VALID | type; |
3629 | } |
3630 | |
3631 | void svm_complete_interrupt_delivery(struct kvm_vcpu *vcpu, int delivery_mode, |
3632 | int trig_mode, int vector) |
3633 | { |
3634 | /* |
3635 | * apic->apicv_active must be read after vcpu->mode. |
3636 | * Pairs with smp_store_release in vcpu_enter_guest. |
3637 | */ |
3638 | bool in_guest_mode = (smp_load_acquire(&vcpu->mode) == IN_GUEST_MODE); |
3639 | |
3640 | /* Note, this is called iff the local APIC is in-kernel. */ |
3641 | if (!READ_ONCE(vcpu->arch.apic->apicv_active)) { |
3642 | /* Process the interrupt via kvm_check_and_inject_events(). */ |
3643 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
3644 | kvm_vcpu_kick(vcpu); |
3645 | return; |
3646 | } |
3647 | |
3648 | trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode, trig_mode, vector); |
3649 | if (in_guest_mode) { |
3650 | /* |
3651 | * Signal the doorbell to tell hardware to inject the IRQ. If |
3652 | * the vCPU exits the guest before the doorbell chimes, hardware |
3653 | * will automatically process AVIC interrupts at the next VMRUN. |
3654 | */ |
3655 | avic_ring_doorbell(vcpu); |
3656 | } else { |
3657 | /* |
3658 | * Wake the vCPU if it was blocking. KVM will then detect the |
3659 | * pending IRQ when checking if the vCPU has a wake event. |
3660 | */ |
3661 | kvm_vcpu_wake_up(vcpu); |
3662 | } |
3663 | } |
3664 | |
3665 | static void svm_deliver_interrupt(struct kvm_lapic *apic, int delivery_mode, |
3666 | int trig_mode, int vector) |
3667 | { |
3668 | kvm_lapic_set_irr(vector, apic); |
3669 | |
3670 | /* |
3671 | * Pairs with the smp_mb_*() after setting vcpu->guest_mode in |
3672 | * vcpu_enter_guest() to ensure the write to the vIRR is ordered before |
3673 | * the read of guest_mode. This guarantees that either VMRUN will see |
3674 | * and process the new vIRR entry, or that svm_complete_interrupt_delivery |
3675 | * will signal the doorbell if the CPU has already entered the guest. |
3676 | */ |
3677 | smp_mb__after_atomic(); |
	svm_complete_interrupt_delivery(apic->vcpu, delivery_mode, trig_mode, vector);
3679 | } |
3680 | |
3681 | static void svm_update_cr8_intercept(struct kvm_vcpu *vcpu, int tpr, int irr) |
3682 | { |
3683 | struct vcpu_svm *svm = to_svm(vcpu); |
3684 | |
3685 | /* |
3686 | * SEV-ES guests must always keep the CR intercepts cleared. CR |
3687 | * tracking is done using the CR write traps. |
3688 | */ |
	if (sev_es_guest(vcpu->kvm))
3690 | return; |
3691 | |
3692 | if (nested_svm_virtualize_tpr(vcpu)) |
3693 | return; |
3694 | |
	svm_clr_intercept(svm, INTERCEPT_CR8_WRITE);
3696 | |
3697 | if (irr == -1) |
3698 | return; |
3699 | |
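	/*
	 * If the priority of the highest pending IRQ doesn't exceed the TPR,
	 * the IRQ is blocked; intercept CR8 writes so that KVM notices when
	 * the guest lowers the TPR enough for the IRQ to be delivered.
	 */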
3700 | if (tpr >= irr) |
		svm_set_intercept(svm, INTERCEPT_CR8_WRITE);
3702 | } |
3703 | |
3704 | static bool svm_get_nmi_mask(struct kvm_vcpu *vcpu) |
3705 | { |
3706 | struct vcpu_svm *svm = to_svm(vcpu); |
3707 | |
3708 | if (is_vnmi_enabled(svm)) |
3709 | return svm->vmcb->control.int_ctl & V_NMI_BLOCKING_MASK; |
3710 | else |
3711 | return svm->nmi_masked; |
3712 | } |
3713 | |
3714 | static void svm_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked) |
3715 | { |
3716 | struct vcpu_svm *svm = to_svm(vcpu); |
3717 | |
3718 | if (is_vnmi_enabled(svm)) { |
3719 | if (masked) |
3720 | svm->vmcb->control.int_ctl |= V_NMI_BLOCKING_MASK; |
3721 | else |
3722 | svm->vmcb->control.int_ctl &= ~V_NMI_BLOCKING_MASK; |
3723 | |
3724 | } else { |
3725 | svm->nmi_masked = masked; |
3726 | if (masked) |
3727 | svm_set_iret_intercept(svm); |
3728 | else |
3729 | svm_clr_iret_intercept(svm); |
3730 | } |
3731 | } |
3732 | |
3733 | bool svm_nmi_blocked(struct kvm_vcpu *vcpu) |
3734 | { |
3735 | struct vcpu_svm *svm = to_svm(vcpu); |
3736 | struct vmcb *vmcb = svm->vmcb; |
3737 | |
3738 | if (!gif_set(svm)) |
3739 | return true; |
3740 | |
3741 | if (is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) |
3742 | return false; |
3743 | |
3744 | if (svm_get_nmi_mask(vcpu)) |
3745 | return true; |
3746 | |
3747 | return vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK; |
3748 | } |
3749 | |
3750 | static int svm_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection) |
3751 | { |
3752 | struct vcpu_svm *svm = to_svm(vcpu); |
3753 | if (svm->nested.nested_run_pending) |
3754 | return -EBUSY; |
3755 | |
3756 | if (svm_nmi_blocked(vcpu)) |
3757 | return 0; |
3758 | |
3759 | /* An NMI must not be injected into L2 if it's supposed to VM-Exit. */ |
3760 | if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(svm)) |
3761 | return -EBUSY; |
3762 | return 1; |
3763 | } |
3764 | |
3765 | bool svm_interrupt_blocked(struct kvm_vcpu *vcpu) |
3766 | { |
3767 | struct vcpu_svm *svm = to_svm(vcpu); |
3768 | struct vmcb *vmcb = svm->vmcb; |
3769 | |
3770 | if (!gif_set(svm)) |
3771 | return true; |
3772 | |
3773 | if (is_guest_mode(vcpu)) { |
3774 | /* As long as interrupts are being delivered... */ |
3775 | if ((svm->nested.ctl.int_ctl & V_INTR_MASKING_MASK) |
3776 | ? !(svm->vmcb01.ptr->save.rflags & X86_EFLAGS_IF) |
3777 | : !(kvm_get_rflags(vcpu) & X86_EFLAGS_IF)) |
3778 | return true; |
3779 | |
3780 | /* ... vmexits aren't blocked by the interrupt shadow */ |
3781 | if (nested_exit_on_intr(svm)) |
3782 | return false; |
3783 | } else { |
3784 | if (!svm_get_if_flag(vcpu)) |
3785 | return true; |
3786 | } |
3787 | |
3788 | return (vmcb->control.int_state & SVM_INTERRUPT_SHADOW_MASK); |
3789 | } |
3790 | |
3791 | static int svm_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection) |
3792 | { |
3793 | struct vcpu_svm *svm = to_svm(vcpu); |
3794 | |
3795 | if (svm->nested.nested_run_pending) |
3796 | return -EBUSY; |
3797 | |
3798 | if (svm_interrupt_blocked(vcpu)) |
3799 | return 0; |
3800 | |
3801 | /* |
3802 | * An IRQ must not be injected into L2 if it's supposed to VM-Exit, |
3803 | * e.g. if the IRQ arrived asynchronously after checking nested events. |
3804 | */ |
3805 | if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(svm)) |
3806 | return -EBUSY; |
3807 | |
3808 | return 1; |
3809 | } |
3810 | |
3811 | static void svm_enable_irq_window(struct kvm_vcpu *vcpu) |
3812 | { |
3813 | struct vcpu_svm *svm = to_svm(vcpu); |
3814 | |
3815 | /* |
3816 | * In case GIF=0 we can't rely on the CPU to tell us when GIF becomes |
3817 | * 1, because that's a separate STGI/VMRUN intercept. The next time we |
3818 | * get that intercept, this function will be called again though and |
3819 | * we'll get the vintr intercept. However, if the vGIF feature is |
3820 | * enabled, the STGI interception will not occur. Enable the irq |
3821 | * window under the assumption that the hardware will set the GIF. |
3822 | */ |
3823 | if (vgif || gif_set(svm)) { |
3824 | /* |
3825 | * IRQ window is not needed when AVIC is enabled, |
3826 | * unless we have pending ExtINT since it cannot be injected |
3827 | * via AVIC. In such case, KVM needs to temporarily disable AVIC, |
3828 | * and fallback to injecting IRQ via V_IRQ. |
3829 | * |
3830 | * If running nested, AVIC is already locally inhibited |
3831 | * on this vCPU, therefore there is no need to request |
3832 | * the VM wide AVIC inhibition. |
3833 | */ |
3834 | if (!is_guest_mode(vcpu)) |
			kvm_set_apicv_inhibit(vcpu->kvm, APICV_INHIBIT_REASON_IRQWIN);
3836 | |
3837 | svm_set_vintr(svm); |
3838 | } |
3839 | } |
3840 | |
3841 | static void svm_enable_nmi_window(struct kvm_vcpu *vcpu) |
3842 | { |
3843 | struct vcpu_svm *svm = to_svm(vcpu); |
3844 | |
3845 | /* |
3846 | * KVM should never request an NMI window when vNMI is enabled, as KVM |
3847 | * allows at most one to-be-injected NMI and one pending NMI, i.e. if |
3848 | * two NMIs arrive simultaneously, KVM will inject one and set |
3849 | * V_NMI_PENDING for the other. WARN, but continue with the standard |
3850 | * single-step approach to try and salvage the pending NMI. |
3851 | */ |
3852 | WARN_ON_ONCE(is_vnmi_enabled(svm)); |
3853 | |
3854 | if (svm_get_nmi_mask(vcpu) && !svm->awaiting_iret_completion) |
3855 | return; /* IRET will cause a vm exit */ |
3856 | |
3857 | /* |
3858 | * SEV-ES guests are responsible for signaling when a vCPU is ready to |
3859 | * receive a new NMI, as SEV-ES guests can't be single-stepped, i.e. |
3860 | * KVM can't intercept and single-step IRET to detect when NMIs are |
3861 | * unblocked (architecturally speaking). See SVM_VMGEXIT_NMI_COMPLETE. |
3862 | * |
3863 | * Note, GIF is guaranteed to be '1' for SEV-ES guests as hardware |
3864 | * ignores SEV-ES guest writes to EFER.SVME *and* CLGI/STGI are not |
3865 | * supported NAEs in the GHCB protocol. |
3866 | */ |
	if (sev_es_guest(vcpu->kvm))
3868 | return; |
3869 | |
3870 | if (!gif_set(svm)) { |
3871 | if (vgif) |
			svm_set_intercept(svm, INTERCEPT_STGI);
3873 | return; /* STGI will cause a vm exit */ |
3874 | } |
3875 | |
3876 | /* |
3877 | * Something prevents NMI from been injected. Single step over possible |
3878 | * problem (IRET or exception injection or interrupt shadow) |
3879 | */ |
3880 | svm->nmi_singlestep_guest_rflags = svm_get_rflags(vcpu); |
3881 | svm->nmi_singlestep = true; |
3882 | svm->vmcb->save.rflags |= (X86_EFLAGS_TF | X86_EFLAGS_RF); |
3883 | } |
3884 | |
3885 | static void svm_flush_tlb_asid(struct kvm_vcpu *vcpu) |
3886 | { |
3887 | struct vcpu_svm *svm = to_svm(vcpu); |
3888 | |
3889 | /* |
3890 | * Unlike VMX, SVM doesn't provide a way to flush only NPT TLB entries. |
3891 | * A TLB flush for the current ASID flushes both "host" and "guest" TLB |
3892 | * entries, and thus is a superset of Hyper-V's fine grained flushing. |
3893 | */ |
3894 | kvm_hv_vcpu_purge_flush_tlb(vcpu); |
3895 | |
3896 | /* |
3897 | * Flush only the current ASID even if the TLB flush was invoked via |
3898 | * kvm_flush_remote_tlbs(). Although flushing remote TLBs requires all |
3899 | * ASIDs to be flushed, KVM uses a single ASID for L1 and L2, and |
3900 | * unconditionally does a TLB flush on both nested VM-Enter and nested |
3901 | * VM-Exit (via kvm_mmu_reset_context()). |
3902 | */ |
3903 | if (static_cpu_has(X86_FEATURE_FLUSHBYASID)) |
3904 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_FLUSH_ASID; |
3905 | else |
3906 | svm->current_vmcb->asid_generation--; |
3907 | } |
3908 | |
3909 | static void svm_flush_tlb_current(struct kvm_vcpu *vcpu) |
3910 | { |
3911 | hpa_t root_tdp = vcpu->arch.mmu->root.hpa; |
3912 | |
3913 | /* |
3914 | * When running on Hyper-V with EnlightenedNptTlb enabled, explicitly |
3915 | * flush the NPT mappings via hypercall as flushing the ASID only |
3916 | * affects virtual to physical mappings, it does not invalidate guest |
3917 | * physical to host physical mappings. |
3918 | */ |
3919 | if (svm_hv_is_enlightened_tlb_enabled(vcpu) && VALID_PAGE(root_tdp)) |
		hyperv_flush_guest_mapping(root_tdp);
3921 | |
3922 | svm_flush_tlb_asid(vcpu); |
3923 | } |
3924 | |
3925 | static void svm_flush_tlb_all(struct kvm_vcpu *vcpu) |
3926 | { |
3927 | /* |
3928 | * When running on Hyper-V with EnlightenedNptTlb enabled, remote TLB |
3929 | * flushes should be routed to hv_flush_remote_tlbs() without requesting |
3930 | * a "regular" remote flush. Reaching this point means either there's |
3931 | * a KVM bug or a prior hv_flush_remote_tlbs() call failed, both of |
3932 | * which might be fatal to the guest. Yell, but try to recover. |
3933 | */ |
3934 | if (WARN_ON_ONCE(svm_hv_is_enlightened_tlb_enabled(vcpu))) |
3935 | hv_flush_remote_tlbs(vcpu->kvm); |
3936 | |
3937 | svm_flush_tlb_asid(vcpu); |
3938 | } |
3939 | |
3940 | static void svm_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t gva) |
3941 | { |
3942 | struct vcpu_svm *svm = to_svm(vcpu); |
3943 | |
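	/* INVLPGA flushes TLB entries only for the given address and ASID. */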
	invlpga(gva, svm->vmcb->control.asid);
3945 | } |
3946 | |
3947 | static inline void sync_cr8_to_lapic(struct kvm_vcpu *vcpu) |
3948 | { |
3949 | struct vcpu_svm *svm = to_svm(vcpu); |
3950 | |
3951 | if (nested_svm_virtualize_tpr(vcpu)) |
3952 | return; |
3953 | |
	if (!svm_is_intercept(svm, INTERCEPT_CR8_WRITE)) {
3955 | int cr8 = svm->vmcb->control.int_ctl & V_TPR_MASK; |
3956 | kvm_set_cr8(vcpu, cr8); |
3957 | } |
3958 | } |
3959 | |
3960 | static inline void sync_lapic_to_cr8(struct kvm_vcpu *vcpu) |
3961 | { |
3962 | struct vcpu_svm *svm = to_svm(vcpu); |
3963 | u64 cr8; |
3964 | |
3965 | if (nested_svm_virtualize_tpr(vcpu) || |
3966 | kvm_vcpu_apicv_active(vcpu)) |
3967 | return; |
3968 | |
3969 | cr8 = kvm_get_cr8(vcpu); |
3970 | svm->vmcb->control.int_ctl &= ~V_TPR_MASK; |
3971 | svm->vmcb->control.int_ctl |= cr8 & V_TPR_MASK; |
3972 | } |
3973 | |
3974 | static void svm_complete_soft_interrupt(struct kvm_vcpu *vcpu, u8 vector, |
3975 | int type) |
3976 | { |
3977 | bool is_exception = (type == SVM_EXITINTINFO_TYPE_EXEPT); |
3978 | bool is_soft = (type == SVM_EXITINTINFO_TYPE_SOFT); |
3979 | struct vcpu_svm *svm = to_svm(vcpu); |
3980 | |
3981 | /* |
3982 | * If NRIPS is enabled, KVM must snapshot the pre-VMRUN next_rip that's |
3983 | * associated with the original soft exception/interrupt. next_rip is |
3984 | * cleared on all exits that can occur while vectoring an event, so KVM |
3985 | * needs to manually set next_rip for re-injection. Unlike the !nrips |
3986 | * case below, this needs to be done if and only if KVM is re-injecting |
3987 | * the same event, i.e. if the event is a soft exception/interrupt, |
3988 | * otherwise next_rip is unused on VMRUN. |
3989 | */ |
3990 | if (nrips && (is_soft || (is_exception && kvm_exception_is_soft(vector))) && |
	    kvm_is_linear_rip(vcpu, svm->soft_int_old_rip + svm->soft_int_csbase))
3992 | svm->vmcb->control.next_rip = svm->soft_int_next_rip; |
3993 | /* |
3994 | * If NRIPS isn't enabled, KVM must manually advance RIP prior to |
3995 | * injecting the soft exception/interrupt. That advancement needs to |
3996 | * be unwound if vectoring didn't complete. Note, the new event may |
3997 | * not be the injected event, e.g. if KVM injected an INTn, the INTn |
3998 | * hit a #NP in the guest, and the #NP encountered a #PF, the #NP will |
3999 | * be the reported vectored event, but RIP still needs to be unwound. |
4000 | */ |
4001 | else if (!nrips && (is_soft || is_exception) && |
		 kvm_is_linear_rip(vcpu, svm->soft_int_next_rip + svm->soft_int_csbase))
4003 | kvm_rip_write(vcpu, svm->soft_int_old_rip); |
4004 | } |
4005 | |
4006 | static void svm_complete_interrupts(struct kvm_vcpu *vcpu) |
4007 | { |
4008 | struct vcpu_svm *svm = to_svm(vcpu); |
4009 | u8 vector; |
4010 | int type; |
4011 | u32 exitintinfo = svm->vmcb->control.exit_int_info; |
4012 | bool nmi_l1_to_l2 = svm->nmi_l1_to_l2; |
4013 | bool soft_int_injected = svm->soft_int_injected; |
4014 | |
4015 | svm->nmi_l1_to_l2 = false; |
4016 | svm->soft_int_injected = false; |
4017 | |
4018 | /* |
4019 | * If we've made progress since setting awaiting_iret_completion, we've |
4020 | * executed an IRET and can allow NMI injection. |
4021 | */ |
4022 | if (svm->awaiting_iret_completion && |
4023 | kvm_rip_read(vcpu) != svm->nmi_iret_rip) { |
4024 | svm->awaiting_iret_completion = false; |
4025 | svm->nmi_masked = false; |
4026 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
4027 | } |
4028 | |
4029 | vcpu->arch.nmi_injected = false; |
4030 | kvm_clear_exception_queue(vcpu); |
4031 | kvm_clear_interrupt_queue(vcpu); |
4032 | |
4033 | if (!(exitintinfo & SVM_EXITINTINFO_VALID)) |
4034 | return; |
4035 | |
4036 | kvm_make_request(KVM_REQ_EVENT, vcpu); |
4037 | |
4038 | vector = exitintinfo & SVM_EXITINTINFO_VEC_MASK; |
4039 | type = exitintinfo & SVM_EXITINTINFO_TYPE_MASK; |
4040 | |
4041 | if (soft_int_injected) |
4042 | svm_complete_soft_interrupt(vcpu, vector, type); |
4043 | |
4044 | switch (type) { |
4045 | case SVM_EXITINTINFO_TYPE_NMI: |
4046 | vcpu->arch.nmi_injected = true; |
4047 | svm->nmi_l1_to_l2 = nmi_l1_to_l2; |
4048 | break; |
4049 | case SVM_EXITINTINFO_TYPE_EXEPT: |
4050 | /* |
4051 | * Never re-inject a #VC exception. |
4052 | */ |
4053 | if (vector == X86_TRAP_VC) |
4054 | break; |
4055 | |
4056 | if (exitintinfo & SVM_EXITINTINFO_VALID_ERR) { |
4057 | u32 err = svm->vmcb->control.exit_int_info_err; |
			kvm_requeue_exception_e(vcpu, vector, err);
4059 | |
4060 | } else |
			kvm_requeue_exception(vcpu, vector);
4062 | break; |
4063 | case SVM_EXITINTINFO_TYPE_INTR: |
4064 | kvm_queue_interrupt(vcpu, vector, false); |
4065 | break; |
4066 | case SVM_EXITINTINFO_TYPE_SOFT: |
4067 | kvm_queue_interrupt(vcpu, vector, true); |
4068 | break; |
4069 | default: |
4070 | break; |
4071 | } |
4072 | |
4073 | } |
4074 | |
4075 | static void svm_cancel_injection(struct kvm_vcpu *vcpu) |
4076 | { |
4077 | struct vcpu_svm *svm = to_svm(vcpu); |
4078 | struct vmcb_control_area *control = &svm->vmcb->control; |
4079 | |
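	/*
	 * Move the to-be-injected event from EVENTINJ back into EXITINTINFO
	 * so that svm_complete_interrupts() re-queues it as a pending event
	 * rather than treating it as successfully injected.
	 */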
4080 | control->exit_int_info = control->event_inj; |
4081 | control->exit_int_info_err = control->event_inj_err; |
4082 | control->event_inj = 0; |
4083 | svm_complete_interrupts(vcpu); |
4084 | } |
4085 | |
4086 | static int svm_vcpu_pre_run(struct kvm_vcpu *vcpu) |
4087 | { |
4088 | return 1; |
4089 | } |
4090 | |
4091 | static fastpath_t svm_exit_handlers_fastpath(struct kvm_vcpu *vcpu) |
4092 | { |
4093 | if (is_guest_mode(vcpu)) |
4094 | return EXIT_FASTPATH_NONE; |
4095 | |
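	/* exit_info_1 is non-zero for WRMSR; only MSR writes have a fastpath. */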
4096 | if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_MSR && |
4097 | to_svm(vcpu)->vmcb->control.exit_info_1) |
4098 | return handle_fastpath_set_msr_irqoff(vcpu); |
4099 | |
4100 | return EXIT_FASTPATH_NONE; |
4101 | } |
4102 | |
4103 | static noinstr void svm_vcpu_enter_exit(struct kvm_vcpu *vcpu, bool spec_ctrl_intercepted) |
4104 | { |
4105 | struct svm_cpu_data *sd = per_cpu_ptr(&svm_data, vcpu->cpu); |
4106 | struct vcpu_svm *svm = to_svm(vcpu); |
4107 | |
4108 | guest_state_enter_irqoff(); |
4109 | |
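	/*
	 * Clear stale divider state so the guest can't observe a leftover
	 * quotient via the Zen1 DIV0 speculation erratum (CVE-2023-20588).
	 */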
4110 | amd_clear_divider(); |
4111 | |
	if (sev_es_guest(vcpu->kvm))
		__svm_sev_es_vcpu_run(svm, spec_ctrl_intercepted,
				      sev_es_host_save_area(sd));
4115 | else |
4116 | __svm_vcpu_run(svm, spec_ctrl_intercepted); |
4117 | |
4118 | guest_state_exit_irqoff(); |
4119 | } |
4120 | |
4121 | static __no_kcsan fastpath_t svm_vcpu_run(struct kvm_vcpu *vcpu, |
4122 | bool force_immediate_exit) |
4123 | { |
4124 | struct vcpu_svm *svm = to_svm(vcpu); |
4125 | bool spec_ctrl_intercepted = msr_write_intercepted(vcpu, MSR_IA32_SPEC_CTRL); |
4126 | |
4127 | trace_kvm_entry(vcpu, force_immediate_exit); |
4128 | |
4129 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; |
4130 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; |
4131 | svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; |
4132 | |
4133 | /* |
4134 | * Disable singlestep if we're injecting an interrupt/exception. |
4135 | * We don't want our modified rflags to be pushed on the stack where |
4136 | * we might not be able to easily reset them if we disabled NMI |
4137 | * singlestep later. |
4138 | */ |
4139 | if (svm->nmi_singlestep && svm->vmcb->control.event_inj) { |
4140 | /* |
4141 | * Event injection happens before external interrupts cause a |
4142 | * vmexit and interrupts are disabled here, so smp_send_reschedule |
4143 | * is enough to force an immediate vmexit. |
4144 | */ |
4145 | disable_nmi_singlestep(svm); |
4146 | force_immediate_exit = true; |
4147 | } |
4148 | |
4149 | if (force_immediate_exit) |
4150 | smp_send_reschedule(vcpu->cpu); |
4151 | |
4152 | pre_svm_run(vcpu); |
4153 | |
4154 | sync_lapic_to_cr8(vcpu); |
4155 | |
4156 | if (unlikely(svm->asid != svm->vmcb->control.asid)) { |
4157 | svm->vmcb->control.asid = svm->asid; |
		vmcb_mark_dirty(svm->vmcb, VMCB_ASID);
4159 | } |
4160 | svm->vmcb->save.cr2 = vcpu->arch.cr2; |
4161 | |
	svm_hv_update_vp_id(svm->vmcb, vcpu);
4163 | |
4164 | /* |
4165 | * Run with all-zero DR6 unless needed, so that we can get the exact cause |
4166 | * of a #DB. |
4167 | */ |
4168 | if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)) |
		svm_set_dr6(svm, vcpu->arch.dr6);
4170 | else |
4171 | svm_set_dr6(svm, DR6_ACTIVE_LOW); |
4172 | |
4173 | clgi(); |
4174 | kvm_load_guest_xsave_state(vcpu); |
4175 | |
4176 | kvm_wait_lapic_expire(vcpu); |
4177 | |
4178 | /* |
4179 | * If this vCPU has touched SPEC_CTRL, restore the guest's value if |
4180 | * it's non-zero. Since vmentry is serialising on affected CPUs, there |
4181 | * is no need to worry about the conditional branch over the wrmsr |
4182 | * being speculatively taken. |
4183 | */ |
4184 | if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) |
		x86_spec_ctrl_set_guest(svm->virt_spec_ctrl);
4186 | |
4187 | svm_vcpu_enter_exit(vcpu, spec_ctrl_intercepted); |
4188 | |
4189 | if (!static_cpu_has(X86_FEATURE_V_SPEC_CTRL)) |
		x86_spec_ctrl_restore_host(svm->virt_spec_ctrl);
4191 | |
	if (!sev_es_guest(vcpu->kvm)) {
4193 | vcpu->arch.cr2 = svm->vmcb->save.cr2; |
4194 | vcpu->arch.regs[VCPU_REGS_RAX] = svm->vmcb->save.rax; |
4195 | vcpu->arch.regs[VCPU_REGS_RSP] = svm->vmcb->save.rsp; |
4196 | vcpu->arch.regs[VCPU_REGS_RIP] = svm->vmcb->save.rip; |
4197 | } |
4198 | vcpu->arch.regs_dirty = 0; |
4199 | |
4200 | if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) |
4201 | kvm_before_interrupt(vcpu, KVM_HANDLING_NMI); |
4202 | |
4203 | kvm_load_host_xsave_state(vcpu); |
4204 | stgi(); |
4205 | |
4206 | /* Any pending NMI will happen here */ |
4207 | |
4208 | if (unlikely(svm->vmcb->control.exit_code == SVM_EXIT_NMI)) |
4209 | kvm_after_interrupt(vcpu); |
4210 | |
4211 | sync_cr8_to_lapic(vcpu); |
4212 | |
4213 | svm->next_rip = 0; |
4214 | if (is_guest_mode(vcpu)) { |
4215 | nested_sync_control_from_vmcb02(svm); |
4216 | |
		/* Track VMRUNs that have made it past consistency checking */
4218 | if (svm->nested.nested_run_pending && |
4219 | svm->vmcb->control.exit_code != SVM_EXIT_ERR) |
4220 | ++vcpu->stat.nested_run; |
4221 | |
4222 | svm->nested.nested_run_pending = 0; |
4223 | } |
4224 | |
4225 | svm->vmcb->control.tlb_ctl = TLB_CONTROL_DO_NOTHING; |
	vmcb_mark_all_clean(svm->vmcb);
4227 | |
4228 | /* if exit due to PF check for async PF */ |
4229 | if (svm->vmcb->control.exit_code == SVM_EXIT_EXCP_BASE + PF_VECTOR) |
4230 | vcpu->arch.apf.host_apf_flags = |
4231 | kvm_read_and_reset_apf_flags(); |
4232 | |
4233 | vcpu->arch.regs_avail &= ~SVM_REGS_LAZY_LOAD_SET; |
4234 | |
4235 | /* |
4236 | * We need to handle MC intercepts here before the vcpu has a chance to |
4237 | * change the physical cpu |
4238 | */ |
4239 | if (unlikely(svm->vmcb->control.exit_code == |
4240 | SVM_EXIT_EXCP_BASE + MC_VECTOR)) |
4241 | svm_handle_mce(vcpu); |
4242 | |
4243 | trace_kvm_exit(vcpu, KVM_ISA_SVM); |
4244 | |
4245 | svm_complete_interrupts(vcpu); |
4246 | |
4247 | return svm_exit_handlers_fastpath(vcpu); |
4248 | } |
4249 | |
4250 | static void svm_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa, |
4251 | int root_level) |
4252 | { |
4253 | struct vcpu_svm *svm = to_svm(vcpu); |
4254 | unsigned long cr3; |
4255 | |
4256 | if (npt_enabled) { |
4257 | svm->vmcb->control.nested_cr3 = __sme_set(root_hpa); |
		vmcb_mark_dirty(svm->vmcb, VMCB_NPT);
4259 | |
4260 | hv_track_root_tdp(vcpu, root_hpa); |
4261 | |
4262 | cr3 = vcpu->arch.cr3; |
4263 | } else if (root_level >= PT64_ROOT_4LEVEL) { |
4264 | cr3 = __sme_set(root_hpa) | kvm_get_active_pcid(vcpu); |
4265 | } else { |
4266 | /* PCID in the guest should be impossible with a 32-bit MMU. */ |
4267 | WARN_ON_ONCE(kvm_get_active_pcid(vcpu)); |
4268 | cr3 = root_hpa; |
4269 | } |
4270 | |
4271 | svm->vmcb->save.cr3 = cr3; |
	vmcb_mark_dirty(svm->vmcb, VMCB_CR);
4273 | } |
4274 | |
4275 | static void |
4276 | svm_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall) |
4277 | { |
4278 | /* |
4279 | * Patch in the VMMCALL instruction: |
4280 | */ |
4281 | hypercall[0] = 0x0f; |
4282 | hypercall[1] = 0x01; |
4283 | hypercall[2] = 0xd9; |
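	/* 0f 01 d9 is the opcode of VMMCALL. */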
4284 | } |
4285 | |
4286 | /* |
4287 | * The kvm parameter can be NULL (module initialization, or invocation before |
4288 | * VM creation). Be sure to check the kvm parameter before using it. |
4289 | */ |
4290 | static bool svm_has_emulated_msr(struct kvm *kvm, u32 index) |
4291 | { |
4292 | switch (index) { |
4293 | case MSR_IA32_MCG_EXT_CTL: |
4294 | case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR: |
4295 | return false; |
4296 | case MSR_IA32_SMBASE: |
4297 | if (!IS_ENABLED(CONFIG_KVM_SMM)) |
4298 | return false; |
4299 | /* SEV-ES guests do not support SMM, so report false */ |
4300 | if (kvm && sev_es_guest(kvm)) |
4301 | return false; |
4302 | break; |
4303 | default: |
4304 | break; |
4305 | } |
4306 | |
4307 | return true; |
4308 | } |
4309 | |
4310 | static void svm_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu) |
4311 | { |
4312 | struct vcpu_svm *svm = to_svm(vcpu); |
4313 | |
4314 | /* |
	 * SVM doesn't provide a way to disable just XSAVES in the guest; KVM
	 * can only disable all variants by disallowing CR4.OSXSAVE from
4317 | * being set. As a result, if the host has XSAVE and XSAVES, and the |
4318 | * guest has XSAVE enabled, the guest can execute XSAVES without |
4319 | * faulting. Treat XSAVES as enabled in this case regardless of |
4320 | * whether it's advertised to the guest so that KVM context switches |
4321 | * XSS on VM-Enter/VM-Exit. Failure to do so would effectively give |
4322 | * the guest read/write access to the host's XSS. |
4323 | */ |
4324 | if (boot_cpu_has(X86_FEATURE_XSAVE) && |
4325 | boot_cpu_has(X86_FEATURE_XSAVES) && |
4326 | guest_cpuid_has(vcpu, X86_FEATURE_XSAVE)) |
4327 | kvm_governed_feature_set(vcpu, X86_FEATURE_XSAVES); |
4328 | |
4329 | kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_NRIPS); |
4330 | kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_TSCRATEMSR); |
4331 | kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LBRV); |
4332 | |
4333 | /* |
4334 | * Intercept VMLOAD if the vCPU mode is Intel in order to emulate that |
4335 | * VMLOAD drops bits 63:32 of SYSENTER (ignoring the fact that exposing |
4336 | * SVM on Intel is bonkers and extremely unlikely to work). |
4337 | */ |
4338 | if (!guest_cpuid_is_intel(vcpu)) |
4339 | kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_V_VMSAVE_VMLOAD); |
4340 | |
4341 | kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PAUSEFILTER); |
4342 | kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_PFTHRESHOLD); |
4343 | kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VGIF); |
4344 | kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VNMI); |
4345 | |
4346 | svm_recalc_instruction_intercepts(vcpu, svm); |
4347 | |
4348 | if (boot_cpu_has(X86_FEATURE_IBPB)) |
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_PRED_CMD, 0,
				     !!guest_has_pred_cmd_msr(vcpu));
4351 | |
4352 | if (boot_cpu_has(X86_FEATURE_FLUSH_L1D)) |
		set_msr_interception(vcpu, svm->msrpm, MSR_IA32_FLUSH_CMD, 0,
				     !!guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
4355 | |
	if (sev_guest(vcpu->kvm))
4357 | sev_vcpu_after_set_cpuid(svm); |
4358 | |
4359 | init_vmcb_after_set_cpuid(vcpu); |
4360 | } |
4361 | |
4362 | static bool svm_has_wbinvd_exit(void) |
4363 | { |
4364 | return true; |
4365 | } |
4366 | |
4367 | #define PRE_EX(exit) { .exit_code = (exit), \ |
4368 | .stage = X86_ICPT_PRE_EXCEPT, } |
4369 | #define POST_EX(exit) { .exit_code = (exit), \ |
4370 | .stage = X86_ICPT_POST_EXCEPT, } |
4371 | #define POST_MEM(exit) { .exit_code = (exit), \ |
4372 | .stage = X86_ICPT_POST_MEMACCESS, } |
4373 | |
4374 | static const struct __x86_intercept { |
4375 | u32 exit_code; |
4376 | enum x86_intercept_stage stage; |
4377 | } x86_intercept_map[] = { |
4378 | [x86_intercept_cr_read] = POST_EX(SVM_EXIT_READ_CR0), |
4379 | [x86_intercept_cr_write] = POST_EX(SVM_EXIT_WRITE_CR0), |
4380 | [x86_intercept_clts] = POST_EX(SVM_EXIT_WRITE_CR0), |
4381 | [x86_intercept_lmsw] = POST_EX(SVM_EXIT_WRITE_CR0), |
4382 | [x86_intercept_smsw] = POST_EX(SVM_EXIT_READ_CR0), |
4383 | [x86_intercept_dr_read] = POST_EX(SVM_EXIT_READ_DR0), |
4384 | [x86_intercept_dr_write] = POST_EX(SVM_EXIT_WRITE_DR0), |
4385 | [x86_intercept_sldt] = POST_EX(SVM_EXIT_LDTR_READ), |
4386 | [x86_intercept_str] = POST_EX(SVM_EXIT_TR_READ), |
4387 | [x86_intercept_lldt] = POST_EX(SVM_EXIT_LDTR_WRITE), |
4388 | [x86_intercept_ltr] = POST_EX(SVM_EXIT_TR_WRITE), |
4389 | [x86_intercept_sgdt] = POST_EX(SVM_EXIT_GDTR_READ), |
4390 | [x86_intercept_sidt] = POST_EX(SVM_EXIT_IDTR_READ), |
4391 | [x86_intercept_lgdt] = POST_EX(SVM_EXIT_GDTR_WRITE), |
4392 | [x86_intercept_lidt] = POST_EX(SVM_EXIT_IDTR_WRITE), |
4393 | [x86_intercept_vmrun] = POST_EX(SVM_EXIT_VMRUN), |
4394 | [x86_intercept_vmmcall] = POST_EX(SVM_EXIT_VMMCALL), |
4395 | [x86_intercept_vmload] = POST_EX(SVM_EXIT_VMLOAD), |
4396 | [x86_intercept_vmsave] = POST_EX(SVM_EXIT_VMSAVE), |
4397 | [x86_intercept_stgi] = POST_EX(SVM_EXIT_STGI), |
4398 | [x86_intercept_clgi] = POST_EX(SVM_EXIT_CLGI), |
4399 | [x86_intercept_skinit] = POST_EX(SVM_EXIT_SKINIT), |
4400 | [x86_intercept_invlpga] = POST_EX(SVM_EXIT_INVLPGA), |
4401 | [x86_intercept_rdtscp] = POST_EX(SVM_EXIT_RDTSCP), |
4402 | [x86_intercept_monitor] = POST_MEM(SVM_EXIT_MONITOR), |
4403 | [x86_intercept_mwait] = POST_EX(SVM_EXIT_MWAIT), |
4404 | [x86_intercept_invlpg] = POST_EX(SVM_EXIT_INVLPG), |
4405 | [x86_intercept_invd] = POST_EX(SVM_EXIT_INVD), |
4406 | [x86_intercept_wbinvd] = POST_EX(SVM_EXIT_WBINVD), |
4407 | [x86_intercept_wrmsr] = POST_EX(SVM_EXIT_MSR), |
4408 | [x86_intercept_rdtsc] = POST_EX(SVM_EXIT_RDTSC), |
4409 | [x86_intercept_rdmsr] = POST_EX(SVM_EXIT_MSR), |
4410 | [x86_intercept_rdpmc] = POST_EX(SVM_EXIT_RDPMC), |
4411 | [x86_intercept_cpuid] = PRE_EX(SVM_EXIT_CPUID), |
4412 | [x86_intercept_rsm] = PRE_EX(SVM_EXIT_RSM), |
4413 | [x86_intercept_pause] = PRE_EX(SVM_EXIT_PAUSE), |
4414 | [x86_intercept_pushf] = PRE_EX(SVM_EXIT_PUSHF), |
4415 | [x86_intercept_popf] = PRE_EX(SVM_EXIT_POPF), |
4416 | [x86_intercept_intn] = PRE_EX(SVM_EXIT_SWINT), |
4417 | [x86_intercept_iret] = PRE_EX(SVM_EXIT_IRET), |
4418 | [x86_intercept_icebp] = PRE_EX(SVM_EXIT_ICEBP), |
4419 | [x86_intercept_hlt] = POST_EX(SVM_EXIT_HLT), |
4420 | [x86_intercept_in] = POST_EX(SVM_EXIT_IOIO), |
4421 | [x86_intercept_ins] = POST_EX(SVM_EXIT_IOIO), |
4422 | [x86_intercept_out] = POST_EX(SVM_EXIT_IOIO), |
4423 | [x86_intercept_outs] = POST_EX(SVM_EXIT_IOIO), |
4424 | [x86_intercept_xsetbv] = PRE_EX(SVM_EXIT_XSETBV), |
4425 | }; |
4426 | |
4427 | #undef PRE_EX |
4428 | #undef POST_EX |
4429 | #undef POST_MEM |
4430 | |
4431 | static int svm_check_intercept(struct kvm_vcpu *vcpu, |
4432 | struct x86_instruction_info *info, |
4433 | enum x86_intercept_stage stage, |
4434 | struct x86_exception *exception) |
4435 | { |
4436 | struct vcpu_svm *svm = to_svm(vcpu); |
4437 | int vmexit, ret = X86EMUL_CONTINUE; |
4438 | struct __x86_intercept icpt_info; |
4439 | struct vmcb *vmcb = svm->vmcb; |
4440 | |
4441 | if (info->intercept >= ARRAY_SIZE(x86_intercept_map)) |
4442 | goto out; |
4443 | |
4444 | icpt_info = x86_intercept_map[info->intercept]; |
4445 | |
4446 | if (stage != icpt_info.stage) |
4447 | goto out; |
4448 | |
4449 | switch (icpt_info.exit_code) { |
4450 | case SVM_EXIT_READ_CR0: |
4451 | if (info->intercept == x86_intercept_cr_read) |
4452 | icpt_info.exit_code += info->modrm_reg; |
4453 | break; |
4454 | case SVM_EXIT_WRITE_CR0: { |
4455 | unsigned long cr0, val; |
4456 | |
4457 | if (info->intercept == x86_intercept_cr_write) |
4458 | icpt_info.exit_code += info->modrm_reg; |
4459 | |
4460 | if (icpt_info.exit_code != SVM_EXIT_WRITE_CR0 || |
4461 | info->intercept == x86_intercept_clts) |
4462 | break; |
4463 | |
		if (!(vmcb12_is_intercept(&svm->nested.ctl,
					  INTERCEPT_SELECTIVE_CR0)))
4466 | break; |
4467 | |
4468 | cr0 = vcpu->arch.cr0 & ~SVM_CR0_SELECTIVE_MASK; |
4469 | val = info->src_val & ~SVM_CR0_SELECTIVE_MASK; |
4470 | |
4471 | if (info->intercept == x86_intercept_lmsw) { |
4472 | cr0 &= 0xfUL; |
4473 | val &= 0xfUL; |
4474 | /* lmsw can't clear PE - catch this here */ |
4475 | if (cr0 & X86_CR0_PE) |
4476 | val |= X86_CR0_PE; |
4477 | } |
4478 | |
4479 | if (cr0 ^ val) |
4480 | icpt_info.exit_code = SVM_EXIT_CR0_SEL_WRITE; |
4481 | |
4482 | break; |
4483 | } |
4484 | case SVM_EXIT_READ_DR0: |
4485 | case SVM_EXIT_WRITE_DR0: |
4486 | icpt_info.exit_code += info->modrm_reg; |
4487 | break; |
4488 | case SVM_EXIT_MSR: |
4489 | if (info->intercept == x86_intercept_wrmsr) |
4490 | vmcb->control.exit_info_1 = 1; |
4491 | else |
4492 | vmcb->control.exit_info_1 = 0; |
4493 | break; |
4494 | case SVM_EXIT_PAUSE: |
4495 | /* |
4496 | * We get this for NOP only, but pause |
4497 | * is rep not, check this here |
4498 | */ |
4499 | if (info->rep_prefix != REPE_PREFIX) |
4500 | goto out; |
4501 | break; |
4502 | case SVM_EXIT_IOIO: { |
4503 | u64 exit_info; |
4504 | u32 bytes; |
4505 | |
4506 | if (info->intercept == x86_intercept_in || |
4507 | info->intercept == x86_intercept_ins) { |
4508 | exit_info = ((info->src_val & 0xffff) << 16) | |
4509 | SVM_IOIO_TYPE_MASK; |
4510 | bytes = info->dst_bytes; |
4511 | } else { |
4512 | exit_info = (info->dst_val & 0xffff) << 16; |
4513 | bytes = info->src_bytes; |
4514 | } |
4515 | |
4516 | if (info->intercept == x86_intercept_outs || |
4517 | info->intercept == x86_intercept_ins) |
4518 | exit_info |= SVM_IOIO_STR_MASK; |
4519 | |
4520 | if (info->rep_prefix) |
4521 | exit_info |= SVM_IOIO_REP_MASK; |
4522 | |
4523 | bytes = min(bytes, 4u); |
4524 | |
4525 | exit_info |= bytes << SVM_IOIO_SIZE_SHIFT; |
4526 | |
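		/* ad_bytes is 2, 4 or 8; the shift sets one of A16/A32/A64. */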
4527 | exit_info |= (u32)info->ad_bytes << (SVM_IOIO_ASIZE_SHIFT - 1); |
4528 | |
4529 | vmcb->control.exit_info_1 = exit_info; |
4530 | vmcb->control.exit_info_2 = info->next_rip; |
4531 | |
4532 | break; |
4533 | } |
4534 | default: |
4535 | break; |
4536 | } |
4537 | |
4538 | /* TODO: Advertise NRIPS to guest hypervisor unconditionally */ |
4539 | if (static_cpu_has(X86_FEATURE_NRIPS)) |
4540 | vmcb->control.next_rip = info->next_rip; |
4541 | vmcb->control.exit_code = icpt_info.exit_code; |
4542 | vmexit = nested_svm_exit_handled(svm); |
4543 | |
4544 | ret = (vmexit == NESTED_EXIT_DONE) ? X86EMUL_INTERCEPTED |
4545 | : X86EMUL_CONTINUE; |
4546 | |
4547 | out: |
4548 | return ret; |
4549 | } |
4550 | |
4551 | static void svm_handle_exit_irqoff(struct kvm_vcpu *vcpu) |
4552 | { |
4553 | if (to_svm(vcpu)->vmcb->control.exit_code == SVM_EXIT_INTR) |
4554 | vcpu->arch.at_instruction_boundary = true; |
4555 | } |
4556 | |
4557 | static void svm_sched_in(struct kvm_vcpu *vcpu, int cpu) |
4558 | { |
4559 | if (!kvm_pause_in_guest(vcpu->kvm)) |
4560 | shrink_ple_window(vcpu); |
4561 | } |
4562 | |
4563 | static void svm_setup_mce(struct kvm_vcpu *vcpu) |
4564 | { |
4565 | /* [63:9] are reserved. */ |
4566 | vcpu->arch.mcg_cap &= 0x1ff; |
4567 | } |
4568 | |
4569 | #ifdef CONFIG_KVM_SMM |
4570 | bool svm_smi_blocked(struct kvm_vcpu *vcpu) |
4571 | { |
4572 | struct vcpu_svm *svm = to_svm(vcpu); |
4573 | |
4574 | /* Per APM Vol.2 15.22.2 "Response to SMI" */ |
4575 | if (!gif_set(svm)) |
4576 | return true; |
4577 | |
4578 | return is_smm(vcpu); |
4579 | } |
4580 | |
4581 | static int svm_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection) |
4582 | { |
4583 | struct vcpu_svm *svm = to_svm(vcpu); |
4584 | if (svm->nested.nested_run_pending) |
4585 | return -EBUSY; |
4586 | |
4587 | if (svm_smi_blocked(vcpu)) |
4588 | return 0; |
4589 | |
4590 | /* An SMI must not be injected into L2 if it's supposed to VM-Exit. */ |
4591 | if (for_injection && is_guest_mode(vcpu) && nested_exit_on_smi(svm)) |
4592 | return -EBUSY; |
4593 | |
4594 | return 1; |
4595 | } |
4596 | |
4597 | static int svm_enter_smm(struct kvm_vcpu *vcpu, union kvm_smram *smram) |
4598 | { |
4599 | struct vcpu_svm *svm = to_svm(vcpu); |
4600 | struct kvm_host_map map_save; |
4601 | int ret; |
4602 | |
4603 | if (!is_guest_mode(vcpu)) |
4604 | return 0; |
4605 | |
4606 | /* |
4607 | * 32-bit SMRAM format doesn't preserve EFER and SVM state. Userspace is |
4608 | * responsible for ensuring nested SVM and SMIs are mutually exclusive. |
4609 | */ |
4610 | |
4611 | if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) |
4612 | return 1; |
4613 | |
4614 | smram->smram64.svm_guest_flag = 1; |
4615 | smram->smram64.svm_guest_vmcb_gpa = svm->nested.vmcb12_gpa; |
4616 | |
4617 | svm->vmcb->save.rax = vcpu->arch.regs[VCPU_REGS_RAX]; |
4618 | svm->vmcb->save.rsp = vcpu->arch.regs[VCPU_REGS_RSP]; |
4619 | svm->vmcb->save.rip = vcpu->arch.regs[VCPU_REGS_RIP]; |
4620 | |
4621 | ret = nested_svm_simple_vmexit(svm, SVM_EXIT_SW); |
4622 | if (ret) |
4623 | return ret; |
4624 | |
4625 | /* |
	 * KVM uses VMCB01 to store L1 host state while L2 runs but
	 * VMCB01 is going to be used during SMM and thus the state will
	 * be lost. Temporarily save non-VMLOAD/VMSAVE state to the host
	 * save area pointed to by MSR_VM_HSAVE_PA. APM guarantees that
	 * the format of the area is identical to the guest save area,
	 * offset by 0x400 (matching the offset of 'struct vmcb_save_area'
	 * within 'struct vmcb'). Note: the HSAVE area may also be used by
	 * the L1 hypervisor to save additional host context (e.g. KVM does
	 * that, see svm_prepare_switch_to_guest()), which must be
	 * preserved.
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4638 | return 1; |
4639 | |
4640 | BUILD_BUG_ON(offsetof(struct vmcb, save) != 0x400); |
4641 | |
	svm_copy_vmrun_state(map_save.hva + 0x400,
			     &svm->vmcb01.ptr->save);
4644 | |
	kvm_vcpu_unmap(vcpu, &map_save, true);
4646 | return 0; |
4647 | } |
4648 | |
4649 | static int svm_leave_smm(struct kvm_vcpu *vcpu, const union kvm_smram *smram) |
4650 | { |
4651 | struct vcpu_svm *svm = to_svm(vcpu); |
4652 | struct kvm_host_map map, map_save; |
4653 | struct vmcb *vmcb12; |
4654 | int ret; |
4655 | |
4656 | const struct kvm_smram_state_64 *smram64 = &smram->smram64; |
4657 | |
4658 | if (!guest_cpuid_has(vcpu, X86_FEATURE_LM)) |
4659 | return 0; |
4660 | |
4661 | /* Non-zero if SMI arrived while vCPU was in guest mode. */ |
4662 | if (!smram64->svm_guest_flag) |
4663 | return 0; |
4664 | |
4665 | if (!guest_cpuid_has(vcpu, X86_FEATURE_SVM)) |
4666 | return 1; |
4667 | |
4668 | if (!(smram64->efer & EFER_SVME)) |
4669 | return 1; |
4670 | |
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(smram64->svm_guest_vmcb_gpa), &map))
4672 | return 1; |
4673 | |
4674 | ret = 1; |
	if (kvm_vcpu_map(vcpu, gpa_to_gfn(svm->nested.hsave_msr), &map_save))
4676 | goto unmap_map; |
4677 | |
4678 | if (svm_allocate_nested(svm)) |
4679 | goto unmap_save; |
4680 | |
4681 | /* |
4682 | * Restore L1 host state from L1 HSAVE area as VMCB01 was |
4683 | * used during SMM (see svm_enter_smm()) |
4684 | */ |
4685 | |
	svm_copy_vmrun_state(&svm->vmcb01.ptr->save, map_save.hva + 0x400);
4687 | |
4688 | /* |
4689 | * Enter the nested guest now |
4690 | */ |
4691 | |
4692 | vmcb_mark_all_dirty(vmcb: svm->vmcb01.ptr); |
4693 | |
4694 | vmcb12 = map.hva; |
	nested_copy_vmcb_control_to_cache(svm, &vmcb12->control);
	nested_copy_vmcb_save_to_cache(svm, &vmcb12->save);
	ret = enter_svm_guest_mode(vcpu, smram64->svm_guest_vmcb_gpa, vmcb12, false);
4698 | |
4699 | if (ret) |
4700 | goto unmap_save; |
4701 | |
4702 | svm->nested.nested_run_pending = 1; |
4703 | |
4704 | unmap_save: |
	kvm_vcpu_unmap(vcpu, &map_save, true);
4706 | unmap_map: |
	kvm_vcpu_unmap(vcpu, &map, true);
4708 | return ret; |
4709 | } |
4710 | |
4711 | static void svm_enable_smi_window(struct kvm_vcpu *vcpu) |
4712 | { |
4713 | struct vcpu_svm *svm = to_svm(vcpu); |
4714 | |
4715 | if (!gif_set(svm)) { |
4716 | if (vgif) |
			svm_set_intercept(svm, INTERCEPT_STGI);
4718 | /* STGI will cause a vm exit */ |
4719 | } else { |
4720 | /* We must be in SMM; RSM will cause a vmexit anyway. */ |
4721 | } |
4722 | } |
4723 | #endif |
4724 | |
4725 | static int svm_check_emulate_instruction(struct kvm_vcpu *vcpu, int emul_type, |
4726 | void *insn, int insn_len) |
4727 | { |
4728 | bool smep, smap, is_user; |
4729 | u64 error_code; |
4730 | |
4731 | /* Emulation is always possible when KVM has access to all guest state. */ |
4732 | if (!sev_guest(vcpu->kvm)) |
4733 | return X86EMUL_CONTINUE; |
4734 | |
4735 | /* #UD and #GP should never be intercepted for SEV guests. */ |
4736 | WARN_ON_ONCE(emul_type & (EMULTYPE_TRAP_UD | |
4737 | EMULTYPE_TRAP_UD_FORCED | |
4738 | EMULTYPE_VMWARE_GP)); |
4739 | |
4740 | /* |
4741 | * Emulation is impossible for SEV-ES guests as KVM doesn't have access |
4742 | * to guest register state. |
4743 | */ |
4744 | if (sev_es_guest(vcpu->kvm)) |
4745 | return X86EMUL_RETRY_INSTR; |
4746 | |
4747 | /* |
4748 | * Emulation is possible if the instruction is already decoded, e.g. |
4749 | * when completing I/O after returning from userspace. |
4750 | */ |
4751 | if (emul_type & EMULTYPE_NO_DECODE) |
4752 | return X86EMUL_CONTINUE; |
4753 | |
4754 | /* |
4755 | * Emulation is possible for SEV guests if and only if a prefilled |
4756 | * buffer containing the bytes of the intercepted instruction is |
4757 | * available. SEV guest memory is encrypted with a guest specific key |
4758 | * and cannot be decrypted by KVM, i.e. KVM would read ciphertext and |
4759 | * decode garbage. |
4760 | * |
4761 | * If KVM is NOT trying to simply skip an instruction, inject #UD if |
4762 | * KVM reached this point without an instruction buffer. In practice, |
4763 | * this path should never be hit by a well-behaved guest, e.g. KVM |
4764 | * doesn't intercept #UD or #GP for SEV guests, but this path is still |
4765 | * theoretically reachable, e.g. via unaccelerated fault-like AVIC |
4766 | * access, and needs to be handled by KVM to avoid putting the guest |
4767 | * into an infinite loop. Injecting #UD is somewhat arbitrary, but |
	 * it's the least awful option given the lack of insight into the guest.
4769 | * |
4770 | * If KVM is trying to skip an instruction, simply resume the guest. |
4771 | * If a #NPF occurs while the guest is vectoring an INT3/INTO, then KVM |
4772 | * will attempt to re-inject the INT3/INTO and skip the instruction. |
4773 | * In that scenario, retrying the INT3/INTO and hoping the guest will |
4774 | * make forward progress is the only option that has a chance of |
4775 | * success (and in practice it will work the vast majority of the time). |
4776 | */ |
4777 | if (unlikely(!insn)) { |
4778 | if (emul_type & EMULTYPE_SKIP) |
4779 | return X86EMUL_RETRY_INSTR;
4780 | |
4781 | kvm_queue_exception(vcpu, UD_VECTOR); |
4782 | return X86EMUL_PROPAGATE_FAULT; |
4783 | } |
4784 | |
4785 | /* |
4786 | * Emulate for SEV guests if the insn buffer is not empty. The buffer |
4787 | * will be empty if the DecodeAssist microcode cannot fetch bytes for |
4788 | * the faulting instruction because the code fetch itself faulted, e.g. |
4789 | * the guest attempted to fetch from emulated MMIO or a guest page |
4790 | * table used to translate CS:RIP resides in emulated MMIO. |
4791 | */ |
4792 | if (likely(insn_len)) |
4793 | return X86EMUL_CONTINUE; |
4794 | |
4795 | /* |
4796 | * Detect and workaround Errata 1096 Fam_17h_00_0Fh. |
4797 | * |
4798 | * Errata: |
4799 | * When CPU raises #NPF on guest data access and vCPU CR4.SMAP=1, it is |
4800 | * possible that CPU microcode implementing DecodeAssist will fail to |
4801 | * read guest memory at CS:RIP and vmcb.GuestIntrBytes will incorrectly |
4802 | * be '0'. This happens because microcode reads CS:RIP using a _data_ |
4803 | * load uop with CPL=0 privileges. If the load hits a SMAP #PF, ucode
4804 | * gives up and does not fill the instruction bytes buffer. |
4805 | * |
4806 | * As above, KVM reaches this point iff the VM is an SEV guest, the CPU |
4807 | * supports DecodeAssist, a #NPF was raised, KVM's page fault handler |
4808 | * triggered emulation (e.g. for MMIO), and the CPU returned 0 in the |
4809 | * GuestIntrBytes field of the VMCB. |
4810 | * |
4811 | * This does _not_ mean that the erratum has been encountered, as the |
4812 | * DecodeAssist will also fail if the load for CS:RIP hits a legitimate |
4813 | * #PF, e.g. if the guest attempted to execute from emulated MMIO and
4814 | * encountered a reserved/not-present #PF. |
4815 | * |
4816 | * To hit the erratum, the following conditions must be true: |
4817 | * 1. CR4.SMAP=1 (obviously). |
4818 | * 2. CR4.SMEP=0 || CPL=3. If SMEP=1 and CPL<3, the erratum cannot |
4819 | * have been hit as the guest would have encountered a SMEP |
4820 | * violation #PF, not a #NPF. |
4821 | * 3. The #NPF is not due to a code fetch, in which case failure to |
4822 | * retrieve the instruction bytes is legitimate (see above).
4823 | * |
4824 | * In addition, don't apply the erratum workaround if the #NPF occurred |
4825 | * while translating guest page tables (see below). |
4826 | */ |
4827 | error_code = to_svm(vcpu)->vmcb->control.exit_info_1; |
4828 | if (error_code & (PFERR_GUEST_PAGE_MASK | PFERR_FETCH_MASK)) |
4829 | goto resume_guest; |
4830 | |
4831 | smep = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMEP); |
4832 | smap = kvm_is_cr4_bit_set(vcpu, X86_CR4_SMAP); |
4833 | is_user = svm_get_cpl(vcpu) == 3; |
4834 | if (smap && (!smep || is_user)) { |
4835 | pr_err_ratelimited("SEV Guest triggered AMD Erratum 1096\n" ); |
4836 | |
4837 | /* |
4838 | * If the fault occurred in userspace, arbitrarily inject #GP |
4839 | * to avoid killing the guest and to hopefully avoid confusing |
4840 | * the guest kernel too much, e.g. injecting #PF would not be |
4841 | * coherent with respect to the guest's page tables. Request |
4842 | * triple fault if the fault occurred in the kernel as there's |
4843 | * no fault that KVM can inject without confusing the guest. |
4844 | * In practice, the triple fault is moot as no sane SEV kernel |
4845 | * will execute from user memory while also running with SMAP=1. |
4846 | */ |
4847 | if (is_user) |
4848 | kvm_inject_gp(vcpu, 0);
4849 | else |
4850 | kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu); |
4851 | return X86EMUL_PROPAGATE_FAULT; |
4852 | } |
4853 | |
4854 | resume_guest: |
4855 | /* |
4856 | * If the erratum was not hit, simply resume the guest and let it fault |
4857 | * again. While awful, e.g. the vCPU may get stuck in an infinite loop |
4858 | * if the fault is at CPL=0, it's the lesser of all evils. Exiting to |
4859 | * userspace will kill the guest, and letting the emulator read garbage |
4860 | * will yield random behavior and potentially corrupt the guest. |
4861 | * |
4862 | * Simply resuming the guest is technically not a violation of the SEV |
4863 | * architecture. AMD's APM states that all code fetches and page table |
4864 | * accesses for SEV guests are encrypted, regardless of the C-Bit. The
4865 | * APM also states that encrypted accesses to MMIO are "ignored", but |
4866 | * doesn't explicitly define "ignored", i.e. doing nothing and letting |
4867 | * the guest spin is technically "ignoring" the access. |
4868 | */ |
4869 | return X86EMUL_RETRY_INSTR; |
4870 | } |
4871 | |
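     | /*
     | * Per the AMD APM, INIT is held pending while GIF is clear, i.e. INIT
     | * delivery is blocked until GIF is set again via STGI (or by KVM).
     | */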
4872 | static bool svm_apic_init_signal_blocked(struct kvm_vcpu *vcpu) |
4873 | { |
4874 | struct vcpu_svm *svm = to_svm(vcpu); |
4875 | |
4876 | return !gif_set(svm); |
4877 | } |
4878 | |
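     | /*
     | * KVM can't directly set register state for SEV-ES guests (the VMSA is
     | * encrypted), so SIPI delivery is deferred to the SEV code, which relies
     | * on the guest's AP reset hold protocol.
     | */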
4879 | static void svm_vcpu_deliver_sipi_vector(struct kvm_vcpu *vcpu, u8 vector) |
4880 | { |
4881 | if (!sev_es_guest(vcpu->kvm))
4882 | return kvm_vcpu_deliver_sipi_vector(vcpu, vector); |
4883 | |
4884 | sev_vcpu_deliver_sipi_vector(vcpu, vector); |
4885 | } |
4886 | |
4887 | static void svm_vm_destroy(struct kvm *kvm) |
4888 | { |
4889 | avic_vm_destroy(kvm); |
4890 | sev_vm_destroy(kvm); |
4891 | } |
4892 | |
4893 | static int svm_vm_init(struct kvm *kvm) |
4894 | { |
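     | /* Don't intercept PAUSE if pause filtering is disabled or unsupported. */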
4895 | if (!pause_filter_count || !pause_filter_thresh) |
4896 | kvm->arch.pause_in_guest = true; |
4897 | |
4898 | if (enable_apicv) { |
4899 | int ret = avic_vm_init(kvm); |
4900 | if (ret) |
4901 | return ret; |
4902 | } |
4903 | |
4904 | return 0; |
4905 | } |
4906 | |
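     | /*
     | * Allocate the APIC backing page via snp_safe_alloc_page() so that the
     | * page is also safe to use when SEV-SNP is enabled.
     | */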
4907 | static void *svm_alloc_apic_backing_page(struct kvm_vcpu *vcpu) |
4908 | { |
4909 | struct page *page = snp_safe_alloc_page(vcpu); |
4910 | |
4911 | if (!page) |
4912 | return NULL; |
4913 | |
4914 | return page_address(page); |
4915 | } |
4916 | |
4917 | static struct kvm_x86_ops svm_x86_ops __initdata = { |
4918 | .name = KBUILD_MODNAME, |
4919 | |
4920 | .check_processor_compatibility = svm_check_processor_compat, |
4921 | |
4922 | .hardware_unsetup = svm_hardware_unsetup, |
4923 | .hardware_enable = svm_hardware_enable, |
4924 | .hardware_disable = svm_hardware_disable, |
4925 | .has_emulated_msr = svm_has_emulated_msr, |
4926 | |
4927 | .vcpu_create = svm_vcpu_create, |
4928 | .vcpu_free = svm_vcpu_free, |
4929 | .vcpu_reset = svm_vcpu_reset, |
4930 | |
4931 | .vm_size = sizeof(struct kvm_svm), |
4932 | .vm_init = svm_vm_init, |
4933 | .vm_destroy = svm_vm_destroy, |
4934 | |
4935 | .prepare_switch_to_guest = svm_prepare_switch_to_guest, |
4936 | .vcpu_load = svm_vcpu_load, |
4937 | .vcpu_put = svm_vcpu_put, |
4938 | .vcpu_blocking = avic_vcpu_blocking, |
4939 | .vcpu_unblocking = avic_vcpu_unblocking, |
4940 | |
4941 | .update_exception_bitmap = svm_update_exception_bitmap, |
4942 | .get_msr_feature = svm_get_msr_feature, |
4943 | .get_msr = svm_get_msr, |
4944 | .set_msr = svm_set_msr, |
4945 | .get_segment_base = svm_get_segment_base, |
4946 | .get_segment = svm_get_segment, |
4947 | .set_segment = svm_set_segment, |
4948 | .get_cpl = svm_get_cpl, |
4949 | .get_cs_db_l_bits = svm_get_cs_db_l_bits, |
4950 | .is_valid_cr0 = svm_is_valid_cr0, |
4951 | .set_cr0 = svm_set_cr0, |
4952 | .post_set_cr3 = sev_post_set_cr3, |
4953 | .is_valid_cr4 = svm_is_valid_cr4, |
4954 | .set_cr4 = svm_set_cr4, |
4955 | .set_efer = svm_set_efer, |
4956 | .get_idt = svm_get_idt, |
4957 | .set_idt = svm_set_idt, |
4958 | .get_gdt = svm_get_gdt, |
4959 | .set_gdt = svm_set_gdt, |
4960 | .set_dr7 = svm_set_dr7, |
4961 | .sync_dirty_debug_regs = svm_sync_dirty_debug_regs, |
4962 | .cache_reg = svm_cache_reg, |
4963 | .get_rflags = svm_get_rflags, |
4964 | .set_rflags = svm_set_rflags, |
4965 | .get_if_flag = svm_get_if_flag, |
4966 | |
4967 | .flush_tlb_all = svm_flush_tlb_all, |
4968 | .flush_tlb_current = svm_flush_tlb_current, |
4969 | .flush_tlb_gva = svm_flush_tlb_gva, |
4970 | .flush_tlb_guest = svm_flush_tlb_asid, |
4971 | |
4972 | .vcpu_pre_run = svm_vcpu_pre_run, |
4973 | .vcpu_run = svm_vcpu_run, |
4974 | .handle_exit = svm_handle_exit, |
4975 | .skip_emulated_instruction = svm_skip_emulated_instruction, |
4976 | .update_emulated_instruction = NULL, |
4977 | .set_interrupt_shadow = svm_set_interrupt_shadow, |
4978 | .get_interrupt_shadow = svm_get_interrupt_shadow, |
4979 | .patch_hypercall = svm_patch_hypercall, |
4980 | .inject_irq = svm_inject_irq, |
4981 | .inject_nmi = svm_inject_nmi, |
4982 | .is_vnmi_pending = svm_is_vnmi_pending, |
4983 | .set_vnmi_pending = svm_set_vnmi_pending, |
4984 | .inject_exception = svm_inject_exception, |
4985 | .cancel_injection = svm_cancel_injection, |
4986 | .interrupt_allowed = svm_interrupt_allowed, |
4987 | .nmi_allowed = svm_nmi_allowed, |
4988 | .get_nmi_mask = svm_get_nmi_mask, |
4989 | .set_nmi_mask = svm_set_nmi_mask, |
4990 | .enable_nmi_window = svm_enable_nmi_window, |
4991 | .enable_irq_window = svm_enable_irq_window, |
4992 | .update_cr8_intercept = svm_update_cr8_intercept, |
4993 | .set_virtual_apic_mode = avic_refresh_virtual_apic_mode, |
4994 | .refresh_apicv_exec_ctrl = avic_refresh_apicv_exec_ctrl, |
4995 | .apicv_post_state_restore = avic_apicv_post_state_restore, |
4996 | .required_apicv_inhibits = AVIC_REQUIRED_APICV_INHIBITS, |
4997 | |
4998 | .get_exit_info = svm_get_exit_info, |
4999 | |
5000 | .vcpu_after_set_cpuid = svm_vcpu_after_set_cpuid, |
5001 | |
5002 | .has_wbinvd_exit = svm_has_wbinvd_exit, |
5003 | |
5004 | .get_l2_tsc_offset = svm_get_l2_tsc_offset, |
5005 | .get_l2_tsc_multiplier = svm_get_l2_tsc_multiplier, |
5006 | .write_tsc_offset = svm_write_tsc_offset, |
5007 | .write_tsc_multiplier = svm_write_tsc_multiplier, |
5008 | |
5009 | .load_mmu_pgd = svm_load_mmu_pgd, |
5010 | |
5011 | .check_intercept = svm_check_intercept, |
5012 | .handle_exit_irqoff = svm_handle_exit_irqoff, |
5013 | |
5014 | .sched_in = svm_sched_in, |
5015 | |
5016 | .nested_ops = &svm_nested_ops, |
5017 | |
5018 | .deliver_interrupt = svm_deliver_interrupt, |
5019 | .pi_update_irte = avic_pi_update_irte, |
5020 | .setup_mce = svm_setup_mce, |
5021 | |
5022 | #ifdef CONFIG_KVM_SMM |
5023 | .smi_allowed = svm_smi_allowed, |
5024 | .enter_smm = svm_enter_smm, |
5025 | .leave_smm = svm_leave_smm, |
5026 | .enable_smi_window = svm_enable_smi_window, |
5027 | #endif |
5028 | |
5029 | .mem_enc_ioctl = sev_mem_enc_ioctl, |
5030 | .mem_enc_register_region = sev_mem_enc_register_region, |
5031 | .mem_enc_unregister_region = sev_mem_enc_unregister_region, |
5032 | .guest_memory_reclaimed = sev_guest_memory_reclaimed, |
5033 | |
5034 | .vm_copy_enc_context_from = sev_vm_copy_enc_context_from, |
5035 | .vm_move_enc_context_from = sev_vm_move_enc_context_from, |
5036 | |
5037 | .check_emulate_instruction = svm_check_emulate_instruction, |
5038 | |
5039 | .apic_init_signal_blocked = svm_apic_init_signal_blocked, |
5040 | |
5041 | .msr_filter_changed = svm_msr_filter_changed, |
5042 | .complete_emulated_msr = svm_complete_emulated_msr, |
5043 | |
5044 | .vcpu_deliver_sipi_vector = svm_vcpu_deliver_sipi_vector, |
5045 | .vcpu_get_apicv_inhibit_reasons = avic_vcpu_get_apicv_inhibit_reasons, |
5046 | .alloc_apic_backing_page = svm_alloc_apic_backing_page, |
5047 | }; |
5048 | |
5049 | /* |
5050 | * The default MMIO mask is a single bit (excluding the present bit), |
5051 | * which could conflict with the memory encryption bit. Check for |
5052 | * memory encryption support and override the default MMIO mask if |
5053 | * memory encryption is enabled. |
5054 | */ |
5055 | static __init void svm_adjust_mmio_mask(void) |
5056 | { |
5057 | unsigned int enc_bit, mask_bit; |
5058 | u64 msr, mask; |
5059 | |
5060 | /* If there is no memory encryption support, use existing mask */ |
5061 | if (cpuid_eax(0x80000000) < 0x8000001f)
5062 | return; |
5063 | |
5064 | /* If memory encryption is not enabled, use existing mask */ |
5065 | rdmsrl(MSR_AMD64_SYSCFG, msr); |
5066 | if (!(msr & MSR_AMD64_SYSCFG_MEM_ENCRYPT)) |
5067 | return; |
5068 | |
5069 | enc_bit = cpuid_ebx(0x8000001f) & 0x3f;
5070 | mask_bit = boot_cpu_data.x86_phys_bits; |
5071 | |
5072 | /* Increment the mask bit if it is the same as the encryption bit */ |
5073 | if (enc_bit == mask_bit) |
5074 | mask_bit++; |
5075 | |
5076 | /* |
5077 | * If the mask bit location is below 52, then some bits above the |
5078 | * physical addressing limit will always be reserved, so use the |
5079 | * rsvd_bits() function to generate the mask. This mask, along with |
5080 | * the present bit, will be used to generate a page fault with |
5081 | * PFER.RSV = 1. |
5082 | * |
5083 | * If the mask bit location is 52 (or above), then clear the mask. |
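     | *
     | * For example, given a 48-bit physical address width and an encryption
     | * bit below bit 48, mask_bit is 48 and the resulting mask covers bits
     | * 51:48 plus the present bit.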
5084 | */ |
5085 | mask = (mask_bit < 52) ? rsvd_bits(mask_bit, 51) | PT_PRESENT_MASK : 0; |
5086 | |
5087 | kvm_mmu_set_mmio_spte_mask(mask, mask, PT_WRITABLE_MASK | PT_USER_MASK); |
5088 | } |
5089 | |
5090 | static __init void svm_set_cpu_caps(void) |
5091 | { |
5092 | kvm_set_cpu_caps(); |
5093 | |
5094 | kvm_caps.supported_perf_cap = 0; |
5095 | kvm_caps.supported_xss = 0; |
5096 | |
5097 | /* CPUID 0x80000001 and 0x8000000A (SVM features) */ |
5098 | if (nested) { |
5099 | kvm_cpu_cap_set(X86_FEATURE_SVM); |
5100 | kvm_cpu_cap_set(X86_FEATURE_VMCBCLEAN); |
5101 | |
5102 | /* |
5103 | * KVM currently flushes TLBs on *every* nested SVM transition, |
5104 | * and so for all intents and purposes KVM supports flushing by |
5105 | * ASID, i.e. KVM is guaranteed to honor every L1 ASID flush. |
5106 | */ |
5107 | kvm_cpu_cap_set(X86_FEATURE_FLUSHBYASID); |
5108 | |
5109 | if (nrips) |
5110 | kvm_cpu_cap_set(X86_FEATURE_NRIPS); |
5111 | |
5112 | if (npt_enabled) |
5113 | kvm_cpu_cap_set(X86_FEATURE_NPT); |
5114 | |
5115 | if (tsc_scaling) |
5116 | kvm_cpu_cap_set(X86_FEATURE_TSCRATEMSR); |
5117 | |
5118 | if (vls) |
5119 | kvm_cpu_cap_set(X86_FEATURE_V_VMSAVE_VMLOAD); |
5120 | if (lbrv) |
5121 | kvm_cpu_cap_set(X86_FEATURE_LBRV); |
5122 | |
5123 | if (boot_cpu_has(X86_FEATURE_PAUSEFILTER)) |
5124 | kvm_cpu_cap_set(X86_FEATURE_PAUSEFILTER); |
5125 | |
5126 | if (boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) |
5127 | kvm_cpu_cap_set(X86_FEATURE_PFTHRESHOLD); |
5128 | |
5129 | if (vgif) |
5130 | kvm_cpu_cap_set(X86_FEATURE_VGIF); |
5131 | |
5132 | if (vnmi) |
5133 | kvm_cpu_cap_set(X86_FEATURE_VNMI); |
5134 | |
5135 | /* Nested VM can receive #VMEXIT instead of triggering #GP */ |
5136 | kvm_cpu_cap_set(X86_FEATURE_SVME_ADDR_CHK); |
5137 | } |
5138 | |
5139 | /* CPUID 0x80000008 */ |
5140 | if (boot_cpu_has(X86_FEATURE_LS_CFG_SSBD) || |
5141 | boot_cpu_has(X86_FEATURE_AMD_SSBD)) |
5142 | kvm_cpu_cap_set(X86_FEATURE_VIRT_SSBD); |
5143 | |
5144 | if (enable_pmu) { |
5145 | /* |
5146 | * Enumerate support for PERFCTR_CORE if and only if KVM has |
5147 | * access to enough counters to virtualize "core" support, |
5148 | * otherwise limit vPMU support to the legacy number of counters. |
5149 | */ |
5150 | if (kvm_pmu_cap.num_counters_gp < AMD64_NUM_COUNTERS_CORE) |
5151 | kvm_pmu_cap.num_counters_gp = min(AMD64_NUM_COUNTERS, |
5152 | kvm_pmu_cap.num_counters_gp); |
5153 | else |
5154 | kvm_cpu_cap_check_and_set(X86_FEATURE_PERFCTR_CORE); |
5155 | |
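     | /* PERFMON_V2 requires a v2 PMU and PERFCTR_CORE; clear it otherwise. */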
5156 | if (kvm_pmu_cap.version != 2 || |
5157 | !kvm_cpu_cap_has(X86_FEATURE_PERFCTR_CORE)) |
5158 | kvm_cpu_cap_clear(X86_FEATURE_PERFMON_V2); |
5159 | } |
5160 | |
5161 | /* CPUID 0x8000001F (SME/SEV features) */ |
5162 | sev_set_cpu_caps(); |
5163 | } |
5164 | |
5165 | static __init int svm_hardware_setup(void) |
5166 | { |
5167 | int cpu; |
5168 | struct page *iopm_pages; |
5169 | void *iopm_va; |
5170 | int r; |
5171 | unsigned int order = get_order(IOPM_SIZE); |
5172 | |
5173 | /* |
5174 | * NX is required for shadow paging and for NPT if the NX huge pages |
5175 | * mitigation is enabled. |
5176 | */ |
5177 | if (!boot_cpu_has(X86_FEATURE_NX)) { |
5178 | pr_err_ratelimited("NX (Execute Disable) not supported\n" ); |
5179 | return -EOPNOTSUPP; |
5180 | } |
5181 | kvm_enable_efer_bits(EFER_NX); |
5182 | |
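     | /* Intercept all I/O port accesses by default, i.e. set every IOPM bit. */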
5183 | iopm_pages = alloc_pages(GFP_KERNEL, order); |
5184 | |
5185 | if (!iopm_pages) |
5186 | return -ENOMEM; |
5187 | |
5188 | iopm_va = page_address(iopm_pages); |
5189 | memset(iopm_va, 0xff, PAGE_SIZE * (1 << order)); |
5190 | iopm_base = page_to_pfn(iopm_pages) << PAGE_SHIFT; |
5191 | |
5192 | init_msrpm_offsets(); |
5193 | |
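     | /* KVM doesn't virtualize MPX, and AMD CPUs never implemented it. */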
5194 | kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS | |
5195 | XFEATURE_MASK_BNDCSR); |
5196 | |
5197 | if (boot_cpu_has(X86_FEATURE_FXSR_OPT)) |
5198 | kvm_enable_efer_bits(EFER_FFXSR); |
5199 | |
5200 | if (tsc_scaling) { |
5201 | if (!boot_cpu_has(X86_FEATURE_TSCRATEMSR)) { |
5202 | tsc_scaling = false; |
5203 | } else { |
5204 | pr_info("TSC scaling supported\n" ); |
5205 | kvm_caps.has_tsc_control = true; |
5206 | } |
5207 | } |
5208 | kvm_caps.max_tsc_scaling_ratio = SVM_TSC_RATIO_MAX; |
5209 | kvm_caps.tsc_scaling_ratio_frac_bits = 32; |
5210 | |
5211 | tsc_aux_uret_slot = kvm_add_user_return_msr(MSR_TSC_AUX); |
5212 | |
5213 | if (boot_cpu_has(X86_FEATURE_AUTOIBRS)) |
5214 | kvm_enable_efer_bits(EFER_AUTOIBRS); |
5215 | |
5216 | /* Check for pause filtering support */ |
5217 | if (!boot_cpu_has(X86_FEATURE_PAUSEFILTER)) { |
5218 | pause_filter_count = 0; |
5219 | pause_filter_thresh = 0; |
5220 | } else if (!boot_cpu_has(X86_FEATURE_PFTHRESHOLD)) { |
5221 | pause_filter_thresh = 0; |
5222 | } |
5223 | |
5224 | if (nested) { |
5225 | pr_info("Nested Virtualization enabled\n" ); |
5226 | kvm_enable_efer_bits(EFER_SVME | EFER_LMSLE); |
5227 | } |
5228 | |
5229 | /* |
5230 | * KVM's MMU doesn't support using 2-level paging for itself, and thus |
5231 | * NPT isn't supported if the host is using 2-level paging since host |
5232 | * CR4 is unchanged on VMRUN. |
5233 | */ |
5234 | if (!IS_ENABLED(CONFIG_X86_64) && !IS_ENABLED(CONFIG_X86_PAE)) |
5235 | npt_enabled = false; |
5236 | |
5237 | if (!boot_cpu_has(X86_FEATURE_NPT)) |
5238 | npt_enabled = false; |
5239 | |
5240 | /* Force VM NPT level equal to the host's paging level */ |
5241 | kvm_configure_mmu(npt_enabled, get_npt_level(),
5242 | get_npt_level(), PG_LEVEL_1G);
5243 | pr_info("Nested Paging %sabled\n", npt_enabled ? "en" : "dis");
5244 | |
5245 | /* Setup shadow_me_value and shadow_me_mask */ |
5246 | kvm_mmu_set_me_spte_mask(sme_me_mask, sme_me_mask); |
5247 | |
5248 | svm_adjust_mmio_mask(); |
5249 | |
5250 | nrips = nrips && boot_cpu_has(X86_FEATURE_NRIPS); |
5251 | |
5252 | /* |
5253 | * Note, SEV setup consumes npt_enabled and enable_mmio_caching (which |
5254 | * may be modified by svm_adjust_mmio_mask()), as well as nrips. |
5255 | */ |
5256 | sev_hardware_setup(); |
5257 | |
5258 | svm_hv_hardware_setup(); |
5259 | |
5260 | for_each_possible_cpu(cpu) { |
5261 | r = svm_cpu_init(cpu); |
5262 | if (r) |
5263 | goto err; |
5264 | } |
5265 | |
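     | /* AVIC is SVM's implementation of APICv, the two are enabled as one. */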
5266 | enable_apicv = avic = avic && avic_hardware_setup(); |
5267 | |
5268 | if (!enable_apicv) { |
5269 | svm_x86_ops.vcpu_blocking = NULL; |
5270 | svm_x86_ops.vcpu_unblocking = NULL; |
5271 | svm_x86_ops.vcpu_get_apicv_inhibit_reasons = NULL; |
5272 | } else if (!x2avic_enabled) { |
5273 | svm_x86_ops.allow_apicv_in_x2apic_without_x2apic_virtualization = true; |
5274 | } |
5275 | |
5276 | if (vls) { |
5277 | if (!npt_enabled || |
5278 | !boot_cpu_has(X86_FEATURE_V_VMSAVE_VMLOAD) || |
5279 | !IS_ENABLED(CONFIG_X86_64)) { |
5280 | vls = false; |
5281 | } else { |
5282 | pr_info("Virtual VMLOAD VMSAVE supported\n" ); |
5283 | } |
5284 | } |
5285 | |
5286 | if (boot_cpu_has(X86_FEATURE_SVME_ADDR_CHK)) |
5287 | svm_gp_erratum_intercept = false; |
5288 | |
5289 | if (vgif) { |
5290 | if (!boot_cpu_has(X86_FEATURE_VGIF)) |
5291 | vgif = false; |
5292 | else |
5293 | pr_info("Virtual GIF supported\n" ); |
5294 | } |
5295 | |
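     | /* KVM's vNMI support also depends on vGIF being enabled. */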
5296 | vnmi = vgif && vnmi && boot_cpu_has(X86_FEATURE_VNMI); |
5297 | if (vnmi) |
5298 | pr_info("Virtual NMI enabled\n" ); |
5299 | |
5300 | if (!vnmi) { |
5301 | svm_x86_ops.is_vnmi_pending = NULL; |
5302 | svm_x86_ops.set_vnmi_pending = NULL; |
5303 | } |
5304 |
5306 | if (lbrv) { |
5307 | if (!boot_cpu_has(X86_FEATURE_LBRV)) |
5308 | lbrv = false; |
5309 | else |
5310 | pr_info("LBR virtualization supported\n" ); |
5311 | } |
5312 | |
5313 | if (!enable_pmu) |
5314 | pr_info("PMU virtualization is disabled\n" ); |
5315 | |
5316 | svm_set_cpu_caps(); |
5317 | |
5318 | /* |
5319 | * It seems that on AMD processors PTE's accessed bit is |
5320 | * being set by the CPU hardware before the NPF vmexit. |
5321 | * This is not expected behaviour and our tests fail because |
5322 | * of it. |
5323 | * A workaround here is to disable support for |
5324 | * GUEST_MAXPHYADDR < HOST_MAXPHYADDR if NPT is enabled. |
5325 | * In this case userspace can know if there is support using |
5326 | * KVM_CAP_SMALLER_MAXPHYADDR extension and decide how to handle |
5327 | * it.
5328 | * If future AMD CPU models change the behaviour described above,
5329 | * this variable can be changed accordingly.
5330 | */ |
5331 | allow_smaller_maxphyaddr = !npt_enabled; |
5332 | |
5333 | return 0; |
5334 | |
5335 | err: |
5336 | svm_hardware_unsetup(); |
5337 | return r; |
5338 | } |
5339 |
5341 | static struct kvm_x86_init_ops svm_init_ops __initdata = { |
5342 | .hardware_setup = svm_hardware_setup, |
5343 | |
5344 | .runtime_ops = &svm_x86_ops, |
5345 | .pmu_ops = &amd_pmu_ops, |
5346 | }; |
5347 | |
5348 | static void __svm_exit(void) |
5349 | { |
5350 | kvm_x86_vendor_exit(); |
5351 | |
5352 | cpu_emergency_unregister_virt_callback(svm_emergency_disable);
5353 | } |
5354 | |
5355 | static int __init svm_init(void) |
5356 | { |
5357 | int r; |
5358 | |
5359 | __unused_size_checks(); |
5360 | |
5361 | if (!kvm_is_svm_supported()) |
5362 | return -EOPNOTSUPP; |
5363 | |
5364 | r = kvm_x86_vendor_init(&svm_init_ops);
5365 | if (r) |
5366 | return r; |
5367 | |
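     | /* Register a callback to disable SVM on an emergency reboot, e.g. kexec. */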
5368 | cpu_emergency_register_virt_callback(svm_emergency_disable);
5369 | |
5370 | /* |
5371 | * Common KVM initialization _must_ come last, after this, /dev/kvm is |
5372 | * exposed to userspace! |
5373 | */ |
5374 | r = kvm_init(sizeof(struct vcpu_svm), __alignof__(struct vcpu_svm),
5375 | THIS_MODULE);
5376 | if (r) |
5377 | goto err_kvm_init; |
5378 | |
5379 | return 0; |
5380 | |
5381 | err_kvm_init: |
5382 | __svm_exit(); |
5383 | return r; |
5384 | } |
5385 | |
5386 | static void __exit svm_exit(void) |
5387 | { |
5388 | kvm_exit(); |
5389 | __svm_exit(); |
5390 | } |
5391 | |
5392 | module_init(svm_init) |
5393 | module_exit(svm_exit) |
5394 | |