vmx.c source code [linux/arch/x86/kvm/vmx/vmx.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Kernel-based Virtual Machine driver for Linux
4	*
5	* This module enables machines with Intel VT-x extensions to run virtual
6	* machines without emulation or binary translation.
7	*
8	* Copyright (C) 2006 Qumranet, Inc.
9	* Copyright 2010 Red Hat, Inc. and/or its affiliates.
10	*
11	* Authors:
12	* Avi Kivity <avi@qumranet.com>
13	* Yaniv Kamay <yaniv@qumranet.com>
14	*/
15	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
16
17	#include <linux/highmem.h>
18	#include <linux/hrtimer.h>
19	#include <linux/kernel.h>
20	#include <linux/kvm_host.h>
21	#include <linux/module.h>
22	#include <linux/moduleparam.h>
23	#include <linux/mod_devicetable.h>
24	#include <linux/mm.h>
25	#include <linux/objtool.h>
26	#include <linux/sched.h>
27	#include <linux/sched/smt.h>
28	#include <linux/slab.h>
29	#include <linux/tboot.h>
30	#include <linux/trace_events.h>
31	#include <linux/entry-kvm.h>
32
33	#include <asm/apic.h>
34	#include <asm/asm.h>
35	#include <asm/cpu.h>
36	#include <asm/cpu_device_id.h>
37	#include <asm/debugreg.h>
38	#include <asm/desc.h>
39	#include <asm/fpu/api.h>
40	#include <asm/fpu/xstate.h>
41	#include <asm/fred.h>
42	#include <asm/idtentry.h>
43	#include <asm/io.h>
44	#include <asm/irq_remapping.h>
45	#include <asm/reboot.h>
46	#include <asm/perf_event.h>
47	#include <asm/mmu_context.h>
48	#include <asm/mshyperv.h>
49	#include <asm/mwait.h>
50	#include <asm/spec-ctrl.h>
51	#include <asm/vmx.h>
52
53	#include <trace/events/ipi.h>
54
55	#include "capabilities.h"
56	#include "cpuid.h"
57	#include "hyperv.h"
58	#include "kvm_onhyperv.h"
59	#include "irq.h"
60	#include "kvm_cache_regs.h"
61	#include "lapic.h"
62	#include "mmu.h"
63	#include "nested.h"
64	#include "pmu.h"
65	#include "sgx.h"
66	#include "trace.h"
67	#include "vmcs.h"
68	#include "vmcs12.h"
69	#include "vmx.h"
70	#include "x86.h"
71	#include "smm.h"
72	#include "vmx_onhyperv.h"
73
74	MODULE_AUTHOR("Qumranet");
75	MODULE_LICENSE("GPL");
76
#ifdef MODULE
/*
 * CPU match table, only built for modular configurations.  It matches any
 * CPU that advertises X86_FEATURE_VMX and is exported through
 * MODULE_DEVICE_TABLE().
 */
static const struct x86_cpu_id vmx_cpu_id[] = {
	X86_MATCH_FEATURE(X86_FEATURE_VMX, NULL),
	{}
};
MODULE_DEVICE_TABLE(x86cpu, vmx_cpu_id);
#endif
84
85	bool __read_mostly enable_vpid = `1`;
86	module_param_named(vpid, enable_vpid, bool, `0444`);
87
88	static bool __read_mostly enable_vnmi = `1`;
89	module_param_named(vnmi, enable_vnmi, bool, `0444`);
90
91	bool __read_mostly flexpriority_enabled = `1`;
92	module_param_named(flexpriority, flexpriority_enabled, bool, `0444`);
93
94	bool __read_mostly enable_ept = `1`;
95	module_param_named(ept, enable_ept, bool, `0444`);
96
97	bool __read_mostly enable_unrestricted_guest = `1`;
98	module_param_named(unrestricted_guest,
99	enable_unrestricted_guest, bool, `0444`);
100
101	bool __read_mostly enable_ept_ad_bits = `1`;
102	module_param_named(eptad, enable_ept_ad_bits, bool, `0444`);
103
104	static bool __read_mostly emulate_invalid_guest_state = true;
105	module_param(emulate_invalid_guest_state, bool, `0444`);
106
107	static bool __read_mostly fasteoi = `1`;
108	module_param(fasteoi, bool, `0444`);
109
110	module_param(enable_apicv, bool, `0444`);
111
112	bool __read_mostly enable_ipiv = true;
113	module_param(enable_ipiv, bool, `0444`);
114
115	/*
116	* If nested=1, nested virtualization is supported, i.e., guests may use
117	* VMX and be a hypervisor for its own guests. If nested=0, guests may not
118	* use VMX instructions.
119	*/
120	static bool __read_mostly nested = `1`;
121	module_param(nested, bool, `0444`);
122
123	bool __read_mostly enable_pml = `1`;
124	module_param_named(pml, enable_pml, bool, `0444`);
125
126	static bool __read_mostly error_on_inconsistent_vmcs_config = true;
127	module_param(error_on_inconsistent_vmcs_config, bool, `0444`);
128
129	static bool __read_mostly dump_invalid_vmcs = `0`;
130	module_param(dump_invalid_vmcs, bool, `0644`);
131
132	#define MSR_BITMAP_MODE_X2APIC 1
133	#define MSR_BITMAP_MODE_X2APIC_APICV 2
134
135	#define KVM_VMX_TSC_MULTIPLIER_MAX 0xffffffffffffffffULL
136
137	/ Guest_tsc -> host_tsc conversion requires 64-bit division. /
138	static int __read_mostly cpu_preemption_timer_multi;
139	static bool __read_mostly enable_preemption_timer = `1`;
140	#ifdef CONFIG_X86_64
141	module_param_named(preemption_timer, enable_preemption_timer, bool, S_IRUGO);
142	#endif
143
144	extern bool __read_mostly allow_smaller_maxphyaddr;
145	module_param(allow_smaller_maxphyaddr, bool, S_IRUGO);
146
/* CR0 bits that are forced off / forced on for the guest. */
#define KVM_VM_CR0_ALWAYS_OFF (X86_CR0_NW | X86_CR0_CD)
#define KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR0_NE
#define KVM_VM_CR0_ALWAYS_ON				\
	(KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST |	\
	 X86_CR0_PG | X86_CR0_PE)

/* CR4 bits that are forced on, per guest operating mode. */
#define KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST X86_CR4_VMXE
#define KVM_PMODE_VM_CR4_ALWAYS_ON (X86_CR4_PAE | X86_CR4_VMXE)
#define KVM_RMODE_VM_CR4_ALWAYS_ON (X86_CR4_VME | X86_CR4_PAE | X86_CR4_VMXE)

/* Every EFLAGS bit except IOPL and VM. */
#define RMODE_GUEST_OWNED_EFLAGS_BITS (~(X86_EFLAGS_IOPL | X86_EFLAGS_VM))

/* Complement of the RTIT_STATUS bits listed below. */
#define MSR_IA32_RTIT_STATUS_MASK (~(RTIT_STATUS_FILTEREN | \
	RTIT_STATUS_CONTEXTEN | RTIT_STATUS_TRIGGEREN | \
	RTIT_STATUS_ERROR | RTIT_STATUS_STOPPED | \
	RTIT_STATUS_BYTECNT))
162
163	/*
164	* List of MSRs that can be directly passed to the guest.
165	* In addition to these x2apic, PT and LBR MSRs are handled specially.
166	*/
167	static u32 vmx_possible_passthrough_msrs[MAX_POSSIBLE_PASSTHROUGH_MSRS] = {
168	MSR_IA32_SPEC_CTRL,
169	MSR_IA32_PRED_CMD,
170	MSR_IA32_FLUSH_CMD,
171	MSR_IA32_TSC,
172	#ifdef CONFIG_X86_64
173	MSR_FS_BASE,
174	MSR_GS_BASE,
175	MSR_KERNEL_GS_BASE,
176	MSR_IA32_XFD,
177	MSR_IA32_XFD_ERR,
178	#endif
179	MSR_IA32_SYSENTER_CS,
180	MSR_IA32_SYSENTER_ESP,
181	MSR_IA32_SYSENTER_EIP,
182	MSR_CORE_C1_RES,
183	MSR_CORE_C3_RESIDENCY,
184	MSR_CORE_C6_RESIDENCY,
185	MSR_CORE_C7_RESIDENCY,
186	};
187
188	/*
189	* These 2 parameters are used to config the controls for Pause-Loop Exiting:
190	* ple_gap: upper bound on the amount of time between two successive
191	* executions of PAUSE in a loop. Also indicate if ple enabled.
192	* According to test, this time is usually smaller than 128 cycles.
193	* ple_window: upper bound on the amount of time a guest is allowed to execute
194	* in a PAUSE loop. Tests indicate that most spinlocks are held for
195	* less than 2^12 cycles
196	* Time is measured based on a counter that runs at the same rate as the TSC,
197	* refer SDM volume 3b section 21.6.13 & 22.1.3.
198	*/
199	static unsigned int ple_gap = KVM_DEFAULT_PLE_GAP;
200	module_param(ple_gap, uint, `0444`);
201
202	static unsigned int ple_window = KVM_VMX_DEFAULT_PLE_WINDOW;
203	module_param(ple_window, uint, `0444`);
204
205	/ Default doubles per-vcpu window every exit. /
206	static unsigned int ple_window_grow = KVM_DEFAULT_PLE_WINDOW_GROW;
207	module_param(ple_window_grow, uint, `0444`);
208
209	/ Default resets per-vcpu window every exit to ple_window. /
210	static unsigned int ple_window_shrink = KVM_DEFAULT_PLE_WINDOW_SHRINK;
211	module_param(ple_window_shrink, uint, `0444`);
212
213	/ Default is to compute the maximum so we can never overflow. /
214	static unsigned int ple_window_max = KVM_VMX_DEFAULT_PLE_WINDOW_MAX;
215	module_param(ple_window_max, uint, `0444`);
216
217	/ Default is SYSTEM mode, 1 for host-guest mode /
218	int __read_mostly pt_mode = PT_MODE_SYSTEM;
219	module_param(pt_mode, int, S_IRUGO);
220
221	struct x86_pmu_lbr __ro_after_init vmx_lbr_caps;
222
223	static DEFINE_STATIC_KEY_FALSE(vmx_l1d_should_flush);
224	static DEFINE_STATIC_KEY_FALSE(vmx_l1d_flush_cond);
225	static DEFINE_MUTEX(vmx_l1d_flush_mutex);
226
227	/ Storage for pre module init parameter parsing /
228	static enum vmx_l1d_flush_state __read_mostly vmentry_l1d_flush_param = VMENTER_L1D_FLUSH_AUTO;
229
230	static const struct {
231	const char *option;
232	bool for_parse;
233	} vmentry_l1d_param[] = {
234	[VMENTER_L1D_FLUSH_AUTO] = {"auto", true},
235	[VMENTER_L1D_FLUSH_NEVER] = {.option: "never", .for_parse: true},
236	[VMENTER_L1D_FLUSH_COND] = {.option: "cond", .for_parse: true},
237	[VMENTER_L1D_FLUSH_ALWAYS] = {.option: "always", .for_parse: true},
238	[VMENTER_L1D_FLUSH_EPT_DISABLED] = {.option: "EPT disabled", .for_parse: false},
239	[VMENTER_L1D_FLUSH_NOT_REQUIRED] = {.option: "not required", .for_parse: false},
240	};
241
242	#define L1D_CACHE_ORDER 4
243	static void *vmx_l1d_flush_pages;
244
245	static int vmx_setup_l1d_flush(enum vmx_l1d_flush_state l1tf)
246	{
247	struct page *page;
248	unsigned int i;
249
250	if (!boot_cpu_has_bug(X86_BUG_L1TF)) {
251	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
252	return `0`;
253	}
254
255	if (!enable_ept) {
256	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_EPT_DISABLED;
257	return `0`;
258	}
259
260	if (host_arch_capabilities & ARCH_CAP_SKIP_VMENTRY_L1DFLUSH) {
261	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_NOT_REQUIRED;
262	return `0`;
263	}
264
265	/ If set to auto use the default l1tf mitigation method /
266	if (l1tf == VMENTER_L1D_FLUSH_AUTO) {
267	switch (l1tf_mitigation) {
268	case L1TF_MITIGATION_OFF:
269	l1tf = VMENTER_L1D_FLUSH_NEVER;
270	break;
271	case L1TF_MITIGATION_FLUSH_NOWARN:
272	case L1TF_MITIGATION_FLUSH:
273	case L1TF_MITIGATION_FLUSH_NOSMT:
274	l1tf = VMENTER_L1D_FLUSH_COND;
275	break;
276	case L1TF_MITIGATION_FULL:
277	case L1TF_MITIGATION_FULL_FORCE:
278	l1tf = VMENTER_L1D_FLUSH_ALWAYS;
279	break;
280	}
281	} else if (l1tf_mitigation == L1TF_MITIGATION_FULL_FORCE) {
282	l1tf = VMENTER_L1D_FLUSH_ALWAYS;
283	}
284
285	if (l1tf != VMENTER_L1D_FLUSH_NEVER && !vmx_l1d_flush_pages &&
286	!boot_cpu_has(X86_FEATURE_FLUSH_L1D)) {
287	/*
288	* This allocation for vmx_l1d_flush_pages is not tied to a VM
289	* lifetime and so should not be charged to a memcg.
290	*/
291	page = alloc_pages(GFP_KERNEL, L1D_CACHE_ORDER);
292	if (!page)
293	return -ENOMEM;
294	vmx_l1d_flush_pages = page_address(page);
295
296	/*
297	* Initialize each page with a different pattern in
298	* order to protect against KSM in the nested
299	* virtualization case.
300	*/
301	for (i = `0`; i < `1u` << L1D_CACHE_ORDER; ++i) {
302	memset(vmx_l1d_flush_pages + i * PAGE_SIZE, i + `1`,
303	PAGE_SIZE);
304	}
305	}
306
307	l1tf_vmx_mitigation = l1tf;
308
309	if (l1tf != VMENTER_L1D_FLUSH_NEVER)
310	static_branch_enable(&vmx_l1d_should_flush);
311	else
312	static_branch_disable(&vmx_l1d_should_flush);
313
314	if (l1tf == VMENTER_L1D_FLUSH_COND)
315	static_branch_enable(&vmx_l1d_flush_cond);
316	else
317	static_branch_disable(&vmx_l1d_flush_cond);
318	return `0`;
319	}
320
321	static int vmentry_l1d_flush_parse(const char *s)
322	{
323	unsigned int i;
324
325	if (s) {
326	for (i = `0`; i < ARRAY_SIZE(vmentry_l1d_param); i++) {
327	if (vmentry_l1d_param[i].for_parse &&
328	sysfs_streq(s1: s, s2: vmentry_l1d_param[i].option))
329	return i;
330	}
331	}
332	return -EINVAL;
333	}
334
335	static int vmentry_l1d_flush_set(const char s, const* struct kernel_param *kp)
336	{
337	int l1tf, ret;
338
339	l1tf = vmentry_l1d_flush_parse(s);
340	if (l1tf < `0`)
341	return l1tf;
342
343	if (!boot_cpu_has(X86_BUG_L1TF))
344	return `0`;
345
346	/*
347	* Has vmx_init() run already? If not then this is the pre init
348	* parameter parsing. In that case just store the value and let
349	* vmx_init() do the proper setup after enable_ept has been
350	* established.
351	*/
352	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_AUTO) {
353	vmentry_l1d_flush_param = l1tf;
354	return `0`;
355	}
356
357	mutex_lock(&vmx_l1d_flush_mutex);
358	ret = vmx_setup_l1d_flush(l1tf);
359	mutex_unlock(lock: &vmx_l1d_flush_mutex);
360	return ret;
361	}
362
363	static int vmentry_l1d_flush_get(char s, const* struct kernel_param *kp)
364	{
365	if (WARN_ON_ONCE(l1tf_vmx_mitigation >= ARRAY_SIZE(vmentry_l1d_param)))
366	return sysfs_emit(buf: s, fmt: "???\n");
367
368	return sysfs_emit(buf: s, fmt: "%s\n", vmentry_l1d_param[l1tf_vmx_mitigation].option);
369	}
370
371	static __always_inline void vmx_disable_fb_clear(struct vcpu_vmx *vmx)
372	{
373	u64 msr;
374
375	if (!vmx->disable_fb_clear)
376	return;
377
378	msr = __rdmsr(MSR_IA32_MCU_OPT_CTRL);
379	msr \|= FB_CLEAR_DIS;
380	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, msr);
381	/ Cache the MSR value to avoid reading it later /
382	vmx->msr_ia32_mcu_opt_ctrl = msr;
383	}
384
385	static __always_inline void vmx_enable_fb_clear(struct vcpu_vmx *vmx)
386	{
387	if (!vmx->disable_fb_clear)
388	return;
389
390	vmx->msr_ia32_mcu_opt_ctrl &= ~FB_CLEAR_DIS;
391	native_wrmsrl(MSR_IA32_MCU_OPT_CTRL, vmx->msr_ia32_mcu_opt_ctrl);
392	}
393
394	static void vmx_update_fb_clear_dis(struct kvm_vcpu vcpu, struct* vcpu_vmx *vmx)
395	{
396	/*
397	* Disable VERW's behavior of clearing CPU buffers for the guest if the
398	* CPU isn't affected by MDS/TAA, and the host hasn't forcefully enabled
399	* the mitigation. Disabling the clearing behavior provides a
400	* performance boost for guests that aren't aware that manually clearing
401	* CPU buffers is unnecessary, at the cost of MSR accesses on VM-Entry
402	* and VM-Exit.
403	*/
404	vmx->disable_fb_clear = !cpu_feature_enabled(X86_FEATURE_CLEAR_CPU_BUF) &&
405	(host_arch_capabilities & ARCH_CAP_FB_CLEAR_CTRL) &&
406	!boot_cpu_has_bug(X86_BUG_MDS) &&
407	!boot_cpu_has_bug(X86_BUG_TAA);
408
409	/*
410	* If guest will not execute VERW, there is no need to set FB_CLEAR_DIS
411	* at VMEntry. Skip the MSR read/write when a guest has no use case to
412	* execute VERW.
413	*/
414	if ((vcpu->arch.arch_capabilities & ARCH_CAP_FB_CLEAR) \|\|
415	((vcpu->arch.arch_capabilities & ARCH_CAP_MDS_NO) &&
416	(vcpu->arch.arch_capabilities & ARCH_CAP_TAA_NO) &&
417	(vcpu->arch.arch_capabilities & ARCH_CAP_PSDP_NO) &&
418	(vcpu->arch.arch_capabilities & ARCH_CAP_FBSDP_NO) &&
419	(vcpu->arch.arch_capabilities & ARCH_CAP_SBDR_SSDP_NO)))
420	vmx->disable_fb_clear = false;
421	}
422
423	static const struct kernel_param_ops vmentry_l1d_flush_ops = {
424	.set = vmentry_l1d_flush_set,
425	.get = vmentry_l1d_flush_get,
426	};
427	module_param_cb(vmentry_l1d_flush, &vmentry_l1d_flush_ops, NULL, `0644`);
428
429	static u32 vmx_segment_access_rights(struct kvm_segment *var);
430
431	void vmx_vmexit(void);
432
/*
 * Report a failed VMX instruction: WARN once, and additionally emit a
 * rate-limited pr_warn with the same format and arguments.
 */
#define vmx_insn_failed(fmt...)		\
do {					\
	WARN_ONCE(1, fmt);		\
	pr_warn_ratelimited(fmt);	\
} while (0)
438
439	noinline void vmread_error(unsigned long field)
440	{
441	vmx_insn_failed("vmread failed: field=%lx\n", field);
442	}
443
444	#ifndef CONFIG_CC_HAS_ASM_GOTO_OUTPUT
445	noinstr void vmread_error_trampoline2(unsigned long field, bool fault)
446	{
447	if (fault) {
448	kvm_spurious_fault();
449	} else {
450	instrumentation_begin();
451	vmread_error(field);
452	instrumentation_end();
453	}
454	}
455	#endif
456
457	noinline void vmwrite_error(unsigned long field, unsigned long value)
458	{
459	vmx_insn_failed("vmwrite failed: field=%lx val=%lx err=%u\n",
460	field, value, vmcs_read32(VM_INSTRUCTION_ERROR));
461	}
462
463	noinline void vmclear_error(struct vmcs *vmcs, u64 phys_addr)
464	{
465	vmx_insn_failed("vmclear failed: %p/%llx err=%u\n",
466	vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
467	}
468
469	noinline void vmptrld_error(struct vmcs *vmcs, u64 phys_addr)
470	{
471	vmx_insn_failed("vmptrld failed: %p/%llx err=%u\n",
472	vmcs, phys_addr, vmcs_read32(VM_INSTRUCTION_ERROR));
473	}
474
475	noinline void invvpid_error(unsigned long ext, u16 vpid, gva_t gva)
476	{
477	vmx_insn_failed("invvpid failed: ext=0x%lx vpid=%u gva=0x%lx\n",
478	ext, vpid, gva);
479	}
480
481	noinline void invept_error(unsigned long ext, u64 eptp, gpa_t gpa)
482	{
483	vmx_insn_failed("invept failed: ext=0x%lx eptp=%llx gpa=0x%llx\n",
484	ext, eptp, gpa);
485	}
486
487	static DEFINE_PER_CPU(struct vmcs *, vmxarea);
488	DEFINE_PER_CPU(struct vmcs *, current_vmcs);
489	/*
490	* We maintain a per-CPU linked-list of VMCS loaded on that CPU. This is needed
491	* when a CPU is brought down, and we need to VMCLEAR all VMCSs loaded on it.
492	*/
493	static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
494
495	static DECLARE_BITMAP(vmx_vpid_bitmap, VMX_NR_VPIDS);
496	static DEFINE_SPINLOCK(vmx_vpid_lock);
497
498	struct vmcs_config vmcs_config __ro_after_init;
499	struct vmx_capability vmx_capability __ro_after_init;
500
501	#define VMX_SEGMENT_FIELD(seg) \
502	[VCPU_SREG_##seg] = { \
503	.selector = GUEST_##seg##_SELECTOR, \
504	.base = GUEST_##seg##_BASE, \
505	.limit = GUEST_##seg##_LIMIT, \
506	.ar_bytes = GUEST_##seg##_AR_BYTES, \
507	}
508
509	static const struct kvm_vmx_segment_field {
510	unsigned selector;
511	unsigned base;
512	unsigned limit;
513	unsigned ar_bytes;
514	} kvm_vmx_segment_fields[] = {
515	VMX_SEGMENT_FIELD(CS),
516	VMX_SEGMENT_FIELD(DS),
517	VMX_SEGMENT_FIELD(ES),
518	VMX_SEGMENT_FIELD(FS),
519	VMX_SEGMENT_FIELD(GS),
520	VMX_SEGMENT_FIELD(SS),
521	VMX_SEGMENT_FIELD(TR),
522	VMX_SEGMENT_FIELD(LDTR),
523	};
524
525	static inline void vmx_segment_cache_clear(struct vcpu_vmx *vmx)
526	{
527	vmx->segment_cache.bitmask = `0`;
528	}
529
530	static unsigned long host_idt_base;
531
532	#if IS_ENABLED(CONFIG_HYPERV)
533	static struct kvm_x86_ops vmx_x86_ops __initdata;
534
535	static bool __read_mostly enlightened_vmcs = true;
536	module_param(enlightened_vmcs, bool, `0444`);
537
538	static int hv_enable_l2_tlb_flush(struct kvm_vcpu *vcpu)
539	{
540	struct hv_enlightened_vmcs *evmcs;
541	hpa_t partition_assist_page = hv_get_partition_assist_page(vcpu);
542
543	if (partition_assist_page == INVALID_PAGE)
544	return -ENOMEM;
545
546	evmcs = (struct hv_enlightened_vmcs *)to_vmx(vcpu)->loaded_vmcs->vmcs;
547
548	evmcs->partition_assist_page = partition_assist_page;
549	evmcs->hv_vm_id = (unsigned long)vcpu->kvm;
550	evmcs->hv_enlightenments_control.nested_flush_hypercall = `1`;
551
552	return `0`;
553	}
554
555	static __init void hv_init_evmcs(void)
556	{
557	int cpu;
558
559	if (!enlightened_vmcs)
560	return;
561
562	/*
563	* Enlightened VMCS usage should be recommended and the host needs
564	* to support eVMCS v1 or above.
565	*/
566	if (ms_hyperv.hints & HV_X64_ENLIGHTENED_VMCS_RECOMMENDED &&
567	(ms_hyperv.nested_features & HV_X64_ENLIGHTENED_VMCS_VERSION) >=
568	KVM_EVMCS_VERSION) {
569
570	/ Check that we have assist pages on all online CPUs /
571	for_each_online_cpu(cpu) {
572	if (!hv_get_vp_assist_page(cpu)) {
573	enlightened_vmcs = false;
574	break;
575	}
576	}
577
578	if (enlightened_vmcs) {
579	pr_info("Using Hyper-V Enlightened VMCS\n");
580	static_branch_enable(&__kvm_is_using_evmcs);
581	}
582
583	if (ms_hyperv.nested_features & HV_X64_NESTED_DIRECT_FLUSH)
584	vmx_x86_ops.enable_l2_tlb_flush
585	= hv_enable_l2_tlb_flush;
586
587	} else {
588	enlightened_vmcs = false;
589	}
590	}
591
592	static void hv_reset_evmcs(void)
593	{
594	struct hv_vp_assist_page *vp_ap;
595
596	if (!kvm_is_using_evmcs())
597	return;
598
599	/*
600	* KVM should enable eVMCS if and only if all CPUs have a VP assist
601	* page, and should reject CPU onlining if eVMCS is enabled the CPU
602	* doesn't have a VP assist page allocated.
603	*/
604	vp_ap = hv_get_vp_assist_page(smp_processor_id());
605	if (WARN_ON_ONCE(!vp_ap))
606	return;
607
608	/*
609	* Reset everything to support using non-enlightened VMCS access later
610	* (e.g. when we reload the module with enlightened_vmcs=0)
611	*/
612	vp_ap->nested_control.features.directhypercall = `0`;
613	vp_ap->current_nested_vmcs = `0`;
614	vp_ap->enlighten_vmentry = `0`;
615	}
616
617	#else /* IS_ENABLED(CONFIG_HYPERV) */
/* Hyper-V support compiled out: the eVMCS hooks are empty. */
static void hv_init_evmcs(void)
{
}

static void hv_reset_evmcs(void)
{
}
620	#endif /* IS_ENABLED(CONFIG_HYPERV) */
621
622	/*
623	* Comment's format: document - errata name - stepping - processor name.
624	* Refer from
625	* https://www.virtualbox.org/svn/vbox/trunk/src/VBox/VMM/VMMR0/HMR0.cpp
626	*/
627	static u32 vmx_preemption_cpu_tfms[] = {
628	/ 323344.pdf - BA86 - D0 - Xeon 7500 Series /
629	`0x000206E6`,
630	/ 323056.pdf - AAX65 - C2 - Xeon L3406 /
631	/ 322814.pdf - AAT59 - C2 - i7-600, i5-500, i5-400 and i3-300 Mobile /
632	/ 322911.pdf - AAU65 - C2 - i5-600, i3-500 Desktop and Pentium G6950 /
633	`0x00020652`,
634	/ 322911.pdf - AAU65 - K0 - i5-600, i3-500 Desktop and Pentium G6950 /
635	`0x00020655`,
636	/ 322373.pdf - AAO95 - B1 - Xeon 3400 Series /
637	/ 322166.pdf - AAN92 - B1 - i7-800 and i5-700 Desktop /
638	/*
639	* 320767.pdf - AAP86 - B1 -
640	* i7-900 Mobile Extreme, i7-800 and i7-700 Mobile
641	*/
642	`0x000106E5`,
643	/ 321333.pdf - AAM126 - C0 - Xeon 3500 /
644	`0x000106A0`,
645	/ 321333.pdf - AAM126 - C1 - Xeon 3500 /
646	`0x000106A1`,
647	/ 320836.pdf - AAJ124 - C0 - i7-900 Desktop Extreme and i7-900 Desktop /
648	`0x000106A4`,
649	/ 321333.pdf - AAM126 - D0 - Xeon 3500 /
650	/ 321324.pdf - AAK139 - D0 - Xeon 5500 /
651	/ 320836.pdf - AAJ124 - D0 - i7-900 Extreme and i7-900 Desktop /
652	`0x000106A5`,
653	/ Xeon E3-1220 V2 /
654	`0x000306A8`,
655	};
656
657	static inline bool cpu_has_broken_vmx_preemption_timer(void)
658	{
659	u32 eax = cpuid_eax(op: `0x00000001`), i;
660
661	/ Clear the reserved bits /
662	eax &= ~(`0x3U` << `14` \| `0xfU` << `28`);
663	for (i = `0`; i < ARRAY_SIZE(vmx_preemption_cpu_tfms); i++)
664	if (eax == vmx_preemption_cpu_tfms[i])
665	return true;
666
667	return false;
668	}
669
670	static inline bool cpu_need_virtualize_apic_accesses(struct kvm_vcpu *vcpu)
671	{
672	return flexpriority_enabled && lapic_in_kernel(vcpu);
673	}
674
675	static int vmx_get_passthrough_msr_slot(u32 msr)
676	{
677	int i;
678
679	switch (msr) {
680	case `0x800` ... `0x8ff`:
681	/ x2APIC MSRs. These are handled in vmx_update_msr_bitmap_x2apic() /
682	return -ENOENT;
683	case MSR_IA32_RTIT_STATUS:
684	case MSR_IA32_RTIT_OUTPUT_BASE:
685	case MSR_IA32_RTIT_OUTPUT_MASK:
686	case MSR_IA32_RTIT_CR3_MATCH:
687	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
688	/ PT MSRs. These are handled in pt_update_intercept_for_msr() /
689	case MSR_LBR_SELECT:
690	case MSR_LBR_TOS:
691	case MSR_LBR_INFO_0 ... MSR_LBR_INFO_0 + `31`:
692	case MSR_LBR_NHM_FROM ... MSR_LBR_NHM_FROM + `31`:
693	case MSR_LBR_NHM_TO ... MSR_LBR_NHM_TO + `31`:
694	case MSR_LBR_CORE_FROM ... MSR_LBR_CORE_FROM + `8`:
695	case MSR_LBR_CORE_TO ... MSR_LBR_CORE_TO + `8`:
696	/ LBR MSRs. These are handled in vmx_update_intercept_for_lbr_msrs() /
697	return -ENOENT;
698	}
699
700	for (i = `0`; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
701	if (vmx_possible_passthrough_msrs[i] == msr)
702	return i;
703	}
704
705	WARN(`1`, "Invalid MSR %x, please adapt vmx_possible_passthrough_msrs[]", msr);
706	return -ENOENT;
707	}
708
709	struct vmx_uret_msr vmx_find_uret_msr(struct* vcpu_vmx *vmx, u32 msr)
710	{
711	int i;
712
713	i = kvm_find_user_return_msr(msr);
714	if (i >= `0`)
715	return &vmx->guest_uret_msrs[i];
716	return NULL;
717	}
718
719	static int vmx_set_guest_uret_msr(struct vcpu_vmx *vmx,
720	struct vmx_uret_msr *msr, u64 data)
721	{
722	unsigned int slot = msr - vmx->guest_uret_msrs;
723	int ret = `0`;
724
725	if (msr->load_into_hardware) {
726	preempt_disable();
727	ret = kvm_set_user_return_msr(index: slot, val: data, mask: msr->mask);
728	preempt_enable();
729	}
730	if (!ret)
731	msr->data = data;
732	return ret;
733	}
734
735	/*
736	* Disable VMX and clear CR4.VMXE (even if VMXOFF faults)
737	*
738	* Note, VMXOFF causes a #UD if the CPU is !post-VMXON, but it's impossible to
739	* atomically track post-VMXON state, e.g. this may be called in NMI context.
740	* Eat all faults as all other faults on VMXOFF faults are mode related, i.e.
741	* faults are guaranteed to be due to the !post-VMXON check unless the CPU is
742	* magically in RM, VM86, compat mode, or at CPL>0.
743	*/
744	static int kvm_cpu_vmxoff(void)
745	{
746	asm goto("1: vmxoff\n\t"
747	_ASM_EXTABLE(`1b`, %l[fault])
748	::: "cc", "memory" : fault);
749
750	cr4_clear_bits(X86_CR4_VMXE);
751	return `0`;
752
753	fault:
754	cr4_clear_bits(X86_CR4_VMXE);
755	return -EIO;
756	}
757
758	static void vmx_emergency_disable(void)
759	{
760	int cpu = raw_smp_processor_id();
761	struct loaded_vmcs *v;
762
763	kvm_rebooting = true;
764
765	/*
766	* Note, CR4.VMXE can be _cleared_ in NMI context, but it can only be
767	* set in task context. If this races with VMX is disabled by an NMI,
768	* VMCLEAR and VMXOFF may #UD, but KVM will eat those faults due to
769	* kvm_rebooting set.
770	*/
771	if (!(__read_cr4() & X86_CR4_VMXE))
772	return;
773
774	list_for_each_entry(v, &per_cpu(loaded_vmcss_on_cpu, cpu),
775	loaded_vmcss_on_cpu_link)
776	vmcs_clear(vmcs: v->vmcs);
777
778	kvm_cpu_vmxoff();
779	}
780
781	static void __loaded_vmcs_clear(void *arg)
782	{
783	struct loaded_vmcs *loaded_vmcs = arg;
784	int cpu = raw_smp_processor_id();
785
786	if (loaded_vmcs->cpu != cpu)
787	return; / vcpu migration can race with cpu offline /
788	if (per_cpu(current_vmcs, cpu) == loaded_vmcs->vmcs)
789	per_cpu(current_vmcs, cpu) = NULL;
790
791	vmcs_clear(vmcs: loaded_vmcs->vmcs);
792	if (loaded_vmcs->shadow_vmcs && loaded_vmcs->launched)
793	vmcs_clear(vmcs: loaded_vmcs->shadow_vmcs);
794
795	list_del(entry: &loaded_vmcs->loaded_vmcss_on_cpu_link);
796
797	/*
798	* Ensure all writes to loaded_vmcs, including deleting it from its
799	* current percpu list, complete before setting loaded_vmcs->cpu to
800	* -1, otherwise a different cpu can see loaded_vmcs->cpu == -1 first
801	* and add loaded_vmcs to its percpu list before it's deleted from this
802	* cpu's list. Pairs with the smp_rmb() in vmx_vcpu_load_vmcs().
803	*/
804	smp_wmb();
805
806	loaded_vmcs->cpu = -`1`;
807	loaded_vmcs->launched = `0`;
808	}
809
810	void loaded_vmcs_clear(struct loaded_vmcs *loaded_vmcs)
811	{
812	int cpu = loaded_vmcs->cpu;
813
814	if (cpu != -`1`)
815	smp_call_function_single(cpuid: cpu,
816	func: __loaded_vmcs_clear, info: loaded_vmcs, wait: `1`);
817	}
818
819	static bool vmx_segment_cache_test_set(struct vcpu_vmx vmx, unsigned* seg,
820	unsigned field)
821	{
822	bool ret;
823	u32 mask = `1` << (seg * SEG_FIELD_NR + field);
824
825	if (!kvm_register_is_available(vcpu: &vmx->vcpu, reg: VCPU_EXREG_SEGMENTS)) {
826	kvm_register_mark_available(vcpu: &vmx->vcpu, reg: VCPU_EXREG_SEGMENTS);
827	vmx->segment_cache.bitmask = `0`;
828	}
829	ret = vmx->segment_cache.bitmask & mask;
830	vmx->segment_cache.bitmask \|= mask;
831	return ret;
832	}
833
834	static u16 vmx_read_guest_seg_selector(struct vcpu_vmx vmx, unsigned* seg)
835	{
836	u16 *p = &vmx->segment_cache.seg[seg].selector;
837
838	if (!vmx_segment_cache_test_set(vmx, seg, field: SEG_FIELD_SEL))
839	*p = vmcs_read16(field: kvm_vmx_segment_fields[seg].selector);
840	return *p;
841	}
842
843	static ulong vmx_read_guest_seg_base(struct vcpu_vmx vmx, unsigned* seg)
844	{
845	ulong *p = &vmx->segment_cache.seg[seg].base;
846
847	if (!vmx_segment_cache_test_set(vmx, seg, field: SEG_FIELD_BASE))
848	*p = vmcs_readl(field: kvm_vmx_segment_fields[seg].base);
849	return *p;
850	}
851
852	static u32 vmx_read_guest_seg_limit(struct vcpu_vmx vmx, unsigned* seg)
853	{
854	u32 *p = &vmx->segment_cache.seg[seg].limit;
855
856	if (!vmx_segment_cache_test_set(vmx, seg, field: SEG_FIELD_LIMIT))
857	*p = vmcs_read32(field: kvm_vmx_segment_fields[seg].limit);
858	return *p;
859	}
860
861	static u32 vmx_read_guest_seg_ar(struct vcpu_vmx vmx, unsigned* seg)
862	{
863	u32 *p = &vmx->segment_cache.seg[seg].ar;
864
865	if (!vmx_segment_cache_test_set(vmx, seg, field: SEG_FIELD_AR))
866	*p = vmcs_read32(field: kvm_vmx_segment_fields[seg].ar_bytes);
867	return *p;
868	}
869
870	void vmx_update_exception_bitmap(struct kvm_vcpu *vcpu)
871	{
872	u32 eb;
873
874	eb = (`1u` << PF_VECTOR) \| (`1u` << UD_VECTOR) \| (`1u` << MC_VECTOR) \|
875	(`1u` << DB_VECTOR) \| (`1u` << AC_VECTOR);
876	/*
877	* Guest access to VMware backdoor ports could legitimately
878	* trigger #GP because of TSS I/O permission bitmap.
879	* We intercept those #GP and allow access to them anyway
880	* as VMware does.
881	*/
882	if (enable_vmware_backdoor)
883	eb \|= (`1u` << GP_VECTOR);
884	if ((vcpu->guest_debug &
885	(KVM_GUESTDBG_ENABLE \| KVM_GUESTDBG_USE_SW_BP)) ==
886	(KVM_GUESTDBG_ENABLE \| KVM_GUESTDBG_USE_SW_BP))
887	eb \|= `1u` << BP_VECTOR;
888	if (to_vmx(vcpu)->rmode.vm86_active)
889	eb = ~`0`;
890	if (!vmx_need_pf_intercept(vcpu))
891	eb &= ~(`1u` << PF_VECTOR);
892
893	/ When we are running a nested L2 guest and L1 specified for it a*
894	* certain exception bitmap, we must trap the same exceptions and pass
895	* them to L1. When running L2, we will only handle the exceptions
896	* specified above if L1 did not want them.
897	*/
898	if (is_guest_mode(vcpu))
899	eb \|= get_vmcs12(vcpu)->exception_bitmap;
900	else {
901	int mask = `0`, match = `0`;
902
903	if (enable_ept && (eb & (`1u` << PF_VECTOR))) {
904	/*
905	* If EPT is enabled, #PF is currently only intercepted
906	* if MAXPHYADDR is smaller on the guest than on the
907	* host. In that case we only care about present,
908	* non-reserved faults. For vmcs02, however, PFEC_MASK
909	* and PFEC_MATCH are set in prepare_vmcs02_rare.
910	*/
911	mask = PFERR_PRESENT_MASK \| PFERR_RSVD_MASK;
912	match = PFERR_PRESENT_MASK;
913	}
914	vmcs_write32(field: PAGE_FAULT_ERROR_CODE_MASK, value: mask);
915	vmcs_write32(field: PAGE_FAULT_ERROR_CODE_MATCH, value: match);
916	}
917
918	/*
919	* Disabling xfd interception indicates that dynamic xfeatures
920	* might be used in the guest. Always trap #NM in this case
921	* to save guest xfd_err timely.
922	*/
923	if (vcpu->arch.xfd_no_write_intercept)
924	eb \|= (`1u` << NM_VECTOR);
925
926	vmcs_write32(field: EXCEPTION_BITMAP, value: eb);
927	}
928
929	/*
930	* Check if MSR is intercepted for currently loaded MSR bitmap.
931	*/
932	static bool msr_write_intercepted(struct vcpu_vmx *vmx, u32 msr)
933	{
934	if (!(exec_controls_get(vmx) & CPU_BASED_USE_MSR_BITMAPS))
935	return true;
936
937	return vmx_test_msr_bitmap_write(bitmap: vmx->loaded_vmcs->msr_bitmap, msr);
938	}
939
940	unsigned int __vmx_vcpu_run_flags(struct vcpu_vmx *vmx)
941	{
942	unsigned int flags = `0`;
943
944	if (vmx->loaded_vmcs->launched)
945	flags \|= VMX_RUN_VMRESUME;
946
947	/*
948	* If writes to the SPEC_CTRL MSR aren't intercepted, the guest is free
949	* to change it directly without causing a vmexit. In that case read
950	* it after vmexit and store it in vmx->spec_ctrl.
951	*/
952	if (!msr_write_intercepted(vmx, MSR_IA32_SPEC_CTRL))
953	flags \|= VMX_RUN_SAVE_SPEC_CTRL;
954
955	return flags;
956	}
957
958	static __always_inline void clear_atomic_switch_msr_special(struct vcpu_vmx *vmx,
959	unsigned long entry, unsigned long exit)
960	{
961	vm_entry_controls_clearbit(vmx, val: entry);
962	vm_exit_controls_clearbit(vmx, val: exit);
963	}
964
965	int vmx_find_loadstore_msr_slot(struct vmx_msrs *m, u32 msr)
966	{
967	unsigned int i;
968
969	for (i = `0`; i < m->nr; ++i) {
970	if (m->val[i].index == msr)
971	return i;
972	}
973	return -ENOENT;
974	}
975
976	static void clear_atomic_switch_msr(struct vcpu_vmx vmx, unsigned* msr)
977	{
978	int i;
979	struct msr_autoload *m = &vmx->msr_autoload;
980
981	switch (msr) {
982	case MSR_EFER:
983	if (cpu_has_load_ia32_efer()) {
984	clear_atomic_switch_msr_special(vmx,
985	VM_ENTRY_LOAD_IA32_EFER,
986	VM_EXIT_LOAD_IA32_EFER);
987	return;
988	}
989	break;
990	case MSR_CORE_PERF_GLOBAL_CTRL:
991	if (cpu_has_load_perf_global_ctrl()) {
992	clear_atomic_switch_msr_special(vmx,
993	VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
994	VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL);
995	return;
996	}
997	break;
998	}
999	i = vmx_find_loadstore_msr_slot(m: &m->guest, msr);
1000	if (i < `0`)
1001	goto skip_guest;
1002	--m->guest.nr;
1003	m->guest.val[i] = m->guest.val[m->guest.nr];
1004	vmcs_write32(field: VM_ENTRY_MSR_LOAD_COUNT, value: m->guest.nr);
1005
1006	skip_guest:
1007	i = vmx_find_loadstore_msr_slot(m: &m->host, msr);
1008	if (i < `0`)
1009	return;
1010
1011	--m->host.nr;
1012	m->host.val[i] = m->host.val[m->host.nr];
1013	vmcs_write32(field: VM_EXIT_MSR_LOAD_COUNT, value: m->host.nr);
1014	}
1015
1016	static __always_inline void add_atomic_switch_msr_special(struct vcpu_vmx *vmx,
1017	unsigned long entry, unsigned long exit,
1018	unsigned long guest_val_vmcs, unsigned long host_val_vmcs,
1019	u64 guest_val, u64 host_val)
1020	{
1021	vmcs_write64(field: guest_val_vmcs, value: guest_val);
1022	if (host_val_vmcs != HOST_IA32_EFER)
1023	vmcs_write64(field: host_val_vmcs, value: host_val);
1024	vm_entry_controls_setbit(vmx, val: entry);
1025	vm_exit_controls_setbit(vmx, val: exit);
1026	}
1027
1028	static void add_atomic_switch_msr(struct vcpu_vmx vmx, unsigned* msr,
1029	u64 guest_val, u64 host_val, bool entry_only)
1030	{
1031	int i, j = `0`;
1032	struct msr_autoload *m = &vmx->msr_autoload;
1033
1034	switch (msr) {
1035	case MSR_EFER:
1036	if (cpu_has_load_ia32_efer()) {
1037	add_atomic_switch_msr_special(vmx,
1038	VM_ENTRY_LOAD_IA32_EFER,
1039	VM_EXIT_LOAD_IA32_EFER,
1040	guest_val_vmcs: GUEST_IA32_EFER,
1041	host_val_vmcs: HOST_IA32_EFER,
1042	guest_val, host_val);
1043	return;
1044	}
1045	break;
1046	case MSR_CORE_PERF_GLOBAL_CTRL:
1047	if (cpu_has_load_perf_global_ctrl()) {
1048	add_atomic_switch_msr_special(vmx,
1049	VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL,
1050	VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL,
1051	guest_val_vmcs: GUEST_IA32_PERF_GLOBAL_CTRL,
1052	host_val_vmcs: HOST_IA32_PERF_GLOBAL_CTRL,
1053	guest_val, host_val);
1054	return;
1055	}
1056	break;
1057	case MSR_IA32_PEBS_ENABLE:
1058	/ PEBS needs a quiescent period after being disabled (to write*
1059	* a record). Disabling PEBS through VMX MSR swapping doesn't
1060	* provide that period, so a CPU could write host's record into
1061	* guest's memory.
1062	*/
1063	wrmsrl(MSR_IA32_PEBS_ENABLE, val: `0`);
1064	}
1065
1066	i = vmx_find_loadstore_msr_slot(m: &m->guest, msr);
1067	if (!entry_only)
1068	j = vmx_find_loadstore_msr_slot(m: &m->host, msr);
1069
1070	if ((i < `0` && m->guest.nr == MAX_NR_LOADSTORE_MSRS) \|\|
1071	(j < `0` && m->host.nr == MAX_NR_LOADSTORE_MSRS)) {
1072	printk_once(KERN_WARNING "Not enough msr switch entries. "
1073	"Can't add msr %x\n", msr);
1074	return;
1075	}
1076	if (i < `0`) {
1077	i = m->guest.nr++;
1078	vmcs_write32(field: VM_ENTRY_MSR_LOAD_COUNT, value: m->guest.nr);
1079	}
1080	m->guest.val[i].index = msr;
1081	m->guest.val[i].value = guest_val;
1082
1083	if (entry_only)
1084	return;
1085
1086	if (j < `0`) {
1087	j = m->host.nr++;
1088	vmcs_write32(field: VM_EXIT_MSR_LOAD_COUNT, value: m->host.nr);
1089	}
1090	m->host.val[j].index = msr;
1091	m->host.val[j].value = host_val;
1092	}
1093
1094	static bool update_transition_efer(struct vcpu_vmx *vmx)
1095	{
1096	u64 guest_efer = vmx->vcpu.arch.efer;
1097	u64 ignore_bits = `0`;
1098	int i;
1099
1100	/ Shadow paging assumes NX to be available. /
1101	if (!enable_ept)
1102	guest_efer \|= EFER_NX;
1103
1104	/*
1105	* LMA and LME handled by hardware; SCE meaningless outside long mode.
1106	*/
1107	ignore_bits \|= EFER_SCE;
1108	#ifdef CONFIG_X86_64
1109	ignore_bits \|= EFER_LMA \| EFER_LME;
1110	/ SCE is meaningful only in long mode on Intel /
1111	if (guest_efer & EFER_LMA)
1112	ignore_bits &= ~(u64)EFER_SCE;
1113	#endif
1114
1115	/*
1116	* On EPT, we can't emulate NX, so we must switch EFER atomically.
1117	* On CPUs that support "load IA32_EFER", always switch EFER
1118	* atomically, since it's faster than switching it manually.
1119	*/
1120	if (cpu_has_load_ia32_efer() \|\|
1121	(enable_ept && ((vmx->vcpu.arch.efer ^ host_efer) & EFER_NX))) {
1122	if (!(guest_efer & EFER_LMA))
1123	guest_efer &= ~EFER_LME;
1124	if (guest_efer != host_efer)
1125	add_atomic_switch_msr(vmx, MSR_EFER,
1126	guest_val: guest_efer, host_val: host_efer, entry_only: false);
1127	else
1128	clear_atomic_switch_msr(vmx, MSR_EFER);
1129	return false;
1130	}
1131
1132	i = kvm_find_user_return_msr(MSR_EFER);
1133	if (i < `0`)
1134	return false;
1135
1136	clear_atomic_switch_msr(vmx, MSR_EFER);
1137
1138	guest_efer &= ~ignore_bits;
1139	guest_efer \|= host_efer & ignore_bits;
1140
1141	vmx->guest_uret_msrs[i].data = guest_efer;
1142	vmx->guest_uret_msrs[i].mask = ~ignore_bits;
1143
1144	return true;
1145	}
1146
1147	#ifdef CONFIG_X86_32
1148	/*
1149	* On 32-bit kernels, VM exits still load the FS and GS bases from the
1150	* VMCS rather than the segment table. KVM uses this helper to figure
1151	* out the current bases to poke them into the VMCS before entry.
1152	*/
1153	static unsigned long segment_base(u16 selector)
1154	{
1155	struct desc_struct *table;
1156	unsigned long v;
1157
1158	if (!(selector & ~SEGMENT_RPL_MASK))
1159	return `0`;
1160
1161	table = get_current_gdt_ro();
1162
1163	if ((selector & SEGMENT_TI_MASK) == SEGMENT_LDT) {
1164	u16 ldt_selector = kvm_read_ldt();
1165
1166	if (!(ldt_selector & ~SEGMENT_RPL_MASK))
1167	return `0`;
1168
1169	table = (struct desc_struct *)segment_base(ldt_selector);
1170	}
1171	v = get_desc_base(&table[selector >> `3`]);
1172	return v;
1173	}
1174	#endif
1175
1176	static inline bool pt_can_write_msr(struct vcpu_vmx *vmx)
1177	{
1178	return vmx_pt_mode_is_host_guest() &&
1179	!(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
1180	}
1181
1182	static inline bool pt_output_base_valid(struct kvm_vcpu *vcpu, u64 base)
1183	{
1184	/ The base must be 128-byte aligned and a legal physical address. /
1185	return kvm_vcpu_is_legal_aligned_gpa(vcpu, gpa: base, alignment: `128`);
1186	}
1187
1188	static inline void pt_load_msr(struct pt_ctx *ctx, u32 addr_range)
1189	{
1190	u32 i;
1191
1192	wrmsrl(MSR_IA32_RTIT_STATUS, val: ctx->status);
1193	wrmsrl(MSR_IA32_RTIT_OUTPUT_BASE, val: ctx->output_base);
1194	wrmsrl(MSR_IA32_RTIT_OUTPUT_MASK, val: ctx->output_mask);
1195	wrmsrl(MSR_IA32_RTIT_CR3_MATCH, val: ctx->cr3_match);
1196	for (i = `0`; i < addr_range; i++) {
1197	wrmsrl(MSR_IA32_RTIT_ADDR0_A + i * `2`, val: ctx->addr_a[i]);
1198	wrmsrl(MSR_IA32_RTIT_ADDR0_B + i * `2`, val: ctx->addr_b[i]);
1199	}
1200	}
1201
1202	static inline void pt_save_msr(struct pt_ctx *ctx, u32 addr_range)
1203	{
1204	u32 i;
1205
1206	rdmsrl(MSR_IA32_RTIT_STATUS, ctx->status);
1207	rdmsrl(MSR_IA32_RTIT_OUTPUT_BASE, ctx->output_base);
1208	rdmsrl(MSR_IA32_RTIT_OUTPUT_MASK, ctx->output_mask);
1209	rdmsrl(MSR_IA32_RTIT_CR3_MATCH, ctx->cr3_match);
1210	for (i = `0`; i < addr_range; i++) {
1211	rdmsrl(MSR_IA32_RTIT_ADDR0_A + i * `2`, ctx->addr_a[i]);
1212	rdmsrl(MSR_IA32_RTIT_ADDR0_B + i * `2`, ctx->addr_b[i]);
1213	}
1214	}
1215
1216	static void pt_guest_enter(struct vcpu_vmx *vmx)
1217	{
1218	if (vmx_pt_mode_is_system())
1219	return;
1220
1221	/*
1222	* GUEST_IA32_RTIT_CTL is already set in the VMCS.
1223	* Save host state before VM entry.
1224	*/
1225	rdmsrl(MSR_IA32_RTIT_CTL, vmx->pt_desc.host.ctl);
1226	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1227	wrmsrl(MSR_IA32_RTIT_CTL, val: `0`);
1228	pt_save_msr(ctx: &vmx->pt_desc.host, addr_range: vmx->pt_desc.num_address_ranges);
1229	pt_load_msr(ctx: &vmx->pt_desc.guest, addr_range: vmx->pt_desc.num_address_ranges);
1230	}
1231	}
1232
1233	static void pt_guest_exit(struct vcpu_vmx *vmx)
1234	{
1235	if (vmx_pt_mode_is_system())
1236	return;
1237
1238	if (vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) {
1239	pt_save_msr(ctx: &vmx->pt_desc.guest, addr_range: vmx->pt_desc.num_address_ranges);
1240	pt_load_msr(ctx: &vmx->pt_desc.host, addr_range: vmx->pt_desc.num_address_ranges);
1241	}
1242
1243	/*
1244	* KVM requires VM_EXIT_CLEAR_IA32_RTIT_CTL to expose PT to the guest,
1245	* i.e. RTIT_CTL is always cleared on VM-Exit. Restore it if necessary.
1246	*/
1247	if (vmx->pt_desc.host.ctl)
1248	wrmsrl(MSR_IA32_RTIT_CTL, val: vmx->pt_desc.host.ctl);
1249	}
1250
1251	void vmx_set_host_fs_gs(struct vmcs_host_state *host, u16 fs_sel, u16 gs_sel,
1252	unsigned long fs_base, unsigned long gs_base)
1253	{
1254	if (unlikely(fs_sel != host->fs_sel)) {
1255	if (!(fs_sel & `7`))
1256	vmcs_write16(field: HOST_FS_SELECTOR, value: fs_sel);
1257	else
1258	vmcs_write16(field: HOST_FS_SELECTOR, value: `0`);
1259	host->fs_sel = fs_sel;
1260	}
1261	if (unlikely(gs_sel != host->gs_sel)) {
1262	if (!(gs_sel & `7`))
1263	vmcs_write16(field: HOST_GS_SELECTOR, value: gs_sel);
1264	else
1265	vmcs_write16(field: HOST_GS_SELECTOR, value: `0`);
1266	host->gs_sel = gs_sel;
1267	}
1268	if (unlikely(fs_base != host->fs_base)) {
1269	vmcs_writel(field: HOST_FS_BASE, value: fs_base);
1270	host->fs_base = fs_base;
1271	}
1272	if (unlikely(gs_base != host->gs_base)) {
1273	vmcs_writel(field: HOST_GS_BASE, value: gs_base);
1274	host->gs_base = gs_base;
1275	}
1276	}
1277
1278	void vmx_prepare_switch_to_guest(struct kvm_vcpu *vcpu)
1279	{
1280	struct vcpu_vmx *vmx = to_vmx(vcpu);
1281	struct vmcs_host_state *host_state;
1282	#ifdef CONFIG_X86_64
1283	int cpu = raw_smp_processor_id();
1284	#endif
1285	unsigned long fs_base, gs_base;
1286	u16 fs_sel, gs_sel;
1287	int i;
1288
1289	/*
1290	* Note that guest MSRs to be saved/restored can also be changed
1291	* when guest state is loaded. This happens when guest transitions
1292	* to/from long-mode by setting MSR_EFER.LMA.
1293	*/
1294	if (!vmx->guest_uret_msrs_loaded) {
1295	vmx->guest_uret_msrs_loaded = true;
1296	for (i = `0`; i < kvm_nr_uret_msrs; ++i) {
1297	if (!vmx->guest_uret_msrs[i].load_into_hardware)
1298	continue;
1299
1300	kvm_set_user_return_msr(index: i,
1301	val: vmx->guest_uret_msrs[i].data,
1302	mask: vmx->guest_uret_msrs[i].mask);
1303	}
1304	}
1305
1306	if (vmx->nested.need_vmcs12_to_shadow_sync)
1307	nested_sync_vmcs12_to_shadow(vcpu);
1308
1309	if (vmx->guest_state_loaded)
1310	return;
1311
1312	host_state = &vmx->loaded_vmcs->host_state;
1313
1314	/*
1315	* Set host fs and gs selectors. Unfortunately, 22.2.3 does not
1316	* allow segment selectors with cpl > 0 or ti == 1.
1317	*/
1318	host_state->ldt_sel = kvm_read_ldt();
1319
1320	#ifdef CONFIG_X86_64
1321	savesegment(ds, host_state->ds_sel);
1322	savesegment(es, host_state->es_sel);
1323
1324	gs_base = cpu_kernelmode_gs_base(cpu);
1325	if (likely(is_64bit_mm(current->mm))) {
1326	current_save_fsgs();
1327	fs_sel = current->thread.fsindex;
1328	gs_sel = current->thread.gsindex;
1329	fs_base = current->thread.fsbase;
1330	vmx->msr_host_kernel_gs_base = current->thread.gsbase;
1331	} else {
1332	savesegment(fs, fs_sel);
1333	savesegment(gs, gs_sel);
1334	fs_base = read_msr(MSR_FS_BASE);
1335	vmx->msr_host_kernel_gs_base = read_msr(MSR_KERNEL_GS_BASE);
1336	}
1337
1338	wrmsrl(MSR_KERNEL_GS_BASE, val: vmx->msr_guest_kernel_gs_base);
1339	#else
1340	savesegment(fs, fs_sel);
1341	savesegment(gs, gs_sel);
1342	fs_base = segment_base(fs_sel);
1343	gs_base = segment_base(gs_sel);
1344	#endif
1345
1346	vmx_set_host_fs_gs(host: host_state, fs_sel, gs_sel, fs_base, gs_base);
1347	vmx->guest_state_loaded = true;
1348	}
1349
1350	static void vmx_prepare_switch_to_host(struct vcpu_vmx *vmx)
1351	{
1352	struct vmcs_host_state *host_state;
1353
1354	if (!vmx->guest_state_loaded)
1355	return;
1356
1357	host_state = &vmx->loaded_vmcs->host_state;
1358
1359	++vmx->vcpu.stat.host_state_reload;
1360
1361	#ifdef CONFIG_X86_64
1362	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1363	#endif
1364	if (host_state->ldt_sel \|\| (host_state->gs_sel & `7`)) {
1365	kvm_load_ldt(sel: host_state->ldt_sel);
1366	#ifdef CONFIG_X86_64
1367	load_gs_index(gs: host_state->gs_sel);
1368	#else
1369	loadsegment(gs, host_state->gs_sel);
1370	#endif
1371	}
1372	if (host_state->fs_sel & `7`)
1373	loadsegment(fs, host_state->fs_sel);
1374	#ifdef CONFIG_X86_64
1375	if (unlikely(host_state->ds_sel \| host_state->es_sel)) {
1376	loadsegment(ds, host_state->ds_sel);
1377	loadsegment(es, host_state->es_sel);
1378	}
1379	#endif
1380	invalidate_tss_limit();
1381	#ifdef CONFIG_X86_64
1382	wrmsrl(MSR_KERNEL_GS_BASE, val: vmx->msr_host_kernel_gs_base);
1383	#endif
1384	load_fixmap_gdt(raw_smp_processor_id());
1385	vmx->guest_state_loaded = false;
1386	vmx->guest_uret_msrs_loaded = false;
1387	}
1388
1389	#ifdef CONFIG_X86_64
1390	static u64 vmx_read_guest_kernel_gs_base(struct vcpu_vmx *vmx)
1391	{
1392	preempt_disable();
1393	if (vmx->guest_state_loaded)
1394	rdmsrl(MSR_KERNEL_GS_BASE, vmx->msr_guest_kernel_gs_base);
1395	preempt_enable();
1396	return vmx->msr_guest_kernel_gs_base;
1397	}
1398
1399	static void vmx_write_guest_kernel_gs_base(struct vcpu_vmx *vmx, u64 data)
1400	{
1401	preempt_disable();
1402	if (vmx->guest_state_loaded)
1403	wrmsrl(MSR_KERNEL_GS_BASE, val: data);
1404	preempt_enable();
1405	vmx->msr_guest_kernel_gs_base = data;
1406	}
1407	#endif
1408
1409	void vmx_vcpu_load_vmcs(struct kvm_vcpu vcpu, int* cpu,
1410	struct loaded_vmcs *buddy)
1411	{
1412	struct vcpu_vmx *vmx = to_vmx(vcpu);
1413	bool already_loaded = vmx->loaded_vmcs->cpu == cpu;
1414	struct vmcs *prev;
1415
1416	if (!already_loaded) {
1417	loaded_vmcs_clear(loaded_vmcs: vmx->loaded_vmcs);
1418	local_irq_disable();
1419
1420	/*
1421	* Ensure loaded_vmcs->cpu is read before adding loaded_vmcs to
1422	* this cpu's percpu list, otherwise it may not yet be deleted
1423	* from its previous cpu's percpu list. Pairs with the
1424	* smb_wmb() in __loaded_vmcs_clear().
1425	*/
1426	smp_rmb();
1427
1428	list_add(new: &vmx->loaded_vmcs->loaded_vmcss_on_cpu_link,
1429	head: &per_cpu(loaded_vmcss_on_cpu, cpu));
1430	local_irq_enable();
1431	}
1432
1433	prev = per_cpu(current_vmcs, cpu);
1434	if (prev != vmx->loaded_vmcs->vmcs) {
1435	per_cpu(current_vmcs, cpu) = vmx->loaded_vmcs->vmcs;
1436	vmcs_load(vmcs: vmx->loaded_vmcs->vmcs);
1437
1438	/*
1439	* No indirect branch prediction barrier needed when switching
1440	* the active VMCS within a vCPU, unless IBRS is advertised to
1441	* the vCPU. To minimize the number of IBPBs executed, KVM
1442	* performs IBPB on nested VM-Exit (a single nested transition
1443	* may switch the active VMCS multiple times).
1444	*/
1445	if (!buddy \|\| WARN_ON_ONCE(buddy->vmcs != prev))
1446	indirect_branch_prediction_barrier();
1447	}
1448
1449	if (!already_loaded) {
1450	void *gdt = get_current_gdt_ro();
1451
1452	/*
1453	* Flush all EPTP/VPID contexts, the new pCPU may have stale
1454	* TLB entries from its previous association with the vCPU.
1455	*/
1456	kvm_make_request(KVM_REQ_TLB_FLUSH, vcpu);
1457
1458	/*
1459	* Linux uses per-cpu TSS and GDT, so set these when switching
1460	* processors. See 22.2.4.
1461	*/
1462	vmcs_writel(field: HOST_TR_BASE,
1463	value: (unsigned long)&get_cpu_entry_area(cpu)->tss.x86_tss);
1464	vmcs_writel(field: HOST_GDTR_BASE, value: (unsigned long)gdt); / 22.2.4 /
1465
1466	if (IS_ENABLED(CONFIG_IA32_EMULATION) \|\| IS_ENABLED(CONFIG_X86_32)) {
1467	/ 22.2.3 /
1468	vmcs_writel(field: HOST_IA32_SYSENTER_ESP,
1469	value: (unsigned long)(cpu_entry_stack(cpu) + `1`));
1470	}
1471
1472	vmx->loaded_vmcs->cpu = cpu;
1473	}
1474	}
1475
1476	/*
1477	* Switches to specified vcpu, until a matching vcpu_put(), but assumes
1478	* vcpu mutex is already taken.
1479	*/
1480	static void vmx_vcpu_load(struct kvm_vcpu vcpu, int* cpu)
1481	{
1482	struct vcpu_vmx *vmx = to_vmx(vcpu);
1483
1484	vmx_vcpu_load_vmcs(vcpu, cpu, NULL);
1485
1486	vmx_vcpu_pi_load(vcpu, cpu);
1487
1488	vmx->host_debugctlmsr = get_debugctlmsr();
1489	}
1490
/*
 * Counterpart of vmx_vcpu_load(): performs the posted-interrupt put step and
 * restores host state if guest state is loaded.
 */
static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
{
	vmx_vcpu_pi_put(vcpu);

	vmx_prepare_switch_to_host(to_vmx(vcpu));
}
1497
1498	bool vmx_emulation_required(struct kvm_vcpu *vcpu)
1499	{
1500	return emulate_invalid_guest_state && !vmx_guest_state_valid(vcpu);
1501	}
1502
1503	unsigned long vmx_get_rflags(struct kvm_vcpu *vcpu)
1504	{
1505	struct vcpu_vmx *vmx = to_vmx(vcpu);
1506	unsigned long rflags, save_rflags;
1507
1508	if (!kvm_register_is_available(vcpu, reg: VCPU_EXREG_RFLAGS)) {
1509	kvm_register_mark_available(vcpu, reg: VCPU_EXREG_RFLAGS);
1510	rflags = vmcs_readl(field: GUEST_RFLAGS);
1511	if (vmx->rmode.vm86_active) {
1512	rflags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
1513	save_rflags = vmx->rmode.save_rflags;
1514	rflags \|= save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
1515	}
1516	vmx->rflags = rflags;
1517	}
1518	return vmx->rflags;
1519	}
1520
1521	void vmx_set_rflags(struct kvm_vcpu vcpu, unsigned* long rflags)
1522	{
1523	struct vcpu_vmx *vmx = to_vmx(vcpu);
1524	unsigned long old_rflags;
1525
1526	/*
1527	* Unlike CR0 and CR4, RFLAGS handling requires checking if the vCPU
1528	* is an unrestricted guest in order to mark L2 as needing emulation
1529	* if L1 runs L2 as a restricted guest.
1530	*/
1531	if (is_unrestricted_guest(vcpu)) {
1532	kvm_register_mark_available(vcpu, reg: VCPU_EXREG_RFLAGS);
1533	vmx->rflags = rflags;
1534	vmcs_writel(field: GUEST_RFLAGS, value: rflags);
1535	return;
1536	}
1537
1538	old_rflags = vmx_get_rflags(vcpu);
1539	vmx->rflags = rflags;
1540	if (vmx->rmode.vm86_active) {
1541	vmx->rmode.save_rflags = rflags;
1542	rflags \|= X86_EFLAGS_IOPL \| X86_EFLAGS_VM;
1543	}
1544	vmcs_writel(field: GUEST_RFLAGS, value: rflags);
1545
1546	if ((old_rflags ^ vmx->rflags) & X86_EFLAGS_VM)
1547	vmx->emulation_required = vmx_emulation_required(vcpu);
1548	}
1549
1550	static bool vmx_get_if_flag(struct kvm_vcpu *vcpu)
1551	{
1552	return vmx_get_rflags(vcpu) & X86_EFLAGS_IF;
1553	}
1554
1555	u32 vmx_get_interrupt_shadow(struct kvm_vcpu *vcpu)
1556	{
1557	u32 interruptibility = vmcs_read32(field: GUEST_INTERRUPTIBILITY_INFO);
1558	int ret = `0`;
1559
1560	if (interruptibility & GUEST_INTR_STATE_STI)
1561	ret \|= KVM_X86_SHADOW_INT_STI;
1562	if (interruptibility & GUEST_INTR_STATE_MOV_SS)
1563	ret \|= KVM_X86_SHADOW_INT_MOV_SS;
1564
1565	return ret;
1566	}
1567
1568	void vmx_set_interrupt_shadow(struct kvm_vcpu vcpu, int* mask)
1569	{
1570	u32 interruptibility_old = vmcs_read32(field: GUEST_INTERRUPTIBILITY_INFO);
1571	u32 interruptibility = interruptibility_old;
1572
1573	interruptibility &= ~(GUEST_INTR_STATE_STI \| GUEST_INTR_STATE_MOV_SS);
1574
1575	if (mask & KVM_X86_SHADOW_INT_MOV_SS)
1576	interruptibility \|= GUEST_INTR_STATE_MOV_SS;
1577	else if (mask & KVM_X86_SHADOW_INT_STI)
1578	interruptibility \|= GUEST_INTR_STATE_STI;
1579
1580	if ((interruptibility != interruptibility_old))
1581	vmcs_write32(field: GUEST_INTERRUPTIBILITY_INFO, value: interruptibility);
1582	}
1583
1584	static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
1585	{
1586	struct vcpu_vmx *vmx = to_vmx(vcpu);
1587	unsigned long value;
1588
1589	/*
1590	* Any MSR write that attempts to change bits marked reserved will
1591	* case a #GP fault.
1592	*/
1593	if (data & vmx->pt_desc.ctl_bitmask)
1594	return `1`;
1595
1596	/*
1597	* Any attempt to modify IA32_RTIT_CTL while TraceEn is set will
1598	* result in a #GP unless the same write also clears TraceEn.
1599	*/
1600	if ((vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN) &&
1601	((vmx->pt_desc.guest.ctl ^ data) & ~RTIT_CTL_TRACEEN))
1602	return `1`;
1603
1604	/*
1605	* WRMSR to IA32_RTIT_CTL that sets TraceEn but clears this bit
1606	* and FabricEn would cause #GP, if
1607	* CPUID.(EAX=14H, ECX=0):ECX.SNGLRGNOUT[bit 2] = 0
1608	*/
1609	if ((data & RTIT_CTL_TRACEEN) && !(data & RTIT_CTL_TOPA) &&
1610	!(data & RTIT_CTL_FABRIC_EN) &&
1611	!intel_pt_validate_cap(caps: vmx->pt_desc.caps,
1612	cap: PT_CAP_single_range_output))
1613	return `1`;
1614
1615	/*
1616	* MTCFreq, CycThresh and PSBFreq encodings check, any MSR write that
1617	* utilize encodings marked reserved will cause a #GP fault.
1618	*/
1619	value = intel_pt_validate_cap(caps: vmx->pt_desc.caps, cap: PT_CAP_mtc_periods);
1620	if (intel_pt_validate_cap(caps: vmx->pt_desc.caps, cap: PT_CAP_mtc) &&
1621	!test_bit((data & RTIT_CTL_MTC_RANGE) >>
1622	RTIT_CTL_MTC_RANGE_OFFSET, &value))
1623	return `1`;
1624	value = intel_pt_validate_cap(caps: vmx->pt_desc.caps,
1625	cap: PT_CAP_cycle_thresholds);
1626	if (intel_pt_validate_cap(caps: vmx->pt_desc.caps, cap: PT_CAP_psb_cyc) &&
1627	!test_bit((data & RTIT_CTL_CYC_THRESH) >>
1628	RTIT_CTL_CYC_THRESH_OFFSET, &value))
1629	return `1`;
1630	value = intel_pt_validate_cap(caps: vmx->pt_desc.caps, cap: PT_CAP_psb_periods);
1631	if (intel_pt_validate_cap(caps: vmx->pt_desc.caps, cap: PT_CAP_psb_cyc) &&
1632	!test_bit((data & RTIT_CTL_PSB_FREQ) >>
1633	RTIT_CTL_PSB_FREQ_OFFSET, &value))
1634	return `1`;
1635
1636	/*
1637	* If ADDRx_CFG is reserved or the encodings is >2 will
1638	* cause a #GP fault.
1639	*/
1640	value = (data & RTIT_CTL_ADDR0) >> RTIT_CTL_ADDR0_OFFSET;
1641	if ((value && (vmx->pt_desc.num_address_ranges < `1`)) \|\| (value > `2`))
1642	return `1`;
1643	value = (data & RTIT_CTL_ADDR1) >> RTIT_CTL_ADDR1_OFFSET;
1644	if ((value && (vmx->pt_desc.num_address_ranges < `2`)) \|\| (value > `2`))
1645	return `1`;
1646	value = (data & RTIT_CTL_ADDR2) >> RTIT_CTL_ADDR2_OFFSET;
1647	if ((value && (vmx->pt_desc.num_address_ranges < `3`)) \|\| (value > `2`))
1648	return `1`;
1649	value = (data & RTIT_CTL_ADDR3) >> RTIT_CTL_ADDR3_OFFSET;
1650	if ((value && (vmx->pt_desc.num_address_ranges < `4`)) \|\| (value > `2`))
1651	return `1`;
1652
1653	return `0`;
1654	}
1655
1656	static int vmx_check_emulate_instruction(struct kvm_vcpu vcpu, int* emul_type,
1657	void insn, int* insn_len)
1658	{
1659	/*
1660	* Emulation of instructions in SGX enclaves is impossible as RIP does
1661	* not point at the failing instruction, and even if it did, the code
1662	* stream is inaccessible. Inject #UD instead of exiting to userspace
1663	* so that guest userspace can't DoS the guest simply by triggering
1664	* emulation (enclaves are CPL3 only).
1665	*/
1666	if (to_vmx(vcpu)->exit_reason.enclave_mode) {
1667	kvm_queue_exception(vcpu, UD_VECTOR);
1668	return X86EMUL_PROPAGATE_FAULT;
1669	}
1670	return X86EMUL_CONTINUE;
1671	}
1672
1673	static int skip_emulated_instruction(struct kvm_vcpu *vcpu)
1674	{
1675	union vmx_exit_reason exit_reason = to_vmx(vcpu)->exit_reason;
1676	unsigned long rip, orig_rip;
1677	u32 instr_len;
1678
1679	/*
1680	* Using VMCS.VM_EXIT_INSTRUCTION_LEN on EPT misconfig depends on
1681	* undefined behavior: Intel's SDM doesn't mandate the VMCS field be
1682	* set when EPT misconfig occurs. In practice, real hardware updates
1683	* VM_EXIT_INSTRUCTION_LEN on EPT misconfig, but other hypervisors
1684	* (namely Hyper-V) don't set it due to it being undefined behavior,
1685	* i.e. we end up advancing IP with some random value.
1686	*/
1687	if (!static_cpu_has(X86_FEATURE_HYPERVISOR) \|\|
1688	exit_reason.basic != EXIT_REASON_EPT_MISCONFIG) {
1689	instr_len = vmcs_read32(field: VM_EXIT_INSTRUCTION_LEN);
1690
1691	/*
1692	* Emulating an enclave's instructions isn't supported as KVM
1693	* cannot access the enclave's memory or its true RIP, e.g. the
1694	* vmcs.GUEST_RIP points at the exit point of the enclave, not
1695	* the RIP that actually triggered the VM-Exit. But, because
1696	* most instructions that cause VM-Exit will #UD in an enclave,
1697	* most instruction-based VM-Exits simply do not occur.
1698	*
1699	* There are a few exceptions, notably the debug instructions
1700	* INT1ICEBRK and INT3, as they are allowed in debug enclaves
1701	* and generate #DB/#BP as expected, which KVM might intercept.
1702	* But again, the CPU does the dirty work and saves an instr
1703	* length of zero so VMMs don't shoot themselves in the foot.
1704	* WARN if KVM tries to skip a non-zero length instruction on
1705	* a VM-Exit from an enclave.
1706	*/
1707	if (!instr_len)
1708	goto rip_updated;
1709
1710	WARN_ONCE(exit_reason.enclave_mode,
1711	"skipping instruction after SGX enclave VM-Exit");
1712
1713	orig_rip = kvm_rip_read(vcpu);
1714	rip = orig_rip + instr_len;
1715	#ifdef CONFIG_X86_64
1716	/*
1717	* We need to mask out the high 32 bits of RIP if not in 64-bit
1718	* mode, but just finding out that we are in 64-bit mode is
1719	* quite expensive. Only do it if there was a carry.
1720	*/
1721	if (unlikely(((rip ^ orig_rip) >> `31`) == `3`) && !is_64_bit_mode(vcpu))
1722	rip = (u32)rip;
1723	#endif
1724	kvm_rip_write(vcpu, val: rip);
1725	} else {
1726	if (!kvm_emulate_instruction(vcpu, EMULTYPE_SKIP))
1727	return `0`;
1728	}
1729
1730	rip_updated:
1731	/ skipping an emulated instruction also counts /
1732	vmx_set_interrupt_shadow(vcpu, mask: `0`);
1733
1734	return `1`;
1735	}
1736
1737	/*
1738	* Recognizes a pending MTF VM-exit and records the nested state for later
1739	* delivery.
1740	*/
1741	static void vmx_update_emulated_instruction(struct kvm_vcpu *vcpu)
1742	{
1743	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1744	struct vcpu_vmx *vmx = to_vmx(vcpu);
1745
1746	if (!is_guest_mode(vcpu))
1747	return;
1748
1749	/*
1750	* Per the SDM, MTF takes priority over debug-trap exceptions besides
1751	* TSS T-bit traps and ICEBP (INT1). KVM doesn't emulate T-bit traps
1752	* or ICEBP (in the emulator proper), and skipping of ICEBP after an
1753	* intercepted #DB deliberately avoids single-step #DB and MTF updates
1754	* as ICEBP is higher priority than both. As instruction emulation is
1755	* completed at this point (i.e. KVM is at the instruction boundary),
1756	* any #DB exception pending delivery must be a debug-trap of lower
1757	* priority than MTF. Record the pending MTF state to be delivered in
1758	* vmx_check_nested_events().
1759	*/
1760	if (nested_cpu_has_mtf(vmcs12) &&
1761	(!vcpu->arch.exception.pending \|\|
1762	vcpu->arch.exception.vector == DB_VECTOR) &&
1763	(!vcpu->arch.exception_vmexit.pending \|\|
1764	vcpu->arch.exception_vmexit.vector == DB_VECTOR)) {
1765	vmx->nested.mtf_pending = true;
1766	kvm_make_request(KVM_REQ_EVENT, vcpu);
1767	} else {
1768	vmx->nested.mtf_pending = false;
1769	}
1770	}
1771
/*
 * Update the nested MTF bookkeeping, then skip the instruction.  Returns the
 * result of skip_emulated_instruction().
 */
static int vmx_skip_emulated_instruction(struct kvm_vcpu *vcpu)
{
	int skipped;

	vmx_update_emulated_instruction(vcpu);
	skipped = skip_emulated_instruction(vcpu);

	return skipped;
}
1777
1778	static void vmx_clear_hlt(struct kvm_vcpu *vcpu)
1779	{
1780	/*
1781	* Ensure that we clear the HLT state in the VMCS. We don't need to
1782	* explicitly skip the instruction because if the HLT state is set,
1783	* then the instruction is already executing and RIP has already been
1784	* advanced.
1785	*/
1786	if (kvm_hlt_in_guest(kvm: vcpu->kvm) &&
1787	vmcs_read32(field: GUEST_ACTIVITY_STATE) == GUEST_ACTIVITY_HLT)
1788	vmcs_write32(field: GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
1789	}
1790
1791	static void vmx_inject_exception(struct kvm_vcpu *vcpu)
1792	{
1793	struct kvm_queued_exception *ex = &vcpu->arch.exception;
1794	u32 intr_info = ex->vector \| INTR_INFO_VALID_MASK;
1795	struct vcpu_vmx *vmx = to_vmx(vcpu);
1796
1797	kvm_deliver_exception_payload(vcpu, ex);
1798
1799	if (ex->has_error_code) {
1800	/*
1801	* Despite the error code being architecturally defined as 32
1802	* bits, and the VMCS field being 32 bits, Intel CPUs and thus
1803	* VMX don't actually supporting setting bits 31:16. Hardware
1804	* will (should) never provide a bogus error code, but AMD CPUs
1805	* do generate error codes with bits 31:16 set, and so KVM's
1806	* ABI lets userspace shove in arbitrary 32-bit values. Drop
1807	* the upper bits to avoid VM-Fail, losing information that
1808	* doesn't really exist is preferable to killing the VM.
1809	*/
1810	vmcs_write32(field: VM_ENTRY_EXCEPTION_ERROR_CODE, value: (u16)ex->error_code);
1811	intr_info \|= INTR_INFO_DELIVER_CODE_MASK;
1812	}
1813
1814	if (vmx->rmode.vm86_active) {
1815	int inc_eip = `0`;
1816	if (kvm_exception_is_soft(nr: ex->vector))
1817	inc_eip = vcpu->arch.event_exit_inst_len;
1818	kvm_inject_realmode_interrupt(vcpu, irq: ex->vector, inc_eip);
1819	return;
1820	}
1821
1822	WARN_ON_ONCE(vmx->emulation_required);
1823
1824	if (kvm_exception_is_soft(nr: ex->vector)) {
1825	vmcs_write32(field: VM_ENTRY_INSTRUCTION_LEN,
1826	value: vmx->vcpu.arch.event_exit_inst_len);
1827	intr_info \|= INTR_TYPE_SOFT_EXCEPTION;
1828	} else
1829	intr_info \|= INTR_TYPE_HARD_EXCEPTION;
1830
1831	vmcs_write32(field: VM_ENTRY_INTR_INFO_FIELD, value: intr_info);
1832
1833	vmx_clear_hlt(vcpu);
1834	}
1835
1836	static void vmx_setup_uret_msr(struct vcpu_vmx vmx, unsigned* int msr,
1837	bool load_into_hardware)
1838	{
1839	struct vmx_uret_msr *uret_msr;
1840
1841	uret_msr = vmx_find_uret_msr(vmx, msr);
1842	if (!uret_msr)
1843	return;
1844
1845	uret_msr->load_into_hardware = load_into_hardware;
1846	}
1847
1848	/*
1849	* Configuring user return MSRs to automatically save, load, and restore MSRs
1850	* that need to be shoved into hardware when running the guest. Note, omitting
1851	* an MSR here does _NOT_ mean it's not emulated, only that it will not be
1852	* loaded into hardware when running the guest.
1853	*/
1854	static void vmx_setup_uret_msrs(struct vcpu_vmx *vmx)
1855	{
1856	#ifdef CONFIG_X86_64
1857	bool load_syscall_msrs;
1858
1859	/*
1860	* The SYSCALL MSRs are only needed on long mode guests, and only
1861	* when EFER.SCE is set.
1862	*/
1863	load_syscall_msrs = is_long_mode(vcpu: &vmx->vcpu) &&
1864	(vmx->vcpu.arch.efer & EFER_SCE);
1865
1866	vmx_setup_uret_msr(vmx, MSR_STAR, load_into_hardware: load_syscall_msrs);
1867	vmx_setup_uret_msr(vmx, MSR_LSTAR, load_into_hardware: load_syscall_msrs);
1868	vmx_setup_uret_msr(vmx, MSR_SYSCALL_MASK, load_into_hardware: load_syscall_msrs);
1869	#endif
1870	vmx_setup_uret_msr(vmx, MSR_EFER, load_into_hardware: update_transition_efer(vmx));
1871
1872	vmx_setup_uret_msr(vmx, MSR_TSC_AUX,
1873	load_into_hardware: guest_cpuid_has(vcpu: &vmx->vcpu, X86_FEATURE_RDTSCP) \|\|
1874	guest_cpuid_has(vcpu: &vmx->vcpu, X86_FEATURE_RDPID));
1875
1876	/*
1877	* hle=0, rtm=0, tsx_ctrl=1 can be found with some combinations of new
1878	* kernel and old userspace. If those guests run on a tsx=off host, do
1879	* allow guests to use TSX_CTRL, but don't change the value in hardware
1880	* so that TSX remains always disabled.
1881	*/
1882	vmx_setup_uret_msr(vmx, MSR_IA32_TSX_CTRL, boot_cpu_has(X86_FEATURE_RTM));
1883
1884	/*
1885	* The set of MSRs to load may have changed, reload MSRs before the
1886	* next VM-Enter.
1887	*/
1888	vmx->guest_uret_msrs_loaded = false;
1889	}
1890
1891	u64 vmx_get_l2_tsc_offset(struct kvm_vcpu *vcpu)
1892	{
1893	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1894
1895	if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING))
1896	return vmcs12->tsc_offset;
1897
1898	return `0`;
1899	}
1900
1901	u64 vmx_get_l2_tsc_multiplier(struct kvm_vcpu *vcpu)
1902	{
1903	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
1904
1905	if (nested_cpu_has(vmcs12, CPU_BASED_USE_TSC_OFFSETTING) &&
1906	nested_cpu_has2(vmcs12, SECONDARY_EXEC_TSC_SCALING))
1907	return vmcs12->tsc_multiplier;
1908
1909	return kvm_caps.default_tsc_scaling_ratio;
1910	}
1911
1912	static void vmx_write_tsc_offset(struct kvm_vcpu *vcpu)
1913	{
1914	vmcs_write64(field: TSC_OFFSET, value: vcpu->arch.tsc_offset);
1915	}
1916
1917	static void vmx_write_tsc_multiplier(struct kvm_vcpu *vcpu)
1918	{
1919	vmcs_write64(field: TSC_MULTIPLIER, value: vcpu->arch.tsc_scaling_ratio);
1920	}
1921
/*
 * Userspace is allowed to set any supported IA32_FEATURE_CONTROL regardless of
 * guest CPUID.  Note, KVM allows userspace to set "VMX in SMX" to maintain
 * backwards compatibility even though KVM doesn't support emulating SMX.  And
 * because userspace set "VMX in SMX", the guest must also be allowed to set it,
 * e.g. if the MSR is left unlocked and the guest does a RMW operation.
 */
#define KVM_SUPPORTED_FEATURE_CONTROL  (FEAT_CTL_LOCKED			 | \
					FEAT_CTL_VMX_ENABLED_INSIDE_SMX	 | \
					FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX | \
					FEAT_CTL_SGX_LC_ENABLED		 | \
					FEAT_CTL_SGX_ENABLED		 | \
					FEAT_CTL_LMCE_ENABLED)
1935
1936	static inline bool is_vmx_feature_control_msr_valid(struct vcpu_vmx *vmx,
1937	struct msr_data *msr)
1938	{
1939	uint64_t valid_bits;
1940
1941	/*
1942	* Ensure KVM_SUPPORTED_FEATURE_CONTROL is updated when new bits are
1943	* exposed to the guest.
1944	*/
1945	WARN_ON_ONCE(vmx->msr_ia32_feature_control_valid_bits &
1946	~KVM_SUPPORTED_FEATURE_CONTROL);
1947
1948	if (!msr->host_initiated &&
1949	(vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED))
1950	return false;
1951
1952	if (msr->host_initiated)
1953	valid_bits = KVM_SUPPORTED_FEATURE_CONTROL;
1954	else
1955	valid_bits = vmx->msr_ia32_feature_control_valid_bits;
1956
1957	return !(msr->data & ~valid_bits);
1958	}
1959
1960	static int vmx_get_msr_feature(struct kvm_msr_entry *msr)
1961	{
1962	switch (msr->index) {
1963	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
1964	if (!nested)
1965	return `1`;
1966	return vmx_get_vmx_msr(msrs: &vmcs_config.nested, msr_index: msr->index, pdata: &msr->data);
1967	default:
1968	return KVM_MSR_RET_INVALID;
1969	}
1970	}
1971
1972	/*
1973	* Reads an msr value (of 'msr_info->index') into 'msr_info->data'.
1974	* Returns 0 on success, non-0 otherwise.
1975	* Assumes vcpu_load() was already called.
1976	*/
1977	static int vmx_get_msr(struct kvm_vcpu vcpu, struct* msr_data *msr_info)
1978	{
1979	struct vcpu_vmx *vmx = to_vmx(vcpu);
1980	struct vmx_uret_msr *msr;
1981	u32 index;
1982
1983	switch (msr_info->index) {
1984	#ifdef CONFIG_X86_64
1985	case MSR_FS_BASE:
1986	msr_info->data = vmcs_readl(field: GUEST_FS_BASE);
1987	break;
1988	case MSR_GS_BASE:
1989	msr_info->data = vmcs_readl(field: GUEST_GS_BASE);
1990	break;
1991	case MSR_KERNEL_GS_BASE:
1992	msr_info->data = vmx_read_guest_kernel_gs_base(vmx);
1993	break;
1994	#endif
1995	case MSR_EFER:
1996	return kvm_get_msr_common(vcpu, msr: msr_info);
1997	case MSR_IA32_TSX_CTRL:
1998	if (!msr_info->host_initiated &&
1999	!(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2000	return `1`;
2001	goto find_uret_msr;
2002	case MSR_IA32_UMWAIT_CONTROL:
2003	if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2004	return `1`;
2005
2006	msr_info->data = vmx->msr_ia32_umwait_control;
2007	break;
2008	case MSR_IA32_SPEC_CTRL:
2009	if (!msr_info->host_initiated &&
2010	!guest_has_spec_ctrl_msr(vcpu))
2011	return `1`;
2012
2013	msr_info->data = to_vmx(vcpu)->spec_ctrl;
2014	break;
2015	case MSR_IA32_SYSENTER_CS:
2016	msr_info->data = vmcs_read32(field: GUEST_SYSENTER_CS);
2017	break;
2018	case MSR_IA32_SYSENTER_EIP:
2019	msr_info->data = vmcs_readl(field: GUEST_SYSENTER_EIP);
2020	break;
2021	case MSR_IA32_SYSENTER_ESP:
2022	msr_info->data = vmcs_readl(field: GUEST_SYSENTER_ESP);
2023	break;
2024	case MSR_IA32_BNDCFGS:
2025	if (!kvm_mpx_supported() \|\|
2026	(!msr_info->host_initiated &&
2027	!guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2028	return `1`;
2029	msr_info->data = vmcs_read64(field: GUEST_BNDCFGS);
2030	break;
2031	case MSR_IA32_MCG_EXT_CTL:
2032	if (!msr_info->host_initiated &&
2033	!(vmx->msr_ia32_feature_control &
2034	FEAT_CTL_LMCE_ENABLED))
2035	return `1`;
2036	msr_info->data = vcpu->arch.mcg_ext_ctl;
2037	break;
2038	case MSR_IA32_FEAT_CTL:
2039	msr_info->data = vmx->msr_ia32_feature_control;
2040	break;
2041	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2042	if (!msr_info->host_initiated &&
2043	!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
2044	return `1`;
2045	msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
2046	[msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
2047	break;
2048	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
2049	if (!guest_can_use(vcpu, X86_FEATURE_VMX))
2050	return `1`;
2051	if (vmx_get_vmx_msr(msrs: &vmx->nested.msrs, msr_index: msr_info->index,
2052	pdata: &msr_info->data))
2053	return `1`;
2054	#ifdef CONFIG_KVM_HYPERV
2055	/*
2056	* Enlightened VMCS v1 doesn't have certain VMCS fields but
2057	* instead of just ignoring the features, different Hyper-V
2058	* versions are either trying to use them and fail or do some
2059	* sanity checking and refuse to boot. Filter all unsupported
2060	* features out.
2061	*/
2062	if (!msr_info->host_initiated && guest_cpuid_has_evmcs(vcpu))
2063	nested_evmcs_filter_control_msr(vcpu, msr_index: msr_info->index,
2064	pdata: &msr_info->data);
2065	#endif
2066	break;
2067	case MSR_IA32_RTIT_CTL:
2068	if (!vmx_pt_mode_is_host_guest())
2069	return `1`;
2070	msr_info->data = vmx->pt_desc.guest.ctl;
2071	break;
2072	case MSR_IA32_RTIT_STATUS:
2073	if (!vmx_pt_mode_is_host_guest())
2074	return `1`;
2075	msr_info->data = vmx->pt_desc.guest.status;
2076	break;
2077	case MSR_IA32_RTIT_CR3_MATCH:
2078	if (!vmx_pt_mode_is_host_guest() \|\|
2079	!intel_pt_validate_cap(caps: vmx->pt_desc.caps,
2080	cap: PT_CAP_cr3_filtering))
2081	return `1`;
2082	msr_info->data = vmx->pt_desc.guest.cr3_match;
2083	break;
2084	case MSR_IA32_RTIT_OUTPUT_BASE:
2085	if (!vmx_pt_mode_is_host_guest() \|\|
2086	(!intel_pt_validate_cap(caps: vmx->pt_desc.caps,
2087	cap: PT_CAP_topa_output) &&
2088	!intel_pt_validate_cap(caps: vmx->pt_desc.caps,
2089	cap: PT_CAP_single_range_output)))
2090	return `1`;
2091	msr_info->data = vmx->pt_desc.guest.output_base;
2092	break;
2093	case MSR_IA32_RTIT_OUTPUT_MASK:
2094	if (!vmx_pt_mode_is_host_guest() \|\|
2095	(!intel_pt_validate_cap(caps: vmx->pt_desc.caps,
2096	cap: PT_CAP_topa_output) &&
2097	!intel_pt_validate_cap(caps: vmx->pt_desc.caps,
2098	cap: PT_CAP_single_range_output)))
2099	return `1`;
2100	msr_info->data = vmx->pt_desc.guest.output_mask;
2101	break;
2102	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2103	index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2104	if (!vmx_pt_mode_is_host_guest() \|\|
2105	(index >= `2` * vmx->pt_desc.num_address_ranges))
2106	return `1`;
2107	if (index % `2`)
2108	msr_info->data = vmx->pt_desc.guest.addr_b[index / `2`];
2109	else
2110	msr_info->data = vmx->pt_desc.guest.addr_a[index / `2`];
2111	break;
2112	case MSR_IA32_DEBUGCTLMSR:
2113	msr_info->data = vmcs_read64(field: GUEST_IA32_DEBUGCTL);
2114	break;
2115	default:
2116	find_uret_msr:
2117	msr = vmx_find_uret_msr(vmx, msr: msr_info->index);
2118	if (msr) {
2119	msr_info->data = msr->data;
2120	break;
2121	}
2122	return kvm_get_msr_common(vcpu, msr: msr_info);
2123	}
2124
2125	return `0`;
2126	}
2127
2128	static u64 nested_vmx_truncate_sysenter_addr(struct kvm_vcpu *vcpu,
2129	u64 data)
2130	{
2131	#ifdef CONFIG_X86_64
2132	if (!guest_cpuid_has(vcpu, X86_FEATURE_LM))
2133	return (u32)data;
2134	#endif
2135	return (unsigned long)data;
2136	}
2137
2138	static u64 vmx_get_supported_debugctl(struct kvm_vcpu *vcpu, bool host_initiated)
2139	{
2140	u64 debugctl = `0`;
2141
2142	if (boot_cpu_has(X86_FEATURE_BUS_LOCK_DETECT) &&
2143	(host_initiated \|\| guest_cpuid_has(vcpu, X86_FEATURE_BUS_LOCK_DETECT)))
2144	debugctl \|= DEBUGCTLMSR_BUS_LOCK_DETECT;
2145
2146	if ((kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT) &&
2147	(host_initiated \|\| intel_pmu_lbr_is_enabled(vcpu)))
2148	debugctl \|= DEBUGCTLMSR_LBR \| DEBUGCTLMSR_FREEZE_LBRS_ON_PMI;
2149
2150	return debugctl;
2151	}
2152
2153	/*
2154	* Writes msr value into the appropriate "register".
2155	* Returns 0 on success, non-0 otherwise.
2156	* Assumes vcpu_load() was already called.
2157	*/
2158	static int vmx_set_msr(struct kvm_vcpu vcpu, struct* msr_data *msr_info)
2159	{
2160	struct vcpu_vmx *vmx = to_vmx(vcpu);
2161	struct vmx_uret_msr *msr;
2162	int ret = `0`;
2163	u32 msr_index = msr_info->index;
2164	u64 data = msr_info->data;
2165	u32 index;
2166
2167	switch (msr_index) {
2168	case MSR_EFER:
2169	ret = kvm_set_msr_common(vcpu, msr: msr_info);
2170	break;
2171	#ifdef CONFIG_X86_64
2172	case MSR_FS_BASE:
2173	vmx_segment_cache_clear(vmx);
2174	vmcs_writel(field: GUEST_FS_BASE, value: data);
2175	break;
2176	case MSR_GS_BASE:
2177	vmx_segment_cache_clear(vmx);
2178	vmcs_writel(field: GUEST_GS_BASE, value: data);
2179	break;
2180	case MSR_KERNEL_GS_BASE:
2181	vmx_write_guest_kernel_gs_base(vmx, data);
2182	break;
2183	case MSR_IA32_XFD:
2184	ret = kvm_set_msr_common(vcpu, msr: msr_info);
2185	/*
2186	* Always intercepting WRMSR could incur non-negligible
2187	* overhead given xfd might be changed frequently in
2188	* guest context switch. Disable write interception
2189	* upon the first write with a non-zero value (indicating
2190	* potential usage on dynamic xfeatures). Also update
2191	* exception bitmap to trap #NM for proper virtualization
2192	* of guest xfd_err.
2193	*/
2194	if (!ret && data) {
2195	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_XFD,
2196	MSR_TYPE_RW);
2197	vcpu->arch.xfd_no_write_intercept = true;
2198	vmx_update_exception_bitmap(vcpu);
2199	}
2200	break;
2201	#endif
2202	case MSR_IA32_SYSENTER_CS:
2203	if (is_guest_mode(vcpu))
2204	get_vmcs12(vcpu)->guest_sysenter_cs = data;
2205	vmcs_write32(field: GUEST_SYSENTER_CS, value: data);
2206	break;
2207	case MSR_IA32_SYSENTER_EIP:
2208	if (is_guest_mode(vcpu)) {
2209	data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2210	get_vmcs12(vcpu)->guest_sysenter_eip = data;
2211	}
2212	vmcs_writel(field: GUEST_SYSENTER_EIP, value: data);
2213	break;
2214	case MSR_IA32_SYSENTER_ESP:
2215	if (is_guest_mode(vcpu)) {
2216	data = nested_vmx_truncate_sysenter_addr(vcpu, data);
2217	get_vmcs12(vcpu)->guest_sysenter_esp = data;
2218	}
2219	vmcs_writel(field: GUEST_SYSENTER_ESP, value: data);
2220	break;
2221	case MSR_IA32_DEBUGCTLMSR: {
2222	u64 invalid;
2223
2224	invalid = data & ~vmx_get_supported_debugctl(vcpu, host_initiated: msr_info->host_initiated);
2225	if (invalid & (DEBUGCTLMSR_BTF\|DEBUGCTLMSR_LBR)) {
2226	kvm_pr_unimpl_wrmsr(vcpu, msr: msr_index, data);
2227	data &= ~(DEBUGCTLMSR_BTF\|DEBUGCTLMSR_LBR);
2228	invalid &= ~(DEBUGCTLMSR_BTF\|DEBUGCTLMSR_LBR);
2229	}
2230
2231	if (invalid)
2232	return `1`;
2233
2234	if (is_guest_mode(vcpu) && get_vmcs12(vcpu)->vm_exit_controls &
2235	VM_EXIT_SAVE_DEBUG_CONTROLS)
2236	get_vmcs12(vcpu)->guest_ia32_debugctl = data;
2237
2238	vmcs_write64(field: GUEST_IA32_DEBUGCTL, value: data);
2239	if (intel_pmu_lbr_is_enabled(vcpu) && !to_vmx(vcpu)->lbr_desc.event &&
2240	(data & DEBUGCTLMSR_LBR))
2241	intel_pmu_create_guest_lbr_event(vcpu);
2242	return `0`;
2243	}
2244	case MSR_IA32_BNDCFGS:
2245	if (!kvm_mpx_supported() \|\|
2246	(!msr_info->host_initiated &&
2247	!guest_cpuid_has(vcpu, X86_FEATURE_MPX)))
2248	return `1`;
2249	if (is_noncanonical_address(la: data & PAGE_MASK, vcpu) \|\|
2250	(data & MSR_IA32_BNDCFGS_RSVD))
2251	return `1`;
2252
2253	if (is_guest_mode(vcpu) &&
2254	((vmx->nested.msrs.entry_ctls_high & VM_ENTRY_LOAD_BNDCFGS) \|\|
2255	(vmx->nested.msrs.exit_ctls_high & VM_EXIT_CLEAR_BNDCFGS)))
2256	get_vmcs12(vcpu)->guest_bndcfgs = data;
2257
2258	vmcs_write64(field: GUEST_BNDCFGS, value: data);
2259	break;
2260	case MSR_IA32_UMWAIT_CONTROL:
2261	if (!msr_info->host_initiated && !vmx_has_waitpkg(vmx))
2262	return `1`;
2263
2264	/ The reserved bit 1 and non-32 bit [63:32] should be zero /
2265	if (data & (BIT_ULL(`1`) \| GENMASK_ULL(`63`, `32`)))
2266	return `1`;
2267
2268	vmx->msr_ia32_umwait_control = data;
2269	break;
2270	case MSR_IA32_SPEC_CTRL:
2271	if (!msr_info->host_initiated &&
2272	!guest_has_spec_ctrl_msr(vcpu))
2273	return `1`;
2274
2275	if (kvm_spec_ctrl_test_value(value: data))
2276	return `1`;
2277
2278	vmx->spec_ctrl = data;
2279	if (!data)
2280	break;
2281
2282	/*
2283	* For non-nested:
2284	* When it's written (to non-zero) for the first time, pass
2285	* it through.
2286	*
2287	* For nested:
2288	* The handling of the MSR bitmap for L2 guests is done in
2289	* nested_vmx_prepare_msr_bitmap. We should not touch the
2290	* vmcs02.msr_bitmap here since it gets completely overwritten
2291	* in the merging. We update the vmcs01 here for L1 as well
2292	* since it will end up touching the MSR anyway now.
2293	*/
2294	vmx_disable_intercept_for_msr(vcpu,
2295	MSR_IA32_SPEC_CTRL,
2296	MSR_TYPE_RW);
2297	break;
2298	case MSR_IA32_TSX_CTRL:
2299	if (!msr_info->host_initiated &&
2300	!(vcpu->arch.arch_capabilities & ARCH_CAP_TSX_CTRL_MSR))
2301	return `1`;
2302	if (data & ~(TSX_CTRL_RTM_DISABLE \| TSX_CTRL_CPUID_CLEAR))
2303	return `1`;
2304	goto find_uret_msr;
2305	case MSR_IA32_CR_PAT:
2306	ret = kvm_set_msr_common(vcpu, msr: msr_info);
2307	if (ret)
2308	break;
2309
2310	if (is_guest_mode(vcpu) &&
2311	get_vmcs12(vcpu)->vm_exit_controls & VM_EXIT_SAVE_IA32_PAT)
2312	get_vmcs12(vcpu)->guest_ia32_pat = data;
2313
2314	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
2315	vmcs_write64(field: GUEST_IA32_PAT, value: data);
2316	break;
2317	case MSR_IA32_MCG_EXT_CTL:
2318	if ((!msr_info->host_initiated &&
2319	!(to_vmx(vcpu)->msr_ia32_feature_control &
2320	FEAT_CTL_LMCE_ENABLED)) \|\|
2321	(data & ~MCG_EXT_CTL_LMCE_EN))
2322	return `1`;
2323	vcpu->arch.mcg_ext_ctl = data;
2324	break;
2325	case MSR_IA32_FEAT_CTL:
2326	if (!is_vmx_feature_control_msr_valid(vmx, msr: msr_info))
2327	return `1`;
2328
2329	vmx->msr_ia32_feature_control = data;
2330	if (msr_info->host_initiated && data == `0`)
2331	vmx_leave_nested(vcpu);
2332
2333	/ SGX may be enabled/disabled by guest's firmware /
2334	vmx_write_encls_bitmap(vcpu, NULL);
2335	break;
2336	case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
2337	/*
2338	* On real hardware, the LE hash MSRs are writable before
2339	* the firmware sets bit 0 in MSR 0x7a ("activating" SGX),
2340	* at which point SGX related bits in IA32_FEATURE_CONTROL
2341	* become writable.
2342	*
2343	* KVM does not emulate SGX activation for simplicity, so
2344	* allow writes to the LE hash MSRs if IA32_FEATURE_CONTROL
2345	* is unlocked. This is technically not architectural
2346	* behavior, but it's close enough.
2347	*/
2348	if (!msr_info->host_initiated &&
2349	(!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) \|\|
2350	((vmx->msr_ia32_feature_control & FEAT_CTL_LOCKED) &&
2351	!(vmx->msr_ia32_feature_control & FEAT_CTL_SGX_LC_ENABLED))))
2352	return `1`;
2353	vmx->msr_ia32_sgxlepubkeyhash
2354	[msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
2355	break;
2356	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
2357	if (!msr_info->host_initiated)
2358	return `1`; / they are read-only /
2359	if (!guest_can_use(vcpu, X86_FEATURE_VMX))
2360	return `1`;
2361	return vmx_set_vmx_msr(vcpu, msr_index, data);
2362	case MSR_IA32_RTIT_CTL:
2363	if (!vmx_pt_mode_is_host_guest() \|\|
2364	vmx_rtit_ctl_check(vcpu, data) \|\|
2365	vmx->nested.vmxon)
2366	return `1`;
2367	vmcs_write64(field: GUEST_IA32_RTIT_CTL, value: data);
2368	vmx->pt_desc.guest.ctl = data;
2369	pt_update_intercept_for_msr(vcpu);
2370	break;
2371	case MSR_IA32_RTIT_STATUS:
2372	if (!pt_can_write_msr(vmx))
2373	return `1`;
2374	if (data & MSR_IA32_RTIT_STATUS_MASK)
2375	return `1`;
2376	vmx->pt_desc.guest.status = data;
2377	break;
2378	case MSR_IA32_RTIT_CR3_MATCH:
2379	if (!pt_can_write_msr(vmx))
2380	return `1`;
2381	if (!intel_pt_validate_cap(caps: vmx->pt_desc.caps,
2382	cap: PT_CAP_cr3_filtering))
2383	return `1`;
2384	vmx->pt_desc.guest.cr3_match = data;
2385	break;
2386	case MSR_IA32_RTIT_OUTPUT_BASE:
2387	if (!pt_can_write_msr(vmx))
2388	return `1`;
2389	if (!intel_pt_validate_cap(caps: vmx->pt_desc.caps,
2390	cap: PT_CAP_topa_output) &&
2391	!intel_pt_validate_cap(caps: vmx->pt_desc.caps,
2392	cap: PT_CAP_single_range_output))
2393	return `1`;
2394	if (!pt_output_base_valid(vcpu, base: data))
2395	return `1`;
2396	vmx->pt_desc.guest.output_base = data;
2397	break;
2398	case MSR_IA32_RTIT_OUTPUT_MASK:
2399	if (!pt_can_write_msr(vmx))
2400	return `1`;
2401	if (!intel_pt_validate_cap(caps: vmx->pt_desc.caps,
2402	cap: PT_CAP_topa_output) &&
2403	!intel_pt_validate_cap(caps: vmx->pt_desc.caps,
2404	cap: PT_CAP_single_range_output))
2405	return `1`;
2406	vmx->pt_desc.guest.output_mask = data;
2407	break;
2408	case MSR_IA32_RTIT_ADDR0_A ... MSR_IA32_RTIT_ADDR3_B:
2409	if (!pt_can_write_msr(vmx))
2410	return `1`;
2411	index = msr_info->index - MSR_IA32_RTIT_ADDR0_A;
2412	if (index >= `2` * vmx->pt_desc.num_address_ranges)
2413	return `1`;
2414	if (is_noncanonical_address(la: data, vcpu))
2415	return `1`;
2416	if (index % `2`)
2417	vmx->pt_desc.guest.addr_b[index / `2`] = data;
2418	else
2419	vmx->pt_desc.guest.addr_a[index / `2`] = data;
2420	break;
2421	case MSR_IA32_PERF_CAPABILITIES:
2422	if (data && !vcpu_to_pmu(vcpu)->version)
2423	return `1`;
2424	if (data & PMU_CAP_LBR_FMT) {
2425	if ((data & PMU_CAP_LBR_FMT) !=
2426	(kvm_caps.supported_perf_cap & PMU_CAP_LBR_FMT))
2427	return `1`;
2428	if (!cpuid_model_is_consistent(vcpu))
2429	return `1`;
2430	}
2431	if (data & PERF_CAP_PEBS_FORMAT) {
2432	if ((data & PERF_CAP_PEBS_MASK) !=
2433	(kvm_caps.supported_perf_cap & PERF_CAP_PEBS_MASK))
2434	return `1`;
2435	if (!guest_cpuid_has(vcpu, X86_FEATURE_DS))
2436	return `1`;
2437	if (!guest_cpuid_has(vcpu, X86_FEATURE_DTES64))
2438	return `1`;
2439	if (!cpuid_model_is_consistent(vcpu))
2440	return `1`;
2441	}
2442	ret = kvm_set_msr_common(vcpu, msr: msr_info);
2443	break;
2444
2445	default:
2446	find_uret_msr:
2447	msr = vmx_find_uret_msr(vmx, msr: msr_index);
2448	if (msr)
2449	ret = vmx_set_guest_uret_msr(vmx, msr, data);
2450	else
2451	ret = kvm_set_msr_common(vcpu, msr: msr_info);
2452	}
2453
2454	/ FB_CLEAR may have changed, also update the FB_CLEAR_DIS behavior /
2455	if (msr_index == MSR_IA32_ARCH_CAPABILITIES)
2456	vmx_update_fb_clear_dis(vcpu, vmx);
2457
2458	return ret;
2459	}
2460
2461	static void vmx_cache_reg(struct kvm_vcpu vcpu, enum* kvm_reg reg)
2462	{
2463	unsigned long guest_owned_bits;
2464
2465	kvm_register_mark_available(vcpu, reg);
2466
2467	switch (reg) {
2468	case VCPU_REGS_RSP:
2469	vcpu->arch.regs[VCPU_REGS_RSP] = vmcs_readl(field: GUEST_RSP);
2470	break;
2471	case VCPU_REGS_RIP:
2472	vcpu->arch.regs[VCPU_REGS_RIP] = vmcs_readl(field: GUEST_RIP);
2473	break;
2474	case VCPU_EXREG_PDPTR:
2475	if (enable_ept)
2476	ept_save_pdptrs(vcpu);
2477	break;
2478	case VCPU_EXREG_CR0:
2479	guest_owned_bits = vcpu->arch.cr0_guest_owned_bits;
2480
2481	vcpu->arch.cr0 &= ~guest_owned_bits;
2482	vcpu->arch.cr0 \|= vmcs_readl(field: GUEST_CR0) & guest_owned_bits;
2483	break;
2484	case VCPU_EXREG_CR3:
2485	/*
2486	* When intercepting CR3 loads, e.g. for shadowing paging, KVM's
2487	* CR3 is loaded into hardware, not the guest's CR3.
2488	*/
2489	if (!(exec_controls_get(vmx: to_vmx(vcpu)) & CPU_BASED_CR3_LOAD_EXITING))
2490	vcpu->arch.cr3 = vmcs_readl(field: GUEST_CR3);
2491	break;
2492	case VCPU_EXREG_CR4:
2493	guest_owned_bits = vcpu->arch.cr4_guest_owned_bits;
2494
2495	vcpu->arch.cr4 &= ~guest_owned_bits;
2496	vcpu->arch.cr4 \|= vmcs_readl(field: GUEST_CR4) & guest_owned_bits;
2497	break;
2498	default:
2499	KVM_BUG_ON(`1`, vcpu->kvm);
2500	break;
2501	}
2502	}
2503
2504	/*
2505	* There is no X86_FEATURE for SGX yet, but anyway we need to query CPUID
2506	* directly instead of going through cpu_has(), to ensure KVM is trapping
2507	* ENCLS whenever it's supported in hardware. It does not matter whether
2508	* the host OS supports or has enabled SGX.
2509	*/
2510	static bool cpu_has_sgx(void)
2511	{
2512	return cpuid_eax(op: `0`) >= `0x12` && (cpuid_eax(op: `0x12`) & BIT(`0`));
2513	}
2514
2515	/*
2516	* Some cpus support VM_{ENTRY,EXIT}_IA32_PERF_GLOBAL_CTRL but they
2517	* can't be used due to errata where VM Exit may incorrectly clear
2518	* IA32_PERF_GLOBAL_CTRL[34:32]. Work around the errata by using the
2519	* MSR load mechanism to switch IA32_PERF_GLOBAL_CTRL.
2520	*/
2521	static bool cpu_has_perf_global_ctrl_bug(void)
2522	{
2523	if (boot_cpu_data.x86 == `0x6`) {
2524	switch (boot_cpu_data.x86_model) {
2525	case INTEL_FAM6_NEHALEM_EP: / AAK155 /
2526	case INTEL_FAM6_NEHALEM: / AAP115 /
2527	case INTEL_FAM6_WESTMERE: / AAT100 /
2528	case INTEL_FAM6_WESTMERE_EP: / BC86,AAY89,BD102 /
2529	case INTEL_FAM6_NEHALEM_EX: / BA97 /
2530	return true;
2531	default:
2532	break;
2533	}
2534	}
2535
2536	return false;
2537	}
2538
2539	static int adjust_vmx_controls(u32 ctl_min, u32 ctl_opt, u32 msr, u32 *result)
2540	{
2541	u32 vmx_msr_low, vmx_msr_high;
2542	u32 ctl = ctl_min \| ctl_opt;
2543
2544	rdmsr(msr, vmx_msr_low, vmx_msr_high);
2545
2546	ctl &= vmx_msr_high; / bit == 0 in high word ==> must be zero /
2547	ctl \|= vmx_msr_low; / bit == 1 in low word ==> must be one /
2548
2549	/ Ensure minimum (required) set of control bits are supported. /
2550	if (ctl_min & ~ctl)
2551	return -EIO;
2552
2553	*result = ctl;
2554	return `0`;
2555	}
2556
2557	static u64 adjust_vmx_controls64(u64 ctl_opt, u32 msr)
2558	{
2559	u64 allowed;
2560
2561	rdmsrl(msr, allowed);
2562
2563	return ctl_opt & allowed;
2564	}
2565
2566	static int setup_vmcs_config(struct vmcs_config *vmcs_conf,
2567	struct vmx_capability *vmx_cap)
2568	{
2569	u32 vmx_msr_low, vmx_msr_high;
2570	u32 _pin_based_exec_control = `0`;
2571	u32 _cpu_based_exec_control = `0`;
2572	u32 _cpu_based_2nd_exec_control = `0`;
2573	u64 _cpu_based_3rd_exec_control = `0`;
2574	u32 _vmexit_control = `0`;
2575	u32 _vmentry_control = `0`;
2576	u64 misc_msr;
2577	int i;
2578
2579	/*
2580	* LOAD/SAVE_DEBUG_CONTROLS are absent because both are mandatory.
2581	* SAVE_IA32_PAT and SAVE_IA32_EFER are absent because KVM always
2582	* intercepts writes to PAT and EFER, i.e. never enables those controls.
2583	*/
2584	struct {
2585	u32 entry_control;
2586	u32 exit_control;
2587	} const vmcs_entry_exit_pairs[] = {
2588	{ VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL, VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL },
2589	{ VM_ENTRY_LOAD_IA32_PAT, VM_EXIT_LOAD_IA32_PAT },
2590	{ VM_ENTRY_LOAD_IA32_EFER, VM_EXIT_LOAD_IA32_EFER },
2591	{ VM_ENTRY_LOAD_BNDCFGS, VM_EXIT_CLEAR_BNDCFGS },
2592	{ VM_ENTRY_LOAD_IA32_RTIT_CTL, VM_EXIT_CLEAR_IA32_RTIT_CTL },
2593	};
2594
2595	memset(vmcs_conf, `0`, sizeof(*vmcs_conf));
2596
2597	if (adjust_vmx_controls(KVM_REQUIRED_VMX_CPU_BASED_VM_EXEC_CONTROL,
2598	KVM_OPTIONAL_VMX_CPU_BASED_VM_EXEC_CONTROL,
2599	MSR_IA32_VMX_PROCBASED_CTLS,
2600	result: &_cpu_based_exec_control))
2601	return -EIO;
2602	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS) {
2603	if (adjust_vmx_controls(KVM_REQUIRED_VMX_SECONDARY_VM_EXEC_CONTROL,
2604	KVM_OPTIONAL_VMX_SECONDARY_VM_EXEC_CONTROL,
2605	MSR_IA32_VMX_PROCBASED_CTLS2,
2606	result: &_cpu_based_2nd_exec_control))
2607	return -EIO;
2608	}
2609	#ifndef CONFIG_X86_64
2610	if (!(_cpu_based_2nd_exec_control &
2611	SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
2612	_cpu_based_exec_control &= ~CPU_BASED_TPR_SHADOW;
2613	#endif
2614
2615	if (!(_cpu_based_exec_control & CPU_BASED_TPR_SHADOW))
2616	_cpu_based_2nd_exec_control &= ~(
2617	SECONDARY_EXEC_APIC_REGISTER_VIRT \|
2618	SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE \|
2619	SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
2620
2621	rdmsr_safe(MSR_IA32_VMX_EPT_VPID_CAP,
2622	&vmx_cap->ept, &vmx_cap->vpid);
2623
2624	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_EPT) &&
2625	vmx_cap->ept) {
2626	pr_warn_once("EPT CAP should not exist if not support "
2627	"1-setting enable EPT VM-execution control\n");
2628
2629	if (error_on_inconsistent_vmcs_config)
2630	return -EIO;
2631
2632	vmx_cap->ept = `0`;
2633	}
2634	if (!(_cpu_based_2nd_exec_control & SECONDARY_EXEC_ENABLE_VPID) &&
2635	vmx_cap->vpid) {
2636	pr_warn_once("VPID CAP should not exist if not support "
2637	"1-setting enable VPID VM-execution control\n");
2638
2639	if (error_on_inconsistent_vmcs_config)
2640	return -EIO;
2641
2642	vmx_cap->vpid = `0`;
2643	}
2644
2645	if (!cpu_has_sgx())
2646	_cpu_based_2nd_exec_control &= ~SECONDARY_EXEC_ENCLS_EXITING;
2647
2648	if (_cpu_based_exec_control & CPU_BASED_ACTIVATE_TERTIARY_CONTROLS)
2649	_cpu_based_3rd_exec_control =
2650	adjust_vmx_controls64(KVM_OPTIONAL_VMX_TERTIARY_VM_EXEC_CONTROL,
2651	MSR_IA32_VMX_PROCBASED_CTLS3);
2652
2653	if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_EXIT_CONTROLS,
2654	KVM_OPTIONAL_VMX_VM_EXIT_CONTROLS,
2655	MSR_IA32_VMX_EXIT_CTLS,
2656	result: &_vmexit_control))
2657	return -EIO;
2658
2659	if (adjust_vmx_controls(KVM_REQUIRED_VMX_PIN_BASED_VM_EXEC_CONTROL,
2660	KVM_OPTIONAL_VMX_PIN_BASED_VM_EXEC_CONTROL,
2661	MSR_IA32_VMX_PINBASED_CTLS,
2662	result: &_pin_based_exec_control))
2663	return -EIO;
2664
2665	if (cpu_has_broken_vmx_preemption_timer())
2666	_pin_based_exec_control &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
2667	if (!(_cpu_based_2nd_exec_control &
2668	SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY))
2669	_pin_based_exec_control &= ~PIN_BASED_POSTED_INTR;
2670
2671	if (adjust_vmx_controls(KVM_REQUIRED_VMX_VM_ENTRY_CONTROLS,
2672	KVM_OPTIONAL_VMX_VM_ENTRY_CONTROLS,
2673	MSR_IA32_VMX_ENTRY_CTLS,
2674	result: &_vmentry_control))
2675	return -EIO;
2676
2677	for (i = `0`; i < ARRAY_SIZE(vmcs_entry_exit_pairs); i++) {
2678	u32 n_ctrl = vmcs_entry_exit_pairs[i].entry_control;
2679	u32 x_ctrl = vmcs_entry_exit_pairs[i].exit_control;
2680
2681	if (!(_vmentry_control & n_ctrl) == !(_vmexit_control & x_ctrl))
2682	continue;
2683
2684	pr_warn_once("Inconsistent VM-Entry/VM-Exit pair, entry = %x, exit = %x\n",
2685	_vmentry_control & n_ctrl, _vmexit_control & x_ctrl);
2686
2687	if (error_on_inconsistent_vmcs_config)
2688	return -EIO;
2689
2690	_vmentry_control &= ~n_ctrl;
2691	_vmexit_control &= ~x_ctrl;
2692	}
2693
2694	rdmsr(MSR_IA32_VMX_BASIC, vmx_msr_low, vmx_msr_high);
2695
2696	/ IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. /
2697	if ((vmx_msr_high & `0x1fff`) > PAGE_SIZE)
2698	return -EIO;
2699
2700	#ifdef CONFIG_X86_64
2701	/ IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. /
2702	if (vmx_msr_high & (`1u`<<`16`))
2703	return -EIO;
2704	#endif
2705
2706	/ Require Write-Back (WB) memory type for VMCS accesses. /
2707	if (((vmx_msr_high >> `18`) & `15`) != `6`)
2708	return -EIO;
2709
2710	rdmsrl(MSR_IA32_VMX_MISC, misc_msr);
2711
2712	vmcs_conf->size = vmx_msr_high & `0x1fff`;
2713	vmcs_conf->basic_cap = vmx_msr_high & ~`0x1fff`;
2714
2715	vmcs_conf->revision_id = vmx_msr_low;
2716
2717	vmcs_conf->pin_based_exec_ctrl = _pin_based_exec_control;
2718	vmcs_conf->cpu_based_exec_ctrl = _cpu_based_exec_control;
2719	vmcs_conf->cpu_based_2nd_exec_ctrl = _cpu_based_2nd_exec_control;
2720	vmcs_conf->cpu_based_3rd_exec_ctrl = _cpu_based_3rd_exec_control;
2721	vmcs_conf->vmexit_ctrl = _vmexit_control;
2722	vmcs_conf->vmentry_ctrl = _vmentry_control;
2723	vmcs_conf->misc = misc_msr;
2724
2725	#if IS_ENABLED(CONFIG_HYPERV)
2726	if (enlightened_vmcs)
2727	evmcs_sanitize_exec_ctrls(vmcs_conf);
2728	#endif
2729
2730	return `0`;
2731	}
2732
2733	static bool __kvm_is_vmx_supported(void)
2734	{
2735	int cpu = smp_processor_id();
2736
2737	if (!(cpuid_ecx(op: `1`) & feature_bit(VMX))) {
2738	pr_err("VMX not supported by CPU %d\n", cpu);
2739	return false;
2740	}
2741
2742	if (!this_cpu_has(X86_FEATURE_MSR_IA32_FEAT_CTL) \|\|
2743	!this_cpu_has(X86_FEATURE_VMX)) {
2744	pr_err("VMX not enabled (by BIOS) in MSR_IA32_FEAT_CTL on CPU %d\n", cpu);
2745	return false;
2746	}
2747
2748	return true;
2749	}
2750
2751	static bool kvm_is_vmx_supported(void)
2752	{
2753	bool supported;
2754
2755	migrate_disable();
2756	supported = __kvm_is_vmx_supported();
2757	migrate_enable();
2758
2759	return supported;
2760	}
2761
2762	static int vmx_check_processor_compat(void)
2763	{
2764	int cpu = raw_smp_processor_id();
2765	struct vmcs_config vmcs_conf;
2766	struct vmx_capability vmx_cap;
2767
2768	if (!__kvm_is_vmx_supported())
2769	return -EIO;
2770
2771	if (setup_vmcs_config(vmcs_conf: &vmcs_conf, vmx_cap: &vmx_cap) < `0`) {
2772	pr_err("Failed to setup VMCS config on CPU %d\n", cpu);
2773	return -EIO;
2774	}
2775	if (nested)
2776	nested_vmx_setup_ctls_msrs(vmcs_conf: &vmcs_conf, ept_caps: vmx_cap.ept);
2777	if (memcmp(p: &vmcs_config, q: &vmcs_conf, size: sizeof(struct vmcs_config))) {
2778	pr_err("Inconsistent VMCS config on CPU %d\n", cpu);
2779	return -EIO;
2780	}
2781	return `0`;
2782	}
2783
2784	static int kvm_cpu_vmxon(u64 vmxon_pointer)
2785	{
2786	u64 msr;
2787
2788	cr4_set_bits(X86_CR4_VMXE);
2789
2790	asm goto("1: vmxon %[vmxon_pointer]\n\t"
2791	_ASM_EXTABLE(`1b`, %l[fault])
2792	: : [vmxon_pointer] "m"(vmxon_pointer)
2793	: : fault);
2794	return `0`;
2795
2796	fault:
2797	WARN_ONCE(`1`, "VMXON faulted, MSR_IA32_FEAT_CTL (0x3a) = 0x%llx\n",
2798	rdmsrl_safe(MSR_IA32_FEAT_CTL, &msr) ? `0xdeadbeef` : msr);
2799	cr4_clear_bits(X86_CR4_VMXE);
2800
2801	return -EFAULT;
2802	}
2803
2804	static int vmx_hardware_enable(void)
2805	{
2806	int cpu = raw_smp_processor_id();
2807	u64 phys_addr = __pa(per_cpu(vmxarea, cpu));
2808	int r;
2809
2810	if (cr4_read_shadow() & X86_CR4_VMXE)
2811	return -EBUSY;
2812
2813	/*
2814	* This can happen if we hot-added a CPU but failed to allocate
2815	* VP assist page for it.
2816	*/
2817	if (kvm_is_using_evmcs() && !hv_get_vp_assist_page(cpu))
2818	return -EFAULT;
2819
2820	intel_pt_handle_vmx(on: `1`);
2821
2822	r = kvm_cpu_vmxon(vmxon_pointer: phys_addr);
2823	if (r) {
2824	intel_pt_handle_vmx(on: `0`);
2825	return r;
2826	}
2827
2828	if (enable_ept)
2829	ept_sync_global();
2830
2831	return `0`;
2832	}
2833
2834	static void vmclear_local_loaded_vmcss(void)
2835	{
2836	int cpu = raw_smp_processor_id();
2837	struct loaded_vmcs v, n;
2838
2839	list_for_each_entry_safe(v, n, &per_cpu(loaded_vmcss_on_cpu, cpu),
2840	loaded_vmcss_on_cpu_link)
2841	__loaded_vmcs_clear(arg: v);
2842	}
2843
/*
 * Disable VMX operation on the current CPU: clear all locally loaded VMCSs,
 * execute VMXOFF (reporting a spurious fault if that fails), reset the eVMCS
 * state and notify Intel PT that VMX is off.
 */
static void vmx_hardware_disable(void)
{
	int vmxoff_failed;

	vmclear_local_loaded_vmcss();

	vmxoff_failed = kvm_cpu_vmxoff();
	if (vmxoff_failed)
		kvm_spurious_fault();

	hv_reset_evmcs();

	intel_pt_handle_vmx(0);
}
2855
2856	struct vmcs alloc_vmcs_cpu(bool shadow, int* cpu, gfp_t flags)
2857	{
2858	int node = cpu_to_node(cpu);
2859	struct page *pages;
2860	struct vmcs *vmcs;
2861
2862	pages = __alloc_pages_node(nid: node, gfp_mask: flags, order: `0`);
2863	if (!pages)
2864	return NULL;
2865	vmcs = page_address(pages);
2866	memset(vmcs, `0`, vmcs_config.size);
2867
2868	/ KVM supports Enlightened VMCS v1 only /
2869	if (kvm_is_using_evmcs())
2870	vmcs->hdr.revision_id = KVM_EVMCS_VERSION;
2871	else
2872	vmcs->hdr.revision_id = vmcs_config.revision_id;
2873
2874	if (shadow)
2875	vmcs->hdr.shadow_vmcs = `1`;
2876	return vmcs;
2877	}
2878
/* Release the single page backing 'vmcs'. */
void free_vmcs(struct vmcs *vmcs)
{
	unsigned long addr = (unsigned long)vmcs;

	free_page(addr);
}
2883
2884	/*
2885	* Free a VMCS, but before that VMCLEAR it on the CPU where it was last loaded
2886	*/
2887	void free_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2888	{
2889	if (!loaded_vmcs->vmcs)
2890	return;
2891	loaded_vmcs_clear(loaded_vmcs);
2892	free_vmcs(vmcs: loaded_vmcs->vmcs);
2893	loaded_vmcs->vmcs = NULL;
2894	if (loaded_vmcs->msr_bitmap)
2895	free_page((unsigned long)loaded_vmcs->msr_bitmap);
2896	WARN_ON(loaded_vmcs->shadow_vmcs != NULL);
2897	}
2898
2899	int alloc_loaded_vmcs(struct loaded_vmcs *loaded_vmcs)
2900	{
2901	loaded_vmcs->vmcs = alloc_vmcs(shadow: false);
2902	if (!loaded_vmcs->vmcs)
2903	return -ENOMEM;
2904
2905	vmcs_clear(vmcs: loaded_vmcs->vmcs);
2906
2907	loaded_vmcs->shadow_vmcs = NULL;
2908	loaded_vmcs->hv_timer_soft_disabled = false;
2909	loaded_vmcs->cpu = -`1`;
2910	loaded_vmcs->launched = `0`;
2911
2912	if (cpu_has_vmx_msr_bitmap()) {
2913	loaded_vmcs->msr_bitmap = (unsigned long *)
2914	__get_free_page(GFP_KERNEL_ACCOUNT);
2915	if (!loaded_vmcs->msr_bitmap)
2916	goto out_vmcs;
2917	memset(loaded_vmcs->msr_bitmap, `0xff`, PAGE_SIZE);
2918	}
2919
2920	memset(&loaded_vmcs->host_state, `0`, sizeof(struct vmcs_host_state));
2921	memset(&loaded_vmcs->controls_shadow, `0`,
2922	sizeof(struct vmcs_controls_shadow));
2923
2924	return `0`;
2925
2926	out_vmcs:
2927	free_loaded_vmcs(loaded_vmcs);
2928	return -ENOMEM;
2929	}
2930
2931	static void free_kvm_area(void)
2932	{
2933	int cpu;
2934
2935	for_each_possible_cpu(cpu) {
2936	free_vmcs(per_cpu(vmxarea, cpu));
2937	per_cpu(vmxarea, cpu) = NULL;
2938	}
2939	}
2940
2941	static __init int alloc_kvm_area(void)
2942	{
2943	int cpu;
2944
2945	for_each_possible_cpu(cpu) {
2946	struct vmcs *vmcs;
2947
2948	vmcs = alloc_vmcs_cpu(shadow: false, cpu, GFP_KERNEL);
2949	if (!vmcs) {
2950	free_kvm_area();
2951	return -ENOMEM;
2952	}
2953
2954	/*
2955	* When eVMCS is enabled, alloc_vmcs_cpu() sets
2956	* vmcs->revision_id to KVM_EVMCS_VERSION instead of
2957	* revision_id reported by MSR_IA32_VMX_BASIC.
2958	*
2959	* However, even though not explicitly documented by
2960	* TLFS, VMXArea passed as VMXON argument should
2961	* still be marked with revision_id reported by
2962	* physical CPU.
2963	*/
2964	if (kvm_is_using_evmcs())
2965	vmcs->hdr.revision_id = vmcs_config.revision_id;
2966
2967	per_cpu(vmxarea, cpu) = vmcs;
2968	}
2969	return `0`;
2970	}
2971
2972	static void fix_pmode_seg(struct kvm_vcpu vcpu, int* seg,
2973	struct kvm_segment *save)
2974	{
2975	if (!emulate_invalid_guest_state) {
2976	/*
2977	* CS and SS RPL should be equal during guest entry according
2978	* to VMX spec, but in reality it is not always so. Since vcpu
2979	* is in the middle of the transition from real mode to
2980	* protected mode it is safe to assume that RPL 0 is a good
2981	* default value.
2982	*/
2983	if (seg == VCPU_SREG_CS \|\| seg == VCPU_SREG_SS)
2984	save->selector &= ~SEGMENT_RPL_MASK;
2985	save->dpl = save->selector & SEGMENT_RPL_MASK;
2986	save->s = `1`;
2987	}
2988	__vmx_set_segment(vcpu, var: save, seg);
2989	}
2990
2991	static void enter_pmode(struct kvm_vcpu *vcpu)
2992	{
2993	unsigned long flags;
2994	struct vcpu_vmx *vmx = to_vmx(vcpu);
2995
2996	/*
2997	* Update real mode segment cache. It may be not up-to-date if segment
2998	* register was written while vcpu was in a guest mode.
2999	*/
3000	vmx_get_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_ES], seg: VCPU_SREG_ES);
3001	vmx_get_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_DS], seg: VCPU_SREG_DS);
3002	vmx_get_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_FS], seg: VCPU_SREG_FS);
3003	vmx_get_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_GS], seg: VCPU_SREG_GS);
3004	vmx_get_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_SS], seg: VCPU_SREG_SS);
3005	vmx_get_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_CS], seg: VCPU_SREG_CS);
3006
3007	vmx->rmode.vm86_active = `0`;
3008
3009	__vmx_set_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_TR], seg: VCPU_SREG_TR);
3010
3011	flags = vmcs_readl(field: GUEST_RFLAGS);
3012	flags &= RMODE_GUEST_OWNED_EFLAGS_BITS;
3013	flags \|= vmx->rmode.save_rflags & ~RMODE_GUEST_OWNED_EFLAGS_BITS;
3014	vmcs_writel(field: GUEST_RFLAGS, value: flags);
3015
3016	vmcs_writel(field: GUEST_CR4, value: (vmcs_readl(field: GUEST_CR4) & ~X86_CR4_VME) \|
3017	(vmcs_readl(field: CR4_READ_SHADOW) & X86_CR4_VME));
3018
3019	vmx_update_exception_bitmap(vcpu);
3020
3021	fix_pmode_seg(vcpu, seg: VCPU_SREG_CS, save: &vmx->rmode.segs[VCPU_SREG_CS]);
3022	fix_pmode_seg(vcpu, seg: VCPU_SREG_SS, save: &vmx->rmode.segs[VCPU_SREG_SS]);
3023	fix_pmode_seg(vcpu, seg: VCPU_SREG_ES, save: &vmx->rmode.segs[VCPU_SREG_ES]);
3024	fix_pmode_seg(vcpu, seg: VCPU_SREG_DS, save: &vmx->rmode.segs[VCPU_SREG_DS]);
3025	fix_pmode_seg(vcpu, seg: VCPU_SREG_FS, save: &vmx->rmode.segs[VCPU_SREG_FS]);
3026	fix_pmode_seg(vcpu, seg: VCPU_SREG_GS, save: &vmx->rmode.segs[VCPU_SREG_GS]);
3027	}
3028
3029	static void fix_rmode_seg(int seg, struct kvm_segment *save)
3030	{
3031	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3032	struct kvm_segment var = *save;
3033
3034	var.dpl = `0x3`;
3035	if (seg == VCPU_SREG_CS)
3036	var.type = `0x3`;
3037
3038	if (!emulate_invalid_guest_state) {
3039	var.selector = var.base >> `4`;
3040	var.base = var.base & `0xffff0`;
3041	var.limit = `0xffff`;
3042	var.g = `0`;
3043	var.db = `0`;
3044	var.present = `1`;
3045	var.s = `1`;
3046	var.l = `0`;
3047	var.unusable = `0`;
3048	var.type = `0x3`;
3049	var.avl = `0`;
3050	if (save->base & `0xf`)
3051	pr_warn_once("segment base is not paragraph aligned "
3052	"when entering protected mode (seg=%d)", seg);
3053	}
3054
3055	vmcs_write16(field: sf->selector, value: var.selector);
3056	vmcs_writel(field: sf->base, value: var.base);
3057	vmcs_write32(field: sf->limit, value: var.limit);
3058	vmcs_write32(field: sf->ar_bytes, value: vmx_segment_access_rights(var: &var));
3059	}
3060
3061	static void enter_rmode(struct kvm_vcpu *vcpu)
3062	{
3063	unsigned long flags;
3064	struct vcpu_vmx *vmx = to_vmx(vcpu);
3065	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm: vcpu->kvm);
3066
3067	/*
3068	* KVM should never use VM86 to virtualize Real Mode when L2 is active,
3069	* as using VM86 is unnecessary if unrestricted guest is enabled, and
3070	* if unrestricted guest is disabled, VM-Enter (from L1) with CR0.PG=0
3071	* should VM-Fail and KVM should reject userspace attempts to stuff
3072	* CR0.PG=0 when L2 is active.
3073	*/
3074	WARN_ON_ONCE(is_guest_mode(vcpu));
3075
3076	vmx_get_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_TR], seg: VCPU_SREG_TR);
3077	vmx_get_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_ES], seg: VCPU_SREG_ES);
3078	vmx_get_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_DS], seg: VCPU_SREG_DS);
3079	vmx_get_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_FS], seg: VCPU_SREG_FS);
3080	vmx_get_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_GS], seg: VCPU_SREG_GS);
3081	vmx_get_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_SS], seg: VCPU_SREG_SS);
3082	vmx_get_segment(vcpu, var: &vmx->rmode.segs[VCPU_SREG_CS], seg: VCPU_SREG_CS);
3083
3084	vmx->rmode.vm86_active = `1`;
3085
3086	vmx_segment_cache_clear(vmx);
3087
3088	vmcs_writel(field: GUEST_TR_BASE, value: kvm_vmx->tss_addr);
3089	vmcs_write32(field: GUEST_TR_LIMIT, RMODE_TSS_SIZE - `1`);
3090	vmcs_write32(field: GUEST_TR_AR_BYTES, value: `0x008b`);
3091
3092	flags = vmcs_readl(field: GUEST_RFLAGS);
3093	vmx->rmode.save_rflags = flags;
3094
3095	flags \|= X86_EFLAGS_IOPL \| X86_EFLAGS_VM;
3096
3097	vmcs_writel(field: GUEST_RFLAGS, value: flags);
3098	vmcs_writel(field: GUEST_CR4, value: vmcs_readl(field: GUEST_CR4) \| X86_CR4_VME);
3099	vmx_update_exception_bitmap(vcpu);
3100
3101	fix_rmode_seg(seg: VCPU_SREG_SS, save: &vmx->rmode.segs[VCPU_SREG_SS]);
3102	fix_rmode_seg(seg: VCPU_SREG_CS, save: &vmx->rmode.segs[VCPU_SREG_CS]);
3103	fix_rmode_seg(seg: VCPU_SREG_ES, save: &vmx->rmode.segs[VCPU_SREG_ES]);
3104	fix_rmode_seg(seg: VCPU_SREG_DS, save: &vmx->rmode.segs[VCPU_SREG_DS]);
3105	fix_rmode_seg(seg: VCPU_SREG_GS, save: &vmx->rmode.segs[VCPU_SREG_GS]);
3106	fix_rmode_seg(seg: VCPU_SREG_FS, save: &vmx->rmode.segs[VCPU_SREG_FS]);
3107	}
3108
3109	int vmx_set_efer(struct kvm_vcpu *vcpu, u64 efer)
3110	{
3111	struct vcpu_vmx *vmx = to_vmx(vcpu);
3112
3113	/ Nothing to do if hardware doesn't support EFER. /
3114	if (!vmx_find_uret_msr(vmx, MSR_EFER))
3115	return `0`;
3116
3117	vcpu->arch.efer = efer;
3118	#ifdef CONFIG_X86_64
3119	if (efer & EFER_LMA)
3120	vm_entry_controls_setbit(vmx, VM_ENTRY_IA32E_MODE);
3121	else
3122	vm_entry_controls_clearbit(vmx, VM_ENTRY_IA32E_MODE);
3123	#else
3124	if (KVM_BUG_ON(efer & EFER_LMA, vcpu->kvm))
3125	return `1`;
3126	#endif
3127
3128	vmx_setup_uret_msrs(vmx);
3129	return `0`;
3130	}
3131
3132	#ifdef CONFIG_X86_64
3133
3134	static void enter_lmode(struct kvm_vcpu *vcpu)
3135	{
3136	u32 guest_tr_ar;
3137
3138	vmx_segment_cache_clear(vmx: to_vmx(vcpu));
3139
3140	guest_tr_ar = vmcs_read32(field: GUEST_TR_AR_BYTES);
3141	if ((guest_tr_ar & VMX_AR_TYPE_MASK) != VMX_AR_TYPE_BUSY_64_TSS) {
3142	pr_debug_ratelimited("%s: tss fixup for long mode. \n",
3143	__func__);
3144	vmcs_write32(field: GUEST_TR_AR_BYTES,
3145	value: (guest_tr_ar & ~VMX_AR_TYPE_MASK)
3146	\| VMX_AR_TYPE_BUSY_64_TSS);
3147	}
3148	vmx_set_efer(vcpu, efer: vcpu->arch.efer \| EFER_LMA);
3149	}
3150
3151	static void exit_lmode(struct kvm_vcpu *vcpu)
3152	{
3153	vmx_set_efer(vcpu, efer: vcpu->arch.efer & ~EFER_LMA);
3154	}
3155
3156	#endif
3157
3158	static void vmx_flush_tlb_all(struct kvm_vcpu *vcpu)
3159	{
3160	struct vcpu_vmx *vmx = to_vmx(vcpu);
3161
3162	/*
3163	* INVEPT must be issued when EPT is enabled, irrespective of VPID, as
3164	* the CPU is not required to invalidate guest-physical mappings on
3165	* VM-Entry, even if VPID is disabled. Guest-physical mappings are
3166	* associated with the root EPT structure and not any particular VPID
3167	* (INVVPID also isn't required to invalidate guest-physical mappings).
3168	*/
3169	if (enable_ept) {
3170	ept_sync_global();
3171	} else if (enable_vpid) {
3172	if (cpu_has_vmx_invvpid_global()) {
3173	vpid_sync_vcpu_global();
3174	} else {
3175	vpid_sync_vcpu_single(vpid: vmx->vpid);
3176	vpid_sync_vcpu_single(vpid: vmx->nested.vpid02);
3177	}
3178	}
3179	}
3180
3181	static inline int vmx_get_current_vpid(struct kvm_vcpu *vcpu)
3182	{
3183	if (is_guest_mode(vcpu))
3184	return nested_get_vpid02(vcpu);
3185	return to_vmx(vcpu)->vpid;
3186	}
3187
3188	static void vmx_flush_tlb_current(struct kvm_vcpu *vcpu)
3189	{
3190	struct kvm_mmu *mmu = vcpu->arch.mmu;
3191	u64 root_hpa = mmu->root.hpa;
3192
3193	/ No flush required if the current context is invalid. /
3194	if (!VALID_PAGE(root_hpa))
3195	return;
3196
3197	if (enable_ept)
3198	ept_sync_context(eptp: construct_eptp(vcpu, root_hpa,
3199	root_level: mmu->root_role.level));
3200	else
3201	vpid_sync_context(vpid: vmx_get_current_vpid(vcpu));
3202	}
3203
3204	static void vmx_flush_tlb_gva(struct kvm_vcpu *vcpu, gva_t addr)
3205	{
3206	/*
3207	* vpid_sync_vcpu_addr() is a nop if vpid==0, see the comment in
3208	* vmx_flush_tlb_guest() for an explanation of why this is ok.
3209	*/
3210	vpid_sync_vcpu_addr(vpid: vmx_get_current_vpid(vcpu), addr);
3211	}
3212
/* Flush the guest's linear mappings by syncing the current VPID context. */
static void vmx_flush_tlb_guest(struct kvm_vcpu *vcpu)
{
	/*
	 * vpid_sync_context() is a nop if vpid==0, e.g. if enable_vpid==0 or a
	 * vpid couldn't be allocated for this vCPU.  VM-Enter and VM-Exit are
	 * required to flush GVA->{G,H}PA mappings from the TLB if vpid is
	 * disabled (VM-Enter with vpid enabled and vpid==0 is disallowed),
	 * i.e. no explicit INVVPID is necessary.
	 */
	vpid_sync_context(vmx_get_current_vpid(vcpu));
}
3224
3225	void vmx_ept_load_pdptrs(struct kvm_vcpu *vcpu)
3226	{
3227	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3228
3229	if (!kvm_register_is_dirty(vcpu, reg: VCPU_EXREG_PDPTR))
3230	return;
3231
3232	if (is_pae_paging(vcpu)) {
3233	vmcs_write64(field: GUEST_PDPTR0, value: mmu->pdptrs[`0`]);
3234	vmcs_write64(field: GUEST_PDPTR1, value: mmu->pdptrs[`1`]);
3235	vmcs_write64(field: GUEST_PDPTR2, value: mmu->pdptrs[`2`]);
3236	vmcs_write64(field: GUEST_PDPTR3, value: mmu->pdptrs[`3`]);
3237	}
3238	}
3239
3240	void ept_save_pdptrs(struct kvm_vcpu *vcpu)
3241	{
3242	struct kvm_mmu *mmu = vcpu->arch.walk_mmu;
3243
3244	if (WARN_ON_ONCE(!is_pae_paging(vcpu)))
3245	return;
3246
3247	mmu->pdptrs[`0`] = vmcs_read64(field: GUEST_PDPTR0);
3248	mmu->pdptrs[`1`] = vmcs_read64(field: GUEST_PDPTR1);
3249	mmu->pdptrs[`2`] = vmcs_read64(field: GUEST_PDPTR2);
3250	mmu->pdptrs[`3`] = vmcs_read64(field: GUEST_PDPTR3);
3251
3252	kvm_register_mark_available(vcpu, reg: VCPU_EXREG_PDPTR);
3253	}
3254
/* Both CR3 interception controls (load and store), used as a single mask. */
#define CR3_EXITING_BITS (CPU_BASED_CR3_LOAD_EXITING | \
			  CPU_BASED_CR3_STORE_EXITING)
3257
3258	static bool vmx_is_valid_cr0(struct kvm_vcpu vcpu, unsigned* long cr0)
3259	{
3260	if (is_guest_mode(vcpu))
3261	return nested_guest_cr0_valid(vcpu, val: cr0);
3262
3263	if (to_vmx(vcpu)->nested.vmxon)
3264	return nested_host_cr0_valid(vcpu, val: cr0);
3265
3266	return true;
3267	}
3268
3269	void vmx_set_cr0(struct kvm_vcpu vcpu, unsigned* long cr0)
3270	{
3271	struct vcpu_vmx *vmx = to_vmx(vcpu);
3272	unsigned long hw_cr0, old_cr0_pg;
3273	u32 tmp;
3274
3275	old_cr0_pg = kvm_read_cr0_bits(vcpu, X86_CR0_PG);
3276
3277	hw_cr0 = (cr0 & ~KVM_VM_CR0_ALWAYS_OFF);
3278	if (enable_unrestricted_guest)
3279	hw_cr0 \|= KVM_VM_CR0_ALWAYS_ON_UNRESTRICTED_GUEST;
3280	else {
3281	hw_cr0 \|= KVM_VM_CR0_ALWAYS_ON;
3282	if (!enable_ept)
3283	hw_cr0 \|= X86_CR0_WP;
3284
3285	if (vmx->rmode.vm86_active && (cr0 & X86_CR0_PE))
3286	enter_pmode(vcpu);
3287
3288	if (!vmx->rmode.vm86_active && !(cr0 & X86_CR0_PE))
3289	enter_rmode(vcpu);
3290	}
3291
3292	vmcs_writel(field: CR0_READ_SHADOW, value: cr0);
3293	vmcs_writel(field: GUEST_CR0, value: hw_cr0);
3294	vcpu->arch.cr0 = cr0;
3295	kvm_register_mark_available(vcpu, reg: VCPU_EXREG_CR0);
3296
3297	#ifdef CONFIG_X86_64
3298	if (vcpu->arch.efer & EFER_LME) {
3299	if (!old_cr0_pg && (cr0 & X86_CR0_PG))
3300	enter_lmode(vcpu);
3301	else if (old_cr0_pg && !(cr0 & X86_CR0_PG))
3302	exit_lmode(vcpu);
3303	}
3304	#endif
3305
3306	if (enable_ept && !enable_unrestricted_guest) {
3307	/*
3308	* Ensure KVM has an up-to-date snapshot of the guest's CR3. If
3309	* the below code _enables_ CR3 exiting, vmx_cache_reg() will
3310	* (correctly) stop reading vmcs.GUEST_CR3 because it thinks
3311	* KVM's CR3 is installed.
3312	*/
3313	if (!kvm_register_is_available(vcpu, reg: VCPU_EXREG_CR3))
3314	vmx_cache_reg(vcpu, reg: VCPU_EXREG_CR3);
3315
3316	/*
3317	* When running with EPT but not unrestricted guest, KVM must
3318	* intercept CR3 accesses when paging is _disabled_. This is
3319	* necessary because restricted guests can't actually run with
3320	* paging disabled, and so KVM stuffs its own CR3 in order to
3321	* run the guest when identity mapped page tables.
3322	*
3323	* Do _NOT_ check the old CR0.PG, e.g. to optimize away the
3324	* update, it may be stale with respect to CR3 interception,
3325	* e.g. after nested VM-Enter.
3326	*
3327	* Lastly, honor L1's desires, i.e. intercept CR3 loads and/or
3328	* stores to forward them to L1, even if KVM does not need to
3329	* intercept them to preserve its identity mapped page tables.
3330	*/
3331	if (!(cr0 & X86_CR0_PG)) {
3332	exec_controls_setbit(vmx, CR3_EXITING_BITS);
3333	} else if (!is_guest_mode(vcpu)) {
3334	exec_controls_clearbit(vmx, CR3_EXITING_BITS);
3335	} else {
3336	tmp = exec_controls_get(vmx);
3337	tmp &= ~CR3_EXITING_BITS;
3338	tmp \|= get_vmcs12(vcpu)->cpu_based_vm_exec_control & CR3_EXITING_BITS;
3339	exec_controls_set(vmx, val: tmp);
3340	}
3341
3342	/ Note, vmx_set_cr4() consumes the new vcpu->arch.cr0. /
3343	if ((old_cr0_pg ^ cr0) & X86_CR0_PG)
3344	vmx_set_cr4(vcpu, cr4: kvm_read_cr4(vcpu));
3345
3346	/*
3347	* When !CR0_PG -> CR0_PG, vcpu->arch.cr3 becomes active, but
3348	* GUEST_CR3 is still vmx->ept_identity_map_addr if EPT + !URG.
3349	*/
3350	if (!(old_cr0_pg & X86_CR0_PG) && (cr0 & X86_CR0_PG))
3351	kvm_register_mark_dirty(vcpu, reg: VCPU_EXREG_CR3);
3352	}
3353
3354	/ depends on vcpu->arch.cr0 to be set to a new value /
3355	vmx->emulation_required = vmx_emulation_required(vcpu);
3356	}
3357
/* Return 5 if the CPU supports 5-level EPT, otherwise 4. */
static int vmx_get_max_ept_level(void)
{
	if (cpu_has_vmx_ept_5levels())
		return 5;
	return 4;
}
3364
3365	u64 construct_eptp(struct kvm_vcpu vcpu, hpa_t root_hpa, int* root_level)
3366	{
3367	u64 eptp = VMX_EPTP_MT_WB;
3368
3369	eptp \|= (root_level == `5`) ? VMX_EPTP_PWL_5 : VMX_EPTP_PWL_4;
3370
3371	if (enable_ept_ad_bits &&
3372	(!is_guest_mode(vcpu) \|\| nested_ept_ad_enabled(vcpu)))
3373	eptp \|= VMX_EPTP_AD_ENABLE_BIT;
3374	eptp \|= root_hpa;
3375
3376	return eptp;
3377	}
3378
3379	static void vmx_load_mmu_pgd(struct kvm_vcpu *vcpu, hpa_t root_hpa,
3380	int root_level)
3381	{
3382	struct kvm *kvm = vcpu->kvm;
3383	bool update_guest_cr3 = true;
3384	unsigned long guest_cr3;
3385	u64 eptp;
3386
3387	if (enable_ept) {
3388	eptp = construct_eptp(vcpu, root_hpa, root_level);
3389	vmcs_write64(field: EPT_POINTER, value: eptp);
3390
3391	hv_track_root_tdp(vcpu, root_hpa);
3392
3393	if (!enable_unrestricted_guest && !is_paging(vcpu))
3394	guest_cr3 = to_kvm_vmx(kvm)->ept_identity_map_addr;
3395	else if (kvm_register_is_dirty(vcpu, reg: VCPU_EXREG_CR3))
3396	guest_cr3 = vcpu->arch.cr3;
3397	else / vmcs.GUEST_CR3 is already up-to-date. /
3398	update_guest_cr3 = false;
3399	vmx_ept_load_pdptrs(vcpu);
3400	} else {
3401	guest_cr3 = root_hpa \| kvm_get_active_pcid(vcpu) \|
3402	kvm_get_active_cr3_lam_bits(vcpu);
3403	}
3404
3405	if (update_guest_cr3)
3406	vmcs_writel(field: GUEST_CR3, value: guest_cr3);
3407	}
3408
3409
3410	static bool vmx_is_valid_cr4(struct kvm_vcpu vcpu, unsigned* long cr4)
3411	{
3412	/*
3413	* We operate under the default treatment of SMM, so VMX cannot be
3414	* enabled under SMM. Note, whether or not VMXE is allowed at all,
3415	* i.e. is a reserved bit, is handled by common x86 code.
3416	*/
3417	if ((cr4 & X86_CR4_VMXE) && is_smm(vcpu))
3418	return false;
3419
3420	if (to_vmx(vcpu)->nested.vmxon && !nested_cr4_valid(vcpu, val: cr4))
3421	return false;
3422
3423	return true;
3424	}
3425
3426	void vmx_set_cr4(struct kvm_vcpu vcpu, unsigned* long cr4)
3427	{
3428	unsigned long old_cr4 = kvm_read_cr4(vcpu);
3429	struct vcpu_vmx *vmx = to_vmx(vcpu);
3430	unsigned long hw_cr4;
3431
3432	/*
3433	* Pass through host's Machine Check Enable value to hw_cr4, which
3434	* is in force while we are in guest mode. Do not let guests control
3435	* this bit, even if host CR4.MCE == 0.
3436	*/
3437	hw_cr4 = (cr4_read_shadow() & X86_CR4_MCE) \| (cr4 & ~X86_CR4_MCE);
3438	if (enable_unrestricted_guest)
3439	hw_cr4 \|= KVM_VM_CR4_ALWAYS_ON_UNRESTRICTED_GUEST;
3440	else if (vmx->rmode.vm86_active)
3441	hw_cr4 \|= KVM_RMODE_VM_CR4_ALWAYS_ON;
3442	else
3443	hw_cr4 \|= KVM_PMODE_VM_CR4_ALWAYS_ON;
3444
3445	if (vmx_umip_emulated()) {
3446	if (cr4 & X86_CR4_UMIP) {
3447	secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_DESC);
3448	hw_cr4 &= ~X86_CR4_UMIP;
3449	} else if (!is_guest_mode(vcpu) \|\|
3450	!nested_cpu_has2(vmcs12: get_vmcs12(vcpu), SECONDARY_EXEC_DESC)) {
3451	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_DESC);
3452	}
3453	}
3454
3455	vcpu->arch.cr4 = cr4;
3456	kvm_register_mark_available(vcpu, reg: VCPU_EXREG_CR4);
3457
3458	if (!enable_unrestricted_guest) {
3459	if (enable_ept) {
3460	if (!is_paging(vcpu)) {
3461	hw_cr4 &= ~X86_CR4_PAE;
3462	hw_cr4 \|= X86_CR4_PSE;
3463	} else if (!(cr4 & X86_CR4_PAE)) {
3464	hw_cr4 &= ~X86_CR4_PAE;
3465	}
3466	}
3467
3468	/*
3469	* SMEP/SMAP/PKU is disabled if CPU is in non-paging mode in
3470	* hardware. To emulate this behavior, SMEP/SMAP/PKU needs
3471	* to be manually disabled when guest switches to non-paging
3472	* mode.
3473	*
3474	* If !enable_unrestricted_guest, the CPU is always running
3475	* with CR0.PG=1 and CR4 needs to be modified.
3476	* If enable_unrestricted_guest, the CPU automatically
3477	* disables SMEP/SMAP/PKU when the guest sets CR0.PG=0.
3478	*/
3479	if (!is_paging(vcpu))
3480	hw_cr4 &= ~(X86_CR4_SMEP \| X86_CR4_SMAP \| X86_CR4_PKE);
3481	}
3482
3483	vmcs_writel(field: CR4_READ_SHADOW, value: cr4);
3484	vmcs_writel(field: GUEST_CR4, value: hw_cr4);
3485
3486	if ((cr4 ^ old_cr4) & (X86_CR4_OSXSAVE \| X86_CR4_PKE))
3487	kvm_update_cpuid_runtime(vcpu);
3488	}
3489
3490	void vmx_get_segment(struct kvm_vcpu vcpu, struct* kvm_segment var, int* seg)
3491	{
3492	struct vcpu_vmx *vmx = to_vmx(vcpu);
3493	u32 ar;
3494
3495	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3496	*var = vmx->rmode.segs[seg];
3497	if (seg == VCPU_SREG_TR
3498	\|\| var->selector == vmx_read_guest_seg_selector(vmx, seg))
3499	return;
3500	var->base = vmx_read_guest_seg_base(vmx, seg);
3501	var->selector = vmx_read_guest_seg_selector(vmx, seg);
3502	return;
3503	}
3504	var->base = vmx_read_guest_seg_base(vmx, seg);
3505	var->limit = vmx_read_guest_seg_limit(vmx, seg);
3506	var->selector = vmx_read_guest_seg_selector(vmx, seg);
3507	ar = vmx_read_guest_seg_ar(vmx, seg);
3508	var->unusable = (ar >> `16`) & `1`;
3509	var->type = ar & `15`;
3510	var->s = (ar >> `4`) & `1`;
3511	var->dpl = (ar >> `5`) & `3`;
3512	/*
3513	* Some userspaces do not preserve unusable property. Since usable
3514	* segment has to be present according to VMX spec we can use present
3515	* property to amend userspace bug by making unusable segment always
3516	* nonpresent. vmx_segment_access_rights() already marks nonpresent
3517	* segment as unusable.
3518	*/
3519	var->present = !var->unusable;
3520	var->avl = (ar >> `12`) & `1`;
3521	var->l = (ar >> `13`) & `1`;
3522	var->db = (ar >> `14`) & `1`;
3523	var->g = (ar >> `15`) & `1`;
3524	}
3525
3526	static u64 vmx_get_segment_base(struct kvm_vcpu vcpu, int* seg)
3527	{
3528	struct kvm_segment s;
3529
3530	if (to_vmx(vcpu)->rmode.vm86_active) {
3531	vmx_get_segment(vcpu, var: &s, seg);
3532	return s.base;
3533	}
3534	return vmx_read_guest_seg_base(vmx: to_vmx(vcpu), seg);
3535	}
3536
3537	int vmx_get_cpl(struct kvm_vcpu *vcpu)
3538	{
3539	struct vcpu_vmx *vmx = to_vmx(vcpu);
3540
3541	if (unlikely(vmx->rmode.vm86_active))
3542	return `0`;
3543	else {
3544	int ar = vmx_read_guest_seg_ar(vmx, seg: VCPU_SREG_SS);
3545	return VMX_AR_DPL(ar);
3546	}
3547	}
3548
3549	static u32 vmx_segment_access_rights(struct kvm_segment *var)
3550	{
3551	u32 ar;
3552
3553	ar = var->type & `15`;
3554	ar \|= (var->s & `1`) << `4`;
3555	ar \|= (var->dpl & `3`) << `5`;
3556	ar \|= (var->present & `1`) << `7`;
3557	ar \|= (var->avl & `1`) << `12`;
3558	ar \|= (var->l & `1`) << `13`;
3559	ar \|= (var->db & `1`) << `14`;
3560	ar \|= (var->g & `1`) << `15`;
3561	ar \|= (var->unusable \|\| !var->present) << `16`;
3562
3563	return ar;
3564	}
3565
3566	void __vmx_set_segment(struct kvm_vcpu vcpu, struct* kvm_segment var, int* seg)
3567	{
3568	struct vcpu_vmx *vmx = to_vmx(vcpu);
3569	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3570
3571	vmx_segment_cache_clear(vmx);
3572
3573	if (vmx->rmode.vm86_active && seg != VCPU_SREG_LDTR) {
3574	vmx->rmode.segs[seg] = *var;
3575	if (seg == VCPU_SREG_TR)
3576	vmcs_write16(field: sf->selector, value: var->selector);
3577	else if (var->s)
3578	fix_rmode_seg(seg, save: &vmx->rmode.segs[seg]);
3579	return;
3580	}
3581
3582	vmcs_writel(field: sf->base, value: var->base);
3583	vmcs_write32(field: sf->limit, value: var->limit);
3584	vmcs_write16(field: sf->selector, value: var->selector);
3585
3586	/*
3587	* Fix the "Accessed" bit in AR field of segment registers for older
3588	* qemu binaries.
3589	* IA32 arch specifies that at the time of processor reset the
3590	* "Accessed" bit in the AR field of segment registers is 1. And qemu
3591	* is setting it to 0 in the userland code. This causes invalid guest
3592	* state vmexit when "unrestricted guest" mode is turned on.
3593	* Fix for this setup issue in cpu_reset is being pushed in the qemu
3594	* tree. Newer qemu binaries with that qemu fix would not need this
3595	* kvm hack.
3596	*/
3597	if (is_unrestricted_guest(vcpu) && (seg != VCPU_SREG_LDTR))
3598	var->type \|= `0x1`; / Accessed /
3599
3600	vmcs_write32(field: sf->ar_bytes, value: vmx_segment_access_rights(var));
3601	}
3602
3603	static void vmx_set_segment(struct kvm_vcpu vcpu, struct* kvm_segment var, int* seg)
3604	{
3605	__vmx_set_segment(vcpu, var, seg);
3606
3607	to_vmx(vcpu)->emulation_required = vmx_emulation_required(vcpu);
3608	}
3609
3610	static void vmx_get_cs_db_l_bits(struct kvm_vcpu vcpu, int* db, int* *l)
3611	{
3612	u32 ar = vmx_read_guest_seg_ar(vmx: to_vmx(vcpu), seg: VCPU_SREG_CS);
3613
3614	*db = (ar >> `14`) & `1`;
3615	*l = (ar >> `13`) & `1`;
3616	}
3617
3618	static void vmx_get_idt(struct kvm_vcpu vcpu, struct* desc_ptr *dt)
3619	{
3620	dt->size = vmcs_read32(field: GUEST_IDTR_LIMIT);
3621	dt->address = vmcs_readl(field: GUEST_IDTR_BASE);
3622	}
3623
3624	static void vmx_set_idt(struct kvm_vcpu vcpu, struct* desc_ptr *dt)
3625	{
3626	vmcs_write32(field: GUEST_IDTR_LIMIT, value: dt->size);
3627	vmcs_writel(field: GUEST_IDTR_BASE, value: dt->address);
3628	}
3629
3630	static void vmx_get_gdt(struct kvm_vcpu vcpu, struct* desc_ptr *dt)
3631	{
3632	dt->size = vmcs_read32(field: GUEST_GDTR_LIMIT);
3633	dt->address = vmcs_readl(field: GUEST_GDTR_BASE);
3634	}
3635
3636	static void vmx_set_gdt(struct kvm_vcpu vcpu, struct* desc_ptr *dt)
3637	{
3638	vmcs_write32(field: GUEST_GDTR_LIMIT, value: dt->size);
3639	vmcs_writel(field: GUEST_GDTR_BASE, value: dt->address);
3640	}
3641
3642	static bool rmode_segment_valid(struct kvm_vcpu vcpu, int* seg)
3643	{
3644	struct kvm_segment var;
3645	u32 ar;
3646
3647	vmx_get_segment(vcpu, var: &var, seg);
3648	var.dpl = `0x3`;
3649	if (seg == VCPU_SREG_CS)
3650	var.type = `0x3`;
3651	ar = vmx_segment_access_rights(var: &var);
3652
3653	if (var.base != (var.selector << `4`))
3654	return false;
3655	if (var.limit != `0xffff`)
3656	return false;
3657	if (ar != `0xf3`)
3658	return false;
3659
3660	return true;
3661	}
3662
3663	static bool code_segment_valid(struct kvm_vcpu *vcpu)
3664	{
3665	struct kvm_segment cs;
3666	unsigned int cs_rpl;
3667
3668	vmx_get_segment(vcpu, var: &cs, seg: VCPU_SREG_CS);
3669	cs_rpl = cs.selector & SEGMENT_RPL_MASK;
3670
3671	if (cs.unusable)
3672	return false;
3673	if (~cs.type & (VMX_AR_TYPE_CODE_MASK\|VMX_AR_TYPE_ACCESSES_MASK))
3674	return false;
3675	if (!cs.s)
3676	return false;
3677	if (cs.type & VMX_AR_TYPE_WRITEABLE_MASK) {
3678	if (cs.dpl > cs_rpl)
3679	return false;
3680	} else {
3681	if (cs.dpl != cs_rpl)
3682	return false;
3683	}
3684	if (!cs.present)
3685	return false;
3686
3687	/ TODO: Add Reserved field check, this'll require a new member in the kvm_segment_field structure /
3688	return true;
3689	}
3690
3691	static bool stack_segment_valid(struct kvm_vcpu *vcpu)
3692	{
3693	struct kvm_segment ss;
3694	unsigned int ss_rpl;
3695
3696	vmx_get_segment(vcpu, var: &ss, seg: VCPU_SREG_SS);
3697	ss_rpl = ss.selector & SEGMENT_RPL_MASK;
3698
3699	if (ss.unusable)
3700	return true;
3701	if (ss.type != `3` && ss.type != `7`)
3702	return false;
3703	if (!ss.s)
3704	return false;
3705	if (ss.dpl != ss_rpl) / DPL != RPL /
3706	return false;
3707	if (!ss.present)
3708	return false;
3709
3710	return true;
3711	}
3712
3713	static bool data_segment_valid(struct kvm_vcpu vcpu, int* seg)
3714	{
3715	struct kvm_segment var;
3716	unsigned int rpl;
3717
3718	vmx_get_segment(vcpu, var: &var, seg);
3719	rpl = var.selector & SEGMENT_RPL_MASK;
3720
3721	if (var.unusable)
3722	return true;
3723	if (!var.s)
3724	return false;
3725	if (!var.present)
3726	return false;
3727	if (~var.type & (VMX_AR_TYPE_CODE_MASK\|VMX_AR_TYPE_WRITEABLE_MASK)) {
3728	if (var.dpl < rpl) / DPL < RPL /
3729	return false;
3730	}
3731
3732	/ TODO: Add other members to kvm_segment_field to allow checking for other access*
3733	* rights flags
3734	*/
3735	return true;
3736	}
3737
3738	static bool tr_valid(struct kvm_vcpu *vcpu)
3739	{
3740	struct kvm_segment tr;
3741
3742	vmx_get_segment(vcpu, var: &tr, seg: VCPU_SREG_TR);
3743
3744	if (tr.unusable)
3745	return false;
3746	if (tr.selector & SEGMENT_TI_MASK) / TI = 1 /
3747	return false;
3748	if (tr.type != `3` && tr.type != `11`) / TODO: Check if guest is in IA32e mode /
3749	return false;
3750	if (!tr.present)
3751	return false;
3752
3753	return true;
3754	}
3755
3756	static bool ldtr_valid(struct kvm_vcpu *vcpu)
3757	{
3758	struct kvm_segment ldtr;
3759
3760	vmx_get_segment(vcpu, var: &ldtr, seg: VCPU_SREG_LDTR);
3761
3762	if (ldtr.unusable)
3763	return true;
3764	if (ldtr.selector & SEGMENT_TI_MASK) / TI = 1 /
3765	return false;
3766	if (ldtr.type != `2`)
3767	return false;
3768	if (!ldtr.present)
3769	return false;
3770
3771	return true;
3772	}
3773
3774	static bool cs_ss_rpl_check(struct kvm_vcpu *vcpu)
3775	{
3776	struct kvm_segment cs, ss;
3777
3778	vmx_get_segment(vcpu, var: &cs, seg: VCPU_SREG_CS);
3779	vmx_get_segment(vcpu, var: &ss, seg: VCPU_SREG_SS);
3780
3781	return ((cs.selector & SEGMENT_RPL_MASK) ==
3782	(ss.selector & SEGMENT_RPL_MASK));
3783	}
3784
3785	/*
3786	* Check if guest state is valid. Returns true if valid, false if
3787	* not.
3788	* We assume that registers are always usable
3789	*/
3790	bool __vmx_guest_state_valid(struct kvm_vcpu *vcpu)
3791	{
3792	/ real mode guest state checks /
3793	if (!is_protmode(vcpu) \|\| (vmx_get_rflags(vcpu) & X86_EFLAGS_VM)) {
3794	if (!rmode_segment_valid(vcpu, seg: VCPU_SREG_CS))
3795	return false;
3796	if (!rmode_segment_valid(vcpu, seg: VCPU_SREG_SS))
3797	return false;
3798	if (!rmode_segment_valid(vcpu, seg: VCPU_SREG_DS))
3799	return false;
3800	if (!rmode_segment_valid(vcpu, seg: VCPU_SREG_ES))
3801	return false;
3802	if (!rmode_segment_valid(vcpu, seg: VCPU_SREG_FS))
3803	return false;
3804	if (!rmode_segment_valid(vcpu, seg: VCPU_SREG_GS))
3805	return false;
3806	} else {
3807	/ protected mode guest state checks /
3808	if (!cs_ss_rpl_check(vcpu))
3809	return false;
3810	if (!code_segment_valid(vcpu))
3811	return false;
3812	if (!stack_segment_valid(vcpu))
3813	return false;
3814	if (!data_segment_valid(vcpu, seg: VCPU_SREG_DS))
3815	return false;
3816	if (!data_segment_valid(vcpu, seg: VCPU_SREG_ES))
3817	return false;
3818	if (!data_segment_valid(vcpu, seg: VCPU_SREG_FS))
3819	return false;
3820	if (!data_segment_valid(vcpu, seg: VCPU_SREG_GS))
3821	return false;
3822	if (!tr_valid(vcpu))
3823	return false;
3824	if (!ldtr_valid(vcpu))
3825	return false;
3826	}
3827	/ TODO:*
3828	* - Add checks on RIP
3829	* - Add checks on RFLAGS
3830	*/
3831
3832	return true;
3833	}
3834
3835	static int init_rmode_tss(struct kvm kvm, void* __user *ua)
3836	{
3837	const void zero_page = (const* void *) __va(page_to_phys(ZERO_PAGE(`0`)));
3838	u16 data;
3839	int i;
3840
3841	for (i = `0`; i < `3`; i++) {
3842	if (__copy_to_user(to: ua + PAGE_SIZE * i, from: zero_page, PAGE_SIZE))
3843	return -EFAULT;
3844	}
3845
3846	data = TSS_BASE_SIZE + TSS_REDIRECTION_SIZE;
3847	if (__copy_to_user(to: ua + TSS_IOPB_BASE_OFFSET, from: &data, n: sizeof(u16)))
3848	return -EFAULT;
3849
3850	data = ~`0`;
3851	if (__copy_to_user(to: ua + RMODE_TSS_SIZE - `1`, from: &data, n: sizeof(u8)))
3852	return -EFAULT;
3853
3854	return `0`;
3855	}
3856
3857	static int init_rmode_identity_map(struct kvm *kvm)
3858	{
3859	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
3860	int i, r = `0`;
3861	void __user *uaddr;
3862	u32 tmp;
3863
3864	/ Protect kvm_vmx->ept_identity_pagetable_done. /
3865	mutex_lock(&kvm->slots_lock);
3866
3867	if (likely(kvm_vmx->ept_identity_pagetable_done))
3868	goto out;
3869
3870	if (!kvm_vmx->ept_identity_map_addr)
3871	kvm_vmx->ept_identity_map_addr = VMX_EPT_IDENTITY_PAGETABLE_ADDR;
3872
3873	uaddr = __x86_set_memory_region(kvm,
3874	IDENTITY_PAGETABLE_PRIVATE_MEMSLOT,
3875	gpa: kvm_vmx->ept_identity_map_addr,
3876	PAGE_SIZE);
3877	if (IS_ERR(ptr: uaddr)) {
3878	r = PTR_ERR(ptr: uaddr);
3879	goto out;
3880	}
3881
3882	/ Set up identity-mapping pagetable for EPT in real mode /
3883	for (i = `0`; i < (PAGE_SIZE / sizeof(tmp)); i++) {
3884	tmp = (i << `22`) + (_PAGE_PRESENT \| _PAGE_RW \| _PAGE_USER \|
3885	_PAGE_ACCESSED \| _PAGE_DIRTY \| _PAGE_PSE);
3886	if (__copy_to_user(to: uaddr + i * sizeof(tmp), from: &tmp, n: sizeof(tmp))) {
3887	r = -EFAULT;
3888	goto out;
3889	}
3890	}
3891	kvm_vmx->ept_identity_pagetable_done = true;
3892
3893	out:
3894	mutex_unlock(lock: &kvm->slots_lock);
3895	return r;
3896	}
3897
3898	static void seg_setup(int seg)
3899	{
3900	const struct kvm_vmx_segment_field *sf = &kvm_vmx_segment_fields[seg];
3901	unsigned int ar;
3902
3903	vmcs_write16(field: sf->selector, value: `0`);
3904	vmcs_writel(field: sf->base, value: `0`);
3905	vmcs_write32(field: sf->limit, value: `0xffff`);
3906	ar = `0x93`;
3907	if (seg == VCPU_SREG_CS)
3908	ar \|= `0x08`; / code segment /
3909
3910	vmcs_write32(field: sf->ar_bytes, value: ar);
3911	}
3912
3913	int allocate_vpid(void)
3914	{
3915	int vpid;
3916
3917	if (!enable_vpid)
3918	return `0`;
3919	spin_lock(lock: &vmx_vpid_lock);
3920	vpid = find_first_zero_bit(addr: vmx_vpid_bitmap, VMX_NR_VPIDS);
3921	if (vpid < VMX_NR_VPIDS)
3922	__set_bit(vpid, vmx_vpid_bitmap);
3923	else
3924	vpid = `0`;
3925	spin_unlock(lock: &vmx_vpid_lock);
3926	return vpid;
3927	}
3928
3929	void free_vpid(int vpid)
3930	{
3931	if (!enable_vpid \|\| vpid == `0`)
3932	return;
3933	spin_lock(lock: &vmx_vpid_lock);
3934	__clear_bit(vpid, vmx_vpid_bitmap);
3935	spin_unlock(lock: &vmx_vpid_lock);
3936	}
3937
3938	static void vmx_msr_bitmap_l01_changed(struct vcpu_vmx *vmx)
3939	{
3940	/*
3941	* When KVM is a nested hypervisor on top of Hyper-V and uses
3942	* 'Enlightened MSR Bitmap' feature L0 needs to know that MSR
3943	* bitmap has changed.
3944	*/
3945	if (kvm_is_using_evmcs()) {
3946	struct hv_enlightened_vmcs evmcs = (void* *)vmx->vmcs01.vmcs;
3947
3948	if (evmcs->hv_enlightenments_control.msr_bitmap)
3949	evmcs->hv_clean_fields &=
3950	~HV_VMX_ENLIGHTENED_CLEAN_FIELD_MSR_BITMAP;
3951	}
3952
3953	vmx->nested.force_msr_bitmap_recalc = true;
3954	}
3955
3956	void vmx_disable_intercept_for_msr(struct kvm_vcpu vcpu, u32 msr, int* type)
3957	{
3958	struct vcpu_vmx *vmx = to_vmx(vcpu);
3959	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
3960	int idx;
3961
3962	if (!cpu_has_vmx_msr_bitmap())
3963	return;
3964
3965	vmx_msr_bitmap_l01_changed(vmx);
3966
3967	/*
3968	* Mark the desired intercept state in shadow bitmap, this is needed
3969	* for resync when the MSR filters change.
3970	*/
3971	idx = vmx_get_passthrough_msr_slot(msr);
3972	if (idx >= `0`) {
3973	if (type & MSR_TYPE_R)
3974	clear_bit(nr: idx, addr: vmx->shadow_msr_intercept.read);
3975	if (type & MSR_TYPE_W)
3976	clear_bit(nr: idx, addr: vmx->shadow_msr_intercept.write);
3977	}
3978
3979	if ((type & MSR_TYPE_R) &&
3980	!kvm_msr_allowed(vcpu, index: msr, KVM_MSR_FILTER_READ)) {
3981	vmx_set_msr_bitmap_read(bitmap: msr_bitmap, msr);
3982	type &= ~MSR_TYPE_R;
3983	}
3984
3985	if ((type & MSR_TYPE_W) &&
3986	!kvm_msr_allowed(vcpu, index: msr, KVM_MSR_FILTER_WRITE)) {
3987	vmx_set_msr_bitmap_write(bitmap: msr_bitmap, msr);
3988	type &= ~MSR_TYPE_W;
3989	}
3990
3991	if (type & MSR_TYPE_R)
3992	vmx_clear_msr_bitmap_read(bitmap: msr_bitmap, msr);
3993
3994	if (type & MSR_TYPE_W)
3995	vmx_clear_msr_bitmap_write(bitmap: msr_bitmap, msr);
3996	}
3997
3998	void vmx_enable_intercept_for_msr(struct kvm_vcpu vcpu, u32 msr, int* type)
3999	{
4000	struct vcpu_vmx *vmx = to_vmx(vcpu);
4001	unsigned long *msr_bitmap = vmx->vmcs01.msr_bitmap;
4002	int idx;
4003
4004	if (!cpu_has_vmx_msr_bitmap())
4005	return;
4006
4007	vmx_msr_bitmap_l01_changed(vmx);
4008
4009	/*
4010	* Mark the desired intercept state in shadow bitmap, this is needed
4011	* for resync when the MSR filter changes.
4012	*/
4013	idx = vmx_get_passthrough_msr_slot(msr);
4014	if (idx >= `0`) {
4015	if (type & MSR_TYPE_R)
4016	set_bit(nr: idx, addr: vmx->shadow_msr_intercept.read);
4017	if (type & MSR_TYPE_W)
4018	set_bit(nr: idx, addr: vmx->shadow_msr_intercept.write);
4019	}
4020
4021	if (type & MSR_TYPE_R)
4022	vmx_set_msr_bitmap_read(bitmap: msr_bitmap, msr);
4023
4024	if (type & MSR_TYPE_W)
4025	vmx_set_msr_bitmap_write(bitmap: msr_bitmap, msr);
4026	}
4027
4028	static void vmx_update_msr_bitmap_x2apic(struct kvm_vcpu *vcpu)
4029	{
4030	/*
4031	* x2APIC indices for 64-bit accesses into the RDMSR and WRMSR halves
4032	* of the MSR bitmap. KVM emulates APIC registers up through 0x3f0,
4033	* i.e. MSR 0x83f, and so only needs to dynamically manipulate 64 bits.
4034	*/
4035	const int read_idx = APIC_BASE_MSR / BITS_PER_LONG_LONG;
4036	const int write_idx = read_idx + (`0x800` / sizeof(u64));
4037	struct vcpu_vmx *vmx = to_vmx(vcpu);
4038	u64 msr_bitmap = (u64 )vmx->vmcs01.msr_bitmap;
4039	u8 mode;
4040
4041	if (!cpu_has_vmx_msr_bitmap() \|\| WARN_ON_ONCE(!lapic_in_kernel(vcpu)))
4042	return;
4043
4044	if (cpu_has_secondary_exec_ctrls() &&
4045	(secondary_exec_controls_get(vmx) &
4046	SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE)) {
4047	mode = MSR_BITMAP_MODE_X2APIC;
4048	if (enable_apicv && kvm_vcpu_apicv_active(vcpu))
4049	mode \|= MSR_BITMAP_MODE_X2APIC_APICV;
4050	} else {
4051	mode = `0`;
4052	}
4053
4054	if (mode == vmx->x2apic_msr_bitmap_mode)
4055	return;
4056
4057	vmx->x2apic_msr_bitmap_mode = mode;
4058
4059	/*
4060	* Reset the bitmap for MSRs 0x800 - 0x83f. Leave AMD's uber-extended
4061	* registers (0x840 and above) intercepted, KVM doesn't support them.
4062	* Intercept all writes by default and poke holes as needed. Pass
4063	* through reads for all valid registers by default in x2APIC+APICv
4064	* mode, only the current timer count needs on-demand emulation by KVM.
4065	*/
4066	if (mode & MSR_BITMAP_MODE_X2APIC_APICV)
4067	msr_bitmap[read_idx] = ~kvm_lapic_readable_reg_mask(apic: vcpu->arch.apic);
4068	else
4069	msr_bitmap[read_idx] = ~`0ull`;
4070	msr_bitmap[write_idx] = ~`0ull`;
4071
4072	/*
4073	* TPR reads and writes can be virtualized even if virtual interrupt
4074	* delivery is not in use.
4075	*/
4076	vmx_set_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TASKPRI), MSR_TYPE_RW,
4077	value: !(mode & MSR_BITMAP_MODE_X2APIC));
4078
4079	if (mode & MSR_BITMAP_MODE_X2APIC_APICV) {
4080	vmx_enable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_TMCCT), MSR_TYPE_RW);
4081	vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_EOI), MSR_TYPE_W);
4082	vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_SELF_IPI), MSR_TYPE_W);
4083	if (enable_ipiv)
4084	vmx_disable_intercept_for_msr(vcpu, X2APIC_MSR(APIC_ICR), MSR_TYPE_RW);
4085	}
4086	}
4087
4088	void pt_update_intercept_for_msr(struct kvm_vcpu *vcpu)
4089	{
4090	struct vcpu_vmx *vmx = to_vmx(vcpu);
4091	bool flag = !(vmx->pt_desc.guest.ctl & RTIT_CTL_TRACEEN);
4092	u32 i;
4093
4094	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_STATUS, MSR_TYPE_RW, value: flag);
4095	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_BASE, MSR_TYPE_RW, value: flag);
4096	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_OUTPUT_MASK, MSR_TYPE_RW, value: flag);
4097	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_CR3_MATCH, MSR_TYPE_RW, value: flag);
4098	for (i = `0`; i < vmx->pt_desc.num_address_ranges; i++) {
4099	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_A + i * `2`, MSR_TYPE_RW, value: flag);
4100	vmx_set_intercept_for_msr(vcpu, MSR_IA32_RTIT_ADDR0_B + i * `2`, MSR_TYPE_RW, value: flag);
4101	}
4102	}
4103
4104	static bool vmx_guest_apic_has_interrupt(struct kvm_vcpu *vcpu)
4105	{
4106	struct vcpu_vmx *vmx = to_vmx(vcpu);
4107	void *vapic_page;
4108	u32 vppr;
4109	int rvi;
4110
4111	if (WARN_ON_ONCE(!is_guest_mode(vcpu)) \|\|
4112	!nested_cpu_has_vid(vmcs12: get_vmcs12(vcpu)) \|\|
4113	WARN_ON_ONCE(!vmx->nested.virtual_apic_map.gfn))
4114	return false;
4115
4116	rvi = vmx_get_rvi();
4117
4118	vapic_page = vmx->nested.virtual_apic_map.hva;
4119	vppr = ((u32 )(vapic_page + APIC_PROCPRI));
4120
4121	return ((rvi & `0xf0`) > (vppr & `0xf0`));
4122	}
4123
4124	static void vmx_msr_filter_changed(struct kvm_vcpu *vcpu)
4125	{
4126	struct vcpu_vmx *vmx = to_vmx(vcpu);
4127	u32 i;
4128
4129	if (!cpu_has_vmx_msr_bitmap())
4130	return;
4131
4132	/*
4133	* Redo intercept permissions for MSRs that KVM is passing through to
4134	* the guest. Disabling interception will check the new MSR filter and
4135	* ensure that KVM enables interception if usersepace wants to filter
4136	* the MSR. MSRs that KVM is already intercepting don't need to be
4137	* refreshed since KVM is going to intercept them regardless of what
4138	* userspace wants.
4139	*/
4140	for (i = `0`; i < ARRAY_SIZE(vmx_possible_passthrough_msrs); i++) {
4141	u32 msr = vmx_possible_passthrough_msrs[i];
4142
4143	if (!test_bit(i, vmx->shadow_msr_intercept.read))
4144	vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_R);
4145
4146	if (!test_bit(i, vmx->shadow_msr_intercept.write))
4147	vmx_disable_intercept_for_msr(vcpu, msr, MSR_TYPE_W);
4148	}
4149
4150	/ PT MSRs can be passed through iff PT is exposed to the guest. /
4151	if (vmx_pt_mode_is_host_guest())
4152	pt_update_intercept_for_msr(vcpu);
4153	}
4154
/*
 * Notify @vcpu that a posted interrupt is pending.
 *
 * On SMP builds, if the vCPU is IN_GUEST_MODE an IPI with vector @pi_vec is
 * sent to the CPU it runs on (unless it is the currently running vCPU) and
 * the function returns.  In all other cases the vCPU is woken up.
 */
static inline void kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu,
						     int pi_vec)
{
#ifdef CONFIG_SMP
	if (vcpu->mode == IN_GUEST_MODE) {
		/*
		 * The vector of the virtual interrupt has already been set in
		 * the PIR.  Send a notification event to deliver the virtual
		 * interrupt unless the vCPU is the currently running vCPU,
		 * i.e. the event is being sent from a fastpath VM-Exit
		 * handler, in which case the PIR will be synced to the vIRR
		 * before re-entering the guest.
		 *
		 * When the target is not the running vCPU, the following
		 * possibilities emerge:
		 *
		 * Case 1: vCPU stays in non-root mode. Sending a notification
		 * event posts the interrupt to the vCPU.
		 *
		 * Case 2: vCPU exits to root mode and is still runnable. The
		 * PIR will be synced to the vIRR before re-entering the guest.
		 * Sending a notification event is ok as the host IRQ handler
		 * will ignore the spurious event.
		 *
		 * Case 3: vCPU exits to root mode and is blocked. vcpu_block()
		 * has already synced PIR to vIRR and never blocks the vCPU if
		 * the vIRR is not empty. Therefore, a blocked vCPU here does
		 * not wait for any requested interrupts in PIR, and sending a
		 * notification event also results in a benign, spurious event.
		 */

		if (vcpu != kvm_get_running_vcpu())
			__apic_send_IPI_mask(get_cpu_mask(vcpu->cpu), pi_vec);
		return;
	}
#endif
	/*
	 * The vCPU isn't in the guest; wake the vCPU in case it is blocking,
	 * otherwise do nothing as KVM will grab the highest priority pending
	 * IRQ via ->sync_pir_to_irr() in vcpu_enter_guest().
	 */
	kvm_vcpu_wake_up(vcpu);
}
4198
4199	static int vmx_deliver_nested_posted_interrupt(struct kvm_vcpu *vcpu,
4200	int vector)
4201	{
4202	struct vcpu_vmx *vmx = to_vmx(vcpu);
4203
4204	if (is_guest_mode(vcpu) &&
4205	vector == vmx->nested.posted_intr_nv) {
4206	/*
4207	* If a posted intr is not recognized by hardware,
4208	* we will accomplish it in the next vmentry.
4209	*/
4210	vmx->nested.pi_pending = true;
4211	kvm_make_request(KVM_REQ_EVENT, vcpu);
4212
4213	/*
4214	* This pairs with the smp_mb_*() after setting vcpu->mode in
4215	* vcpu_enter_guest() to guarantee the vCPU sees the event
4216	* request if triggering a posted interrupt "fails" because
4217	* vcpu->mode != IN_GUEST_MODE. The extra barrier is needed as
4218	* the smb_wmb() in kvm_make_request() only ensures everything
4219	* done before making the request is visible when the request
4220	* is visible, it doesn't ensure ordering between the store to
4221	* vcpu->requests and the load from vcpu->mode.
4222	*/
4223	smp_mb__after_atomic();
4224
4225	/ the PIR and ON have been set by L1. /
4226	kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_NESTED_VECTOR);
4227	return `0`;
4228	}
4229	return -`1`;
4230	}
4231	/*
4232	* Send interrupt to vcpu via posted interrupt way.
4233	* 1. If target vcpu is running(non-root mode), send posted interrupt
4234	* notification to vcpu and hardware will sync PIR to vIRR atomically.
4235	* 2. If target vcpu isn't running(root mode), kick it to pick up the
4236	* interrupt from PIR in next vmentry.
4237	*/
4238	static int vmx_deliver_posted_interrupt(struct kvm_vcpu vcpu, int* vector)
4239	{
4240	struct vcpu_vmx *vmx = to_vmx(vcpu);
4241	int r;
4242
4243	r = vmx_deliver_nested_posted_interrupt(vcpu, vector);
4244	if (!r)
4245	return `0`;
4246
4247	/ Note, this is called iff the local APIC is in-kernel. /
4248	if (!vcpu->arch.apic->apicv_active)
4249	return -`1`;
4250
4251	if (pi_test_and_set_pir(vector, pi_desc: &vmx->pi_desc))
4252	return `0`;
4253
4254	/ If a previous notification has sent the IPI, nothing to do. /
4255	if (pi_test_and_set_on(pi_desc: &vmx->pi_desc))
4256	return `0`;
4257
4258	/*
4259	* The implied barrier in pi_test_and_set_on() pairs with the smp_mb_*()
4260	* after setting vcpu->mode in vcpu_enter_guest(), thus the vCPU is
4261	* guaranteed to see PID.ON=1 and sync the PIR to IRR if triggering a
4262	* posted interrupt "fails" because vcpu->mode != IN_GUEST_MODE.
4263	*/
4264	kvm_vcpu_trigger_posted_interrupt(vcpu, POSTED_INTR_VECTOR);
4265	return `0`;
4266	}
4267
4268	static void vmx_deliver_interrupt(struct kvm_lapic apic, int* delivery_mode,
4269	int trig_mode, int vector)
4270	{
4271	struct kvm_vcpu *vcpu = apic->vcpu;
4272
4273	if (vmx_deliver_posted_interrupt(vcpu, vector)) {
4274	kvm_lapic_set_irr(vec: vector, apic);
4275	kvm_make_request(KVM_REQ_EVENT, vcpu);
4276	kvm_vcpu_kick(vcpu);
4277	} else {
4278	trace_kvm_apicv_accept_irq(vcpu->vcpu_id, delivery_mode,
4279	trig_mode, vector);
4280	}
4281	}
4282
4283	/*
4284	* Set up the vmcs's constant host-state fields, i.e., host-state fields that
4285	* will not change in the lifetime of the guest.
4286	* Note that host-state that does change is set elsewhere. E.g., host-state
4287	* that is set differently for each CPU is set in vmx_vcpu_load(), not here.
4288	*/
4289	void vmx_set_constant_host_state(struct vcpu_vmx *vmx)
4290	{
4291	u32 low32, high32;
4292	unsigned long tmpl;
4293	unsigned long cr0, cr3, cr4;
4294
4295	cr0 = read_cr0();
4296	WARN_ON(cr0 & X86_CR0_TS);
4297	vmcs_writel(field: HOST_CR0, value: cr0); / 22.2.3 /
4298
4299	/*
4300	* Save the most likely value for this task's CR3 in the VMCS.
4301	* We can't use __get_current_cr3_fast() because we're not atomic.
4302	*/
4303	cr3 = __read_cr3();
4304	vmcs_writel(field: HOST_CR3, value: cr3); / 22.2.3 FIXME: shadow tables /
4305	vmx->loaded_vmcs->host_state.cr3 = cr3;
4306
4307	/ Save the most likely value for this task's CR4 in the VMCS. /
4308	cr4 = cr4_read_shadow();
4309	vmcs_writel(field: HOST_CR4, value: cr4); / 22.2.3, 22.2.5 /
4310	vmx->loaded_vmcs->host_state.cr4 = cr4;
4311
4312	vmcs_write16(field: HOST_CS_SELECTOR, __KERNEL_CS); / 22.2.4 /
4313	#ifdef CONFIG_X86_64
4314	/*
4315	* Load null selectors, so we can avoid reloading them in
4316	* vmx_prepare_switch_to_host(), in case userspace uses
4317	* the null selectors too (the expected case).
4318	*/
4319	vmcs_write16(field: HOST_DS_SELECTOR, value: `0`);
4320	vmcs_write16(field: HOST_ES_SELECTOR, value: `0`);
4321	#else
4322	vmcs_write16(HOST_DS_SELECTOR, __KERNEL_DS); / 22.2.4 /
4323	vmcs_write16(HOST_ES_SELECTOR, __KERNEL_DS); / 22.2.4 /
4324	#endif
4325	vmcs_write16(field: HOST_SS_SELECTOR, __KERNEL_DS); / 22.2.4 /
4326	vmcs_write16(field: HOST_TR_SELECTOR, GDT_ENTRY_TSS`8`); /* 22.2.4 /
4327
4328	vmcs_writel(field: HOST_IDTR_BASE, value: host_idt_base); / 22.2.4 /
4329
4330	vmcs_writel(field: HOST_RIP, value: (unsigned long)vmx_vmexit); / 22.2.5 /
4331
4332	rdmsr(MSR_IA32_SYSENTER_CS, low32, high32);
4333	vmcs_write32(field: HOST_IA32_SYSENTER_CS, value: low32);
4334
4335	/*
4336	* SYSENTER is used for 32-bit system calls on either 32-bit or
4337	* 64-bit kernels. It is always zero If neither is allowed, otherwise
4338	* vmx_vcpu_load_vmcs loads it with the per-CPU entry stack (and may
4339	* have already done so!).
4340	*/
4341	if (!IS_ENABLED(CONFIG_IA32_EMULATION) && !IS_ENABLED(CONFIG_X86_32))
4342	vmcs_writel(field: HOST_IA32_SYSENTER_ESP, value: `0`);
4343
4344	rdmsrl(MSR_IA32_SYSENTER_EIP, tmpl);
4345	vmcs_writel(field: HOST_IA32_SYSENTER_EIP, value: tmpl); / 22.2.3 /
4346
4347	if (vmcs_config.vmexit_ctrl & VM_EXIT_LOAD_IA32_PAT) {
4348	rdmsr(MSR_IA32_CR_PAT, low32, high32);
4349	vmcs_write64(field: HOST_IA32_PAT, value: low32 \| ((u64) high32 << `32`));
4350	}
4351
4352	if (cpu_has_load_ia32_efer())
4353	vmcs_write64(field: HOST_IA32_EFER, value: host_efer);
4354	}
4355
4356	void set_cr4_guest_host_mask(struct vcpu_vmx *vmx)
4357	{
4358	struct kvm_vcpu *vcpu = &vmx->vcpu;
4359
4360	vcpu->arch.cr4_guest_owned_bits = KVM_POSSIBLE_CR4_GUEST_BITS &
4361	~vcpu->arch.cr4_guest_rsvd_bits;
4362	if (!enable_ept) {
4363	vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_TLBFLUSH_BITS;
4364	vcpu->arch.cr4_guest_owned_bits &= ~X86_CR4_PDPTR_BITS;
4365	}
4366	if (is_guest_mode(vcpu: &vmx->vcpu))
4367	vcpu->arch.cr4_guest_owned_bits &=
4368	~get_vmcs12(vcpu)->cr4_guest_host_mask;
4369	vmcs_writel(field: CR4_GUEST_HOST_MASK, value: ~vcpu->arch.cr4_guest_owned_bits);
4370	}
4371
4372	static u32 vmx_pin_based_exec_ctrl(struct vcpu_vmx *vmx)
4373	{
4374	u32 pin_based_exec_ctrl = vmcs_config.pin_based_exec_ctrl;
4375
4376	if (!kvm_vcpu_apicv_active(vcpu: &vmx->vcpu))
4377	pin_based_exec_ctrl &= ~PIN_BASED_POSTED_INTR;
4378
4379	if (!enable_vnmi)
4380	pin_based_exec_ctrl &= ~PIN_BASED_VIRTUAL_NMIS;
4381
4382	if (!enable_preemption_timer)
4383	pin_based_exec_ctrl &= ~PIN_BASED_VMX_PREEMPTION_TIMER;
4384
4385	return pin_based_exec_ctrl;
4386	}
4387
4388	static u32 vmx_vmentry_ctrl(void)
4389	{
4390	u32 vmentry_ctrl = vmcs_config.vmentry_ctrl;
4391
4392	if (vmx_pt_mode_is_system())
4393	vmentry_ctrl &= ~(VM_ENTRY_PT_CONCEAL_PIP \|
4394	VM_ENTRY_LOAD_IA32_RTIT_CTL);
4395	/*
4396	* IA32e mode, and loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically.
4397	*/
4398	vmentry_ctrl &= ~(VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL \|
4399	VM_ENTRY_LOAD_IA32_EFER \|
4400	VM_ENTRY_IA32E_MODE);
4401
4402	if (cpu_has_perf_global_ctrl_bug())
4403	vmentry_ctrl &= ~VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL;
4404
4405	return vmentry_ctrl;
4406	}
4407
4408	static u32 vmx_vmexit_ctrl(void)
4409	{
4410	u32 vmexit_ctrl = vmcs_config.vmexit_ctrl;
4411
4412	/*
4413	* Not used by KVM and never set in vmcs01 or vmcs02, but emulated for
4414	* nested virtualization and thus allowed to be set in vmcs12.
4415	*/
4416	vmexit_ctrl &= ~(VM_EXIT_SAVE_IA32_PAT \| VM_EXIT_SAVE_IA32_EFER \|
4417	VM_EXIT_SAVE_VMX_PREEMPTION_TIMER);
4418
4419	if (vmx_pt_mode_is_system())
4420	vmexit_ctrl &= ~(VM_EXIT_PT_CONCEAL_PIP \|
4421	VM_EXIT_CLEAR_IA32_RTIT_CTL);
4422
4423	if (cpu_has_perf_global_ctrl_bug())
4424	vmexit_ctrl &= ~VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL;
4425
4426	/ Loading of EFER and PERF_GLOBAL_CTRL are toggled dynamically /
4427	return vmexit_ctrl &
4428	~(VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL \| VM_EXIT_LOAD_IA32_EFER);
4429	}
4430
4431	static void vmx_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu)
4432	{
4433	struct vcpu_vmx *vmx = to_vmx(vcpu);
4434
4435	if (is_guest_mode(vcpu)) {
4436	vmx->nested.update_vmcs01_apicv_status = true;
4437	return;
4438	}
4439
4440	pin_controls_set(vmx, val: vmx_pin_based_exec_ctrl(vmx));
4441
4442	if (kvm_vcpu_apicv_active(vcpu)) {
4443	secondary_exec_controls_setbit(vmx,
4444	SECONDARY_EXEC_APIC_REGISTER_VIRT \|
4445	SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4446	if (enable_ipiv)
4447	tertiary_exec_controls_setbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4448	} else {
4449	secondary_exec_controls_clearbit(vmx,
4450	SECONDARY_EXEC_APIC_REGISTER_VIRT \|
4451	SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4452	if (enable_ipiv)
4453	tertiary_exec_controls_clearbit(vmx, TERTIARY_EXEC_IPI_VIRT);
4454	}
4455
4456	vmx_update_msr_bitmap_x2apic(vcpu);
4457	}
4458
4459	static u32 vmx_exec_control(struct vcpu_vmx *vmx)
4460	{
4461	u32 exec_control = vmcs_config.cpu_based_exec_ctrl;
4462
4463	/*
4464	* Not used by KVM, but fully supported for nesting, i.e. are allowed in
4465	* vmcs12 and propagated to vmcs02 when set in vmcs12.
4466	*/
4467	exec_control &= ~(CPU_BASED_RDTSC_EXITING \|
4468	CPU_BASED_USE_IO_BITMAPS \|
4469	CPU_BASED_MONITOR_TRAP_FLAG \|
4470	CPU_BASED_PAUSE_EXITING);
4471
4472	/ INTR_WINDOW_EXITING and NMI_WINDOW_EXITING are toggled dynamically /
4473	exec_control &= ~(CPU_BASED_INTR_WINDOW_EXITING \|
4474	CPU_BASED_NMI_WINDOW_EXITING);
4475
4476	if (vmx->vcpu.arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT)
4477	exec_control &= ~CPU_BASED_MOV_DR_EXITING;
4478
4479	if (!cpu_need_tpr_shadow(vcpu: &vmx->vcpu))
4480	exec_control &= ~CPU_BASED_TPR_SHADOW;
4481
4482	#ifdef CONFIG_X86_64
4483	if (exec_control & CPU_BASED_TPR_SHADOW)
4484	exec_control &= ~(CPU_BASED_CR8_LOAD_EXITING \|
4485	CPU_BASED_CR8_STORE_EXITING);
4486	else
4487	exec_control \|= CPU_BASED_CR8_STORE_EXITING \|
4488	CPU_BASED_CR8_LOAD_EXITING;
4489	#endif
4490	/ No need to intercept CR3 access or INVPLG when using EPT. /
4491	if (enable_ept)
4492	exec_control &= ~(CPU_BASED_CR3_LOAD_EXITING \|
4493	CPU_BASED_CR3_STORE_EXITING \|
4494	CPU_BASED_INVLPG_EXITING);
4495	if (kvm_mwait_in_guest(kvm: vmx->vcpu.kvm))
4496	exec_control &= ~(CPU_BASED_MWAIT_EXITING \|
4497	CPU_BASED_MONITOR_EXITING);
4498	if (kvm_hlt_in_guest(kvm: vmx->vcpu.kvm))
4499	exec_control &= ~CPU_BASED_HLT_EXITING;
4500	return exec_control;
4501	}
4502
4503	static u64 vmx_tertiary_exec_control(struct vcpu_vmx *vmx)
4504	{
4505	u64 exec_control = vmcs_config.cpu_based_3rd_exec_ctrl;
4506
4507	/*
4508	* IPI virtualization relies on APICv. Disable IPI virtualization if
4509	* APICv is inhibited.
4510	*/
4511	if (!enable_ipiv \|\| !kvm_vcpu_apicv_active(vcpu: &vmx->vcpu))
4512	exec_control &= ~TERTIARY_EXEC_IPI_VIRT;
4513
4514	return exec_control;
4515	}
4516
4517	/*
4518	* Adjust a single secondary execution control bit to intercept/allow an
4519	* instruction in the guest. This is usually done based on whether or not a
4520	* feature has been exposed to the guest in order to correctly emulate faults.
4521	*/
4522	static inline void
4523	vmx_adjust_secondary_exec_control(struct vcpu_vmx vmx, u32 exec_control,
4524	u32 control, bool enabled, bool exiting)
4525	{
4526	/*
4527	* If the control is for an opt-in feature, clear the control if the
4528	* feature is not exposed to the guest, i.e. not enabled. If the
4529	* control is opt-out, i.e. an exiting control, clear the control if
4530	* the feature _is_ exposed to the guest, i.e. exiting/interception is
4531	* disabled for the associated instruction. Note, the caller is
4532	* responsible presetting exec_control to set all supported bits.
4533	*/
4534	if (enabled == exiting)
4535	*exec_control &= ~control;
4536
4537	/*
4538	* Update the nested MSR settings so that a nested VMM can/can't set
4539	* controls for features that are/aren't exposed to the guest.
4540	*/
4541	if (nested) {
4542	/*
4543	* All features that can be added or removed to VMX MSRs must
4544	* be supported in the first place for nested virtualization.
4545	*/
4546	if (WARN_ON_ONCE(!(vmcs_config.nested.secondary_ctls_high & control)))
4547	enabled = false;
4548
4549	if (enabled)
4550	vmx->nested.msrs.secondary_ctls_high \|= control;
4551	else
4552	vmx->nested.msrs.secondary_ctls_high &= ~control;
4553	}
4554	}
4555
/*
 * Wrapper macro for the common case of adjusting a secondary execution control
 * based on a single guest CPUID bit, with a dedicated feature bit.  This also
 * verifies that the control is actually supported by KVM and hardware.
 *
 * Governed features are queried with guest_can_use(), all others with
 * guest_cpuid_has().
 */
#define vmx_adjust_sec_exec_control(vmx, exec_control, name, feat_name, ctrl_name, exiting)	\
({												\
	struct kvm_vcpu *__vcpu = &(vmx)->vcpu;							\
	bool __enabled;										\
												\
	if (cpu_has_vmx_##name()) {								\
		if (kvm_is_governed_feature(X86_FEATURE_##feat_name))				\
			__enabled = guest_can_use(__vcpu, X86_FEATURE_##feat_name);		\
		else										\
			__enabled = guest_cpuid_has(__vcpu, X86_FEATURE_##feat_name);		\
		vmx_adjust_secondary_exec_control(vmx, exec_control, SECONDARY_EXEC_##ctrl_name,\
						  __enabled, exiting);				\
	}											\
})

/* More macro magic for ENABLE_/opt-in versus _EXITING/opt-out controls. */
#define vmx_adjust_sec_exec_feature(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, ENABLE_##uname, false)

#define vmx_adjust_sec_exec_exiting(vmx, exec_control, lname, uname) \
	vmx_adjust_sec_exec_control(vmx, exec_control, lname, uname, uname##_EXITING, true)
4582
4583	static u32 vmx_secondary_exec_control(struct vcpu_vmx *vmx)
4584	{
4585	struct kvm_vcpu *vcpu = &vmx->vcpu;
4586
4587	u32 exec_control = vmcs_config.cpu_based_2nd_exec_ctrl;
4588
4589	if (vmx_pt_mode_is_system())
4590	exec_control &= ~(SECONDARY_EXEC_PT_USE_GPA \| SECONDARY_EXEC_PT_CONCEAL_VMX);
4591	if (!cpu_need_virtualize_apic_accesses(vcpu))
4592	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
4593	if (vmx->vpid == `0`)
4594	exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
4595	if (!enable_ept) {
4596	exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
4597	enable_unrestricted_guest = `0`;
4598	}
4599	if (!enable_unrestricted_guest)
4600	exec_control &= ~SECONDARY_EXEC_UNRESTRICTED_GUEST;
4601	if (kvm_pause_in_guest(kvm: vmx->vcpu.kvm))
4602	exec_control &= ~SECONDARY_EXEC_PAUSE_LOOP_EXITING;
4603	if (!kvm_vcpu_apicv_active(vcpu))
4604	exec_control &= ~(SECONDARY_EXEC_APIC_REGISTER_VIRT \|
4605	SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
4606	exec_control &= ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
4607
4608	/*
4609	* KVM doesn't support VMFUNC for L1, but the control is set in KVM's
4610	* base configuration as KVM emulates VMFUNC[EPTP_SWITCHING] for L2.
4611	*/
4612	exec_control &= ~SECONDARY_EXEC_ENABLE_VMFUNC;
4613
4614	/ SECONDARY_EXEC_DESC is enabled/disabled on writes to CR4.UMIP,*
4615	* in vmx_set_cr4. */
4616	exec_control &= ~SECONDARY_EXEC_DESC;
4617
4618	/ SECONDARY_EXEC_SHADOW_VMCS is enabled when L1 executes VMPTRLD*
4619	(handle_vmptrld).
4620	We can NOT enable shadow_vmcs here because we don't have yet
4621	a current VMCS12
4622	*/
4623	exec_control &= ~SECONDARY_EXEC_SHADOW_VMCS;
4624
4625	/*
4626	* PML is enabled/disabled when dirty logging of memsmlots changes, but
4627	* it needs to be set here when dirty logging is already active, e.g.
4628	* if this vCPU was created after dirty logging was enabled.
4629	*/
4630	if (!enable_pml \|\| !atomic_read(v: &vcpu->kvm->nr_memslots_dirty_logging))
4631	exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
4632
4633	vmx_adjust_sec_exec_feature(vmx, &exec_control, xsaves, XSAVES);
4634
4635	/*
4636	* RDPID is also gated by ENABLE_RDTSCP, turn on the control if either
4637	* feature is exposed to the guest. This creates a virtualization hole
4638	* if both are supported in hardware but only one is exposed to the
4639	* guest, but letting the guest execute RDTSCP or RDPID when either one
4640	* is advertised is preferable to emulating the advertised instruction
4641	* in KVM on #UD, and obviously better than incorrectly injecting #UD.
4642	*/
4643	if (cpu_has_vmx_rdtscp()) {
4644	bool rdpid_or_rdtscp_enabled =
4645	guest_cpuid_has(vcpu, X86_FEATURE_RDTSCP) \|\|
4646	guest_cpuid_has(vcpu, X86_FEATURE_RDPID);
4647
4648	vmx_adjust_secondary_exec_control(vmx, exec_control: &exec_control,
4649	SECONDARY_EXEC_ENABLE_RDTSCP,
4650	enabled: rdpid_or_rdtscp_enabled, exiting: false);
4651	}
4652
4653	vmx_adjust_sec_exec_feature(vmx, &exec_control, invpcid, INVPCID);
4654
4655	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdrand, RDRAND);
4656	vmx_adjust_sec_exec_exiting(vmx, &exec_control, rdseed, RDSEED);
4657
4658	vmx_adjust_sec_exec_control(vmx, &exec_control, waitpkg, WAITPKG,
4659	ENABLE_USR_WAIT_PAUSE, false);
4660
4661	if (!vcpu->kvm->arch.bus_lock_detection_enabled)
4662	exec_control &= ~SECONDARY_EXEC_BUS_LOCK_DETECTION;
4663
4664	if (!kvm_notify_vmexit_enabled(kvm: vcpu->kvm))
4665	exec_control &= ~SECONDARY_EXEC_NOTIFY_VM_EXITING;
4666
4667	return exec_control;
4668	}
4669
4670	static inline int vmx_get_pid_table_order(struct kvm *kvm)
4671	{
4672	return get_order(size: kvm->arch.max_vcpu_ids * sizeof(*to_kvm_vmx(kvm)->pid_table));
4673	}
4674
4675	static int vmx_alloc_ipiv_pid_table(struct kvm *kvm)
4676	{
4677	struct page *pages;
4678	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4679
4680	if (!irqchip_in_kernel(kvm) \|\| !enable_ipiv)
4681	return `0`;
4682
4683	if (kvm_vmx->pid_table)
4684	return `0`;
4685
4686	pages = alloc_pages(GFP_KERNEL_ACCOUNT \| __GFP_ZERO,
4687	order: vmx_get_pid_table_order(kvm));
4688	if (!pages)
4689	return -ENOMEM;
4690
4691	kvm_vmx->pid_table = (void *)page_address(pages);
4692	return `0`;
4693	}
4694
/*
 * Pre-vCPU-creation hook: makes sure the IPI-virtualization PID table is
 * allocated.  Returns whatever vmx_alloc_ipiv_pid_table() returns.
 */
static int vmx_vcpu_precreate(struct kvm *kvm)
{
	return vmx_alloc_ipiv_pid_table(kvm);
}
4699
4700	#define VMX_XSS_EXIT_BITMAP 0
4701
4702	static void init_vmcs(struct vcpu_vmx *vmx)
4703	{
4704	struct kvm *kvm = vmx->vcpu.kvm;
4705	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
4706
4707	if (nested)
4708	nested_vmx_set_vmcs_shadowing_bitmap();
4709
4710	if (cpu_has_vmx_msr_bitmap())
4711	vmcs_write64(field: MSR_BITMAP, __pa(vmx->vmcs01.msr_bitmap));
4712
4713	vmcs_write64(field: VMCS_LINK_POINTER, INVALID_GPA); / 22.3.1.5 /
4714
4715	/ Control /
4716	pin_controls_set(vmx, val: vmx_pin_based_exec_ctrl(vmx));
4717
4718	exec_controls_set(vmx, val: vmx_exec_control(vmx));
4719
4720	if (cpu_has_secondary_exec_ctrls())
4721	secondary_exec_controls_set(vmx, val: vmx_secondary_exec_control(vmx));
4722
4723	if (cpu_has_tertiary_exec_ctrls())
4724	tertiary_exec_controls_set(vmx, val: vmx_tertiary_exec_control(vmx));
4725
4726	if (enable_apicv && lapic_in_kernel(vcpu: &vmx->vcpu)) {
4727	vmcs_write64(field: EOI_EXIT_BITMAP0, value: `0`);
4728	vmcs_write64(field: EOI_EXIT_BITMAP1, value: `0`);
4729	vmcs_write64(field: EOI_EXIT_BITMAP2, value: `0`);
4730	vmcs_write64(field: EOI_EXIT_BITMAP3, value: `0`);
4731
4732	vmcs_write16(field: GUEST_INTR_STATUS, value: `0`);
4733
4734	vmcs_write16(field: POSTED_INTR_NV, POSTED_INTR_VECTOR);
4735	vmcs_write64(field: POSTED_INTR_DESC_ADDR, __pa((&vmx->pi_desc)));
4736	}
4737
4738	if (vmx_can_use_ipiv(vcpu: &vmx->vcpu)) {
4739	vmcs_write64(field: PID_POINTER_TABLE, __pa(kvm_vmx->pid_table));
4740	vmcs_write16(field: LAST_PID_POINTER_INDEX, value: kvm->arch.max_vcpu_ids - `1`);
4741	}
4742
4743	if (!kvm_pause_in_guest(kvm)) {
4744	vmcs_write32(field: PLE_GAP, value: ple_gap);
4745	vmx->ple_window = ple_window;
4746	vmx->ple_window_dirty = true;
4747	}
4748
4749	if (kvm_notify_vmexit_enabled(kvm))
4750	vmcs_write32(field: NOTIFY_WINDOW, value: kvm->arch.notify_window);
4751
4752	vmcs_write32(field: PAGE_FAULT_ERROR_CODE_MASK, value: `0`);
4753	vmcs_write32(field: PAGE_FAULT_ERROR_CODE_MATCH, value: `0`);
4754	vmcs_write32(field: CR3_TARGET_COUNT, value: `0`); / 22.2.1 /
4755
4756	vmcs_write16(field: HOST_FS_SELECTOR, value: `0`); / 22.2.4 /
4757	vmcs_write16(field: HOST_GS_SELECTOR, value: `0`); / 22.2.4 /
4758	vmx_set_constant_host_state(vmx);
4759	vmcs_writel(field: HOST_FS_BASE, value: `0`); / 22.2.4 /
4760	vmcs_writel(field: HOST_GS_BASE, value: `0`); / 22.2.4 /
4761
4762	if (cpu_has_vmx_vmfunc())
4763	vmcs_write64(field: VM_FUNCTION_CONTROL, value: `0`);
4764
4765	vmcs_write32(field: VM_EXIT_MSR_STORE_COUNT, value: `0`);
4766	vmcs_write32(field: VM_EXIT_MSR_LOAD_COUNT, value: `0`);
4767	vmcs_write64(field: VM_EXIT_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.host.val));
4768	vmcs_write32(field: VM_ENTRY_MSR_LOAD_COUNT, value: `0`);
4769	vmcs_write64(field: VM_ENTRY_MSR_LOAD_ADDR, __pa(vmx->msr_autoload.guest.val));
4770
4771	if (vmcs_config.vmentry_ctrl & VM_ENTRY_LOAD_IA32_PAT)
4772	vmcs_write64(field: GUEST_IA32_PAT, value: vmx->vcpu.arch.pat);
4773
4774	vm_exit_controls_set(vmx, val: vmx_vmexit_ctrl());
4775
4776	/ 22.2.1, 20.8.1 /
4777	vm_entry_controls_set(vmx, val: vmx_vmentry_ctrl());
4778
4779	vmx->vcpu.arch.cr0_guest_owned_bits = vmx_l1_guest_owned_cr0_bits();
4780	vmcs_writel(field: CR0_GUEST_HOST_MASK, value: ~vmx->vcpu.arch.cr0_guest_owned_bits);
4781
4782	set_cr4_guest_host_mask(vmx);
4783
4784	if (vmx->vpid != `0`)
4785	vmcs_write16(field: VIRTUAL_PROCESSOR_ID, value: vmx->vpid);
4786
4787	if (cpu_has_vmx_xsaves())
4788	vmcs_write64(field: XSS_EXIT_BITMAP, VMX_XSS_EXIT_BITMAP);
4789
4790	if (enable_pml) {
4791	vmcs_write64(field: PML_ADDRESS, page_to_phys(vmx->pml_pg));
4792	vmcs_write16(field: GUEST_PML_INDEX, PML_ENTITY_NUM - `1`);
4793	}
4794
4795	vmx_write_encls_bitmap(vcpu: &vmx->vcpu, NULL);
4796
4797	if (vmx_pt_mode_is_host_guest()) {
4798	memset(&vmx->pt_desc, `0`, sizeof(vmx->pt_desc));
4799	/ Bit[6~0] are forced to 1, writes are ignored. /
4800	vmx->pt_desc.guest.output_mask = `0x7F`;
4801	vmcs_write64(field: GUEST_IA32_RTIT_CTL, value: `0`);
4802	}
4803
4804	vmcs_write32(field: GUEST_SYSENTER_CS, value: `0`);
4805	vmcs_writel(field: GUEST_SYSENTER_ESP, value: `0`);
4806	vmcs_writel(field: GUEST_SYSENTER_EIP, value: `0`);
4807	vmcs_write64(field: GUEST_IA32_DEBUGCTL, value: `0`);
4808
4809	if (cpu_has_vmx_tpr_shadow()) {
4810	vmcs_write64(field: VIRTUAL_APIC_PAGE_ADDR, value: `0`);
4811	if (cpu_need_tpr_shadow(vcpu: &vmx->vcpu))
4812	vmcs_write64(field: VIRTUAL_APIC_PAGE_ADDR,
4813	__pa(vmx->vcpu.arch.apic->regs));
4814	vmcs_write32(field: TPR_THRESHOLD, value: `0`);
4815	}
4816
4817	vmx_setup_uret_msrs(vmx);
4818	}
4819
4820	static void __vmx_vcpu_reset(struct kvm_vcpu *vcpu)
4821	{
4822	struct vcpu_vmx *vmx = to_vmx(vcpu);
4823
4824	init_vmcs(vmx);
4825
4826	if (nested)
4827	memcpy(&vmx->nested.msrs, &vmcs_config.nested, sizeof(vmx->nested.msrs));
4828
4829	vcpu_setup_sgx_lepubkeyhash(vcpu);
4830
4831	vmx->nested.posted_intr_nv = -`1`;
4832	vmx->nested.vmxon_ptr = INVALID_GPA;
4833	vmx->nested.current_vmptr = INVALID_GPA;
4834
4835	#ifdef CONFIG_KVM_HYPERV
4836	vmx->nested.hv_evmcs_vmptr = EVMPTR_INVALID;
4837	#endif
4838
4839	vcpu->arch.microcode_version = `0x100000000ULL`;
4840	vmx->msr_ia32_feature_control_valid_bits = FEAT_CTL_LOCKED;
4841
4842	/*
4843	* Enforce invariant: pi_desc.nv is always either POSTED_INTR_VECTOR
4844	* or POSTED_INTR_WAKEUP_VECTOR.
4845	*/
4846	vmx->pi_desc.nv = POSTED_INTR_VECTOR;
4847	vmx->pi_desc.sn = `1`;
4848	}
4849
4850	static void vmx_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event)
4851	{
4852	struct vcpu_vmx *vmx = to_vmx(vcpu);
4853
4854	if (!init_event)
4855	__vmx_vcpu_reset(vcpu);
4856
4857	vmx->rmode.vm86_active = `0`;
4858	vmx->spec_ctrl = `0`;
4859
4860	vmx->msr_ia32_umwait_control = `0`;
4861
4862	vmx->hv_deadline_tsc = -`1`;
4863	kvm_set_cr8(vcpu, cr8: `0`);
4864
4865	vmx_segment_cache_clear(vmx);
4866	kvm_register_mark_available(vcpu, reg: VCPU_EXREG_SEGMENTS);
4867
4868	seg_setup(seg: VCPU_SREG_CS);
4869	vmcs_write16(field: GUEST_CS_SELECTOR, value: `0xf000`);
4870	vmcs_writel(field: GUEST_CS_BASE, value: `0xffff0000ul`);
4871
4872	seg_setup(seg: VCPU_SREG_DS);
4873	seg_setup(seg: VCPU_SREG_ES);
4874	seg_setup(seg: VCPU_SREG_FS);
4875	seg_setup(seg: VCPU_SREG_GS);
4876	seg_setup(seg: VCPU_SREG_SS);
4877
4878	vmcs_write16(field: GUEST_TR_SELECTOR, value: `0`);
4879	vmcs_writel(field: GUEST_TR_BASE, value: `0`);
4880	vmcs_write32(field: GUEST_TR_LIMIT, value: `0xffff`);
4881	vmcs_write32(field: GUEST_TR_AR_BYTES, value: `0x008b`);
4882
4883	vmcs_write16(field: GUEST_LDTR_SELECTOR, value: `0`);
4884	vmcs_writel(field: GUEST_LDTR_BASE, value: `0`);
4885	vmcs_write32(field: GUEST_LDTR_LIMIT, value: `0xffff`);
4886	vmcs_write32(field: GUEST_LDTR_AR_BYTES, value: `0x00082`);
4887
4888	vmcs_writel(field: GUEST_GDTR_BASE, value: `0`);
4889	vmcs_write32(field: GUEST_GDTR_LIMIT, value: `0xffff`);
4890
4891	vmcs_writel(field: GUEST_IDTR_BASE, value: `0`);
4892	vmcs_write32(field: GUEST_IDTR_LIMIT, value: `0xffff`);
4893
4894	vmcs_write32(field: GUEST_ACTIVITY_STATE, GUEST_ACTIVITY_ACTIVE);
4895	vmcs_write32(field: GUEST_INTERRUPTIBILITY_INFO, value: `0`);
4896	vmcs_writel(field: GUEST_PENDING_DBG_EXCEPTIONS, value: `0`);
4897	if (kvm_mpx_supported())
4898	vmcs_write64(field: GUEST_BNDCFGS, value: `0`);
4899
4900	vmcs_write32(field: VM_ENTRY_INTR_INFO_FIELD, value: `0`); / 22.2.1 /
4901
4902	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
4903
4904	vpid_sync_context(vpid: vmx->vpid);
4905
4906	vmx_update_fb_clear_dis(vcpu, vmx);
4907	}
4908
4909	static void vmx_enable_irq_window(struct kvm_vcpu *vcpu)
4910	{
4911	exec_controls_setbit(vmx: to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
4912	}
4913
4914	static void vmx_enable_nmi_window(struct kvm_vcpu *vcpu)
4915	{
4916	if (!enable_vnmi \|\|
4917	vmcs_read32(field: GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_STI) {
4918	vmx_enable_irq_window(vcpu);
4919	return;
4920	}
4921
4922	exec_controls_setbit(vmx: to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
4923	}
4924
4925	static void vmx_inject_irq(struct kvm_vcpu *vcpu, bool reinjected)
4926	{
4927	struct vcpu_vmx *vmx = to_vmx(vcpu);
4928	uint32_t intr;
4929	int irq = vcpu->arch.interrupt.nr;
4930
4931	trace_kvm_inj_virq(irq, vcpu->arch.interrupt.soft, reinjected);
4932
4933	++vcpu->stat.irq_injections;
4934	if (vmx->rmode.vm86_active) {
4935	int inc_eip = `0`;
4936	if (vcpu->arch.interrupt.soft)
4937	inc_eip = vcpu->arch.event_exit_inst_len;
4938	kvm_inject_realmode_interrupt(vcpu, irq, inc_eip);
4939	return;
4940	}
4941	intr = irq \| INTR_INFO_VALID_MASK;
4942	if (vcpu->arch.interrupt.soft) {
4943	intr \|= INTR_TYPE_SOFT_INTR;
4944	vmcs_write32(field: VM_ENTRY_INSTRUCTION_LEN,
4945	value: vmx->vcpu.arch.event_exit_inst_len);
4946	} else
4947	intr \|= INTR_TYPE_EXT_INTR;
4948	vmcs_write32(field: VM_ENTRY_INTR_INFO_FIELD, value: intr);
4949
4950	vmx_clear_hlt(vcpu);
4951	}
4952
4953	static void vmx_inject_nmi(struct kvm_vcpu *vcpu)
4954	{
4955	struct vcpu_vmx *vmx = to_vmx(vcpu);
4956
4957	if (!enable_vnmi) {
4958	/*
4959	* Tracking the NMI-blocked state in software is built upon
4960	* finding the next open IRQ window. This, in turn, depends on
4961	* well-behaving guests: They have to keep IRQs disabled at
4962	* least as long as the NMI handler runs. Otherwise we may
4963	* cause NMI nesting, maybe breaking the guest. But as this is
4964	* highly unlikely, we can live with the residual risk.
4965	*/
4966	vmx->loaded_vmcs->soft_vnmi_blocked = `1`;
4967	vmx->loaded_vmcs->vnmi_blocked_time = `0`;
4968	}
4969
4970	++vcpu->stat.nmi_injections;
4971	vmx->loaded_vmcs->nmi_known_unmasked = false;
4972
4973	if (vmx->rmode.vm86_active) {
4974	kvm_inject_realmode_interrupt(vcpu, NMI_VECTOR, inc_eip: `0`);
4975	return;
4976	}
4977
4978	vmcs_write32(field: VM_ENTRY_INTR_INFO_FIELD,
4979	INTR_TYPE_NMI_INTR \| INTR_INFO_VALID_MASK \| NMI_VECTOR);
4980
4981	vmx_clear_hlt(vcpu);
4982	}
4983
4984	bool vmx_get_nmi_mask(struct kvm_vcpu *vcpu)
4985	{
4986	struct vcpu_vmx *vmx = to_vmx(vcpu);
4987	bool masked;
4988
4989	if (!enable_vnmi)
4990	return vmx->loaded_vmcs->soft_vnmi_blocked;
4991	if (vmx->loaded_vmcs->nmi_known_unmasked)
4992	return false;
4993	masked = vmcs_read32(field: GUEST_INTERRUPTIBILITY_INFO) & GUEST_INTR_STATE_NMI;
4994	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
4995	return masked;
4996	}
4997
4998	void vmx_set_nmi_mask(struct kvm_vcpu *vcpu, bool masked)
4999	{
5000	struct vcpu_vmx *vmx = to_vmx(vcpu);
5001
5002	if (!enable_vnmi) {
5003	if (vmx->loaded_vmcs->soft_vnmi_blocked != masked) {
5004	vmx->loaded_vmcs->soft_vnmi_blocked = masked;
5005	vmx->loaded_vmcs->vnmi_blocked_time = `0`;
5006	}
5007	} else {
5008	vmx->loaded_vmcs->nmi_known_unmasked = !masked;
5009	if (masked)
5010	vmcs_set_bits(field: GUEST_INTERRUPTIBILITY_INFO,
5011	GUEST_INTR_STATE_NMI);
5012	else
5013	vmcs_clear_bits(field: GUEST_INTERRUPTIBILITY_INFO,
5014	GUEST_INTR_STATE_NMI);
5015	}
5016	}
5017
5018	bool vmx_nmi_blocked(struct kvm_vcpu *vcpu)
5019	{
5020	if (is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5021	return false;
5022
5023	if (!enable_vnmi && to_vmx(vcpu)->loaded_vmcs->soft_vnmi_blocked)
5024	return true;
5025
5026	return (vmcs_read32(field: GUEST_INTERRUPTIBILITY_INFO) &
5027	(GUEST_INTR_STATE_MOV_SS \| GUEST_INTR_STATE_STI \|
5028	GUEST_INTR_STATE_NMI));
5029	}
5030
5031	static int vmx_nmi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5032	{
5033	if (to_vmx(vcpu)->nested.nested_run_pending)
5034	return -EBUSY;
5035
5036	/ An NMI must not be injected into L2 if it's supposed to VM-Exit. /
5037	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_nmi(vcpu))
5038	return -EBUSY;
5039
5040	return !vmx_nmi_blocked(vcpu);
5041	}
5042
5043	bool vmx_interrupt_blocked(struct kvm_vcpu *vcpu)
5044	{
5045	if (is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5046	return false;
5047
5048	return !(vmx_get_rflags(vcpu) & X86_EFLAGS_IF) \|\|
5049	(vmcs_read32(field: GUEST_INTERRUPTIBILITY_INFO) &
5050	(GUEST_INTR_STATE_STI \| GUEST_INTR_STATE_MOV_SS));
5051	}
5052
5053	static int vmx_interrupt_allowed(struct kvm_vcpu *vcpu, bool for_injection)
5054	{
5055	if (to_vmx(vcpu)->nested.nested_run_pending)
5056	return -EBUSY;
5057
5058	/*
5059	* An IRQ must not be injected into L2 if it's supposed to VM-Exit,
5060	* e.g. if the IRQ arrived asynchronously after checking nested events.
5061	*/
5062	if (for_injection && is_guest_mode(vcpu) && nested_exit_on_intr(vcpu))
5063	return -EBUSY;
5064
5065	return !vmx_interrupt_blocked(vcpu);
5066	}
5067
5068	static int vmx_set_tss_addr(struct kvm kvm, unsigned* int addr)
5069	{
5070	void __user *ret;
5071
5072	if (enable_unrestricted_guest)
5073	return `0`;
5074
5075	mutex_lock(&kvm->slots_lock);
5076	ret = __x86_set_memory_region(kvm, TSS_PRIVATE_MEMSLOT, gpa: addr,
5077	PAGE_SIZE * `3`);
5078	mutex_unlock(lock: &kvm->slots_lock);
5079
5080	if (IS_ERR(ptr: ret))
5081	return PTR_ERR(ptr: ret);
5082
5083	to_kvm_vmx(kvm)->tss_addr = addr;
5084
5085	return init_rmode_tss(kvm, ua: ret);
5086	}
5087
5088	static int vmx_set_identity_map_addr(struct kvm *kvm, u64 ident_addr)
5089	{
5090	to_kvm_vmx(kvm)->ept_identity_map_addr = ident_addr;
5091	return `0`;
5092	}
5093
5094	static bool rmode_exception(struct kvm_vcpu vcpu, int* vec)
5095	{
5096	switch (vec) {
5097	case BP_VECTOR:
5098	/*
5099	* Update instruction length as we may reinject the exception
5100	* from user space while in guest debugging mode.
5101	*/
5102	to_vmx(vcpu)->vcpu.arch.event_exit_inst_len =
5103	vmcs_read32(field: VM_EXIT_INSTRUCTION_LEN);
5104	if (vcpu->guest_debug & KVM_GUESTDBG_USE_SW_BP)
5105	return false;
5106	fallthrough;
5107	case DB_VECTOR:
5108	return !(vcpu->guest_debug &
5109	(KVM_GUESTDBG_SINGLESTEP \| KVM_GUESTDBG_USE_HW_BP));
5110	case DE_VECTOR:
5111	case OF_VECTOR:
5112	case BR_VECTOR:
5113	case UD_VECTOR:
5114	case DF_VECTOR:
5115	case SS_VECTOR:
5116	case GP_VECTOR:
5117	case MF_VECTOR:
5118	return true;
5119	}
5120	return false;
5121	}
5122
5123	static int handle_rmode_exception(struct kvm_vcpu *vcpu,
5124	int vec, u32 err_code)
5125	{
5126	/*
5127	* Instruction with address size override prefix opcode 0x67
5128	* Cause the #SS fault with 0 error code in VM86 mode.
5129	*/
5130	if (((vec == GP_VECTOR) \|\| (vec == SS_VECTOR)) && err_code == `0`) {
5131	if (kvm_emulate_instruction(vcpu, emulation_type: `0`)) {
5132	if (vcpu->arch.halt_request) {
5133	vcpu->arch.halt_request = `0`;
5134	return kvm_emulate_halt_noskip(vcpu);
5135	}
5136	return `1`;
5137	}
5138	return `0`;
5139	}
5140
5141	/*
5142	* Forward all other exceptions that are valid in real mode.
5143	* FIXME: Breaks guest debugging in real mode, needs to be fixed with
5144	* the required debugging infrastructure rework.
5145	*/
5146	kvm_queue_exception(vcpu, nr: vec);
5147	return `1`;
5148	}
5149
/*
 * Exit handler for machine checks.  Nothing is left to do here, so it only
 * reports "handled" by returning 1.
 */
static int handle_machine_check(struct kvm_vcpu *vcpu)
{
	/* handled by vmx_vcpu_run() */
	return 1;
}
5155
5156	/*
5157	* If the host has split lock detection disabled, then #AC is
5158	* unconditionally injected into the guest, which is the pre split lock
5159	* detection behaviour.
5160	*
5161	* If the host has split lock detection enabled then #AC is
5162	* only injected into the guest when:
5163	* - Guest CPL == 3 (user mode)
5164	* - Guest has #AC detection enabled in CR0
5165	* - Guest EFLAGS has AC bit set
5166	*/
5167	bool vmx_guest_inject_ac(struct kvm_vcpu *vcpu)
5168	{
5169	if (!boot_cpu_has(X86_FEATURE_SPLIT_LOCK_DETECT))
5170	return true;
5171
5172	return vmx_get_cpl(vcpu) == `3` && kvm_is_cr0_bit_set(vcpu, X86_CR0_AM) &&
5173	(kvm_get_rflags(vcpu) & X86_EFLAGS_AC);
5174	}
5175
5176	static int handle_exception_nmi(struct kvm_vcpu *vcpu)
5177	{
5178	struct vcpu_vmx *vmx = to_vmx(vcpu);
5179	struct kvm_run *kvm_run = vcpu->run;
5180	u32 intr_info, ex_no, error_code;
5181	unsigned long cr2, dr6;
5182	u32 vect_info;
5183
5184	vect_info = vmx->idt_vectoring_info;
5185	intr_info = vmx_get_intr_info(vcpu);
5186
5187	/*
5188	* Machine checks are handled by handle_exception_irqoff(), or by
5189	* vmx_vcpu_run() if a #MC occurs on VM-Entry. NMIs are handled by
5190	* vmx_vcpu_enter_exit().
5191	*/
5192	if (is_machine_check(intr_info) \|\| is_nmi(intr_info))
5193	return `1`;
5194
5195	/*
5196	* Queue the exception here instead of in handle_nm_fault_irqoff().
5197	* This ensures the nested_vmx check is not skipped so vmexit can
5198	* be reflected to L1 (when it intercepts #NM) before reaching this
5199	* point.
5200	*/
5201	if (is_nm_fault(intr_info)) {
5202	kvm_queue_exception(vcpu, NM_VECTOR);
5203	return `1`;
5204	}
5205
5206	if (is_invalid_opcode(intr_info))
5207	return handle_ud(vcpu);
5208
5209	error_code = `0`;
5210	if (intr_info & INTR_INFO_DELIVER_CODE_MASK)
5211	error_code = vmcs_read32(field: VM_EXIT_INTR_ERROR_CODE);
5212
5213	if (!vmx->rmode.vm86_active && is_gp_fault(intr_info)) {
5214	WARN_ON_ONCE(!enable_vmware_backdoor);
5215
5216	/*
5217	* VMware backdoor emulation on #GP interception only handles
5218	* IN{S}, OUT{S}, and RDPMC, none of which generate a non-zero
5219	* error code on #GP.
5220	*/
5221	if (error_code) {
5222	kvm_queue_exception_e(vcpu, GP_VECTOR, error_code);
5223	return `1`;
5224	}
5225	return kvm_emulate_instruction(vcpu, EMULTYPE_VMWARE_GP);
5226	}
5227
5228	/*
5229	* The #PF with PFEC.RSVD = 1 indicates the guest is accessing
5230	* MMIO, it is better to report an internal error.
5231	* See the comments in vmx_handle_exit.
5232	*/
5233	if ((vect_info & VECTORING_INFO_VALID_MASK) &&
5234	!(is_page_fault(intr_info) && !(error_code & PFERR_RSVD_MASK))) {
5235	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
5236	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_SIMUL_EX;
5237	vcpu->run->internal.ndata = `4`;
5238	vcpu->run->internal.data[`0`] = vect_info;
5239	vcpu->run->internal.data[`1`] = intr_info;
5240	vcpu->run->internal.data[`2`] = error_code;
5241	vcpu->run->internal.data[`3`] = vcpu->arch.last_vmentry_cpu;
5242	return `0`;
5243	}
5244
5245	if (is_page_fault(intr_info)) {
5246	cr2 = vmx_get_exit_qual(vcpu);
5247	if (enable_ept && !vcpu->arch.apf.host_apf_flags) {
5248	/*
5249	* EPT will cause page fault only if we need to
5250	* detect illegal GPAs.
5251	*/
5252	WARN_ON_ONCE(!allow_smaller_maxphyaddr);
5253	kvm_fixup_and_inject_pf_error(vcpu, gva: cr2, error_code);
5254	return `1`;
5255	} else
5256	return kvm_handle_page_fault(vcpu, error_code, fault_address: cr2, NULL, insn_len: `0`);
5257	}
5258
5259	ex_no = intr_info & INTR_INFO_VECTOR_MASK;
5260
5261	if (vmx->rmode.vm86_active && rmode_exception(vcpu, vec: ex_no))
5262	return handle_rmode_exception(vcpu, vec: ex_no, err_code: error_code);
5263
5264	switch (ex_no) {
5265	case DB_VECTOR:
5266	dr6 = vmx_get_exit_qual(vcpu);
5267	if (!(vcpu->guest_debug &
5268	(KVM_GUESTDBG_SINGLESTEP \| KVM_GUESTDBG_USE_HW_BP))) {
5269	/*
5270	* If the #DB was due to ICEBP, a.k.a. INT1, skip the
5271	* instruction. ICEBP generates a trap-like #DB, but
5272	* despite its interception control being tied to #DB,
5273	* is an instruction intercept, i.e. the VM-Exit occurs
5274	* on the ICEBP itself. Use the inner "skip" helper to
5275	* avoid single-step #DB and MTF updates, as ICEBP is
5276	* higher priority. Note, skipping ICEBP still clears
5277	* STI and MOVSS blocking.
5278	*
5279	* For all other #DBs, set vmcs.PENDING_DBG_EXCEPTIONS.BS
5280	* if single-step is enabled in RFLAGS and STI or MOVSS
5281	* blocking is active, as the CPU doesn't set the bit
5282	* on VM-Exit due to #DB interception. VM-Entry has a
5283	* consistency check that a single-step #DB is pending
5284	* in this scenario as the previous instruction cannot
5285	* have toggled RFLAGS.TF 0=>1 (because STI and POP/MOV
5286	* don't modify RFLAGS), therefore the one instruction
5287	* delay when activating single-step breakpoints must
5288	* have already expired. Note, the CPU sets/clears BS
5289	* as appropriate for all other VM-Exits types.
5290	*/
5291	if (is_icebp(intr_info))
5292	WARN_ON(!skip_emulated_instruction(vcpu));
5293	else if ((vmx_get_rflags(vcpu) & X86_EFLAGS_TF) &&
5294	(vmcs_read32(field: GUEST_INTERRUPTIBILITY_INFO) &
5295	(GUEST_INTR_STATE_STI \| GUEST_INTR_STATE_MOV_SS)))
5296	vmcs_writel(field: GUEST_PENDING_DBG_EXCEPTIONS,
5297	value: vmcs_readl(field: GUEST_PENDING_DBG_EXCEPTIONS) \| DR6_BS);
5298
5299	kvm_queue_exception_p(vcpu, DB_VECTOR, payload: dr6);
5300	return `1`;
5301	}
5302	kvm_run->debug.arch.dr6 = dr6 \| DR6_ACTIVE_LOW;
5303	kvm_run->debug.arch.dr7 = vmcs_readl(field: GUEST_DR7);
5304	fallthrough;
5305	case BP_VECTOR:
5306	/*
5307	* Update instruction length as we may reinject #BP from
5308	* user space while in guest debugging mode. Reading it for
5309	* #DB as well causes no harm, it is not used in that case.
5310	*/
5311	vmx->vcpu.arch.event_exit_inst_len =
5312	vmcs_read32(field: VM_EXIT_INSTRUCTION_LEN);
5313	kvm_run->exit_reason = KVM_EXIT_DEBUG;
5314	kvm_run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5315	kvm_run->debug.arch.exception = ex_no;
5316	break;
5317	case AC_VECTOR:
5318	if (vmx_guest_inject_ac(vcpu)) {
5319	kvm_queue_exception_e(vcpu, AC_VECTOR, error_code);
5320	return `1`;
5321	}
5322
5323	/*
5324	* Handle split lock. Depending on detection mode this will
5325	* either warn and disable split lock detection for this
5326	* task or force SIGBUS on it.
5327	*/
5328	if (handle_guest_split_lock(ip: kvm_rip_read(vcpu)))
5329	return `1`;
5330	fallthrough;
5331	default:
5332	kvm_run->exit_reason = KVM_EXIT_EXCEPTION;
5333	kvm_run->ex.exception = ex_no;
5334	kvm_run->ex.error_code = error_code;
5335	break;
5336	}
5337	return `0`;
5338	}
5339
5340	static __always_inline int handle_external_interrupt(struct kvm_vcpu *vcpu)
5341	{
5342	++vcpu->stat.irq_exits;
5343	return `1`;
5344	}
5345
5346	static int handle_triple_fault(struct kvm_vcpu *vcpu)
5347	{
5348	vcpu->run->exit_reason = KVM_EXIT_SHUTDOWN;
5349	vcpu->mmio_needed = `0`;
5350	return `0`;
5351	}
5352
5353	static int handle_io(struct kvm_vcpu *vcpu)
5354	{
5355	unsigned long exit_qualification;
5356	int size, in, string;
5357	unsigned port;
5358
5359	exit_qualification = vmx_get_exit_qual(vcpu);
5360	string = (exit_qualification & `16`) != `0`;
5361
5362	++vcpu->stat.io_exits;
5363
5364	if (string)
5365	return kvm_emulate_instruction(vcpu, emulation_type: `0`);
5366
5367	port = exit_qualification >> `16`;
5368	size = (exit_qualification & `7`) + `1`;
5369	in = (exit_qualification & `8`) != `0`;
5370
5371	return kvm_fast_pio(vcpu, size, port, in);
5372	}
5373
/*
 * Write the three-byte VMCALL opcode (0f 01 c1) into @hypercall.
 *
 * @vcpu:      unused here.
 * @hypercall: destination buffer; at least three bytes are written.
 */
static void
vmx_patch_hypercall(struct kvm_vcpu *vcpu, unsigned char *hypercall)
{
	/*
	 * Patch in the VMCALL instruction:
	 */
	hypercall[0] = 0x0f;
	hypercall[1] = 0x01;
	hypercall[2] = 0xc1;
}
5384
5385	/ called to set cr0 as appropriate for a mov-to-cr0 exit. /
5386	static int handle_set_cr0(struct kvm_vcpu vcpu, unsigned* long val)
5387	{
5388	if (is_guest_mode(vcpu)) {
5389	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5390	unsigned long orig_val = val;
5391
5392	/*
5393	* We get here when L2 changed cr0 in a way that did not change
5394	* any of L1's shadowed bits (see nested_vmx_exit_handled_cr),
5395	* but did change L0 shadowed bits. So we first calculate the
5396	* effective cr0 value that L1 would like to write into the
5397	* hardware. It consists of the L2-owned bits from the new
5398	* value combined with the L1-owned bits from L1's guest_cr0.
5399	*/
5400	val = (val & ~vmcs12->cr0_guest_host_mask) \|
5401	(vmcs12->guest_cr0 & vmcs12->cr0_guest_host_mask);
5402
5403	if (kvm_set_cr0(vcpu, cr0: val))
5404	return `1`;
5405	vmcs_writel(field: CR0_READ_SHADOW, value: orig_val);
5406	return `0`;
5407	} else {
5408	return kvm_set_cr0(vcpu, cr0: val);
5409	}
5410	}
5411
5412	static int handle_set_cr4(struct kvm_vcpu vcpu, unsigned* long val)
5413	{
5414	if (is_guest_mode(vcpu)) {
5415	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
5416	unsigned long orig_val = val;
5417
5418	/ analogously to handle_set_cr0 /
5419	val = (val & ~vmcs12->cr4_guest_host_mask) \|
5420	(vmcs12->guest_cr4 & vmcs12->cr4_guest_host_mask);
5421	if (kvm_set_cr4(vcpu, cr4: val))
5422	return `1`;
5423	vmcs_writel(field: CR4_READ_SHADOW, value: orig_val);
5424	return `0`;
5425	} else
5426	return kvm_set_cr4(vcpu, cr4: val);
5427	}
5428
5429	static int handle_desc(struct kvm_vcpu *vcpu)
5430	{
5431	/*
5432	* UMIP emulation relies on intercepting writes to CR4.UMIP, i.e. this
5433	* and other code needs to be updated if UMIP can be guest owned.
5434	*/
5435	BUILD_BUG_ON(KVM_POSSIBLE_CR4_GUEST_BITS & X86_CR4_UMIP);
5436
5437	WARN_ON_ONCE(!kvm_is_cr4_bit_set(vcpu, X86_CR4_UMIP));
5438	return kvm_emulate_instruction(vcpu, emulation_type: `0`);
5439	}
5440
5441	static int handle_cr(struct kvm_vcpu *vcpu)
5442	{
5443	unsigned long exit_qualification, val;
5444	int cr;
5445	int reg;
5446	int err;
5447	int ret;
5448
5449	exit_qualification = vmx_get_exit_qual(vcpu);
5450	cr = exit_qualification & `15`;
5451	reg = (exit_qualification >> `8`) & `15`;
5452	switch ((exit_qualification >> `4`) & `3`) {
5453	case `0`: / mov to cr /
5454	val = kvm_register_read(vcpu, reg);
5455	trace_kvm_cr_write(cr, val);
5456	switch (cr) {
5457	case `0`:
5458	err = handle_set_cr0(vcpu, val);
5459	return kvm_complete_insn_gp(vcpu, err);
5460	case `3`:
5461	WARN_ON_ONCE(enable_unrestricted_guest);
5462
5463	err = kvm_set_cr3(vcpu, cr3: val);
5464	return kvm_complete_insn_gp(vcpu, err);
5465	case `4`:
5466	err = handle_set_cr4(vcpu, val);
5467	return kvm_complete_insn_gp(vcpu, err);
5468	case `8`: {
5469	u8 cr8_prev = kvm_get_cr8(vcpu);
5470	u8 cr8 = (u8)val;
5471	err = kvm_set_cr8(vcpu, cr8);
5472	ret = kvm_complete_insn_gp(vcpu, err);
5473	if (lapic_in_kernel(vcpu))
5474	return ret;
5475	if (cr8_prev <= cr8)
5476	return ret;
5477	/*
5478	* TODO: we might be squashing a
5479	* KVM_GUESTDBG_SINGLESTEP-triggered
5480	* KVM_EXIT_DEBUG here.
5481	*/
5482	vcpu->run->exit_reason = KVM_EXIT_SET_TPR;
5483	return `0`;
5484	}
5485	}
5486	break;
5487	case `2`: / clts /
5488	KVM_BUG(`1`, vcpu->kvm, "Guest always owns CR0.TS");
5489	return -EIO;
5490	case `1`: /mov from cr/
5491	switch (cr) {
5492	case `3`:
5493	WARN_ON_ONCE(enable_unrestricted_guest);
5494
5495	val = kvm_read_cr3(vcpu);
5496	kvm_register_write(vcpu, reg, val);
5497	trace_kvm_cr_read(cr, val);
5498	return kvm_skip_emulated_instruction(vcpu);
5499	case `8`:
5500	val = kvm_get_cr8(vcpu);
5501	kvm_register_write(vcpu, reg, val);
5502	trace_kvm_cr_read(cr, val);
5503	return kvm_skip_emulated_instruction(vcpu);
5504	}
5505	break;
5506	case `3`: / lmsw /
5507	val = (exit_qualification >> LMSW_SOURCE_DATA_SHIFT) & `0x0f`;
5508	trace_kvm_cr_write(`0`, (kvm_read_cr0_bits(vcpu, mask: ~`0xful`) \| val));
5509	kvm_lmsw(vcpu, msw: val);
5510
5511	return kvm_skip_emulated_instruction(vcpu);
5512	default:
5513	break;
5514	}
5515	vcpu->run->exit_reason = `0`;
5516	vcpu_unimpl(vcpu, "unhandled control register: op %d cr %d\n",
5517	(int)(exit_qualification >> `4`) & `3`, cr);
5518	return `0`;
5519	}
5520
5521	static int handle_dr(struct kvm_vcpu *vcpu)
5522	{
5523	unsigned long exit_qualification;
5524	int dr, dr7, reg;
5525	int err = `1`;
5526
5527	exit_qualification = vmx_get_exit_qual(vcpu);
5528	dr = exit_qualification & DEBUG_REG_ACCESS_NUM;
5529
5530	/ First, if DR does not exist, trigger UD /
5531	if (!kvm_require_dr(vcpu, dr))
5532	return `1`;
5533
5534	if (vmx_get_cpl(vcpu) > `0`)
5535	goto out;
5536
5537	dr7 = vmcs_readl(field: GUEST_DR7);
5538	if (dr7 & DR7_GD) {
5539	/*
5540	* As the vm-exit takes precedence over the debug trap, we
5541	* need to emulate the latter, either for the host or the
5542	* guest debugging itself.
5543	*/
5544	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP) {
5545	vcpu->run->debug.arch.dr6 = DR6_BD \| DR6_ACTIVE_LOW;
5546	vcpu->run->debug.arch.dr7 = dr7;
5547	vcpu->run->debug.arch.pc = kvm_get_linear_rip(vcpu);
5548	vcpu->run->debug.arch.exception = DB_VECTOR;
5549	vcpu->run->exit_reason = KVM_EXIT_DEBUG;
5550	return `0`;
5551	} else {
5552	kvm_queue_exception_p(vcpu, DB_VECTOR, DR6_BD);
5553	return `1`;
5554	}
5555	}
5556
5557	if (vcpu->guest_debug == `0`) {
5558	exec_controls_clearbit(vmx: to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5559
5560	/*
5561	* No more DR vmexits; force a reload of the debug registers
5562	* and reenter on this instruction. The next vmexit will
5563	* retrieve the full state of the debug registers.
5564	*/
5565	vcpu->arch.switch_db_regs \|= KVM_DEBUGREG_WONT_EXIT;
5566	return `1`;
5567	}
5568
5569	reg = DEBUG_REG_ACCESS_REG(exit_qualification);
5570	if (exit_qualification & TYPE_MOV_FROM_DR) {
5571	kvm_register_write(vcpu, reg, val: kvm_get_dr(vcpu, dr));
5572	err = `0`;
5573	} else {
5574	err = kvm_set_dr(vcpu, dr, val: kvm_register_read(vcpu, reg));
5575	}
5576
5577	out:
5578	return kvm_complete_insn_gp(vcpu, err);
5579	}
5580
5581	static void vmx_sync_dirty_debug_regs(struct kvm_vcpu *vcpu)
5582	{
5583	get_debugreg(vcpu->arch.db[`0`], `0`);
5584	get_debugreg(vcpu->arch.db[`1`], `1`);
5585	get_debugreg(vcpu->arch.db[`2`], `2`);
5586	get_debugreg(vcpu->arch.db[`3`], `3`);
5587	get_debugreg(vcpu->arch.dr6, `6`);
5588	vcpu->arch.dr7 = vmcs_readl(field: GUEST_DR7);
5589
5590	vcpu->arch.switch_db_regs &= ~KVM_DEBUGREG_WONT_EXIT;
5591	exec_controls_setbit(vmx: to_vmx(vcpu), CPU_BASED_MOV_DR_EXITING);
5592
5593	/*
5594	* exc_debug expects dr6 to be cleared after it runs, avoid that it sees
5595	* a stale dr6 from the guest.
5596	*/
5597	set_debugreg(DR6_RESERVED, reg: `6`);
5598	}
5599
5600	static void vmx_set_dr7(struct kvm_vcpu vcpu, unsigned* long val)
5601	{
5602	vmcs_writel(field: GUEST_DR7, value: val);
5603	}
5604
/*
 * Exit handler for "TPR below threshold": refresh the APIC's PPR and resume
 * the guest.
 */
static int handle_tpr_below_threshold(struct kvm_vcpu *vcpu)
{
	kvm_apic_update_ppr(vcpu);

	return 1;
}
5610
5611	static int handle_interrupt_window(struct kvm_vcpu *vcpu)
5612	{
5613	exec_controls_clearbit(vmx: to_vmx(vcpu), CPU_BASED_INTR_WINDOW_EXITING);
5614
5615	kvm_make_request(KVM_REQ_EVENT, vcpu);
5616
5617	++vcpu->stat.irq_window_exits;
5618	return `1`;
5619	}
5620
/*
 * Exit handler for INVLPG: the exit qualification is passed to
 * kvm_mmu_invlpg() as the address to invalidate, then the instruction is
 * skipped.
 */
static int handle_invlpg(struct kvm_vcpu *vcpu)
{
	unsigned long gva = vmx_get_exit_qual(vcpu);

	kvm_mmu_invlpg(vcpu, gva);

	return kvm_skip_emulated_instruction(vcpu);
}
5628
5629	static int handle_apic_access(struct kvm_vcpu *vcpu)
5630	{
5631	if (likely(fasteoi)) {
5632	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5633	int access_type, offset;
5634
5635	access_type = exit_qualification & APIC_ACCESS_TYPE;
5636	offset = exit_qualification & APIC_ACCESS_OFFSET;
5637	/*
5638	* Sane guest uses MOV to write EOI, with written value
5639	* not cared. So make a short-circuit here by avoiding
5640	* heavy instruction emulation.
5641	*/
5642	if ((access_type == TYPE_LINEAR_APIC_INST_WRITE) &&
5643	(offset == APIC_EOI)) {
5644	kvm_lapic_set_eoi(vcpu);
5645	return kvm_skip_emulated_instruction(vcpu);
5646	}
5647	}
5648	return kvm_emulate_instruction(vcpu, emulation_type: `0`);
5649	}
5650
/*
 * Exit handler for EOI-induced exits: the low byte of the exit qualification
 * is the vector, which is handed to kvm_apic_set_eoi_accelerated().  Always
 * returns 1 (resume the guest).
 */
static int handle_apic_eoi_induced(struct kvm_vcpu *vcpu)
{
	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
	int vector = exit_qualification & 0xff;

	/* EOI-induced VM exit is trap-like and thus no need to adjust IP */
	kvm_apic_set_eoi_accelerated(vcpu, vector);
	return 1;
}
5660
5661	static int handle_apic_write(struct kvm_vcpu *vcpu)
5662	{
5663	unsigned long exit_qualification = vmx_get_exit_qual(vcpu);
5664
5665	/*
5666	* APIC-write VM-Exit is trap-like, KVM doesn't need to advance RIP and
5667	* hardware has done any necessary aliasing, offset adjustments, etc...
5668	* for the access. I.e. the correct value has already been written to
5669	* the vAPIC page for the correct 16-byte chunk. KVM needs only to
5670	* retrieve the register value and emulate the access.
5671	*/
5672	u32 offset = exit_qualification & `0xff0`;
5673
5674	kvm_apic_write_nodecode(vcpu, offset);
5675	return `1`;
5676	}
5677
5678	static int handle_task_switch(struct kvm_vcpu *vcpu)
5679	{
5680	struct vcpu_vmx *vmx = to_vmx(vcpu);
5681	unsigned long exit_qualification;
5682	bool has_error_code = false;
5683	u32 error_code = `0`;
5684	u16 tss_selector;
5685	int reason, type, idt_v, idt_index;
5686
5687	idt_v = (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK);
5688	idt_index = (vmx->idt_vectoring_info & VECTORING_INFO_VECTOR_MASK);
5689	type = (vmx->idt_vectoring_info & VECTORING_INFO_TYPE_MASK);
5690
5691	exit_qualification = vmx_get_exit_qual(vcpu);
5692
5693	reason = (u32)exit_qualification >> `30`;
5694	if (reason == TASK_SWITCH_GATE && idt_v) {
5695	switch (type) {
5696	case INTR_TYPE_NMI_INTR:
5697	vcpu->arch.nmi_injected = false;
5698	vmx_set_nmi_mask(vcpu, masked: true);
5699	break;
5700	case INTR_TYPE_EXT_INTR:
5701	case INTR_TYPE_SOFT_INTR:
5702	kvm_clear_interrupt_queue(vcpu);
5703	break;
5704	case INTR_TYPE_HARD_EXCEPTION:
5705	if (vmx->idt_vectoring_info &
5706	VECTORING_INFO_DELIVER_CODE_MASK) {
5707	has_error_code = true;
5708	error_code =
5709	vmcs_read32(field: IDT_VECTORING_ERROR_CODE);
5710	}
5711	fallthrough;
5712	case INTR_TYPE_SOFT_EXCEPTION:
5713	kvm_clear_exception_queue(vcpu);
5714	break;
5715	default:
5716	break;
5717	}
5718	}
5719	tss_selector = exit_qualification;
5720
5721	if (!idt_v \|\| (type != INTR_TYPE_HARD_EXCEPTION &&
5722	type != INTR_TYPE_EXT_INTR &&
5723	type != INTR_TYPE_NMI_INTR))
5724	WARN_ON(!skip_emulated_instruction(vcpu));
5725
5726	/*
5727	* TODO: What about debug traps on tss switch?
5728	* Are we supposed to inject them and update dr6?
5729	*/
5730	return kvm_task_switch(vcpu, tss_selector,
5731	idt_index: type == INTR_TYPE_SOFT_INTR ? idt_index : -`1`,
5732	reason, has_error_code, error_code);
5733	}
5734
5735	static int handle_ept_violation(struct kvm_vcpu *vcpu)
5736	{
5737	unsigned long exit_qualification;
5738	gpa_t gpa;
5739	u64 error_code;
5740
5741	exit_qualification = vmx_get_exit_qual(vcpu);
5742
5743	/*
5744	* EPT violation happened while executing iret from NMI,
5745	* "blocked by NMI" bit has to be set before next VM entry.
5746	* There are errata that may cause this bit to not be set:
5747	* AAK134, BY25.
5748	*/
5749	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5750	enable_vnmi &&
5751	(exit_qualification & INTR_INFO_UNBLOCK_NMI))
5752	vmcs_set_bits(field: GUEST_INTERRUPTIBILITY_INFO, GUEST_INTR_STATE_NMI);
5753
5754	gpa = vmcs_read64(field: GUEST_PHYSICAL_ADDRESS);
5755	trace_kvm_page_fault(vcpu, gpa, exit_qualification);
5756
5757	/ Is it a read fault? /
5758	error_code = (exit_qualification & EPT_VIOLATION_ACC_READ)
5759	? PFERR_USER_MASK : `0`;
5760	/ Is it a write fault? /
5761	error_code \|= (exit_qualification & EPT_VIOLATION_ACC_WRITE)
5762	? PFERR_WRITE_MASK : `0`;
5763	/ Is it a fetch fault? /
5764	error_code \|= (exit_qualification & EPT_VIOLATION_ACC_INSTR)
5765	? PFERR_FETCH_MASK : `0`;
5766	/ ept page table entry is present? /
5767	error_code \|= (exit_qualification & EPT_VIOLATION_RWX_MASK)
5768	? PFERR_PRESENT_MASK : `0`;
5769
5770	error_code \|= (exit_qualification & EPT_VIOLATION_GVA_TRANSLATED) != `0` ?
5771	PFERR_GUEST_FINAL_MASK : PFERR_GUEST_PAGE_MASK;
5772
5773	vcpu->arch.exit_qualification = exit_qualification;
5774
5775	/*
5776	* Check that the GPA doesn't exceed physical memory limits, as that is
5777	* a guest page fault. We have to emulate the instruction here, because
5778	* if the illegal address is that of a paging structure, then
5779	* EPT_VIOLATION_ACC_WRITE bit is set. Alternatively, if supported we
5780	* would also use advanced VM-exit information for EPT violations to
5781	* reconstruct the page fault error code.
5782	*/
5783	if (unlikely(allow_smaller_maxphyaddr && !kvm_vcpu_is_legal_gpa(vcpu, gpa)))
5784	return kvm_emulate_instruction(vcpu, emulation_type: `0`);
5785
5786	return kvm_mmu_page_fault(vcpu, cr2_or_gpa: gpa, error_code, NULL, insn_len: `0`);
5787	}
5788
5789	static int handle_ept_misconfig(struct kvm_vcpu *vcpu)
5790	{
5791	gpa_t gpa;
5792
5793	if (vmx_check_emulate_instruction(vcpu, EMULTYPE_PF, NULL, insn_len: `0`))
5794	return `1`;
5795
5796	/*
5797	* A nested guest cannot optimize MMIO vmexits, because we have an
5798	* nGPA here instead of the required GPA.
5799	*/
5800	gpa = vmcs_read64(field: GUEST_PHYSICAL_ADDRESS);
5801	if (!is_guest_mode(vcpu) &&
5802	!kvm_io_bus_write(vcpu, bus_idx: KVM_FAST_MMIO_BUS, addr: gpa, len: `0`, NULL)) {
5803	trace_kvm_fast_mmio(gpa);
5804	return kvm_skip_emulated_instruction(vcpu);
5805	}
5806
5807	return kvm_mmu_page_fault(vcpu, cr2_or_gpa: gpa, PFERR_RSVD_MASK, NULL, insn_len: `0`);
5808	}
5809
5810	static int handle_nmi_window(struct kvm_vcpu *vcpu)
5811	{
5812	if (KVM_BUG_ON(!enable_vnmi, vcpu->kvm))
5813	return -EIO;
5814
5815	exec_controls_clearbit(vmx: to_vmx(vcpu), CPU_BASED_NMI_WINDOW_EXITING);
5816	++vcpu->stat.nmi_window_exits;
5817	kvm_make_request(KVM_REQ_EVENT, vcpu);
5818
5819	return `1`;
5820	}
5821
5822	static bool vmx_emulation_required_with_pending_exception(struct kvm_vcpu *vcpu)
5823	{
5824	struct vcpu_vmx *vmx = to_vmx(vcpu);
5825
5826	return vmx->emulation_required && !vmx->rmode.vm86_active &&
5827	(kvm_is_exception_pending(vcpu) \|\| vcpu->arch.exception.injected);
5828	}
5829
5830	static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
5831	{
5832	struct vcpu_vmx *vmx = to_vmx(vcpu);
5833	bool intr_window_requested;
5834	unsigned count = `130`;
5835
5836	intr_window_requested = exec_controls_get(vmx) &
5837	CPU_BASED_INTR_WINDOW_EXITING;
5838
5839	while (vmx->emulation_required && count-- != `0`) {
5840	if (intr_window_requested && !vmx_interrupt_blocked(vcpu))
5841	return handle_interrupt_window(vcpu: &vmx->vcpu);
5842
5843	if (kvm_test_request(KVM_REQ_EVENT, vcpu))
5844	return `1`;
5845
5846	if (!kvm_emulate_instruction(vcpu, emulation_type: `0`))
5847	return `0`;
5848
5849	if (vmx_emulation_required_with_pending_exception(vcpu)) {
5850	kvm_prepare_emulation_failure_exit(vcpu);
5851	return `0`;
5852	}
5853
5854	if (vcpu->arch.halt_request) {
5855	vcpu->arch.halt_request = `0`;
5856	return kvm_emulate_halt_noskip(vcpu);
5857	}
5858
5859	/*
5860	* Note, return 1 and not 0, vcpu_run() will invoke
5861	* xfer_to_guest_mode() which will create a proper return
5862	* code.
5863	*/
5864	if (__xfer_to_guest_mode_work_pending())
5865	return `1`;
5866	}
5867
5868	return `1`;
5869	}
5870
/*
 * Pre-run check: if emulation is required while an exception is pending,
 * prepare an emulation-failure exit and return 0; otherwise return 1.
 */
static int vmx_vcpu_pre_run(struct kvm_vcpu *vcpu)
{
	if (vmx_emulation_required_with_pending_exception(vcpu)) {
		kvm_prepare_emulation_failure_exit(vcpu);
		return 0;
	}

	return 1;
}
5880
5881	static void grow_ple_window(struct kvm_vcpu *vcpu)
5882	{
5883	struct vcpu_vmx *vmx = to_vmx(vcpu);
5884	unsigned int old = vmx->ple_window;
5885
5886	vmx->ple_window = __grow_ple_window(val: old, base: ple_window,
5887	modifier: ple_window_grow,
5888	max: ple_window_max);
5889
5890	if (vmx->ple_window != old) {
5891	vmx->ple_window_dirty = true;
5892	trace_kvm_ple_window_update(vcpu->vcpu_id,
5893	vmx->ple_window, old);
5894	}
5895	}
5896
5897	static void shrink_ple_window(struct kvm_vcpu *vcpu)
5898	{
5899	struct vcpu_vmx *vmx = to_vmx(vcpu);
5900	unsigned int old = vmx->ple_window;
5901
5902	vmx->ple_window = __shrink_ple_window(val: old, base: ple_window,
5903	modifier: ple_window_shrink,
5904	min: ple_window);
5905
5906	if (vmx->ple_window != old) {
5907	vmx->ple_window_dirty = true;
5908	trace_kvm_ple_window_update(vcpu->vcpu_id,
5909	vmx->ple_window, old);
5910	}
5911	}
5912
5913	/*
5914	* Indicate a busy-waiting vcpu in spinlock. We do not enable the PAUSE
5915	* exiting, so only get here on cpu with PAUSE-Loop-Exiting.
5916	*/
5917	static int handle_pause(struct kvm_vcpu *vcpu)
5918	{
5919	if (!kvm_pause_in_guest(kvm: vcpu->kvm))
5920	grow_ple_window(vcpu);
5921
5922	/*
5923	* Intel sdm vol3 ch-25.1.3 says: The "PAUSE-loop exiting"
5924	* VM-execution control is ignored if CPL > 0. OTOH, KVM
5925	* never set PAUSE_EXITING and just set PLE if supported,
5926	* so the vcpu must be CPL=0 if it gets a PAUSE exit.
5927	*/
5928	kvm_vcpu_on_spin(vcpu, yield_to_kernel_mode: true);
5929	return kvm_skip_emulated_instruction(vcpu);
5930	}
5931
/* Monitor-trap-flag VM-Exit: nothing to do here, simply resume the guest. */
static int handle_monitor_trap(struct kvm_vcpu *vcpu)
{
	return 1;
}
5936
5937	static int handle_invpcid(struct kvm_vcpu *vcpu)
5938	{
5939	u32 vmx_instruction_info;
5940	unsigned long type;
5941	gva_t gva;
5942	struct {
5943	u64 pcid;
5944	u64 gla;
5945	} operand;
5946	int gpr_index;
5947
5948	if (!guest_cpuid_has(vcpu, X86_FEATURE_INVPCID)) {
5949	kvm_queue_exception(vcpu, UD_VECTOR);
5950	return `1`;
5951	}
5952
5953	vmx_instruction_info = vmcs_read32(field: VMX_INSTRUCTION_INFO);
5954	gpr_index = vmx_get_instr_info_reg2(vmx_instr_info: vmx_instruction_info);
5955	type = kvm_register_read(vcpu, reg: gpr_index);
5956
5957	/ According to the Intel instruction reference, the memory operand*
5958	* is read even if it isn't needed (e.g., for type==all)
5959	*/
5960	if (get_vmx_mem_address(vcpu, exit_qualification: vmx_get_exit_qual(vcpu),
5961	vmx_instruction_info, wr: false,
5962	len: sizeof(operand), ret: &gva))
5963	return `1`;
5964
5965	return kvm_handle_invpcid(vcpu, type, gva);
5966	}
5967
5968	static int handle_pml_full(struct kvm_vcpu *vcpu)
5969	{
5970	unsigned long exit_qualification;
5971
5972	trace_kvm_pml_full(vcpu->vcpu_id);
5973
5974	exit_qualification = vmx_get_exit_qual(vcpu);
5975
5976	/*
5977	* PML buffer FULL happened while executing iret from NMI,
5978	* "blocked by NMI" bit has to be set before next VM entry.
5979	*/
5980	if (!(to_vmx(vcpu)->idt_vectoring_info & VECTORING_INFO_VALID_MASK) &&
5981	enable_vnmi &&
5982	(exit_qualification & INTR_INFO_UNBLOCK_NMI))
5983	vmcs_set_bits(field: GUEST_INTERRUPTIBILITY_INFO,
5984	GUEST_INTR_STATE_NMI);
5985
5986	/*
5987	* PML buffer already flushed at beginning of VMEXIT. Nothing to do
5988	* here.., and there's no userspace involvement needed for PML.
5989	*/
5990	return `1`;
5991	}
5992
5993	static fastpath_t handle_fastpath_preemption_timer(struct kvm_vcpu *vcpu,
5994	bool force_immediate_exit)
5995	{
5996	struct vcpu_vmx *vmx = to_vmx(vcpu);
5997
5998	/*
5999	* In the extremely unlikely scenario that this is a spurious VM-Exit
6000	* due to the timer expiring while it was "soft" disabled, just eat the
6001	* exit and re-enter the guest.
6002	*/
6003	if (unlikely(vmx->loaded_vmcs->hv_timer_soft_disabled))
6004	return EXIT_FASTPATH_REENTER_GUEST;
6005
6006	/*
6007	* If the timer expired because KVM used it to force an immediate exit,
6008	* then mission accomplished.
6009	*/
6010	if (force_immediate_exit)
6011	return EXIT_FASTPATH_EXIT_HANDLED;
6012
6013	/*
6014	* If L2 is active, go down the slow path as emulating the guest timer
6015	* expiration likely requires synthesizing a nested VM-Exit.
6016	*/
6017	if (is_guest_mode(vcpu))
6018	return EXIT_FASTPATH_NONE;
6019
6020	kvm_lapic_expired_hv_timer(vcpu);
6021	return EXIT_FASTPATH_REENTER_GUEST;
6022	}
6023
/*
 * Slow-path handler for a preemption timer VM-Exit.  Warns once if reached
 * outside guest (L2) mode, signals the expired timer to the local APIC code
 * and resumes the guest (always returns 1).
 */
static int handle_preemption_timer(struct kvm_vcpu *vcpu)
{
	/*
	 * This non-fastpath handler is reached if and only if the preemption
	 * timer was being used to emulate a guest timer while L2 is active.
	 * All other scenarios are supposed to be handled in the fastpath.
	 */
	WARN_ON_ONCE(!is_guest_mode(vcpu));
	kvm_lapic_expired_hv_timer(vcpu);
	return 1;
}
6035
6036	/*
6037	* When nested=0, all VMX instruction VM Exits filter here. The handlers
6038	* are overwritten by nested_vmx_setup() when nested=1.
6039	*/
6040	static int handle_vmx_instruction(struct kvm_vcpu *vcpu)
6041	{
6042	kvm_queue_exception(vcpu, UD_VECTOR);
6043	return `1`;
6044	}
6045
6046	#ifndef CONFIG_X86_SGX_KVM
6047	static int handle_encls(struct kvm_vcpu *vcpu)
6048	{
6049	/*
6050	* SGX virtualization is disabled. There is no software enable bit for
6051	* SGX, so KVM intercepts all ENCLS leafs and injects a #UD to prevent
6052	* the guest from executing ENCLS (when SGX is supported by hardware).
6053	*/
6054	kvm_queue_exception(vcpu, UD_VECTOR);
6055	return `1`;
6056	}
6057	#endif /* CONFIG_X86_SGX_KVM */
6058
6059	static int handle_bus_lock_vmexit(struct kvm_vcpu *vcpu)
6060	{
6061	/*
6062	* Hardware may or may not set the BUS_LOCK_DETECTED flag on BUS_LOCK
6063	* VM-Exits. Unconditionally set the flag here and leave the handling to
6064	* vmx_handle_exit().
6065	*/
6066	to_vmx(vcpu)->exit_reason.bus_lock_detected = true;
6067	return `1`;
6068	}
6069
6070	static int handle_notify(struct kvm_vcpu *vcpu)
6071	{
6072	unsigned long exit_qual = vmx_get_exit_qual(vcpu);
6073	bool context_invalid = exit_qual & NOTIFY_VM_CONTEXT_INVALID;
6074
6075	++vcpu->stat.notify_window_exits;
6076
6077	/*
6078	* Notify VM exit happened while executing iret from NMI,
6079	* "blocked by NMI" bit has to be set before next VM entry.
6080	*/
6081	if (enable_vnmi && (exit_qual & INTR_INFO_UNBLOCK_NMI))
6082	vmcs_set_bits(field: GUEST_INTERRUPTIBILITY_INFO,
6083	GUEST_INTR_STATE_NMI);
6084
6085	if (vcpu->kvm->arch.notify_vmexit_flags & KVM_X86_NOTIFY_VMEXIT_USER \|\|
6086	context_invalid) {
6087	vcpu->run->exit_reason = KVM_EXIT_NOTIFY;
6088	vcpu->run->notify.flags = context_invalid ?
6089	KVM_NOTIFY_CONTEXT_INVALID : `0`;
6090	return `0`;
6091	}
6092
6093	return `1`;
6094	}
6095
6096	/*
6097	* The exit handlers return 1 if the exit was handled fully and guest execution
6098	* may resume. Otherwise they set the kvm_run parameter to indicate what needs
6099	* to be done to userspace and return 0.
6100	*/
6101	static int (kvm_vmx_exit_handlers[])(struct* kvm_vcpu *vcpu) = {
6102	[EXIT_REASON_EXCEPTION_NMI] = handle_exception_nmi,
6103	[EXIT_REASON_EXTERNAL_INTERRUPT] = handle_external_interrupt,
6104	[EXIT_REASON_TRIPLE_FAULT] = handle_triple_fault,
6105	[EXIT_REASON_NMI_WINDOW] = handle_nmi_window,
6106	[EXIT_REASON_IO_INSTRUCTION] = handle_io,
6107	[EXIT_REASON_CR_ACCESS] = handle_cr,
6108	[EXIT_REASON_DR_ACCESS] = handle_dr,
6109	[EXIT_REASON_CPUID] = kvm_emulate_cpuid,
6110	[EXIT_REASON_MSR_READ] = kvm_emulate_rdmsr,
6111	[EXIT_REASON_MSR_WRITE] = kvm_emulate_wrmsr,
6112	[EXIT_REASON_INTERRUPT_WINDOW] = handle_interrupt_window,
6113	[EXIT_REASON_HLT] = kvm_emulate_halt,
6114	[EXIT_REASON_INVD] = kvm_emulate_invd,
6115	[EXIT_REASON_INVLPG] = handle_invlpg,
6116	[EXIT_REASON_RDPMC] = kvm_emulate_rdpmc,
6117	[EXIT_REASON_VMCALL] = kvm_emulate_hypercall,
6118	[EXIT_REASON_VMCLEAR] = handle_vmx_instruction,
6119	[EXIT_REASON_VMLAUNCH] = handle_vmx_instruction,
6120	[EXIT_REASON_VMPTRLD] = handle_vmx_instruction,
6121	[EXIT_REASON_VMPTRST] = handle_vmx_instruction,
6122	[EXIT_REASON_VMREAD] = handle_vmx_instruction,
6123	[EXIT_REASON_VMRESUME] = handle_vmx_instruction,
6124	[EXIT_REASON_VMWRITE] = handle_vmx_instruction,
6125	[EXIT_REASON_VMOFF] = handle_vmx_instruction,
6126	[EXIT_REASON_VMON] = handle_vmx_instruction,
6127	[EXIT_REASON_TPR_BELOW_THRESHOLD] = handle_tpr_below_threshold,
6128	[EXIT_REASON_APIC_ACCESS] = handle_apic_access,
6129	[EXIT_REASON_APIC_WRITE] = handle_apic_write,
6130	[EXIT_REASON_EOI_INDUCED] = handle_apic_eoi_induced,
6131	[EXIT_REASON_WBINVD] = kvm_emulate_wbinvd,
6132	[EXIT_REASON_XSETBV] = kvm_emulate_xsetbv,
6133	[EXIT_REASON_TASK_SWITCH] = handle_task_switch,
6134	[EXIT_REASON_MCE_DURING_VMENTRY] = handle_machine_check,
6135	[EXIT_REASON_GDTR_IDTR] = handle_desc,
6136	[EXIT_REASON_LDTR_TR] = handle_desc,
6137	[EXIT_REASON_EPT_VIOLATION] = handle_ept_violation,
6138	[EXIT_REASON_EPT_MISCONFIG] = handle_ept_misconfig,
6139	[EXIT_REASON_PAUSE_INSTRUCTION] = handle_pause,
6140	[EXIT_REASON_MWAIT_INSTRUCTION] = kvm_emulate_mwait,
6141	[EXIT_REASON_MONITOR_TRAP_FLAG] = handle_monitor_trap,
6142	[EXIT_REASON_MONITOR_INSTRUCTION] = kvm_emulate_monitor,
6143	[EXIT_REASON_INVEPT] = handle_vmx_instruction,
6144	[EXIT_REASON_INVVPID] = handle_vmx_instruction,
6145	[EXIT_REASON_RDRAND] = kvm_handle_invalid_op,
6146	[EXIT_REASON_RDSEED] = kvm_handle_invalid_op,
6147	[EXIT_REASON_PML_FULL] = handle_pml_full,
6148	[EXIT_REASON_INVPCID] = handle_invpcid,
6149	[EXIT_REASON_VMFUNC] = handle_vmx_instruction,
6150	[EXIT_REASON_PREEMPTION_TIMER] = handle_preemption_timer,
6151	[EXIT_REASON_ENCLS] = handle_encls,
6152	[EXIT_REASON_BUS_LOCK] = handle_bus_lock_vmexit,
6153	[EXIT_REASON_NOTIFY] = handle_notify,
6154	};
6155
6156	static const int kvm_vmx_max_exit_handlers =
6157	ARRAY_SIZE(kvm_vmx_exit_handlers);
6158
6159	static void vmx_get_exit_info(struct kvm_vcpu vcpu, u32 reason,
6160	u64 info1, u64 info2,
6161	u32 intr_info, u32 error_code)
6162	{
6163	struct vcpu_vmx *vmx = to_vmx(vcpu);
6164
6165	*reason = vmx->exit_reason.full;
6166	*info1 = vmx_get_exit_qual(vcpu);
6167	if (!(vmx->exit_reason.failed_vmentry)) {
6168	*info2 = vmx->idt_vectoring_info;
6169	*intr_info = vmx_get_intr_info(vcpu);
6170	if (is_exception_with_error_code(intr_info: *intr_info))
6171	*error_code = vmcs_read32(field: VM_EXIT_INTR_ERROR_CODE);
6172	else
6173	*error_code = `0`;
6174	} else {
6175	*info2 = `0`;
6176	*intr_info = `0`;
6177	*error_code = `0`;
6178	}
6179	}
6180
6181	static void vmx_destroy_pml_buffer(struct vcpu_vmx *vmx)
6182	{
6183	if (vmx->pml_pg) {
6184	__free_page(vmx->pml_pg);
6185	vmx->pml_pg = NULL;
6186	}
6187	}
6188
6189	static void vmx_flush_pml_buffer(struct kvm_vcpu *vcpu)
6190	{
6191	struct vcpu_vmx *vmx = to_vmx(vcpu);
6192	u64 *pml_buf;
6193	u16 pml_idx;
6194
6195	pml_idx = vmcs_read16(field: GUEST_PML_INDEX);
6196
6197	/ Do nothing if PML buffer is empty /
6198	if (pml_idx == (PML_ENTITY_NUM - `1`))
6199	return;
6200
6201	/ PML index always points to next available PML buffer entity /
6202	if (pml_idx >= PML_ENTITY_NUM)
6203	pml_idx = `0`;
6204	else
6205	pml_idx++;
6206
6207	pml_buf = page_address(vmx->pml_pg);
6208	for (; pml_idx < PML_ENTITY_NUM; pml_idx++) {
6209	u64 gpa;
6210
6211	gpa = pml_buf[pml_idx];
6212	WARN_ON(gpa & (PAGE_SIZE - `1`));
6213	kvm_vcpu_mark_page_dirty(vcpu, gfn: gpa >> PAGE_SHIFT);
6214	}
6215
6216	/ reset PML index /
6217	vmcs_write16(field: GUEST_PML_INDEX, PML_ENTITY_NUM - `1`);
6218	}
6219
6220	static void vmx_dump_sel(char *name, uint32_t sel)
6221	{
6222	pr_err("%s sel=0x%04x, attr=0x%05x, limit=0x%08x, base=0x%016lx\n",
6223	name, vmcs_read16(sel),
6224	vmcs_read32(sel + GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR),
6225	vmcs_read32(sel + GUEST_ES_LIMIT - GUEST_ES_SELECTOR),
6226	vmcs_readl(sel + GUEST_ES_BASE - GUEST_ES_SELECTOR));
6227	}
6228
6229	static void vmx_dump_dtsel(char *name, uint32_t limit)
6230	{
6231	pr_err("%s limit=0x%08x, base=0x%016lx\n",
6232	name, vmcs_read32(limit),
6233	vmcs_readl(limit + GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
6234	}
6235
6236	static void vmx_dump_msrs(char name, struct* vmx_msrs *m)
6237	{
6238	unsigned int i;
6239	struct vmx_msr_entry *e;
6240
6241	pr_err("MSR %s:\n", name);
6242	for (i = `0`, e = m->val; i < m->nr; ++i, ++e)
6243	pr_err(" %2d: msr=0x%08x value=0x%016llx\n", i, e->index, e->value);
6244	}
6245
6246	void dump_vmcs(struct kvm_vcpu *vcpu)
6247	{
6248	struct vcpu_vmx *vmx = to_vmx(vcpu);
6249	u32 vmentry_ctl, vmexit_ctl;
6250	u32 cpu_based_exec_ctrl, pin_based_exec_ctrl, secondary_exec_control;
6251	u64 tertiary_exec_control;
6252	unsigned long cr4;
6253	int efer_slot;
6254
6255	if (!dump_invalid_vmcs) {
6256	pr_warn_ratelimited("set kvm_intel.dump_invalid_vmcs=1 to dump internal KVM state.\n");
6257	return;
6258	}
6259
6260	vmentry_ctl = vmcs_read32(field: VM_ENTRY_CONTROLS);
6261	vmexit_ctl = vmcs_read32(field: VM_EXIT_CONTROLS);
6262	cpu_based_exec_ctrl = vmcs_read32(field: CPU_BASED_VM_EXEC_CONTROL);
6263	pin_based_exec_ctrl = vmcs_read32(field: PIN_BASED_VM_EXEC_CONTROL);
6264	cr4 = vmcs_readl(field: GUEST_CR4);
6265
6266	if (cpu_has_secondary_exec_ctrls())
6267	secondary_exec_control = vmcs_read32(field: SECONDARY_VM_EXEC_CONTROL);
6268	else
6269	secondary_exec_control = `0`;
6270
6271	if (cpu_has_tertiary_exec_ctrls())
6272	tertiary_exec_control = vmcs_read64(field: TERTIARY_VM_EXEC_CONTROL);
6273	else
6274	tertiary_exec_control = `0`;
6275
6276	pr_err("VMCS %p, last attempted VM-entry on CPU %d\n",
6277	vmx->loaded_vmcs->vmcs, vcpu->arch.last_vmentry_cpu);
6278	pr_err("* Guest State *\n");
6279	pr_err("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6280	vmcs_readl(GUEST_CR0), vmcs_readl(CR0_READ_SHADOW),
6281	vmcs_readl(CR0_GUEST_HOST_MASK));
6282	pr_err("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
6283	cr4, vmcs_readl(CR4_READ_SHADOW), vmcs_readl(CR4_GUEST_HOST_MASK));
6284	pr_err("CR3 = 0x%016lx\n", vmcs_readl(GUEST_CR3));
6285	if (cpu_has_vmx_ept()) {
6286	pr_err("PDPTR0 = 0x%016llx PDPTR1 = 0x%016llx\n",
6287	vmcs_read64(GUEST_PDPTR0), vmcs_read64(GUEST_PDPTR1));
6288	pr_err("PDPTR2 = 0x%016llx PDPTR3 = 0x%016llx\n",
6289	vmcs_read64(GUEST_PDPTR2), vmcs_read64(GUEST_PDPTR3));
6290	}
6291	pr_err("RSP = 0x%016lx RIP = 0x%016lx\n",
6292	vmcs_readl(GUEST_RSP), vmcs_readl(GUEST_RIP));
6293	pr_err("RFLAGS=0x%08lx DR7 = 0x%016lx\n",
6294	vmcs_readl(GUEST_RFLAGS), vmcs_readl(GUEST_DR7));
6295	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6296	vmcs_readl(GUEST_SYSENTER_ESP),
6297	vmcs_read32(GUEST_SYSENTER_CS), vmcs_readl(GUEST_SYSENTER_EIP));
6298	vmx_dump_sel(name: "CS: ", sel: GUEST_CS_SELECTOR);
6299	vmx_dump_sel(name: "DS: ", sel: GUEST_DS_SELECTOR);
6300	vmx_dump_sel(name: "SS: ", sel: GUEST_SS_SELECTOR);
6301	vmx_dump_sel(name: "ES: ", sel: GUEST_ES_SELECTOR);
6302	vmx_dump_sel(name: "FS: ", sel: GUEST_FS_SELECTOR);
6303	vmx_dump_sel(name: "GS: ", sel: GUEST_GS_SELECTOR);
6304	vmx_dump_dtsel(name: "GDTR:", limit: GUEST_GDTR_LIMIT);
6305	vmx_dump_sel(name: "LDTR:", sel: GUEST_LDTR_SELECTOR);
6306	vmx_dump_dtsel(name: "IDTR:", limit: GUEST_IDTR_LIMIT);
6307	vmx_dump_sel(name: "TR: ", sel: GUEST_TR_SELECTOR);
6308	efer_slot = vmx_find_loadstore_msr_slot(m: &vmx->msr_autoload.guest, MSR_EFER);
6309	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_EFER)
6310	pr_err("EFER= 0x%016llx\n", vmcs_read64(GUEST_IA32_EFER));
6311	else if (efer_slot >= `0`)
6312	pr_err("EFER= 0x%016llx (autoload)\n",
6313	vmx->msr_autoload.guest.val[efer_slot].value);
6314	else if (vmentry_ctl & VM_ENTRY_IA32E_MODE)
6315	pr_err("EFER= 0x%016llx (effective)\n",
6316	vcpu->arch.efer \| (EFER_LMA \| EFER_LME));
6317	else
6318	pr_err("EFER= 0x%016llx (effective)\n",
6319	vcpu->arch.efer & ~(EFER_LMA \| EFER_LME));
6320	if (vmentry_ctl & VM_ENTRY_LOAD_IA32_PAT)
6321	pr_err("PAT = 0x%016llx\n", vmcs_read64(GUEST_IA32_PAT));
6322	pr_err("DebugCtl = 0x%016llx DebugExceptions = 0x%016lx\n",
6323	vmcs_read64(GUEST_IA32_DEBUGCTL),
6324	vmcs_readl(GUEST_PENDING_DBG_EXCEPTIONS));
6325	if (cpu_has_load_perf_global_ctrl() &&
6326	vmentry_ctl & VM_ENTRY_LOAD_IA32_PERF_GLOBAL_CTRL)
6327	pr_err("PerfGlobCtl = 0x%016llx\n",
6328	vmcs_read64(GUEST_IA32_PERF_GLOBAL_CTRL));
6329	if (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS)
6330	pr_err("BndCfgS = 0x%016llx\n", vmcs_read64(GUEST_BNDCFGS));
6331	pr_err("Interruptibility = %08x ActivityState = %08x\n",
6332	vmcs_read32(GUEST_INTERRUPTIBILITY_INFO),
6333	vmcs_read32(GUEST_ACTIVITY_STATE));
6334	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY)
6335	pr_err("InterruptStatus = %04x\n",
6336	vmcs_read16(GUEST_INTR_STATUS));
6337	if (vmcs_read32(field: VM_ENTRY_MSR_LOAD_COUNT) > `0`)
6338	vmx_dump_msrs(name: "guest autoload", m: &vmx->msr_autoload.guest);
6339	if (vmcs_read32(field: VM_EXIT_MSR_STORE_COUNT) > `0`)
6340	vmx_dump_msrs(name: "guest autostore", m: &vmx->msr_autostore.guest);
6341
6342	pr_err("* Host State *\n");
6343	pr_err("RIP = 0x%016lx RSP = 0x%016lx\n",
6344	vmcs_readl(HOST_RIP), vmcs_readl(HOST_RSP));
6345	pr_err("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
6346	vmcs_read16(HOST_CS_SELECTOR), vmcs_read16(HOST_SS_SELECTOR),
6347	vmcs_read16(HOST_DS_SELECTOR), vmcs_read16(HOST_ES_SELECTOR),
6348	vmcs_read16(HOST_FS_SELECTOR), vmcs_read16(HOST_GS_SELECTOR),
6349	vmcs_read16(HOST_TR_SELECTOR));
6350	pr_err("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
6351	vmcs_readl(HOST_FS_BASE), vmcs_readl(HOST_GS_BASE),
6352	vmcs_readl(HOST_TR_BASE));
6353	pr_err("GDTBase=%016lx IDTBase=%016lx\n",
6354	vmcs_readl(HOST_GDTR_BASE), vmcs_readl(HOST_IDTR_BASE));
6355	pr_err("CR0=%016lx CR3=%016lx CR4=%016lx\n",
6356	vmcs_readl(HOST_CR0), vmcs_readl(HOST_CR3),
6357	vmcs_readl(HOST_CR4));
6358	pr_err("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
6359	vmcs_readl(HOST_IA32_SYSENTER_ESP),
6360	vmcs_read32(HOST_IA32_SYSENTER_CS),
6361	vmcs_readl(HOST_IA32_SYSENTER_EIP));
6362	if (vmexit_ctl & VM_EXIT_LOAD_IA32_EFER)
6363	pr_err("EFER= 0x%016llx\n", vmcs_read64(HOST_IA32_EFER));
6364	if (vmexit_ctl & VM_EXIT_LOAD_IA32_PAT)
6365	pr_err("PAT = 0x%016llx\n", vmcs_read64(HOST_IA32_PAT));
6366	if (cpu_has_load_perf_global_ctrl() &&
6367	vmexit_ctl & VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL)
6368	pr_err("PerfGlobCtl = 0x%016llx\n",
6369	vmcs_read64(HOST_IA32_PERF_GLOBAL_CTRL));
6370	if (vmcs_read32(field: VM_EXIT_MSR_LOAD_COUNT) > `0`)
6371	vmx_dump_msrs(name: "host autoload", m: &vmx->msr_autoload.host);
6372
6373	pr_err("* Control State *\n");
6374	pr_err("CPUBased=0x%08x SecondaryExec=0x%08x TertiaryExec=0x%016llx\n",
6375	cpu_based_exec_ctrl, secondary_exec_control, tertiary_exec_control);
6376	pr_err("PinBased=0x%08x EntryControls=%08x ExitControls=%08x\n",
6377	pin_based_exec_ctrl, vmentry_ctl, vmexit_ctl);
6378	pr_err("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
6379	vmcs_read32(EXCEPTION_BITMAP),
6380	vmcs_read32(PAGE_FAULT_ERROR_CODE_MASK),
6381	vmcs_read32(PAGE_FAULT_ERROR_CODE_MATCH));
6382	pr_err("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
6383	vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
6384	vmcs_read32(VM_ENTRY_EXCEPTION_ERROR_CODE),
6385	vmcs_read32(VM_ENTRY_INSTRUCTION_LEN));
6386	pr_err("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
6387	vmcs_read32(VM_EXIT_INTR_INFO),
6388	vmcs_read32(VM_EXIT_INTR_ERROR_CODE),
6389	vmcs_read32(VM_EXIT_INSTRUCTION_LEN));
6390	pr_err(" reason=%08x qualification=%016lx\n",
6391	vmcs_read32(VM_EXIT_REASON), vmcs_readl(EXIT_QUALIFICATION));
6392	pr_err("IDTVectoring: info=%08x errcode=%08x\n",
6393	vmcs_read32(IDT_VECTORING_INFO_FIELD),
6394	vmcs_read32(IDT_VECTORING_ERROR_CODE));
6395	pr_err("TSC Offset = 0x%016llx\n", vmcs_read64(TSC_OFFSET));
6396	if (secondary_exec_control & SECONDARY_EXEC_TSC_SCALING)
6397	pr_err("TSC Multiplier = 0x%016llx\n",
6398	vmcs_read64(TSC_MULTIPLIER));
6399	if (cpu_based_exec_ctrl & CPU_BASED_TPR_SHADOW) {
6400	if (secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) {
6401	u16 status = vmcs_read16(field: GUEST_INTR_STATUS);
6402	pr_err("SVI\|RVI = %02x\|%02x ", status >> `8`, status & `0xff`);
6403	}
6404	pr_cont("TPR Threshold = 0x%02x\n", vmcs_read32(TPR_THRESHOLD));
6405	if (secondary_exec_control & SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES)
6406	pr_err("APIC-access addr = 0x%016llx ", vmcs_read64(APIC_ACCESS_ADDR));
6407	pr_cont("virt-APIC addr = 0x%016llx\n", vmcs_read64(VIRTUAL_APIC_PAGE_ADDR));
6408	}
6409	if (pin_based_exec_ctrl & PIN_BASED_POSTED_INTR)
6410	pr_err("PostedIntrVec = 0x%02x\n", vmcs_read16(POSTED_INTR_NV));
6411	if ((secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT))
6412	pr_err("EPT pointer = 0x%016llx\n", vmcs_read64(EPT_POINTER));
6413	if (secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING)
6414	pr_err("PLE Gap=%08x Window=%08x\n",
6415	vmcs_read32(PLE_GAP), vmcs_read32(PLE_WINDOW));
6416	if (secondary_exec_control & SECONDARY_EXEC_ENABLE_VPID)
6417	pr_err("Virtual processor ID = 0x%04x\n",
6418	vmcs_read16(VIRTUAL_PROCESSOR_ID));
6419	}
6420
6421	/*
6422	* The guest has exited. See if we can fix it or if we need userspace
6423	* assistance.
6424	*/
6425	static int __vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6426	{
6427	struct vcpu_vmx *vmx = to_vmx(vcpu);
6428	union vmx_exit_reason exit_reason = vmx->exit_reason;
6429	u32 vectoring_info = vmx->idt_vectoring_info;
6430	u16 exit_handler_index;
6431
6432	/*
6433	* Flush logged GPAs PML buffer, this will make dirty_bitmap more
6434	* updated. Another good is, in kvm_vm_ioctl_get_dirty_log, before
6435	* querying dirty_bitmap, we only need to kick all vcpus out of guest
6436	* mode as if vcpus is in root mode, the PML buffer must has been
6437	* flushed already. Note, PML is never enabled in hardware while
6438	* running L2.
6439	*/
6440	if (enable_pml && !is_guest_mode(vcpu))
6441	vmx_flush_pml_buffer(vcpu);
6442
6443	/*
6444	* KVM should never reach this point with a pending nested VM-Enter.
6445	* More specifically, short-circuiting VM-Entry to emulate L2 due to
6446	* invalid guest state should never happen as that means KVM knowingly
6447	* allowed a nested VM-Enter with an invalid vmcs12. More below.
6448	*/
6449	if (KVM_BUG_ON(vmx->nested.nested_run_pending, vcpu->kvm))
6450	return -EIO;
6451
6452	if (is_guest_mode(vcpu)) {
6453	/*
6454	* PML is never enabled when running L2, bail immediately if a
6455	* PML full exit occurs as something is horribly wrong.
6456	*/
6457	if (exit_reason.basic == EXIT_REASON_PML_FULL)
6458	goto unexpected_vmexit;
6459
6460	/*
6461	* The host physical addresses of some pages of guest memory
6462	* are loaded into the vmcs02 (e.g. vmcs12's Virtual APIC
6463	* Page). The CPU may write to these pages via their host
6464	* physical address while L2 is running, bypassing any
6465	* address-translation-based dirty tracking (e.g. EPT write
6466	* protection).
6467	*
6468	* Mark them dirty on every exit from L2 to prevent them from
6469	* getting out of sync with dirty tracking.
6470	*/
6471	nested_mark_vmcs12_pages_dirty(vcpu);
6472
6473	/*
6474	* Synthesize a triple fault if L2 state is invalid. In normal
6475	* operation, nested VM-Enter rejects any attempt to enter L2
6476	* with invalid state. However, those checks are skipped if
6477	* state is being stuffed via RSM or KVM_SET_NESTED_STATE. If
6478	* L2 state is invalid, it means either L1 modified SMRAM state
6479	* or userspace provided bad state. Synthesize TRIPLE_FAULT as
6480	* doing so is architecturally allowed in the RSM case, and is
6481	* the least awful solution for the userspace case without
6482	* risking false positives.
6483	*/
6484	if (vmx->emulation_required) {
6485	nested_vmx_vmexit(vcpu, EXIT_REASON_TRIPLE_FAULT, exit_intr_info: `0`, exit_qualification: `0`);
6486	return `1`;
6487	}
6488
6489	if (nested_vmx_reflect_vmexit(vcpu))
6490	return `1`;
6491	}
6492
6493	/ If guest state is invalid, start emulating. L2 is handled above. /
6494	if (vmx->emulation_required)
6495	return handle_invalid_guest_state(vcpu);
6496
6497	if (exit_reason.failed_vmentry) {
6498	dump_vmcs(vcpu);
6499	vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6500	vcpu->run->fail_entry.hardware_entry_failure_reason
6501	= exit_reason.full;
6502	vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6503	return `0`;
6504	}
6505
6506	if (unlikely(vmx->fail)) {
6507	dump_vmcs(vcpu);
6508	vcpu->run->exit_reason = KVM_EXIT_FAIL_ENTRY;
6509	vcpu->run->fail_entry.hardware_entry_failure_reason
6510	= vmcs_read32(field: VM_INSTRUCTION_ERROR);
6511	vcpu->run->fail_entry.cpu = vcpu->arch.last_vmentry_cpu;
6512	return `0`;
6513	}
6514
6515	/*
6516	* Note:
6517	* Do not try to fix EXIT_REASON_EPT_MISCONFIG if it caused by
6518	* delivery event since it indicates guest is accessing MMIO.
6519	* The vm-exit can be triggered again after return to guest that
6520	* will cause infinite loop.
6521	*/
6522	if ((vectoring_info & VECTORING_INFO_VALID_MASK) &&
6523	(exit_reason.basic != EXIT_REASON_EXCEPTION_NMI &&
6524	exit_reason.basic != EXIT_REASON_EPT_VIOLATION &&
6525	exit_reason.basic != EXIT_REASON_PML_FULL &&
6526	exit_reason.basic != EXIT_REASON_APIC_ACCESS &&
6527	exit_reason.basic != EXIT_REASON_TASK_SWITCH &&
6528	exit_reason.basic != EXIT_REASON_NOTIFY)) {
6529	int ndata = `3`;
6530
6531	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6532	vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_DELIVERY_EV;
6533	vcpu->run->internal.data[`0`] = vectoring_info;
6534	vcpu->run->internal.data[`1`] = exit_reason.full;
6535	vcpu->run->internal.data[`2`] = vmx_get_exit_qual(vcpu);
6536	if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG) {
6537	vcpu->run->internal.data[ndata++] =
6538	vmcs_read64(field: GUEST_PHYSICAL_ADDRESS);
6539	}
6540	vcpu->run->internal.data[ndata++] = vcpu->arch.last_vmentry_cpu;
6541	vcpu->run->internal.ndata = ndata;
6542	return `0`;
6543	}
6544
6545	if (unlikely(!enable_vnmi &&
6546	vmx->loaded_vmcs->soft_vnmi_blocked)) {
6547	if (!vmx_interrupt_blocked(vcpu)) {
6548	vmx->loaded_vmcs->soft_vnmi_blocked = `0`;
6549	} else if (vmx->loaded_vmcs->vnmi_blocked_time > `1000000000LL` &&
6550	vcpu->arch.nmi_pending) {
6551	/*
6552	* This CPU don't support us in finding the end of an
6553	* NMI-blocked window if the guest runs with IRQs
6554	* disabled. So we pull the trigger after 1 s of
6555	* futile waiting, but inform the user about this.
6556	*/
6557	printk(KERN_WARNING "%s: Breaking out of NMI-blocked "
6558	"state on VCPU %d after 1 s timeout\n",
6559	__func__, vcpu->vcpu_id);
6560	vmx->loaded_vmcs->soft_vnmi_blocked = `0`;
6561	}
6562	}
6563
6564	if (exit_fastpath != EXIT_FASTPATH_NONE)
6565	return `1`;
6566
6567	if (exit_reason.basic >= kvm_vmx_max_exit_handlers)
6568	goto unexpected_vmexit;
6569	#ifdef CONFIG_MITIGATION_RETPOLINE
6570	if (exit_reason.basic == EXIT_REASON_MSR_WRITE)
6571	return kvm_emulate_wrmsr(vcpu);
6572	else if (exit_reason.basic == EXIT_REASON_PREEMPTION_TIMER)
6573	return handle_preemption_timer(vcpu);
6574	else if (exit_reason.basic == EXIT_REASON_INTERRUPT_WINDOW)
6575	return handle_interrupt_window(vcpu);
6576	else if (exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
6577	return handle_external_interrupt(vcpu);
6578	else if (exit_reason.basic == EXIT_REASON_HLT)
6579	return kvm_emulate_halt(vcpu);
6580	else if (exit_reason.basic == EXIT_REASON_EPT_MISCONFIG)
6581	return handle_ept_misconfig(vcpu);
6582	#endif
6583
6584	exit_handler_index = array_index_nospec((u16)exit_reason.basic,
6585	kvm_vmx_max_exit_handlers);
6586	if (!kvm_vmx_exit_handlers[exit_handler_index])
6587	goto unexpected_vmexit;
6588
6589	return kvm_vmx_exit_handlers[exit_handler_index](vcpu);
6590
6591	unexpected_vmexit:
6592	vcpu_unimpl(vcpu, "vmx: unexpected exit reason 0x%x\n",
6593	exit_reason.full);
6594	dump_vmcs(vcpu);
6595	vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
6596	vcpu->run->internal.suberror =
6597	KVM_INTERNAL_ERROR_UNEXPECTED_EXIT_REASON;
6598	vcpu->run->internal.ndata = `2`;
6599	vcpu->run->internal.data[`0`] = exit_reason.full;
6600	vcpu->run->internal.data[`1`] = vcpu->arch.last_vmentry_cpu;
6601	return `0`;
6602	}
6603
6604	static int vmx_handle_exit(struct kvm_vcpu *vcpu, fastpath_t exit_fastpath)
6605	{
6606	int ret = __vmx_handle_exit(vcpu, exit_fastpath);
6607
6608	/*
6609	* Exit to user space when bus lock detected to inform that there is
6610	* a bus lock in guest.
6611	*/
6612	if (to_vmx(vcpu)->exit_reason.bus_lock_detected) {
6613	if (ret > `0`)
6614	vcpu->run->exit_reason = KVM_EXIT_X86_BUS_LOCK;
6615
6616	vcpu->run->flags \|= KVM_RUN_X86_BUS_LOCK;
6617	return `0`;
6618	}
6619	return ret;
6620	}
6621
6622	/*
6623	* Software based L1D cache flush which is used when microcode providing
6624	* the cache control MSR is not loaded.
6625	*
6626	* The L1D cache is 32 KiB on Nehalem and later microarchitectures, but to
6627	* flush it is required to read in 64 KiB because the replacement algorithm
6628	* is not exactly LRU. This could be sized at runtime via topology
6629	* information but as all relevant affected CPUs have 32KiB L1D cache size
6630	* there is no point in doing so.
6631	*/
6632	static noinstr void vmx_l1d_flush(struct kvm_vcpu *vcpu)
6633	{
6634	int size = PAGE_SIZE << L1D_CACHE_ORDER;
6635
6636	/*
6637	* This code is only executed when the flush mode is 'cond' or
6638	* 'always'
6639	*/
6640	if (static_branch_likely(&vmx_l1d_flush_cond)) {
6641	bool flush_l1d;
6642
6643	/*
6644	* Clear the per-vcpu flush bit, it gets set again
6645	* either from vcpu_run() or from one of the unsafe
6646	* VMEXIT handlers.
6647	*/
6648	flush_l1d = vcpu->arch.l1tf_flush_l1d;
6649	vcpu->arch.l1tf_flush_l1d = false;
6650
6651	/*
6652	* Clear the per-cpu flush bit, it gets set again from
6653	* the interrupt handlers.
6654	*/
6655	flush_l1d \|= kvm_get_cpu_l1tf_flush_l1d();
6656	kvm_clear_cpu_l1tf_flush_l1d();
6657
6658	if (!flush_l1d)
6659	return;
6660	}
6661
6662	vcpu->stat.l1d_flush++;
6663
6664	if (static_cpu_has(X86_FEATURE_FLUSH_L1D)) {
6665	native_wrmsrl(MSR_IA32_FLUSH_CMD, L1D_FLUSH);
6666	return;
6667	}
6668
6669	asm volatile(
6670	/ First ensure the pages are in the TLB /
6671	"xorl %%eax, %%eax\n"
6672	".Lpopulate_tlb:\n\t"
6673	"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6674	"addl $4096, %%eax\n\t"
6675	"cmpl %%eax, %[size]\n\t"
6676	"jne .Lpopulate_tlb\n\t"
6677	"xorl %%eax, %%eax\n\t"
6678	"cpuid\n\t"
6679	/ Now fill the cache /
6680	"xorl %%eax, %%eax\n"
6681	".Lfill_cache:\n"
6682	"movzbl (%[flush_pages], %%" _ASM_AX "), %%ecx\n\t"
6683	"addl $64, %%eax\n\t"
6684	"cmpl %%eax, %[size]\n\t"
6685	"jne .Lfill_cache\n\t"
6686	"lfence\n"
6687	:: [flush_pages] "r" (vmx_l1d_flush_pages),
6688	[size] "r" (size)
6689	: "eax", "ebx", "ecx", "edx");
6690	}
6691
6692	static void vmx_update_cr8_intercept(struct kvm_vcpu vcpu, int* tpr, int irr)
6693	{
6694	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
6695	int tpr_threshold;
6696
6697	if (is_guest_mode(vcpu) &&
6698	nested_cpu_has(vmcs12, CPU_BASED_TPR_SHADOW))
6699	return;
6700
6701	tpr_threshold = (irr == -`1` \|\| tpr < irr) ? `0` : irr;
6702	if (is_guest_mode(vcpu))
6703	to_vmx(vcpu)->nested.l1_tpr_threshold = tpr_threshold;
6704	else
6705	vmcs_write32(field: TPR_THRESHOLD, value: tpr_threshold);
6706	}
6707
6708	void vmx_set_virtual_apic_mode(struct kvm_vcpu *vcpu)
6709	{
6710	struct vcpu_vmx *vmx = to_vmx(vcpu);
6711	u32 sec_exec_control;
6712
6713	if (!lapic_in_kernel(vcpu))
6714	return;
6715
6716	if (!flexpriority_enabled &&
6717	!cpu_has_vmx_virtualize_x2apic_mode())
6718	return;
6719
6720	/ Postpone execution until vmcs01 is the current VMCS. /
6721	if (is_guest_mode(vcpu)) {
6722	vmx->nested.change_vmcs01_virtual_apic_mode = true;
6723	return;
6724	}
6725
6726	sec_exec_control = secondary_exec_controls_get(vmx);
6727	sec_exec_control &= ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES \|
6728	SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE);
6729
6730	switch (kvm_get_apic_mode(vcpu)) {
6731	case LAPIC_MODE_INVALID:
6732	WARN_ONCE(true, "Invalid local APIC state");
6733	break;
6734	case LAPIC_MODE_DISABLED:
6735	break;
6736	case LAPIC_MODE_XAPIC:
6737	if (flexpriority_enabled) {
6738	sec_exec_control \|=
6739	SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES;
6740	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6741
6742	/*
6743	* Flush the TLB, reloading the APIC access page will
6744	* only do so if its physical address has changed, but
6745	* the guest may have inserted a non-APIC mapping into
6746	* the TLB while the APIC access page was disabled.
6747	*/
6748	kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
6749	}
6750	break;
6751	case LAPIC_MODE_X2APIC:
6752	if (cpu_has_vmx_virtualize_x2apic_mode())
6753	sec_exec_control \|=
6754	SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
6755	break;
6756	}
6757	secondary_exec_controls_set(vmx, val: sec_exec_control);
6758
6759	vmx_update_msr_bitmap_x2apic(vcpu);
6760	}
6761
6762	static void vmx_set_apic_access_page_addr(struct kvm_vcpu *vcpu)
6763	{
6764	const gfn_t gfn = APIC_DEFAULT_PHYS_BASE >> PAGE_SHIFT;
6765	struct kvm *kvm = vcpu->kvm;
6766	struct kvm_memslots *slots = kvm_memslots(kvm);
6767	struct kvm_memory_slot *slot;
6768	unsigned long mmu_seq;
6769	kvm_pfn_t pfn;
6770
6771	/ Defer reload until vmcs01 is the current VMCS. /
6772	if (is_guest_mode(vcpu)) {
6773	to_vmx(vcpu)->nested.reload_vmcs01_apic_access_page = true;
6774	return;
6775	}
6776
6777	if (!(secondary_exec_controls_get(vmx: to_vmx(vcpu)) &
6778	SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES))
6779	return;
6780
6781	/*
6782	* Explicitly grab the memslot using KVM's internal slot ID to ensure
6783	* KVM doesn't unintentionally grab a userspace memslot. It _should_
6784	* be impossible for userspace to create a memslot for the APIC when
6785	* APICv is enabled, but paranoia won't hurt in this case.
6786	*/
6787	slot = id_to_memslot(slots, APIC_ACCESS_PAGE_PRIVATE_MEMSLOT);
6788	if (!slot \|\| slot->flags & KVM_MEMSLOT_INVALID)
6789	return;
6790
6791	/*
6792	* Ensure that the mmu_notifier sequence count is read before KVM
6793	* retrieves the pfn from the primary MMU. Note, the memslot is
6794	* protected by SRCU, not the mmu_notifier. Pairs with the smp_wmb()
6795	* in kvm_mmu_invalidate_end().
6796	*/
6797	mmu_seq = kvm->mmu_invalidate_seq;
6798	smp_rmb();
6799
6800	/*
6801	* No need to retry if the memslot does not exist or is invalid. KVM
6802	* controls the APIC-access page memslot, and only deletes the memslot
6803	* if APICv is permanently inhibited, i.e. the memslot won't reappear.
6804	*/
6805	pfn = gfn_to_pfn_memslot(slot, gfn);
6806	if (is_error_noslot_pfn(pfn))
6807	return;
6808
6809	read_lock(&vcpu->kvm->mmu_lock);
6810	if (mmu_invalidate_retry_gfn(kvm, mmu_seq, gfn)) {
6811	kvm_make_request(KVM_REQ_APIC_PAGE_RELOAD, vcpu);
6812	read_unlock(&vcpu->kvm->mmu_lock);
6813	goto out;
6814	}
6815
6816	vmcs_write64(field: APIC_ACCESS_ADDR, value: pfn_to_hpa(pfn));
6817	read_unlock(&vcpu->kvm->mmu_lock);
6818
6819	/*
6820	* No need for a manual TLB flush at this point, KVM has already done a
6821	* flush if there were SPTEs pointing at the previous page.
6822	*/
6823	out:
6824	/*
6825	* Do not pin apic access page in memory, the MMU notifier
6826	* will call us again if it is migrated or swapped out.
6827	*/
6828	kvm_release_pfn_clean(pfn);
6829	}
6830
6831	static void vmx_hwapic_isr_update(int max_isr)
6832	{
6833	u16 status;
6834	u8 old;
6835
6836	if (max_isr == -`1`)
6837	max_isr = `0`;
6838
6839	status = vmcs_read16(field: GUEST_INTR_STATUS);
6840	old = status >> `8`;
6841	if (max_isr != old) {
6842	status &= `0xff`;
6843	status \|= max_isr << `8`;
6844	vmcs_write16(field: GUEST_INTR_STATUS, value: status);
6845	}
6846	}
6847
6848	static void vmx_set_rvi(int vector)
6849	{
6850	u16 status;
6851	u8 old;
6852
6853	if (vector == -`1`)
6854	vector = `0`;
6855
6856	status = vmcs_read16(field: GUEST_INTR_STATUS);
6857	old = (u8)status & `0xff`;
6858	if ((u8)vector != old) {
6859	status &= ~`0xff`;
6860	status \|= (u8)vector;
6861	vmcs_write16(field: GUEST_INTR_STATUS, value: status);
6862	}
6863	}
6864
/*
 * Propagate the highest pending interrupt (@max_irr) to RVI via
 * vmx_set_rvi(), but only when the vCPU is not running a nested guest.
 */
static void vmx_hwapic_irr_update(struct kvm_vcpu *vcpu, int max_irr)
{
	/*
	 * When running L2, updating RVI is only relevant when
	 * vmcs12 virtual-interrupt-delivery enabled.
	 * However, it can be enabled only when L1 also
	 * intercepts external-interrupts and in that case
	 * we should not update vmcs02 RVI but instead intercept
	 * interrupt. Therefore, do nothing when running L2.
	 */
	if (!is_guest_mode(vcpu))
		vmx_set_rvi(max_irr);
}
6878
6879	static int vmx_sync_pir_to_irr(struct kvm_vcpu *vcpu)
6880	{
6881	struct vcpu_vmx *vmx = to_vmx(vcpu);
6882	int max_irr;
6883	bool got_posted_interrupt;
6884
6885	if (KVM_BUG_ON(!enable_apicv, vcpu->kvm))
6886	return -EIO;
6887
6888	if (pi_test_on(pi_desc: &vmx->pi_desc)) {
6889	pi_clear_on(pi_desc: &vmx->pi_desc);
6890	/*
6891	* IOMMU can write to PID.ON, so the barrier matters even on UP.
6892	* But on x86 this is just a compiler barrier anyway.
6893	*/
6894	smp_mb__after_atomic();
6895	got_posted_interrupt =
6896	kvm_apic_update_irr(vcpu, pir: vmx->pi_desc.pir, max_irr: &max_irr);
6897	} else {
6898	max_irr = kvm_lapic_find_highest_irr(vcpu);
6899	got_posted_interrupt = false;
6900	}
6901
6902	/*
6903	* Newly recognized interrupts are injected via either virtual interrupt
6904	* delivery (RVI) or KVM_REQ_EVENT. Virtual interrupt delivery is
6905	* disabled in two cases:
6906	*
6907	* 1) If L2 is running and the vCPU has a new pending interrupt. If L1
6908	* wants to exit on interrupts, KVM_REQ_EVENT is needed to synthesize a
6909	* VM-Exit to L1. If L1 doesn't want to exit, the interrupt is injected
6910	* into L2, but KVM doesn't use virtual interrupt delivery to inject
6911	* interrupts into L2, and so KVM_REQ_EVENT is again needed.
6912	*
6913	* 2) If APICv is disabled for this vCPU, assigned devices may still
6914	* attempt to post interrupts. The posted interrupt vector will cause
6915	* a VM-Exit and the subsequent entry will call sync_pir_to_irr.
6916	*/
6917	if (!is_guest_mode(vcpu) && kvm_vcpu_apicv_active(vcpu))
6918	vmx_set_rvi(vector: max_irr);
6919	else if (got_posted_interrupt)
6920	kvm_make_request(KVM_REQ_EVENT, vcpu);
6921
6922	return max_irr;
6923	}
6924
6925	static void vmx_load_eoi_exitmap(struct kvm_vcpu vcpu, u64 eoi_exit_bitmap)
6926	{
6927	if (!kvm_vcpu_apicv_active(vcpu))
6928	return;
6929
6930	vmcs_write64(field: EOI_EXIT_BITMAP0, value: eoi_exit_bitmap[`0`]);
6931	vmcs_write64(field: EOI_EXIT_BITMAP1, value: eoi_exit_bitmap[`1`]);
6932	vmcs_write64(field: EOI_EXIT_BITMAP2, value: eoi_exit_bitmap[`2`]);
6933	vmcs_write64(field: EOI_EXIT_BITMAP3, value: eoi_exit_bitmap[`3`]);
6934	}
6935
6936	static void vmx_apicv_pre_state_restore(struct kvm_vcpu *vcpu)
6937	{
6938	struct vcpu_vmx *vmx = to_vmx(vcpu);
6939
6940	pi_clear_on(pi_desc: &vmx->pi_desc);
6941	memset(vmx->pi_desc.pir, `0`, sizeof(vmx->pi_desc.pir));
6942	}
6943
/*
 * Host interrupt/NMI dispatch helpers used by the IRQs-off exit handlers
 * below.  Only declared here; the definitions live outside this section.
 */
void vmx_do_interrupt_irqoff(unsigned long entry);
void vmx_do_nmi_irqoff(void);
6946
6947	static void handle_nm_fault_irqoff(struct kvm_vcpu *vcpu)
6948	{
6949	/*
6950	* Save xfd_err to guest_fpu before interrupt is enabled, so the
6951	* MSR value is not clobbered by the host activity before the guest
6952	* has chance to consume it.
6953	*
6954	* Do not blindly read xfd_err here, since this exception might
6955	* be caused by L1 interception on a platform which doesn't
6956	* support xfd at all.
6957	*
6958	* Do it conditionally upon guest_fpu::xfd. xfd_err matters
6959	* only when xfd contains a non-zero value.
6960	*
6961	* Queuing exception is done in vmx_handle_exit. See comment there.
6962	*/
6963	if (vcpu->arch.guest_fpu.fpstate->xfd)
6964	rdmsrl(MSR_IA32_XFD_ERR, vcpu->arch.guest_fpu.xfd_err);
6965	}
6966
6967	static void handle_exception_irqoff(struct vcpu_vmx *vmx)
6968	{
6969	u32 intr_info = vmx_get_intr_info(vcpu: &vmx->vcpu);
6970
6971	/ if exit due to PF check for async PF /
6972	if (is_page_fault(intr_info))
6973	vmx->vcpu.arch.apf.host_apf_flags = kvm_read_and_reset_apf_flags();
6974	/ if exit due to NM, handle before interrupts are enabled /
6975	else if (is_nm_fault(intr_info))
6976	handle_nm_fault_irqoff(vcpu: &vmx->vcpu);
6977	/ Handle machine checks before interrupts are enabled /
6978	else if (is_machine_check(intr_info))
6979	kvm_machine_check();
6980	}
6981
6982	static void handle_external_interrupt_irqoff(struct kvm_vcpu *vcpu)
6983	{
6984	u32 intr_info = vmx_get_intr_info(vcpu);
6985	unsigned int vector = intr_info & INTR_INFO_VECTOR_MASK;
6986
6987	if (KVM_BUG(!is_external_intr(intr_info), vcpu->kvm,
6988	"unexpected VM-Exit interrupt info: 0x%x", intr_info))
6989	return;
6990
6991	kvm_before_interrupt(vcpu, intr: KVM_HANDLING_IRQ);
6992	if (cpu_feature_enabled(X86_FEATURE_FRED))
6993	fred_entry_from_kvm(EVENT_TYPE_EXTINT, vector);
6994	else
6995	vmx_do_interrupt_irqoff(entry: gate_offset(g: (gate_desc *)host_idt_base + vector));
6996	kvm_after_interrupt(vcpu);
6997
6998	vcpu->arch.at_instruction_boundary = true;
6999	}
7000
7001	static void vmx_handle_exit_irqoff(struct kvm_vcpu *vcpu)
7002	{
7003	struct vcpu_vmx *vmx = to_vmx(vcpu);
7004
7005	if (vmx->emulation_required)
7006	return;
7007
7008	if (vmx->exit_reason.basic == EXIT_REASON_EXTERNAL_INTERRUPT)
7009	handle_external_interrupt_irqoff(vcpu);
7010	else if (vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI)
7011	handle_exception_irqoff(vmx);
7012	}
7013
7014	/*
7015	* The kvm parameter can be NULL (module initialization, or invocation before
7016	* VM creation). Be sure to check the kvm parameter before using it.
7017	*/
7018	static bool vmx_has_emulated_msr(struct kvm *kvm, u32 index)
7019	{
7020	switch (index) {
7021	case MSR_IA32_SMBASE:
7022	if (!IS_ENABLED(CONFIG_KVM_SMM))
7023	return false;
7024	/*
7025	* We cannot do SMM unless we can run the guest in big
7026	* real mode.
7027	*/
7028	return enable_unrestricted_guest \|\| emulate_invalid_guest_state;
7029	case KVM_FIRST_EMULATED_VMX_MSR ... KVM_LAST_EMULATED_VMX_MSR:
7030	return nested;
7031	case MSR_AMD64_VIRT_SPEC_CTRL:
7032	case MSR_AMD64_TSC_RATIO:
7033	/ This is AMD only. /
7034	return false;
7035	default:
7036	return true;
7037	}
7038	}
7039
7040	static void vmx_recover_nmi_blocking(struct vcpu_vmx *vmx)
7041	{
7042	u32 exit_intr_info;
7043	bool unblock_nmi;
7044	u8 vector;
7045	bool idtv_info_valid;
7046
7047	idtv_info_valid = vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7048
7049	if (enable_vnmi) {
7050	if (vmx->loaded_vmcs->nmi_known_unmasked)
7051	return;
7052
7053	exit_intr_info = vmx_get_intr_info(vcpu: &vmx->vcpu);
7054	unblock_nmi = (exit_intr_info & INTR_INFO_UNBLOCK_NMI) != `0`;
7055	vector = exit_intr_info & INTR_INFO_VECTOR_MASK;
7056	/*
7057	* SDM 3: 27.7.1.2 (September 2008)
7058	* Re-set bit "block by NMI" before VM entry if vmexit caused by
7059	* a guest IRET fault.
7060	* SDM 3: 23.2.2 (September 2008)
7061	* Bit 12 is undefined in any of the following cases:
7062	* If the VM exit sets the valid bit in the IDT-vectoring
7063	* information field.
7064	* If the VM exit is due to a double fault.
7065	*/
7066	if ((exit_intr_info & INTR_INFO_VALID_MASK) && unblock_nmi &&
7067	vector != DF_VECTOR && !idtv_info_valid)
7068	vmcs_set_bits(field: GUEST_INTERRUPTIBILITY_INFO,
7069	GUEST_INTR_STATE_NMI);
7070	else
7071	vmx->loaded_vmcs->nmi_known_unmasked =
7072	!(vmcs_read32(field: GUEST_INTERRUPTIBILITY_INFO)
7073	& GUEST_INTR_STATE_NMI);
7074	} else if (unlikely(vmx->loaded_vmcs->soft_vnmi_blocked))
7075	vmx->loaded_vmcs->vnmi_blocked_time +=
7076	ktime_to_ns(ktime_sub(ktime_get(),
7077	vmx->loaded_vmcs->entry_time));
7078	}
7079
7080	static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
7081	u32 idt_vectoring_info,
7082	int instr_len_field,
7083	int error_code_field)
7084	{
7085	u8 vector;
7086	int type;
7087	bool idtv_info_valid;
7088
7089	idtv_info_valid = idt_vectoring_info & VECTORING_INFO_VALID_MASK;
7090
7091	vcpu->arch.nmi_injected = false;
7092	kvm_clear_exception_queue(vcpu);
7093	kvm_clear_interrupt_queue(vcpu);
7094
7095	if (!idtv_info_valid)
7096	return;
7097
7098	kvm_make_request(KVM_REQ_EVENT, vcpu);
7099
7100	vector = idt_vectoring_info & VECTORING_INFO_VECTOR_MASK;
7101	type = idt_vectoring_info & VECTORING_INFO_TYPE_MASK;
7102
7103	switch (type) {
7104	case INTR_TYPE_NMI_INTR:
7105	vcpu->arch.nmi_injected = true;
7106	/*
7107	* SDM 3: 27.7.1.2 (September 2008)
7108	* Clear bit "block by NMI" before VM entry if a NMI
7109	* delivery faulted.
7110	*/
7111	vmx_set_nmi_mask(vcpu, masked: false);
7112	break;
7113	case INTR_TYPE_SOFT_EXCEPTION:
7114	vcpu->arch.event_exit_inst_len = vmcs_read32(field: instr_len_field);
7115	fallthrough;
7116	case INTR_TYPE_HARD_EXCEPTION:
7117	if (idt_vectoring_info & VECTORING_INFO_DELIVER_CODE_MASK) {
7118	u32 err = vmcs_read32(field: error_code_field);
7119	kvm_requeue_exception_e(vcpu, nr: vector, error_code: err);
7120	} else
7121	kvm_requeue_exception(vcpu, nr: vector);
7122	break;
7123	case INTR_TYPE_SOFT_INTR:
7124	vcpu->arch.event_exit_inst_len = vmcs_read32(field: instr_len_field);
7125	fallthrough;
7126	case INTR_TYPE_EXT_INTR:
7127	kvm_queue_interrupt(vcpu, vector, soft: type == INTR_TYPE_SOFT_INTR);
7128	break;
7129	default:
7130	break;
7131	}
7132	}
7133
7134	static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
7135	{
7136	__vmx_complete_interrupts(vcpu: &vmx->vcpu, idt_vectoring_info: vmx->idt_vectoring_info,
7137	instr_len_field: VM_EXIT_INSTRUCTION_LEN,
7138	error_code_field: IDT_VECTORING_ERROR_CODE);
7139	}
7140
7141	static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
7142	{
7143	__vmx_complete_interrupts(vcpu,
7144	idt_vectoring_info: vmcs_read32(field: VM_ENTRY_INTR_INFO_FIELD),
7145	instr_len_field: VM_ENTRY_INSTRUCTION_LEN,
7146	error_code_field: VM_ENTRY_EXCEPTION_ERROR_CODE);
7147
7148	vmcs_write32(field: VM_ENTRY_INTR_INFO_FIELD, value: `0`);
7149	}
7150
7151	static void atomic_switch_perf_msrs(struct vcpu_vmx *vmx)
7152	{
7153	int i, nr_msrs;
7154	struct perf_guest_switch_msr *msrs;
7155	struct kvm_pmu *pmu = vcpu_to_pmu(&vmx->vcpu);
7156
7157	pmu->host_cross_mapped_mask = `0`;
7158	if (pmu->pebs_enable & pmu->global_ctrl)
7159	intel_pmu_cross_mapped_check(pmu);
7160
7161	/ Note, nr_msrs may be garbage if perf_guest_get_msrs() returns NULL. /
7162	msrs = perf_guest_get_msrs(nr: &nr_msrs, data: (void *)pmu);
7163	if (!msrs)
7164	return;
7165
7166	for (i = `0`; i < nr_msrs; i++)
7167	if (msrs[i].host == msrs[i].guest)
7168	clear_atomic_switch_msr(vmx, msr: msrs[i].msr);
7169	else
7170	add_atomic_switch_msr(vmx, msr: msrs[i].msr, guest_val: msrs[i].guest,
7171	host_val: msrs[i].host, entry_only: false);
7172	}
7173
7174	static void vmx_update_hv_timer(struct kvm_vcpu *vcpu, bool force_immediate_exit)
7175	{
7176	struct vcpu_vmx *vmx = to_vmx(vcpu);
7177	u64 tscl;
7178	u32 delta_tsc;
7179
7180	if (force_immediate_exit) {
7181	vmcs_write32(field: VMX_PREEMPTION_TIMER_VALUE, value: `0`);
7182	vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7183	} else if (vmx->hv_deadline_tsc != -`1`) {
7184	tscl = rdtsc();
7185	if (vmx->hv_deadline_tsc > tscl)
7186	/ set_hv_timer ensures the delta fits in 32-bits /
7187	delta_tsc = (u32)((vmx->hv_deadline_tsc - tscl) >>
7188	cpu_preemption_timer_multi);
7189	else
7190	delta_tsc = `0`;
7191
7192	vmcs_write32(field: VMX_PREEMPTION_TIMER_VALUE, value: delta_tsc);
7193	vmx->loaded_vmcs->hv_timer_soft_disabled = false;
7194	} else if (!vmx->loaded_vmcs->hv_timer_soft_disabled) {
7195	vmcs_write32(field: VMX_PREEMPTION_TIMER_VALUE, value: -`1`);
7196	vmx->loaded_vmcs->hv_timer_soft_disabled = true;
7197	}
7198	}
7199
7200	void noinstr vmx_update_host_rsp(struct vcpu_vmx vmx, unsigned* long host_rsp)
7201	{
7202	if (unlikely(host_rsp != vmx->loaded_vmcs->host_state.rsp)) {
7203	vmx->loaded_vmcs->host_state.rsp = host_rsp;
7204	vmcs_writel(field: HOST_RSP, value: host_rsp);
7205	}
7206	}
7207
7208	void noinstr vmx_spec_ctrl_restore_host(struct vcpu_vmx *vmx,
7209	unsigned int flags)
7210	{
7211	u64 hostval = this_cpu_read(x86_spec_ctrl_current);
7212
7213	if (!cpu_feature_enabled(X86_FEATURE_MSR_SPEC_CTRL))
7214	return;
7215
7216	if (flags & VMX_RUN_SAVE_SPEC_CTRL)
7217	vmx->spec_ctrl = __rdmsr(MSR_IA32_SPEC_CTRL);
7218
7219	/*
7220	* If the guest/host SPEC_CTRL values differ, restore the host value.
7221	*
7222	* For legacy IBRS, the IBRS bit always needs to be written after
7223	* transitioning from a less privileged predictor mode, regardless of
7224	* whether the guest/host values differ.
7225	*/
7226	if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) \|\|
7227	vmx->spec_ctrl != hostval)
7228	native_wrmsrl(MSR_IA32_SPEC_CTRL, hostval);
7229
7230	barrier_nospec();
7231	}
7232
7233	static fastpath_t vmx_exit_handlers_fastpath(struct kvm_vcpu *vcpu,
7234	bool force_immediate_exit)
7235	{
7236	/*
7237	* If L2 is active, some VMX preemption timer exits can be handled in
7238	* the fastpath even, all other exits must use the slow path.
7239	*/
7240	if (is_guest_mode(vcpu) &&
7241	to_vmx(vcpu)->exit_reason.basic != EXIT_REASON_PREEMPTION_TIMER)
7242	return EXIT_FASTPATH_NONE;
7243
7244	switch (to_vmx(vcpu)->exit_reason.basic) {
7245	case EXIT_REASON_MSR_WRITE:
7246	return handle_fastpath_set_msr_irqoff(vcpu);
7247	case EXIT_REASON_PREEMPTION_TIMER:
7248	return handle_fastpath_preemption_timer(vcpu, force_immediate_exit);
7249	default:
7250	return EXIT_FASTPATH_NONE;
7251	}
7252	}
7253
7254	static noinstr void vmx_vcpu_enter_exit(struct kvm_vcpu *vcpu,
7255	unsigned int flags)
7256	{
7257	struct vcpu_vmx *vmx = to_vmx(vcpu);
7258
7259	guest_state_enter_irqoff();
7260
7261	/*
7262	* L1D Flush includes CPU buffer clear to mitigate MDS, but VERW
7263	* mitigation for MDS is done late in VMentry and is still
7264	* executed in spite of L1D Flush. This is because an extra VERW
7265	* should not matter much after the big hammer L1D Flush.
7266	*/
7267	if (static_branch_unlikely(&vmx_l1d_should_flush))
7268	vmx_l1d_flush(vcpu);
7269	else if (static_branch_unlikely(&mmio_stale_data_clear) &&
7270	kvm_arch_has_assigned_device(kvm: vcpu->kvm))
7271	mds_clear_cpu_buffers();
7272
7273	vmx_disable_fb_clear(vmx);
7274
7275	if (vcpu->arch.cr2 != native_read_cr2())
7276	native_write_cr2(val: vcpu->arch.cr2);
7277
7278	vmx->fail = __vmx_vcpu_run(vmx, regs: (unsigned long *)&vcpu->arch.regs,
7279	flags);
7280
7281	vcpu->arch.cr2 = native_read_cr2();
7282	vcpu->arch.regs_avail &= ~VMX_REGS_LAZY_LOAD_SET;
7283
7284	vmx->idt_vectoring_info = `0`;
7285
7286	vmx_enable_fb_clear(vmx);
7287
7288	if (unlikely(vmx->fail)) {
7289	vmx->exit_reason.full = `0xdead`;
7290	goto out;
7291	}
7292
7293	vmx->exit_reason.full = vmcs_read32(field: VM_EXIT_REASON);
7294	if (likely(!vmx->exit_reason.failed_vmentry))
7295	vmx->idt_vectoring_info = vmcs_read32(field: IDT_VECTORING_INFO_FIELD);
7296
7297	if ((u16)vmx->exit_reason.basic == EXIT_REASON_EXCEPTION_NMI &&
7298	is_nmi(intr_info: vmx_get_intr_info(vcpu))) {
7299	kvm_before_interrupt(vcpu, intr: KVM_HANDLING_NMI);
7300	if (cpu_feature_enabled(X86_FEATURE_FRED))
7301	fred_entry_from_kvm(EVENT_TYPE_NMI, NMI_VECTOR);
7302	else
7303	vmx_do_nmi_irqoff();
7304	kvm_after_interrupt(vcpu);
7305	}
7306
7307	out:
7308	guest_state_exit_irqoff();
7309	}
7310
7311	static fastpath_t vmx_vcpu_run(struct kvm_vcpu *vcpu, bool force_immediate_exit)
7312	{
7313	struct vcpu_vmx *vmx = to_vmx(vcpu);
7314	unsigned long cr3, cr4;
7315
7316	/ Record the guest's net vcpu time for enforced NMI injections. /
7317	if (unlikely(!enable_vnmi &&
7318	vmx->loaded_vmcs->soft_vnmi_blocked))
7319	vmx->loaded_vmcs->entry_time = ktime_get();
7320
7321	/*
7322	* Don't enter VMX if guest state is invalid, let the exit handler
7323	* start emulation until we arrive back to a valid state. Synthesize a
7324	* consistency check VM-Exit due to invalid guest state and bail.
7325	*/
7326	if (unlikely(vmx->emulation_required)) {
7327	vmx->fail = `0`;
7328
7329	vmx->exit_reason.full = EXIT_REASON_INVALID_STATE;
7330	vmx->exit_reason.failed_vmentry = `1`;
7331	kvm_register_mark_available(vcpu, reg: VCPU_EXREG_EXIT_INFO_1);
7332	vmx->exit_qualification = ENTRY_FAIL_DEFAULT;
7333	kvm_register_mark_available(vcpu, reg: VCPU_EXREG_EXIT_INFO_2);
7334	vmx->exit_intr_info = `0`;
7335	return EXIT_FASTPATH_NONE;
7336	}
7337
7338	trace_kvm_entry(vcpu, force_immediate_exit);
7339
7340	if (vmx->ple_window_dirty) {
7341	vmx->ple_window_dirty = false;
7342	vmcs_write32(field: PLE_WINDOW, value: vmx->ple_window);
7343	}
7344
7345	/*
7346	* We did this in prepare_switch_to_guest, because it needs to
7347	* be within srcu_read_lock.
7348	*/
7349	WARN_ON_ONCE(vmx->nested.need_vmcs12_to_shadow_sync);
7350
7351	if (kvm_register_is_dirty(vcpu, reg: VCPU_REGS_RSP))
7352	vmcs_writel(field: GUEST_RSP, value: vcpu->arch.regs[VCPU_REGS_RSP]);
7353	if (kvm_register_is_dirty(vcpu, reg: VCPU_REGS_RIP))
7354	vmcs_writel(field: GUEST_RIP, value: vcpu->arch.regs[VCPU_REGS_RIP]);
7355	vcpu->arch.regs_dirty = `0`;
7356
7357	/*
7358	* Refresh vmcs.HOST_CR3 if necessary. This must be done immediately
7359	* prior to VM-Enter, as the kernel may load a new ASID (PCID) any time
7360	* it switches back to the current->mm, which can occur in KVM context
7361	* when switching to a temporary mm to patch kernel code, e.g. if KVM
7362	* toggles a static key while handling a VM-Exit.
7363	*/
7364	cr3 = __get_current_cr3_fast();
7365	if (unlikely(cr3 != vmx->loaded_vmcs->host_state.cr3)) {
7366	vmcs_writel(field: HOST_CR3, value: cr3);
7367	vmx->loaded_vmcs->host_state.cr3 = cr3;
7368	}
7369
7370	cr4 = cr4_read_shadow();
7371	if (unlikely(cr4 != vmx->loaded_vmcs->host_state.cr4)) {
7372	vmcs_writel(field: HOST_CR4, value: cr4);
7373	vmx->loaded_vmcs->host_state.cr4 = cr4;
7374	}
7375
7376	/ When KVM_DEBUGREG_WONT_EXIT, dr6 is accessible in guest. /
7377	if (unlikely(vcpu->arch.switch_db_regs & KVM_DEBUGREG_WONT_EXIT))
7378	set_debugreg(val: vcpu->arch.dr6, reg: `6`);
7379
7380	/ When single-stepping over STI and MOV SS, we must clear the*
7381	* corresponding interruptibility bits in the guest state. Otherwise
7382	* vmentry fails as it then expects bit 14 (BS) in pending debug
7383	* exceptions being set, but that's not correct for the guest debugging
7384	* case. */
7385	if (vcpu->guest_debug & KVM_GUESTDBG_SINGLESTEP)
7386	vmx_set_interrupt_shadow(vcpu, mask: `0`);
7387
7388	kvm_load_guest_xsave_state(vcpu);
7389
7390	pt_guest_enter(vmx);
7391
7392	atomic_switch_perf_msrs(vmx);
7393	if (intel_pmu_lbr_is_enabled(vcpu))
7394	vmx_passthrough_lbr_msrs(vcpu);
7395
7396	if (enable_preemption_timer)
7397	vmx_update_hv_timer(vcpu, force_immediate_exit);
7398	else if (force_immediate_exit)
7399	smp_send_reschedule(vcpu->cpu);
7400
7401	kvm_wait_lapic_expire(vcpu);
7402
7403	/ The actual VMENTER/EXIT is in the .noinstr.text section. /
7404	vmx_vcpu_enter_exit(vcpu, flags: __vmx_vcpu_run_flags(vmx));
7405
7406	/ All fields are clean at this point /
7407	if (kvm_is_using_evmcs()) {
7408	current_evmcs->hv_clean_fields \|=
7409	HV_VMX_ENLIGHTENED_CLEAN_FIELD_ALL;
7410
7411	current_evmcs->hv_vp_id = kvm_hv_get_vpindex(vcpu);
7412	}
7413
7414	/ MSR_IA32_DEBUGCTLMSR is zeroed on vmexit. Restore it if needed /
7415	if (vmx->host_debugctlmsr)
7416	update_debugctlmsr(debugctlmsr: vmx->host_debugctlmsr);
7417
7418	#ifndef CONFIG_X86_64
7419	/*
7420	* The sysexit path does not restore ds/es, so we must set them to
7421	* a reasonable value ourselves.
7422	*
7423	* We can't defer this to vmx_prepare_switch_to_host() since that
7424	* function may be executed in interrupt context, which saves and
7425	* restore segments around it, nullifying its effect.
7426	*/
7427	loadsegment(ds, __USER_DS);
7428	loadsegment(es, __USER_DS);
7429	#endif
7430
7431	pt_guest_exit(vmx);
7432
7433	kvm_load_host_xsave_state(vcpu);
7434
7435	if (is_guest_mode(vcpu)) {
7436	/*
7437	* Track VMLAUNCH/VMRESUME that have made past guest state
7438	* checking.
7439	*/
7440	if (vmx->nested.nested_run_pending &&
7441	!vmx->exit_reason.failed_vmentry)
7442	++vcpu->stat.nested_run;
7443
7444	vmx->nested.nested_run_pending = `0`;
7445	}
7446
7447	if (unlikely(vmx->fail))
7448	return EXIT_FASTPATH_NONE;
7449
7450	if (unlikely((u16)vmx->exit_reason.basic == EXIT_REASON_MCE_DURING_VMENTRY))
7451	kvm_machine_check();
7452
7453	trace_kvm_exit(vcpu, KVM_ISA_VMX);
7454
7455	if (unlikely(vmx->exit_reason.failed_vmentry))
7456	return EXIT_FASTPATH_NONE;
7457
7458	vmx->loaded_vmcs->launched = `1`;
7459
7460	vmx_recover_nmi_blocking(vmx);
7461	vmx_complete_interrupts(vmx);
7462
7463	return vmx_exit_handlers_fastpath(vcpu, force_immediate_exit);
7464	}
7465
7466	static void vmx_vcpu_free(struct kvm_vcpu *vcpu)
7467	{
7468	struct vcpu_vmx *vmx = to_vmx(vcpu);
7469
7470	if (enable_pml)
7471	vmx_destroy_pml_buffer(vmx);
7472	free_vpid(vpid: vmx->vpid);
7473	nested_vmx_free_vcpu(vcpu);
7474	free_loaded_vmcs(loaded_vmcs: vmx->loaded_vmcs);
7475	}
7476
7477	static int vmx_vcpu_create(struct kvm_vcpu *vcpu)
7478	{
7479	struct vmx_uret_msr *tsx_ctrl;
7480	struct vcpu_vmx *vmx;
7481	int i, err;
7482
7483	BUILD_BUG_ON(offsetof(struct vcpu_vmx, vcpu) != `0`);
7484	vmx = to_vmx(vcpu);
7485
7486	INIT_LIST_HEAD(list: &vmx->pi_wakeup_list);
7487
7488	err = -ENOMEM;
7489
7490	vmx->vpid = allocate_vpid();
7491
7492	/*
7493	* If PML is turned on, failure on enabling PML just results in failure
7494	* of creating the vcpu, therefore we can simplify PML logic (by
7495	* avoiding dealing with cases, such as enabling PML partially on vcpus
7496	* for the guest), etc.
7497	*/
7498	if (enable_pml) {
7499	vmx->pml_pg = alloc_page(GFP_KERNEL_ACCOUNT \| __GFP_ZERO);
7500	if (!vmx->pml_pg)
7501	goto free_vpid;
7502	}
7503
7504	for (i = `0`; i < kvm_nr_uret_msrs; ++i)
7505	vmx->guest_uret_msrs[i].mask = -`1ull`;
7506	if (boot_cpu_has(X86_FEATURE_RTM)) {
7507	/*
7508	* TSX_CTRL_CPUID_CLEAR is handled in the CPUID interception.
7509	* Keep the host value unchanged to avoid changing CPUID bits
7510	* under the host kernel's feet.
7511	*/
7512	tsx_ctrl = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7513	if (tsx_ctrl)
7514	tsx_ctrl->mask = ~(u64)TSX_CTRL_CPUID_CLEAR;
7515	}
7516
7517	err = alloc_loaded_vmcs(loaded_vmcs: &vmx->vmcs01);
7518	if (err < `0`)
7519	goto free_pml;
7520
7521	/*
7522	* Use Hyper-V 'Enlightened MSR Bitmap' feature when KVM runs as a
7523	* nested (L1) hypervisor and Hyper-V in L0 supports it. Enable the
7524	* feature only for vmcs01, KVM currently isn't equipped to realize any
7525	* performance benefits from enabling it for vmcs02.
7526	*/
7527	if (kvm_is_using_evmcs() &&
7528	(ms_hyperv.nested_features & HV_X64_NESTED_MSR_BITMAP)) {
7529	struct hv_enlightened_vmcs evmcs = (void* *)vmx->vmcs01.vmcs;
7530
7531	evmcs->hv_enlightenments_control.msr_bitmap = `1`;
7532	}
7533
7534	/ The MSR bitmap starts with all ones /
7535	bitmap_fill(dst: vmx->shadow_msr_intercept.read, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7536	bitmap_fill(dst: vmx->shadow_msr_intercept.write, MAX_POSSIBLE_PASSTHROUGH_MSRS);
7537
7538	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_TSC, MSR_TYPE_R);
7539	#ifdef CONFIG_X86_64
7540	vmx_disable_intercept_for_msr(vcpu, MSR_FS_BASE, MSR_TYPE_RW);
7541	vmx_disable_intercept_for_msr(vcpu, MSR_GS_BASE, MSR_TYPE_RW);
7542	vmx_disable_intercept_for_msr(vcpu, MSR_KERNEL_GS_BASE, MSR_TYPE_RW);
7543	#endif
7544	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_CS, MSR_TYPE_RW);
7545	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_ESP, MSR_TYPE_RW);
7546	vmx_disable_intercept_for_msr(vcpu, MSR_IA32_SYSENTER_EIP, MSR_TYPE_RW);
7547	if (kvm_cstate_in_guest(kvm: vcpu->kvm)) {
7548	vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C1_RES, MSR_TYPE_R);
7549	vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C3_RESIDENCY, MSR_TYPE_R);
7550	vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C6_RESIDENCY, MSR_TYPE_R);
7551	vmx_disable_intercept_for_msr(vcpu, MSR_CORE_C7_RESIDENCY, MSR_TYPE_R);
7552	}
7553
7554	vmx->loaded_vmcs = &vmx->vmcs01;
7555
7556	if (cpu_need_virtualize_apic_accesses(vcpu)) {
7557	err = kvm_alloc_apic_access_page(kvm: vcpu->kvm);
7558	if (err)
7559	goto free_vmcs;
7560	}
7561
7562	if (enable_ept && !enable_unrestricted_guest) {
7563	err = init_rmode_identity_map(kvm: vcpu->kvm);
7564	if (err)
7565	goto free_vmcs;
7566	}
7567
7568	if (vmx_can_use_ipiv(vcpu))
7569	WRITE_ONCE(to_kvm_vmx(vcpu->kvm)->pid_table[vcpu->vcpu_id],
7570	__pa(&vmx->pi_desc) \| PID_TABLE_ENTRY_VALID);
7571
7572	return `0`;
7573
7574	free_vmcs:
7575	free_loaded_vmcs(loaded_vmcs: vmx->loaded_vmcs);
7576	free_pml:
7577	vmx_destroy_pml_buffer(vmx);
7578	free_vpid:
7579	free_vpid(vpid: vmx->vpid);
7580	return err;
7581	}
7582
/* One-time warnings printed by vmx_vm_init() on L1TF-affected hosts. */
#define L1TF_MSG_SMT "L1TF CPU bug present and SMT on, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
#define L1TF_MSG_L1D "L1TF CPU bug present and virtualization mitigation disabled, data leak possible. See CVE-2018-3646 and https://www.kernel.org/doc/html/latest/admin-guide/hw-vuln/l1tf.html for details.\n"
7585
7586	static int vmx_vm_init(struct kvm *kvm)
7587	{
7588	if (!ple_gap)
7589	kvm->arch.pause_in_guest = true;
7590
7591	if (boot_cpu_has(X86_BUG_L1TF) && enable_ept) {
7592	switch (l1tf_mitigation) {
7593	case L1TF_MITIGATION_OFF:
7594	case L1TF_MITIGATION_FLUSH_NOWARN:
7595	/ 'I explicitly don't care' is set /
7596	break;
7597	case L1TF_MITIGATION_FLUSH:
7598	case L1TF_MITIGATION_FLUSH_NOSMT:
7599	case L1TF_MITIGATION_FULL:
7600	/*
7601	* Warn upon starting the first VM in a potentially
7602	* insecure environment.
7603	*/
7604	if (sched_smt_active())
7605	pr_warn_once(L1TF_MSG_SMT);
7606	if (l1tf_vmx_mitigation == VMENTER_L1D_FLUSH_NEVER)
7607	pr_warn_once(L1TF_MSG_L1D);
7608	break;
7609	case L1TF_MITIGATION_FULL_FORCE:
7610	/ Flush is enforced /
7611	break;
7612	}
7613	}
7614	return `0`;
7615	}
7616
7617	static u8 vmx_get_mt_mask(struct kvm_vcpu *vcpu, gfn_t gfn, bool is_mmio)
7618	{
7619	/ We wanted to honor guest CD/MTRR/PAT, but doing so could result in*
7620	* memory aliases with conflicting memory types and sometimes MCEs.
7621	* We have to be careful as to what are honored and when.
7622	*
7623	* For MMIO, guest CD/MTRR are ignored. The EPT memory type is set to
7624	* UC. The effective memory type is UC or WC depending on guest PAT.
7625	* This was historically the source of MCEs and we want to be
7626	* conservative.
7627	*
7628	* When there is no need to deal with noncoherent DMA (e.g., no VT-d
7629	* or VT-d has snoop control), guest CD/MTRR/PAT are all ignored. The
7630	* EPT memory type is set to WB. The effective memory type is forced
7631	* WB.
7632	*
7633	* Otherwise, we trust guest. Guest CD/MTRR/PAT are all honored. The
7634	* EPT memory type is used to emulate guest CD/MTRR.
7635	*/
7636
7637	if (is_mmio)
7638	return MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT;
7639
7640	if (!kvm_arch_has_noncoherent_dma(kvm: vcpu->kvm))
7641	return (MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT) \| VMX_EPT_IPAT_BIT;
7642
7643	if (kvm_read_cr0_bits(vcpu, X86_CR0_CD)) {
7644	if (kvm_check_has_quirk(kvm: vcpu->kvm, KVM_X86_QUIRK_CD_NW_CLEARED))
7645	return MTRR_TYPE_WRBACK << VMX_EPT_MT_EPTE_SHIFT;
7646	else
7647	return (MTRR_TYPE_UNCACHABLE << VMX_EPT_MT_EPTE_SHIFT) \|
7648	VMX_EPT_IPAT_BIT;
7649	}
7650
7651	return kvm_mtrr_get_guest_memory_type(vcpu, gfn) << VMX_EPT_MT_EPTE_SHIFT;
7652	}
7653
7654	static void vmcs_set_secondary_exec_control(struct vcpu_vmx *vmx, u32 new_ctl)
7655	{
7656	/*
7657	* These bits in the secondary execution controls field
7658	* are dynamic, the others are mostly based on the hypervisor
7659	* architecture and the guest's CPUID. Do not touch the
7660	* dynamic bits.
7661	*/
7662	u32 mask =
7663	SECONDARY_EXEC_SHADOW_VMCS \|
7664	SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE \|
7665	SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES \|
7666	SECONDARY_EXEC_DESC;
7667
7668	u32 cur_ctl = secondary_exec_controls_get(vmx);
7669
7670	secondary_exec_controls_set(vmx, val: (new_ctl & ~mask) \| (cur_ctl & mask));
7671	}
7672
7673	/*
7674	* Generate MSR_IA32_VMX_CR{0,4}_FIXED1 according to CPUID. Only set bits
7675	* (indicating "allowed-1") if they are supported in the guest's CPUID.
7676	*/
7677	static void nested_vmx_cr_fixed1_bits_update(struct kvm_vcpu *vcpu)
7678	{
7679	struct vcpu_vmx *vmx = to_vmx(vcpu);
7680	struct kvm_cpuid_entry2 *entry;
7681
7682	vmx->nested.msrs.cr0_fixed1 = `0xffffffff`;
7683	vmx->nested.msrs.cr4_fixed1 = X86_CR4_PCE;
7684
7685	#define cr4_fixed1_update(_cr4_mask, _reg, _cpuid_mask) do { \
7686	if (entry && (entry->_reg & (_cpuid_mask))) \
7687	vmx->nested.msrs.cr4_fixed1 \|= (_cr4_mask); \
7688	} while (0)
7689
7690	entry = kvm_find_cpuid_entry(vcpu, function: `0x1`);
7691	cr4_fixed1_update(X86_CR4_VME, edx, feature_bit(VME));
7692	cr4_fixed1_update(X86_CR4_PVI, edx, feature_bit(VME));
7693	cr4_fixed1_update(X86_CR4_TSD, edx, feature_bit(TSC));
7694	cr4_fixed1_update(X86_CR4_DE, edx, feature_bit(DE));
7695	cr4_fixed1_update(X86_CR4_PSE, edx, feature_bit(PSE));
7696	cr4_fixed1_update(X86_CR4_PAE, edx, feature_bit(PAE));
7697	cr4_fixed1_update(X86_CR4_MCE, edx, feature_bit(MCE));
7698	cr4_fixed1_update(X86_CR4_PGE, edx, feature_bit(PGE));
7699	cr4_fixed1_update(X86_CR4_OSFXSR, edx, feature_bit(FXSR));
7700	cr4_fixed1_update(X86_CR4_OSXMMEXCPT, edx, feature_bit(XMM));
7701	cr4_fixed1_update(X86_CR4_VMXE, ecx, feature_bit(VMX));
7702	cr4_fixed1_update(X86_CR4_SMXE, ecx, feature_bit(SMX));
7703	cr4_fixed1_update(X86_CR4_PCIDE, ecx, feature_bit(PCID));
7704	cr4_fixed1_update(X86_CR4_OSXSAVE, ecx, feature_bit(XSAVE));
7705
7706	entry = kvm_find_cpuid_entry_index(vcpu, function: `0x7`, index: `0`);
7707	cr4_fixed1_update(X86_CR4_FSGSBASE, ebx, feature_bit(FSGSBASE));
7708	cr4_fixed1_update(X86_CR4_SMEP, ebx, feature_bit(SMEP));
7709	cr4_fixed1_update(X86_CR4_SMAP, ebx, feature_bit(SMAP));
7710	cr4_fixed1_update(X86_CR4_PKE, ecx, feature_bit(PKU));
7711	cr4_fixed1_update(X86_CR4_UMIP, ecx, feature_bit(UMIP));
7712	cr4_fixed1_update(X86_CR4_LA57, ecx, feature_bit(LA57));
7713
7714	entry = kvm_find_cpuid_entry_index(vcpu, function: `0x7`, index: `1`);
7715	cr4_fixed1_update(X86_CR4_LAM_SUP, eax, feature_bit(LAM));
7716
7717	#undef cr4_fixed1_update
7718	}
7719
7720	static void update_intel_pt_cfg(struct kvm_vcpu *vcpu)
7721	{
7722	struct vcpu_vmx *vmx = to_vmx(vcpu);
7723	struct kvm_cpuid_entry2 *best = NULL;
7724	int i;
7725
7726	for (i = `0`; i < PT_CPUID_LEAVES; i++) {
7727	best = kvm_find_cpuid_entry_index(vcpu, function: `0x14`, index: i);
7728	if (!best)
7729	return;
7730	vmx->pt_desc.caps[CPUID_EAX + i*PT_CPUID_REGS_NUM] = best->eax;
7731	vmx->pt_desc.caps[CPUID_EBX + i*PT_CPUID_REGS_NUM] = best->ebx;
7732	vmx->pt_desc.caps[CPUID_ECX + i*PT_CPUID_REGS_NUM] = best->ecx;
7733	vmx->pt_desc.caps[CPUID_EDX + i*PT_CPUID_REGS_NUM] = best->edx;
7734	}
7735
7736	/ Get the number of configurable Address Ranges for filtering /
7737	vmx->pt_desc.num_address_ranges = intel_pt_validate_cap(caps: vmx->pt_desc.caps,
7738	cap: PT_CAP_num_address_ranges);
7739
7740	/ Initialize and clear the no dependency bits /
7741	vmx->pt_desc.ctl_bitmask = ~(RTIT_CTL_TRACEEN \| RTIT_CTL_OS \|
7742	RTIT_CTL_USR \| RTIT_CTL_TSC_EN \| RTIT_CTL_DISRETC \|
7743	RTIT_CTL_BRANCH_EN);
7744
7745	/*
7746	* If CPUID.(EAX=14H,ECX=0):EBX[0]=1 CR3Filter can be set otherwise
7747	* will inject an #GP
7748	*/
7749	if (intel_pt_validate_cap(caps: vmx->pt_desc.caps, cap: PT_CAP_cr3_filtering))
7750	vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_CR3EN;
7751
7752	/*
7753	* If CPUID.(EAX=14H,ECX=0):EBX[1]=1 CYCEn, CycThresh and
7754	* PSBFreq can be set
7755	*/
7756	if (intel_pt_validate_cap(caps: vmx->pt_desc.caps, cap: PT_CAP_psb_cyc))
7757	vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_CYCLEACC \|
7758	RTIT_CTL_CYC_THRESH \| RTIT_CTL_PSB_FREQ);
7759
7760	/*
7761	* If CPUID.(EAX=14H,ECX=0):EBX[3]=1 MTCEn and MTCFreq can be set
7762	*/
7763	if (intel_pt_validate_cap(caps: vmx->pt_desc.caps, cap: PT_CAP_mtc))
7764	vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_MTC_EN \|
7765	RTIT_CTL_MTC_RANGE);
7766
7767	/ If CPUID.(EAX=14H,ECX=0):EBX[4]=1 FUPonPTW and PTWEn can be set /
7768	if (intel_pt_validate_cap(caps: vmx->pt_desc.caps, cap: PT_CAP_ptwrite))
7769	vmx->pt_desc.ctl_bitmask &= ~(RTIT_CTL_FUP_ON_PTW \|
7770	RTIT_CTL_PTW_EN);
7771
7772	/ If CPUID.(EAX=14H,ECX=0):EBX[5]=1 PwrEvEn can be set /
7773	if (intel_pt_validate_cap(caps: vmx->pt_desc.caps, cap: PT_CAP_power_event_trace))
7774	vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_PWR_EVT_EN;
7775
7776	/ If CPUID.(EAX=14H,ECX=0):ECX[0]=1 ToPA can be set /
7777	if (intel_pt_validate_cap(caps: vmx->pt_desc.caps, cap: PT_CAP_topa_output))
7778	vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_TOPA;
7779
7780	/ If CPUID.(EAX=14H,ECX=0):ECX[3]=1 FabricEn can be set /
7781	if (intel_pt_validate_cap(caps: vmx->pt_desc.caps, cap: PT_CAP_output_subsys))
7782	vmx->pt_desc.ctl_bitmask &= ~RTIT_CTL_FABRIC_EN;
7783
7784	/ unmask address range configure area /
7785	for (i = `0`; i < vmx->pt_desc.num_address_ranges; i++)
7786	vmx->pt_desc.ctl_bitmask &= ~(`0xfULL` << (`32` + i * `4`));
7787	}
7788
7789	static void vmx_vcpu_after_set_cpuid(struct kvm_vcpu *vcpu)
7790	{
7791	struct vcpu_vmx *vmx = to_vmx(vcpu);
7792
7793	/*
7794	* XSAVES is effectively enabled if and only if XSAVE is also exposed
7795	* to the guest. XSAVES depends on CR4.OSXSAVE, and CR4.OSXSAVE can be
7796	* set if and only if XSAVE is supported.
7797	*/
7798	if (boot_cpu_has(X86_FEATURE_XSAVE) &&
7799	guest_cpuid_has(vcpu, X86_FEATURE_XSAVE))
7800	kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_XSAVES);
7801
7802	kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_VMX);
7803	kvm_governed_feature_check_and_set(vcpu, X86_FEATURE_LAM);
7804
7805	vmx_setup_uret_msrs(vmx);
7806
7807	if (cpu_has_secondary_exec_ctrls())
7808	vmcs_set_secondary_exec_control(vmx,
7809	new_ctl: vmx_secondary_exec_control(vmx));
7810
7811	if (guest_can_use(vcpu, X86_FEATURE_VMX))
7812	vmx->msr_ia32_feature_control_valid_bits \|=
7813	FEAT_CTL_VMX_ENABLED_INSIDE_SMX \|
7814	FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX;
7815	else
7816	vmx->msr_ia32_feature_control_valid_bits &=
7817	~(FEAT_CTL_VMX_ENABLED_INSIDE_SMX \|
7818	FEAT_CTL_VMX_ENABLED_OUTSIDE_SMX);
7819
7820	if (guest_can_use(vcpu, X86_FEATURE_VMX))
7821	nested_vmx_cr_fixed1_bits_update(vcpu);
7822
7823	if (boot_cpu_has(X86_FEATURE_INTEL_PT) &&
7824	guest_cpuid_has(vcpu, X86_FEATURE_INTEL_PT))
7825	update_intel_pt_cfg(vcpu);
7826
7827	if (boot_cpu_has(X86_FEATURE_RTM)) {
7828	struct vmx_uret_msr *msr;
7829	msr = vmx_find_uret_msr(vmx, MSR_IA32_TSX_CTRL);
7830	if (msr) {
7831	bool enabled = guest_cpuid_has(vcpu, X86_FEATURE_RTM);
7832	vmx_set_guest_uret_msr(vmx, msr, data: enabled ? `0` : TSX_CTRL_RTM_DISABLE);
7833	}
7834	}
7835
7836	if (kvm_cpu_cap_has(X86_FEATURE_XFD))
7837	vmx_set_intercept_for_msr(vcpu, MSR_IA32_XFD_ERR, MSR_TYPE_R,
7838	value: !guest_cpuid_has(vcpu, X86_FEATURE_XFD));
7839
7840	if (boot_cpu_has(X86_FEATURE_IBPB))
7841	vmx_set_intercept_for_msr(vcpu, MSR_IA32_PRED_CMD, MSR_TYPE_W,
7842	value: !guest_has_pred_cmd_msr(vcpu));
7843
7844	if (boot_cpu_has(X86_FEATURE_FLUSH_L1D))
7845	vmx_set_intercept_for_msr(vcpu, MSR_IA32_FLUSH_CMD, MSR_TYPE_W,
7846	value: !guest_cpuid_has(vcpu, X86_FEATURE_FLUSH_L1D));
7847
7848	set_cr4_guest_host_mask(vmx);
7849
7850	vmx_write_encls_bitmap(vcpu, NULL);
7851	if (guest_cpuid_has(vcpu, X86_FEATURE_SGX))
7852	vmx->msr_ia32_feature_control_valid_bits \|= FEAT_CTL_SGX_ENABLED;
7853	else
7854	vmx->msr_ia32_feature_control_valid_bits &= ~FEAT_CTL_SGX_ENABLED;
7855
7856	if (guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
7857	vmx->msr_ia32_feature_control_valid_bits \|=
7858	FEAT_CTL_SGX_LC_ENABLED;
7859	else
7860	vmx->msr_ia32_feature_control_valid_bits &=
7861	~FEAT_CTL_SGX_LC_ENABLED;
7862
7863	/ Refresh #PF interception to account for MAXPHYADDR changes. /
7864	vmx_update_exception_bitmap(vcpu);
7865	}
7866
7867	static __init u64 vmx_get_perf_capabilities(void)
7868	{
7869	u64 perf_cap = PMU_CAP_FW_WRITES;
7870	u64 host_perf_cap = `0`;
7871
7872	if (!enable_pmu)
7873	return `0`;
7874
7875	if (boot_cpu_has(X86_FEATURE_PDCM))
7876	rdmsrl(MSR_IA32_PERF_CAPABILITIES, host_perf_cap);
7877
7878	if (!cpu_feature_enabled(X86_FEATURE_ARCH_LBR)) {
7879	x86_perf_get_lbr(lbr: &vmx_lbr_caps);
7880
7881	/*
7882	* KVM requires LBR callstack support, as the overhead due to
7883	* context switching LBRs without said support is too high.
7884	* See intel_pmu_create_guest_lbr_event() for more info.
7885	*/
7886	if (!vmx_lbr_caps.has_callstack)
7887	memset(&vmx_lbr_caps, `0`, sizeof(vmx_lbr_caps));
7888	else if (vmx_lbr_caps.nr)
7889	perf_cap \|= host_perf_cap & PMU_CAP_LBR_FMT;
7890	}
7891
7892	if (vmx_pebs_supported()) {
7893	perf_cap \|= host_perf_cap & PERF_CAP_PEBS_MASK;
7894
7895	/*
7896	* Disallow adaptive PEBS as it is functionally broken, can be
7897	* used by the guest to read host LBRs, and can be used to
7898	* bypass userspace event filters. To correctly and safely
7899	* support adaptive PEBS, KVM needs to:
7900	*
7901	* 1. Account for the ADAPTIVE flag when (re)programming fixed
7902	* counters.
7903	*
7904	* 2. Gain support from perf (or take direct control of counter
7905	* programming) to support events without adaptive PEBS
7906	* enabled for the hardware counter.
7907	*
7908	* 3. Ensure LBR MSRs cannot hold host data on VM-Entry with
7909	* adaptive PEBS enabled and MSR_PEBS_DATA_CFG.LBRS=1.
7910	*
7911	* 4. Document which PMU events are effectively exposed to the
7912	* guest via adaptive PEBS, and make adaptive PEBS mutually
7913	* exclusive with KVM_SET_PMU_EVENT_FILTER if necessary.
7914	*/
7915	perf_cap &= ~PERF_CAP_PEBS_BASELINE;
7916	}
7917
7918	return perf_cap;
7919	}
7920
7921	static __init void vmx_set_cpu_caps(void)
7922	{
7923	kvm_set_cpu_caps();
7924
7925	/ CPUID 0x1 /
7926	if (nested)
7927	kvm_cpu_cap_set(X86_FEATURE_VMX);
7928
7929	/ CPUID 0x7 /
7930	if (kvm_mpx_supported())
7931	kvm_cpu_cap_check_and_set(X86_FEATURE_MPX);
7932	if (!cpu_has_vmx_invpcid())
7933	kvm_cpu_cap_clear(X86_FEATURE_INVPCID);
7934	if (vmx_pt_mode_is_host_guest())
7935	kvm_cpu_cap_check_and_set(X86_FEATURE_INTEL_PT);
7936	if (vmx_pebs_supported()) {
7937	kvm_cpu_cap_check_and_set(X86_FEATURE_DS);
7938	kvm_cpu_cap_check_and_set(X86_FEATURE_DTES64);
7939	}
7940
7941	if (!enable_pmu)
7942	kvm_cpu_cap_clear(X86_FEATURE_PDCM);
7943	kvm_caps.supported_perf_cap = vmx_get_perf_capabilities();
7944
7945	if (!enable_sgx) {
7946	kvm_cpu_cap_clear(X86_FEATURE_SGX);
7947	kvm_cpu_cap_clear(X86_FEATURE_SGX_LC);
7948	kvm_cpu_cap_clear(X86_FEATURE_SGX1);
7949	kvm_cpu_cap_clear(X86_FEATURE_SGX2);
7950	}
7951
7952	if (vmx_umip_emulated())
7953	kvm_cpu_cap_set(X86_FEATURE_UMIP);
7954
7955	/ CPUID 0xD.1 /
7956	kvm_caps.supported_xss = `0`;
7957	if (!cpu_has_vmx_xsaves())
7958	kvm_cpu_cap_clear(X86_FEATURE_XSAVES);
7959
7960	/ CPUID 0x80000001 and 0x7 (RDPID) /
7961	if (!cpu_has_vmx_rdtscp()) {
7962	kvm_cpu_cap_clear(X86_FEATURE_RDTSCP);
7963	kvm_cpu_cap_clear(X86_FEATURE_RDPID);
7964	}
7965
7966	if (cpu_has_vmx_waitpkg())
7967	kvm_cpu_cap_check_and_set(X86_FEATURE_WAITPKG);
7968	}
7969
7970	static int vmx_check_intercept_io(struct kvm_vcpu *vcpu,
7971	struct x86_instruction_info *info)
7972	{
7973	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
7974	unsigned short port;
7975	bool intercept;
7976	int size;
7977
7978	if (info->intercept == x86_intercept_in \|\|
7979	info->intercept == x86_intercept_ins) {
7980	port = info->src_val;
7981	size = info->dst_bytes;
7982	} else {
7983	port = info->dst_val;
7984	size = info->src_bytes;
7985	}
7986
7987	/*
7988	* If the 'use IO bitmaps' VM-execution control is 0, IO instruction
7989	* VM-exits depend on the 'unconditional IO exiting' VM-execution
7990	* control.
7991	*
7992	* Otherwise, IO instruction VM-exits are controlled by the IO bitmaps.
7993	*/
7994	if (!nested_cpu_has(vmcs12, CPU_BASED_USE_IO_BITMAPS))
7995	intercept = nested_cpu_has(vmcs12,
7996	CPU_BASED_UNCOND_IO_EXITING);
7997	else
7998	intercept = nested_vmx_check_io_bitmaps(vcpu, port, size);
7999
8000	/ FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. /
8001	return intercept ? X86EMUL_UNHANDLEABLE : X86EMUL_CONTINUE;
8002	}
8003
8004	static int vmx_check_intercept(struct kvm_vcpu *vcpu,
8005	struct x86_instruction_info *info,
8006	enum x86_intercept_stage stage,
8007	struct x86_exception *exception)
8008	{
8009	struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
8010
8011	switch (info->intercept) {
8012	/*
8013	* RDPID causes #UD if disabled through secondary execution controls.
8014	* Because it is marked as EmulateOnUD, we need to intercept it here.
8015	* Note, RDPID is hidden behind ENABLE_RDTSCP.
8016	*/
8017	case x86_intercept_rdpid:
8018	if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_ENABLE_RDTSCP)) {
8019	exception->vector = UD_VECTOR;
8020	exception->error_code_valid = false;
8021	return X86EMUL_PROPAGATE_FAULT;
8022	}
8023	break;
8024
8025	case x86_intercept_in:
8026	case x86_intercept_ins:
8027	case x86_intercept_out:
8028	case x86_intercept_outs:
8029	return vmx_check_intercept_io(vcpu, info);
8030
8031	case x86_intercept_lgdt:
8032	case x86_intercept_lidt:
8033	case x86_intercept_lldt:
8034	case x86_intercept_ltr:
8035	case x86_intercept_sgdt:
8036	case x86_intercept_sidt:
8037	case x86_intercept_sldt:
8038	case x86_intercept_str:
8039	if (!nested_cpu_has2(vmcs12, SECONDARY_EXEC_DESC))
8040	return X86EMUL_CONTINUE;
8041
8042	/ FIXME: produce nested vmexit and return X86EMUL_INTERCEPTED. /
8043	break;
8044
8045	case x86_intercept_pause:
8046	/*
8047	* PAUSE is a single-byte NOP with a REPE prefix, i.e. collides
8048	* with vanilla NOPs in the emulator. Apply the interception
8049	* check only to actual PAUSE instructions. Don't check
8050	* PAUSE-loop-exiting, software can't expect a given PAUSE to
8051	* exit, i.e. KVM is within its rights to allow L2 to execute
8052	* the PAUSE.
8053	*/
8054	if ((info->rep_prefix != REPE_PREFIX) \|\|
8055	!nested_cpu_has2(vmcs12, CPU_BASED_PAUSE_EXITING))
8056	return X86EMUL_CONTINUE;
8057
8058	break;
8059
8060	/ TODO: check more intercepts... /
8061	default:
8062	break;
8063	}
8064
8065	return X86EMUL_UNHANDLEABLE;
8066	}
8067
8068	#ifdef CONFIG_X86_64
8069	/ (a << shift) / divisor, return 1 if overflow otherwise 0 /
8070	static inline int u64_shl_div_u64(u64 a, unsigned int shift,
8071	u64 divisor, u64 *result)
8072	{
8073	u64 low = a << shift, high = a >> (`64` - shift);
8074
8075	/ To avoid the overflow on divq /
8076	if (high >= divisor)
8077	return `1`;
8078
8079	/ Low hold the result, high hold rem which is discarded /
8080	asm("divq %2\n\t" : "=a" (low), "=d" (high) :
8081	"rm" (divisor), "0" (low), "1" (high));
8082	*result = low;
8083
8084	return `0`;
8085	}
8086
8087	static int vmx_set_hv_timer(struct kvm_vcpu *vcpu, u64 guest_deadline_tsc,
8088	bool *expired)
8089	{
8090	struct vcpu_vmx *vmx;
8091	u64 tscl, guest_tscl, delta_tsc, lapic_timer_advance_cycles;
8092	struct kvm_timer *ktimer = &vcpu->arch.apic->lapic_timer;
8093
8094	vmx = to_vmx(vcpu);
8095	tscl = rdtsc();
8096	guest_tscl = kvm_read_l1_tsc(vcpu, host_tsc: tscl);
8097	delta_tsc = max(guest_deadline_tsc, guest_tscl) - guest_tscl;
8098	lapic_timer_advance_cycles = nsec_to_cycles(vcpu,
8099	nsec: ktimer->timer_advance_ns);
8100
8101	if (delta_tsc > lapic_timer_advance_cycles)
8102	delta_tsc -= lapic_timer_advance_cycles;
8103	else
8104	delta_tsc = `0`;
8105
8106	/ Convert to host delta tsc if tsc scaling is enabled /
8107	if (vcpu->arch.l1_tsc_scaling_ratio != kvm_caps.default_tsc_scaling_ratio &&
8108	delta_tsc && u64_shl_div_u64(a: delta_tsc,
8109	shift: kvm_caps.tsc_scaling_ratio_frac_bits,
8110	divisor: vcpu->arch.l1_tsc_scaling_ratio, result: &delta_tsc))
8111	return -ERANGE;
8112
8113	/*
8114	* If the delta tsc can't fit in the 32 bit after the multi shift,
8115	* we can't use the preemption timer.
8116	* It's possible that it fits on later vmentries, but checking
8117	* on every vmentry is costly so we just use an hrtimer.
8118	*/
8119	if (delta_tsc >> (cpu_preemption_timer_multi + `32`))
8120	return -ERANGE;
8121
8122	vmx->hv_deadline_tsc = tscl + delta_tsc;
8123	*expired = !delta_tsc;
8124	return `0`;
8125	}
8126
8127	static void vmx_cancel_hv_timer(struct kvm_vcpu *vcpu)
8128	{
8129	to_vmx(vcpu)->hv_deadline_tsc = -`1`;
8130	}
8131	#endif
8132
8133	static void vmx_sched_in(struct kvm_vcpu vcpu, int* cpu)
8134	{
8135	if (!kvm_pause_in_guest(kvm: vcpu->kvm))
8136	shrink_ple_window(vcpu);
8137	}
8138
8139	void vmx_update_cpu_dirty_logging(struct kvm_vcpu *vcpu)
8140	{
8141	struct vcpu_vmx *vmx = to_vmx(vcpu);
8142
8143	if (WARN_ON_ONCE(!enable_pml))
8144	return;
8145
8146	if (is_guest_mode(vcpu)) {
8147	vmx->nested.update_vmcs01_cpu_dirty_logging = true;
8148	return;
8149	}
8150
8151	/*
8152	* Note, nr_memslots_dirty_logging can be changed concurrent with this
8153	* code, but in that case another update request will be made and so
8154	* the guest will never run with a stale PML value.
8155	*/
8156	if (atomic_read(v: &vcpu->kvm->nr_memslots_dirty_logging))
8157	secondary_exec_controls_setbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8158	else
8159	secondary_exec_controls_clearbit(vmx, SECONDARY_EXEC_ENABLE_PML);
8160	}
8161
8162	static void vmx_setup_mce(struct kvm_vcpu *vcpu)
8163	{
8164	if (vcpu->arch.mcg_cap & MCG_LMCE_P)
8165	to_vmx(vcpu)->msr_ia32_feature_control_valid_bits \|=
8166	FEAT_CTL_LMCE_ENABLED;
8167	else
8168	to_vmx(vcpu)->msr_ia32_feature_control_valid_bits &=
8169	~FEAT_CTL_LMCE_ENABLED;
8170	}
8171
8172	#ifdef CONFIG_KVM_SMM
8173	static int vmx_smi_allowed(struct kvm_vcpu *vcpu, bool for_injection)
8174	{
8175	/ we need a nested vmexit to enter SMM, postpone if run is pending /
8176	if (to_vmx(vcpu)->nested.nested_run_pending)
8177	return -EBUSY;
8178	return !is_smm(vcpu);
8179	}
8180
8181	static int vmx_enter_smm(struct kvm_vcpu vcpu, union* kvm_smram *smram)
8182	{
8183	struct vcpu_vmx *vmx = to_vmx(vcpu);
8184
8185	/*
8186	* TODO: Implement custom flows for forcing the vCPU out/in of L2 on
8187	* SMI and RSM. Using the common VM-Exit + VM-Enter routines is wrong
8188	* SMI and RSM only modify state that is saved and restored via SMRAM.
8189	* E.g. most MSRs are left untouched, but many are modified by VM-Exit
8190	* and VM-Enter, and thus L2's values may be corrupted on SMI+RSM.
8191	*/
8192	vmx->nested.smm.guest_mode = is_guest_mode(vcpu);
8193	if (vmx->nested.smm.guest_mode)
8194	nested_vmx_vmexit(vcpu, vm_exit_reason: -`1`, exit_intr_info: `0`, exit_qualification: `0`);
8195
8196	vmx->nested.smm.vmxon = vmx->nested.vmxon;
8197	vmx->nested.vmxon = false;
8198	vmx_clear_hlt(vcpu);
8199	return `0`;
8200	}
8201
8202	static int vmx_leave_smm(struct kvm_vcpu vcpu, const* union kvm_smram *smram)
8203	{
8204	struct vcpu_vmx *vmx = to_vmx(vcpu);
8205	int ret;
8206
8207	if (vmx->nested.smm.vmxon) {
8208	vmx->nested.vmxon = true;
8209	vmx->nested.smm.vmxon = false;
8210	}
8211
8212	if (vmx->nested.smm.guest_mode) {
8213	ret = nested_vmx_enter_non_root_mode(vcpu, from_vmentry: false);
8214	if (ret)
8215	return ret;
8216
8217	vmx->nested.nested_run_pending = `1`;
8218	vmx->nested.smm.guest_mode = false;
8219	}
8220	return `0`;
8221	}
8222
/* Intentionally empty .enable_smi_window callback; see the note below. */
static void vmx_enable_smi_window(struct kvm_vcpu *vcpu)
{
	/* RSM will cause a vmexit anyway.  */
}
8227	#endif
8228
8229	static bool vmx_apic_init_signal_blocked(struct kvm_vcpu *vcpu)
8230	{
8231	return to_vmx(vcpu)->nested.vmxon && !is_guest_mode(vcpu);
8232	}
8233
8234	static void vmx_migrate_timers(struct kvm_vcpu *vcpu)
8235	{
8236	if (is_guest_mode(vcpu)) {
8237	struct hrtimer *timer = &to_vmx(vcpu)->nested.preemption_timer;
8238
8239	if (hrtimer_try_to_cancel(timer) == `1`)
8240	hrtimer_start_expires(timer, mode: HRTIMER_MODE_ABS_PINNED);
8241	}
8242	}
8243
8244	static void vmx_hardware_unsetup(void)
8245	{
8246	kvm_set_posted_intr_wakeup_handler(NULL);
8247
8248	if (nested)
8249	nested_vmx_hardware_unsetup();
8250
8251	free_kvm_area();
8252	}
8253
/* Bitmask of APICv inhibit reasons, exported via .required_apicv_inhibits. */
#define VMX_REQUIRED_APICV_INHIBITS				\
(								\
	BIT(APICV_INHIBIT_REASON_DISABLE) |			\
	BIT(APICV_INHIBIT_REASON_ABSENT) |			\
	BIT(APICV_INHIBIT_REASON_HYPERV) |			\
	BIT(APICV_INHIBIT_REASON_BLOCKIRQ) |			\
	BIT(APICV_INHIBIT_REASON_PHYSICAL_ID_ALIASED) |		\
	BIT(APICV_INHIBIT_REASON_APIC_ID_MODIFIED) |		\
	BIT(APICV_INHIBIT_REASON_APIC_BASE_MODIFIED)		\
)
8264
8265	static void vmx_vm_destroy(struct kvm *kvm)
8266	{
8267	struct kvm_vmx *kvm_vmx = to_kvm_vmx(kvm);
8268
8269	free_pages(addr: (unsigned long)kvm_vmx->pid_table, order: vmx_get_pid_table_order(kvm));
8270	}
8271
8272	/*
8273	* Note, the SDM states that the linear address is masked after the modified
8274	* canonicality check, whereas KVM masks (untags) the address and then performs
8275	* a "normal" canonicality check. Functionally, the two methods are identical,
8276	* and when the masking occurs relative to the canonicality check isn't visible
8277	* to software, i.e. KVM's behavior doesn't violate the SDM.
8278	*/
8279	gva_t vmx_get_untagged_addr(struct kvm_vcpu vcpu, gva_t gva, unsigned* int flags)
8280	{
8281	int lam_bit;
8282	unsigned long cr3_bits;
8283
8284	if (flags & (X86EMUL_F_FETCH \| X86EMUL_F_IMPLICIT \| X86EMUL_F_INVLPG))
8285	return gva;
8286
8287	if (!is_64_bit_mode(vcpu))
8288	return gva;
8289
8290	/*
8291	* Bit 63 determines if the address should be treated as user address
8292	* or a supervisor address.
8293	*/
8294	if (!(gva & BIT_ULL(`63`))) {
8295	cr3_bits = kvm_get_active_cr3_lam_bits(vcpu);
8296	if (!(cr3_bits & (X86_CR3_LAM_U57 \| X86_CR3_LAM_U48)))
8297	return gva;
8298
8299	/ LAM_U48 is ignored if LAM_U57 is set. /
8300	lam_bit = cr3_bits & X86_CR3_LAM_U57 ? `56` : `47`;
8301	} else {
8302	if (!kvm_is_cr4_bit_set(vcpu, X86_CR4_LAM_SUP))
8303	return gva;
8304
8305	lam_bit = kvm_is_cr4_bit_set(vcpu, X86_CR4_LA57) ? `56` : `47`;
8306	}
8307
8308	/*
8309	* Untag the address by sign-extending the lam_bit, but NOT to bit 63.
8310	* Bit 63 is retained from the raw virtual address so that untagging
8311	* doesn't change a user access to a supervisor access, and vice versa.
8312	*/
8313	return (sign_extend64(value: gva, index: lam_bit) & ~BIT_ULL(`63`)) \| (gva & BIT_ULL(`63`));
8314	}
8315
8316	static struct kvm_x86_ops vmx_x86_ops __initdata = {
8317	.name = KBUILD_MODNAME,
8318
8319	.check_processor_compatibility = vmx_check_processor_compat,
8320
8321	.hardware_unsetup = vmx_hardware_unsetup,
8322
8323	.hardware_enable = vmx_hardware_enable,
8324	.hardware_disable = vmx_hardware_disable,
8325	.has_emulated_msr = vmx_has_emulated_msr,
8326
8327	.vm_size = sizeof(struct kvm_vmx),
8328	.vm_init = vmx_vm_init,
8329	.vm_destroy = vmx_vm_destroy,
8330
8331	.vcpu_precreate = vmx_vcpu_precreate,
8332	.vcpu_create = vmx_vcpu_create,
8333	.vcpu_free = vmx_vcpu_free,
8334	.vcpu_reset = vmx_vcpu_reset,
8335
8336	.prepare_switch_to_guest = vmx_prepare_switch_to_guest,
8337	.vcpu_load = vmx_vcpu_load,
8338	.vcpu_put = vmx_vcpu_put,
8339
8340	.update_exception_bitmap = vmx_update_exception_bitmap,
8341	.get_msr_feature = vmx_get_msr_feature,
8342	.get_msr = vmx_get_msr,
8343	.set_msr = vmx_set_msr,
8344	.get_segment_base = vmx_get_segment_base,
8345	.get_segment = vmx_get_segment,
8346	.set_segment = vmx_set_segment,
8347	.get_cpl = vmx_get_cpl,
8348	.get_cs_db_l_bits = vmx_get_cs_db_l_bits,
8349	.is_valid_cr0 = vmx_is_valid_cr0,
8350	.set_cr0 = vmx_set_cr0,
8351	.is_valid_cr4 = vmx_is_valid_cr4,
8352	.set_cr4 = vmx_set_cr4,
8353	.set_efer = vmx_set_efer,
8354	.get_idt = vmx_get_idt,
8355	.set_idt = vmx_set_idt,
8356	.get_gdt = vmx_get_gdt,
8357	.set_gdt = vmx_set_gdt,
8358	.set_dr7 = vmx_set_dr7,
8359	.sync_dirty_debug_regs = vmx_sync_dirty_debug_regs,
8360	.cache_reg = vmx_cache_reg,
8361	.get_rflags = vmx_get_rflags,
8362	.set_rflags = vmx_set_rflags,
8363	.get_if_flag = vmx_get_if_flag,
8364
8365	.flush_tlb_all = vmx_flush_tlb_all,
8366	.flush_tlb_current = vmx_flush_tlb_current,
8367	.flush_tlb_gva = vmx_flush_tlb_gva,
8368	.flush_tlb_guest = vmx_flush_tlb_guest,
8369
8370	.vcpu_pre_run = vmx_vcpu_pre_run,
8371	.vcpu_run = vmx_vcpu_run,
8372	.handle_exit = vmx_handle_exit,
8373	.skip_emulated_instruction = vmx_skip_emulated_instruction,
8374	.update_emulated_instruction = vmx_update_emulated_instruction,
8375	.set_interrupt_shadow = vmx_set_interrupt_shadow,
8376	.get_interrupt_shadow = vmx_get_interrupt_shadow,
8377	.patch_hypercall = vmx_patch_hypercall,
8378	.inject_irq = vmx_inject_irq,
8379	.inject_nmi = vmx_inject_nmi,
8380	.inject_exception = vmx_inject_exception,
8381	.cancel_injection = vmx_cancel_injection,
8382	.interrupt_allowed = vmx_interrupt_allowed,
8383	.nmi_allowed = vmx_nmi_allowed,
8384	.get_nmi_mask = vmx_get_nmi_mask,
8385	.set_nmi_mask = vmx_set_nmi_mask,
8386	.enable_nmi_window = vmx_enable_nmi_window,
8387	.enable_irq_window = vmx_enable_irq_window,
8388	.update_cr8_intercept = vmx_update_cr8_intercept,
8389	.set_virtual_apic_mode = vmx_set_virtual_apic_mode,
8390	.set_apic_access_page_addr = vmx_set_apic_access_page_addr,
8391	.refresh_apicv_exec_ctrl = vmx_refresh_apicv_exec_ctrl,
8392	.load_eoi_exitmap = vmx_load_eoi_exitmap,
8393	.apicv_pre_state_restore = vmx_apicv_pre_state_restore,
8394	.required_apicv_inhibits = VMX_REQUIRED_APICV_INHIBITS,
8395	.hwapic_irr_update = vmx_hwapic_irr_update,
8396	.hwapic_isr_update = vmx_hwapic_isr_update,
8397	.guest_apic_has_interrupt = vmx_guest_apic_has_interrupt,
8398	.sync_pir_to_irr = vmx_sync_pir_to_irr,
8399	.deliver_interrupt = vmx_deliver_interrupt,
8400	.dy_apicv_has_pending_interrupt = pi_has_pending_interrupt,
8401
8402	.set_tss_addr = vmx_set_tss_addr,
8403	.set_identity_map_addr = vmx_set_identity_map_addr,
8404	.get_mt_mask = vmx_get_mt_mask,
8405
8406	.get_exit_info = vmx_get_exit_info,
8407
8408	.vcpu_after_set_cpuid = vmx_vcpu_after_set_cpuid,
8409
8410	.has_wbinvd_exit = cpu_has_vmx_wbinvd_exit,
8411
8412	.get_l2_tsc_offset = vmx_get_l2_tsc_offset,
8413	.get_l2_tsc_multiplier = vmx_get_l2_tsc_multiplier,
8414	.write_tsc_offset = vmx_write_tsc_offset,
8415	.write_tsc_multiplier = vmx_write_tsc_multiplier,
8416
8417	.load_mmu_pgd = vmx_load_mmu_pgd,
8418
8419	.check_intercept = vmx_check_intercept,
8420	.handle_exit_irqoff = vmx_handle_exit_irqoff,
8421
8422	.sched_in = vmx_sched_in,
8423
8424	.cpu_dirty_log_size = PML_ENTITY_NUM,
8425	.update_cpu_dirty_logging = vmx_update_cpu_dirty_logging,
8426
8427	.nested_ops = &vmx_nested_ops,
8428
8429	.pi_update_irte = vmx_pi_update_irte,
8430	.pi_start_assignment = vmx_pi_start_assignment,
8431
8432	#ifdef CONFIG_X86_64
8433	.set_hv_timer = vmx_set_hv_timer,
8434	.cancel_hv_timer = vmx_cancel_hv_timer,
8435	#endif
8436
8437	.setup_mce = vmx_setup_mce,
8438
8439	#ifdef CONFIG_KVM_SMM
8440	.smi_allowed = vmx_smi_allowed,
8441	.enter_smm = vmx_enter_smm,
8442	.leave_smm = vmx_leave_smm,
8443	.enable_smi_window = vmx_enable_smi_window,
8444	#endif
8445
8446	.check_emulate_instruction = vmx_check_emulate_instruction,
8447	.apic_init_signal_blocked = vmx_apic_init_signal_blocked,
8448	.migrate_timers = vmx_migrate_timers,
8449
8450	.msr_filter_changed = vmx_msr_filter_changed,
8451	.complete_emulated_msr = kvm_complete_insn_gp,
8452
8453	.vcpu_deliver_sipi_vector = kvm_vcpu_deliver_sipi_vector,
8454
8455	.get_untagged_addr = vmx_get_untagged_addr,
8456	};
8457
8458	static unsigned int vmx_handle_intel_pt_intr(void)
8459	{
8460	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
8461
8462	/ '0' on failure so that the !PT case can use a RET0 static call. /
8463	if (!vcpu \|\| !kvm_handling_nmi_from_guest(vcpu))
8464	return `0`;
8465
8466	kvm_make_request(KVM_REQ_PMI, vcpu);
8467	__set_bit(MSR_CORE_PERF_GLOBAL_OVF_CTRL_TRACE_TOPA_PMI_BIT,
8468	(unsigned long *)&vcpu->arch.pmu.global_status);
8469	return `1`;
8470	}
8471
8472	static __init void vmx_setup_user_return_msrs(void)
8473	{
8474
8475	/*
8476	* Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
8477	* will emulate SYSCALL in legacy mode if the vendor string in guest
8478	* CPUID.0:{EBX,ECX,EDX} is "AuthenticAMD" or "AMDisbetter!" To
8479	* support this emulation, MSR_STAR is included in the list for i386,
8480	* but is never loaded into hardware. MSR_CSTAR is also never loaded
8481	* into hardware and is here purely for emulation purposes.
8482	*/
8483	const u32 vmx_uret_msrs_list[] = {
8484	#ifdef CONFIG_X86_64
8485	MSR_SYSCALL_MASK, MSR_LSTAR, MSR_CSTAR,
8486	#endif
8487	MSR_EFER, MSR_TSC_AUX, MSR_STAR,
8488	MSR_IA32_TSX_CTRL,
8489	};
8490	int i;
8491
8492	BUILD_BUG_ON(ARRAY_SIZE(vmx_uret_msrs_list) != MAX_NR_USER_RETURN_MSRS);
8493
8494	for (i = `0`; i < ARRAY_SIZE(vmx_uret_msrs_list); ++i)
8495	kvm_add_user_return_msr(msr: vmx_uret_msrs_list[i]);
8496	}
8497
8498	static void __init vmx_setup_me_spte_mask(void)
8499	{
8500	u64 me_mask = `0`;
8501
8502	/*
8503	* kvm_get_shadow_phys_bits() returns shadow_phys_bits. Use
8504	* the former to avoid exposing shadow_phys_bits.
8505	*
8506	* On pre-MKTME system, boot_cpu_data.x86_phys_bits equals to
8507	* shadow_phys_bits. On MKTME and/or TDX capable systems,
8508	* boot_cpu_data.x86_phys_bits holds the actual physical address
8509	* w/o the KeyID bits, and shadow_phys_bits equals to MAXPHYADDR
8510	* reported by CPUID. Those bits between are KeyID bits.
8511	*/
8512	if (boot_cpu_data.x86_phys_bits != kvm_get_shadow_phys_bits())
8513	me_mask = rsvd_bits(s: boot_cpu_data.x86_phys_bits,
8514	e: kvm_get_shadow_phys_bits() - `1`);
8515	/*
8516	* Unlike SME, host kernel doesn't support setting up any
8517	* MKTME KeyID on Intel platforms. No memory encryption
8518	* bits should be included into the SPTE.
8519	*/
8520	kvm_mmu_set_me_spte_mask(me_value: `0`, me_mask);
8521	}
8522
8523	static struct kvm_x86_init_ops vmx_init_ops __initdata;
8524
8525	static __init int hardware_setup(void)
8526	{
8527	unsigned long host_bndcfgs;
8528	struct desc_ptr dt;
8529	int r;
8530
8531	store_idt(dtr: &dt);
8532	host_idt_base = dt.address;
8533
8534	vmx_setup_user_return_msrs();
8535
8536	if (setup_vmcs_config(vmcs_conf: &vmcs_config, vmx_cap: &vmx_capability) < `0`)
8537	return -EIO;
8538
8539	if (cpu_has_perf_global_ctrl_bug())
8540	pr_warn_once("VM_EXIT_LOAD_IA32_PERF_GLOBAL_CTRL "
8541	"does not work properly. Using workaround\n");
8542
8543	if (boot_cpu_has(X86_FEATURE_NX))
8544	kvm_enable_efer_bits(EFER_NX);
8545
8546	if (boot_cpu_has(X86_FEATURE_MPX)) {
8547	rdmsrl(MSR_IA32_BNDCFGS, host_bndcfgs);
8548	WARN_ONCE(host_bndcfgs, "BNDCFGS in host will be lost");
8549	}
8550
8551	if (!cpu_has_vmx_mpx())
8552	kvm_caps.supported_xcr0 &= ~(XFEATURE_MASK_BNDREGS \|
8553	XFEATURE_MASK_BNDCSR);
8554
8555	if (!cpu_has_vmx_vpid() \|\| !cpu_has_vmx_invvpid() \|\|
8556	!(cpu_has_vmx_invvpid_single() \|\| cpu_has_vmx_invvpid_global()))
8557	enable_vpid = `0`;
8558
8559	if (!cpu_has_vmx_ept() \|\|
8560	!cpu_has_vmx_ept_4levels() \|\|
8561	!cpu_has_vmx_ept_mt_wb() \|\|
8562	!cpu_has_vmx_invept_global())
8563	enable_ept = `0`;
8564
8565	/ NX support is required for shadow paging. /
8566	if (!enable_ept && !boot_cpu_has(X86_FEATURE_NX)) {
8567	pr_err_ratelimited("NX (Execute Disable) not supported\n");
8568	return -EOPNOTSUPP;
8569	}
8570
8571	if (!cpu_has_vmx_ept_ad_bits() \|\| !enable_ept)
8572	enable_ept_ad_bits = `0`;
8573
8574	if (!cpu_has_vmx_unrestricted_guest() \|\| !enable_ept)
8575	enable_unrestricted_guest = `0`;
8576
8577	if (!cpu_has_vmx_flexpriority())
8578	flexpriority_enabled = `0`;
8579
8580	if (!cpu_has_virtual_nmis())
8581	enable_vnmi = `0`;
8582
8583	#ifdef CONFIG_X86_SGX_KVM
8584	if (!cpu_has_vmx_encls_vmexit())
8585	enable_sgx = false;
8586	#endif
8587
8588	/*
8589	* set_apic_access_page_addr() is used to reload apic access
8590	* page upon invalidation. No need to do anything if not
8591	* using the APIC_ACCESS_ADDR VMCS field.
8592	*/
8593	if (!flexpriority_enabled)
8594	vmx_x86_ops.set_apic_access_page_addr = NULL;
8595
8596	if (!cpu_has_vmx_tpr_shadow())
8597	vmx_x86_ops.update_cr8_intercept = NULL;
8598
8599	#if IS_ENABLED(CONFIG_HYPERV)
8600	if (ms_hyperv.nested_features & HV_X64_NESTED_GUEST_MAPPING_FLUSH
8601	&& enable_ept) {
8602	vmx_x86_ops.flush_remote_tlbs = hv_flush_remote_tlbs;
8603	vmx_x86_ops.flush_remote_tlbs_range = hv_flush_remote_tlbs_range;
8604	}
8605	#endif
8606
8607	if (!cpu_has_vmx_ple()) {
8608	ple_gap = `0`;
8609	ple_window = `0`;
8610	ple_window_grow = `0`;
8611	ple_window_max = `0`;
8612	ple_window_shrink = `0`;
8613	}
8614
8615	if (!cpu_has_vmx_apicv())
8616	enable_apicv = `0`;
8617	if (!enable_apicv)
8618	vmx_x86_ops.sync_pir_to_irr = NULL;
8619
8620	if (!enable_apicv \|\| !cpu_has_vmx_ipiv())
8621	enable_ipiv = false;
8622
8623	if (cpu_has_vmx_tsc_scaling())
8624	kvm_caps.has_tsc_control = true;
8625
8626	kvm_caps.max_tsc_scaling_ratio = KVM_VMX_TSC_MULTIPLIER_MAX;
8627	kvm_caps.tsc_scaling_ratio_frac_bits = `48`;
8628	kvm_caps.has_bus_lock_exit = cpu_has_vmx_bus_lock_detection();
8629	kvm_caps.has_notify_vmexit = cpu_has_notify_vmexit();
8630
8631	set_bit(nr: `0`, addr: vmx_vpid_bitmap); / 0 is reserved for host /
8632
8633	if (enable_ept)
8634	kvm_mmu_set_ept_masks(has_ad_bits: enable_ept_ad_bits,
8635	has_exec_only: cpu_has_vmx_ept_execute_only());
8636
8637	/*
8638	* Setup shadow_me_value/shadow_me_mask to include MKTME KeyID
8639	* bits to shadow_zero_check.
8640	*/
8641	vmx_setup_me_spte_mask();
8642
8643	kvm_configure_mmu(enable_tdp: enable_ept, tdp_forced_root_level: `0`, tdp_max_root_level: vmx_get_max_ept_level(),
8644	tdp_huge_page_level: ept_caps_to_lpage_level(ept_caps: vmx_capability.ept));
8645
8646	/*
8647	* Only enable PML when hardware supports PML feature, and both EPT
8648	* and EPT A/D bit features are enabled -- PML depends on them to work.
8649	*/
8650	if (!enable_ept \|\| !enable_ept_ad_bits \|\| !cpu_has_vmx_pml())
8651	enable_pml = `0`;
8652
8653	if (!enable_pml)
8654	vmx_x86_ops.cpu_dirty_log_size = `0`;
8655
8656	if (!cpu_has_vmx_preemption_timer())
8657	enable_preemption_timer = false;
8658
8659	if (enable_preemption_timer) {
8660	u64 use_timer_freq = `5000ULL` * `1000` * `1000`;
8661
8662	cpu_preemption_timer_multi =
8663	vmcs_config.misc & VMX_MISC_PREEMPTION_TIMER_RATE_MASK;
8664
8665	if (tsc_khz)
8666	use_timer_freq = (u64)tsc_khz * `1000`;
8667	use_timer_freq >>= cpu_preemption_timer_multi;
8668
8669	/*
8670	* KVM "disables" the preemption timer by setting it to its max
8671	* value. Don't use the timer if it might cause spurious exits
8672	* at a rate faster than 0.1 Hz (of uninterrupted guest time).
8673	*/
8674	if (use_timer_freq > `0xffffffffu` / `10`)
8675	enable_preemption_timer = false;
8676	}
8677
8678	if (!enable_preemption_timer) {
8679	vmx_x86_ops.set_hv_timer = NULL;
8680	vmx_x86_ops.cancel_hv_timer = NULL;
8681	}
8682
8683	kvm_caps.supported_mce_cap \|= MCG_LMCE_P;
8684	kvm_caps.supported_mce_cap \|= MCG_CMCI_P;
8685
8686	if (pt_mode != PT_MODE_SYSTEM && pt_mode != PT_MODE_HOST_GUEST)
8687	return -EINVAL;
8688	if (!enable_ept \|\| !enable_pmu \|\| !cpu_has_vmx_intel_pt())
8689	pt_mode = PT_MODE_SYSTEM;
8690	if (pt_mode == PT_MODE_HOST_GUEST)
8691	vmx_init_ops.handle_intel_pt_intr = vmx_handle_intel_pt_intr;
8692	else
8693	vmx_init_ops.handle_intel_pt_intr = NULL;
8694
8695	setup_default_sgx_lepubkeyhash();
8696
8697	if (nested) {
8698	nested_vmx_setup_ctls_msrs(vmcs_conf: &vmcs_config, ept_caps: vmx_capability.ept);
8699
8700	r = nested_vmx_hardware_setup(exit_handlers: kvm_vmx_exit_handlers);
8701	if (r)
8702	return r;
8703	}
8704
8705	vmx_set_cpu_caps();
8706
8707	r = alloc_kvm_area();
8708	if (r && nested)
8709	nested_vmx_hardware_unsetup();
8710
8711	kvm_set_posted_intr_wakeup_handler(handler: pi_wakeup_handler);
8712
8713	return r;
8714	}
8715
8716	static struct kvm_x86_init_ops vmx_init_ops __initdata = {
8717	.hardware_setup = hardware_setup,
8718	.handle_intel_pt_intr = NULL,
8719
8720	.runtime_ops = &vmx_x86_ops,
8721	.pmu_ops = &intel_pmu_ops,
8722	};
8723
8724	static void vmx_cleanup_l1d_flush(void)
8725	{
8726	if (vmx_l1d_flush_pages) {
8727	free_pages(addr: (unsigned long)vmx_l1d_flush_pages, L1D_CACHE_ORDER);
8728	vmx_l1d_flush_pages = NULL;
8729	}
8730	/ Restore state so sysfs ignores VMX /
8731	l1tf_vmx_mitigation = VMENTER_L1D_FLUSH_AUTO;
8732	}
8733
8734	static void __vmx_exit(void)
8735	{
8736	allow_smaller_maxphyaddr = false;
8737
8738	cpu_emergency_unregister_virt_callback(callback: vmx_emergency_disable);
8739
8740	vmx_cleanup_l1d_flush();
8741	}
8742
/*
 * Module exit: tear down in the reverse order of vmx_init() -- common KVM
 * first, then the x86 vendor layer, then the VMX-local state.
 */
static void vmx_exit(void)
{
	kvm_exit();
	kvm_x86_vendor_exit();

	__vmx_exit();
}
module_exit(vmx_exit);
8751
8752	static int __init vmx_init(void)
8753	{
8754	int r, cpu;
8755
8756	if (!kvm_is_vmx_supported())
8757	return -EOPNOTSUPP;
8758
8759	/*
8760	* Note, hv_init_evmcs() touches only VMX knobs, i.e. there's nothing
8761	* to unwind if a later step fails.
8762	*/
8763	hv_init_evmcs();
8764
8765	r = kvm_x86_vendor_init(ops: &vmx_init_ops);
8766	if (r)
8767	return r;
8768
8769	/*
8770	* Must be called after common x86 init so enable_ept is properly set
8771	* up. Hand the parameter mitigation value in which was stored in
8772	* the pre module init parser. If no parameter was given, it will
8773	* contain 'auto' which will be turned into the default 'cond'
8774	* mitigation mode.
8775	*/
8776	r = vmx_setup_l1d_flush(l1tf: vmentry_l1d_flush_param);
8777	if (r)
8778	goto err_l1d_flush;
8779
8780	for_each_possible_cpu(cpu) {
8781	INIT_LIST_HEAD(list: &per_cpu(loaded_vmcss_on_cpu, cpu));
8782
8783	pi_init_cpu(cpu);
8784	}
8785
8786	cpu_emergency_register_virt_callback(callback: vmx_emergency_disable);
8787
8788	vmx_check_vmcs12_offsets();
8789
8790	/*
8791	* Shadow paging doesn't have a (further) performance penalty
8792	* from GUEST_MAXPHYADDR < HOST_MAXPHYADDR so enable it
8793	* by default
8794	*/
8795	if (!enable_ept)
8796	allow_smaller_maxphyaddr = true;
8797
8798	/*
8799	* Common KVM initialization _must_ come last, after this, /dev/kvm is
8800	* exposed to userspace!
8801	*/
8802	r = kvm_init(vcpu_size: sizeof(struct vcpu_vmx), vcpu_align: __alignof__(struct vcpu_vmx),
8803	THIS_MODULE);
8804	if (r)
8805	goto err_kvm_init;
8806
8807	return `0`;
8808
8809	err_kvm_init:
8810	__vmx_exit();
8811	err_l1d_flush:
8812	kvm_x86_vendor_exit();
8813	return r;
8814	}
8815	module_init(vmx_init);
8816

source code of linux/arch/x86/kvm/vmx/vmx.c