mmu.c source code [linux/arch/x86/kvm/mmu/mmu.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Kernel-based Virtual Machine driver for Linux
4	*
5	* This module enables machines with Intel VT-x extensions to run virtual
6	* machines without emulation or binary translation.
7	*
8	* MMU support
9	*
10	* Copyright (C) 2006 Qumranet, Inc.
11	* Copyright 2010 Red Hat, Inc. and/or its affiliates.
12	*
13	* Authors:
14	* Yaniv Kamay <yaniv@qumranet.com>
15	* Avi Kivity <avi@qumranet.com>
16	*/
17	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
18
19	#include "irq.h"
20	#include "ioapic.h"
21	#include "mmu.h"
22	#include "mmu_internal.h"
23	#include "tdp_mmu.h"
24	#include "x86.h"
25	#include "kvm_cache_regs.h"
26	#include "smm.h"
27	#include "kvm_emulate.h"
28	#include "page_track.h"
29	#include "cpuid.h"
30	#include "spte.h"
31
32	#include <linux/kvm_host.h>
33	#include <linux/types.h>
34	#include <linux/string.h>
35	#include <linux/mm.h>
36	#include <linux/highmem.h>
37	#include <linux/moduleparam.h>
38	#include <linux/export.h>
39	#include <linux/swap.h>
40	#include <linux/hugetlb.h>
41	#include <linux/compiler.h>
42	#include <linux/srcu.h>
43	#include <linux/slab.h>
44	#include <linux/sched/signal.h>
45	#include <linux/uaccess.h>
46	#include <linux/hash.h>
47	#include <linux/kern_levels.h>
48	#include <linux/kstrtox.h>
49	#include <linux/kthread.h>
50	#include <linux/wordpart.h>
51
52	#include <asm/page.h>
53	#include <asm/memtype.h>
54	#include <asm/cmpxchg.h>
55	#include <asm/io.h>
56	#include <asm/set_memory.h>
57	#include <asm/spec-ctrl.h>
58	#include <asm/vmx.h>
59
60	#include "trace.h"
61
62	static bool nx_hugepage_mitigation_hard_disabled;
63
64	int __read_mostly nx_huge_pages = -`1`;
65	static uint __read_mostly nx_huge_pages_recovery_period_ms;
66	#ifdef CONFIG_PREEMPT_RT
67	/ Recovery can cause latency spikes, disable it for PREEMPT_RT. /
68	static uint __read_mostly nx_huge_pages_recovery_ratio = `0`;
69	#else
70	static uint __read_mostly nx_huge_pages_recovery_ratio = `60`;
71	#endif
72
/* Forward declarations of the module parameter callbacks wired up below. */
static int get_nx_huge_pages(char *buffer, const struct kernel_param *kp);
static int set_nx_huge_pages(const char *val, const struct kernel_param *kp);
static int set_nx_huge_pages_recovery_param(const char *val, const struct kernel_param *kp);
76
77	static const struct kernel_param_ops nx_huge_pages_ops = {
78	.set = set_nx_huge_pages,
79	.get = get_nx_huge_pages,
80	};
81
82	static const struct kernel_param_ops nx_huge_pages_recovery_param_ops = {
83	.set = set_nx_huge_pages_recovery_param,
84	.get = param_get_uint,
85	};
86
87	module_param_cb(nx_huge_pages, &nx_huge_pages_ops, &nx_huge_pages, `0644`);
88	__MODULE_PARM_TYPE(nx_huge_pages, "bool");
89	module_param_cb(nx_huge_pages_recovery_ratio, &nx_huge_pages_recovery_param_ops,
90	&nx_huge_pages_recovery_ratio, `0644`);
91	__MODULE_PARM_TYPE(nx_huge_pages_recovery_ratio, "uint");
92	module_param_cb(nx_huge_pages_recovery_period_ms, &nx_huge_pages_recovery_param_ops,
93	&nx_huge_pages_recovery_period_ms, `0644`);
94	__MODULE_PARM_TYPE(nx_huge_pages_recovery_period_ms, "uint");
95
96	static bool __read_mostly force_flush_and_sync_on_reuse;
97	module_param_named(flush_on_reuse, force_flush_and_sync_on_reuse, bool, `0644`);
98
99	/*
100	* When setting this variable to true it enables Two-Dimensional-Paging
101	* where the hardware walks 2 page tables:
102	* 1. the guest-virtual to guest-physical
103	* 2. while doing 1. it walks guest-physical to host-physical
104	* If the hardware supports that we don't need to do shadow paging.
105	*/
106	bool tdp_enabled = false;
107
108	static bool __ro_after_init tdp_mmu_allowed;
109
110	#ifdef CONFIG_X86_64
111	bool __read_mostly tdp_mmu_enabled = true;
112	module_param_named(tdp_mmu, tdp_mmu_enabled, bool, `0444`);
113	#endif
114
115	static int max_huge_page_level __read_mostly;
116	static int tdp_root_level __read_mostly;
117	static int max_tdp_level __read_mostly;
118
119	#define PTE_PREFETCH_NUM 8
120
121	#include <trace/events/kvm.h>
122
/* make pte_list_desc fit well in cache lines */
#define PTE_LIST_EXT 14
125
126	/*
127	* struct pte_list_desc is the core data structure used to implement a custom
128	* list for tracking a set of related SPTEs, e.g. all the SPTEs that map a
129	* given GFN when used in the context of rmaps. Using a custom list allows KVM
130	* to optimize for the common case where many GFNs will have at most a handful
131	* of SPTEs pointing at them, i.e. allows packing multiple SPTEs into a small
132	* memory footprint, which in turn improves runtime performance by exploiting
133	* cache locality.
134	*
135	* A list is comprised of one or more pte_list_desc objects (descriptors).
136	* Each individual descriptor stores up to PTE_LIST_EXT SPTEs. If a descriptor
137	* is full and a new SPTEs needs to be added, a new descriptor is allocated and
138	* becomes the head of the list. This means that by definitions, all tail
139	* descriptors are full.
140	*
141	* Note, the meta data fields are deliberately placed at the start of the
142	* structure to optimize the cacheline layout; accessing the descriptor will
143	* touch only a single cacheline so long as @spte_count<=6 (or if only the
144	* descriptors metadata is accessed).
145	*/
146	struct pte_list_desc {
147	struct pte_list_desc *more;
148	/ The number of PTEs stored in _this_ descriptor. /
149	u32 spte_count;
150	/ The number of PTEs stored in all tails of this descriptor. /
151	u32 tail_count;
152	u64 *sptes[PTE_LIST_EXT];
153	};
154
155	struct kvm_shadow_walk_iterator {
156	u64 addr;
157	hpa_t shadow_addr;
158	u64 *sptep;
159	int level;
160	unsigned index;
161	};
162
/* Walk the shadow page tables for @_addr, starting from an explicit root. */
#define for_each_shadow_entry_using_root(_vcpu, _root, _addr, _walker)	\
	for (shadow_walk_init_using_root(&(_walker), (_vcpu),			\
					 (_root), (_addr));			\
	     shadow_walk_okay(&(_walker));					\
	     shadow_walk_next(&(_walker)))

/* Walk the shadow page tables for @_addr using shadow_walk_init(). */
#define for_each_shadow_entry(_vcpu, _addr, _walker)				\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);			\
	     shadow_walk_okay(&(_walker));					\
	     shadow_walk_next(&(_walker)))

/*
 * Lockless variant: every iteration snapshots the current entry into @spte
 * via mmu_spte_get_lockless() and advances based on that snapshot.
 */
#define for_each_shadow_entry_lockless(_vcpu, _addr, _walker, spte)		\
	for (shadow_walk_init(&(_walker), _vcpu, _addr);			\
	     shadow_walk_okay(&(_walker)) &&					\
		({ spte = mmu_spte_get_lockless(_walker.sptep); 1; });		\
	     __shadow_walk_next(&(_walker), spte))
179
180	static struct kmem_cache *pte_list_desc_cache;
181	struct kmem_cache *mmu_page_header_cache;
182	static struct percpu_counter kvm_total_used_mmu_pages;
183
184	static void mmu_spte_set(u64 *sptep, u64 spte);
185
186	struct kvm_mmu_role_regs {
187	const unsigned long cr0;
188	const unsigned long cr4;
189	const u64 efer;
190	};
191
192	#define CREATE_TRACE_POINTS
193	#include "mmutrace.h"
194
195	/*
196	* Yes, lot's of underscores. They're a hint that you probably shouldn't be
197	* reading from the role_regs. Once the root_role is constructed, it becomes
198	* the single source of truth for the MMU's state.
199	*/
200	#define BUILD_MMU_ROLE_REGS_ACCESSOR(reg, name, flag) \
201	static inline bool __maybe_unused \
202	____is_##reg##_##name(const struct kvm_mmu_role_regs *regs) \
203	{ \
204	return !!(regs->reg & flag); \
205	}
206	BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, pg, X86_CR0_PG);
207	BUILD_MMU_ROLE_REGS_ACCESSOR(cr0, wp, X86_CR0_WP);
208	BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pse, X86_CR4_PSE);
209	BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pae, X86_CR4_PAE);
210	BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smep, X86_CR4_SMEP);
211	BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, smap, X86_CR4_SMAP);
212	BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, pke, X86_CR4_PKE);
213	BUILD_MMU_ROLE_REGS_ACCESSOR(cr4, la57, X86_CR4_LA57);
214	BUILD_MMU_ROLE_REGS_ACCESSOR(efer, nx, EFER_NX);
215	BUILD_MMU_ROLE_REGS_ACCESSOR(efer, lma, EFER_LMA);
216
217	/*
218	* The MMU itself (with a valid role) is the single source of truth for the
219	* MMU. Do not use the regs used to build the MMU/role, nor the vCPU. The
220	* regs don't account for dependencies, e.g. clearing CR4 bits if CR0.PG=1,
221	* and the vCPU may be incorrect/irrelevant.
222	*/
223	#define BUILD_MMU_ROLE_ACCESSOR(base_or_ext, reg, name) \
224	static inline bool __maybe_unused is_##reg##_##name(struct kvm_mmu *mmu) \
225	{ \
226	return !!(mmu->cpu_role. base_or_ext . reg##_##name); \
227	}
228	BUILD_MMU_ROLE_ACCESSOR(base, cr0, wp);
229	BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pse);
230	BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smep);
231	BUILD_MMU_ROLE_ACCESSOR(ext, cr4, smap);
232	BUILD_MMU_ROLE_ACCESSOR(ext, cr4, pke);
233	BUILD_MMU_ROLE_ACCESSOR(ext, cr4, la57);
234	BUILD_MMU_ROLE_ACCESSOR(base, efer, nx);
235	BUILD_MMU_ROLE_ACCESSOR(ext, efer, lma);
236
237	static inline bool is_cr0_pg(struct kvm_mmu *mmu)
238	{
239	return mmu->cpu_role.base.level > `0`;
240	}
241
242	static inline bool is_cr4_pae(struct kvm_mmu *mmu)
243	{
244	return !mmu->cpu_role.base.has_4_byte_gpte;
245	}
246
247	static struct kvm_mmu_role_regs vcpu_to_role_regs(struct kvm_vcpu *vcpu)
248	{
249	struct kvm_mmu_role_regs regs = {
250	.cr0 = kvm_read_cr0_bits(vcpu, KVM_MMU_CR0_ROLE_BITS),
251	.cr4 = kvm_read_cr4_bits(vcpu, KVM_MMU_CR4_ROLE_BITS),
252	.efer = vcpu->arch.efer,
253	};
254
255	return regs;
256	}
257
/* Thin wrapper around kvm_read_cr3() for @vcpu. */
static unsigned long get_guest_cr3(struct kvm_vcpu *vcpu)
{
	unsigned long cr3 = kvm_read_cr3(vcpu);

	return cr3;
}
262
263	static inline unsigned long kvm_mmu_get_guest_pgd(struct kvm_vcpu *vcpu,
264	struct kvm_mmu *mmu)
265	{
266	if (IS_ENABLED(CONFIG_MITIGATION_RETPOLINE) && mmu->get_guest_pgd == get_guest_cr3)
267	return kvm_read_cr3(vcpu);
268
269	return mmu->get_guest_pgd(vcpu);
270	}
271
272	static inline bool kvm_available_flush_remote_tlbs_range(void)
273	{
274	#if IS_ENABLED(CONFIG_HYPERV)
275	return kvm_x86_ops.flush_remote_tlbs_range;
276	#else
277	return false;
278	#endif
279	}
280
281	static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page sp, int* index);
282
283	/ Flush the range of guest memory mapped by the given SPTE. /
284	static void kvm_flush_remote_tlbs_sptep(struct kvm kvm, u64 sptep)
285	{
286	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
287	gfn_t gfn = kvm_mmu_page_get_gfn(sp, index: spte_index(sptep));
288
289	kvm_flush_remote_tlbs_gfn(kvm, gfn, level: sp->role.level);
290	}
291
292	static void mark_mmio_spte(struct kvm_vcpu vcpu, u64 sptep, u64 gfn,
293	unsigned int access)
294	{
295	u64 spte = make_mmio_spte(vcpu, gfn, access);
296
297	trace_mark_mmio_spte(sptep, gfn, spte);
298	mmu_spte_set(sptep, spte);
299	}
300
301	static gfn_t get_mmio_spte_gfn(u64 spte)
302	{
303	u64 gpa = spte & shadow_nonpresent_or_rsvd_lower_gfn_mask;
304
305	gpa \|= (spte >> SHADOW_NONPRESENT_OR_RSVD_MASK_LEN)
306	& shadow_nonpresent_or_rsvd_mask;
307
308	return gpa >> PAGE_SHIFT;
309	}
310
311	static unsigned get_mmio_spte_access(u64 spte)
312	{
313	return spte & shadow_mmio_access_mask;
314	}
315
316	static bool check_mmio_spte(struct kvm_vcpu *vcpu, u64 spte)
317	{
318	u64 kvm_gen, spte_gen, gen;
319
320	gen = kvm_vcpu_memslots(vcpu)->generation;
321	if (unlikely(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS))
322	return false;
323
324	kvm_gen = gen & MMIO_SPTE_GEN_MASK;
325	spte_gen = get_mmio_spte_generation(spte);
326
327	trace_check_mmio_spte(spte, kvm_gen, spte_gen);
328	return likely(kvm_gen == spte_gen);
329	}
330
/* Unconditionally reports PSE36 as available. */
static int is_cpuid_PSE36(void)
{
	return 1;
}
335
336	#ifdef CONFIG_X86_64
337	static void __set_spte(u64 *sptep, u64 spte)
338	{
339	WRITE_ONCE(*sptep, spte);
340	}
341
342	static void __update_clear_spte_fast(u64 *sptep, u64 spte)
343	{
344	WRITE_ONCE(*sptep, spte);
345	}
346
347	static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
348	{
349	return xchg(sptep, spte);
350	}
351
352	static u64 __get_spte_lockless(u64 *sptep)
353	{
354	return READ_ONCE(*sptep);
355	}
356	#else
357	union split_spte {
358	struct {
359	u32 spte_low;
360	u32 spte_high;
361	};
362	u64 spte;
363	};
364
365	static void count_spte_clear(u64 *sptep, u64 spte)
366	{
367	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
368
369	if (is_shadow_present_pte(spte))
370	return;
371
372	/ Ensure the spte is completely set before we increase the count /
373	smp_wmb();
374	sp->clear_spte_count++;
375	}
376
377	static void __set_spte(u64 *sptep, u64 spte)
378	{
379	union split_spte *ssptep, sspte;
380
381	ssptep = (union split_spte *)sptep;
382	sspte = (union split_spte)spte;
383
384	ssptep->spte_high = sspte.spte_high;
385
386	/*
387	* If we map the spte from nonpresent to present, We should store
388	* the high bits firstly, then set present bit, so cpu can not
389	* fetch this spte while we are setting the spte.
390	*/
391	smp_wmb();
392
393	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
394	}
395
396	static void __update_clear_spte_fast(u64 *sptep, u64 spte)
397	{
398	union split_spte *ssptep, sspte;
399
400	ssptep = (union split_spte *)sptep;
401	sspte = (union split_spte)spte;
402
403	WRITE_ONCE(ssptep->spte_low, sspte.spte_low);
404
405	/*
406	* If we map the spte from present to nonpresent, we should clear
407	* present bit firstly to avoid vcpu fetch the old high bits.
408	*/
409	smp_wmb();
410
411	ssptep->spte_high = sspte.spte_high;
412	count_spte_clear(sptep, spte);
413	}
414
415	static u64 __update_clear_spte_slow(u64 *sptep, u64 spte)
416	{
417	union split_spte *ssptep, sspte, orig;
418
419	ssptep = (union split_spte *)sptep;
420	sspte = (union split_spte)spte;
421
422	/ xchg acts as a barrier before the setting of the high bits /
423	orig.spte_low = xchg(&ssptep->spte_low, sspte.spte_low);
424	orig.spte_high = ssptep->spte_high;
425	ssptep->spte_high = sspte.spte_high;
426	count_spte_clear(sptep, spte);
427
428	return orig.spte;
429	}
430
431	/*
432	* The idea using the light way get the spte on x86_32 guest is from
433	* gup_get_pte (mm/gup.c).
434	*
435	* An spte tlb flush may be pending, because kvm_set_pte_rmap
436	* coalesces them and we are running out of the MMU lock. Therefore
437	* we need to protect against in-progress updates of the spte.
438	*
439	* Reading the spte while an update is in progress may get the old value
440	* for the high part of the spte. The race is fine for a present->non-present
441	* change (because the high part of the spte is ignored for non-present spte),
442	* but for a present->present change we must reread the spte.
443	*
444	* All such changes are done in two steps (present->non-present and
445	* non-present->present), hence it is enough to count the number of
446	* present->non-present updates: if it changed while reading the spte,
447	* we might have hit the race. This is done using clear_spte_count.
448	*/
449	static u64 __get_spte_lockless(u64 *sptep)
450	{
451	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
452	union split_spte spte, orig = (union* split_spte *)sptep;
453	int count;
454
455	retry:
456	count = sp->clear_spte_count;
457	smp_rmb();
458
459	spte.spte_low = orig->spte_low;
460	smp_rmb();
461
462	spte.spte_high = orig->spte_high;
463	smp_rmb();
464
465	if (unlikely(spte.spte_low != orig->spte_low \|\|
466	count != sp->clear_spte_count))
467	goto retry;
468
469	return spte.spte;
470	}
471	#endif
472
473	/ Rules for using mmu_spte_set:*
474	* Set the sptep from nonpresent to present.
475	* Note: the sptep being assigned must be either not present
476	* or in a state where the hardware will not attempt to update
477	* the spte.
478	*/
479	static void mmu_spte_set(u64 *sptep, u64 new_spte)
480	{
481	WARN_ON_ONCE(is_shadow_present_pte(*sptep));
482	__set_spte(sptep, spte: new_spte);
483	}
484
485	/*
486	* Update the SPTE (excluding the PFN), but do not track changes in its
487	* accessed/dirty status.
488	*/
489	static u64 mmu_spte_update_no_track(u64 *sptep, u64 new_spte)
490	{
491	u64 old_spte = *sptep;
492
493	WARN_ON_ONCE(!is_shadow_present_pte(new_spte));
494	check_spte_writable_invariants(spte: new_spte);
495
496	if (!is_shadow_present_pte(pte: old_spte)) {
497	mmu_spte_set(sptep, new_spte);
498	return old_spte;
499	}
500
501	if (!spte_has_volatile_bits(spte: old_spte))
502	__update_clear_spte_fast(sptep, spte: new_spte);
503	else
504	old_spte = __update_clear_spte_slow(sptep, spte: new_spte);
505
506	WARN_ON_ONCE(spte_to_pfn(old_spte) != spte_to_pfn(new_spte));
507
508	return old_spte;
509	}
510
511	/ Rules for using mmu_spte_update:*
512	* Update the state bits, it means the mapped pfn is not changed.
513	*
514	* Whenever an MMU-writable SPTE is overwritten with a read-only SPTE, remote
515	* TLBs must be flushed. Otherwise rmap_write_protect will find a read-only
516	* spte, even though the writable spte might be cached on a CPU's TLB.
517	*
518	* Returns true if the TLB needs to be flushed
519	*/
520	static bool mmu_spte_update(u64 *sptep, u64 new_spte)
521	{
522	bool flush = false;
523	u64 old_spte = mmu_spte_update_no_track(sptep, new_spte);
524
525	if (!is_shadow_present_pte(pte: old_spte))
526	return false;
527
528	/*
529	* For the spte updated out of mmu-lock is safe, since
530	* we always atomically update it, see the comments in
531	* spte_has_volatile_bits().
532	*/
533	if (is_mmu_writable_spte(spte: old_spte) &&
534	!is_writable_pte(pte: new_spte))
535	flush = true;
536
537	/*
538	* Flush TLB when accessed/dirty states are changed in the page tables,
539	* to guarantee consistency between TLB and page tables.
540	*/
541
542	if (is_accessed_spte(spte: old_spte) && !is_accessed_spte(spte: new_spte)) {
543	flush = true;
544	kvm_set_pfn_accessed(pfn: spte_to_pfn(pte: old_spte));
545	}
546
547	if (is_dirty_spte(spte: old_spte) && !is_dirty_spte(spte: new_spte)) {
548	flush = true;
549	kvm_set_pfn_dirty(pfn: spte_to_pfn(pte: old_spte));
550	}
551
552	return flush;
553	}
554
555	/*
556	* Rules for using mmu_spte_clear_track_bits:
557	* It sets the sptep from present to nonpresent, and track the
558	* state bits, it is used to clear the last level sptep.
559	* Returns the old PTE.
560	*/
561	static u64 mmu_spte_clear_track_bits(struct kvm kvm, u64 sptep)
562	{
563	kvm_pfn_t pfn;
564	u64 old_spte = *sptep;
565	int level = sptep_to_sp(sptep)->role.level;
566	struct page *page;
567
568	if (!is_shadow_present_pte(pte: old_spte) \|\|
569	!spte_has_volatile_bits(spte: old_spte))
570	__update_clear_spte_fast(sptep, spte: `0ull`);
571	else
572	old_spte = __update_clear_spte_slow(sptep, spte: `0ull`);
573
574	if (!is_shadow_present_pte(pte: old_spte))
575	return old_spte;
576
577	kvm_update_page_stats(kvm, level, -`1`);
578
579	pfn = spte_to_pfn(pte: old_spte);
580
581	/*
582	* KVM doesn't hold a reference to any pages mapped into the guest, and
583	* instead uses the mmu_notifier to ensure that KVM unmaps any pages
584	* before they are reclaimed. Sanity check that, if the pfn is backed
585	* by a refcounted page, the refcount is elevated.
586	*/
587	page = kvm_pfn_to_refcounted_page(pfn);
588	WARN_ON_ONCE(page && !page_count(page));
589
590	if (is_accessed_spte(spte: old_spte))
591	kvm_set_pfn_accessed(pfn);
592
593	if (is_dirty_spte(spte: old_spte))
594	kvm_set_pfn_dirty(pfn);
595
596	return old_spte;
597	}
598
599	/*
600	* Rules for using mmu_spte_clear_no_track:
601	* Directly clear spte without caring the state bits of sptep,
602	* it is used to set the upper level spte.
603	*/
604	static void mmu_spte_clear_no_track(u64 *sptep)
605	{
606	__update_clear_spte_fast(sptep, spte: `0ull`);
607	}
608
609	static u64 mmu_spte_get_lockless(u64 *sptep)
610	{
611	return __get_spte_lockless(sptep);
612	}
613
614	/ Returns the Accessed status of the PTE and resets it at the same time. /
615	static bool mmu_spte_age(u64 *sptep)
616	{
617	u64 spte = mmu_spte_get_lockless(sptep);
618
619	if (!is_accessed_spte(spte))
620	return false;
621
622	if (spte_ad_enabled(spte)) {
623	clear_bit(nr: (ffs(shadow_accessed_mask) - `1`),
624	addr: (unsigned long *)sptep);
625	} else {
626	/*
627	* Capture the dirty status of the page, so that it doesn't get
628	* lost when the SPTE is marked for access tracking.
629	*/
630	if (is_writable_pte(pte: spte))
631	kvm_set_pfn_dirty(pfn: spte_to_pfn(pte: spte));
632
633	spte = mark_spte_for_access_track(spte);
634	mmu_spte_update_no_track(sptep, new_spte: spte);
635	}
636
637	return true;
638	}
639
640	static inline bool is_tdp_mmu_active(struct kvm_vcpu *vcpu)
641	{
642	return tdp_mmu_enabled && vcpu->arch.mmu->root_role.direct;
643	}
644
645	static void walk_shadow_page_lockless_begin(struct kvm_vcpu *vcpu)
646	{
647	if (is_tdp_mmu_active(vcpu)) {
648	kvm_tdp_mmu_walk_lockless_begin();
649	} else {
650	/*
651	* Prevent page table teardown by making any free-er wait during
652	* kvm_flush_remote_tlbs() IPI to all active vcpus.
653	*/
654	local_irq_disable();
655
656	/*
657	* Make sure a following spte read is not reordered ahead of the write
658	* to vcpu->mode.
659	*/
660	smp_store_mb(vcpu->mode, READING_SHADOW_PAGE_TABLES);
661	}
662	}
663
664	static void walk_shadow_page_lockless_end(struct kvm_vcpu *vcpu)
665	{
666	if (is_tdp_mmu_active(vcpu)) {
667	kvm_tdp_mmu_walk_lockless_end();
668	} else {
669	/*
670	* Make sure the write to vcpu->mode is not reordered in front of
671	* reads to sptes. If it does, kvm_mmu_commit_zap_page() can see us
672	* OUTSIDE_GUEST_MODE and proceed to free the shadow page table.
673	*/
674	smp_store_release(&vcpu->mode, OUTSIDE_GUEST_MODE);
675	local_irq_enable();
676	}
677	}
678
679	static int mmu_topup_memory_caches(struct kvm_vcpu *vcpu, bool maybe_indirect)
680	{
681	int r;
682
683	/ 1 rmap, 1 parent PTE per level, and the prefetched rmaps. /
684	r = kvm_mmu_topup_memory_cache(mc: &vcpu->arch.mmu_pte_list_desc_cache,
685	min: `1` + PT64_ROOT_MAX_LEVEL + PTE_PREFETCH_NUM);
686	if (r)
687	return r;
688	r = kvm_mmu_topup_memory_cache(mc: &vcpu->arch.mmu_shadow_page_cache,
689	PT64_ROOT_MAX_LEVEL);
690	if (r)
691	return r;
692	if (maybe_indirect) {
693	r = kvm_mmu_topup_memory_cache(mc: &vcpu->arch.mmu_shadowed_info_cache,
694	PT64_ROOT_MAX_LEVEL);
695	if (r)
696	return r;
697	}
698	return kvm_mmu_topup_memory_cache(mc: &vcpu->arch.mmu_page_header_cache,
699	PT64_ROOT_MAX_LEVEL);
700	}
701
702	static void mmu_free_memory_caches(struct kvm_vcpu *vcpu)
703	{
704	kvm_mmu_free_memory_cache(mc: &vcpu->arch.mmu_pte_list_desc_cache);
705	kvm_mmu_free_memory_cache(mc: &vcpu->arch.mmu_shadow_page_cache);
706	kvm_mmu_free_memory_cache(mc: &vcpu->arch.mmu_shadowed_info_cache);
707	kvm_mmu_free_memory_cache(mc: &vcpu->arch.mmu_page_header_cache);
708	}
709
710	static void mmu_free_pte_list_desc(struct pte_list_desc *pte_list_desc)
711	{
712	kmem_cache_free(s: pte_list_desc_cache, objp: pte_list_desc);
713	}
714
715	static bool sp_has_gptes(struct kvm_mmu_page *sp);
716
717	static gfn_t kvm_mmu_page_get_gfn(struct kvm_mmu_page sp, int* index)
718	{
719	if (sp->role.passthrough)
720	return sp->gfn;
721
722	if (!sp->role.direct)
723	return sp->shadowed_translation[index] >> PAGE_SHIFT;
724
725	return sp->gfn + (index << ((sp->role.level - `1`) * SPTE_LEVEL_BITS));
726	}
727
728	/*
729	* For leaf SPTEs, fetch the guest access permissions being shadowed. Note
730	* that the SPTE itself may have a more constrained access permissions that
731	* what the guest enforces. For example, a guest may create an executable
732	* huge PTE but KVM may disallow execution to mitigate iTLB multihit.
733	*/
734	static u32 kvm_mmu_page_get_access(struct kvm_mmu_page sp, int* index)
735	{
736	if (sp_has_gptes(sp))
737	return sp->shadowed_translation[index] & ACC_ALL;
738
739	/*
740	* For direct MMUs (e.g. TDP or non-paging guests) or passthrough SPs,
741	* KVM is not shadowing any guest page tables, so the "guest access
742	* permissions" are just ACC_ALL.
743	*
744	* For direct SPs in indirect MMUs (shadow paging), i.e. when KVM
745	* is shadowing a guest huge page with small pages, the guest access
746	* permissions being shadowed are the access permissions of the huge
747	* page.
748	*
749	* In both cases, sp->role.access contains the correct access bits.
750	*/
751	return sp->role.access;
752	}
753
754	static void kvm_mmu_page_set_translation(struct kvm_mmu_page sp, int* index,
755	gfn_t gfn, unsigned int access)
756	{
757	if (sp_has_gptes(sp)) {
758	sp->shadowed_translation[index] = (gfn << PAGE_SHIFT) \| access;
759	return;
760	}
761
762	WARN_ONCE(access != kvm_mmu_page_get_access(sp, index),
763	"access mismatch under %s page %llx (expected %u, got %u)\n",
764	sp->role.passthrough ? "passthrough" : "direct",
765	sp->gfn, kvm_mmu_page_get_access(sp, index), access);
766
767	WARN_ONCE(gfn != kvm_mmu_page_get_gfn(sp, index),
768	"gfn mismatch under %s page %llx (expected %llx, got %llx)\n",
769	sp->role.passthrough ? "passthrough" : "direct",
770	sp->gfn, kvm_mmu_page_get_gfn(sp, index), gfn);
771	}
772
773	static void kvm_mmu_page_set_access(struct kvm_mmu_page sp, int* index,
774	unsigned int access)
775	{
776	gfn_t gfn = kvm_mmu_page_get_gfn(sp, index);
777
778	kvm_mmu_page_set_translation(sp, index, gfn, access);
779	}
780
781	/*
782	* Return the pointer to the large page information for a given gfn,
783	* handling slots that are not large page aligned.
784	*/
785	static struct kvm_lpage_info *lpage_info_slot(gfn_t gfn,
786	const struct kvm_memory_slot slot, int* level)
787	{
788	unsigned long idx;
789
790	idx = gfn_to_index(gfn, slot->base_gfn, level);
791	return &slot->arch.lpage_info[level - `2`][idx];
792	}
793
/*
 * The most significant bit in disallow_lpage tracks whether or not memory
 * attributes are mixed, i.e. not identical for all gfns at the current level.
 * The lower order bits are used to refcount other cases where a hugepage is
 * disallowed, e.g. if KVM has shadowed a page table at the gfn.
 */
#define KVM_LPAGE_MIXED_FLAG	BIT(31)
801
802	static void update_gfn_disallow_lpage_count(const struct kvm_memory_slot *slot,
803	gfn_t gfn, int count)
804	{
805	struct kvm_lpage_info *linfo;
806	int old, i;
807
808	for (i = PG_LEVEL_2M; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
809	linfo = lpage_info_slot(gfn, slot, level: i);
810
811	old = linfo->disallow_lpage;
812	linfo->disallow_lpage += count;
813	WARN_ON_ONCE((old ^ linfo->disallow_lpage) & KVM_LPAGE_MIXED_FLAG);
814	}
815	}
816
817	void kvm_mmu_gfn_disallow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
818	{
819	update_gfn_disallow_lpage_count(slot, gfn, count: `1`);
820	}
821
822	void kvm_mmu_gfn_allow_lpage(const struct kvm_memory_slot *slot, gfn_t gfn)
823	{
824	update_gfn_disallow_lpage_count(slot, gfn, count: -`1`);
825	}
826
827	static void account_shadowed(struct kvm kvm, struct* kvm_mmu_page *sp)
828	{
829	struct kvm_memslots *slots;
830	struct kvm_memory_slot *slot;
831	gfn_t gfn;
832
833	kvm->arch.indirect_shadow_pages++;
834	gfn = sp->gfn;
835	slots = kvm_memslots_for_spte_role(kvm, sp->role);
836	slot = __gfn_to_memslot(slots, gfn);
837
838	/ the non-leaf shadow pages are keeping readonly. /
839	if (sp->role.level > PG_LEVEL_4K)
840	return __kvm_write_track_add_gfn(kvm, slot, gfn);
841
842	kvm_mmu_gfn_disallow_lpage(slot, gfn);
843
844	if (kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn, min_level: PG_LEVEL_4K))
845	kvm_flush_remote_tlbs_gfn(kvm, gfn, level: PG_LEVEL_4K);
846	}
847
848	void track_possible_nx_huge_page(struct kvm kvm, struct* kvm_mmu_page *sp)
849	{
850	/*
851	* If it's possible to replace the shadow page with an NX huge page,
852	* i.e. if the shadow page is the only thing currently preventing KVM
853	* from using a huge page, add the shadow page to the list of "to be
854	* zapped for NX recovery" pages. Note, the shadow page can already be
855	* on the list if KVM is reusing an existing shadow page, i.e. if KVM
856	* links a shadow page at multiple points.
857	*/
858	if (!list_empty(head: &sp->possible_nx_huge_page_link))
859	return;
860
861	++kvm->stat.nx_lpage_splits;
862	list_add_tail(new: &sp->possible_nx_huge_page_link,
863	head: &kvm->arch.possible_nx_huge_pages);
864	}
865
866	static void account_nx_huge_page(struct kvm kvm, struct* kvm_mmu_page *sp,
867	bool nx_huge_page_possible)
868	{
869	sp->nx_huge_page_disallowed = true;
870
871	if (nx_huge_page_possible)
872	track_possible_nx_huge_page(kvm, sp);
873	}
874
875	static void unaccount_shadowed(struct kvm kvm, struct* kvm_mmu_page *sp)
876	{
877	struct kvm_memslots *slots;
878	struct kvm_memory_slot *slot;
879	gfn_t gfn;
880
881	kvm->arch.indirect_shadow_pages--;
882	gfn = sp->gfn;
883	slots = kvm_memslots_for_spte_role(kvm, sp->role);
884	slot = __gfn_to_memslot(slots, gfn);
885	if (sp->role.level > PG_LEVEL_4K)
886	return __kvm_write_track_remove_gfn(kvm, slot, gfn);
887
888	kvm_mmu_gfn_allow_lpage(slot, gfn);
889	}
890
891	void untrack_possible_nx_huge_page(struct kvm kvm, struct* kvm_mmu_page *sp)
892	{
893	if (list_empty(head: &sp->possible_nx_huge_page_link))
894	return;
895
896	--kvm->stat.nx_lpage_splits;
897	list_del_init(entry: &sp->possible_nx_huge_page_link);
898	}
899
900	static void unaccount_nx_huge_page(struct kvm kvm, struct* kvm_mmu_page *sp)
901	{
902	sp->nx_huge_page_disallowed = false;
903
904	untrack_possible_nx_huge_page(kvm, sp);
905	}
906
907	static struct kvm_memory_slot gfn_to_memslot_dirty_bitmap(struct* kvm_vcpu *vcpu,
908	gfn_t gfn,
909	bool no_dirty_log)
910	{
911	struct kvm_memory_slot *slot;
912
913	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
914	if (!slot \|\| slot->flags & KVM_MEMSLOT_INVALID)
915	return NULL;
916	if (no_dirty_log && kvm_slot_dirty_track_enabled(slot))
917	return NULL;
918
919	return slot;
920	}
921
922	/*
923	* About rmap_head encoding:
924	*
925	* If the bit zero of rmap_head->val is clear, then it points to the only spte
926	* in this rmap chain. Otherwise, (rmap_head->val & ~1) points to a struct
927	* pte_list_desc containing more mappings.
928	*/
929
930	/*
931	* Returns the number of pointers in the rmap chain, not counting the new one.
932	*/
933	static int pte_list_add(struct kvm_mmu_memory_cache cache, u64 spte,
934	struct kvm_rmap_head *rmap_head)
935	{
936	struct pte_list_desc *desc;
937	int count = `0`;
938
939	if (!rmap_head->val) {
940	rmap_head->val = (unsigned long)spte;
941	} else if (!(rmap_head->val & `1`)) {
942	desc = kvm_mmu_memory_cache_alloc(mc: cache);
943	desc->sptes[`0`] = (u64 *)rmap_head->val;
944	desc->sptes[`1`] = spte;
945	desc->spte_count = `2`;
946	desc->tail_count = `0`;
947	rmap_head->val = (unsigned long)desc \| `1`;
948	++count;
949	} else {
950	desc = (struct pte_list_desc *)(rmap_head->val & ~`1ul`);
951	count = desc->tail_count + desc->spte_count;
952
953	/*
954	* If the previous head is full, allocate a new head descriptor
955	* as tail descriptors are always kept full.
956	*/
957	if (desc->spte_count == PTE_LIST_EXT) {
958	desc = kvm_mmu_memory_cache_alloc(mc: cache);
959	desc->more = (struct pte_list_desc *)(rmap_head->val & ~`1ul`);
960	desc->spte_count = `0`;
961	desc->tail_count = count;
962	rmap_head->val = (unsigned long)desc \| `1`;
963	}
964	desc->sptes[desc->spte_count++] = spte;
965	}
966	return count;
967	}
968
969	static void pte_list_desc_remove_entry(struct kvm *kvm,
970	struct kvm_rmap_head *rmap_head,
971	struct pte_list_desc desc, int* i)
972	{
973	struct pte_list_desc head_desc = (struct* pte_list_desc *)(rmap_head->val & ~`1ul`);
974	int j = head_desc->spte_count - `1`;
975
976	/*
977	* The head descriptor should never be empty. A new head is added only
978	* when adding an entry and the previous head is full, and heads are
979	* removed (this flow) when they become empty.
980	*/
981	KVM_BUG_ON_DATA_CORRUPTION(j < `0`, kvm);
982
983	/*
984	* Replace the to-be-freed SPTE with the last valid entry from the head
985	* descriptor to ensure that tail descriptors are full at all times.
986	* Note, this also means that tail_count is stable for each descriptor.
987	*/
988	desc->sptes[i] = head_desc->sptes[j];
989	head_desc->sptes[j] = NULL;
990	head_desc->spte_count--;
991	if (head_desc->spte_count)
992	return;
993
994	/*
995	* The head descriptor is empty. If there are no tail descriptors,
996	* nullify the rmap head to mark the list as empty, else point the rmap
997	* head at the next descriptor, i.e. the new head.
998	*/
999	if (!head_desc->more)
1000	rmap_head->val = `0`;
1001	else
1002	rmap_head->val = (unsigned long)head_desc->more \| `1`;
1003	mmu_free_pte_list_desc(pte_list_desc: head_desc);
1004	}
1005
1006	static void pte_list_remove(struct kvm kvm, u64 spte,
1007	struct kvm_rmap_head *rmap_head)
1008	{
1009	struct pte_list_desc *desc;
1010	int i;
1011
1012	if (KVM_BUG_ON_DATA_CORRUPTION(!rmap_head->val, kvm))
1013	return;
1014
1015	if (!(rmap_head->val & `1`)) {
1016	if (KVM_BUG_ON_DATA_CORRUPTION((u64 *)rmap_head->val != spte, kvm))
1017	return;
1018
1019	rmap_head->val = `0`;
1020	} else {
1021	desc = (struct pte_list_desc *)(rmap_head->val & ~`1ul`);
1022	while (desc) {
1023	for (i = `0`; i < desc->spte_count; ++i) {
1024	if (desc->sptes[i] == spte) {
1025	pte_list_desc_remove_entry(kvm, rmap_head,
1026	desc, i);
1027	return;
1028	}
1029	}
1030	desc = desc->more;
1031	}
1032
1033	KVM_BUG_ON_DATA_CORRUPTION(true, kvm);
1034	}
1035	}
1036
1037	static void kvm_zap_one_rmap_spte(struct kvm *kvm,
1038	struct kvm_rmap_head rmap_head, u64 sptep)
1039	{
1040	mmu_spte_clear_track_bits(kvm, sptep);
1041	pte_list_remove(kvm, spte: sptep, rmap_head);
1042	}
1043
1044	/ Return true if at least one SPTE was zapped, false otherwise /
1045	static bool kvm_zap_all_rmap_sptes(struct kvm *kvm,
1046	struct kvm_rmap_head *rmap_head)
1047	{
1048	struct pte_list_desc desc, next;
1049	int i;
1050
1051	if (!rmap_head->val)
1052	return false;
1053
1054	if (!(rmap_head->val & `1`)) {
1055	mmu_spte_clear_track_bits(kvm, sptep: (u64 *)rmap_head->val);
1056	goto out;
1057	}
1058
1059	desc = (struct pte_list_desc *)(rmap_head->val & ~`1ul`);
1060
1061	for (; desc; desc = next) {
1062	for (i = `0`; i < desc->spte_count; i++)
1063	mmu_spte_clear_track_bits(kvm, sptep: desc->sptes[i]);
1064	next = desc->more;
1065	mmu_free_pte_list_desc(pte_list_desc: desc);
1066	}
1067	out:
1068	/ rmap_head is meaningless now, remember to reset it /
1069	rmap_head->val = `0`;
1070	return true;
1071	}
1072
1073	unsigned int pte_list_count(struct kvm_rmap_head *rmap_head)
1074	{
1075	struct pte_list_desc *desc;
1076
1077	if (!rmap_head->val)
1078	return `0`;
1079	else if (!(rmap_head->val & `1`))
1080	return `1`;
1081
1082	desc = (struct pte_list_desc *)(rmap_head->val & ~`1ul`);
1083	return desc->tail_count + desc->spte_count;
1084	}
1085
1086	static struct kvm_rmap_head gfn_to_rmap(gfn_t gfn, int* level,
1087	const struct kvm_memory_slot *slot)
1088	{
1089	unsigned long idx;
1090
1091	idx = gfn_to_index(gfn, slot->base_gfn, level);
1092	return &slot->arch.rmap[level - PG_LEVEL_4K][idx];
1093	}
1094
1095	static void rmap_remove(struct kvm kvm, u64 spte)
1096	{
1097	struct kvm_memslots *slots;
1098	struct kvm_memory_slot *slot;
1099	struct kvm_mmu_page *sp;
1100	gfn_t gfn;
1101	struct kvm_rmap_head *rmap_head;
1102
1103	sp = sptep_to_sp(sptep: spte);
1104	gfn = kvm_mmu_page_get_gfn(sp, index: spte_index(sptep: spte));
1105
1106	/*
1107	* Unlike rmap_add, rmap_remove does not run in the context of a vCPU
1108	* so we have to determine which memslots to use based on context
1109	* information in sp->role.
1110	*/
1111	slots = kvm_memslots_for_spte_role(kvm, sp->role);
1112
1113	slot = __gfn_to_memslot(slots, gfn);
1114	rmap_head = gfn_to_rmap(gfn, level: sp->role.level, slot);
1115
1116	pte_list_remove(kvm, spte, rmap_head);
1117	}
1118
/*
 * Used by the following functions to iterate through the sptes linked by a
 * rmap.  All fields are private and not assumed to be used outside.
 */
struct rmap_iterator {
	/* private fields */
	struct pte_list_desc *desc;	/* holds the sptep if not NULL */
	int pos;			/* index of the sptep */
};
1128
1129	/*
1130	* Iteration must be started by this function. This should also be used after
1131	* removing/dropping sptes from the rmap link because in such cases the
1132	* information in the iterator may not be valid.
1133	*
1134	* Returns sptep if found, NULL otherwise.
1135	*/
1136	static u64 rmap_get_first(struct* kvm_rmap_head *rmap_head,
1137	struct rmap_iterator *iter)
1138	{
1139	u64 *sptep;
1140
1141	if (!rmap_head->val)
1142	return NULL;
1143
1144	if (!(rmap_head->val & `1`)) {
1145	iter->desc = NULL;
1146	sptep = (u64 *)rmap_head->val;
1147	goto out;
1148	}
1149
1150	iter->desc = (struct pte_list_desc *)(rmap_head->val & ~`1ul`);
1151	iter->pos = `0`;
1152	sptep = iter->desc->sptes[iter->pos];
1153	out:
1154	BUG_ON(!is_shadow_present_pte(*sptep));
1155	return sptep;
1156	}
1157
1158	/*
1159	* Must be used with a valid iterator: e.g. after rmap_get_first().
1160	*
1161	* Returns sptep if found, NULL otherwise.
1162	*/
1163	static u64 rmap_get_next(struct* rmap_iterator *iter)
1164	{
1165	u64 *sptep;
1166
1167	if (iter->desc) {
1168	if (iter->pos < PTE_LIST_EXT - `1`) {
1169	++iter->pos;
1170	sptep = iter->desc->sptes[iter->pos];
1171	if (sptep)
1172	goto out;
1173	}
1174
1175	iter->desc = iter->desc->more;
1176
1177	if (iter->desc) {
1178	iter->pos = `0`;
1179	/ desc->sptes[0] cannot be NULL /
1180	sptep = iter->desc->sptes[iter->pos];
1181	goto out;
1182	}
1183	}
1184
1185	return NULL;
1186	out:
1187	BUG_ON(!is_shadow_present_pte(*sptep));
1188	return sptep;
1189	}
1190
/*
 * Iterate over every SPTE on @_rmap_head_ using rmap_get_first() and
 * rmap_get_next(); @_iter_ is a struct rmap_iterator pointer and @_spte_
 * receives each u64 pointer in turn.
 */
#define for_each_rmap_spte(_rmap_head_, _iter_, _spte_)			\
	for (_spte_ = rmap_get_first(_rmap_head_, _iter_);		\
	     _spte_; _spte_ = rmap_get_next(_iter_))
1194
1195	static void drop_spte(struct kvm kvm, u64 sptep)
1196	{
1197	u64 old_spte = mmu_spte_clear_track_bits(kvm, sptep);
1198
1199	if (is_shadow_present_pte(pte: old_spte))
1200	rmap_remove(kvm, spte: sptep);
1201	}
1202
1203	static void drop_large_spte(struct kvm kvm, u64 sptep, bool flush)
1204	{
1205	struct kvm_mmu_page *sp;
1206
1207	sp = sptep_to_sp(sptep);
1208	WARN_ON_ONCE(sp->role.level == PG_LEVEL_4K);
1209
1210	drop_spte(kvm, sptep);
1211
1212	if (flush)
1213	kvm_flush_remote_tlbs_sptep(kvm, sptep);
1214	}
1215
1216	/*
1217	* Write-protect on the specified @sptep, @pt_protect indicates whether
1218	* spte write-protection is caused by protecting shadow page table.
1219	*
1220	* Note: write protection is difference between dirty logging and spte
1221	* protection:
1222	* - for dirty logging, the spte can be set to writable at anytime if
1223	* its dirty bitmap is properly set.
1224	* - for spte protection, the spte can be writable only after unsync-ing
1225	* shadow page.
1226	*
1227	* Return true if tlb need be flushed.
1228	*/
1229	static bool spte_write_protect(u64 *sptep, bool pt_protect)
1230	{
1231	u64 spte = *sptep;
1232
1233	if (!is_writable_pte(pte: spte) &&
1234	!(pt_protect && is_mmu_writable_spte(spte)))
1235	return false;
1236
1237	if (pt_protect)
1238	spte &= ~shadow_mmu_writable_mask;
1239	spte = spte & ~PT_WRITABLE_MASK;
1240
1241	return mmu_spte_update(sptep, new_spte: spte);
1242	}
1243
1244	static bool rmap_write_protect(struct kvm_rmap_head *rmap_head,
1245	bool pt_protect)
1246	{
1247	u64 *sptep;
1248	struct rmap_iterator iter;
1249	bool flush = false;
1250
1251	for_each_rmap_spte(rmap_head, &iter, sptep)
1252	flush \|= spte_write_protect(sptep, pt_protect);
1253
1254	return flush;
1255	}
1256
1257	static bool spte_clear_dirty(u64 *sptep)
1258	{
1259	u64 spte = *sptep;
1260
1261	KVM_MMU_WARN_ON(!spte_ad_enabled(spte));
1262	spte &= ~shadow_dirty_mask;
1263	return mmu_spte_update(sptep, new_spte: spte);
1264	}
1265
1266	static bool spte_wrprot_for_clear_dirty(u64 *sptep)
1267	{
1268	bool was_writable = test_and_clear_bit(nr: PT_WRITABLE_SHIFT,
1269	addr: (unsigned long *)sptep);
1270	if (was_writable && !spte_ad_enabled(spte: *sptep))
1271	kvm_set_pfn_dirty(pfn: spte_to_pfn(pte: *sptep));
1272
1273	return was_writable;
1274	}
1275
1276	/*
1277	* Gets the GFN ready for another round of dirty logging by clearing the
1278	* - D bit on ad-enabled SPTEs, and
1279	* - W bit on ad-disabled SPTEs.
1280	* Returns true iff any D or W bits were cleared.
1281	*/
1282	static bool __rmap_clear_dirty(struct kvm kvm, struct* kvm_rmap_head *rmap_head,
1283	const struct kvm_memory_slot *slot)
1284	{
1285	u64 *sptep;
1286	struct rmap_iterator iter;
1287	bool flush = false;
1288
1289	for_each_rmap_spte(rmap_head, &iter, sptep)
1290	if (spte_ad_need_write_protect(spte: *sptep))
1291	flush \|= spte_wrprot_for_clear_dirty(sptep);
1292	else
1293	flush \|= spte_clear_dirty(sptep);
1294
1295	return flush;
1296	}
1297
1298	/**
1299	* kvm_mmu_write_protect_pt_masked - write protect selected PT level pages
1300	* @kvm: kvm instance
1301	* @slot: slot to protect
1302	* @gfn_offset: start of the BITS_PER_LONG pages we care about
1303	* @mask: indicates which pages we should protect
1304	*
1305	* Used when we do not need to care about huge page mappings.
1306	*/
1307	static void kvm_mmu_write_protect_pt_masked(struct kvm *kvm,
1308	struct kvm_memory_slot *slot,
1309	gfn_t gfn_offset, unsigned long mask)
1310	{
1311	struct kvm_rmap_head *rmap_head;
1312
1313	if (tdp_mmu_enabled)
1314	kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1315	gfn: slot->base_gfn + gfn_offset, mask, wrprot: true);
1316
1317	if (!kvm_memslots_have_rmaps(kvm))
1318	return;
1319
1320	while (mask) {
1321	rmap_head = gfn_to_rmap(gfn: slot->base_gfn + gfn_offset + __ffs(mask),
1322	level: PG_LEVEL_4K, slot);
1323	rmap_write_protect(rmap_head, pt_protect: false);
1324
1325	/ clear the first set bit /
1326	mask &= mask - `1`;
1327	}
1328	}
1329
1330	/**
1331	* kvm_mmu_clear_dirty_pt_masked - clear MMU D-bit for PT level pages, or write
1332	* protect the page if the D-bit isn't supported.
1333	* @kvm: kvm instance
1334	* @slot: slot to clear D-bit
1335	* @gfn_offset: start of the BITS_PER_LONG pages we care about
1336	* @mask: indicates which pages we should clear D-bit
1337	*
1338	* Used for PML to re-log the dirty GPAs after userspace querying dirty_bitmap.
1339	*/
1340	static void kvm_mmu_clear_dirty_pt_masked(struct kvm *kvm,
1341	struct kvm_memory_slot *slot,
1342	gfn_t gfn_offset, unsigned long mask)
1343	{
1344	struct kvm_rmap_head *rmap_head;
1345
1346	if (tdp_mmu_enabled)
1347	kvm_tdp_mmu_clear_dirty_pt_masked(kvm, slot,
1348	gfn: slot->base_gfn + gfn_offset, mask, wrprot: false);
1349
1350	if (!kvm_memslots_have_rmaps(kvm))
1351	return;
1352
1353	while (mask) {
1354	rmap_head = gfn_to_rmap(gfn: slot->base_gfn + gfn_offset + __ffs(mask),
1355	level: PG_LEVEL_4K, slot);
1356	__rmap_clear_dirty(kvm, rmap_head, slot);
1357
1358	/ clear the first set bit /
1359	mask &= mask - `1`;
1360	}
1361	}
1362
1363	/**
1364	* kvm_arch_mmu_enable_log_dirty_pt_masked - enable dirty logging for selected
1365	* PT level pages.
1366	*
1367	* It calls kvm_mmu_write_protect_pt_masked to write protect selected pages to
1368	* enable dirty logging for them.
1369	*
1370	* We need to care about huge page mappings: e.g. during dirty logging we may
1371	* have such mappings.
1372	*/
1373	void kvm_arch_mmu_enable_log_dirty_pt_masked(struct kvm *kvm,
1374	struct kvm_memory_slot *slot,
1375	gfn_t gfn_offset, unsigned long mask)
1376	{
1377	/*
1378	* Huge pages are NOT write protected when we start dirty logging in
1379	* initially-all-set mode; must write protect them here so that they
1380	* are split to 4K on the first write.
1381	*
1382	* The gfn_offset is guaranteed to be aligned to 64, but the base_gfn
1383	* of memslot has no such restriction, so the range can cross two large
1384	* pages.
1385	*/
1386	if (kvm_dirty_log_manual_protect_and_init_set(kvm)) {
1387	gfn_t start = slot->base_gfn + gfn_offset + __ffs(mask);
1388	gfn_t end = slot->base_gfn + gfn_offset + __fls(word: mask);
1389
1390	if (READ_ONCE(eager_page_split))
1391	kvm_mmu_try_split_huge_pages(kvm, memslot: slot, start, end: end + `1`, target_level: PG_LEVEL_4K);
1392
1393	kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn: start, min_level: PG_LEVEL_2M);
1394
1395	/ Cross two large pages? /
1396	if (ALIGN(start << PAGE_SHIFT, PMD_SIZE) !=
1397	ALIGN(end << PAGE_SHIFT, PMD_SIZE))
1398	kvm_mmu_slot_gfn_write_protect(kvm, slot, gfn: end,
1399	min_level: PG_LEVEL_2M);
1400	}
1401
1402	/ Now handle 4K PTEs. /
1403	if (kvm_x86_ops.cpu_dirty_log_size)
1404	kvm_mmu_clear_dirty_pt_masked(kvm, slot, gfn_offset, mask);
1405	else
1406	kvm_mmu_write_protect_pt_masked(kvm, slot, gfn_offset, mask);
1407	}
1408
1409	int kvm_cpu_dirty_log_size(void)
1410	{
1411	return kvm_x86_ops.cpu_dirty_log_size;
1412	}
1413
1414	bool kvm_mmu_slot_gfn_write_protect(struct kvm *kvm,
1415	struct kvm_memory_slot *slot, u64 gfn,
1416	int min_level)
1417	{
1418	struct kvm_rmap_head *rmap_head;
1419	int i;
1420	bool write_protected = false;
1421
1422	if (kvm_memslots_have_rmaps(kvm)) {
1423	for (i = min_level; i <= KVM_MAX_HUGEPAGE_LEVEL; ++i) {
1424	rmap_head = gfn_to_rmap(gfn, level: i, slot);
1425	write_protected \|= rmap_write_protect(rmap_head, pt_protect: true);
1426	}
1427	}
1428
1429	if (tdp_mmu_enabled)
1430	write_protected \|=
1431	kvm_tdp_mmu_write_protect_gfn(kvm, slot, gfn, min_level);
1432
1433	return write_protected;
1434	}
1435
1436	static bool kvm_vcpu_write_protect_gfn(struct kvm_vcpu *vcpu, u64 gfn)
1437	{
1438	struct kvm_memory_slot *slot;
1439
1440	slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
1441	return kvm_mmu_slot_gfn_write_protect(kvm: vcpu->kvm, slot, gfn, min_level: PG_LEVEL_4K);
1442	}
1443
1444	static bool __kvm_zap_rmap(struct kvm kvm, struct* kvm_rmap_head *rmap_head,
1445	const struct kvm_memory_slot *slot)
1446	{
1447	return kvm_zap_all_rmap_sptes(kvm, rmap_head);
1448	}
1449
1450	static bool kvm_zap_rmap(struct kvm kvm, struct* kvm_rmap_head *rmap_head,
1451	struct kvm_memory_slot slot, gfn_t gfn, int* level,
1452	pte_t unused)
1453	{
1454	return __kvm_zap_rmap(kvm, rmap_head, slot);
1455	}
1456
1457	static bool kvm_set_pte_rmap(struct kvm kvm, struct* kvm_rmap_head *rmap_head,
1458	struct kvm_memory_slot slot, gfn_t gfn, int* level,
1459	pte_t pte)
1460	{
1461	u64 *sptep;
1462	struct rmap_iterator iter;
1463	bool need_flush = false;
1464	u64 new_spte;
1465	kvm_pfn_t new_pfn;
1466
1467	WARN_ON_ONCE(pte_huge(pte));
1468	new_pfn = pte_pfn(pte);
1469
1470	restart:
1471	for_each_rmap_spte(rmap_head, &iter, sptep) {
1472	need_flush = true;
1473
1474	if (pte_write(pte)) {
1475	kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
1476	goto restart;
1477	} else {
1478	new_spte = kvm_mmu_changed_pte_notifier_make_spte(
1479	old_spte: *sptep, new_pfn);
1480
1481	mmu_spte_clear_track_bits(kvm, sptep);
1482	mmu_spte_set(sptep, new_spte);
1483	}
1484	}
1485
1486	if (need_flush && kvm_available_flush_remote_tlbs_range()) {
1487	kvm_flush_remote_tlbs_gfn(kvm, gfn, level);
1488	return false;
1489	}
1490
1491	return need_flush;
1492	}
1493
1494	struct slot_rmap_walk_iterator {
1495	/ input fields. /
1496	const struct kvm_memory_slot *slot;
1497	gfn_t start_gfn;
1498	gfn_t end_gfn;
1499	int start_level;
1500	int end_level;
1501
1502	/ output fields. /
1503	gfn_t gfn;
1504	struct kvm_rmap_head *rmap;
1505	int level;
1506
1507	/ private field. /
1508	struct kvm_rmap_head *end_rmap;
1509	};
1510
1511	static void rmap_walk_init_level(struct slot_rmap_walk_iterator *iterator,
1512	int level)
1513	{
1514	iterator->level = level;
1515	iterator->gfn = iterator->start_gfn;
1516	iterator->rmap = gfn_to_rmap(gfn: iterator->gfn, level, slot: iterator->slot);
1517	iterator->end_rmap = gfn_to_rmap(gfn: iterator->end_gfn, level, slot: iterator->slot);
1518	}
1519
1520	static void slot_rmap_walk_init(struct slot_rmap_walk_iterator *iterator,
1521	const struct kvm_memory_slot *slot,
1522	int start_level, int end_level,
1523	gfn_t start_gfn, gfn_t end_gfn)
1524	{
1525	iterator->slot = slot;
1526	iterator->start_level = start_level;
1527	iterator->end_level = end_level;
1528	iterator->start_gfn = start_gfn;
1529	iterator->end_gfn = end_gfn;
1530
1531	rmap_walk_init_level(iterator, level: iterator->start_level);
1532	}
1533
1534	static bool slot_rmap_walk_okay(struct slot_rmap_walk_iterator *iterator)
1535	{
1536	return !!iterator->rmap;
1537	}
1538
1539	static void slot_rmap_walk_next(struct slot_rmap_walk_iterator *iterator)
1540	{
1541	while (++iterator->rmap <= iterator->end_rmap) {
1542	iterator->gfn += (`1UL` << KVM_HPAGE_GFN_SHIFT(iterator->level));
1543
1544	if (iterator->rmap->val)
1545	return;
1546	}
1547
1548	if (++iterator->level > iterator->end_level) {
1549	iterator->rmap = NULL;
1550	return;
1551	}
1552
1553	rmap_walk_init_level(iterator, level: iterator->level);
1554	}
1555
/*
 * Walk the rmaps of [_start_gfn, _end_gfn] in @_slot_ for every level from
 * @_start_level_ to @_end_level_; @_iter_ points at a
 * struct slot_rmap_walk_iterator.
 */
#define for_each_slot_rmap_range(_slot_, _start_level_, _end_level_,	\
	   _start_gfn, _end_gfn, _iter_)				\
	for (slot_rmap_walk_init(_iter_, _slot_, _start_level_,		\
				 _end_level_, _start_gfn, _end_gfn);	\
	     slot_rmap_walk_okay(_iter_);				\
	     slot_rmap_walk_next(_iter_))
1562
1563	typedef bool (rmap_handler_t)(struct* kvm kvm, struct* kvm_rmap_head *rmap_head,
1564	struct kvm_memory_slot *slot, gfn_t gfn,
1565	int level, pte_t pte);
1566
1567	static __always_inline bool kvm_handle_gfn_range(struct kvm *kvm,
1568	struct kvm_gfn_range *range,
1569	rmap_handler_t handler)
1570	{
1571	struct slot_rmap_walk_iterator iterator;
1572	bool ret = false;
1573
1574	for_each_slot_rmap_range(range->slot, PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
1575	range->start, range->end - `1`, &iterator)
1576	ret \|= handler(kvm, iterator.rmap, range->slot, iterator.gfn,
1577	iterator.level, range->arg.pte);
1578
1579	return ret;
1580	}
1581
1582	bool kvm_unmap_gfn_range(struct kvm kvm, struct* kvm_gfn_range *range)
1583	{
1584	bool flush = false;
1585
1586	if (kvm_memslots_have_rmaps(kvm))
1587	flush = kvm_handle_gfn_range(kvm, range, handler: kvm_zap_rmap);
1588
1589	if (tdp_mmu_enabled)
1590	flush = kvm_tdp_mmu_unmap_gfn_range(kvm, range, flush);
1591
1592	if (kvm_x86_ops.set_apic_access_page_addr &&
1593	range->slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT)
1594	kvm_make_all_cpus_request(kvm, KVM_REQ_APIC_PAGE_RELOAD);
1595
1596	return flush;
1597	}
1598
1599	bool kvm_set_spte_gfn(struct kvm kvm, struct* kvm_gfn_range *range)
1600	{
1601	bool flush = false;
1602
1603	if (kvm_memslots_have_rmaps(kvm))
1604	flush = kvm_handle_gfn_range(kvm, range, handler: kvm_set_pte_rmap);
1605
1606	if (tdp_mmu_enabled)
1607	flush \|= kvm_tdp_mmu_set_spte_gfn(kvm, range);
1608
1609	return flush;
1610	}
1611
1612	static bool kvm_age_rmap(struct kvm kvm, struct* kvm_rmap_head *rmap_head,
1613	struct kvm_memory_slot slot, gfn_t gfn, int* level,
1614	pte_t unused)
1615	{
1616	u64 *sptep;
1617	struct rmap_iterator iter;
1618	int young = `0`;
1619
1620	for_each_rmap_spte(rmap_head, &iter, sptep)
1621	young \|= mmu_spte_age(sptep);
1622
1623	return young;
1624	}
1625
1626	static bool kvm_test_age_rmap(struct kvm kvm, struct* kvm_rmap_head *rmap_head,
1627	struct kvm_memory_slot *slot, gfn_t gfn,
1628	int level, pte_t unused)
1629	{
1630	u64 *sptep;
1631	struct rmap_iterator iter;
1632
1633	for_each_rmap_spte(rmap_head, &iter, sptep)
1634	if (is_accessed_spte(spte: *sptep))
1635	return true;
1636	return false;
1637	}
1638
1639	#define RMAP_RECYCLE_THRESHOLD 1000
1640
1641	static void __rmap_add(struct kvm *kvm,
1642	struct kvm_mmu_memory_cache *cache,
1643	const struct kvm_memory_slot *slot,
1644	u64 spte, gfn_t gfn, unsigned* int access)
1645	{
1646	struct kvm_mmu_page *sp;
1647	struct kvm_rmap_head *rmap_head;
1648	int rmap_count;
1649
1650	sp = sptep_to_sp(sptep: spte);
1651	kvm_mmu_page_set_translation(sp, index: spte_index(sptep: spte), gfn, access);
1652	kvm_update_page_stats(kvm, sp->role.level, `1`);
1653
1654	rmap_head = gfn_to_rmap(gfn, level: sp->role.level, slot);
1655	rmap_count = pte_list_add(cache, spte, rmap_head);
1656
1657	if (rmap_count > kvm->stat.max_mmu_rmap_size)
1658	kvm->stat.max_mmu_rmap_size = rmap_count;
1659	if (rmap_count > RMAP_RECYCLE_THRESHOLD) {
1660	kvm_zap_all_rmap_sptes(kvm, rmap_head);
1661	kvm_flush_remote_tlbs_gfn(kvm, gfn, level: sp->role.level);
1662	}
1663	}
1664
1665	static void rmap_add(struct kvm_vcpu vcpu, const* struct kvm_memory_slot *slot,
1666	u64 spte, gfn_t gfn, unsigned* int access)
1667	{
1668	struct kvm_mmu_memory_cache *cache = &vcpu->arch.mmu_pte_list_desc_cache;
1669
1670	__rmap_add(kvm: vcpu->kvm, cache, slot, spte, gfn, access);
1671	}
1672
1673	bool kvm_age_gfn(struct kvm kvm, struct* kvm_gfn_range *range)
1674	{
1675	bool young = false;
1676
1677	if (kvm_memslots_have_rmaps(kvm))
1678	young = kvm_handle_gfn_range(kvm, range, handler: kvm_age_rmap);
1679
1680	if (tdp_mmu_enabled)
1681	young \|= kvm_tdp_mmu_age_gfn_range(kvm, range);
1682
1683	return young;
1684	}
1685
1686	bool kvm_test_age_gfn(struct kvm kvm, struct* kvm_gfn_range *range)
1687	{
1688	bool young = false;
1689
1690	if (kvm_memslots_have_rmaps(kvm))
1691	young = kvm_handle_gfn_range(kvm, range, handler: kvm_test_age_rmap);
1692
1693	if (tdp_mmu_enabled)
1694	young \|= kvm_tdp_mmu_test_age_gfn(kvm, range);
1695
1696	return young;
1697	}
1698
/*
 * Debug-only sanity check (CONFIG_KVM_PROVE_MMU): warn about, and log, any
 * SPTE in @sp that is still shadow-present when the page is being freed.
 * Compiles to an empty function otherwise.
 */
static void kvm_mmu_check_sptes_at_free(struct kvm_mmu_page *sp)
{
#ifdef CONFIG_KVM_PROVE_MMU
	int i;

	for (i = 0; i < SPTE_ENT_PER_PAGE; i++) {
		if (KVM_MMU_WARN_ON(is_shadow_present_pte(sp->spt[i])))
			pr_err_ratelimited("SPTE %llx (@ %p) for gfn %llx shadow-present at free",
					   sp->spt[i], &sp->spt[i],
					   kvm_mmu_page_get_gfn(sp, i));
	}
#endif
}
1712
1713	/*
1714	* This value is the sum of all of the kvm instances's
1715	* kvm->arch.n_used_mmu_pages values. We need a global,
1716	* aggregate version in order to make the slab shrinker
1717	* faster
1718	*/
1719	static inline void kvm_mod_used_mmu_pages(struct kvm kvm, long* nr)
1720	{
1721	kvm->arch.n_used_mmu_pages += nr;
1722	percpu_counter_add(fbc: &kvm_total_used_mmu_pages, amount: nr);
1723	}
1724
1725	static void kvm_account_mmu_page(struct kvm kvm, struct* kvm_mmu_page *sp)
1726	{
1727	kvm_mod_used_mmu_pages(kvm, nr: +`1`);
1728	kvm_account_pgtable_pages(virt: (void *)sp->spt, nr: +`1`);
1729	}
1730
1731	static void kvm_unaccount_mmu_page(struct kvm kvm, struct* kvm_mmu_page *sp)
1732	{
1733	kvm_mod_used_mmu_pages(kvm, nr: -`1`);
1734	kvm_account_pgtable_pages(virt: (void *)sp->spt, nr: -`1`);
1735	}
1736
1737	static void kvm_mmu_free_shadow_page(struct kvm_mmu_page *sp)
1738	{
1739	kvm_mmu_check_sptes_at_free(sp);
1740
1741	hlist_del(n: &sp->hash_link);
1742	list_del(entry: &sp->link);
1743	free_page((unsigned long)sp->spt);
1744	if (!sp->role.direct)
1745	free_page((unsigned long)sp->shadowed_translation);
1746	kmem_cache_free(s: mmu_page_header_cache, objp: sp);
1747	}
1748
1749	static unsigned kvm_page_table_hashfn(gfn_t gfn)
1750	{
1751	return hash_64(val: gfn, KVM_MMU_HASH_SHIFT);
1752	}
1753
1754	static void mmu_page_add_parent_pte(struct kvm_mmu_memory_cache *cache,
1755	struct kvm_mmu_page sp, u64 parent_pte)
1756	{
1757	if (!parent_pte)
1758	return;
1759
1760	pte_list_add(cache, spte: parent_pte, rmap_head: &sp->parent_ptes);
1761	}
1762
1763	static void mmu_page_remove_parent_pte(struct kvm kvm, struct* kvm_mmu_page *sp,
1764	u64 *parent_pte)
1765	{
1766	pte_list_remove(kvm, spte: parent_pte, rmap_head: &sp->parent_ptes);
1767	}
1768
1769	static void drop_parent_pte(struct kvm kvm, struct* kvm_mmu_page *sp,
1770	u64 *parent_pte)
1771	{
1772	mmu_page_remove_parent_pte(kvm, sp, parent_pte);
1773	mmu_spte_clear_no_track(sptep: parent_pte);
1774	}
1775
1776	static void mark_unsync(u64 *spte);
1777	static void kvm_mmu_mark_parents_unsync(struct kvm_mmu_page *sp)
1778	{
1779	u64 *sptep;
1780	struct rmap_iterator iter;
1781
1782	for_each_rmap_spte(&sp->parent_ptes, &iter, sptep) {
1783	mark_unsync(spte: sptep);
1784	}
1785	}
1786
1787	static void mark_unsync(u64 *spte)
1788	{
1789	struct kvm_mmu_page *sp;
1790
1791	sp = sptep_to_sp(sptep: spte);
1792	if (__test_and_set_bit(spte_index(spte), sp->unsync_child_bitmap))
1793	return;
1794	if (sp->unsync_children++)
1795	return;
1796	kvm_mmu_mark_parents_unsync(sp);
1797	}
1798
/* Capacity of the kvm_mmu_pages batch. */
#define KVM_PAGE_ARRAY_NR 16

/*
 * Fixed-size batch of shadow pages collected by the unsync walk.  Each entry
 * pairs a page with the index passed to mmu_pages_add() for it.
 */
struct kvm_mmu_pages {
	struct mmu_page_and_offset {
		struct kvm_mmu_page *sp;
		unsigned int idx;
	} page[KVM_PAGE_ARRAY_NR];
	unsigned int nr;	/* number of valid entries in page[] */
};
1808
1809	static int mmu_pages_add(struct kvm_mmu_pages pvec, struct* kvm_mmu_page *sp,
1810	int idx)
1811	{
1812	int i;
1813
1814	if (sp->unsync)
1815	for (i=`0`; i < pvec->nr; i++)
1816	if (pvec->page[i].sp == sp)
1817	return `0`;
1818
1819	pvec->page[pvec->nr].sp = sp;
1820	pvec->page[pvec->nr].idx = idx;
1821	pvec->nr++;
1822	return (pvec->nr == KVM_PAGE_ARRAY_NR);
1823	}
1824
1825	static inline void clear_unsync_child_bit(struct kvm_mmu_page sp, int* idx)
1826	{
1827	--sp->unsync_children;
1828	WARN_ON_ONCE((int)sp->unsync_children < `0`);
1829	__clear_bit(idx, sp->unsync_child_bitmap);
1830	}
1831
1832	static int __mmu_unsync_walk(struct kvm_mmu_page *sp,
1833	struct kvm_mmu_pages *pvec)
1834	{
1835	int i, ret, nr_unsync_leaf = `0`;
1836
1837	for_each_set_bit(i, sp->unsync_child_bitmap, `512`) {
1838	struct kvm_mmu_page *child;
1839	u64 ent = sp->spt[i];
1840
1841	if (!is_shadow_present_pte(pte: ent) \|\| is_large_pte(pte: ent)) {
1842	clear_unsync_child_bit(sp, idx: i);
1843	continue;
1844	}
1845
1846	child = spte_to_child_sp(spte: ent);
1847
1848	if (child->unsync_children) {
1849	if (mmu_pages_add(pvec, sp: child, idx: i))
1850	return -ENOSPC;
1851
1852	ret = __mmu_unsync_walk(sp: child, pvec);
1853	if (!ret) {
1854	clear_unsync_child_bit(sp, idx: i);
1855	continue;
1856	} else if (ret > `0`) {
1857	nr_unsync_leaf += ret;
1858	} else
1859	return ret;
1860	} else if (child->unsync) {
1861	nr_unsync_leaf++;
1862	if (mmu_pages_add(pvec, sp: child, idx: i))
1863	return -ENOSPC;
1864	} else
1865	clear_unsync_child_bit(sp, idx: i);
1866	}
1867
1868	return nr_unsync_leaf;
1869	}
1870
1871	#define INVALID_INDEX (-1)
1872
1873	static int mmu_unsync_walk(struct kvm_mmu_page *sp,
1874	struct kvm_mmu_pages *pvec)
1875	{
1876	pvec->nr = `0`;
1877	if (!sp->unsync_children)
1878	return `0`;
1879
1880	mmu_pages_add(pvec, sp, INVALID_INDEX);
1881	return __mmu_unsync_walk(sp, pvec);
1882	}
1883
1884	static void kvm_unlink_unsync_page(struct kvm kvm, struct* kvm_mmu_page *sp)
1885	{
1886	WARN_ON_ONCE(!sp->unsync);
1887	trace_kvm_mmu_sync_page(sp);
1888	sp->unsync = `0`;
1889	--kvm->stat.mmu_unsync;
1890	}
1891
1892	static bool kvm_mmu_prepare_zap_page(struct kvm kvm, struct* kvm_mmu_page *sp,
1893	struct list_head *invalid_list);
1894	static void kvm_mmu_commit_zap_page(struct kvm *kvm,
1895	struct list_head *invalid_list);
1896
1897	static bool sp_has_gptes(struct kvm_mmu_page *sp)
1898	{
1899	if (sp->role.direct)
1900	return false;
1901
1902	if (sp->role.passthrough)
1903	return false;
1904
1905	return true;
1906	}
1907
/*
 * Iterate over the shadow pages on hash list @_list, skipping obsolete ones.
 * The "if (...) { } else" form lets the macro be followed by a normal loop
 * body.
 */
#define for_each_valid_sp(_kvm, _sp, _list)				\
	hlist_for_each_entry(_sp, _list, hash_link)			\
		if (is_obsolete_sp((_kvm), (_sp))) {			\
		} else

/*
 * Iterate over the valid shadow pages hashed for @_gfn whose gfn matches and
 * for which sp_has_gptes() is true.
 */
#define for_each_gfn_valid_sp_with_gptes(_kvm, _sp, _gfn)		\
	for_each_valid_sp(_kvm, _sp,					\
	  &(_kvm)->arch.mmu_page_hash[kvm_page_table_hashfn(_gfn)])	\
		if ((_sp)->gfn != (_gfn) || !sp_has_gptes(_sp)) {} else
1917
1918	static bool kvm_sync_page_check(struct kvm_vcpu vcpu, struct* kvm_mmu_page *sp)
1919	{
1920	union kvm_mmu_page_role root_role = vcpu->arch.mmu->root_role;
1921
1922	/*
1923	* Ignore various flags when verifying that it's safe to sync a shadow
1924	* page using the current MMU context.
1925	*
1926	* - level: not part of the overall MMU role and will never match as the MMU's
1927	* level tracks the root level
1928	* - access: updated based on the new guest PTE
1929	* - quadrant: not part of the overall MMU role (similar to level)
1930	*/
1931	const union kvm_mmu_page_role sync_role_ign = {
1932	.level = `0xf`,
1933	.access = `0x7`,
1934	.quadrant = `0x3`,
1935	.passthrough = `0x1`,
1936	};
1937
1938	/*
1939	* Direct pages can never be unsync, and KVM should never attempt to
1940	* sync a shadow page for a different MMU context, e.g. if the role
1941	* differs then the memslot lookup (SMM vs. non-SMM) will be bogus, the
1942	* reserved bits checks will be wrong, etc...
1943	*/
1944	if (WARN_ON_ONCE(sp->role.direct \|\| !vcpu->arch.mmu->sync_spte \|\|
1945	(sp->role.word ^ root_role.word) & ~sync_role_ign.word))
1946	return false;
1947
1948	return true;
1949	}
1950
1951	static int kvm_sync_spte(struct kvm_vcpu vcpu, struct* kvm_mmu_page sp, int* i)
1952	{
1953	if (!sp->spt[i])
1954	return `0`;
1955
1956	return vcpu->arch.mmu->sync_spte(vcpu, sp, i);
1957	}
1958
1959	static int __kvm_sync_page(struct kvm_vcpu vcpu, struct* kvm_mmu_page *sp)
1960	{
1961	int flush = `0`;
1962	int i;
1963
1964	if (!kvm_sync_page_check(vcpu, sp))
1965	return -`1`;
1966
1967	for (i = `0`; i < SPTE_ENT_PER_PAGE; i++) {
1968	int ret = kvm_sync_spte(vcpu, sp, i);
1969
1970	if (ret < -`1`)
1971	return -`1`;
1972	flush \|= ret;
1973	}
1974
1975	/*
1976	* Note, any flush is purely for KVM's correctness, e.g. when dropping
1977	* an existing SPTE or clearing W/A/D bits to ensure an mmu_notifier
1978	* unmap or dirty logging event doesn't fail to flush. The guest is
1979	* responsible for flushing the TLB to ensure any changes in protection
1980	* bits are recognized, i.e. until the guest flushes or page faults on
1981	* a relevant address, KVM is architecturally allowed to let vCPUs use
1982	* cached translations with the old protection bits.
1983	*/
1984	return flush;
1985	}
1986
1987	static int kvm_sync_page(struct kvm_vcpu vcpu, struct* kvm_mmu_page *sp,
1988	struct list_head *invalid_list)
1989	{
1990	int ret = __kvm_sync_page(vcpu, sp);
1991
1992	if (ret < `0`)
1993	kvm_mmu_prepare_zap_page(kvm: vcpu->kvm, sp, invalid_list);
1994	return ret;
1995	}
1996
1997	static bool kvm_mmu_remote_flush_or_zap(struct kvm *kvm,
1998	struct list_head *invalid_list,
1999	bool remote_flush)
2000	{
2001	if (!remote_flush && list_empty(head: invalid_list))
2002	return false;
2003
2004	if (!list_empty(head: invalid_list))
2005	kvm_mmu_commit_zap_page(kvm, invalid_list);
2006	else
2007	kvm_flush_remote_tlbs(kvm);
2008	return true;
2009	}
2010
2011	static bool is_obsolete_sp(struct kvm kvm, struct* kvm_mmu_page *sp)
2012	{
2013	if (sp->role.invalid)
2014	return true;
2015
2016	/ TDP MMU pages do not use the MMU generation. /
2017	return !is_tdp_mmu_page(sp) &&
2018	unlikely(sp->mmu_valid_gen != kvm->arch.mmu_valid_gen);
2019	}
2020
2021	struct mmu_page_path {
2022	struct kvm_mmu_page *parent[PT64_ROOT_MAX_LEVEL];
2023	unsigned int idx[PT64_ROOT_MAX_LEVEL];
2024	};
2025
/*
 * Iterate over the pages in @pvec selected by mmu_pages_first() and
 * mmu_pages_next(), keeping @parents up to date with the path to each one.
 * @pvec and @parents are objects, not pointers.
 */
#define for_each_sp(pvec, sp, parents, i)			\
		for (i = mmu_pages_first(&pvec, &parents);	\
			i < pvec.nr && ({ sp = pvec.page[i].sp; 1;});	\
			i = mmu_pages_next(&pvec, &parents, i))
2030
2031	static int mmu_pages_next(struct kvm_mmu_pages *pvec,
2032	struct mmu_page_path *parents,
2033	int i)
2034	{
2035	int n;
2036
2037	for (n = i+`1`; n < pvec->nr; n++) {
2038	struct kvm_mmu_page *sp = pvec->page[n].sp;
2039	unsigned idx = pvec->page[n].idx;
2040	int level = sp->role.level;
2041
2042	parents->idx[level-`1`] = idx;
2043	if (level == PG_LEVEL_4K)
2044	break;
2045
2046	parents->parent[level-`2`] = sp;
2047	}
2048
2049	return n;
2050	}
2051
2052	static int mmu_pages_first(struct kvm_mmu_pages *pvec,
2053	struct mmu_page_path *parents)
2054	{
2055	struct kvm_mmu_page *sp;
2056	int level;
2057
2058	if (pvec->nr == `0`)
2059	return `0`;
2060
2061	WARN_ON_ONCE(pvec->page[`0`].idx != INVALID_INDEX);
2062
2063	sp = pvec->page[`0`].sp;
2064	level = sp->role.level;
2065	WARN_ON_ONCE(level == PG_LEVEL_4K);
2066
2067	parents->parent[level-`2`] = sp;
2068
2069	/ Also set up a sentinel. Further entries in pvec are all*
2070	* children of sp, so this element is never overwritten.
2071	*/
2072	parents->parent[level-`1`] = NULL;
2073	return mmu_pages_next(pvec, parents, i: `0`);
2074	}
2075
2076	static void mmu_pages_clear_parents(struct mmu_page_path *parents)
2077	{
2078	struct kvm_mmu_page *sp;
2079	unsigned int level = `0`;
2080
2081	do {
2082	unsigned int idx = parents->idx[level];
2083	sp = parents->parent[level];
2084	if (!sp)
2085	return;
2086
2087	WARN_ON_ONCE(idx == INVALID_INDEX);
2088	clear_unsync_child_bit(sp, idx);
2089	level++;
2090	} while (!sp->unsync_children);
2091	}
2092
2093	static int mmu_sync_children(struct kvm_vcpu *vcpu,
2094	struct kvm_mmu_page *parent, bool can_yield)
2095	{
2096	int i;
2097	struct kvm_mmu_page *sp;
2098	struct mmu_page_path parents;
2099	struct kvm_mmu_pages pages;
2100	LIST_HEAD(invalid_list);
2101	bool flush = false;
2102
2103	while (mmu_unsync_walk(sp: parent, pvec: &pages)) {
2104	bool protected = false;
2105
2106	for_each_sp(pages, sp, parents, i)
2107	protected \|= kvm_vcpu_write_protect_gfn(vcpu, gfn: sp->gfn);
2108
2109	if (protected) {
2110	kvm_mmu_remote_flush_or_zap(kvm: vcpu->kvm, invalid_list: &invalid_list, remote_flush: true);
2111	flush = false;
2112	}
2113
2114	for_each_sp(pages, sp, parents, i) {
2115	kvm_unlink_unsync_page(kvm: vcpu->kvm, sp);
2116	flush \|= kvm_sync_page(vcpu, sp, invalid_list: &invalid_list) > `0`;
2117	mmu_pages_clear_parents(parents: &parents);
2118	}
2119	if (need_resched() \|\| rwlock_needbreak(lock: &vcpu->kvm->mmu_lock)) {
2120	kvm_mmu_remote_flush_or_zap(kvm: vcpu->kvm, invalid_list: &invalid_list, remote_flush: flush);
2121	if (!can_yield) {
2122	kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
2123	return -EINTR;
2124	}
2125
2126	cond_resched_rwlock_write(&vcpu->kvm->mmu_lock);
2127	flush = false;
2128	}
2129	}
2130
2131	kvm_mmu_remote_flush_or_zap(kvm: vcpu->kvm, invalid_list: &invalid_list, remote_flush: flush);
2132	return `0`;
2133	}
2134
2135	static void __clear_sp_write_flooding_count(struct kvm_mmu_page *sp)
2136	{
2137	atomic_set(v: &sp->write_flooding_count, i: `0`);
2138	}
2139
2140	static void clear_sp_write_flooding_count(u64 *spte)
2141	{
2142	__clear_sp_write_flooding_count(sp: sptep_to_sp(sptep: spte));
2143	}
2144
2145	/*
2146	* The vCPU is required when finding indirect shadow pages; the shadow
2147	* page may already exist and syncing it needs the vCPU pointer in
2148	* order to read guest page tables. Direct shadow pages are never
2149	* unsync, thus @vcpu can be NULL if @role.direct is true.
2150	*/
2151	static struct kvm_mmu_page kvm_mmu_find_shadow_page(struct* kvm *kvm,
2152	struct kvm_vcpu *vcpu,
2153	gfn_t gfn,
2154	struct hlist_head *sp_list,
2155	union kvm_mmu_page_role role)
2156	{
2157	struct kvm_mmu_page *sp;
2158	int ret;
2159	int collisions = `0`;
2160	LIST_HEAD(invalid_list);
2161
2162	for_each_valid_sp(kvm, sp, sp_list) {
2163	if (sp->gfn != gfn) {
2164	collisions++;
2165	continue;
2166	}
2167
2168	if (sp->role.word != role.word) {
2169	/*
2170	* If the guest is creating an upper-level page, zap
2171	* unsync pages for the same gfn. While it's possible
2172	* the guest is using recursive page tables, in all
2173	* likelihood the guest has stopped using the unsync
2174	* page and is installing a completely unrelated page.
2175	* Unsync pages must not be left as is, because the new
2176	* upper-level page will be write-protected.
2177	*/
2178	if (role.level > PG_LEVEL_4K && sp->unsync)
2179	kvm_mmu_prepare_zap_page(kvm, sp,
2180	invalid_list: &invalid_list);
2181	continue;
2182	}
2183
2184	/ unsync and write-flooding only apply to indirect SPs. /
2185	if (sp->role.direct)
2186	goto out;
2187
2188	if (sp->unsync) {
2189	if (KVM_BUG_ON(!vcpu, kvm))
2190	break;
2191
2192	/*
2193	* The page is good, but is stale. kvm_sync_page does
2194	* get the latest guest state, but (unlike mmu_unsync_children)
2195	* it doesn't write-protect the page or mark it synchronized!
2196	* This way the validity of the mapping is ensured, but the
2197	* overhead of write protection is not incurred until the
2198	* guest invalidates the TLB mapping. This allows multiple
2199	* SPs for a single gfn to be unsync.
2200	*
2201	* If the sync fails, the page is zapped. If so, break
2202	* in order to rebuild it.
2203	*/
2204	ret = kvm_sync_page(vcpu, sp, invalid_list: &invalid_list);
2205	if (ret < `0`)
2206	break;
2207
2208	WARN_ON_ONCE(!list_empty(&invalid_list));
2209	if (ret > `0`)
2210	kvm_flush_remote_tlbs(kvm);
2211	}
2212
2213	__clear_sp_write_flooding_count(sp);
2214
2215	goto out;
2216	}
2217
2218	sp = NULL;
2219	++kvm->stat.mmu_cache_miss;
2220
2221	out:
2222	kvm_mmu_commit_zap_page(kvm, invalid_list: &invalid_list);
2223
2224	if (collisions > kvm->stat.max_mmu_page_hash_collisions)
2225	kvm->stat.max_mmu_page_hash_collisions = collisions;
2226	return sp;
2227	}
2228
/* Caches used when allocating a new shadow page. */
struct shadow_page_caches {
	/* Source of the struct kvm_mmu_page itself. */
	struct kvm_mmu_memory_cache *page_header_cache;
	/* Source of the page table page (sp->spt). */
	struct kvm_mmu_memory_cache *shadow_page_cache;
	/* Source of sp->shadowed_translation, used for non-direct pages only. */
	struct kvm_mmu_memory_cache *shadowed_info_cache;
};
2235
2236	static struct kvm_mmu_page kvm_mmu_alloc_shadow_page(struct* kvm *kvm,
2237	struct shadow_page_caches *caches,
2238	gfn_t gfn,
2239	struct hlist_head *sp_list,
2240	union kvm_mmu_page_role role)
2241	{
2242	struct kvm_mmu_page *sp;
2243
2244	sp = kvm_mmu_memory_cache_alloc(mc: caches->page_header_cache);
2245	sp->spt = kvm_mmu_memory_cache_alloc(mc: caches->shadow_page_cache);
2246	if (!role.direct)
2247	sp->shadowed_translation = kvm_mmu_memory_cache_alloc(mc: caches->shadowed_info_cache);
2248
2249	set_page_private(virt_to_page(sp->spt), private: (unsigned long)sp);
2250
2251	INIT_LIST_HEAD(list: &sp->possible_nx_huge_page_link);
2252
2253	/*
2254	* active_mmu_pages must be a FIFO list, as kvm_zap_obsolete_pages()
2255	* depends on valid pages being added to the head of the list. See
2256	* comments in kvm_zap_obsolete_pages().
2257	*/
2258	sp->mmu_valid_gen = kvm->arch.mmu_valid_gen;
2259	list_add(new: &sp->link, head: &kvm->arch.active_mmu_pages);
2260	kvm_account_mmu_page(kvm, sp);
2261
2262	sp->gfn = gfn;
2263	sp->role = role;
2264	hlist_add_head(n: &sp->hash_link, h: sp_list);
2265	if (sp_has_gptes(sp))
2266	account_shadowed(kvm, sp);
2267
2268	return sp;
2269	}
2270
2271	/ Note, @vcpu may be NULL if @role.direct is true; see kvm_mmu_find_shadow_page. /
2272	static struct kvm_mmu_page __kvm_mmu_get_shadow_page(struct* kvm *kvm,
2273	struct kvm_vcpu *vcpu,
2274	struct shadow_page_caches *caches,
2275	gfn_t gfn,
2276	union kvm_mmu_page_role role)
2277	{
2278	struct hlist_head *sp_list;
2279	struct kvm_mmu_page *sp;
2280	bool created = false;
2281
2282	sp_list = &kvm->arch.mmu_page_hash[kvm_page_table_hashfn(gfn)];
2283
2284	sp = kvm_mmu_find_shadow_page(kvm, vcpu, gfn, sp_list, role);
2285	if (!sp) {
2286	created = true;
2287	sp = kvm_mmu_alloc_shadow_page(kvm, caches, gfn, sp_list, role);
2288	}
2289
2290	trace_kvm_mmu_get_page(sp, created);
2291	return sp;
2292	}
2293
2294	static struct kvm_mmu_page kvm_mmu_get_shadow_page(struct* kvm_vcpu *vcpu,
2295	gfn_t gfn,
2296	union kvm_mmu_page_role role)
2297	{
2298	struct shadow_page_caches caches = {
2299	.page_header_cache = &vcpu->arch.mmu_page_header_cache,
2300	.shadow_page_cache = &vcpu->arch.mmu_shadow_page_cache,
2301	.shadowed_info_cache = &vcpu->arch.mmu_shadowed_info_cache,
2302	};
2303
2304	return __kvm_mmu_get_shadow_page(kvm: vcpu->kvm, vcpu, caches: &caches, gfn, role);
2305	}
2306
2307	static union kvm_mmu_page_role kvm_mmu_child_role(u64 *sptep, bool direct,
2308	unsigned int access)
2309	{
2310	struct kvm_mmu_page *parent_sp = sptep_to_sp(sptep);
2311	union kvm_mmu_page_role role;
2312
2313	role = parent_sp->role;
2314	role.level--;
2315	role.access = access;
2316	role.direct = direct;
2317	role.passthrough = `0`;
2318
2319	/*
2320	* If the guest has 4-byte PTEs then that means it's using 32-bit,
2321	* 2-level, non-PAE paging. KVM shadows such guests with PAE paging
2322	* (i.e. 8-byte PTEs). The difference in PTE size means that KVM must
2323	* shadow each guest page table with multiple shadow page tables, which
2324	* requires extra bookkeeping in the role.
2325	*
2326	* Specifically, to shadow the guest's page directory (which covers a
2327	* 4GiB address space), KVM uses 4 PAE page directories, each mapping
2328	* 1GiB of the address space. @role.quadrant encodes which quarter of
2329	* the address space each maps.
2330	*
2331	* To shadow the guest's page tables (which each map a 4MiB region), KVM
2332	* uses 2 PAE page tables, each mapping a 2MiB region. For these,
2333	* @role.quadrant encodes which half of the region they map.
2334	*
2335	* Concretely, a 4-byte PDE consumes bits 31:22, while an 8-byte PDE
2336	* consumes bits 29:21. To consume bits 31:30, KVM's uses 4 shadow
2337	* PDPTEs; those 4 PAE page directories are pre-allocated and their
2338	* quadrant is assigned in mmu_alloc_root(). A 4-byte PTE consumes
2339	* bits 21:12, while an 8-byte PTE consumes bits 20:12. To consume
2340	* bit 21 in the PTE (the child here), KVM propagates that bit to the
2341	* quadrant, i.e. sets quadrant to '0' or '1'. The parent 8-byte PDE
2342	* covers bit 21 (see above), thus the quadrant is calculated from the
2343	* _least_ significant bit of the PDE index.
2344	*/
2345	if (role.has_4_byte_gpte) {
2346	WARN_ON_ONCE(role.level != PG_LEVEL_4K);
2347	role.quadrant = spte_index(sptep) & `1`;
2348	}
2349
2350	return role;
2351	}
2352
2353	static struct kvm_mmu_page kvm_mmu_get_child_sp(struct* kvm_vcpu *vcpu,
2354	u64 *sptep, gfn_t gfn,
2355	bool direct, unsigned int access)
2356	{
2357	union kvm_mmu_page_role role;
2358
2359	if (is_shadow_present_pte(pte: sptep) && !is_large_pte(pte: sptep))
2360	return ERR_PTR(error: -EEXIST);
2361
2362	role = kvm_mmu_child_role(sptep, direct, access);
2363	return kvm_mmu_get_shadow_page(vcpu, gfn, role);
2364	}
2365
2366	static void shadow_walk_init_using_root(struct kvm_shadow_walk_iterator *iterator,
2367	struct kvm_vcpu *vcpu, hpa_t root,
2368	u64 addr)
2369	{
2370	iterator->addr = addr;
2371	iterator->shadow_addr = root;
2372	iterator->level = vcpu->arch.mmu->root_role.level;
2373
2374	if (iterator->level >= PT64_ROOT_4LEVEL &&
2375	vcpu->arch.mmu->cpu_role.base.level < PT64_ROOT_4LEVEL &&
2376	!vcpu->arch.mmu->root_role.direct)
2377	iterator->level = PT32E_ROOT_LEVEL;
2378
2379	if (iterator->level == PT32E_ROOT_LEVEL) {
2380	/*
2381	* prev_root is currently only used for 64-bit hosts. So only
2382	* the active root_hpa is valid here.
2383	*/
2384	BUG_ON(root != vcpu->arch.mmu->root.hpa);
2385
2386	iterator->shadow_addr
2387	= vcpu->arch.mmu->pae_root[(addr >> `30`) & `3`];
2388	iterator->shadow_addr &= SPTE_BASE_ADDR_MASK;
2389	--iterator->level;
2390	if (!iterator->shadow_addr)
2391	iterator->level = `0`;
2392	}
2393	}
2394
2395	static void shadow_walk_init(struct kvm_shadow_walk_iterator *iterator,
2396	struct kvm_vcpu *vcpu, u64 addr)
2397	{
2398	shadow_walk_init_using_root(iterator, vcpu, root: vcpu->arch.mmu->root.hpa,
2399	addr);
2400	}
2401
2402	static bool shadow_walk_okay(struct kvm_shadow_walk_iterator *iterator)
2403	{
2404	if (iterator->level < PG_LEVEL_4K)
2405	return false;
2406
2407	iterator->index = SPTE_INDEX(iterator->addr, iterator->level);
2408	iterator->sptep = ((u64 *)__va(iterator->shadow_addr)) + iterator->index;
2409	return true;
2410	}
2411
2412	static void __shadow_walk_next(struct kvm_shadow_walk_iterator *iterator,
2413	u64 spte)
2414	{
2415	if (!is_shadow_present_pte(pte: spte) \|\| is_last_spte(pte: spte, level: iterator->level)) {
2416	iterator->level = `0`;
2417	return;
2418	}
2419
2420	iterator->shadow_addr = spte & SPTE_BASE_ADDR_MASK;
2421	--iterator->level;
2422	}
2423
2424	static void shadow_walk_next(struct kvm_shadow_walk_iterator *iterator)
2425	{
2426	__shadow_walk_next(iterator, spte: *iterator->sptep);
2427	}
2428
2429	static void __link_shadow_page(struct kvm *kvm,
2430	struct kvm_mmu_memory_cache cache, u64 sptep,
2431	struct kvm_mmu_page *sp, bool flush)
2432	{
2433	u64 spte;
2434
2435	BUILD_BUG_ON(VMX_EPT_WRITABLE_MASK != PT_WRITABLE_MASK);
2436
2437	/*
2438	* If an SPTE is present already, it must be a leaf and therefore
2439	* a large one. Drop it, and flush the TLB if needed, before
2440	* installing sp.
2441	*/
2442	if (is_shadow_present_pte(pte: *sptep))
2443	drop_large_spte(kvm, sptep, flush);
2444
2445	spte = make_nonleaf_spte(child_pt: sp->spt, ad_disabled: sp_ad_disabled(sp));
2446
2447	mmu_spte_set(sptep, new_spte: spte);
2448
2449	mmu_page_add_parent_pte(cache, sp, parent_pte: sptep);
2450
2451	/*
2452	* The non-direct sub-pagetable must be updated before linking. For
2453	* L1 sp, the pagetable is updated via kvm_sync_page() in
2454	* kvm_mmu_find_shadow_page() without write-protecting the gfn,
2455	* so sp->unsync can be true or false. For higher level non-direct
2456	* sp, the pagetable is updated/synced via mmu_sync_children() in
2457	* FNAME(fetch)(), so sp->unsync_children can only be false.
2458	* WARN_ON_ONCE() if anything happens unexpectedly.
2459	*/
2460	if (WARN_ON_ONCE(sp->unsync_children) \|\| sp->unsync)
2461	mark_unsync(spte: sptep);
2462	}
2463
2464	static void link_shadow_page(struct kvm_vcpu vcpu, u64 sptep,
2465	struct kvm_mmu_page *sp)
2466	{
2467	__link_shadow_page(kvm: vcpu->kvm, cache: &vcpu->arch.mmu_pte_list_desc_cache, sptep, sp, flush: true);
2468	}
2469
2470	static void validate_direct_spte(struct kvm_vcpu vcpu, u64 sptep,
2471	unsigned direct_access)
2472	{
2473	if (is_shadow_present_pte(pte: sptep) && !is_large_pte(pte: sptep)) {
2474	struct kvm_mmu_page *child;
2475
2476	/*
2477	* For the direct sp, if the guest pte's dirty bit
2478	* changed form clean to dirty, it will corrupt the
2479	* sp's access: allow writable in the read-only sp,
2480	* so we should update the spte at this point to get
2481	* a new sp with the correct access.
2482	*/
2483	child = spte_to_child_sp(spte: *sptep);
2484	if (child->role.access == direct_access)
2485	return;
2486
2487	drop_parent_pte(kvm: vcpu->kvm, sp: child, parent_pte: sptep);
2488	kvm_flush_remote_tlbs_sptep(kvm: vcpu->kvm, sptep);
2489	}
2490	}
2491
2492	/ Returns the number of zapped non-leaf child shadow pages. /
2493	static int mmu_page_zap_pte(struct kvm kvm, struct* kvm_mmu_page *sp,
2494	u64 spte, struct* list_head *invalid_list)
2495	{
2496	u64 pte;
2497	struct kvm_mmu_page *child;
2498
2499	pte = *spte;
2500	if (is_shadow_present_pte(pte)) {
2501	if (is_last_spte(pte, level: sp->role.level)) {
2502	drop_spte(kvm, sptep: spte);
2503	} else {
2504	child = spte_to_child_sp(spte: pte);
2505	drop_parent_pte(kvm, sp: child, parent_pte: spte);
2506
2507	/*
2508	* Recursively zap nested TDP SPs, parentless SPs are
2509	* unlikely to be used again in the near future. This
2510	* avoids retaining a large number of stale nested SPs.
2511	*/
2512	if (tdp_enabled && invalid_list &&
2513	child->role.guest_mode && !child->parent_ptes.val)
2514	return kvm_mmu_prepare_zap_page(kvm, sp: child,
2515	invalid_list);
2516	}
2517	} else if (is_mmio_spte(spte: pte)) {
2518	mmu_spte_clear_no_track(sptep: spte);
2519	}
2520	return `0`;
2521	}
2522
2523	static int kvm_mmu_page_unlink_children(struct kvm *kvm,
2524	struct kvm_mmu_page *sp,
2525	struct list_head *invalid_list)
2526	{
2527	int zapped = `0`;
2528	unsigned i;
2529
2530	for (i = `0`; i < SPTE_ENT_PER_PAGE; ++i)
2531	zapped += mmu_page_zap_pte(kvm, sp, spte: sp->spt + i, invalid_list);
2532
2533	return zapped;
2534	}
2535
2536	static void kvm_mmu_unlink_parents(struct kvm kvm, struct* kvm_mmu_page *sp)
2537	{
2538	u64 *sptep;
2539	struct rmap_iterator iter;
2540
2541	while ((sptep = rmap_get_first(rmap_head: &sp->parent_ptes, iter: &iter)))
2542	drop_parent_pte(kvm, sp, parent_pte: sptep);
2543	}
2544
2545	static int mmu_zap_unsync_children(struct kvm *kvm,
2546	struct kvm_mmu_page *parent,
2547	struct list_head *invalid_list)
2548	{
2549	int i, zapped = `0`;
2550	struct mmu_page_path parents;
2551	struct kvm_mmu_pages pages;
2552
2553	if (parent->role.level == PG_LEVEL_4K)
2554	return `0`;
2555
2556	while (mmu_unsync_walk(sp: parent, pvec: &pages)) {
2557	struct kvm_mmu_page *sp;
2558
2559	for_each_sp(pages, sp, parents, i) {
2560	kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
2561	mmu_pages_clear_parents(parents: &parents);
2562	zapped++;
2563	}
2564	}
2565
2566	return zapped;
2567	}
2568
2569	static bool __kvm_mmu_prepare_zap_page(struct kvm *kvm,
2570	struct kvm_mmu_page *sp,
2571	struct list_head *invalid_list,
2572	int *nr_zapped)
2573	{
2574	bool list_unstable, zapped_root = false;
2575
2576	lockdep_assert_held_write(&kvm->mmu_lock);
2577	trace_kvm_mmu_prepare_zap_page(sp);
2578	++kvm->stat.mmu_shadow_zapped;
2579	*nr_zapped = mmu_zap_unsync_children(kvm, parent: sp, invalid_list);
2580	*nr_zapped += kvm_mmu_page_unlink_children(kvm, sp, invalid_list);
2581	kvm_mmu_unlink_parents(kvm, sp);
2582
2583	/ Zapping children means active_mmu_pages has become unstable. /
2584	list_unstable = *nr_zapped;
2585
2586	if (!sp->role.invalid && sp_has_gptes(sp))
2587	unaccount_shadowed(kvm, sp);
2588
2589	if (sp->unsync)
2590	kvm_unlink_unsync_page(kvm, sp);
2591	if (!sp->root_count) {
2592	/ Count self /
2593	(*nr_zapped)++;
2594
2595	/*
2596	* Already invalid pages (previously active roots) are not on
2597	* the active page list. See list_del() in the "else" case of
2598	* !sp->root_count.
2599	*/
2600	if (sp->role.invalid)
2601	list_add(new: &sp->link, head: invalid_list);
2602	else
2603	list_move(list: &sp->link, head: invalid_list);
2604	kvm_unaccount_mmu_page(kvm, sp);
2605	} else {
2606	/*
2607	* Remove the active root from the active page list, the root
2608	* will be explicitly freed when the root_count hits zero.
2609	*/
2610	list_del(entry: &sp->link);
2611
2612	/*
2613	* Obsolete pages cannot be used on any vCPUs, see the comment
2614	* in kvm_mmu_zap_all_fast(). Note, is_obsolete_sp() also
2615	* treats invalid shadow pages as being obsolete.
2616	*/
2617	zapped_root = !is_obsolete_sp(kvm, sp);
2618	}
2619
2620	if (sp->nx_huge_page_disallowed)
2621	unaccount_nx_huge_page(kvm, sp);
2622
2623	sp->role.invalid = `1`;
2624
2625	/*
2626	* Make the request to free obsolete roots after marking the root
2627	* invalid, otherwise other vCPUs may not see it as invalid.
2628	*/
2629	if (zapped_root)
2630	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
2631	return list_unstable;
2632	}
2633
2634	static bool kvm_mmu_prepare_zap_page(struct kvm kvm, struct* kvm_mmu_page *sp,
2635	struct list_head *invalid_list)
2636	{
2637	int nr_zapped;
2638
2639	__kvm_mmu_prepare_zap_page(kvm, sp, invalid_list, nr_zapped: &nr_zapped);
2640	return nr_zapped;
2641	}
2642
2643	static void kvm_mmu_commit_zap_page(struct kvm *kvm,
2644	struct list_head *invalid_list)
2645	{
2646	struct kvm_mmu_page sp, nsp;
2647
2648	if (list_empty(head: invalid_list))
2649	return;
2650
2651	/*
2652	* We need to make sure everyone sees our modifications to
2653	* the page tables and see changes to vcpu->mode here. The barrier
2654	* in the kvm_flush_remote_tlbs() achieves this. This pairs
2655	* with vcpu_enter_guest and walk_shadow_page_lockless_begin/end.
2656	*
2657	* In addition, kvm_flush_remote_tlbs waits for all vcpus to exit
2658	* guest mode and/or lockless shadow page table walks.
2659	*/
2660	kvm_flush_remote_tlbs(kvm);
2661
2662	list_for_each_entry_safe(sp, nsp, invalid_list, link) {
2663	WARN_ON_ONCE(!sp->role.invalid \|\| sp->root_count);
2664	kvm_mmu_free_shadow_page(sp);
2665	}
2666	}
2667
2668	static unsigned long kvm_mmu_zap_oldest_mmu_pages(struct kvm *kvm,
2669	unsigned long nr_to_zap)
2670	{
2671	unsigned long total_zapped = `0`;
2672	struct kvm_mmu_page sp, tmp;
2673	LIST_HEAD(invalid_list);
2674	bool unstable;
2675	int nr_zapped;
2676
2677	if (list_empty(head: &kvm->arch.active_mmu_pages))
2678	return `0`;
2679
2680	restart:
2681	list_for_each_entry_safe_reverse(sp, tmp, &kvm->arch.active_mmu_pages, link) {
2682	/*
2683	* Don't zap active root pages, the page itself can't be freed
2684	* and zapping it will just force vCPUs to realloc and reload.
2685	*/
2686	if (sp->root_count)
2687	continue;
2688
2689	unstable = __kvm_mmu_prepare_zap_page(kvm, sp, invalid_list: &invalid_list,
2690	nr_zapped: &nr_zapped);
2691	total_zapped += nr_zapped;
2692	if (total_zapped >= nr_to_zap)
2693	break;
2694
2695	if (unstable)
2696	goto restart;
2697	}
2698
2699	kvm_mmu_commit_zap_page(kvm, invalid_list: &invalid_list);
2700
2701	kvm->stat.mmu_recycled += total_zapped;
2702	return total_zapped;
2703	}
2704
2705	static inline unsigned long kvm_mmu_available_pages(struct kvm *kvm)
2706	{
2707	if (kvm->arch.n_max_mmu_pages > kvm->arch.n_used_mmu_pages)
2708	return kvm->arch.n_max_mmu_pages -
2709	kvm->arch.n_used_mmu_pages;
2710
2711	return `0`;
2712	}
2713
2714	static int make_mmu_pages_available(struct kvm_vcpu *vcpu)
2715	{
2716	unsigned long avail = kvm_mmu_available_pages(kvm: vcpu->kvm);
2717
2718	if (likely(avail >= KVM_MIN_FREE_MMU_PAGES))
2719	return `0`;
2720
2721	kvm_mmu_zap_oldest_mmu_pages(kvm: vcpu->kvm, KVM_REFILL_PAGES - avail);
2722
2723	/*
2724	* Note, this check is intentionally soft, it only guarantees that one
2725	* page is available, while the caller may end up allocating as many as
2726	* four pages, e.g. for PAE roots or for 5-level paging. Temporarily
2727	* exceeding the (arbitrary by default) limit will not harm the host,
2728	* being too aggressive may unnecessarily kill the guest, and getting an
2729	* exact count is far more trouble than it's worth, especially in the
2730	* page fault paths.
2731	*/
2732	if (!kvm_mmu_available_pages(kvm: vcpu->kvm))
2733	return -ENOSPC;
2734	return `0`;
2735	}
2736
2737	/*
2738	* Changing the number of mmu pages allocated to the vm
2739	* Note: if goal_nr_mmu_pages is too small, you will get dead lock
2740	*/
2741	void kvm_mmu_change_mmu_pages(struct kvm kvm, unsigned* long goal_nr_mmu_pages)
2742	{
2743	write_lock(&kvm->mmu_lock);
2744
2745	if (kvm->arch.n_used_mmu_pages > goal_nr_mmu_pages) {
2746	kvm_mmu_zap_oldest_mmu_pages(kvm, nr_to_zap: kvm->arch.n_used_mmu_pages -
2747	goal_nr_mmu_pages);
2748
2749	goal_nr_mmu_pages = kvm->arch.n_used_mmu_pages;
2750	}
2751
2752	kvm->arch.n_max_mmu_pages = goal_nr_mmu_pages;
2753
2754	write_unlock(&kvm->mmu_lock);
2755	}
2756
2757	int kvm_mmu_unprotect_page(struct kvm *kvm, gfn_t gfn)
2758	{
2759	struct kvm_mmu_page *sp;
2760	LIST_HEAD(invalid_list);
2761	int r;
2762
2763	r = `0`;
2764	write_lock(&kvm->mmu_lock);
2765	for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
2766	r = `1`;
2767	kvm_mmu_prepare_zap_page(kvm, sp, invalid_list: &invalid_list);
2768	}
2769	kvm_mmu_commit_zap_page(kvm, invalid_list: &invalid_list);
2770	write_unlock(&kvm->mmu_lock);
2771
2772	return r;
2773	}
2774
2775	static int kvm_mmu_unprotect_page_virt(struct kvm_vcpu *vcpu, gva_t gva)
2776	{
2777	gpa_t gpa;
2778	int r;
2779
2780	if (vcpu->arch.mmu->root_role.direct)
2781	return `0`;
2782
2783	gpa = kvm_mmu_gva_to_gpa_read(vcpu, gva, NULL);
2784
2785	r = kvm_mmu_unprotect_page(kvm: vcpu->kvm, gfn: gpa >> PAGE_SHIFT);
2786
2787	return r;
2788	}
2789
2790	static void kvm_unsync_page(struct kvm kvm, struct* kvm_mmu_page *sp)
2791	{
2792	trace_kvm_mmu_unsync_page(sp);
2793	++kvm->stat.mmu_unsync;
2794	sp->unsync = `1`;
2795
2796	kvm_mmu_mark_parents_unsync(sp);
2797	}
2798
2799	/*
2800	* Attempt to unsync any shadow pages that can be reached by the specified gfn,
2801	* KVM is creating a writable mapping for said gfn. Returns 0 if all pages
2802	* were marked unsync (or if there is no shadow page), -EPERM if the SPTE must
2803	* be write-protected.
2804	*/
2805	int mmu_try_to_unsync_pages(struct kvm kvm, const* struct kvm_memory_slot *slot,
2806	gfn_t gfn, bool can_unsync, bool prefetch)
2807	{
2808	struct kvm_mmu_page *sp;
2809	bool locked = false;
2810
2811	/*
2812	* Force write-protection if the page is being tracked. Note, the page
2813	* track machinery is used to write-protect upper-level shadow pages,
2814	* i.e. this guards the role.level == 4K assertion below!
2815	*/
2816	if (kvm_gfn_is_write_tracked(kvm, slot, gfn))
2817	return -EPERM;
2818
2819	/*
2820	* The page is not write-tracked, mark existing shadow pages unsync
2821	* unless KVM is synchronizing an unsync SP (can_unsync = false). In
2822	* that case, KVM must complete emulation of the guest TLB flush before
2823	* allowing shadow pages to become unsync (writable by the guest).
2824	*/
2825	for_each_gfn_valid_sp_with_gptes(kvm, sp, gfn) {
2826	if (!can_unsync)
2827	return -EPERM;
2828
2829	if (sp->unsync)
2830	continue;
2831
2832	if (prefetch)
2833	return -EEXIST;
2834
2835	/*
2836	* TDP MMU page faults require an additional spinlock as they
2837	* run with mmu_lock held for read, not write, and the unsync
2838	* logic is not thread safe. Take the spinklock regardless of
2839	* the MMU type to avoid extra conditionals/parameters, there's
2840	* no meaningful penalty if mmu_lock is held for write.
2841	*/
2842	if (!locked) {
2843	locked = true;
2844	spin_lock(lock: &kvm->arch.mmu_unsync_pages_lock);
2845
2846	/*
2847	* Recheck after taking the spinlock, a different vCPU
2848	* may have since marked the page unsync. A false
2849	* negative on the unprotected check above is not
2850	* possible as clearing sp->unsync _must_ hold mmu_lock
2851	* for write, i.e. unsync cannot transition from 1->0
2852	* while this CPU holds mmu_lock for read (or write).
2853	*/
2854	if (READ_ONCE(sp->unsync))
2855	continue;
2856	}
2857
2858	WARN_ON_ONCE(sp->role.level != PG_LEVEL_4K);
2859	kvm_unsync_page(kvm, sp);
2860	}
2861	if (locked)
2862	spin_unlock(lock: &kvm->arch.mmu_unsync_pages_lock);
2863
2864	/*
2865	* We need to ensure that the marking of unsync pages is visible
2866	* before the SPTE is updated to allow writes because
2867	* kvm_mmu_sync_roots() checks the unsync flags without holding
2868	* the MMU lock and so can race with this. If the SPTE was updated
2869	* before the page had been marked as unsync-ed, something like the
2870	* following could happen:
2871	*
2872	* CPU 1 CPU 2
2873	* ---------------------------------------------------------------------
2874	* 1.2 Host updates SPTE
2875	* to be writable
2876	* 2.1 Guest writes a GPTE for GVA X.
2877	* (GPTE being in the guest page table shadowed
2878	* by the SP from CPU 1.)
2879	* This reads SPTE during the page table walk.
2880	* Since SPTE.W is read as 1, there is no
2881	* fault.
2882	*
2883	* 2.2 Guest issues TLB flush.
2884	* That causes a VM Exit.
2885	*
2886	* 2.3 Walking of unsync pages sees sp->unsync is
2887	* false and skips the page.
2888	*
2889	* 2.4 Guest accesses GVA X.
2890	* Since the mapping in the SP was not updated,
2891	* so the old mapping for GVA X incorrectly
2892	* gets used.
2893	* 1.1 Host marks SP
2894	* as unsync
2895	* (sp->unsync = true)
2896	*
2897	* The write barrier below ensures that 1.1 happens before 1.2 and thus
2898	* the situation in 2.4 does not arise. It pairs with the read barrier
2899	* in is_unsync_root(), placed between 2.1's load of SPTE.W and 2.3.
2900	*/
2901	smp_wmb();
2902
2903	return `0`;
2904	}
2905
2906	static int mmu_set_spte(struct kvm_vcpu vcpu, struct* kvm_memory_slot *slot,
2907	u64 sptep, unsigned* int pte_access, gfn_t gfn,
2908	kvm_pfn_t pfn, struct kvm_page_fault *fault)
2909	{
2910	struct kvm_mmu_page *sp = sptep_to_sp(sptep);
2911	int level = sp->role.level;
2912	int was_rmapped = `0`;
2913	int ret = RET_PF_FIXED;
2914	bool flush = false;
2915	bool wrprot;
2916	u64 spte;
2917
2918	/ Prefetching always gets a writable pfn. /
2919	bool host_writable = !fault \|\| fault->map_writable;
2920	bool prefetch = !fault \|\| fault->prefetch;
2921	bool write_fault = fault && fault->write;
2922
2923	if (unlikely(is_noslot_pfn(pfn))) {
2924	vcpu->stat.pf_mmio_spte_created++;
2925	mark_mmio_spte(vcpu, sptep, gfn, access: pte_access);
2926	return RET_PF_EMULATE;
2927	}
2928
2929	if (is_shadow_present_pte(pte: *sptep)) {
2930	/*
2931	* If we overwrite a PTE page pointer with a 2MB PMD, unlink
2932	* the parent of the now unreachable PTE.
2933	*/
2934	if (level > PG_LEVEL_4K && !is_large_pte(pte: *sptep)) {
2935	struct kvm_mmu_page *child;
2936	u64 pte = *sptep;
2937
2938	child = spte_to_child_sp(spte: pte);
2939	drop_parent_pte(kvm: vcpu->kvm, sp: child, parent_pte: sptep);
2940	flush = true;
2941	} else if (pfn != spte_to_pfn(pte: *sptep)) {
2942	drop_spte(kvm: vcpu->kvm, sptep);
2943	flush = true;
2944	} else
2945	was_rmapped = `1`;
2946	}
2947
2948	wrprot = make_spte(vcpu, sp, slot, pte_access, gfn, pfn, old_spte: *sptep, prefetch,
2949	can_unsync: true, host_writable, new_spte: &spte);
2950
2951	if (*sptep == spte) {
2952	ret = RET_PF_SPURIOUS;
2953	} else {
2954	flush \|= mmu_spte_update(sptep, new_spte: spte);
2955	trace_kvm_mmu_set_spte(level, gfn, sptep);
2956	}
2957
2958	if (wrprot) {
2959	if (write_fault)
2960	ret = RET_PF_EMULATE;
2961	}
2962
2963	if (flush)
2964	kvm_flush_remote_tlbs_gfn(kvm: vcpu->kvm, gfn, level);
2965
2966	if (!was_rmapped) {
2967	WARN_ON_ONCE(ret == RET_PF_SPURIOUS);
2968	rmap_add(vcpu, slot, spte: sptep, gfn, access: pte_access);
2969	} else {
2970	/ Already rmapped but the pte_access bits may have changed. /
2971	kvm_mmu_page_set_access(sp, index: spte_index(sptep), access: pte_access);
2972	}
2973
2974	return ret;
2975	}
2976
2977	static int direct_pte_prefetch_many(struct kvm_vcpu *vcpu,
2978	struct kvm_mmu_page *sp,
2979	u64 start, u64 end)
2980	{
2981	struct page *pages[PTE_PREFETCH_NUM];
2982	struct kvm_memory_slot *slot;
2983	unsigned int access = sp->role.access;
2984	int i, ret;
2985	gfn_t gfn;
2986
2987	gfn = kvm_mmu_page_get_gfn(sp, index: spte_index(sptep: start));
2988	slot = gfn_to_memslot_dirty_bitmap(vcpu, gfn, no_dirty_log: access & ACC_WRITE_MASK);
2989	if (!slot)
2990	return -`1`;
2991
2992	ret = gfn_to_page_many_atomic(slot, gfn, pages, nr_pages: end - start);
2993	if (ret <= `0`)
2994	return -`1`;
2995
2996	for (i = `0`; i < ret; i++, gfn++, start++) {
2997	mmu_set_spte(vcpu, slot, sptep: start, pte_access: access, gfn,
2998	page_to_pfn(pages[i]), NULL);
2999	put_page(page: pages[i]);
3000	}
3001
3002	return `0`;
3003	}
3004
3005	static void __direct_pte_prefetch(struct kvm_vcpu *vcpu,
3006	struct kvm_mmu_page sp, u64 sptep)
3007	{
3008	u64 spte, start = NULL;
3009	int i;
3010
3011	WARN_ON_ONCE(!sp->role.direct);
3012
3013	i = spte_index(sptep) & ~(PTE_PREFETCH_NUM - `1`);
3014	spte = sp->spt + i;
3015
3016	for (i = `0`; i < PTE_PREFETCH_NUM; i++, spte++) {
3017	if (is_shadow_present_pte(pte: *spte) \|\| spte == sptep) {
3018	if (!start)
3019	continue;
3020	if (direct_pte_prefetch_many(vcpu, sp, start, end: spte) < `0`)
3021	return;
3022	start = NULL;
3023	} else if (!start)
3024	start = spte;
3025	}
3026	if (start)
3027	direct_pte_prefetch_many(vcpu, sp, start, end: spte);
3028	}
3029
3030	static void direct_pte_prefetch(struct kvm_vcpu vcpu, u64 sptep)
3031	{
3032	struct kvm_mmu_page *sp;
3033
3034	sp = sptep_to_sp(sptep);
3035
3036	/*
3037	* Without accessed bits, there's no way to distinguish between
3038	* actually accessed translations and prefetched, so disable pte
3039	* prefetch if accessed bits aren't available.
3040	*/
3041	if (sp_ad_disabled(sp))
3042	return;
3043
3044	if (sp->role.level > PG_LEVEL_4K)
3045	return;
3046
3047	/*
3048	* If addresses are being invalidated, skip prefetching to avoid
3049	* accidentally prefetching those addresses.
3050	*/
3051	if (unlikely(vcpu->kvm->mmu_invalidate_in_progress))
3052	return;
3053
3054	__direct_pte_prefetch(vcpu, sp, sptep);
3055	}
3056
3057	/*
3058	* Lookup the mapping level for @gfn in the current mm.
3059	*
3060	* WARNING! Use of host_pfn_mapping_level() requires the caller and the end
3061	* consumer to be tied into KVM's handlers for MMU notifier events!
3062	*
3063	* There are several ways to safely use this helper:
3064	*
3065	* - Check mmu_invalidate_retry_gfn() after grabbing the mapping level, before
3066	* consuming it. In this case, mmu_lock doesn't need to be held during the
3067	* lookup, but it does need to be held while checking the MMU notifier.
3068	*
3069	* - Hold mmu_lock AND ensure there is no in-progress MMU notifier invalidation
3070	* event for the hva. This can be done by explicit checking the MMU notifier
3071	* or by ensuring that KVM already has a valid mapping that covers the hva.
3072	*
3073	* - Do not use the result to install new mappings, e.g. use the host mapping
3074	* level only to decide whether or not to zap an entry. In this case, it's
3075	* not required to hold mmu_lock (though it's highly likely the caller will
3076	* want to hold mmu_lock anyways, e.g. to modify SPTEs).
3077	*
3078	* Note! The lookup can still race with modifications to host page tables, but
3079	* the above "rules" ensure KVM will not _consume_ the result of the walk if a
3080	* race with the primary MMU occurs.
3081	*/
3082	static int host_pfn_mapping_level(struct kvm *kvm, gfn_t gfn,
3083	const struct kvm_memory_slot *slot)
3084	{
3085	int level = PG_LEVEL_4K;
3086	unsigned long hva;
3087	unsigned long flags;
3088	pgd_t pgd;
3089	p4d_t p4d;
3090	pud_t pud;
3091	pmd_t pmd;
3092
3093	/*
3094	* Note, using the already-retrieved memslot and __gfn_to_hva_memslot()
3095	* is not solely for performance, it's also necessary to avoid the
3096	* "writable" check in __gfn_to_hva_many(), which will always fail on
3097	* read-only memslots due to gfn_to_hva() assuming writes. Earlier
3098	* page fault steps have already verified the guest isn't writing a
3099	* read-only memslot.
3100	*/
3101	hva = __gfn_to_hva_memslot(slot, gfn);
3102
3103	/*
3104	* Disable IRQs to prevent concurrent tear down of host page tables,
3105	* e.g. if the primary MMU promotes a P*D to a huge page and then frees
3106	* the original page table.
3107	*/
3108	local_irq_save(flags);
3109
3110	/*
3111	* Read each entry once. As above, a non-leaf entry can be promoted to
3112	* a huge page _during_ this walk. Re-reading the entry could send the
3113	* walk into the weeks, e.g. p*d_leaf() returns false (sees the old
3114	* value) and then p*d_offset() walks into the target huge page instead
3115	* of the old page table (sees the new value).
3116	*/
3117	pgd = READ_ONCE(*pgd_offset(kvm->mm, hva));
3118	if (pgd_none(pgd))
3119	goto out;
3120
3121	p4d = READ_ONCE(*p4d_offset(&pgd, hva));
3122	if (p4d_none(p4d) \|\| !p4d_present(p4d))
3123	goto out;
3124
3125	pud = READ_ONCE(*pud_offset(&p4d, hva));
3126	if (pud_none(pud) \|\| !pud_present(pud))
3127	goto out;
3128
3129	if (pud_leaf(pud)) {
3130	level = PG_LEVEL_1G;
3131	goto out;
3132	}
3133
3134	pmd = READ_ONCE(*pmd_offset(&pud, hva));
3135	if (pmd_none(pmd) \|\| !pmd_present(pmd))
3136	goto out;
3137
3138	if (pmd_leaf(pte: pmd))
3139	level = PG_LEVEL_2M;
3140
3141	out:
3142	local_irq_restore(flags);
3143	return level;
3144	}
3145
3146	static int __kvm_mmu_max_mapping_level(struct kvm *kvm,
3147	const struct kvm_memory_slot *slot,
3148	gfn_t gfn, int max_level, bool is_private)
3149	{
3150	struct kvm_lpage_info *linfo;
3151	int host_level;
3152
3153	max_level = min(max_level, max_huge_page_level);
3154	for ( ; max_level > PG_LEVEL_4K; max_level--) {
3155	linfo = lpage_info_slot(gfn, slot, level: max_level);
3156	if (!linfo->disallow_lpage)
3157	break;
3158	}
3159
3160	if (is_private)
3161	return max_level;
3162
3163	if (max_level == PG_LEVEL_4K)
3164	return PG_LEVEL_4K;
3165
3166	host_level = host_pfn_mapping_level(kvm, gfn, slot);
3167	return min(host_level, max_level);
3168	}
3169
3170	int kvm_mmu_max_mapping_level(struct kvm *kvm,
3171	const struct kvm_memory_slot *slot, gfn_t gfn,
3172	int max_level)
3173	{
3174	bool is_private = kvm_slot_can_be_private(slot) &&
3175	kvm_mem_is_private(kvm, gfn);
3176
3177	return __kvm_mmu_max_mapping_level(kvm, slot, gfn, max_level, is_private);
3178	}
3179
3180	void kvm_mmu_hugepage_adjust(struct kvm_vcpu vcpu, struct* kvm_page_fault *fault)
3181	{
3182	struct kvm_memory_slot *slot = fault->slot;
3183	kvm_pfn_t mask;
3184
3185	fault->huge_page_disallowed = fault->exec && fault->nx_huge_page_workaround_enabled;
3186
3187	if (unlikely(fault->max_level == PG_LEVEL_4K))
3188	return;
3189
3190	if (is_error_noslot_pfn(pfn: fault->pfn))
3191	return;
3192
3193	if (kvm_slot_dirty_track_enabled(slot))
3194	return;
3195
3196	/*
3197	* Enforce the iTLB multihit workaround after capturing the requested
3198	* level, which will be used to do precise, accurate accounting.
3199	*/
3200	fault->req_level = __kvm_mmu_max_mapping_level(kvm: vcpu->kvm, slot,
3201	gfn: fault->gfn, max_level: fault->max_level,
3202	is_private: fault->is_private);
3203	if (fault->req_level == PG_LEVEL_4K \|\| fault->huge_page_disallowed)
3204	return;
3205
3206	/*
3207	* mmu_invalidate_retry() was successful and mmu_lock is held, so
3208	* the pmd can't be split from under us.
3209	*/
3210	fault->goal_level = fault->req_level;
3211	mask = KVM_PAGES_PER_HPAGE(fault->goal_level) - `1`;
3212	VM_BUG_ON((fault->gfn & mask) != (fault->pfn & mask));
3213	fault->pfn &= ~mask;
3214	}
3215
3216	void disallowed_hugepage_adjust(struct kvm_page_fault fault, u64 spte, int* cur_level)
3217	{
3218	if (cur_level > PG_LEVEL_4K &&
3219	cur_level == fault->goal_level &&
3220	is_shadow_present_pte(pte: spte) &&
3221	!is_large_pte(pte: spte) &&
3222	spte_to_child_sp(spte)->nx_huge_page_disallowed) {
3223	/*
3224	* A small SPTE exists for this pfn, but FNAME(fetch),
3225	* direct_map(), or kvm_tdp_mmu_map() would like to create a
3226	* large PTE instead: just force them to go down another level,
3227	* patching back for them into pfn the next 9 bits of the
3228	* address.
3229	*/
3230	u64 page_mask = KVM_PAGES_PER_HPAGE(cur_level) -
3231	KVM_PAGES_PER_HPAGE(cur_level - `1`);
3232	fault->pfn \|= fault->gfn & page_mask;
3233	fault->goal_level--;
3234	}
3235	}
3236
3237	static int direct_map(struct kvm_vcpu vcpu, struct* kvm_page_fault *fault)
3238	{
3239	struct kvm_shadow_walk_iterator it;
3240	struct kvm_mmu_page *sp;
3241	int ret;
3242	gfn_t base_gfn = fault->gfn;
3243
3244	kvm_mmu_hugepage_adjust(vcpu, fault);
3245
3246	trace_kvm_mmu_spte_requested(fault);
3247	for_each_shadow_entry(vcpu, fault->addr, it) {
3248	/*
3249	* We cannot overwrite existing page tables with an NX
3250	* large page, as the leaf could be executable.
3251	*/
3252	if (fault->nx_huge_page_workaround_enabled)
3253	disallowed_hugepage_adjust(fault, spte: *it.sptep, cur_level: it.level);
3254
3255	base_gfn = gfn_round_for_level(gfn: fault->gfn, level: it.level);
3256	if (it.level == fault->goal_level)
3257	break;
3258
3259	sp = kvm_mmu_get_child_sp(vcpu, it.sptep, base_gfn, true, ACC_ALL);
3260	if (sp == ERR_PTR(error: -EEXIST))
3261	continue;
3262
3263	link_shadow_page(vcpu, sptep: it.sptep, sp);
3264	if (fault->huge_page_disallowed)
3265	account_nx_huge_page(kvm: vcpu->kvm, sp,
3266	nx_huge_page_possible: fault->req_level >= it.level);
3267	}
3268
3269	if (WARN_ON_ONCE(it.level != fault->goal_level))
3270	return -EFAULT;
3271
3272	ret = mmu_set_spte(vcpu, fault->slot, it.sptep, ACC_ALL,
3273	base_gfn, fault->pfn, fault);
3274	if (ret == RET_PF_SPURIOUS)
3275	return ret;
3276
3277	direct_pte_prefetch(vcpu, sptep: it.sptep);
3278	return ret;
3279	}
3280
3281	static void kvm_send_hwpoison_signal(struct kvm_memory_slot *slot, gfn_t gfn)
3282	{
3283	unsigned long hva = gfn_to_hva_memslot(slot, gfn);
3284
3285	send_sig_mceerr(BUS_MCEERR_AR, (void __user *)hva, PAGE_SHIFT, current);
3286	}
3287
3288	static int kvm_handle_error_pfn(struct kvm_vcpu vcpu, struct* kvm_page_fault *fault)
3289	{
3290	if (is_sigpending_pfn(pfn: fault->pfn)) {
3291	kvm_handle_signal_exit(vcpu);
3292	return -EINTR;
3293	}
3294
3295	/*
3296	* Do not cache the mmio info caused by writing the readonly gfn
3297	* into the spte otherwise read access on readonly gfn also can
3298	* caused mmio page fault and treat it as mmio access.
3299	*/
3300	if (fault->pfn == KVM_PFN_ERR_RO_FAULT)
3301	return RET_PF_EMULATE;
3302
3303	if (fault->pfn == KVM_PFN_ERR_HWPOISON) {
3304	kvm_send_hwpoison_signal(slot: fault->slot, gfn: fault->gfn);
3305	return RET_PF_RETRY;
3306	}
3307
3308	return -EFAULT;
3309	}
3310
3311	static int kvm_handle_noslot_fault(struct kvm_vcpu *vcpu,
3312	struct kvm_page_fault *fault,
3313	unsigned int access)
3314	{
3315	gva_t gva = fault->is_tdp ? `0` : fault->addr;
3316
3317	vcpu_cache_mmio_info(vcpu, gva, fault->gfn,
3318	access & shadow_mmio_access_mask);
3319
3320	/*
3321	* If MMIO caching is disabled, emulate immediately without
3322	* touching the shadow page tables as attempting to install an
3323	* MMIO SPTE will just be an expensive nop.
3324	*/
3325	if (unlikely(!enable_mmio_caching))
3326	return RET_PF_EMULATE;
3327
3328	/*
3329	* Do not create an MMIO SPTE for a gfn greater than host.MAXPHYADDR,
3330	* any guest that generates such gfns is running nested and is being
3331	* tricked by L0 userspace (you can observe gfn > L1.MAXPHYADDR if and
3332	* only if L1's MAXPHYADDR is inaccurate with respect to the
3333	* hardware's).
3334	*/
3335	if (unlikely(fault->gfn > kvm_mmu_max_gfn()))
3336	return RET_PF_EMULATE;
3337
3338	return RET_PF_CONTINUE;
3339	}
3340
3341	static bool page_fault_can_be_fast(struct kvm_page_fault *fault)
3342	{
3343	/*
3344	* Page faults with reserved bits set, i.e. faults on MMIO SPTEs, only
3345	* reach the common page fault handler if the SPTE has an invalid MMIO
3346	* generation number. Refreshing the MMIO generation needs to go down
3347	* the slow path. Note, EPT Misconfigs do NOT set the PRESENT flag!
3348	*/
3349	if (fault->rsvd)
3350	return false;
3351
3352	/*
3353	* #PF can be fast if:
3354	*
3355	* 1. The shadow page table entry is not present and A/D bits are
3356	* disabled _by KVM_, which could mean that the fault is potentially
3357	* caused by access tracking (if enabled). If A/D bits are enabled
3358	* by KVM, but disabled by L1 for L2, KVM is forced to disable A/D
3359	* bits for L2 and employ access tracking, but the fast page fault
3360	* mechanism only supports direct MMUs.
3361	* 2. The shadow page table entry is present, the access is a write,
3362	* and no reserved bits are set (MMIO SPTEs cannot be "fixed"), i.e.
3363	* the fault was caused by a write-protection violation. If the
3364	* SPTE is MMU-writable (determined later), the fault can be fixed
3365	* by setting the Writable bit, which can be done out of mmu_lock.
3366	*/
3367	if (!fault->present)
3368	return !kvm_ad_enabled();
3369
3370	/*
3371	* Note, instruction fetches and writes are mutually exclusive, ignore
3372	* the "exec" flag.
3373	*/
3374	return fault->write;
3375	}
3376
3377	/*
3378	* Returns true if the SPTE was fixed successfully. Otherwise,
3379	* someone else modified the SPTE from its original value.
3380	*/
3381	static bool fast_pf_fix_direct_spte(struct kvm_vcpu *vcpu,
3382	struct kvm_page_fault *fault,
3383	u64 *sptep, u64 old_spte, u64 new_spte)
3384	{
3385	/*
3386	* Theoretically we could also set dirty bit (and flush TLB) here in
3387	* order to eliminate unnecessary PML logging. See comments in
3388	* set_spte. But fast_page_fault is very unlikely to happen with PML
3389	* enabled, so we do not do this. This might result in the same GPA
3390	* to be logged in PML buffer again when the write really happens, and
3391	* eventually to be called by mark_page_dirty twice. But it's also no
3392	* harm. This also avoids the TLB flush needed after setting dirty bit
3393	* so non-PML cases won't be impacted.
3394	*
3395	* Compare with set_spte where instead shadow_dirty_mask is set.
3396	*/
3397	if (!try_cmpxchg64(sptep, &old_spte, new_spte))
3398	return false;
3399
3400	if (is_writable_pte(pte: new_spte) && !is_writable_pte(pte: old_spte))
3401	mark_page_dirty_in_slot(kvm: vcpu->kvm, memslot: fault->slot, gfn: fault->gfn);
3402
3403	return true;
3404	}
3405
3406	static bool is_access_allowed(struct kvm_page_fault *fault, u64 spte)
3407	{
3408	if (fault->exec)
3409	return is_executable_pte(spte);
3410
3411	if (fault->write)
3412	return is_writable_pte(pte: spte);
3413
3414	/ Fault was on Read access /
3415	return spte & PT_PRESENT_MASK;
3416	}
3417
3418	/*
3419	* Returns the last level spte pointer of the shadow page walk for the given
3420	* gpa, and sets *spte to the spte value. This spte may be non-preset. If no
3421	* walk could be performed, returns NULL and *spte does not contain valid data.
3422	*
3423	* Contract:
3424	* - Must be called between walk_shadow_page_lockless_{begin,end}.
3425	* - The returned sptep must not be used after walk_shadow_page_lockless_end.
3426	*/
3427	static u64 fast_pf_get_last_sptep(struct* kvm_vcpu vcpu, gpa_t gpa, u64 spte)
3428	{
3429	struct kvm_shadow_walk_iterator iterator;
3430	u64 old_spte;
3431	u64 *sptep = NULL;
3432
3433	for_each_shadow_entry_lockless(vcpu, gpa, iterator, old_spte) {
3434	sptep = iterator.sptep;
3435	*spte = old_spte;
3436	}
3437
3438	return sptep;
3439	}
3440
3441	/*
3442	* Returns one of RET_PF_INVALID, RET_PF_FIXED or RET_PF_SPURIOUS.
3443	*/
3444	static int fast_page_fault(struct kvm_vcpu vcpu, struct* kvm_page_fault *fault)
3445	{
3446	struct kvm_mmu_page *sp;
3447	int ret = RET_PF_INVALID;
3448	u64 spte;
3449	u64 *sptep;
3450	uint retry_count = `0`;
3451
3452	if (!page_fault_can_be_fast(fault))
3453	return ret;
3454
3455	walk_shadow_page_lockless_begin(vcpu);
3456
3457	do {
3458	u64 new_spte;
3459
3460	if (tdp_mmu_enabled)
3461	sptep = kvm_tdp_mmu_fast_pf_get_last_sptep(vcpu, addr: fault->addr, spte: &spte);
3462	else
3463	sptep = fast_pf_get_last_sptep(vcpu, gpa: fault->addr, spte: &spte);
3464
3465	/*
3466	* It's entirely possible for the mapping to have been zapped
3467	* by a different task, but the root page should always be
3468	* available as the vCPU holds a reference to its root(s).
3469	*/
3470	if (WARN_ON_ONCE(!sptep))
3471	spte = REMOVED_SPTE;
3472
3473	if (!is_shadow_present_pte(pte: spte))
3474	break;
3475
3476	sp = sptep_to_sp(sptep);
3477	if (!is_last_spte(pte: spte, level: sp->role.level))
3478	break;
3479
3480	/*
3481	* Check whether the memory access that caused the fault would
3482	* still cause it if it were to be performed right now. If not,
3483	* then this is a spurious fault caused by TLB lazily flushed,
3484	* or some other CPU has already fixed the PTE after the
3485	* current CPU took the fault.
3486	*
3487	* Need not check the access of upper level table entries since
3488	* they are always ACC_ALL.
3489	*/
3490	if (is_access_allowed(fault, spte)) {
3491	ret = RET_PF_SPURIOUS;
3492	break;
3493	}
3494
3495	new_spte = spte;
3496
3497	/*
3498	* KVM only supports fixing page faults outside of MMU lock for
3499	* direct MMUs, nested MMUs are always indirect, and KVM always
3500	* uses A/D bits for non-nested MMUs. Thus, if A/D bits are
3501	* enabled, the SPTE can't be an access-tracked SPTE.
3502	*/
3503	if (unlikely(!kvm_ad_enabled()) && is_access_track_spte(spte))
3504	new_spte = restore_acc_track_spte(spte: new_spte);
3505
3506	/*
3507	* To keep things simple, only SPTEs that are MMU-writable can
3508	* be made fully writable outside of mmu_lock, e.g. only SPTEs
3509	* that were write-protected for dirty-logging or access
3510	* tracking are handled here. Don't bother checking if the
3511	* SPTE is writable to prioritize running with A/D bits enabled.
3512	* The is_access_allowed() check above handles the common case
3513	* of the fault being spurious, and the SPTE is known to be
3514	* shadow-present, i.e. except for access tracking restoration
3515	* making the new SPTE writable, the check is wasteful.
3516	*/
3517	if (fault->write && is_mmu_writable_spte(spte)) {
3518	new_spte \|= PT_WRITABLE_MASK;
3519
3520	/*
3521	* Do not fix write-permission on the large spte when
3522	* dirty logging is enabled. Since we only dirty the
3523	* first page into the dirty-bitmap in
3524	* fast_pf_fix_direct_spte(), other pages are missed
3525	* if its slot has dirty logging enabled.
3526	*
3527	* Instead, we let the slow page fault path create a
3528	* normal spte to fix the access.
3529	*/
3530	if (sp->role.level > PG_LEVEL_4K &&
3531	kvm_slot_dirty_track_enabled(slot: fault->slot))
3532	break;
3533	}
3534
3535	/ Verify that the fault can be handled in the fast path /
3536	if (new_spte == spte \|\|
3537	!is_access_allowed(fault, spte: new_spte))
3538	break;
3539
3540	/*
3541	* Currently, fast page fault only works for direct mapping
3542	* since the gfn is not stable for indirect shadow page. See
3543	* Documentation/virt/kvm/locking.rst to get more detail.
3544	*/
3545	if (fast_pf_fix_direct_spte(vcpu, fault, sptep, old_spte: spte, new_spte)) {
3546	ret = RET_PF_FIXED;
3547	break;
3548	}
3549
3550	if (++retry_count > `4`) {
3551	pr_warn_once("Fast #PF retrying more than 4 times.\n");
3552	break;
3553	}
3554
3555	} while (true);
3556
3557	trace_fast_page_fault(vcpu, fault, sptep, old_spte: spte, ret);
3558	walk_shadow_page_lockless_end(vcpu);
3559
3560	if (ret != RET_PF_INVALID)
3561	vcpu->stat.pf_fast++;
3562
3563	return ret;
3564	}
3565
3566	static void mmu_free_root_page(struct kvm kvm, hpa_t root_hpa,
3567	struct list_head *invalid_list)
3568	{
3569	struct kvm_mmu_page *sp;
3570
3571	if (!VALID_PAGE(*root_hpa))
3572	return;
3573
3574	sp = root_to_sp(root: *root_hpa);
3575	if (WARN_ON_ONCE(!sp))
3576	return;
3577
3578	if (is_tdp_mmu_page(sp)) {
3579	lockdep_assert_held_read(&kvm->mmu_lock);
3580	kvm_tdp_mmu_put_root(kvm, root: sp);
3581	} else {
3582	lockdep_assert_held_write(&kvm->mmu_lock);
3583	if (!--sp->root_count && sp->role.invalid)
3584	kvm_mmu_prepare_zap_page(kvm, sp, invalid_list);
3585	}
3586
3587	*root_hpa = INVALID_PAGE;
3588	}
3589
3590	/ roots_to_free must be some combination of the KVM_MMU_ROOT_* flags /
3591	void kvm_mmu_free_roots(struct kvm kvm, struct* kvm_mmu *mmu,
3592	ulong roots_to_free)
3593	{
3594	bool is_tdp_mmu = tdp_mmu_enabled && mmu->root_role.direct;
3595	int i;
3596	LIST_HEAD(invalid_list);
3597	bool free_active_root;
3598
3599	WARN_ON_ONCE(roots_to_free & ~KVM_MMU_ROOTS_ALL);
3600
3601	BUILD_BUG_ON(KVM_MMU_NUM_PREV_ROOTS >= BITS_PER_LONG);
3602
3603	/ Before acquiring the MMU lock, see if we need to do any real work. /
3604	free_active_root = (roots_to_free & KVM_MMU_ROOT_CURRENT)
3605	&& VALID_PAGE(mmu->root.hpa);
3606
3607	if (!free_active_root) {
3608	for (i = `0`; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3609	if ((roots_to_free & KVM_MMU_ROOT_PREVIOUS(i)) &&
3610	VALID_PAGE(mmu->prev_roots[i].hpa))
3611	break;
3612
3613	if (i == KVM_MMU_NUM_PREV_ROOTS)
3614	return;
3615	}
3616
3617	if (is_tdp_mmu)
3618	read_lock(&kvm->mmu_lock);
3619	else
3620	write_lock(&kvm->mmu_lock);
3621
3622	for (i = `0`; i < KVM_MMU_NUM_PREV_ROOTS; i++)
3623	if (roots_to_free & KVM_MMU_ROOT_PREVIOUS(i))
3624	mmu_free_root_page(kvm, root_hpa: &mmu->prev_roots[i].hpa,
3625	invalid_list: &invalid_list);
3626
3627	if (free_active_root) {
3628	if (kvm_mmu_is_dummy_root(shadow_page: mmu->root.hpa)) {
3629	/ Nothing to cleanup for dummy roots. /
3630	} else if (root_to_sp(root: mmu->root.hpa)) {
3631	mmu_free_root_page(kvm, root_hpa: &mmu->root.hpa, invalid_list: &invalid_list);
3632	} else if (mmu->pae_root) {
3633	for (i = `0`; i < `4`; ++i) {
3634	if (!IS_VALID_PAE_ROOT(mmu->pae_root[i]))
3635	continue;
3636
3637	mmu_free_root_page(kvm, root_hpa: &mmu->pae_root[i],
3638	invalid_list: &invalid_list);
3639	mmu->pae_root[i] = INVALID_PAE_ROOT;
3640	}
3641	}
3642	mmu->root.hpa = INVALID_PAGE;
3643	mmu->root.pgd = `0`;
3644	}
3645
3646	if (is_tdp_mmu) {
3647	read_unlock(&kvm->mmu_lock);
3648	WARN_ON_ONCE(!list_empty(&invalid_list));
3649	} else {
3650	kvm_mmu_commit_zap_page(kvm, invalid_list: &invalid_list);
3651	write_unlock(&kvm->mmu_lock);
3652	}
3653	}
3654	EXPORT_SYMBOL_GPL(kvm_mmu_free_roots);
3655
3656	void kvm_mmu_free_guest_mode_roots(struct kvm kvm, struct* kvm_mmu *mmu)
3657	{
3658	unsigned long roots_to_free = `0`;
3659	struct kvm_mmu_page *sp;
3660	hpa_t root_hpa;
3661	int i;
3662
3663	/*
3664	* This should not be called while L2 is active, L2 can't invalidate
3665	* _only_ its own roots, e.g. INVVPID unconditionally exits.
3666	*/
3667	WARN_ON_ONCE(mmu->root_role.guest_mode);
3668
3669	for (i = `0`; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
3670	root_hpa = mmu->prev_roots[i].hpa;
3671	if (!VALID_PAGE(root_hpa))
3672	continue;
3673
3674	sp = root_to_sp(root: root_hpa);
3675	if (!sp \|\| sp->role.guest_mode)
3676	roots_to_free \|= KVM_MMU_ROOT_PREVIOUS(i);
3677	}
3678
3679	kvm_mmu_free_roots(kvm, mmu, roots_to_free);
3680	}
3681	EXPORT_SYMBOL_GPL(kvm_mmu_free_guest_mode_roots);
3682
3683	static hpa_t mmu_alloc_root(struct kvm_vcpu vcpu, gfn_t gfn, int* quadrant,
3684	u8 level)
3685	{
3686	union kvm_mmu_page_role role = vcpu->arch.mmu->root_role;
3687	struct kvm_mmu_page *sp;
3688
3689	role.level = level;
3690	role.quadrant = quadrant;
3691
3692	WARN_ON_ONCE(quadrant && !role.has_4_byte_gpte);
3693	WARN_ON_ONCE(role.direct && role.has_4_byte_gpte);
3694
3695	sp = kvm_mmu_get_shadow_page(vcpu, gfn, role);
3696	++sp->root_count;
3697
3698	return __pa(sp->spt);
3699	}
3700
3701	static int mmu_alloc_direct_roots(struct kvm_vcpu *vcpu)
3702	{
3703	struct kvm_mmu *mmu = vcpu->arch.mmu;
3704	u8 shadow_root_level = mmu->root_role.level;
3705	hpa_t root;
3706	unsigned i;
3707	int r;
3708
3709	if (tdp_mmu_enabled)
3710	return kvm_tdp_mmu_alloc_root(vcpu);
3711
3712	write_lock(&vcpu->kvm->mmu_lock);
3713	r = make_mmu_pages_available(vcpu);
3714	if (r < `0`)
3715	goto out_unlock;
3716
3717	if (shadow_root_level >= PT64_ROOT_4LEVEL) {
3718	root = mmu_alloc_root(vcpu, gfn: `0`, quadrant: `0`, level: shadow_root_level);
3719	mmu->root.hpa = root;
3720	} else if (shadow_root_level == PT32E_ROOT_LEVEL) {
3721	if (WARN_ON_ONCE(!mmu->pae_root)) {
3722	r = -EIO;
3723	goto out_unlock;
3724	}
3725
3726	for (i = `0`; i < `4`; ++i) {
3727	WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3728
3729	root = mmu_alloc_root(vcpu, gfn: i << (`30` - PAGE_SHIFT), quadrant: `0`,
3730	level: PT32_ROOT_LEVEL);
3731	mmu->pae_root[i] = root \| PT_PRESENT_MASK \|
3732	shadow_me_value;
3733	}
3734	mmu->root.hpa = __pa(mmu->pae_root);
3735	} else {
3736	WARN_ONCE(`1`, "Bad TDP root level = %d\n", shadow_root_level);
3737	r = -EIO;
3738	goto out_unlock;
3739	}
3740
3741	/ root.pgd is ignored for direct MMUs. /
3742	mmu->root.pgd = `0`;
3743	out_unlock:
3744	write_unlock(&vcpu->kvm->mmu_lock);
3745	return r;
3746	}
3747
3748	static int mmu_first_shadow_root_alloc(struct kvm *kvm)
3749	{
3750	struct kvm_memslots *slots;
3751	struct kvm_memory_slot *slot;
3752	int r = `0`, i, bkt;
3753
3754	/*
3755	* Check if this is the first shadow root being allocated before
3756	* taking the lock.
3757	*/
3758	if (kvm_shadow_root_allocated(kvm))
3759	return `0`;
3760
3761	mutex_lock(&kvm->slots_arch_lock);
3762
3763	/ Recheck, under the lock, whether this is the first shadow root. /
3764	if (kvm_shadow_root_allocated(kvm))
3765	goto out_unlock;
3766
3767	/*
3768	* Check if anything actually needs to be allocated, e.g. all metadata
3769	* will be allocated upfront if TDP is disabled.
3770	*/
3771	if (kvm_memslots_have_rmaps(kvm) &&
3772	kvm_page_track_write_tracking_enabled(kvm))
3773	goto out_success;
3774
3775	for (i = `0`; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
3776	slots = __kvm_memslots(kvm, as_id: i);
3777	kvm_for_each_memslot(slot, bkt, slots) {
3778	/*
3779	* Both of these functions are no-ops if the target is
3780	* already allocated, so unconditionally calling both
3781	* is safe. Intentionally do NOT free allocations on
3782	* failure to avoid having to track which allocations
3783	* were made now versus when the memslot was created.
3784	* The metadata is guaranteed to be freed when the slot
3785	* is freed, and will be kept/used if userspace retries
3786	* KVM_RUN instead of killing the VM.
3787	*/
3788	r = memslot_rmap_alloc(slot, npages: slot->npages);
3789	if (r)
3790	goto out_unlock;
3791	r = kvm_page_track_write_tracking_alloc(slot);
3792	if (r)
3793	goto out_unlock;
3794	}
3795	}
3796
3797	/*
3798	* Ensure that shadow_root_allocated becomes true strictly after
3799	* all the related pointers are set.
3800	*/
3801	out_success:
3802	smp_store_release(&kvm->arch.shadow_root_allocated, true);
3803
3804	out_unlock:
3805	mutex_unlock(lock: &kvm->slots_arch_lock);
3806	return r;
3807	}
3808
3809	static int mmu_alloc_shadow_roots(struct kvm_vcpu *vcpu)
3810	{
3811	struct kvm_mmu *mmu = vcpu->arch.mmu;
3812	u64 pdptrs[`4`], pm_mask;
3813	gfn_t root_gfn, root_pgd;
3814	int quadrant, i, r;
3815	hpa_t root;
3816
3817	root_pgd = kvm_mmu_get_guest_pgd(vcpu, mmu);
3818	root_gfn = (root_pgd & __PT_BASE_ADDR_MASK) >> PAGE_SHIFT;
3819
3820	if (!kvm_vcpu_is_visible_gfn(vcpu, gfn: root_gfn)) {
3821	mmu->root.hpa = kvm_mmu_get_dummy_root();
3822	return `0`;
3823	}
3824
3825	/*
3826	* On SVM, reading PDPTRs might access guest memory, which might fault
3827	* and thus might sleep. Grab the PDPTRs before acquiring mmu_lock.
3828	*/
3829	if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3830	for (i = `0`; i < `4`; ++i) {
3831	pdptrs[i] = mmu->get_pdptr(vcpu, i);
3832	if (!(pdptrs[i] & PT_PRESENT_MASK))
3833	continue;
3834
3835	if (!kvm_vcpu_is_visible_gfn(vcpu, gfn: pdptrs[i] >> PAGE_SHIFT))
3836	pdptrs[i] = `0`;
3837	}
3838	}
3839
3840	r = mmu_first_shadow_root_alloc(kvm: vcpu->kvm);
3841	if (r)
3842	return r;
3843
3844	write_lock(&vcpu->kvm->mmu_lock);
3845	r = make_mmu_pages_available(vcpu);
3846	if (r < `0`)
3847	goto out_unlock;
3848
3849	/*
3850	* Do we shadow a long mode page table? If so we need to
3851	* write-protect the guests page table root.
3852	*/
3853	if (mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
3854	root = mmu_alloc_root(vcpu, gfn: root_gfn, quadrant: `0`,
3855	level: mmu->root_role.level);
3856	mmu->root.hpa = root;
3857	goto set_root_pgd;
3858	}
3859
3860	if (WARN_ON_ONCE(!mmu->pae_root)) {
3861	r = -EIO;
3862	goto out_unlock;
3863	}
3864
3865	/*
3866	* We shadow a 32 bit page table. This may be a legacy 2-level
3867	* or a PAE 3-level page table. In either case we need to be aware that
3868	* the shadow page table may be a PAE or a long mode page table.
3869	*/
3870	pm_mask = PT_PRESENT_MASK \| shadow_me_value;
3871	if (mmu->root_role.level >= PT64_ROOT_4LEVEL) {
3872	pm_mask \|= PT_ACCESSED_MASK \| PT_WRITABLE_MASK \| PT_USER_MASK;
3873
3874	if (WARN_ON_ONCE(!mmu->pml4_root)) {
3875	r = -EIO;
3876	goto out_unlock;
3877	}
3878	mmu->pml4_root[`0`] = __pa(mmu->pae_root) \| pm_mask;
3879
3880	if (mmu->root_role.level == PT64_ROOT_5LEVEL) {
3881	if (WARN_ON_ONCE(!mmu->pml5_root)) {
3882	r = -EIO;
3883	goto out_unlock;
3884	}
3885	mmu->pml5_root[`0`] = __pa(mmu->pml4_root) \| pm_mask;
3886	}
3887	}
3888
3889	for (i = `0`; i < `4`; ++i) {
3890	WARN_ON_ONCE(IS_VALID_PAE_ROOT(mmu->pae_root[i]));
3891
3892	if (mmu->cpu_role.base.level == PT32E_ROOT_LEVEL) {
3893	if (!(pdptrs[i] & PT_PRESENT_MASK)) {
3894	mmu->pae_root[i] = INVALID_PAE_ROOT;
3895	continue;
3896	}
3897	root_gfn = pdptrs[i] >> PAGE_SHIFT;
3898	}
3899
3900	/*
3901	* If shadowing 32-bit non-PAE page tables, each PAE page
3902	* directory maps one quarter of the guest's non-PAE page
3903	* directory. Othwerise each PAE page direct shadows one guest
3904	* PAE page directory so that quadrant should be 0.
3905	*/
3906	quadrant = (mmu->cpu_role.base.level == PT32_ROOT_LEVEL) ? i : `0`;
3907
3908	root = mmu_alloc_root(vcpu, root_gfn, quadrant, PT32_ROOT_LEVEL);
3909	mmu->pae_root[i] = root \| pm_mask;
3910	}
3911
3912	if (mmu->root_role.level == PT64_ROOT_5LEVEL)
3913	mmu->root.hpa = __pa(mmu->pml5_root);
3914	else if (mmu->root_role.level == PT64_ROOT_4LEVEL)
3915	mmu->root.hpa = __pa(mmu->pml4_root);
3916	else
3917	mmu->root.hpa = __pa(mmu->pae_root);
3918
3919	set_root_pgd:
3920	mmu->root.pgd = root_pgd;
3921	out_unlock:
3922	write_unlock(&vcpu->kvm->mmu_lock);
3923
3924	return r;
3925	}
3926
3927	static int mmu_alloc_special_roots(struct kvm_vcpu *vcpu)
3928	{
3929	struct kvm_mmu *mmu = vcpu->arch.mmu;
3930	bool need_pml5 = mmu->root_role.level > PT64_ROOT_4LEVEL;
3931	u64 *pml5_root = NULL;
3932	u64 *pml4_root = NULL;
3933	u64 *pae_root;
3934
3935	/*
3936	* When shadowing 32-bit or PAE NPT with 64-bit NPT, the PML4 and PDP
3937	* tables are allocated and initialized at root creation as there is no
3938	* equivalent level in the guest's NPT to shadow. Allocate the tables
3939	* on demand, as running a 32-bit L1 VMM on 64-bit KVM is very rare.
3940	*/
3941	if (mmu->root_role.direct \|\|
3942	mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL \|\|
3943	mmu->root_role.level < PT64_ROOT_4LEVEL)
3944	return `0`;
3945
3946	/*
3947	* NPT, the only paging mode that uses this horror, uses a fixed number
3948	* of levels for the shadow page tables, e.g. all MMUs are 4-level or
3949	* all MMus are 5-level. Thus, this can safely require that pml5_root
3950	* is allocated if the other roots are valid and pml5 is needed, as any
3951	* prior MMU would also have required pml5.
3952	*/
3953	if (mmu->pae_root && mmu->pml4_root && (!need_pml5 \|\| mmu->pml5_root))
3954	return `0`;
3955
3956	/*
3957	* The special roots should always be allocated in concert. Yell and
3958	* bail if KVM ends up in a state where only one of the roots is valid.
3959	*/
3960	if (WARN_ON_ONCE(!tdp_enabled \|\| mmu->pae_root \|\| mmu->pml4_root \|\|
3961	(need_pml5 && mmu->pml5_root)))
3962	return -EIO;
3963
3964	/*
3965	* Unlike 32-bit NPT, the PDP table doesn't need to be in low mem, and
3966	* doesn't need to be decrypted.
3967	*/
3968	pae_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3969	if (!pae_root)
3970	return -ENOMEM;
3971
3972	#ifdef CONFIG_X86_64
3973	pml4_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3974	if (!pml4_root)
3975	goto err_pml4;
3976
3977	if (need_pml5) {
3978	pml5_root = (void *)get_zeroed_page(GFP_KERNEL_ACCOUNT);
3979	if (!pml5_root)
3980	goto err_pml5;
3981	}
3982	#endif
3983
3984	mmu->pae_root = pae_root;
3985	mmu->pml4_root = pml4_root;
3986	mmu->pml5_root = pml5_root;
3987
3988	return `0`;
3989
3990	#ifdef CONFIG_X86_64
3991	err_pml5:
3992	free_page((unsigned long)pml4_root);
3993	err_pml4:
3994	free_page((unsigned long)pae_root);
3995	return -ENOMEM;
3996	#endif
3997	}
3998
3999	static bool is_unsync_root(hpa_t root)
4000	{
4001	struct kvm_mmu_page *sp;
4002
4003	if (!VALID_PAGE(root) \|\| kvm_mmu_is_dummy_root(shadow_page: root))
4004	return false;
4005
4006	/*
4007	* The read barrier orders the CPU's read of SPTE.W during the page table
4008	* walk before the reads of sp->unsync/sp->unsync_children here.
4009	*
4010	* Even if another CPU was marking the SP as unsync-ed simultaneously,
4011	* any guest page table changes are not guaranteed to be visible anyway
4012	* until this VCPU issues a TLB flush strictly after those changes are
4013	* made. We only need to ensure that the other CPU sets these flags
4014	* before any actual changes to the page tables are made. The comments
4015	* in mmu_try_to_unsync_pages() describe what could go wrong if this
4016	* requirement isn't satisfied.
4017	*/
4018	smp_rmb();
4019	sp = root_to_sp(root);
4020
4021	/*
4022	* PAE roots (somewhat arbitrarily) aren't backed by shadow pages, the
4023	* PDPTEs for a given PAE root need to be synchronized individually.
4024	*/
4025	if (WARN_ON_ONCE(!sp))
4026	return false;
4027
4028	if (sp->unsync \|\| sp->unsync_children)
4029	return true;
4030
4031	return false;
4032	}
4033
4034	void kvm_mmu_sync_roots(struct kvm_vcpu *vcpu)
4035	{
4036	int i;
4037	struct kvm_mmu_page *sp;
4038
4039	if (vcpu->arch.mmu->root_role.direct)
4040	return;
4041
4042	if (!VALID_PAGE(vcpu->arch.mmu->root.hpa))
4043	return;
4044
4045	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4046
4047	if (vcpu->arch.mmu->cpu_role.base.level >= PT64_ROOT_4LEVEL) {
4048	hpa_t root = vcpu->arch.mmu->root.hpa;
4049
4050	if (!is_unsync_root(root))
4051	return;
4052
4053	sp = root_to_sp(root);
4054
4055	write_lock(&vcpu->kvm->mmu_lock);
4056	mmu_sync_children(vcpu, parent: sp, can_yield: true);
4057	write_unlock(&vcpu->kvm->mmu_lock);
4058	return;
4059	}
4060
4061	write_lock(&vcpu->kvm->mmu_lock);
4062
4063	for (i = `0`; i < `4`; ++i) {
4064	hpa_t root = vcpu->arch.mmu->pae_root[i];
4065
4066	if (IS_VALID_PAE_ROOT(root)) {
4067	sp = spte_to_child_sp(spte: root);
4068	mmu_sync_children(vcpu, parent: sp, can_yield: true);
4069	}
4070	}
4071
4072	write_unlock(&vcpu->kvm->mmu_lock);
4073	}
4074
4075	void kvm_mmu_sync_prev_roots(struct kvm_vcpu *vcpu)
4076	{
4077	unsigned long roots_to_free = `0`;
4078	int i;
4079
4080	for (i = `0`; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4081	if (is_unsync_root(root: vcpu->arch.mmu->prev_roots[i].hpa))
4082	roots_to_free \|= KVM_MMU_ROOT_PREVIOUS(i);
4083
4084	/ sync prev_roots by simply freeing them /
4085	kvm_mmu_free_roots(vcpu->kvm, vcpu->arch.mmu, roots_to_free);
4086	}
4087
4088	static gpa_t nonpaging_gva_to_gpa(struct kvm_vcpu vcpu, struct* kvm_mmu *mmu,
4089	gpa_t vaddr, u64 access,
4090	struct x86_exception *exception)
4091	{
4092	if (exception)
4093	exception->error_code = `0`;
4094	return kvm_translate_gpa(vcpu, mmu, vaddr, access, exception);
4095	}
4096
4097	static bool mmio_info_in_cache(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4098	{
4099	/*
4100	* A nested guest cannot use the MMIO cache if it is using nested
4101	* page tables, because cr2 is a nGPA while the cache stores GPAs.
4102	*/
4103	if (mmu_is_nested(vcpu))
4104	return false;
4105
4106	if (direct)
4107	return vcpu_match_mmio_gpa(vcpu, addr);
4108
4109	return vcpu_match_mmio_gva(vcpu, addr);
4110	}
4111
4112	/*
4113	* Return the level of the lowest level SPTE added to sptes.
4114	* That SPTE may be non-present.
4115	*
4116	* Must be called between walk_shadow_page_lockless_{begin,end}.
4117	*/
4118	static int get_walk(struct kvm_vcpu vcpu, u64 addr, u64 sptes, int *root_level)
4119	{
4120	struct kvm_shadow_walk_iterator iterator;
4121	int leaf = -`1`;
4122	u64 spte;
4123
4124	for (shadow_walk_init(iterator: &iterator, vcpu, addr),
4125	*root_level = iterator.level;
4126	shadow_walk_okay(iterator: &iterator);
4127	__shadow_walk_next(iterator: &iterator, spte)) {
4128	leaf = iterator.level;
4129	spte = mmu_spte_get_lockless(sptep: iterator.sptep);
4130
4131	sptes[leaf] = spte;
4132	}
4133
4134	return leaf;
4135	}
4136
4137	/ return true if reserved bit(s) are detected on a valid, non-MMIO SPTE. /
4138	static bool get_mmio_spte(struct kvm_vcpu vcpu, u64 addr, u64 sptep)
4139	{
4140	u64 sptes[PT64_ROOT_MAX_LEVEL + `1`];
4141	struct rsvd_bits_validate *rsvd_check;
4142	int root, leaf, level;
4143	bool reserved = false;
4144
4145	walk_shadow_page_lockless_begin(vcpu);
4146
4147	if (is_tdp_mmu_active(vcpu))
4148	leaf = kvm_tdp_mmu_get_walk(vcpu, addr, sptes, root_level: &root);
4149	else
4150	leaf = get_walk(vcpu, addr, sptes, root_level: &root);
4151
4152	walk_shadow_page_lockless_end(vcpu);
4153
4154	if (unlikely(leaf < `0`)) {
4155	*sptep = `0ull`;
4156	return reserved;
4157	}
4158
4159	*sptep = sptes[leaf];
4160
4161	/*
4162	* Skip reserved bits checks on the terminal leaf if it's not a valid
4163	* SPTE. Note, this also (intentionally) skips MMIO SPTEs, which, by
4164	* design, always have reserved bits set. The purpose of the checks is
4165	* to detect reserved bits on non-MMIO SPTEs. i.e. buggy SPTEs.
4166	*/
4167	if (!is_shadow_present_pte(pte: sptes[leaf]))
4168	leaf++;
4169
4170	rsvd_check = &vcpu->arch.mmu->shadow_zero_check;
4171
4172	for (level = root; level >= leaf; level--)
4173	reserved \|= is_rsvd_spte(rsvd_check, spte: sptes[level], level);
4174
4175	if (reserved) {
4176	pr_err("%s: reserved bits set on MMU-present spte, addr 0x%llx, hierarchy:\n",
4177	__func__, addr);
4178	for (level = root; level >= leaf; level--)
4179	pr_err("------ spte = 0x%llx level = %d, rsvd bits = 0x%llx",
4180	sptes[level], level,
4181	get_rsvd_bits(rsvd_check, sptes[level], level));
4182	}
4183
4184	return reserved;
4185	}
4186
4187	static int handle_mmio_page_fault(struct kvm_vcpu *vcpu, u64 addr, bool direct)
4188	{
4189	u64 spte;
4190	bool reserved;
4191
4192	if (mmio_info_in_cache(vcpu, addr, direct))
4193	return RET_PF_EMULATE;
4194
4195	reserved = get_mmio_spte(vcpu, addr, sptep: &spte);
4196	if (WARN_ON_ONCE(reserved))
4197	return -EINVAL;
4198
4199	if (is_mmio_spte(spte)) {
4200	gfn_t gfn = get_mmio_spte_gfn(spte);
4201	unsigned int access = get_mmio_spte_access(spte);
4202
4203	if (!check_mmio_spte(vcpu, spte))
4204	return RET_PF_INVALID;
4205
4206	if (direct)
4207	addr = `0`;
4208
4209	trace_handle_mmio_page_fault(addr, gfn, access);
4210	vcpu_cache_mmio_info(vcpu, addr, gfn, access);
4211	return RET_PF_EMULATE;
4212	}
4213
4214	/*
4215	* If the page table is zapped by other cpus, let CPU fault again on
4216	* the address.
4217	*/
4218	return RET_PF_RETRY;
4219	}
4220
4221	static bool page_fault_handle_page_track(struct kvm_vcpu *vcpu,
4222	struct kvm_page_fault *fault)
4223	{
4224	if (unlikely(fault->rsvd))
4225	return false;
4226
4227	if (!fault->present \|\| !fault->write)
4228	return false;
4229
4230	/*
4231	* guest is writing the page which is write tracked which can
4232	* not be fixed by page fault handler.
4233	*/
4234	if (kvm_gfn_is_write_tracked(kvm: vcpu->kvm, slot: fault->slot, gfn: fault->gfn))
4235	return true;
4236
4237	return false;
4238	}
4239
4240	static void shadow_page_table_clear_flood(struct kvm_vcpu *vcpu, gva_t addr)
4241	{
4242	struct kvm_shadow_walk_iterator iterator;
4243	u64 spte;
4244
4245	walk_shadow_page_lockless_begin(vcpu);
4246	for_each_shadow_entry_lockless(vcpu, addr, iterator, spte)
4247	clear_sp_write_flooding_count(spte: iterator.sptep);
4248	walk_shadow_page_lockless_end(vcpu);
4249	}
4250
4251	static u32 alloc_apf_token(struct kvm_vcpu *vcpu)
4252	{
4253	/ make sure the token value is not 0 /
4254	u32 id = vcpu->arch.apf.id;
4255
4256	if (id << `12` == `0`)
4257	vcpu->arch.apf.id = `1`;
4258
4259	return (vcpu->arch.apf.id++ << `12`) \| vcpu->vcpu_id;
4260	}
4261
4262	static bool kvm_arch_setup_async_pf(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa,
4263	gfn_t gfn)
4264	{
4265	struct kvm_arch_async_pf arch;
4266
4267	arch.token = alloc_apf_token(vcpu);
4268	arch.gfn = gfn;
4269	arch.direct_map = vcpu->arch.mmu->root_role.direct;
4270	arch.cr3 = kvm_mmu_get_guest_pgd(vcpu, mmu: vcpu->arch.mmu);
4271
4272	return kvm_setup_async_pf(vcpu, cr2_or_gpa,
4273	hva: kvm_vcpu_gfn_to_hva(vcpu, gfn), arch: &arch);
4274	}
4275
4276	void kvm_arch_async_page_ready(struct kvm_vcpu vcpu, struct* kvm_async_pf *work)
4277	{
4278	int r;
4279
4280	if ((vcpu->arch.mmu->root_role.direct != work->arch.direct_map) \|\|
4281	work->wakeup_all)
4282	return;
4283
4284	r = kvm_mmu_reload(vcpu);
4285	if (unlikely(r))
4286	return;
4287
4288	if (!vcpu->arch.mmu->root_role.direct &&
4289	work->arch.cr3 != kvm_mmu_get_guest_pgd(vcpu, mmu: vcpu->arch.mmu))
4290	return;
4291
4292	kvm_mmu_do_page_fault(vcpu, cr2_or_gpa: work->cr2_or_gpa, err: `0`, prefetch: true, NULL);
4293	}
4294
4295	static inline u8 kvm_max_level_for_order(int order)
4296	{
4297	BUILD_BUG_ON(KVM_MAX_HUGEPAGE_LEVEL > PG_LEVEL_1G);
4298
4299	KVM_MMU_WARN_ON(order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G) &&
4300	order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M) &&
4301	order != KVM_HPAGE_GFN_SHIFT(PG_LEVEL_4K));
4302
4303	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_1G))
4304	return PG_LEVEL_1G;
4305
4306	if (order >= KVM_HPAGE_GFN_SHIFT(PG_LEVEL_2M))
4307	return PG_LEVEL_2M;
4308
4309	return PG_LEVEL_4K;
4310	}
4311
4312	static void kvm_mmu_prepare_memory_fault_exit(struct kvm_vcpu *vcpu,
4313	struct kvm_page_fault *fault)
4314	{
4315	kvm_prepare_memory_fault_exit(vcpu, gpa: fault->gfn << PAGE_SHIFT,
4316	PAGE_SIZE, is_write: fault->write, is_exec: fault->exec,
4317	is_private: fault->is_private);
4318	}
4319
4320	static int kvm_faultin_pfn_private(struct kvm_vcpu *vcpu,
4321	struct kvm_page_fault *fault)
4322	{
4323	int max_order, r;
4324
4325	if (!kvm_slot_can_be_private(slot: fault->slot)) {
4326	kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
4327	return -EFAULT;
4328	}
4329
4330	r = kvm_gmem_get_pfn(kvm: vcpu->kvm, slot: fault->slot, gfn: fault->gfn, pfn: &fault->pfn,
4331	max_order: &max_order);
4332	if (r) {
4333	kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
4334	return r;
4335	}
4336
4337	fault->max_level = min(kvm_max_level_for_order(max_order),
4338	fault->max_level);
4339	fault->map_writable = !(fault->slot->flags & KVM_MEM_READONLY);
4340
4341	return RET_PF_CONTINUE;
4342	}
4343
4344	static int __kvm_faultin_pfn(struct kvm_vcpu vcpu, struct* kvm_page_fault *fault)
4345	{
4346	struct kvm_memory_slot *slot = fault->slot;
4347	bool async;
4348
4349	/*
4350	* Retry the page fault if the gfn hit a memslot that is being deleted
4351	* or moved. This ensures any existing SPTEs for the old memslot will
4352	* be zapped before KVM inserts a new MMIO SPTE for the gfn.
4353	*/
4354	if (slot && (slot->flags & KVM_MEMSLOT_INVALID))
4355	return RET_PF_RETRY;
4356
4357	if (!kvm_is_visible_memslot(memslot: slot)) {
4358	/ Don't expose private memslots to L2. /
4359	if (is_guest_mode(vcpu)) {
4360	fault->slot = NULL;
4361	fault->pfn = KVM_PFN_NOSLOT;
4362	fault->map_writable = false;
4363	return RET_PF_CONTINUE;
4364	}
4365	/*
4366	* If the APIC access page exists but is disabled, go directly
4367	* to emulation without caching the MMIO access or creating a
4368	* MMIO SPTE. That way the cache doesn't need to be purged
4369	* when the AVIC is re-enabled.
4370	*/
4371	if (slot && slot->id == APIC_ACCESS_PAGE_PRIVATE_MEMSLOT &&
4372	!kvm_apicv_activated(kvm: vcpu->kvm))
4373	return RET_PF_EMULATE;
4374	}
4375
4376	if (fault->is_private != kvm_mem_is_private(kvm: vcpu->kvm, gfn: fault->gfn)) {
4377	kvm_mmu_prepare_memory_fault_exit(vcpu, fault);
4378	return -EFAULT;
4379	}
4380
4381	if (fault->is_private)
4382	return kvm_faultin_pfn_private(vcpu, fault);
4383
4384	async = false;
4385	fault->pfn = __gfn_to_pfn_memslot(slot, gfn: fault->gfn, atomic: false, interruptible: false, async: &async,
4386	write_fault: fault->write, writable: &fault->map_writable,
4387	hva: &fault->hva);
4388	if (!async)
4389	return RET_PF_CONTINUE; / pfn has correct page already /*
4390
4391	if (!fault->prefetch && kvm_can_do_async_pf(vcpu)) {
4392	trace_kvm_try_async_get_page(gva: fault->addr, gfn: fault->gfn);
4393	if (kvm_find_async_pf_gfn(vcpu, gfn: fault->gfn)) {
4394	trace_kvm_async_pf_repeated_fault(gva: fault->addr, gfn: fault->gfn);
4395	kvm_make_request(KVM_REQ_APF_HALT, vcpu);
4396	return RET_PF_RETRY;
4397	} else if (kvm_arch_setup_async_pf(vcpu, cr2_or_gpa: fault->addr, gfn: fault->gfn)) {
4398	return RET_PF_RETRY;
4399	}
4400	}
4401
4402	/*
4403	* Allow gup to bail on pending non-fatal signals when it's also allowed
4404	* to wait for IO. Note, gup always bails if it is unable to quickly
4405	* get a page and a fatal signal, i.e. SIGKILL, is pending.
4406	*/
4407	fault->pfn = __gfn_to_pfn_memslot(slot, gfn: fault->gfn, atomic: false, interruptible: true, NULL,
4408	write_fault: fault->write, writable: &fault->map_writable,
4409	hva: &fault->hva);
4410	return RET_PF_CONTINUE;
4411	}
4412
4413	static int kvm_faultin_pfn(struct kvm_vcpu vcpu, struct* kvm_page_fault *fault,
4414	unsigned int access)
4415	{
4416	int ret;
4417
4418	fault->mmu_seq = vcpu->kvm->mmu_invalidate_seq;
4419	smp_rmb();
4420
4421	/*
4422	* Check for a relevant mmu_notifier invalidation event before getting
4423	* the pfn from the primary MMU, and before acquiring mmu_lock.
4424	*
4425	* For mmu_lock, if there is an in-progress invalidation and the kernel
4426	* allows preemption, the invalidation task may drop mmu_lock and yield
4427	* in response to mmu_lock being contended, which is very counter-
4428	* productive as this vCPU can't actually make forward progress until
4429	* the invalidation completes.
4430	*
4431	* Retrying now can also avoid unnessary lock contention in the primary
4432	* MMU, as the primary MMU doesn't necessarily hold a single lock for
4433	* the duration of the invalidation, i.e. faulting in a conflicting pfn
4434	* can cause the invalidation to take longer by holding locks that are
4435	* needed to complete the invalidation.
4436	*
4437	* Do the pre-check even for non-preemtible kernels, i.e. even if KVM
4438	* will never yield mmu_lock in response to contention, as this vCPU is
4439	* guaranteed to need to retry, i.e. waiting until mmu_lock is held
4440	* to detect retry guarantees the worst case latency for the vCPU.
4441	*/
4442	if (fault->slot &&
4443	mmu_invalidate_retry_gfn_unsafe(kvm: vcpu->kvm, mmu_seq: fault->mmu_seq, gfn: fault->gfn))
4444	return RET_PF_RETRY;
4445
4446	ret = __kvm_faultin_pfn(vcpu, fault);
4447	if (ret != RET_PF_CONTINUE)
4448	return ret;
4449
4450	if (unlikely(is_error_pfn(fault->pfn)))
4451	return kvm_handle_error_pfn(vcpu, fault);
4452
4453	if (unlikely(!fault->slot))
4454	return kvm_handle_noslot_fault(vcpu, fault, access);
4455
4456	/*
4457	* Check again for a relevant mmu_notifier invalidation event purely to
4458	* avoid contending mmu_lock. Most invalidations will be detected by
4459	* the previous check, but checking is extremely cheap relative to the
4460	* overall cost of failing to detect the invalidation until after
4461	* mmu_lock is acquired.
4462	*/
4463	if (mmu_invalidate_retry_gfn_unsafe(kvm: vcpu->kvm, mmu_seq: fault->mmu_seq, gfn: fault->gfn)) {
4464	kvm_release_pfn_clean(pfn: fault->pfn);
4465	return RET_PF_RETRY;
4466	}
4467
4468	return RET_PF_CONTINUE;
4469	}
4470
4471	/*
4472	* Returns true if the page fault is stale and needs to be retried, i.e. if the
4473	* root was invalidated by a memslot update or a relevant mmu_notifier fired.
4474	*/
4475	static bool is_page_fault_stale(struct kvm_vcpu *vcpu,
4476	struct kvm_page_fault *fault)
4477	{
4478	struct kvm_mmu_page *sp = root_to_sp(root: vcpu->arch.mmu->root.hpa);
4479
4480	/ Special roots, e.g. pae_root, are not backed by shadow pages. /
4481	if (sp && is_obsolete_sp(kvm: vcpu->kvm, sp))
4482	return true;
4483
4484	/*
4485	* Roots without an associated shadow page are considered invalid if
4486	* there is a pending request to free obsolete roots. The request is
4487	* only a hint that the current root _may_ be obsolete and needs to be
4488	* reloaded, e.g. if the guest frees a PGD that KVM is tracking as a
4489	* previous root, then __kvm_mmu_prepare_zap_page() signals all vCPUs
4490	* to reload even if no vCPU is actively using the root.
4491	*/
4492	if (!sp && kvm_test_request(KVM_REQ_MMU_FREE_OBSOLETE_ROOTS, vcpu))
4493	return true;
4494
4495	/*
4496	* Check for a relevant mmu_notifier invalidation event one last time
4497	* now that mmu_lock is held, as the "unsafe" checks performed without
4498	* holding mmu_lock can get false negatives.
4499	*/
4500	return fault->slot &&
4501	mmu_invalidate_retry_gfn(kvm: vcpu->kvm, mmu_seq: fault->mmu_seq, gfn: fault->gfn);
4502	}
4503
4504	static int direct_page_fault(struct kvm_vcpu vcpu, struct* kvm_page_fault *fault)
4505	{
4506	int r;
4507
4508	/ Dummy roots are used only for shadowing bad guest roots. /
4509	if (WARN_ON_ONCE(kvm_mmu_is_dummy_root(vcpu->arch.mmu->root.hpa)))
4510	return RET_PF_RETRY;
4511
4512	if (page_fault_handle_page_track(vcpu, fault))
4513	return RET_PF_EMULATE;
4514
4515	r = fast_page_fault(vcpu, fault);
4516	if (r != RET_PF_INVALID)
4517	return r;
4518
4519	r = mmu_topup_memory_caches(vcpu, maybe_indirect: false);
4520	if (r)
4521	return r;
4522
4523	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
4524	if (r != RET_PF_CONTINUE)
4525	return r;
4526
4527	r = RET_PF_RETRY;
4528	write_lock(&vcpu->kvm->mmu_lock);
4529
4530	if (is_page_fault_stale(vcpu, fault))
4531	goto out_unlock;
4532
4533	r = make_mmu_pages_available(vcpu);
4534	if (r)
4535	goto out_unlock;
4536
4537	r = direct_map(vcpu, fault);
4538
4539	out_unlock:
4540	write_unlock(&vcpu->kvm->mmu_lock);
4541	kvm_release_pfn_clean(pfn: fault->pfn);
4542	return r;
4543	}
4544
4545	static int nonpaging_page_fault(struct kvm_vcpu *vcpu,
4546	struct kvm_page_fault *fault)
4547	{
4548	/ This path builds a PAE pagetable, we can map 2mb pages at maximum. /
4549	fault->max_level = PG_LEVEL_2M;
4550	return direct_page_fault(vcpu, fault);
4551	}
4552
4553	int kvm_handle_page_fault(struct kvm_vcpu *vcpu, u64 error_code,
4554	u64 fault_address, char insn, int* insn_len)
4555	{
4556	int r = `1`;
4557	u32 flags = vcpu->arch.apf.host_apf_flags;
4558
4559	#ifndef CONFIG_X86_64
4560	/ A 64-bit CR2 should be impossible on 32-bit KVM. /
4561	if (WARN_ON_ONCE(fault_address >> `32`))
4562	return -EFAULT;
4563	#endif
4564
4565	vcpu->arch.l1tf_flush_l1d = true;
4566	if (!flags) {
4567	trace_kvm_page_fault(vcpu, fault_address, error_code);
4568
4569	if (kvm_event_needs_reinjection(vcpu))
4570	kvm_mmu_unprotect_page_virt(vcpu, gva: fault_address);
4571	r = kvm_mmu_page_fault(vcpu, cr2_or_gpa: fault_address, error_code, insn,
4572	insn_len);
4573	} else if (flags & KVM_PV_REASON_PAGE_NOT_PRESENT) {
4574	vcpu->arch.apf.host_apf_flags = `0`;
4575	local_irq_disable();
4576	kvm_async_pf_task_wait_schedule(token: fault_address);
4577	local_irq_enable();
4578	} else {
4579	WARN_ONCE(`1`, "Unexpected host async PF flags: %x\n", flags);
4580	}
4581
4582	return r;
4583	}
4584	EXPORT_SYMBOL_GPL(kvm_handle_page_fault);
4585
#ifdef CONFIG_X86_64
/*
 * Page fault handler used when the TDP MMU is enabled.  Follows the same
 * sequence as direct_page_fault(), but takes mmu_lock for read and maps the
 * fault with kvm_tdp_mmu_map().
 */
static int kvm_tdp_mmu_page_fault(struct kvm_vcpu *vcpu,
				  struct kvm_page_fault *fault)
{
	int r;

	if (page_fault_handle_page_track(vcpu, fault))
		return RET_PF_EMULATE;

	r = fast_page_fault(vcpu, fault);
	if (r != RET_PF_INVALID)
		return r;

	r = mmu_topup_memory_caches(vcpu, false);
	if (r)
		return r;

	r = kvm_faultin_pfn(vcpu, fault, ACC_ALL);
	if (r != RET_PF_CONTINUE)
		return r;

	/* Default result if the fault turns out to be stale. */
	r = RET_PF_RETRY;
	read_lock(&vcpu->kvm->mmu_lock);

	if (is_page_fault_stale(vcpu, fault))
		goto out_unlock;

	r = kvm_tdp_mmu_map(vcpu, fault);

out_unlock:
	read_unlock(&vcpu->kvm->mmu_lock);
	kvm_release_pfn_clean(fault->pfn);
	return r;
}
#endif
4621
4622	bool __kvm_mmu_honors_guest_mtrrs(bool vm_has_noncoherent_dma)
4623	{
4624	/*
4625	* If host MTRRs are ignored (shadow_memtype_mask is non-zero), and the
4626	* VM has non-coherent DMA (DMA doesn't snoop CPU caches), KVM's ABI is
4627	* to honor the memtype from the guest's MTRRs so that guest accesses
4628	* to memory that is DMA'd aren't cached against the guest's wishes.
4629	*
4630	* Note, KVM may still ultimately ignore guest MTRRs for certain PFNs,
4631	* e.g. KVM will force UC memtype for host MMIO.
4632	*/
4633	return vm_has_noncoherent_dma && shadow_memtype_mask;
4634	}
4635
4636	int kvm_tdp_page_fault(struct kvm_vcpu vcpu, struct* kvm_page_fault *fault)
4637	{
4638	/*
4639	* If the guest's MTRRs may be used to compute the "real" memtype,
4640	* restrict the mapping level to ensure KVM uses a consistent memtype
4641	* across the entire mapping.
4642	*/
4643	if (kvm_mmu_honors_guest_mtrrs(vcpu->kvm)) {
4644	for ( ; fault->max_level > PG_LEVEL_4K; --fault->max_level) {
4645	int page_num = KVM_PAGES_PER_HPAGE(fault->max_level);
4646	gfn_t base = gfn_round_for_level(gfn: fault->gfn,
4647	level: fault->max_level);
4648
4649	if (kvm_mtrr_check_gfn_range_consistency(vcpu, base, page_num))
4650	break;
4651	}
4652	}
4653
4654	#ifdef CONFIG_X86_64
4655	if (tdp_mmu_enabled)
4656	return kvm_tdp_mmu_page_fault(vcpu, fault);
4657	#endif
4658
4659	return direct_page_fault(vcpu, fault);
4660	}
4661
4662	static void nonpaging_init_context(struct kvm_mmu *context)
4663	{
4664	context->page_fault = nonpaging_page_fault;
4665	context->gva_to_gpa = nonpaging_gva_to_gpa;
4666	context->sync_spte = NULL;
4667	}
4668
4669	static inline bool is_root_usable(struct kvm_mmu_root_info *root, gpa_t pgd,
4670	union kvm_mmu_page_role role)
4671	{
4672	struct kvm_mmu_page *sp;
4673
4674	if (!VALID_PAGE(root->hpa))
4675	return false;
4676
4677	if (!role.direct && pgd != root->pgd)
4678	return false;
4679
4680	sp = root_to_sp(root: root->hpa);
4681	if (WARN_ON_ONCE(!sp))
4682	return false;
4683
4684	return role.word == sp->role.word;
4685	}
4686
4687	/*
4688	* Find out if a previously cached root matching the new pgd/role is available,
4689	* and insert the current root as the MRU in the cache.
4690	* If a matching root is found, it is assigned to kvm_mmu->root and
4691	* true is returned.
4692	* If no match is found, kvm_mmu->root is left invalid, the LRU root is
4693	* evicted to make room for the current root, and false is returned.
4694	*/
4695	static bool cached_root_find_and_keep_current(struct kvm kvm, struct* kvm_mmu *mmu,
4696	gpa_t new_pgd,
4697	union kvm_mmu_page_role new_role)
4698	{
4699	uint i;
4700
4701	if (is_root_usable(root: &mmu->root, pgd: new_pgd, role: new_role))
4702	return true;
4703
4704	for (i = `0`; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
4705	/*
4706	* The swaps end up rotating the cache like this:
4707	* C 0 1 2 3 (on entry to the function)
4708	* 0 C 1 2 3
4709	* 1 C 0 2 3
4710	* 2 C 0 1 3
4711	* 3 C 0 1 2 (on exit from the loop)
4712	*/
4713	swap(mmu->root, mmu->prev_roots[i]);
4714	if (is_root_usable(root: &mmu->root, pgd: new_pgd, role: new_role))
4715	return true;
4716	}
4717
4718	kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4719	return false;
4720	}
4721
4722	/*
4723	* Find out if a previously cached root matching the new pgd/role is available.
4724	* On entry, mmu->root is invalid.
4725	* If a matching root is found, it is assigned to kvm_mmu->root, the LRU entry
4726	* of the cache becomes invalid, and true is returned.
4727	* If no match is found, kvm_mmu->root is left invalid and false is returned.
4728	*/
4729	static bool cached_root_find_without_current(struct kvm kvm, struct* kvm_mmu *mmu,
4730	gpa_t new_pgd,
4731	union kvm_mmu_page_role new_role)
4732	{
4733	uint i;
4734
4735	for (i = `0`; i < KVM_MMU_NUM_PREV_ROOTS; i++)
4736	if (is_root_usable(root: &mmu->prev_roots[i], pgd: new_pgd, role: new_role))
4737	goto hit;
4738
4739	return false;
4740
4741	hit:
4742	swap(mmu->root, mmu->prev_roots[i]);
4743	/ Bubble up the remaining roots. /
4744	for (; i < KVM_MMU_NUM_PREV_ROOTS - `1`; i++)
4745	mmu->prev_roots[i] = mmu->prev_roots[i + `1`];
4746	mmu->prev_roots[i].hpa = INVALID_PAGE;
4747	return true;
4748	}
4749
4750	static bool fast_pgd_switch(struct kvm kvm, struct* kvm_mmu *mmu,
4751	gpa_t new_pgd, union kvm_mmu_page_role new_role)
4752	{
4753	/*
4754	* Limit reuse to 64-bit hosts+VMs without "special" roots in order to
4755	* avoid having to deal with PDPTEs and other complexities.
4756	*/
4757	if (VALID_PAGE(mmu->root.hpa) && !root_to_sp(root: mmu->root.hpa))
4758	kvm_mmu_free_roots(kvm, mmu, KVM_MMU_ROOT_CURRENT);
4759
4760	if (VALID_PAGE(mmu->root.hpa))
4761	return cached_root_find_and_keep_current(kvm, mmu, new_pgd, new_role);
4762	else
4763	return cached_root_find_without_current(kvm, mmu, new_pgd, new_role);
4764	}
4765
4766	void kvm_mmu_new_pgd(struct kvm_vcpu *vcpu, gpa_t new_pgd)
4767	{
4768	struct kvm_mmu *mmu = vcpu->arch.mmu;
4769	union kvm_mmu_page_role new_role = mmu->root_role;
4770
4771	/*
4772	* Return immediately if no usable root was found, kvm_mmu_reload()
4773	* will establish a valid root prior to the next VM-Enter.
4774	*/
4775	if (!fast_pgd_switch(kvm: vcpu->kvm, mmu, new_pgd, new_role))
4776	return;
4777
4778	/*
4779	* It's possible that the cached previous root page is obsolete because
4780	* of a change in the MMU generation number. However, changing the
4781	* generation number is accompanied by KVM_REQ_MMU_FREE_OBSOLETE_ROOTS,
4782	* which will free the root set here and allocate a new one.
4783	*/
4784	kvm_make_request(KVM_REQ_LOAD_MMU_PGD, vcpu);
4785
4786	if (force_flush_and_sync_on_reuse) {
4787	kvm_make_request(KVM_REQ_MMU_SYNC, vcpu);
4788	kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, vcpu);
4789	}
4790
4791	/*
4792	* The last MMIO access's GVA and GPA are cached in the VCPU. When
4793	* switching to a new CR3, that GVA->GPA mapping may no longer be
4794	* valid. So clear any cached MMIO info even when we don't need to sync
4795	* the shadow page tables.
4796	*/
4797	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
4798
4799	/*
4800	* If this is a direct root page, it doesn't have a write flooding
4801	* count. Otherwise, clear the write flooding count.
4802	*/
4803	if (!new_role.direct) {
4804	struct kvm_mmu_page *sp = root_to_sp(root: vcpu->arch.mmu->root.hpa);
4805
4806	if (!WARN_ON_ONCE(!sp))
4807	__clear_sp_write_flooding_count(sp);
4808	}
4809	}
4810	EXPORT_SYMBOL_GPL(kvm_mmu_new_pgd);
4811
4812	static bool sync_mmio_spte(struct kvm_vcpu vcpu, u64 sptep, gfn_t gfn,
4813	unsigned int access)
4814	{
4815	if (unlikely(is_mmio_spte(*sptep))) {
4816	if (gfn != get_mmio_spte_gfn(spte: *sptep)) {
4817	mmu_spte_clear_no_track(sptep);
4818	return true;
4819	}
4820
4821	mark_mmio_spte(vcpu, sptep, gfn, access);
4822	return true;
4823	}
4824
4825	return false;
4826	}
4827
4828	#define PTTYPE_EPT 18 /* arbitrary */
4829	#define PTTYPE PTTYPE_EPT
4830	#include "paging_tmpl.h"
4831	#undef PTTYPE
4832
4833	#define PTTYPE 64
4834	#include "paging_tmpl.h"
4835	#undef PTTYPE
4836
4837	#define PTTYPE 32
4838	#include "paging_tmpl.h"
4839	#undef PTTYPE
4840
4841	static void __reset_rsvds_bits_mask(struct rsvd_bits_validate *rsvd_check,
4842	u64 pa_bits_rsvd, int level, bool nx,
4843	bool gbpages, bool pse, bool amd)
4844	{
4845	u64 gbpages_bit_rsvd = `0`;
4846	u64 nonleaf_bit8_rsvd = `0`;
4847	u64 high_bits_rsvd;
4848
4849	rsvd_check->bad_mt_xwr = `0`;
4850
4851	if (!gbpages)
4852	gbpages_bit_rsvd = rsvd_bits(`7`, `7`);
4853
4854	if (level == PT32E_ROOT_LEVEL)
4855	high_bits_rsvd = pa_bits_rsvd & rsvd_bits(`0`, `62`);
4856	else
4857	high_bits_rsvd = pa_bits_rsvd & rsvd_bits(`0`, `51`);
4858
4859	/ Note, NX doesn't exist in PDPTEs, this is handled below. /
4860	if (!nx)
4861	high_bits_rsvd \|= rsvd_bits(`63`, `63`);
4862
4863	/*
4864	* Non-leaf PML4Es and PDPEs reserve bit 8 (which would be the G bit for
4865	* leaf entries) on AMD CPUs only.
4866	*/
4867	if (amd)
4868	nonleaf_bit8_rsvd = rsvd_bits(`8`, `8`);
4869
4870	switch (level) {
4871	case PT32_ROOT_LEVEL:
4872	/ no rsvd bits for 2 level 4K page table entries /
4873	rsvd_check->rsvd_bits_mask[`0`][`1`] = `0`;
4874	rsvd_check->rsvd_bits_mask[`0`][`0`] = `0`;
4875	rsvd_check->rsvd_bits_mask[`1`][`0`] =
4876	rsvd_check->rsvd_bits_mask[`0`][`0`];
4877
4878	if (!pse) {
4879	rsvd_check->rsvd_bits_mask[`1`][`1`] = `0`;
4880	break;
4881	}
4882
4883	if (is_cpuid_PSE36())
4884	/ 36bits PSE 4MB page /
4885	rsvd_check->rsvd_bits_mask[`1`][`1`] = rsvd_bits(`17`, `21`);
4886	else
4887	/ 32 bits PSE 4MB page /
4888	rsvd_check->rsvd_bits_mask[`1`][`1`] = rsvd_bits(`13`, `21`);
4889	break;
4890	case PT32E_ROOT_LEVEL:
4891	rsvd_check->rsvd_bits_mask[`0`][`2`] = rsvd_bits(`63`, `63`) \|
4892	high_bits_rsvd \|
4893	rsvd_bits(`5`, `8`) \|
4894	rsvd_bits(`1`, `2`); / PDPTE /
4895	rsvd_check->rsvd_bits_mask[`0`][`1`] = high_bits_rsvd; / PDE /
4896	rsvd_check->rsvd_bits_mask[`0`][`0`] = high_bits_rsvd; / PTE /
4897	rsvd_check->rsvd_bits_mask[`1`][`1`] = high_bits_rsvd \|
4898	rsvd_bits(`13`, `20`); / large page /
4899	rsvd_check->rsvd_bits_mask[`1`][`0`] =
4900	rsvd_check->rsvd_bits_mask[`0`][`0`];
4901	break;
4902	case PT64_ROOT_5LEVEL:
4903	rsvd_check->rsvd_bits_mask[`0`][`4`] = high_bits_rsvd \|
4904	nonleaf_bit8_rsvd \|
4905	rsvd_bits(`7`, `7`);
4906	rsvd_check->rsvd_bits_mask[`1`][`4`] =
4907	rsvd_check->rsvd_bits_mask[`0`][`4`];
4908	fallthrough;
4909	case PT64_ROOT_4LEVEL:
4910	rsvd_check->rsvd_bits_mask[`0`][`3`] = high_bits_rsvd \|
4911	nonleaf_bit8_rsvd \|
4912	rsvd_bits(`7`, `7`);
4913	rsvd_check->rsvd_bits_mask[`0`][`2`] = high_bits_rsvd \|
4914	gbpages_bit_rsvd;
4915	rsvd_check->rsvd_bits_mask[`0`][`1`] = high_bits_rsvd;
4916	rsvd_check->rsvd_bits_mask[`0`][`0`] = high_bits_rsvd;
4917	rsvd_check->rsvd_bits_mask[`1`][`3`] =
4918	rsvd_check->rsvd_bits_mask[`0`][`3`];
4919	rsvd_check->rsvd_bits_mask[`1`][`2`] = high_bits_rsvd \|
4920	gbpages_bit_rsvd \|
4921	rsvd_bits(`13`, `29`);
4922	rsvd_check->rsvd_bits_mask[`1`][`1`] = high_bits_rsvd \|
4923	rsvd_bits(`13`, `20`); / large page /
4924	rsvd_check->rsvd_bits_mask[`1`][`0`] =
4925	rsvd_check->rsvd_bits_mask[`0`][`0`];
4926	break;
4927	}
4928	}
4929
4930	static void reset_guest_rsvds_bits_mask(struct kvm_vcpu *vcpu,
4931	struct kvm_mmu *context)
4932	{
4933	__reset_rsvds_bits_mask(rsvd_check: &context->guest_rsvd_check,
4934	pa_bits_rsvd: vcpu->arch.reserved_gpa_bits,
4935	level: context->cpu_role.base.level, nx: is_efer_nx(mmu: context),
4936	gbpages: guest_can_use(vcpu, X86_FEATURE_GBPAGES),
4937	pse: is_cr4_pse(mmu: context),
4938	amd: guest_cpuid_is_amd_compatible(vcpu));
4939	}
4940
4941	static void __reset_rsvds_bits_mask_ept(struct rsvd_bits_validate *rsvd_check,
4942	u64 pa_bits_rsvd, bool execonly,
4943	int huge_page_level)
4944	{
4945	u64 high_bits_rsvd = pa_bits_rsvd & rsvd_bits(`0`, `51`);
4946	u64 large_1g_rsvd = `0`, large_2m_rsvd = `0`;
4947	u64 bad_mt_xwr;
4948
4949	if (huge_page_level < PG_LEVEL_1G)
4950	large_1g_rsvd = rsvd_bits(`7`, `7`);
4951	if (huge_page_level < PG_LEVEL_2M)
4952	large_2m_rsvd = rsvd_bits(`7`, `7`);
4953
4954	rsvd_check->rsvd_bits_mask[`0`][`4`] = high_bits_rsvd \| rsvd_bits(`3`, `7`);
4955	rsvd_check->rsvd_bits_mask[`0`][`3`] = high_bits_rsvd \| rsvd_bits(`3`, `7`);
4956	rsvd_check->rsvd_bits_mask[`0`][`2`] = high_bits_rsvd \| rsvd_bits(`3`, `6`) \| large_1g_rsvd;
4957	rsvd_check->rsvd_bits_mask[`0`][`1`] = high_bits_rsvd \| rsvd_bits(`3`, `6`) \| large_2m_rsvd;
4958	rsvd_check->rsvd_bits_mask[`0`][`0`] = high_bits_rsvd;
4959
4960	/ large page /
4961	rsvd_check->rsvd_bits_mask[`1`][`4`] = rsvd_check->rsvd_bits_mask[`0`][`4`];
4962	rsvd_check->rsvd_bits_mask[`1`][`3`] = rsvd_check->rsvd_bits_mask[`0`][`3`];
4963	rsvd_check->rsvd_bits_mask[`1`][`2`] = high_bits_rsvd \| rsvd_bits(`12`, `29`) \| large_1g_rsvd;
4964	rsvd_check->rsvd_bits_mask[`1`][`1`] = high_bits_rsvd \| rsvd_bits(`12`, `20`) \| large_2m_rsvd;
4965	rsvd_check->rsvd_bits_mask[`1`][`0`] = rsvd_check->rsvd_bits_mask[`0`][`0`];
4966
4967	bad_mt_xwr = `0xFFull` << (`2` * `8`); / bits 3..5 must not be 2 /
4968	bad_mt_xwr \|= `0xFFull` << (`3` * `8`); / bits 3..5 must not be 3 /
4969	bad_mt_xwr \|= `0xFFull` << (`7` * `8`); / bits 3..5 must not be 7 /
4970	bad_mt_xwr \|= REPEAT_BYTE(`1ull` << `2`); / bits 0..2 must not be 010 /
4971	bad_mt_xwr \|= REPEAT_BYTE(`1ull` << `6`); / bits 0..2 must not be 110 /
4972	if (!execonly) {
4973	/ bits 0..2 must not be 100 unless VMX capabilities allow it /
4974	bad_mt_xwr \|= REPEAT_BYTE(`1ull` << `4`);
4975	}
4976	rsvd_check->bad_mt_xwr = bad_mt_xwr;
4977	}
4978
4979	static void reset_rsvds_bits_mask_ept(struct kvm_vcpu *vcpu,
4980	struct kvm_mmu context, bool execonly, int* huge_page_level)
4981	{
4982	__reset_rsvds_bits_mask_ept(rsvd_check: &context->guest_rsvd_check,
4983	pa_bits_rsvd: vcpu->arch.reserved_gpa_bits, execonly,
4984	huge_page_level);
4985	}
4986
4987	static inline u64 reserved_hpa_bits(void)
4988	{
4989	return rsvd_bits(shadow_phys_bits, `63`);
4990	}
4991
4992	/*
4993	* the page table on host is the shadow page table for the page
4994	* table in guest or amd nested guest, its mmu features completely
4995	* follow the features in guest.
4996	*/
4997	static void reset_shadow_zero_bits_mask(struct kvm_vcpu *vcpu,
4998	struct kvm_mmu *context)
4999	{
5000	/ @amd adds a check on bit of SPTEs, which KVM shouldn't use anyways. /
5001	bool is_amd = true;
5002	/ KVM doesn't use 2-level page tables for the shadow MMU. /
5003	bool is_pse = false;
5004	struct rsvd_bits_validate *shadow_zero_check;
5005	int i;
5006
5007	WARN_ON_ONCE(context->root_role.level < PT32E_ROOT_LEVEL);
5008
5009	shadow_zero_check = &context->shadow_zero_check;
5010	__reset_rsvds_bits_mask(rsvd_check: shadow_zero_check, pa_bits_rsvd: reserved_hpa_bits(),
5011	level: context->root_role.level,
5012	nx: context->root_role.efer_nx,
5013	gbpages: guest_can_use(vcpu, X86_FEATURE_GBPAGES),
5014	pse: is_pse, amd: is_amd);
5015
5016	if (!shadow_me_mask)
5017	return;
5018
5019	for (i = context->root_role.level; --i >= `0`;) {
5020	/*
5021	* So far shadow_me_value is a constant during KVM's life
5022	* time. Bits in shadow_me_value are allowed to be set.
5023	* Bits in shadow_me_mask but not in shadow_me_value are
5024	* not allowed to be set.
5025	*/
5026	shadow_zero_check->rsvd_bits_mask[`0`][i] \|= shadow_me_mask;
5027	shadow_zero_check->rsvd_bits_mask[`1`][i] \|= shadow_me_mask;
5028	shadow_zero_check->rsvd_bits_mask[`0`][i] &= ~shadow_me_value;
5029	shadow_zero_check->rsvd_bits_mask[`1`][i] &= ~shadow_me_value;
5030	}
5031
5032	}
5033
5034	static inline bool boot_cpu_is_amd(void)
5035	{
5036	WARN_ON_ONCE(!tdp_enabled);
5037	return shadow_x_mask == `0`;
5038	}
5039
5040	/*
5041	* the direct page table on host, use as much mmu features as
5042	* possible, however, kvm currently does not do execution-protection.
5043	*/
5044	static void reset_tdp_shadow_zero_bits_mask(struct kvm_mmu *context)
5045	{
5046	struct rsvd_bits_validate *shadow_zero_check;
5047	int i;
5048
5049	shadow_zero_check = &context->shadow_zero_check;
5050
5051	if (boot_cpu_is_amd())
5052	__reset_rsvds_bits_mask(rsvd_check: shadow_zero_check, pa_bits_rsvd: reserved_hpa_bits(),
5053	level: context->root_role.level, nx: true,
5054	boot_cpu_has(X86_FEATURE_GBPAGES),
5055	pse: false, amd: true);
5056	else
5057	__reset_rsvds_bits_mask_ept(rsvd_check: shadow_zero_check,
5058	pa_bits_rsvd: reserved_hpa_bits(), execonly: false,
5059	huge_page_level: max_huge_page_level);
5060
5061	if (!shadow_me_mask)
5062	return;
5063
5064	for (i = context->root_role.level; --i >= `0`;) {
5065	shadow_zero_check->rsvd_bits_mask[`0`][i] &= ~shadow_me_mask;
5066	shadow_zero_check->rsvd_bits_mask[`1`][i] &= ~shadow_me_mask;
5067	}
5068	}
5069
5070	/*
5071	* as the comments in reset_shadow_zero_bits_mask() except it
5072	* is the shadow page table for intel nested guest.
5073	*/
5074	static void
5075	reset_ept_shadow_zero_bits_mask(struct kvm_mmu *context, bool execonly)
5076	{
5077	__reset_rsvds_bits_mask_ept(rsvd_check: &context->shadow_zero_check,
5078	pa_bits_rsvd: reserved_hpa_bits(), execonly,
5079	huge_page_level: max_huge_page_level);
5080	}
5081
/*
 * Build an 8-bit map over the access combinations 0..7: bit N of the result
 * is set iff (N & access) is non-zero.  Index 0 never matches, so bit 0 is
 * always clear.  E.g. BYTE_MASK(1) == 0xAA, BYTE_MASK(2) == 0xCC.
 */
#define BYTE_MASK(access) \
	(((1 & (access)) ? 2 : 0) | \
	 ((2 & (access)) ? 4 : 0) | \
	 ((3 & (access)) ? 8 : 0) | \
	 ((4 & (access)) ? 16 : 0) | \
	 ((5 & (access)) ? 32 : 0) | \
	 ((6 & (access)) ? 64 : 0) | \
	 ((7 & (access)) ? 128 : 0))
5090
5091
5092	static void update_permission_bitmask(struct kvm_mmu *mmu, bool ept)
5093	{
5094	unsigned byte;
5095
5096	const u8 x = BYTE_MASK(ACC_EXEC_MASK);
5097	const u8 w = BYTE_MASK(ACC_WRITE_MASK);
5098	const u8 u = BYTE_MASK(ACC_USER_MASK);
5099
5100	bool cr4_smep = is_cr4_smep(mmu);
5101	bool cr4_smap = is_cr4_smap(mmu);
5102	bool cr0_wp = is_cr0_wp(mmu);
5103	bool efer_nx = is_efer_nx(mmu);
5104
5105	for (byte = `0`; byte < ARRAY_SIZE(mmu->permissions); ++byte) {
5106	unsigned pfec = byte << `1`;
5107
5108	/*
5109	* Each "*f" variable has a 1 bit for each UWX value
5110	* that causes a fault with the given PFEC.
5111	*/
5112
5113	/ Faults from writes to non-writable pages /
5114	u8 wf = (pfec & PFERR_WRITE_MASK) ? (u8)~w : `0`;
5115	/ Faults from user mode accesses to supervisor pages /
5116	u8 uf = (pfec & PFERR_USER_MASK) ? (u8)~u : `0`;
5117	/ Faults from fetches of non-executable pages/
5118	u8 ff = (pfec & PFERR_FETCH_MASK) ? (u8)~x : `0`;
5119	/ Faults from kernel mode fetches of user pages /
5120	u8 smepf = `0`;
5121	/ Faults from kernel mode accesses of user pages /
5122	u8 smapf = `0`;
5123
5124	if (!ept) {
5125	/ Faults from kernel mode accesses to user pages /
5126	u8 kf = (pfec & PFERR_USER_MASK) ? `0` : u;
5127
5128	/ Not really needed: !nx will cause pte.nx to fault /
5129	if (!efer_nx)
5130	ff = `0`;
5131
5132	/ Allow supervisor writes if !cr0.wp /
5133	if (!cr0_wp)
5134	wf = (pfec & PFERR_USER_MASK) ? wf : `0`;
5135
5136	/ Disallow supervisor fetches of user code if cr4.smep /
5137	if (cr4_smep)
5138	smepf = (pfec & PFERR_FETCH_MASK) ? kf : `0`;
5139
5140	/*
5141	* SMAP:kernel-mode data accesses from user-mode
5142	* mappings should fault. A fault is considered
5143	* as a SMAP violation if all of the following
5144	* conditions are true:
5145	* - X86_CR4_SMAP is set in CR4
5146	* - A user page is accessed
5147	* - The access is not a fetch
5148	* - The access is supervisor mode
5149	* - If implicit supervisor access or X86_EFLAGS_AC is clear
5150	*
5151	* Here, we cover the first four conditions.
5152	* The fifth is computed dynamically in permission_fault();
5153	* PFERR_RSVD_MASK bit will be set in PFEC if the access is
5154	* not subject to SMAP restrictions.
5155	*/
5156	if (cr4_smap)
5157	smapf = (pfec & (PFERR_RSVD_MASK\|PFERR_FETCH_MASK)) ? `0` : kf;
5158	}
5159
5160	mmu->permissions[byte] = ff \| uf \| wf \| smepf \| smapf;
5161	}
5162	}
5163
5164	/*
5165	* PKU is an additional mechanism by which the paging controls access to
5166	* user-mode addresses based on the value in the PKRU register. Protection
5167	* key violations are reported through a bit in the page fault error code.
5168	* Unlike other bits of the error code, the PK bit is not known at the
5169	* call site of e.g. gva_to_gpa; it must be computed directly in
5170	* permission_fault based on two bits of PKRU, on some machine state (CR4,
5171	* CR0, EFER, CPL), and on other bits of the error code and the page tables.
5172	*
5173	* In particular the following conditions come from the error code, the
5174	* page tables and the machine state:
5175	* - PK is always zero unless CR4.PKE=1 and EFER.LMA=1
5176	* - PK is always zero if RSVD=1 (reserved bit set) or F=1 (instruction fetch)
5177	* - PK is always zero if U=0 in the page tables
5178	* - PKRU.WD is ignored if CR0.WP=0 and the access is a supervisor access.
5179	*
5180	* The PKRU bitmask caches the result of these four conditions. The error
5181	* code (minus the P bit) and the page table's U bit form an index into the
5182	* PKRU bitmask. Two bits of the PKRU bitmask are then extracted and ANDed
5183	* with the two bits of the PKRU register corresponding to the protection key.
5184	* For the first three conditions above the bits will be 00, thus masking
5185	* away both AD and WD. For all reads or if the last condition holds, WD
5186	* only will be masked away.
5187	*/
5188	static void update_pkru_bitmask(struct kvm_mmu *mmu)
5189	{
5190	unsigned bit;
5191	bool wp;
5192
5193	mmu->pkru_mask = `0`;
5194
5195	if (!is_cr4_pke(mmu))
5196	return;
5197
5198	wp = is_cr0_wp(mmu);
5199
5200	for (bit = `0`; bit < ARRAY_SIZE(mmu->permissions); ++bit) {
5201	unsigned pfec, pkey_bits;
5202	bool check_pkey, check_write, ff, uf, wf, pte_user;
5203
5204	pfec = bit << `1`;
5205	ff = pfec & PFERR_FETCH_MASK;
5206	uf = pfec & PFERR_USER_MASK;
5207	wf = pfec & PFERR_WRITE_MASK;
5208
5209	/ PFEC.RSVD is replaced by ACC_USER_MASK. /
5210	pte_user = pfec & PFERR_RSVD_MASK;
5211
5212	/*
5213	* Only need to check the access which is not an
5214	* instruction fetch and is to a user page.
5215	*/
5216	check_pkey = (!ff && pte_user);
5217	/*
5218	* write access is controlled by PKRU if it is a
5219	* user access or CR0.WP = 1.
5220	*/
5221	check_write = check_pkey && wf && (uf \|\| wp);
5222
5223	/ PKRU.AD stops both read and write access. /
5224	pkey_bits = !!check_pkey;
5225	/ PKRU.WD stops write access. /
5226	pkey_bits \|= (!!check_write) << `1`;
5227
5228	mmu->pkru_mask \|= (pkey_bits & `3`) << pfec;
5229	}
5230	}
5231
5232	static void reset_guest_paging_metadata(struct kvm_vcpu *vcpu,
5233	struct kvm_mmu *mmu)
5234	{
5235	if (!is_cr0_pg(mmu))
5236	return;
5237
5238	reset_guest_rsvds_bits_mask(vcpu, context: mmu);
5239	update_permission_bitmask(mmu, ept: false);
5240	update_pkru_bitmask(mmu);
5241	}
5242
5243	static void paging64_init_context(struct kvm_mmu *context)
5244	{
5245	context->page_fault = paging64_page_fault;
5246	context->gva_to_gpa = paging64_gva_to_gpa;
5247	context->sync_spte = paging64_sync_spte;
5248	}
5249
5250	static void paging32_init_context(struct kvm_mmu *context)
5251	{
5252	context->page_fault = paging32_page_fault;
5253	context->gva_to_gpa = paging32_gva_to_gpa;
5254	context->sync_spte = paging32_sync_spte;
5255	}
5256
5257	static union kvm_cpu_role kvm_calc_cpu_role(struct kvm_vcpu *vcpu,
5258	const struct kvm_mmu_role_regs *regs)
5259	{
5260	union kvm_cpu_role role = {`0`};
5261
5262	role.base.access = ACC_ALL;
5263	role.base.smm = is_smm(vcpu);
5264	role.base.guest_mode = is_guest_mode(vcpu);
5265	role.ext.valid = `1`;
5266
5267	if (!____is_cr0_pg(regs)) {
5268	role.base.direct = `1`;
5269	return role;
5270	}
5271
5272	role.base.efer_nx = ____is_efer_nx(regs);
5273	role.base.cr0_wp = ____is_cr0_wp(regs);
5274	role.base.smep_andnot_wp = ____is_cr4_smep(regs) && !____is_cr0_wp(regs);
5275	role.base.smap_andnot_wp = ____is_cr4_smap(regs) && !____is_cr0_wp(regs);
5276	role.base.has_4_byte_gpte = !____is_cr4_pae(regs);
5277
5278	if (____is_efer_lma(regs))
5279	role.base.level = ____is_cr4_la57(regs) ? PT64_ROOT_5LEVEL
5280	: PT64_ROOT_4LEVEL;
5281	else if (____is_cr4_pae(regs))
5282	role.base.level = PT32E_ROOT_LEVEL;
5283	else
5284	role.base.level = PT32_ROOT_LEVEL;
5285
5286	role.ext.cr4_smep = ____is_cr4_smep(regs);
5287	role.ext.cr4_smap = ____is_cr4_smap(regs);
5288	role.ext.cr4_pse = ____is_cr4_pse(regs);
5289
5290	/ PKEY and LA57 are active iff long mode is active. /
5291	role.ext.cr4_pke = ____is_efer_lma(regs) && ____is_cr4_pke(regs);
5292	role.ext.cr4_la57 = ____is_efer_lma(regs) && ____is_cr4_la57(regs);
5293	role.ext.efer_lma = ____is_efer_lma(regs);
5294	return role;
5295	}
5296
5297	void __kvm_mmu_refresh_passthrough_bits(struct kvm_vcpu *vcpu,
5298	struct kvm_mmu *mmu)
5299	{
5300	const bool cr0_wp = kvm_is_cr0_bit_set(vcpu, X86_CR0_WP);
5301
5302	BUILD_BUG_ON((KVM_MMU_CR0_ROLE_BITS & KVM_POSSIBLE_CR0_GUEST_BITS) != X86_CR0_WP);
5303	BUILD_BUG_ON((KVM_MMU_CR4_ROLE_BITS & KVM_POSSIBLE_CR4_GUEST_BITS));
5304
5305	if (is_cr0_wp(mmu) == cr0_wp)
5306	return;
5307
5308	mmu->cpu_role.base.cr0_wp = cr0_wp;
5309	reset_guest_paging_metadata(vcpu, mmu);
5310	}
5311
5312	static inline int kvm_mmu_get_tdp_level(struct kvm_vcpu *vcpu)
5313	{
5314	/ tdp_root_level is architecture forced level, use it if nonzero /
5315	if (tdp_root_level)
5316	return tdp_root_level;
5317
5318	/ Use 5-level TDP if and only if it's useful/necessary. /
5319	if (max_tdp_level == `5` && cpuid_maxphyaddr(vcpu) <= `48`)
5320	return `4`;
5321
5322	return max_tdp_level;
5323	}
5324
5325	static union kvm_mmu_page_role
5326	kvm_calc_tdp_mmu_root_page_role(struct kvm_vcpu *vcpu,
5327	union kvm_cpu_role cpu_role)
5328	{
5329	union kvm_mmu_page_role role = {`0`};
5330
5331	role.access = ACC_ALL;
5332	role.cr0_wp = true;
5333	role.efer_nx = true;
5334	role.smm = cpu_role.base.smm;
5335	role.guest_mode = cpu_role.base.guest_mode;
5336	role.ad_disabled = !kvm_ad_enabled();
5337	role.level = kvm_mmu_get_tdp_level(vcpu);
5338	role.direct = true;
5339	role.has_4_byte_gpte = false;
5340
5341	return role;
5342	}
5343
5344	static void init_kvm_tdp_mmu(struct kvm_vcpu *vcpu,
5345	union kvm_cpu_role cpu_role)
5346	{
5347	struct kvm_mmu *context = &vcpu->arch.root_mmu;
5348	union kvm_mmu_page_role root_role = kvm_calc_tdp_mmu_root_page_role(vcpu, cpu_role);
5349
5350	if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
5351	root_role.word == context->root_role.word)
5352	return;
5353
5354	context->cpu_role.as_u64 = cpu_role.as_u64;
5355	context->root_role.word = root_role.word;
5356	context->page_fault = kvm_tdp_page_fault;
5357	context->sync_spte = NULL;
5358	context->get_guest_pgd = get_guest_cr3;
5359	context->get_pdptr = kvm_pdptr_read;
5360	context->inject_page_fault = kvm_inject_page_fault;
5361
5362	if (!is_cr0_pg(mmu: context))
5363	context->gva_to_gpa = nonpaging_gva_to_gpa;
5364	else if (is_cr4_pae(mmu: context))
5365	context->gva_to_gpa = paging64_gva_to_gpa;
5366	else
5367	context->gva_to_gpa = paging32_gva_to_gpa;
5368
5369	reset_guest_paging_metadata(vcpu, mmu: context);
5370	reset_tdp_shadow_zero_bits_mask(context);
5371	}
5372
5373	static void shadow_mmu_init_context(struct kvm_vcpu vcpu, struct* kvm_mmu *context,
5374	union kvm_cpu_role cpu_role,
5375	union kvm_mmu_page_role root_role)
5376	{
5377	if (cpu_role.as_u64 == context->cpu_role.as_u64 &&
5378	root_role.word == context->root_role.word)
5379	return;
5380
5381	context->cpu_role.as_u64 = cpu_role.as_u64;
5382	context->root_role.word = root_role.word;
5383
5384	if (!is_cr0_pg(mmu: context))
5385	nonpaging_init_context(context);
5386	else if (is_cr4_pae(mmu: context))
5387	paging64_init_context(context);
5388	else
5389	paging32_init_context(context);
5390
5391	reset_guest_paging_metadata(vcpu, mmu: context);
5392	reset_shadow_zero_bits_mask(vcpu, context);
5393	}
5394
5395	static void kvm_init_shadow_mmu(struct kvm_vcpu *vcpu,
5396	union kvm_cpu_role cpu_role)
5397	{
5398	struct kvm_mmu *context = &vcpu->arch.root_mmu;
5399	union kvm_mmu_page_role root_role;
5400
5401	root_role = cpu_role.base;
5402
5403	/ KVM uses PAE paging whenever the guest isn't using 64-bit paging. /
5404	root_role.level = max_t(u32, root_role.level, PT32E_ROOT_LEVEL);
5405
5406	/*
5407	* KVM forces EFER.NX=1 when TDP is disabled, reflect it in the MMU role.
5408	* KVM uses NX when TDP is disabled to handle a variety of scenarios,
5409	* notably for huge SPTEs if iTLB multi-hit mitigation is enabled and
5410	* to generate correct permissions for CR0.WP=0/CR4.SMEP=1/EFER.NX=0.
5411	* The iTLB multi-hit workaround can be toggled at any time, so assume
5412	* NX can be used by any non-nested shadow MMU to avoid having to reset
5413	* MMU contexts.
5414	*/
5415	root_role.efer_nx = true;
5416
5417	shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
5418	}
5419
5420	void kvm_init_shadow_npt_mmu(struct kvm_vcpu vcpu, unsigned* long cr0,
5421	unsigned long cr4, u64 efer, gpa_t nested_cr3)
5422	{
5423	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
5424	struct kvm_mmu_role_regs regs = {
5425	.cr0 = cr0,
5426	.cr4 = cr4 & ~X86_CR4_PKE,
5427	.efer = efer,
5428	};
5429	union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, regs: &regs);
5430	union kvm_mmu_page_role root_role;
5431
5432	/ NPT requires CR0.PG=1. /
5433	WARN_ON_ONCE(cpu_role.base.direct);
5434
5435	root_role = cpu_role.base;
5436	root_role.level = kvm_mmu_get_tdp_level(vcpu);
5437	if (root_role.level == PT64_ROOT_5LEVEL &&
5438	cpu_role.base.level == PT64_ROOT_4LEVEL)
5439	root_role.passthrough = `1`;
5440
5441	shadow_mmu_init_context(vcpu, context, cpu_role, root_role);
5442	kvm_mmu_new_pgd(vcpu, nested_cr3);
5443	}
5444	EXPORT_SYMBOL_GPL(kvm_init_shadow_npt_mmu);
5445
5446	static union kvm_cpu_role
5447	kvm_calc_shadow_ept_root_page_role(struct kvm_vcpu *vcpu, bool accessed_dirty,
5448	bool execonly, u8 level)
5449	{
5450	union kvm_cpu_role role = {`0`};
5451
5452	/*
5453	* KVM does not support SMM transfer monitors, and consequently does not
5454	* support the "entry to SMM" control either. role.base.smm is always 0.
5455	*/
5456	WARN_ON_ONCE(is_smm(vcpu));
5457	role.base.level = level;
5458	role.base.has_4_byte_gpte = false;
5459	role.base.direct = false;
5460	role.base.ad_disabled = !accessed_dirty;
5461	role.base.guest_mode = true;
5462	role.base.access = ACC_ALL;
5463
5464	role.ext.word = `0`;
5465	role.ext.execonly = execonly;
5466	role.ext.valid = `1`;
5467
5468	return role;
5469	}
5470
5471	void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly,
5472	int huge_page_level, bool accessed_dirty,
5473	gpa_t new_eptp)
5474	{
5475	struct kvm_mmu *context = &vcpu->arch.guest_mmu;
5476	u8 level = vmx_eptp_page_walk_level(eptp: new_eptp);
5477	union kvm_cpu_role new_mode =
5478	kvm_calc_shadow_ept_root_page_role(vcpu, accessed_dirty,
5479	execonly, level);
5480
5481	if (new_mode.as_u64 != context->cpu_role.as_u64) {
5482	/ EPT, and thus nested EPT, does not consume CR0, CR4, nor EFER. /
5483	context->cpu_role.as_u64 = new_mode.as_u64;
5484	context->root_role.word = new_mode.base.word;
5485
5486	context->page_fault = ept_page_fault;
5487	context->gva_to_gpa = ept_gva_to_gpa;
5488	context->sync_spte = ept_sync_spte;
5489
5490	update_permission_bitmask(mmu: context, ept: true);
5491	context->pkru_mask = `0`;
5492	reset_rsvds_bits_mask_ept(vcpu, context, execonly, huge_page_level);
5493	reset_ept_shadow_zero_bits_mask(context, execonly);
5494	}
5495
5496	kvm_mmu_new_pgd(vcpu, new_eptp);
5497	}
5498	EXPORT_SYMBOL_GPL(kvm_init_shadow_ept_mmu);
5499
5500	static void init_kvm_softmmu(struct kvm_vcpu *vcpu,
5501	union kvm_cpu_role cpu_role)
5502	{
5503	struct kvm_mmu *context = &vcpu->arch.root_mmu;
5504
5505	kvm_init_shadow_mmu(vcpu, cpu_role);
5506
5507	context->get_guest_pgd = get_guest_cr3;
5508	context->get_pdptr = kvm_pdptr_read;
5509	context->inject_page_fault = kvm_inject_page_fault;
5510	}
5511
5512	static void init_kvm_nested_mmu(struct kvm_vcpu *vcpu,
5513	union kvm_cpu_role new_mode)
5514	{
5515	struct kvm_mmu *g_context = &vcpu->arch.nested_mmu;
5516
5517	if (new_mode.as_u64 == g_context->cpu_role.as_u64)
5518	return;
5519
5520	g_context->cpu_role.as_u64 = new_mode.as_u64;
5521	g_context->get_guest_pgd = get_guest_cr3;
5522	g_context->get_pdptr = kvm_pdptr_read;
5523	g_context->inject_page_fault = kvm_inject_page_fault;
5524
5525	/*
5526	* L2 page tables are never shadowed, so there is no need to sync
5527	* SPTEs.
5528	*/
5529	g_context->sync_spte = NULL;
5530
5531	/*
5532	* Note that arch.mmu->gva_to_gpa translates l2_gpa to l1_gpa using
5533	* L1's nested page tables (e.g. EPT12). The nested translation
5534	* of l2_gva to l1_gpa is done by arch.nested_mmu.gva_to_gpa using
5535	* L2's page tables as the first level of translation and L1's
5536	* nested page tables as the second level of translation. Basically
5537	* the gva_to_gpa functions between mmu and nested_mmu are swapped.
5538	*/
5539	if (!is_paging(vcpu))
5540	g_context->gva_to_gpa = nonpaging_gva_to_gpa;
5541	else if (is_long_mode(vcpu))
5542	g_context->gva_to_gpa = paging64_gva_to_gpa;
5543	else if (is_pae(vcpu))
5544	g_context->gva_to_gpa = paging64_gva_to_gpa;
5545	else
5546	g_context->gva_to_gpa = paging32_gva_to_gpa;
5547
5548	reset_guest_paging_metadata(vcpu, mmu: g_context);
5549	}
5550
5551	void kvm_init_mmu(struct kvm_vcpu *vcpu)
5552	{
5553	struct kvm_mmu_role_regs regs = vcpu_to_role_regs(vcpu);
5554	union kvm_cpu_role cpu_role = kvm_calc_cpu_role(vcpu, regs: &regs);
5555
5556	if (mmu_is_nested(vcpu))
5557	init_kvm_nested_mmu(vcpu, new_mode: cpu_role);
5558	else if (tdp_enabled)
5559	init_kvm_tdp_mmu(vcpu, cpu_role);
5560	else
5561	init_kvm_softmmu(vcpu, cpu_role);
5562	}
5563	EXPORT_SYMBOL_GPL(kvm_init_mmu);
5564
5565	void kvm_mmu_after_set_cpuid(struct kvm_vcpu *vcpu)
5566	{
5567	/*
5568	* Invalidate all MMU roles to force them to reinitialize as CPUID
5569	* information is factored into reserved bit calculations.
5570	*
5571	* Correctly handling multiple vCPU models with respect to paging and
5572	* physical address properties) in a single VM would require tracking
5573	* all relevant CPUID information in kvm_mmu_page_role. That is very
5574	* undesirable as it would increase the memory requirements for
5575	* gfn_write_track (see struct kvm_mmu_page_role comments). For now
5576	* that problem is swept under the rug; KVM's CPUID API is horrific and
5577	* it's all but impossible to solve it without introducing a new API.
5578	*/
5579	vcpu->arch.root_mmu.root_role.invalid = `1`;
5580	vcpu->arch.guest_mmu.root_role.invalid = `1`;
5581	vcpu->arch.nested_mmu.root_role.invalid = `1`;
5582	vcpu->arch.root_mmu.cpu_role.ext.valid = `0`;
5583	vcpu->arch.guest_mmu.cpu_role.ext.valid = `0`;
5584	vcpu->arch.nested_mmu.cpu_role.ext.valid = `0`;
5585	kvm_mmu_reset_context(vcpu);
5586
5587	/*
5588	* Changing guest CPUID after KVM_RUN is forbidden, see the comment in
5589	* kvm_arch_vcpu_ioctl().
5590	*/
5591	KVM_BUG_ON(kvm_vcpu_has_run(vcpu), vcpu->kvm);
5592	}
5593
/* Drop all of @vcpu's MMU roots, then reinitialise its MMU context. */
void kvm_mmu_reset_context(struct kvm_vcpu *vcpu)
{
	kvm_mmu_unload(vcpu);
	kvm_init_mmu(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_mmu_reset_context);
5600
5601	int kvm_mmu_load(struct kvm_vcpu *vcpu)
5602	{
5603	int r;
5604
5605	r = mmu_topup_memory_caches(vcpu, maybe_indirect: !vcpu->arch.mmu->root_role.direct);
5606	if (r)
5607	goto out;
5608	r = mmu_alloc_special_roots(vcpu);
5609	if (r)
5610	goto out;
5611	if (vcpu->arch.mmu->root_role.direct)
5612	r = mmu_alloc_direct_roots(vcpu);
5613	else
5614	r = mmu_alloc_shadow_roots(vcpu);
5615	if (r)
5616	goto out;
5617
5618	kvm_mmu_sync_roots(vcpu);
5619
5620	kvm_mmu_load_pgd(vcpu);
5621
5622	/*
5623	* Flush any TLB entries for the new root, the provenance of the root
5624	* is unknown. Even if KVM ensures there are no stale TLB entries
5625	* for a freed root, in theory another hypervisor could have left
5626	* stale entries. Flushing on alloc also allows KVM to skip the TLB
5627	* flush when freeing a root (see kvm_tdp_mmu_put_root()).
5628	*/
5629	static_call(kvm_x86_flush_tlb_current)(vcpu);
5630	out:
5631	return r;
5632	}
5633
5634	void kvm_mmu_unload(struct kvm_vcpu *vcpu)
5635	{
5636	struct kvm *kvm = vcpu->kvm;
5637
5638	kvm_mmu_free_roots(kvm, &vcpu->arch.root_mmu, KVM_MMU_ROOTS_ALL);
5639	WARN_ON_ONCE(VALID_PAGE(vcpu->arch.root_mmu.root.hpa));
5640	kvm_mmu_free_roots(kvm, &vcpu->arch.guest_mmu, KVM_MMU_ROOTS_ALL);
5641	WARN_ON_ONCE(VALID_PAGE(vcpu->arch.guest_mmu.root.hpa));
5642	vcpu_clear_mmio_info(vcpu, MMIO_GVA_ANY);
5643	}
5644
5645	static bool is_obsolete_root(struct kvm *kvm, hpa_t root_hpa)
5646	{
5647	struct kvm_mmu_page *sp;
5648
5649	if (!VALID_PAGE(root_hpa))
5650	return false;
5651
5652	/*
5653	* When freeing obsolete roots, treat roots as obsolete if they don't
5654	* have an associated shadow page, as it's impossible to determine if
5655	* such roots are fresh or stale. This does mean KVM will get false
5656	* positives and free roots that don't strictly need to be freed, but
5657	* such false positives are relatively rare:
5658	*
5659	* (a) only PAE paging and nested NPT have roots without shadow pages
5660	* (or any shadow paging flavor with a dummy root, see note below)
5661	* (b) remote reloads due to a memslot update obsoletes _all_ roots
5662	* (c) KVM doesn't track previous roots for PAE paging, and the guest
5663	* is unlikely to zap an in-use PGD.
5664	*
5665	* Note! Dummy roots are unique in that they are obsoleted by memslot
5666	* _creation_! See also FNAME(fetch).
5667	*/
5668	sp = root_to_sp(root: root_hpa);
5669	return !sp \|\| is_obsolete_sp(kvm, sp);
5670	}
5671
5672	static void __kvm_mmu_free_obsolete_roots(struct kvm kvm, struct* kvm_mmu *mmu)
5673	{
5674	unsigned long roots_to_free = `0`;
5675	int i;
5676
5677	if (is_obsolete_root(kvm, root_hpa: mmu->root.hpa))
5678	roots_to_free \|= KVM_MMU_ROOT_CURRENT;
5679
5680	for (i = `0`; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5681	if (is_obsolete_root(kvm, root_hpa: mmu->prev_roots[i].hpa))
5682	roots_to_free \|= KVM_MMU_ROOT_PREVIOUS(i);
5683	}
5684
5685	if (roots_to_free)
5686	kvm_mmu_free_roots(kvm, mmu, roots_to_free);
5687	}
5688
5689	void kvm_mmu_free_obsolete_roots(struct kvm_vcpu *vcpu)
5690	{
5691	__kvm_mmu_free_obsolete_roots(kvm: vcpu->kvm, mmu: &vcpu->arch.root_mmu);
5692	__kvm_mmu_free_obsolete_roots(kvm: vcpu->kvm, mmu: &vcpu->arch.guest_mmu);
5693	}
5694
5695	static u64 mmu_pte_write_fetch_gpte(struct kvm_vcpu vcpu, gpa_t gpa,
5696	int *bytes)
5697	{
5698	u64 gentry = `0`;
5699	int r;
5700
5701	/*
5702	* Assume that the pte write on a page table of the same type
5703	* as the current vcpu paging mode since we update the sptes only
5704	* when they have the same mode.
5705	*/
5706	if (is_pae(vcpu) && *bytes == `4`) {
5707	/ Handle a 32-bit guest writing two halves of a 64-bit gpte /
5708	*gpa &= ~(gpa_t)`7`;
5709	*bytes = `8`;
5710	}
5711
5712	if (bytes == `4` \|\| bytes == `8`) {
5713	r = kvm_vcpu_read_guest_atomic(vcpu, gpa: gpa, data: &gentry, len: bytes);
5714	if (r)
5715	gentry = `0`;
5716	}
5717
5718	return gentry;
5719	}
5720
5721	/*
5722	* If we're seeing too many writes to a page, it may no longer be a page table,
5723	* or we may be forking, in which case it is better to unmap the page.
5724	*/
5725	static bool detect_write_flooding(struct kvm_mmu_page *sp)
5726	{
5727	/*
5728	* Skip write-flooding detected for the sp whose level is 1, because
5729	* it can become unsync, then the guest page is not write-protected.
5730	*/
5731	if (sp->role.level == PG_LEVEL_4K)
5732	return false;
5733
5734	atomic_inc(v: &sp->write_flooding_count);
5735	return atomic_read(v: &sp->write_flooding_count) >= `3`;
5736	}
5737
5738	/*
5739	* Misaligned accesses are too much trouble to fix up; also, they usually
5740	* indicate a page is not used as a page table.
5741	*/
5742	static bool detect_write_misaligned(struct kvm_mmu_page *sp, gpa_t gpa,
5743	int bytes)
5744	{
5745	unsigned offset, pte_size, misaligned;
5746
5747	offset = offset_in_page(gpa);
5748	pte_size = sp->role.has_4_byte_gpte ? `4` : `8`;
5749
5750	/*
5751	* Sometimes, the OS only writes the last one bytes to update status
5752	* bits, for example, in linux, andb instruction is used in clear_bit().
5753	*/
5754	if (!(offset & (pte_size - `1`)) && bytes == `1`)
5755	return false;
5756
5757	misaligned = (offset ^ (offset + bytes - `1`)) & ~(pte_size - `1`);
5758	misaligned \|= bytes < `4`;
5759
5760	return misaligned;
5761	}
5762
5763	static u64 get_written_sptes(struct* kvm_mmu_page sp, gpa_t gpa, int* *nspte)
5764	{
5765	unsigned page_offset, quadrant;
5766	u64 *spte;
5767	int level;
5768
5769	page_offset = offset_in_page(gpa);
5770	level = sp->role.level;
5771	*nspte = `1`;
5772	if (sp->role.has_4_byte_gpte) {
5773	page_offset <<= `1`; / 32->64 /
5774	/*
5775	* A 32-bit pde maps 4MB while the shadow pdes map
5776	* only 2MB. So we need to double the offset again
5777	* and zap two pdes instead of one.
5778	*/
5779	if (level == PT32_ROOT_LEVEL) {
5780	page_offset &= ~`7`; / kill rounding error /
5781	page_offset <<= `1`;
5782	*nspte = `2`;
5783	}
5784	quadrant = page_offset >> PAGE_SHIFT;
5785	page_offset &= ~PAGE_MASK;
5786	if (quadrant != sp->role.quadrant)
5787	return NULL;
5788	}
5789
5790	spte = &sp->spt[page_offset / sizeof(*spte)];
5791	return spte;
5792	}
5793
5794	void kvm_mmu_track_write(struct kvm_vcpu vcpu, gpa_t gpa, const* u8 *new,
5795	int bytes)
5796	{
5797	gfn_t gfn = gpa >> PAGE_SHIFT;
5798	struct kvm_mmu_page *sp;
5799	LIST_HEAD(invalid_list);
5800	u64 entry, gentry, *spte;
5801	int npte;
5802	bool flush = false;
5803
5804	/*
5805	* If we don't have indirect shadow pages, it means no page is
5806	* write-protected, so we can exit simply.
5807	*/
5808	if (!READ_ONCE(vcpu->kvm->arch.indirect_shadow_pages))
5809	return;
5810
5811	write_lock(&vcpu->kvm->mmu_lock);
5812
5813	gentry = mmu_pte_write_fetch_gpte(vcpu, gpa: &gpa, bytes: &bytes);
5814
5815	++vcpu->kvm->stat.mmu_pte_write;
5816
5817	for_each_gfn_valid_sp_with_gptes(vcpu->kvm, sp, gfn) {
5818	if (detect_write_misaligned(sp, gpa, bytes) \|\|
5819	detect_write_flooding(sp)) {
5820	kvm_mmu_prepare_zap_page(kvm: vcpu->kvm, sp, invalid_list: &invalid_list);
5821	++vcpu->kvm->stat.mmu_flooded;
5822	continue;
5823	}
5824
5825	spte = get_written_sptes(sp, gpa, nspte: &npte);
5826	if (!spte)
5827	continue;
5828
5829	while (npte--) {
5830	entry = *spte;
5831	mmu_page_zap_pte(kvm: vcpu->kvm, sp, spte, NULL);
5832	if (gentry && sp->role.level != PG_LEVEL_4K)
5833	++vcpu->kvm->stat.mmu_pde_zapped;
5834	if (is_shadow_present_pte(pte: entry))
5835	flush = true;
5836	++spte;
5837	}
5838	}
5839	kvm_mmu_remote_flush_or_zap(kvm: vcpu->kvm, invalid_list: &invalid_list, remote_flush: flush);
5840	write_unlock(&vcpu->kvm->mmu_lock);
5841	}
5842
5843	int noinline kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gpa_t cr2_or_gpa, u64 error_code,
5844	void insn, int* insn_len)
5845	{
5846	int r, emulation_type = EMULTYPE_PF;
5847	bool direct = vcpu->arch.mmu->root_role.direct;
5848
5849	/*
5850	* IMPLICIT_ACCESS is a KVM-defined flag used to correctly perform SMAP
5851	* checks when emulating instructions that triggers implicit access.
5852	* WARN if hardware generates a fault with an error code that collides
5853	* with the KVM-defined value. Clear the flag and continue on, i.e.
5854	* don't terminate the VM, as KVM can't possibly be relying on a flag
5855	* that KVM doesn't know about.
5856	*/
5857	if (WARN_ON_ONCE(error_code & PFERR_IMPLICIT_ACCESS))
5858	error_code &= ~PFERR_IMPLICIT_ACCESS;
5859
5860	if (WARN_ON_ONCE(!VALID_PAGE(vcpu->arch.mmu->root.hpa)))
5861	return RET_PF_RETRY;
5862
5863	r = RET_PF_INVALID;
5864	if (unlikely(error_code & PFERR_RSVD_MASK)) {
5865	r = handle_mmio_page_fault(vcpu, addr: cr2_or_gpa, direct);
5866	if (r == RET_PF_EMULATE)
5867	goto emulate;
5868	}
5869
5870	if (r == RET_PF_INVALID) {
5871	r = kvm_mmu_do_page_fault(vcpu, cr2_or_gpa,
5872	lower_32_bits(error_code), prefetch: false,
5873	emulation_type: &emulation_type);
5874	if (KVM_BUG_ON(r == RET_PF_INVALID, vcpu->kvm))
5875	return -EIO;
5876	}
5877
5878	if (r < `0`)
5879	return r;
5880	if (r != RET_PF_EMULATE)
5881	return `1`;
5882
5883	/*
5884	* Before emulating the instruction, check if the error code
5885	* was due to a RO violation while translating the guest page.
5886	* This can occur when using nested virtualization with nested
5887	* paging in both guests. If true, we simply unprotect the page
5888	* and resume the guest.
5889	*/
5890	if (vcpu->arch.mmu->root_role.direct &&
5891	(error_code & PFERR_NESTED_GUEST_PAGE) == PFERR_NESTED_GUEST_PAGE) {
5892	kvm_mmu_unprotect_page(kvm: vcpu->kvm, gfn: gpa_to_gfn(gpa: cr2_or_gpa));
5893	return `1`;
5894	}
5895
5896	/*
5897	* vcpu->arch.mmu.page_fault returned RET_PF_EMULATE, but we can still
5898	* optimistically try to just unprotect the page and let the processor
5899	* re-execute the instruction that caused the page fault. Do not allow
5900	* retrying MMIO emulation, as it's not only pointless but could also
5901	* cause us to enter an infinite loop because the processor will keep
5902	* faulting on the non-existent MMIO address. Retrying an instruction
5903	* from a nested guest is also pointless and dangerous as we are only
5904	* explicitly shadowing L1's page tables, i.e. unprotecting something
5905	* for L1 isn't going to magically fix whatever issue cause L2 to fail.
5906	*/
5907	if (!mmio_info_in_cache(vcpu, addr: cr2_or_gpa, direct) && !is_guest_mode(vcpu))
5908	emulation_type \|= EMULTYPE_ALLOW_RETRY_PF;
5909	emulate:
5910	return x86_emulate_instruction(vcpu, cr2_or_gpa, emulation_type, insn,
5911	insn_len);
5912	}
5913	EXPORT_SYMBOL_GPL(kvm_mmu_page_fault);
5914
5915	static void __kvm_mmu_invalidate_addr(struct kvm_vcpu vcpu, struct* kvm_mmu *mmu,
5916	u64 addr, hpa_t root_hpa)
5917	{
5918	struct kvm_shadow_walk_iterator iterator;
5919
5920	vcpu_clear_mmio_info(vcpu, addr);
5921
5922	/*
5923	* Walking and synchronizing SPTEs both assume they are operating in
5924	* the context of the current MMU, and would need to be reworked if
5925	* this is ever used to sync the guest_mmu, e.g. to emulate INVEPT.
5926	*/
5927	if (WARN_ON_ONCE(mmu != vcpu->arch.mmu))
5928	return;
5929
5930	if (!VALID_PAGE(root_hpa))
5931	return;
5932
5933	write_lock(&vcpu->kvm->mmu_lock);
5934	for_each_shadow_entry_using_root(vcpu, root_hpa, addr, iterator) {
5935	struct kvm_mmu_page *sp = sptep_to_sp(sptep: iterator.sptep);
5936
5937	if (sp->unsync) {
5938	int ret = kvm_sync_spte(vcpu, sp, i: iterator.index);
5939
5940	if (ret < `0`)
5941	mmu_page_zap_pte(kvm: vcpu->kvm, sp, spte: iterator.sptep, NULL);
5942	if (ret)
5943	kvm_flush_remote_tlbs_sptep(kvm: vcpu->kvm, sptep: iterator.sptep);
5944	}
5945
5946	if (!sp->unsync_children)
5947	break;
5948	}
5949	write_unlock(&vcpu->kvm->mmu_lock);
5950	}
5951
5952	void kvm_mmu_invalidate_addr(struct kvm_vcpu vcpu, struct* kvm_mmu *mmu,
5953	u64 addr, unsigned long roots)
5954	{
5955	int i;
5956
5957	WARN_ON_ONCE(roots & ~KVM_MMU_ROOTS_ALL);
5958
5959	/ It's actually a GPA for vcpu->arch.guest_mmu. /
5960	if (mmu != &vcpu->arch.guest_mmu) {
5961	/ INVLPG on a non-canonical address is a NOP according to the SDM. /
5962	if (is_noncanonical_address(addr, vcpu))
5963	return;
5964
5965	static_call(kvm_x86_flush_tlb_gva)(vcpu, addr);
5966	}
5967
5968	if (!mmu->sync_spte)
5969	return;
5970
5971	if (roots & KVM_MMU_ROOT_CURRENT)
5972	__kvm_mmu_invalidate_addr(vcpu, mmu, addr, root_hpa: mmu->root.hpa);
5973
5974	for (i = `0`; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
5975	if (roots & KVM_MMU_ROOT_PREVIOUS(i))
5976	__kvm_mmu_invalidate_addr(vcpu, mmu, addr, root_hpa: mmu->prev_roots[i].hpa);
5977	}
5978	}
5979	EXPORT_SYMBOL_GPL(kvm_mmu_invalidate_addr);
5980
5981	void kvm_mmu_invlpg(struct kvm_vcpu *vcpu, gva_t gva)
5982	{
5983	/*
5984	* INVLPG is required to invalidate any global mappings for the VA,
5985	* irrespective of PCID. Blindly sync all roots as it would take
5986	* roughly the same amount of work/time to determine whether any of the
5987	* previous roots have a global mapping.
5988	*
5989	* Mappings not reachable via the current or previous cached roots will
5990	* be synced when switching to that new cr3, so nothing needs to be
5991	* done here for them.
5992	*/
5993	kvm_mmu_invalidate_addr(vcpu, vcpu->arch.walk_mmu, gva, KVM_MMU_ROOTS_ALL);
5994	++vcpu->stat.invlpg;
5995	}
5996	EXPORT_SYMBOL_GPL(kvm_mmu_invlpg);
5997
5998
5999	void kvm_mmu_invpcid_gva(struct kvm_vcpu vcpu, gva_t gva, unsigned* long pcid)
6000	{
6001	struct kvm_mmu *mmu = vcpu->arch.mmu;
6002	unsigned long roots = `0`;
6003	uint i;
6004
6005	if (pcid == kvm_get_active_pcid(vcpu))
6006	roots \|= KVM_MMU_ROOT_CURRENT;
6007
6008	for (i = `0`; i < KVM_MMU_NUM_PREV_ROOTS; i++) {
6009	if (VALID_PAGE(mmu->prev_roots[i].hpa) &&
6010	pcid == kvm_get_pcid(vcpu, mmu->prev_roots[i].pgd))
6011	roots \|= KVM_MMU_ROOT_PREVIOUS(i);
6012	}
6013
6014	if (roots)
6015	kvm_mmu_invalidate_addr(vcpu, mmu, gva, roots);
6016	++vcpu->stat.invlpg;
6017
6018	/*
6019	* Mappings not reachable via the current cr3 or the prev_roots will be
6020	* synced when switching to that cr3, so nothing needs to be done here
6021	* for them.
6022	*/
6023	}
6024
6025	void kvm_configure_mmu(bool enable_tdp, int tdp_forced_root_level,
6026	int tdp_max_root_level, int tdp_huge_page_level)
6027	{
6028	tdp_enabled = enable_tdp;
6029	tdp_root_level = tdp_forced_root_level;
6030	max_tdp_level = tdp_max_root_level;
6031
6032	#ifdef CONFIG_X86_64
6033	tdp_mmu_enabled = tdp_mmu_allowed && tdp_enabled;
6034	#endif
6035	/*
6036	* max_huge_page_level reflects KVM's MMU capabilities irrespective
6037	* of kernel support, e.g. KVM may be capable of using 1GB pages when
6038	* the kernel is not. But, KVM never creates a page size greater than
6039	* what is used by the kernel for any given HVA, i.e. the kernel's
6040	* capabilities are ultimately consulted by kvm_mmu_hugepage_adjust().
6041	*/
6042	if (tdp_enabled)
6043	max_huge_page_level = tdp_huge_page_level;
6044	else if (boot_cpu_has(X86_FEATURE_GBPAGES))
6045	max_huge_page_level = PG_LEVEL_1G;
6046	else
6047	max_huge_page_level = PG_LEVEL_2M;
6048	}
6049	EXPORT_SYMBOL_GPL(kvm_configure_mmu);
6050
6051	/ The return value indicates if tlb flush on all vcpus is needed. /
6052	typedef bool (slot_rmaps_handler) (struct* kvm *kvm,
6053	struct kvm_rmap_head *rmap_head,
6054	const struct kvm_memory_slot *slot);
6055
6056	static __always_inline bool __walk_slot_rmaps(struct kvm *kvm,
6057	const struct kvm_memory_slot *slot,
6058	slot_rmaps_handler fn,
6059	int start_level, int end_level,
6060	gfn_t start_gfn, gfn_t end_gfn,
6061	bool flush_on_yield, bool flush)
6062	{
6063	struct slot_rmap_walk_iterator iterator;
6064
6065	lockdep_assert_held_write(&kvm->mmu_lock);
6066
6067	for_each_slot_rmap_range(slot, start_level, end_level, start_gfn,
6068	end_gfn, &iterator) {
6069	if (iterator.rmap)
6070	flush \|= fn(kvm, iterator.rmap, slot);
6071
6072	if (need_resched() \|\| rwlock_needbreak(lock: &kvm->mmu_lock)) {
6073	if (flush && flush_on_yield) {
6074	kvm_flush_remote_tlbs_range(kvm, gfn: start_gfn,
6075	nr_pages: iterator.gfn - start_gfn + `1`);
6076	flush = false;
6077	}
6078	cond_resched_rwlock_write(&kvm->mmu_lock);
6079	}
6080	}
6081
6082	return flush;
6083	}
6084
6085	static __always_inline bool walk_slot_rmaps(struct kvm *kvm,
6086	const struct kvm_memory_slot *slot,
6087	slot_rmaps_handler fn,
6088	int start_level, int end_level,
6089	bool flush_on_yield)
6090	{
6091	return __walk_slot_rmaps(kvm, slot, fn, start_level, end_level,
6092	start_gfn: slot->base_gfn, end_gfn: slot->base_gfn + slot->npages - `1`,
6093	flush_on_yield, flush: false);
6094	}
6095
6096	static __always_inline bool walk_slot_rmaps_4k(struct kvm *kvm,
6097	const struct kvm_memory_slot *slot,
6098	slot_rmaps_handler fn,
6099	bool flush_on_yield)
6100	{
6101	return walk_slot_rmaps(kvm, slot, fn, start_level: PG_LEVEL_4K, end_level: PG_LEVEL_4K, flush_on_yield);
6102	}
6103
6104	static void free_mmu_pages(struct kvm_mmu *mmu)
6105	{
6106	if (!tdp_enabled && mmu->pae_root)
6107	set_memory_encrypted(addr: (unsigned long)mmu->pae_root, numpages: `1`);
6108	free_page((unsigned long)mmu->pae_root);
6109	free_page((unsigned long)mmu->pml4_root);
6110	free_page((unsigned long)mmu->pml5_root);
6111	}
6112
6113	static int __kvm_mmu_create(struct kvm_vcpu vcpu, struct* kvm_mmu *mmu)
6114	{
6115	struct page *page;
6116	int i;
6117
6118	mmu->root.hpa = INVALID_PAGE;
6119	mmu->root.pgd = `0`;
6120	for (i = `0`; i < KVM_MMU_NUM_PREV_ROOTS; i++)
6121	mmu->prev_roots[i] = KVM_MMU_ROOT_INFO_INVALID;
6122
6123	/ vcpu->arch.guest_mmu isn't used when !tdp_enabled. /
6124	if (!tdp_enabled && mmu == &vcpu->arch.guest_mmu)
6125	return `0`;
6126
6127	/*
6128	* When using PAE paging, the four PDPTEs are treated as 'root' pages,
6129	* while the PDP table is a per-vCPU construct that's allocated at MMU
6130	* creation. When emulating 32-bit mode, cr3 is only 32 bits even on
6131	* x86_64. Therefore we need to allocate the PDP table in the first
6132	* 4GB of memory, which happens to fit the DMA32 zone. TDP paging
6133	* generally doesn't use PAE paging and can skip allocating the PDP
6134	* table. The main exception, handled here, is SVM's 32-bit NPT. The
6135	* other exception is for shadowing L1's 32-bit or PAE NPT on 64-bit
6136	* KVM; that horror is handled on-demand by mmu_alloc_special_roots().
6137	*/
6138	if (tdp_enabled && kvm_mmu_get_tdp_level(vcpu) > PT32E_ROOT_LEVEL)
6139	return `0`;
6140
6141	page = alloc_page(GFP_KERNEL_ACCOUNT \| __GFP_DMA32);
6142	if (!page)
6143	return -ENOMEM;
6144
6145	mmu->pae_root = page_address(page);
6146
6147	/*
6148	* CR3 is only 32 bits when PAE paging is used, thus it's impossible to
6149	* get the CPU to treat the PDPTEs as encrypted. Decrypt the page so
6150	* that KVM's writes and the CPU's reads get along. Note, this is
6151	* only necessary when using shadow paging, as 64-bit NPT can get at
6152	* the C-bit even when shadowing 32-bit NPT, and SME isn't supported
6153	* by 32-bit kernels (when KVM itself uses 32-bit NPT).
6154	*/
6155	if (!tdp_enabled)
6156	set_memory_decrypted(addr: (unsigned long)mmu->pae_root, numpages: `1`);
6157	else
6158	WARN_ON_ONCE(shadow_me_value);
6159
6160	for (i = `0`; i < `4`; ++i)
6161	mmu->pae_root[i] = INVALID_PAE_ROOT;
6162
6163	return `0`;
6164	}
6165
6166	int kvm_mmu_create(struct kvm_vcpu *vcpu)
6167	{
6168	int ret;
6169
6170	vcpu->arch.mmu_pte_list_desc_cache.kmem_cache = pte_list_desc_cache;
6171	vcpu->arch.mmu_pte_list_desc_cache.gfp_zero = __GFP_ZERO;
6172
6173	vcpu->arch.mmu_page_header_cache.kmem_cache = mmu_page_header_cache;
6174	vcpu->arch.mmu_page_header_cache.gfp_zero = __GFP_ZERO;
6175
6176	vcpu->arch.mmu_shadow_page_cache.gfp_zero = __GFP_ZERO;
6177
6178	vcpu->arch.mmu = &vcpu->arch.root_mmu;
6179	vcpu->arch.walk_mmu = &vcpu->arch.root_mmu;
6180
6181	ret = __kvm_mmu_create(vcpu, mmu: &vcpu->arch.guest_mmu);
6182	if (ret)
6183	return ret;
6184
6185	ret = __kvm_mmu_create(vcpu, mmu: &vcpu->arch.root_mmu);
6186	if (ret)
6187	goto fail_allocate_root;
6188
6189	return ret;
6190	fail_allocate_root:
6191	free_mmu_pages(mmu: &vcpu->arch.guest_mmu);
6192	return ret;
6193	}
6194
6195	#define BATCH_ZAP_PAGES 10
6196	static void kvm_zap_obsolete_pages(struct kvm *kvm)
6197	{
6198	struct kvm_mmu_page sp, node;
6199	int nr_zapped, batch = `0`;
6200	bool unstable;
6201
6202	restart:
6203	list_for_each_entry_safe_reverse(sp, node,
6204	&kvm->arch.active_mmu_pages, link) {
6205	/*
6206	* No obsolete valid page exists before a newly created page
6207	* since active_mmu_pages is a FIFO list.
6208	*/
6209	if (!is_obsolete_sp(kvm, sp))
6210	break;
6211
6212	/*
6213	* Invalid pages should never land back on the list of active
6214	* pages. Skip the bogus page, otherwise we'll get stuck in an
6215	* infinite loop if the page gets put back on the list (again).
6216	*/
6217	if (WARN_ON_ONCE(sp->role.invalid))
6218	continue;
6219
6220	/*
6221	* No need to flush the TLB since we're only zapping shadow
6222	* pages with an obsolete generation number and all vCPUS have
6223	* loaded a new root, i.e. the shadow pages being zapped cannot
6224	* be in active use by the guest.
6225	*/
6226	if (batch >= BATCH_ZAP_PAGES &&
6227	cond_resched_rwlock_write(&kvm->mmu_lock)) {
6228	batch = `0`;
6229	goto restart;
6230	}
6231
6232	unstable = __kvm_mmu_prepare_zap_page(kvm, sp,
6233	invalid_list: &kvm->arch.zapped_obsolete_pages, nr_zapped: &nr_zapped);
6234	batch += nr_zapped;
6235
6236	if (unstable)
6237	goto restart;
6238	}
6239
6240	/*
6241	* Kick all vCPUs (via remote TLB flush) before freeing the page tables
6242	* to ensure KVM is not in the middle of a lockless shadow page table
6243	* walk, which may reference the pages. The remote TLB flush itself is
6244	* not required and is simply a convenient way to kick vCPUs as needed.
6245	* KVM performs a local TLB flush when allocating a new root (see
6246	* kvm_mmu_load()), and the reload in the caller ensure no vCPUs are
6247	* running with an obsolete MMU.
6248	*/
6249	kvm_mmu_commit_zap_page(kvm, invalid_list: &kvm->arch.zapped_obsolete_pages);
6250	}
6251
6252	/*
6253	* Fast invalidate all shadow pages and use lock-break technique
6254	* to zap obsolete pages.
6255	*
6256	* It's required when memslot is being deleted or VM is being
6257	* destroyed, in these cases, we should ensure that KVM MMU does
6258	* not use any resource of the being-deleted slot or all slots
6259	* after calling the function.
6260	*/
6261	static void kvm_mmu_zap_all_fast(struct kvm *kvm)
6262	{
6263	lockdep_assert_held(&kvm->slots_lock);
6264
6265	write_lock(&kvm->mmu_lock);
6266	trace_kvm_mmu_zap_all_fast(kvm);
6267
6268	/*
6269	* Toggle mmu_valid_gen between '0' and '1'. Because slots_lock is
6270	* held for the entire duration of zapping obsolete pages, it's
6271	* impossible for there to be multiple invalid generations associated
6272	* with valid shadow pages at any given time, i.e. there is exactly
6273	* one valid generation and (at most) one invalid generation.
6274	*/
6275	kvm->arch.mmu_valid_gen = kvm->arch.mmu_valid_gen ? `0` : `1`;
6276
6277	/*
6278	* In order to ensure all vCPUs drop their soon-to-be invalid roots,
6279	* invalidating TDP MMU roots must be done while holding mmu_lock for
6280	* write and in the same critical section as making the reload request,
6281	* e.g. before kvm_zap_obsolete_pages() could drop mmu_lock and yield.
6282	*/
6283	if (tdp_mmu_enabled)
6284	kvm_tdp_mmu_invalidate_all_roots(kvm);
6285
6286	/*
6287	* Notify all vcpus to reload its shadow page table and flush TLB.
6288	* Then all vcpus will switch to new shadow page table with the new
6289	* mmu_valid_gen.
6290	*
6291	* Note: we need to do this under the protection of mmu_lock,
6292	* otherwise, vcpu would purge shadow page but miss tlb flush.
6293	*/
6294	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_FREE_OBSOLETE_ROOTS);
6295
6296	kvm_zap_obsolete_pages(kvm);
6297
6298	write_unlock(&kvm->mmu_lock);
6299
6300	/*
6301	* Zap the invalidated TDP MMU roots, all SPTEs must be dropped before
6302	* returning to the caller, e.g. if the zap is in response to a memslot
6303	* deletion, mmu_notifier callbacks will be unable to reach the SPTEs
6304	* associated with the deleted memslot once the update completes, and
6305	* Deferring the zap until the final reference to the root is put would
6306	* lead to use-after-free.
6307	*/
6308	if (tdp_mmu_enabled)
6309	kvm_tdp_mmu_zap_invalidated_roots(kvm);
6310	}
6311
6312	static bool kvm_has_zapped_obsolete_pages(struct kvm *kvm)
6313	{
6314	return unlikely(!list_empty_careful(&kvm->arch.zapped_obsolete_pages));
6315	}
6316
6317	void kvm_mmu_init_vm(struct kvm *kvm)
6318	{
6319	INIT_LIST_HEAD(list: &kvm->arch.active_mmu_pages);
6320	INIT_LIST_HEAD(list: &kvm->arch.zapped_obsolete_pages);
6321	INIT_LIST_HEAD(list: &kvm->arch.possible_nx_huge_pages);
6322	spin_lock_init(&kvm->arch.mmu_unsync_pages_lock);
6323
6324	if (tdp_mmu_enabled)
6325	kvm_mmu_init_tdp_mmu(kvm);
6326
6327	kvm->arch.split_page_header_cache.kmem_cache = mmu_page_header_cache;
6328	kvm->arch.split_page_header_cache.gfp_zero = __GFP_ZERO;
6329
6330	kvm->arch.split_shadow_page_cache.gfp_zero = __GFP_ZERO;
6331
6332	kvm->arch.split_desc_cache.kmem_cache = pte_list_desc_cache;
6333	kvm->arch.split_desc_cache.gfp_zero = __GFP_ZERO;
6334	}
6335
6336	static void mmu_free_vm_memory_caches(struct kvm *kvm)
6337	{
6338	kvm_mmu_free_memory_cache(mc: &kvm->arch.split_desc_cache);
6339	kvm_mmu_free_memory_cache(mc: &kvm->arch.split_page_header_cache);
6340	kvm_mmu_free_memory_cache(mc: &kvm->arch.split_shadow_page_cache);
6341	}
6342
6343	void kvm_mmu_uninit_vm(struct kvm *kvm)
6344	{
6345	if (tdp_mmu_enabled)
6346	kvm_mmu_uninit_tdp_mmu(kvm);
6347
6348	mmu_free_vm_memory_caches(kvm);
6349	}
6350
6351	static bool kvm_rmap_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
6352	{
6353	const struct kvm_memory_slot *memslot;
6354	struct kvm_memslots *slots;
6355	struct kvm_memslot_iter iter;
6356	bool flush = false;
6357	gfn_t start, end;
6358	int i;
6359
6360	if (!kvm_memslots_have_rmaps(kvm))
6361	return flush;
6362
6363	for (i = `0`; i < kvm_arch_nr_memslot_as_ids(kvm); i++) {
6364	slots = __kvm_memslots(kvm, as_id: i);
6365
6366	kvm_for_each_memslot_in_gfn_range(&iter, slots, gfn_start, gfn_end) {
6367	memslot = iter.slot;
6368	start = max(gfn_start, memslot->base_gfn);
6369	end = min(gfn_end, memslot->base_gfn + memslot->npages);
6370	if (WARN_ON_ONCE(start >= end))
6371	continue;
6372
6373	flush = __walk_slot_rmaps(kvm, slot: memslot, fn: __kvm_zap_rmap,
6374	start_level: PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL,
6375	start_gfn: start, end_gfn: end - `1`, flush_on_yield: true, flush);
6376	}
6377	}
6378
6379	return flush;
6380	}
6381
6382	/*
6383	* Invalidate (zap) SPTEs that cover GFNs from gfn_start and up to gfn_end
6384	* (not including it)
6385	*/
6386	void kvm_zap_gfn_range(struct kvm *kvm, gfn_t gfn_start, gfn_t gfn_end)
6387	{
6388	bool flush;
6389
6390	if (WARN_ON_ONCE(gfn_end <= gfn_start))
6391	return;
6392
6393	write_lock(&kvm->mmu_lock);
6394
6395	kvm_mmu_invalidate_begin(kvm);
6396
6397	kvm_mmu_invalidate_range_add(kvm, start: gfn_start, end: gfn_end);
6398
6399	flush = kvm_rmap_zap_gfn_range(kvm, gfn_start, gfn_end);
6400
6401	if (tdp_mmu_enabled)
6402	flush = kvm_tdp_mmu_zap_leafs(kvm, start: gfn_start, end: gfn_end, flush);
6403
6404	if (flush)
6405	kvm_flush_remote_tlbs_range(kvm, gfn: gfn_start, nr_pages: gfn_end - gfn_start);
6406
6407	kvm_mmu_invalidate_end(kvm);
6408
6409	write_unlock(&kvm->mmu_lock);
6410	}
6411
6412	static bool slot_rmap_write_protect(struct kvm *kvm,
6413	struct kvm_rmap_head *rmap_head,
6414	const struct kvm_memory_slot *slot)
6415	{
6416	return rmap_write_protect(rmap_head, pt_protect: false);
6417	}
6418
6419	void kvm_mmu_slot_remove_write_access(struct kvm *kvm,
6420	const struct kvm_memory_slot *memslot,
6421	int start_level)
6422	{
6423	if (kvm_memslots_have_rmaps(kvm)) {
6424	write_lock(&kvm->mmu_lock);
6425	walk_slot_rmaps(kvm, slot: memslot, fn: slot_rmap_write_protect,
6426	start_level, KVM_MAX_HUGEPAGE_LEVEL, flush_on_yield: false);
6427	write_unlock(&kvm->mmu_lock);
6428	}
6429
6430	if (tdp_mmu_enabled) {
6431	read_lock(&kvm->mmu_lock);
6432	kvm_tdp_mmu_wrprot_slot(kvm, slot: memslot, min_level: start_level);
6433	read_unlock(&kvm->mmu_lock);
6434	}
6435	}
6436
6437	static inline bool need_topup(struct kvm_mmu_memory_cache cache, int* min)
6438	{
6439	return kvm_mmu_memory_cache_nr_free_objects(mc: cache) < min;
6440	}
6441
6442	static bool need_topup_split_caches_or_resched(struct kvm *kvm)
6443	{
6444	if (need_resched() \|\| rwlock_needbreak(lock: &kvm->mmu_lock))
6445	return true;
6446
6447	/*
6448	* In the worst case, SPLIT_DESC_CACHE_MIN_NR_OBJECTS descriptors are needed
6449	* to split a single huge page. Calculating how many are actually needed
6450	* is possible but not worth the complexity.
6451	*/
6452	return need_topup(cache: &kvm->arch.split_desc_cache, SPLIT_DESC_CACHE_MIN_NR_OBJECTS) \|\|
6453	need_topup(cache: &kvm->arch.split_page_header_cache, min: `1`) \|\|
6454	need_topup(cache: &kvm->arch.split_shadow_page_cache, min: `1`);
6455	}
6456
6457	static int topup_split_caches(struct kvm *kvm)
6458	{
6459	/*
6460	* Allocating rmap list entries when splitting huge pages for nested
6461	* MMUs is uncommon as KVM needs to use a list if and only if there is
6462	* more than one rmap entry for a gfn, i.e. requires an L1 gfn to be
6463	* aliased by multiple L2 gfns and/or from multiple nested roots with
6464	* different roles. Aliasing gfns when using TDP is atypical for VMMs;
6465	* a few gfns are often aliased during boot, e.g. when remapping BIOS,
6466	* but aliasing rarely occurs post-boot or for many gfns. If there is
6467	* only one rmap entry, rmap->val points directly at that one entry and
6468	* doesn't need to allocate a list. Buffer the cache by the default
6469	* capacity so that KVM doesn't have to drop mmu_lock to topup if KVM
6470	* encounters an aliased gfn or two.
6471	*/
6472	const int capacity = SPLIT_DESC_CACHE_MIN_NR_OBJECTS +
6473	KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE;
6474	int r;
6475
6476	lockdep_assert_held(&kvm->slots_lock);
6477
6478	r = __kvm_mmu_topup_memory_cache(mc: &kvm->arch.split_desc_cache, capacity,
6479	SPLIT_DESC_CACHE_MIN_NR_OBJECTS);
6480	if (r)
6481	return r;
6482
6483	r = kvm_mmu_topup_memory_cache(mc: &kvm->arch.split_page_header_cache, min: `1`);
6484	if (r)
6485	return r;
6486
6487	return kvm_mmu_topup_memory_cache(mc: &kvm->arch.split_shadow_page_cache, min: `1`);
6488	}
6489
6490	static struct kvm_mmu_page shadow_mmu_get_sp_for_split(struct* kvm kvm, u64 huge_sptep)
6491	{
6492	struct kvm_mmu_page *huge_sp = sptep_to_sp(sptep: huge_sptep);
6493	struct shadow_page_caches caches = {};
6494	union kvm_mmu_page_role role;
6495	unsigned int access;
6496	gfn_t gfn;
6497
6498	gfn = kvm_mmu_page_get_gfn(sp: huge_sp, index: spte_index(sptep: huge_sptep));
6499	access = kvm_mmu_page_get_access(sp: huge_sp, index: spte_index(sptep: huge_sptep));
6500
6501	/*
6502	* Note, huge page splitting always uses direct shadow pages, regardless
6503	* of whether the huge page itself is mapped by a direct or indirect
6504	* shadow page, since the huge page region itself is being directly
6505	* mapped with smaller pages.
6506	*/
6507	role = kvm_mmu_child_role(sptep: huge_sptep, /direct=/true, access);
6508
6509	/ Direct SPs do not require a shadowed_info_cache. /
6510	caches.page_header_cache = &kvm->arch.split_page_header_cache;
6511	caches.shadow_page_cache = &kvm->arch.split_shadow_page_cache;
6512
6513	/ Safe to pass NULL for vCPU since requesting a direct SP. /
6514	return __kvm_mmu_get_shadow_page(kvm, NULL, caches: &caches, gfn, role);
6515	}
6516
6517	static void shadow_mmu_split_huge_page(struct kvm *kvm,
6518	const struct kvm_memory_slot *slot,
6519	u64 *huge_sptep)
6520
6521	{
6522	struct kvm_mmu_memory_cache *cache = &kvm->arch.split_desc_cache;
6523	u64 huge_spte = READ_ONCE(*huge_sptep);
6524	struct kvm_mmu_page *sp;
6525	bool flush = false;
6526	u64 *sptep, spte;
6527	gfn_t gfn;
6528	int index;
6529
6530	sp = shadow_mmu_get_sp_for_split(kvm, huge_sptep);
6531
6532	for (index = `0`; index < SPTE_ENT_PER_PAGE; index++) {
6533	sptep = &sp->spt[index];
6534	gfn = kvm_mmu_page_get_gfn(sp, index);
6535
6536	/*
6537	* The SP may already have populated SPTEs, e.g. if this huge
6538	* page is aliased by multiple sptes with the same access
6539	* permissions. These entries are guaranteed to map the same
6540	* gfn-to-pfn translation since the SP is direct, so no need to
6541	* modify them.
6542	*
6543	* However, if a given SPTE points to a lower level page table,
6544	* that lower level page table may only be partially populated.
6545	* Installing such SPTEs would effectively unmap a potion of the
6546	* huge page. Unmapping guest memory always requires a TLB flush
6547	* since a subsequent operation on the unmapped regions would
6548	* fail to detect the need to flush.
6549	*/
6550	if (is_shadow_present_pte(pte: *sptep)) {
6551	flush \|= !is_last_spte(pte: *sptep, level: sp->role.level);
6552	continue;
6553	}
6554
6555	spte = make_huge_page_split_spte(kvm, huge_spte, role: sp->role, index);
6556	mmu_spte_set(sptep, new_spte: spte);
6557	__rmap_add(kvm, cache, slot, spte: sptep, gfn, access: sp->role.access);
6558	}
6559
6560	__link_shadow_page(kvm, cache, sptep: huge_sptep, sp, flush);
6561	}
6562
6563	static int shadow_mmu_try_split_huge_page(struct kvm *kvm,
6564	const struct kvm_memory_slot *slot,
6565	u64 *huge_sptep)
6566	{
6567	struct kvm_mmu_page *huge_sp = sptep_to_sp(sptep: huge_sptep);
6568	int level, r = `0`;
6569	gfn_t gfn;
6570	u64 spte;
6571
6572	/ Grab information for the tracepoint before dropping the MMU lock. /
6573	gfn = kvm_mmu_page_get_gfn(sp: huge_sp, index: spte_index(sptep: huge_sptep));
6574	level = huge_sp->role.level;
6575	spte = *huge_sptep;
6576
6577	if (kvm_mmu_available_pages(kvm) <= KVM_MIN_FREE_MMU_PAGES) {
6578	r = -ENOSPC;
6579	goto out;
6580	}
6581
6582	if (need_topup_split_caches_or_resched(kvm)) {
6583	write_unlock(&kvm->mmu_lock);
6584	cond_resched();
6585	/*
6586	* If the topup succeeds, return -EAGAIN to indicate that the
6587	* rmap iterator should be restarted because the MMU lock was
6588	* dropped.
6589	*/
6590	r = topup_split_caches(kvm) ?: -EAGAIN;
6591	write_lock(&kvm->mmu_lock);
6592	goto out;
6593	}
6594
6595	shadow_mmu_split_huge_page(kvm, slot, huge_sptep);
6596
6597	out:
6598	trace_kvm_mmu_split_huge_page(gfn, spte, level, errno: r);
6599	return r;
6600	}
6601
6602	static bool shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6603	struct kvm_rmap_head *rmap_head,
6604	const struct kvm_memory_slot *slot)
6605	{
6606	struct rmap_iterator iter;
6607	struct kvm_mmu_page *sp;
6608	u64 *huge_sptep;
6609	int r;
6610
6611	restart:
6612	for_each_rmap_spte(rmap_head, &iter, huge_sptep) {
6613	sp = sptep_to_sp(sptep: huge_sptep);
6614
6615	/ TDP MMU is enabled, so rmap only contains nested MMU SPs. /
6616	if (WARN_ON_ONCE(!sp->role.guest_mode))
6617	continue;
6618
6619	/ The rmaps should never contain non-leaf SPTEs. /
6620	if (WARN_ON_ONCE(!is_large_pte(*huge_sptep)))
6621	continue;
6622
6623	/ SPs with level >PG_LEVEL_4K should never by unsync. /
6624	if (WARN_ON_ONCE(sp->unsync))
6625	continue;
6626
6627	/ Don't bother splitting huge pages on invalid SPs. /
6628	if (sp->role.invalid)
6629	continue;
6630
6631	r = shadow_mmu_try_split_huge_page(kvm, slot, huge_sptep);
6632
6633	/*
6634	* The split succeeded or needs to be retried because the MMU
6635	* lock was dropped. Either way, restart the iterator to get it
6636	* back into a consistent state.
6637	*/
6638	if (!r \|\| r == -EAGAIN)
6639	goto restart;
6640
6641	/ The split failed and shouldn't be retried (e.g. -ENOMEM). /
6642	break;
6643	}
6644
6645	return false;
6646	}
6647
6648	static void kvm_shadow_mmu_try_split_huge_pages(struct kvm *kvm,
6649	const struct kvm_memory_slot *slot,
6650	gfn_t start, gfn_t end,
6651	int target_level)
6652	{
6653	int level;
6654
6655	/*
6656	* Split huge pages starting with KVM_MAX_HUGEPAGE_LEVEL and working
6657	* down to the target level. This ensures pages are recursively split
6658	* all the way to the target level. There's no need to split pages
6659	* already at the target level.
6660	*/
6661	for (level = KVM_MAX_HUGEPAGE_LEVEL; level > target_level; level--)
6662	__walk_slot_rmaps(kvm, slot, fn: shadow_mmu_try_split_huge_pages,
6663	start_level: level, end_level: level, start_gfn: start, end_gfn: end - `1`, flush_on_yield: true, flush: false);
6664	}
6665
6666	/ Must be called with the mmu_lock held in write-mode. /
6667	void kvm_mmu_try_split_huge_pages(struct kvm *kvm,
6668	const struct kvm_memory_slot *memslot,
6669	u64 start, u64 end,
6670	int target_level)
6671	{
6672	if (!tdp_mmu_enabled)
6673	return;
6674
6675	if (kvm_memslots_have_rmaps(kvm))
6676	kvm_shadow_mmu_try_split_huge_pages(kvm, slot: memslot, start, end, target_level);
6677
6678	kvm_tdp_mmu_try_split_huge_pages(kvm, slot: memslot, start, end, target_level, shared: false);
6679
6680	/*
6681	* A TLB flush is unnecessary at this point for the same reasons as in
6682	* kvm_mmu_slot_try_split_huge_pages().
6683	*/
6684	}
6685
6686	void kvm_mmu_slot_try_split_huge_pages(struct kvm *kvm,
6687	const struct kvm_memory_slot *memslot,
6688	int target_level)
6689	{
6690	u64 start = memslot->base_gfn;
6691	u64 end = start + memslot->npages;
6692
6693	if (!tdp_mmu_enabled)
6694	return;
6695
6696	if (kvm_memslots_have_rmaps(kvm)) {
6697	write_lock(&kvm->mmu_lock);
6698	kvm_shadow_mmu_try_split_huge_pages(kvm, slot: memslot, start, end, target_level);
6699	write_unlock(&kvm->mmu_lock);
6700	}
6701
6702	read_lock(&kvm->mmu_lock);
6703	kvm_tdp_mmu_try_split_huge_pages(kvm, slot: memslot, start, end, target_level, shared: true);
6704	read_unlock(&kvm->mmu_lock);
6705
6706	/*
6707	* No TLB flush is necessary here. KVM will flush TLBs after
6708	* write-protecting and/or clearing dirty on the newly split SPTEs to
6709	* ensure that guest writes are reflected in the dirty log before the
6710	* ioctl to enable dirty logging on this memslot completes. Since the
6711	* split SPTEs retain the write and dirty bits of the huge SPTE, it is
6712	* safe for KVM to decide if a TLB flush is necessary based on the split
6713	* SPTEs.
6714	*/
6715	}
6716
6717	static bool kvm_mmu_zap_collapsible_spte(struct kvm *kvm,
6718	struct kvm_rmap_head *rmap_head,
6719	const struct kvm_memory_slot *slot)
6720	{
6721	u64 *sptep;
6722	struct rmap_iterator iter;
6723	int need_tlb_flush = `0`;
6724	struct kvm_mmu_page *sp;
6725
6726	restart:
6727	for_each_rmap_spte(rmap_head, &iter, sptep) {
6728	sp = sptep_to_sp(sptep);
6729
6730	/*
6731	* We cannot do huge page mapping for indirect shadow pages,
6732	* which are found on the last rmap (level = 1) when not using
6733	* tdp; such shadow pages are synced with the page table in
6734	* the guest, and the guest page table is using 4K page size
6735	* mapping if the indirect sp has level = 1.
6736	*/
6737	if (sp->role.direct &&
6738	sp->role.level < kvm_mmu_max_mapping_level(kvm, slot, gfn: sp->gfn,
6739	max_level: PG_LEVEL_NUM)) {
6740	kvm_zap_one_rmap_spte(kvm, rmap_head, sptep);
6741
6742	if (kvm_available_flush_remote_tlbs_range())
6743	kvm_flush_remote_tlbs_sptep(kvm, sptep);
6744	else
6745	need_tlb_flush = `1`;
6746
6747	goto restart;
6748	}
6749	}
6750
6751	return need_tlb_flush;
6752	}
6753
6754	static void kvm_rmap_zap_collapsible_sptes(struct kvm *kvm,
6755	const struct kvm_memory_slot *slot)
6756	{
6757	/*
6758	* Note, use KVM_MAX_HUGEPAGE_LEVEL - 1 since there's no need to zap
6759	* pages that are already mapped at the maximum hugepage level.
6760	*/
6761	if (walk_slot_rmaps(kvm, slot, fn: kvm_mmu_zap_collapsible_spte,
6762	start_level: PG_LEVEL_4K, KVM_MAX_HUGEPAGE_LEVEL - `1`, flush_on_yield: true))
6763	kvm_flush_remote_tlbs_memslot(kvm, memslot: slot);
6764	}
6765
6766	void kvm_mmu_zap_collapsible_sptes(struct kvm *kvm,
6767	const struct kvm_memory_slot *slot)
6768	{
6769	if (kvm_memslots_have_rmaps(kvm)) {
6770	write_lock(&kvm->mmu_lock);
6771	kvm_rmap_zap_collapsible_sptes(kvm, slot);
6772	write_unlock(&kvm->mmu_lock);
6773	}
6774
6775	if (tdp_mmu_enabled) {
6776	read_lock(&kvm->mmu_lock);
6777	kvm_tdp_mmu_zap_collapsible_sptes(kvm, slot);
6778	read_unlock(&kvm->mmu_lock);
6779	}
6780	}
6781
6782	void kvm_mmu_slot_leaf_clear_dirty(struct kvm *kvm,
6783	const struct kvm_memory_slot *memslot)
6784	{
6785	if (kvm_memslots_have_rmaps(kvm)) {
6786	write_lock(&kvm->mmu_lock);
6787	/*
6788	* Clear dirty bits only on 4k SPTEs since the legacy MMU only
6789	* support dirty logging at a 4k granularity.
6790	*/
6791	walk_slot_rmaps_4k(kvm, slot: memslot, fn: __rmap_clear_dirty, flush_on_yield: false);
6792	write_unlock(&kvm->mmu_lock);
6793	}
6794
6795	if (tdp_mmu_enabled) {
6796	read_lock(&kvm->mmu_lock);
6797	kvm_tdp_mmu_clear_dirty_slot(kvm, slot: memslot);
6798	read_unlock(&kvm->mmu_lock);
6799	}
6800
6801	/*
6802	* The caller will flush the TLBs after this function returns.
6803	*
6804	* It's also safe to flush TLBs out of mmu lock here as currently this
6805	* function is only used for dirty logging, in which case flushing TLB
6806	* out of mmu lock also guarantees no dirty pages will be lost in
6807	* dirty_bitmap.
6808	*/
6809	}
6810
6811	static void kvm_mmu_zap_all(struct kvm *kvm)
6812	{
6813	struct kvm_mmu_page sp, node;
6814	LIST_HEAD(invalid_list);
6815	int ign;
6816
6817	write_lock(&kvm->mmu_lock);
6818	restart:
6819	list_for_each_entry_safe(sp, node, &kvm->arch.active_mmu_pages, link) {
6820	if (WARN_ON_ONCE(sp->role.invalid))
6821	continue;
6822	if (__kvm_mmu_prepare_zap_page(kvm, sp, invalid_list: &invalid_list, nr_zapped: &ign))
6823	goto restart;
6824	if (cond_resched_rwlock_write(&kvm->mmu_lock))
6825	goto restart;
6826	}
6827
6828	kvm_mmu_commit_zap_page(kvm, invalid_list: &invalid_list);
6829
6830	if (tdp_mmu_enabled)
6831	kvm_tdp_mmu_zap_all(kvm);
6832
6833	write_unlock(&kvm->mmu_lock);
6834	}
6835
/* Arch hook for flushing all shadow state of @kvm: zap the whole MMU. */
void kvm_arch_flush_shadow_all(struct kvm *kvm)
{
	kvm_mmu_zap_all(kvm);
}
6840
/*
 * Arch hook for flushing the shadow state of a memslot.  @slot is not
 * consulted: all shadow pages of @kvm are fast-invalidated.
 */
void kvm_arch_flush_shadow_memslot(struct kvm *kvm,
				   struct kvm_memory_slot *slot)
{
	kvm_mmu_zap_all_fast(kvm);
}
6846
6847	void kvm_mmu_invalidate_mmio_sptes(struct kvm *kvm, u64 gen)
6848	{
6849	WARN_ON_ONCE(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
6850
6851	gen &= MMIO_SPTE_GEN_MASK;
6852
6853	/*
6854	* Generation numbers are incremented in multiples of the number of
6855	* address spaces in order to provide unique generations across all
6856	* address spaces. Strip what is effectively the address space
6857	* modifier prior to checking for a wrap of the MMIO generation so
6858	* that a wrap in any address space is detected.
6859	*/
6860	gen &= ~((u64)kvm_arch_nr_memslot_as_ids(kvm) - `1`);
6861
6862	/*
6863	* The very rare case: if the MMIO generation number has wrapped,
6864	* zap all shadow pages.
6865	*/
6866	if (unlikely(gen == `0`)) {
6867	kvm_debug_ratelimited("zapping shadow pages for mmio generation wraparound\n");
6868	kvm_mmu_zap_all_fast(kvm);
6869	}
6870	}
6871
6872	static unsigned long mmu_shrink_scan(struct shrinker *shrink,
6873	struct shrink_control *sc)
6874	{
6875	struct kvm *kvm;
6876	int nr_to_scan = sc->nr_to_scan;
6877	unsigned long freed = `0`;
6878
6879	mutex_lock(&kvm_lock);
6880
6881	list_for_each_entry(kvm, &vm_list, vm_list) {
6882	int idx;
6883	LIST_HEAD(invalid_list);
6884
6885	/*
6886	* Never scan more than sc->nr_to_scan VM instances.
6887	* Will not hit this condition practically since we do not try
6888	* to shrink more than one VM and it is very unlikely to see
6889	* !n_used_mmu_pages so many times.
6890	*/
6891	if (!nr_to_scan--)
6892	break;
6893	/*
6894	* n_used_mmu_pages is accessed without holding kvm->mmu_lock
6895	* here. We may skip a VM instance errorneosly, but we do not
6896	* want to shrink a VM that only started to populate its MMU
6897	* anyway.
6898	*/
6899	if (!kvm->arch.n_used_mmu_pages &&
6900	!kvm_has_zapped_obsolete_pages(kvm))
6901	continue;
6902
6903	idx = srcu_read_lock(ssp: &kvm->srcu);
6904	write_lock(&kvm->mmu_lock);
6905
6906	if (kvm_has_zapped_obsolete_pages(kvm)) {
6907	kvm_mmu_commit_zap_page(kvm,
6908	invalid_list: &kvm->arch.zapped_obsolete_pages);
6909	goto unlock;
6910	}
6911
6912	freed = kvm_mmu_zap_oldest_mmu_pages(kvm, nr_to_zap: sc->nr_to_scan);
6913
6914	unlock:
6915	write_unlock(&kvm->mmu_lock);
6916	srcu_read_unlock(ssp: &kvm->srcu, idx);
6917
6918	/*
6919	* unfair on small ones
6920	* per-vm shrinkers cry out
6921	* sadness comes quickly
6922	*/
6923	list_move_tail(list: &kvm->vm_list, head: &vm_list);
6924	break;
6925	}
6926
6927	mutex_unlock(lock: &kvm_lock);
6928	return freed;
6929	}
6930
6931	static unsigned long mmu_shrink_count(struct shrinker *shrink,
6932	struct shrink_control *sc)
6933	{
6934	return percpu_counter_read_positive(fbc: &kvm_total_used_mmu_pages);
6935	}
6936
/*
 * NOTE(review): presumably the shrinker instance that is wired up to
 * mmu_shrink_count()/mmu_shrink_scan(); its allocation and registration
 * are not visible in this part of the file - confirm.
 */
static struct shrinker *mmu_shrinker;
6938
6939	static void mmu_destroy_caches(void)
6940	{
6941	kmem_cache_destroy(s: pte_list_desc_cache);
6942	kmem_cache_destroy(s: mmu_page_header_cache);
6943	}
6944
6945	static int get_nx_huge_pages(char buffer, const* struct kernel_param *kp)
6946	{
6947	if (nx_hugepage_mitigation_hard_disabled)
6948	return sysfs_emit(buf: buffer, fmt: "never\n");
6949
6950	return param_get_bool(buffer, kp);
6951	}
6952
6953	static bool get_nx_auto_mode(void)
6954	{
6955	/ Return true when CPU has the bug, and mitigations are ON /
6956	return boot_cpu_has_bug(X86_BUG_ITLB_MULTIHIT) && !cpu_mitigations_off();
6957	}
6958
6959	static void __set_nx_huge_pages(bool val)
6960	{
6961	nx_huge_pages = itlb_multihit_kvm_mitigation = val;
6962	}
6963
6964	static int set_nx_huge_pages(const char val, const* struct kernel_param *kp)
6965	{
6966	bool old_val = nx_huge_pages;
6967	bool new_val;
6968
6969	if (nx_hugepage_mitigation_hard_disabled)
6970	return -EPERM;
6971
6972	/ In "auto" mode deploy workaround only if CPU has the bug. /
6973	if (sysfs_streq(s1: val, s2: "off")) {
6974	new_val = `0`;
6975	} else if (sysfs_streq(s1: val, s2: "force")) {
6976	new_val = `1`;
6977	} else if (sysfs_streq(s1: val, s2: "auto")) {
6978	new_val = get_nx_auto_mode();
6979	} else if (sysfs_streq(s1: val, s2: "never")) {
6980	new_val = `0`;
6981
6982	mutex_lock(&kvm_lock);
6983	if (!list_empty(head: &vm_list)) {
6984	mutex_unlock(lock: &kvm_lock);
6985	return -EBUSY;
6986	}
6987	nx_hugepage_mitigation_hard_disabled = true;
6988	mutex_unlock(lock: &kvm_lock);
6989	} else if (kstrtobool(s: val, res: &new_val) < `0`) {
6990	return -EINVAL;
6991	}
6992
6993	__set_nx_huge_pages(val: new_val);
6994
6995	if (new_val != old_val) {
6996	struct kvm *kvm;
6997
6998	mutex_lock(&kvm_lock);
6999
7000	list_for_each_entry(kvm, &vm_list, vm_list) {
7001	mutex_lock(&kvm->slots_lock);
7002	kvm_mmu_zap_all_fast(kvm);
7003	mutex_unlock(lock: &kvm->slots_lock);
7004
7005	wake_up_process(tsk: kvm->arch.nx_huge_page_recovery_thread);
7006	}
7007	mutex_unlock(lock: &kvm_lock);
7008	}
7009
7010	return `0`;
7011	}
7012
7013	/*
7014	* nx_huge_pages needs to be resolved to true/false when kvm.ko is loaded, as
7015	* its default value of -1 is technically undefined behavior for a boolean.
7016	* Forward the module init call to SPTE code so that it too can handle module
7017	* params that need to be resolved/snapshot.
7018	*/
7019	void __init kvm_mmu_x86_module_init(void)
7020	{
7021	if (nx_huge_pages == -`1`)
7022	__set_nx_huge_pages(val: get_nx_auto_mode());
7023
7024	/*
7025	* Snapshot userspace's desire to enable the TDP MMU. Whether or not the
7026	* TDP MMU is actually enabled is determined in kvm_configure_mmu()
7027	* when the vendor module is loaded.
7028	*/
7029	tdp_mmu_allowed = tdp_mmu_enabled;
7030
7031	kvm_mmu_spte_module_init();
7032	}
7033
7034	/*
7035	* The bulk of the MMU initialization is deferred until the vendor module is
7036	* loaded as many of the masks/values may be modified by VMX or SVM, i.e. need
7037	* to be reset when a potentially different vendor module is loaded.
7038	*/
7039	int kvm_mmu_vendor_module_init(void)
7040	{
7041	int ret = -ENOMEM;
7042
7043	/*
7044	* MMU roles use union aliasing which is, generally speaking, an
7045	* undefined behavior. However, we supposedly know how compilers behave
7046	* and the current status quo is unlikely to change. Guardians below are
7047	* supposed to let us know if the assumption becomes false.
7048	*/
7049	BUILD_BUG_ON(sizeof(union kvm_mmu_page_role) != sizeof(u32));
7050	BUILD_BUG_ON(sizeof(union kvm_mmu_extended_role) != sizeof(u32));
7051	BUILD_BUG_ON(sizeof(union kvm_cpu_role) != sizeof(u64));
7052
7053	kvm_mmu_reset_all_pte_masks();
7054
7055	pte_list_desc_cache = KMEM_CACHE(pte_list_desc, SLAB_ACCOUNT);
7056	if (!pte_list_desc_cache)
7057	goto out;
7058
7059	mmu_page_header_cache = kmem_cache_create(name: "kvm_mmu_page_header",
7060	size: sizeof(struct kvm_mmu_page),
7061	align: `0`, SLAB_ACCOUNT, NULL);
7062	if (!mmu_page_header_cache)
7063	goto out;
7064
7065	if (percpu_counter_init(&kvm_total_used_mmu_pages, `0`, GFP_KERNEL))
7066	goto out;
7067
7068	mmu_shrinker = shrinker_alloc(flags: `0`, fmt: "x86-mmu");
7069	if (!mmu_shrinker)
7070	goto out_shrinker;
7071
7072	mmu_shrinker->count_objects = mmu_shrink_count;
7073	mmu_shrinker->scan_objects = mmu_shrink_scan;
7074	mmu_shrinker->seeks = DEFAULT_SEEKS * `10`;
7075
7076	shrinker_register(shrinker: mmu_shrinker);
7077
7078	return `0`;
7079
7080	out_shrinker:
7081	percpu_counter_destroy(fbc: &kvm_total_used_mmu_pages);
7082	out:
7083	mmu_destroy_caches();
7084	return ret;
7085	}
7086
7087	void kvm_mmu_destroy(struct kvm_vcpu *vcpu)
7088	{
7089	kvm_mmu_unload(vcpu);
7090	free_mmu_pages(mmu: &vcpu->arch.root_mmu);
7091	free_mmu_pages(mmu: &vcpu->arch.guest_mmu);
7092	mmu_free_memory_caches(vcpu);
7093	}
7094
7095	void kvm_mmu_vendor_module_exit(void)
7096	{
7097	mmu_destroy_caches();
7098	percpu_counter_destroy(fbc: &kvm_total_used_mmu_pages);
7099	shrinker_free(shrinker: mmu_shrinker);
7100	}
7101
7102	/*
7103	* Calculate the effective recovery period, accounting for '0' meaning "let KVM
7104	* select a halving time of 1 hour". Returns true if recovery is enabled.
7105	*/
7106	static bool calc_nx_huge_pages_recovery_period(uint *period)
7107	{
7108	/*
7109	* Use READ_ONCE to get the params, this may be called outside of the
7110	* param setters, e.g. by the kthread to compute its next timeout.
7111	*/
7112	bool enabled = READ_ONCE(nx_huge_pages);
7113	uint ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
7114
7115	if (!enabled \|\| !ratio)
7116	return false;
7117
7118	*period = READ_ONCE(nx_huge_pages_recovery_period_ms);
7119	if (!*period) {
7120	/ Make sure the period is not less than one second. /
7121	ratio = min(ratio, `3600u`);
7122	period = `60` `60` * `1000` / ratio;
7123	}
7124	return true;
7125	}
7126
7127	static int set_nx_huge_pages_recovery_param(const char val, const* struct kernel_param *kp)
7128	{
7129	bool was_recovery_enabled, is_recovery_enabled;
7130	uint old_period, new_period;
7131	int err;
7132
7133	if (nx_hugepage_mitigation_hard_disabled)
7134	return -EPERM;
7135
7136	was_recovery_enabled = calc_nx_huge_pages_recovery_period(period: &old_period);
7137
7138	err = param_set_uint(val, kp);
7139	if (err)
7140	return err;
7141
7142	is_recovery_enabled = calc_nx_huge_pages_recovery_period(period: &new_period);
7143
7144	if (is_recovery_enabled &&
7145	(!was_recovery_enabled \|\| old_period > new_period)) {
7146	struct kvm *kvm;
7147
7148	mutex_lock(&kvm_lock);
7149
7150	list_for_each_entry(kvm, &vm_list, vm_list)
7151	wake_up_process(tsk: kvm->arch.nx_huge_page_recovery_thread);
7152
7153	mutex_unlock(lock: &kvm_lock);
7154	}
7155
7156	return err;
7157	}
7158
7159	static void kvm_recover_nx_huge_pages(struct kvm *kvm)
7160	{
7161	unsigned long nx_lpage_splits = kvm->stat.nx_lpage_splits;
7162	struct kvm_memory_slot *slot;
7163	int rcu_idx;
7164	struct kvm_mmu_page *sp;
7165	unsigned int ratio;
7166	LIST_HEAD(invalid_list);
7167	bool flush = false;
7168	ulong to_zap;
7169
7170	rcu_idx = srcu_read_lock(ssp: &kvm->srcu);
7171	write_lock(&kvm->mmu_lock);
7172
7173	/*
7174	* Zapping TDP MMU shadow pages, including the remote TLB flush, must
7175	* be done under RCU protection, because the pages are freed via RCU
7176	* callback.
7177	*/
7178	rcu_read_lock();
7179
7180	ratio = READ_ONCE(nx_huge_pages_recovery_ratio);
7181	to_zap = ratio ? DIV_ROUND_UP(nx_lpage_splits, ratio) : `0`;
7182	for ( ; to_zap; --to_zap) {
7183	if (list_empty(head: &kvm->arch.possible_nx_huge_pages))
7184	break;
7185
7186	/*
7187	* We use a separate list instead of just using active_mmu_pages
7188	* because the number of shadow pages that be replaced with an
7189	* NX huge page is expected to be relatively small compared to
7190	* the total number of shadow pages. And because the TDP MMU
7191	* doesn't use active_mmu_pages.
7192	*/
7193	sp = list_first_entry(&kvm->arch.possible_nx_huge_pages,
7194	struct kvm_mmu_page,
7195	possible_nx_huge_page_link);
7196	WARN_ON_ONCE(!sp->nx_huge_page_disallowed);
7197	WARN_ON_ONCE(!sp->role.direct);
7198
7199	/*
7200	* Unaccount and do not attempt to recover any NX Huge Pages
7201	* that are being dirty tracked, as they would just be faulted
7202	* back in as 4KiB pages. The NX Huge Pages in this slot will be
7203	* recovered, along with all the other huge pages in the slot,
7204	* when dirty logging is disabled.
7205	*
7206	* Since gfn_to_memslot() is relatively expensive, it helps to
7207	* skip it if it the test cannot possibly return true. On the
7208	* other hand, if any memslot has logging enabled, chances are
7209	* good that all of them do, in which case unaccount_nx_huge_page()
7210	* is much cheaper than zapping the page.
7211	*
7212	* If a memslot update is in progress, reading an incorrect value
7213	* of kvm->nr_memslots_dirty_logging is not a problem: if it is
7214	* becoming zero, gfn_to_memslot() will be done unnecessarily; if
7215	* it is becoming nonzero, the page will be zapped unnecessarily.
7216	* Either way, this only affects efficiency in racy situations,
7217	* and not correctness.
7218	*/
7219	slot = NULL;
7220	if (atomic_read(v: &kvm->nr_memslots_dirty_logging)) {
7221	struct kvm_memslots *slots;
7222
7223	slots = kvm_memslots_for_spte_role(kvm, sp->role);
7224	slot = __gfn_to_memslot(slots, gfn: sp->gfn);
7225	WARN_ON_ONCE(!slot);
7226	}
7227
7228	if (slot && kvm_slot_dirty_track_enabled(slot))
7229	unaccount_nx_huge_page(kvm, sp);
7230	else if (is_tdp_mmu_page(sp))
7231	flush \|= kvm_tdp_mmu_zap_sp(kvm, sp);
7232	else
7233	kvm_mmu_prepare_zap_page(kvm, sp, invalid_list: &invalid_list);
7234	WARN_ON_ONCE(sp->nx_huge_page_disallowed);
7235
7236	if (need_resched() \|\| rwlock_needbreak(lock: &kvm->mmu_lock)) {
7237	kvm_mmu_remote_flush_or_zap(kvm, invalid_list: &invalid_list, remote_flush: flush);
7238	rcu_read_unlock();
7239
7240	cond_resched_rwlock_write(&kvm->mmu_lock);
7241	flush = false;
7242
7243	rcu_read_lock();
7244	}
7245	}
7246	kvm_mmu_remote_flush_or_zap(kvm, invalid_list: &invalid_list, remote_flush: flush);
7247
7248	rcu_read_unlock();
7249
7250	write_unlock(&kvm->mmu_lock);
7251	srcu_read_unlock(ssp: &kvm->srcu, idx: rcu_idx);
7252	}
7253
7254	static long get_nx_huge_page_recovery_timeout(u64 start_time)
7255	{
7256	bool enabled;
7257	uint period;
7258
7259	enabled = calc_nx_huge_pages_recovery_period(period: &period);
7260
7261	return enabled ? start_time + msecs_to_jiffies(m: period) - get_jiffies_64()
7262	: MAX_SCHEDULE_TIMEOUT;
7263	}
7264
7265	static int kvm_nx_huge_page_recovery_worker(struct kvm *kvm, uintptr_t data)
7266	{
7267	u64 start_time;
7268	long remaining_time;
7269
7270	while (true) {
7271	start_time = get_jiffies_64();
7272	remaining_time = get_nx_huge_page_recovery_timeout(start_time);
7273
7274	set_current_state(TASK_INTERRUPTIBLE);
7275	while (!kthread_should_stop() && remaining_time > `0`) {
7276	schedule_timeout(timeout: remaining_time);
7277	remaining_time = get_nx_huge_page_recovery_timeout(start_time);
7278	set_current_state(TASK_INTERRUPTIBLE);
7279	}
7280
7281	set_current_state(TASK_RUNNING);
7282
7283	if (kthread_should_stop())
7284	return `0`;
7285
7286	kvm_recover_nx_huge_pages(kvm);
7287	}
7288	}
7289
7290	int kvm_mmu_post_init_vm(struct kvm *kvm)
7291	{
7292	int err;
7293
7294	if (nx_hugepage_mitigation_hard_disabled)
7295	return `0`;
7296
7297	err = kvm_vm_create_worker_thread(kvm, thread_fn: kvm_nx_huge_page_recovery_worker, data: `0`,
7298	name: "kvm-nx-lpage-recovery",
7299	thread_ptr: &kvm->arch.nx_huge_page_recovery_thread);
7300	if (!err)
7301	kthread_unpark(k: kvm->arch.nx_huge_page_recovery_thread);
7302
7303	return err;
7304	}
7305
7306	void kvm_mmu_pre_destroy_vm(struct kvm *kvm)
7307	{
7308	if (kvm->arch.nx_huge_page_recovery_thread)
7309	kthread_stop(k: kvm->arch.nx_huge_page_recovery_thread);
7310	}
7311
7312	#ifdef CONFIG_KVM_GENERIC_MEMORY_ATTRIBUTES
/*
 * Called before memory attributes are changed for @range: unmap the range.
 *
 * Returns false, with a one-time warning, if the VM has no private memory;
 * otherwise returns the result of kvm_unmap_gfn_range().
 */
bool kvm_arch_pre_set_memory_attributes(struct kvm *kvm,
					struct kvm_gfn_range *range)
{
	/*
	 * Zap SPTEs even if the slot can't be mapped PRIVATE.  KVM x86 only
	 * supports KVM_MEMORY_ATTRIBUTE_PRIVATE, and so it seems like KVM
	 * can simply ignore such slots.  But if userspace is making memory
	 * PRIVATE, then KVM must prevent the guest from accessing the memory
	 * as shared.  And if userspace is making memory SHARED and this point
	 * is reached, then at least one page within the range was previously
	 * PRIVATE, i.e. the slot's possible hugepage ranges are changing.
	 * Zapping SPTEs in this case ensures KVM will reassess whether or not
	 * a hugepage can be used for affected ranges.
	 */
	if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
		return false;

	return kvm_unmap_gfn_range(kvm, range);
}
7332
7333	static bool hugepage_test_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
7334	int level)
7335	{
7336	return lpage_info_slot(gfn, slot, level)->disallow_lpage & KVM_LPAGE_MIXED_FLAG;
7337	}
7338
7339	static void hugepage_clear_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
7340	int level)
7341	{
7342	lpage_info_slot(gfn, slot, level)->disallow_lpage &= ~KVM_LPAGE_MIXED_FLAG;
7343	}
7344
7345	static void hugepage_set_mixed(struct kvm_memory_slot *slot, gfn_t gfn,
7346	int level)
7347	{
7348	lpage_info_slot(gfn, slot, level)->disallow_lpage \|= KVM_LPAGE_MIXED_FLAG;
7349	}
7350
7351	static bool hugepage_has_attrs(struct kvm kvm, struct* kvm_memory_slot *slot,
7352	gfn_t gfn, int level, unsigned long attrs)
7353	{
7354	const unsigned long start = gfn;
7355	const unsigned long end = start + KVM_PAGES_PER_HPAGE(level);
7356
7357	if (level == PG_LEVEL_2M)
7358	return kvm_range_has_memory_attributes(kvm, start, end, attrs);
7359
7360	for (gfn = start; gfn < end; gfn += KVM_PAGES_PER_HPAGE(level - `1`)) {
7361	if (hugepage_test_mixed(slot, gfn, level: level - `1`) \|\|
7362	attrs != kvm_get_memory_attributes(kvm, gfn))
7363	return false;
7364	}
7365	return true;
7366	}
7367
7368	bool kvm_arch_post_set_memory_attributes(struct kvm *kvm,
7369	struct kvm_gfn_range *range)
7370	{
7371	unsigned long attrs = range->arg.attributes;
7372	struct kvm_memory_slot *slot = range->slot;
7373	int level;
7374
7375	lockdep_assert_held_write(&kvm->mmu_lock);
7376	lockdep_assert_held(&kvm->slots_lock);
7377
7378	/*
7379	* Calculate which ranges can be mapped with hugepages even if the slot
7380	* can't map memory PRIVATE. KVM mustn't create a SHARED hugepage over
7381	* a range that has PRIVATE GFNs, and conversely converting a range to
7382	* SHARED may now allow hugepages.
7383	*/
7384	if (WARN_ON_ONCE(!kvm_arch_has_private_mem(kvm)))
7385	return false;
7386
7387	/*
7388	* The sequence matters here: upper levels consume the result of lower
7389	* level's scanning.
7390	*/
7391	for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
7392	gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
7393	gfn_t gfn = gfn_round_for_level(gfn: range->start, level);
7394
7395	/ Process the head page if it straddles the range. /
7396	if (gfn != range->start \|\| gfn + nr_pages > range->end) {
7397	/*
7398	* Skip mixed tracking if the aligned gfn isn't covered
7399	* by the memslot, KVM can't use a hugepage due to the
7400	* misaligned address regardless of memory attributes.
7401	*/
7402	if (gfn >= slot->base_gfn &&
7403	gfn + nr_pages <= slot->base_gfn + slot->npages) {
7404	if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
7405	hugepage_clear_mixed(slot, gfn, level);
7406	else
7407	hugepage_set_mixed(slot, gfn, level);
7408	}
7409	gfn += nr_pages;
7410	}
7411
7412	/*
7413	* Pages entirely covered by the range are guaranteed to have
7414	* only the attributes which were just set.
7415	*/
7416	for ( ; gfn + nr_pages <= range->end; gfn += nr_pages)
7417	hugepage_clear_mixed(slot, gfn, level);
7418
7419	/*
7420	* Process the last tail page if it straddles the range and is
7421	* contained by the memslot. Like the head page, KVM can't
7422	* create a hugepage if the slot size is misaligned.
7423	*/
7424	if (gfn < range->end &&
7425	(gfn + nr_pages) <= (slot->base_gfn + slot->npages)) {
7426	if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
7427	hugepage_clear_mixed(slot, gfn, level);
7428	else
7429	hugepage_set_mixed(slot, gfn, level);
7430	}
7431	}
7432	return false;
7433	}
7434
7435	void kvm_mmu_init_memslot_memory_attributes(struct kvm *kvm,
7436	struct kvm_memory_slot *slot)
7437	{
7438	int level;
7439
7440	if (!kvm_arch_has_private_mem(kvm))
7441	return;
7442
7443	for (level = PG_LEVEL_2M; level <= KVM_MAX_HUGEPAGE_LEVEL; level++) {
7444	/*
7445	* Don't bother tracking mixed attributes for pages that can't
7446	* be huge due to alignment, i.e. process only pages that are
7447	* entirely contained by the memslot.
7448	*/
7449	gfn_t end = gfn_round_for_level(gfn: slot->base_gfn + slot->npages, level);
7450	gfn_t start = gfn_round_for_level(gfn: slot->base_gfn, level);
7451	gfn_t nr_pages = KVM_PAGES_PER_HPAGE(level);
7452	gfn_t gfn;
7453
7454	if (start < slot->base_gfn)
7455	start += nr_pages;
7456
7457	/*
7458	* Unlike setting attributes, every potential hugepage needs to
7459	* be manually checked as the attributes may already be mixed.
7460	*/
7461	for (gfn = start; gfn < end; gfn += nr_pages) {
7462	unsigned long attrs = kvm_get_memory_attributes(kvm, gfn);
7463
7464	if (hugepage_has_attrs(kvm, slot, gfn, level, attrs))
7465	hugepage_clear_mixed(slot, gfn, level);
7466	else
7467	hugepage_set_mixed(slot, gfn, level);
7468	}
7469	}
7470	}
7471	#endif
7472

/* source code of linux/arch/x86/kvm/mmu/mmu.c */