kvm_main.c source code [linux/virt/kvm/kvm_main.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* Kernel-based Virtual Machine driver for Linux
4	*
5	* This module enables machines with Intel VT-x extensions to run virtual
6	* machines without emulation or binary translation.
7	*
8	* Copyright (C) 2006 Qumranet, Inc.
9	* Copyright 2010 Red Hat, Inc. and/or its affiliates.
10	*
11	* Authors:
12	* Avi Kivity <avi@qumranet.com>
13	* Yaniv Kamay <yaniv@qumranet.com>
14	*/
15
16	#include <kvm/iodev.h>
17
18	#include <linux/kvm_host.h>
19	#include <linux/kvm.h>
20	#include <linux/module.h>
21	#include <linux/errno.h>
22	#include <linux/percpu.h>
23	#include <linux/mm.h>
24	#include <linux/miscdevice.h>
25	#include <linux/vmalloc.h>
26	#include <linux/reboot.h>
27	#include <linux/debugfs.h>
28	#include <linux/highmem.h>
29	#include <linux/file.h>
30	#include <linux/syscore_ops.h>
31	#include <linux/cpu.h>
32	#include <linux/sched/signal.h>
33	#include <linux/sched/mm.h>
34	#include <linux/sched/stat.h>
35	#include <linux/cpumask.h>
36	#include <linux/smp.h>
37	#include <linux/anon_inodes.h>
38	#include <linux/profile.h>
39	#include <linux/kvm_para.h>
40	#include <linux/pagemap.h>
41	#include <linux/mman.h>
42	#include <linux/swap.h>
43	#include <linux/bitops.h>
44	#include <linux/spinlock.h>
45	#include <linux/compat.h>
46	#include <linux/srcu.h>
47	#include <linux/hugetlb.h>
48	#include <linux/slab.h>
49	#include <linux/sort.h>
50	#include <linux/bsearch.h>
51	#include <linux/io.h>
52	#include <linux/lockdep.h>
53	#include <linux/kthread.h>
54	#include <linux/suspend.h>
55
56	#include <asm/processor.h>
57	#include <asm/ioctl.h>
58	#include <linux/uaccess.h>
59
60	#include "coalesced_mmio.h"
61	#include "async_pf.h"
62	#include "kvm_mm.h"
63	#include "vfio.h"
64
65	#include <trace/events/ipi.h>
66
67	#define CREATE_TRACE_POINTS
68	#include <trace/events/kvm.h>
69
70	#include <linux/kvm_dirty_ring.h>
71
72
73	/ Worst case buffer size needed for holding an integer. /
74	#define ITOA_MAX_LEN 12
75
76	MODULE_AUTHOR("Qumranet");
77	MODULE_LICENSE("GPL");
78
79	/ Architectures should define their poll value according to the halt latency /
80	unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
81	module_param(halt_poll_ns, uint, `0644`);
82	EXPORT_SYMBOL_GPL(halt_poll_ns);
83
84	/ Default doubles per-vcpu halt_poll_ns. /
85	unsigned int halt_poll_ns_grow = `2`;
86	module_param(halt_poll_ns_grow, uint, `0644`);
87	EXPORT_SYMBOL_GPL(halt_poll_ns_grow);
88
89	/ The start value to grow halt_poll_ns from /
90	unsigned int halt_poll_ns_grow_start = `10000`; / 10us /
91	module_param(halt_poll_ns_grow_start, uint, `0644`);
92	EXPORT_SYMBOL_GPL(halt_poll_ns_grow_start);
93
94	/ Default resets per-vcpu halt_poll_ns . /
95	unsigned int halt_poll_ns_shrink;
96	module_param(halt_poll_ns_shrink, uint, `0644`);
97	EXPORT_SYMBOL_GPL(halt_poll_ns_shrink);
98
99	/*
100	* Ordering of locks:
101	*
102	* kvm->lock --> kvm->slots_lock --> kvm->irq_lock
103	*/
104
105	DEFINE_MUTEX(kvm_lock);
106	LIST_HEAD(vm_list);
107
108	static struct kmem_cache *kvm_vcpu_cache;
109
110	static __read_mostly struct preempt_ops kvm_preempt_ops;
111	static DEFINE_PER_CPU(struct kvm_vcpu *, kvm_running_vcpu);
112
113	struct dentry *kvm_debugfs_dir;
114	EXPORT_SYMBOL_GPL(kvm_debugfs_dir);
115
116	static const struct file_operations stat_fops_per_vm;
117
118	static struct file_operations kvm_chardev_ops;
119
120	static long kvm_vcpu_ioctl(struct file file, unsigned* int ioctl,
121	unsigned long arg);
122	#ifdef CONFIG_KVM_COMPAT
123	static long kvm_vcpu_compat_ioctl(struct file file, unsigned* int ioctl,
124	unsigned long arg);
125	#define KVM_COMPAT(c) .compat_ioctl = (c)
126	#else
127	/*
128	* For architectures that don't implement a compat infrastructure,
129	* adopt a double line of defense:
130	* - Prevent a compat task from opening /dev/kvm
131	* - If the open has been done by a 64bit task, and the KVM fd
132	* passed to a compat task, let the ioctls fail.
133	*/
134	static long kvm_no_compat_ioctl(struct file file, unsigned* int ioctl,
135	unsigned long arg) { return -EINVAL; }
136
137	static int kvm_no_compat_open(struct inode inode, struct* file *file)
138	{
139	return is_compat_task() ? -ENODEV : `0`;
140	}
141	#define KVM_COMPAT(c) .compat_ioctl = kvm_no_compat_ioctl, \
142	.open = kvm_no_compat_open
143	#endif
/* Forward declarations for helpers defined later in this file. */
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

/* Event types accepted by kvm_uevent_notify_change(). */
#define KVM_EVENT_CREATE_VM 0
#define KVM_EVENT_DESTROY_VM 1
static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm);
static unsigned long long kvm_createvm_count;
static unsigned long long kvm_active_vms;

/* Per-CPU scratch cpumask used to collect the pCPUs that need a kick. */
static DEFINE_PER_CPU(cpumask_var_t, cpu_kick_mask);
156
157	__weak void kvm_arch_guest_memory_reclaimed(struct kvm *kvm)
158	{
159	}
160
161	bool kvm_is_zone_device_page(struct page *page)
162	{
163	/*
164	* The metadata used by is_zone_device_page() to determine whether or
165	* not a page is ZONE_DEVICE is guaranteed to be valid if and only if
166	* the device has been pinned, e.g. by get_user_pages(). WARN if the
167	* page_count() is zero to help detect bad usage of this helper.
168	*/
169	if (WARN_ON_ONCE(!page_count(page)))
170	return false;
171
172	return is_zone_device_page(page);
173	}
174
175	/*
176	* Returns a 'struct page' if the pfn is "valid" and backed by a refcounted
177	* page, NULL otherwise. Note, the list of refcounted PG_reserved page types
178	* is likely incomplete, it has been compiled purely through people wanting to
179	* back guest with a certain type of memory and encountering issues.
180	*/
181	struct page *kvm_pfn_to_refcounted_page(kvm_pfn_t pfn)
182	{
183	struct page *page;
184
185	if (!pfn_valid(pfn))
186	return NULL;
187
188	page = pfn_to_page(pfn);
189	if (!PageReserved(page))
190	return page;
191
192	/ The ZERO_PAGE(s) is marked PG_reserved, but is refcounted. /
193	if (is_zero_pfn(pfn))
194	return page;
195
196	/*
197	* ZONE_DEVICE pages currently set PG_reserved, but from a refcounting
198	* perspective they are "normal" pages, albeit with slightly different
199	* usage rules.
200	*/
201	if (kvm_is_zone_device_page(page))
202	return page;
203
204	return NULL;
205	}
206
207	/*
208	* Switches to specified vcpu, until a matching vcpu_put()
209	*/
210	void vcpu_load(struct kvm_vcpu *vcpu)
211	{
212	int cpu = get_cpu();
213
214	__this_cpu_write(kvm_running_vcpu, vcpu);
215	preempt_notifier_register(notifier: &vcpu->preempt_notifier);
216	kvm_arch_vcpu_load(vcpu, cpu);
217	put_cpu();
218	}
219	EXPORT_SYMBOL_GPL(vcpu_load);
220
221	void vcpu_put(struct kvm_vcpu *vcpu)
222	{
223	preempt_disable();
224	kvm_arch_vcpu_put(vcpu);
225	preempt_notifier_unregister(notifier: &vcpu->preempt_notifier);
226	__this_cpu_write(kvm_running_vcpu, NULL);
227	preempt_enable();
228	}
229	EXPORT_SYMBOL_GPL(vcpu_put);
230
231	/ TODO: merge with kvm_arch_vcpu_should_kick /
232	static bool kvm_request_needs_ipi(struct kvm_vcpu vcpu, unsigned* req)
233	{
234	int mode = kvm_vcpu_exiting_guest_mode(vcpu);
235
236	/*
237	* We need to wait for the VCPU to reenable interrupts and get out of
238	* READING_SHADOW_PAGE_TABLES mode.
239	*/
240	if (req & KVM_REQUEST_WAIT)
241	return mode != OUTSIDE_GUEST_MODE;
242
243	/*
244	* Need to kick a running VCPU, but otherwise there is nothing to do.
245	*/
246	return mode == IN_GUEST_MODE;
247	}
248
/*
 * Intentionally empty callback handed to smp_call_function_many() by
 * kvm_kick_many_cpus(); the argument is unused.
 */
static void ack_kick(void *_completed)
{
}
252
253	static inline bool kvm_kick_many_cpus(struct cpumask *cpus, bool wait)
254	{
255	if (cpumask_empty(srcp: cpus))
256	return false;
257
258	smp_call_function_many(mask: cpus, func: ack_kick, NULL, wait);
259	return true;
260	}
261
262	static void kvm_make_vcpu_request(struct kvm_vcpu vcpu, unsigned* int req,
263	struct cpumask tmp, int* current_cpu)
264	{
265	int cpu;
266
267	if (likely(!(req & KVM_REQUEST_NO_ACTION)))
268	__kvm_make_request(req, vcpu);
269
270	if (!(req & KVM_REQUEST_NO_WAKEUP) && kvm_vcpu_wake_up(vcpu))
271	return;
272
273	/*
274	* Note, the vCPU could get migrated to a different pCPU at any point
275	* after kvm_request_needs_ipi(), which could result in sending an IPI
276	* to the previous pCPU. But, that's OK because the purpose of the IPI
277	* is to ensure the vCPU returns to OUTSIDE_GUEST_MODE, which is
278	* satisfied if the vCPU migrates. Entering READING_SHADOW_PAGE_TABLES
279	* after this point is also OK, as the requirement is only that KVM wait
280	* for vCPUs that were reading SPTEs _before_ any changes were
281	* finalized. See kvm_vcpu_kick() for more details on handling requests.
282	*/
283	if (kvm_request_needs_ipi(vcpu, req)) {
284	cpu = READ_ONCE(vcpu->cpu);
285	if (cpu != -`1` && cpu != current_cpu)
286	__cpumask_set_cpu(cpu, dstp: tmp);
287	}
288	}
289
290	bool kvm_make_vcpus_request_mask(struct kvm kvm, unsigned* int req,
291	unsigned long *vcpu_bitmap)
292	{
293	struct kvm_vcpu *vcpu;
294	struct cpumask *cpus;
295	int i, me;
296	bool called;
297
298	me = get_cpu();
299
300	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
301	cpumask_clear(dstp: cpus);
302
303	for_each_set_bit(i, vcpu_bitmap, KVM_MAX_VCPUS) {
304	vcpu = kvm_get_vcpu(kvm, i);
305	if (!vcpu)
306	continue;
307	kvm_make_vcpu_request(vcpu, req, tmp: cpus, current_cpu: me);
308	}
309
310	called = kvm_kick_many_cpus(cpus, wait: !!(req & KVM_REQUEST_WAIT));
311	put_cpu();
312
313	return called;
314	}
315
316	bool kvm_make_all_cpus_request_except(struct kvm kvm, unsigned* int req,
317	struct kvm_vcpu *except)
318	{
319	struct kvm_vcpu *vcpu;
320	struct cpumask *cpus;
321	unsigned long i;
322	bool called;
323	int me;
324
325	me = get_cpu();
326
327	cpus = this_cpu_cpumask_var_ptr(cpu_kick_mask);
328	cpumask_clear(dstp: cpus);
329
330	kvm_for_each_vcpu(i, vcpu, kvm) {
331	if (vcpu == except)
332	continue;
333	kvm_make_vcpu_request(vcpu, req, tmp: cpus, current_cpu: me);
334	}
335
336	called = kvm_kick_many_cpus(cpus, wait: !!(req & KVM_REQUEST_WAIT));
337	put_cpu();
338
339	return called;
340	}
341
342	bool kvm_make_all_cpus_request(struct kvm kvm, unsigned* int req)
343	{
344	return kvm_make_all_cpus_request_except(kvm, req, NULL);
345	}
346	EXPORT_SYMBOL_GPL(kvm_make_all_cpus_request);
347
348	void kvm_flush_remote_tlbs(struct kvm *kvm)
349	{
350	++kvm->stat.generic.remote_tlb_flush_requests;
351
352	/*
353	* We want to publish modifications to the page tables before reading
354	* mode. Pairs with a memory barrier in arch-specific code.
355	* - x86: smp_mb__after_srcu_read_unlock in vcpu_enter_guest
356	* and smp_mb in walk_shadow_page_lockless_begin/end.
357	* - powerpc: smp_mb in kvmppc_prepare_to_enter.
358	*
359	* There is already an smp_mb__after_atomic() before
360	* kvm_make_all_cpus_request() reads vcpu->mode. We reuse that
361	* barrier here.
362	*/
363	if (!kvm_arch_flush_remote_tlbs(kvm)
364	\|\| kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
365	++kvm->stat.generic.remote_tlb_flush;
366	}
367	EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
368
369	void kvm_flush_remote_tlbs_range(struct kvm *kvm, gfn_t gfn, u64 nr_pages)
370	{
371	if (!kvm_arch_flush_remote_tlbs_range(kvm, gfn, nr_pages))
372	return;
373
374	/*
375	* Fall back to a flushing entire TLBs if the architecture range-based
376	* TLB invalidation is unsupported or can't be performed for whatever
377	* reason.
378	*/
379	kvm_flush_remote_tlbs(kvm);
380	}
381
382	void kvm_flush_remote_tlbs_memslot(struct kvm *kvm,
383	const struct kvm_memory_slot *memslot)
384	{
385	/*
386	* All current use cases for flushing the TLBs for a specific memslot
387	* are related to dirty logging, and many do the TLB flush out of
388	* mmu_lock. The interaction between the various operations on memslot
389	* must be serialized by slots_locks to ensure the TLB flush from one
390	* operation is observed by any other operation on the same memslot.
391	*/
392	lockdep_assert_held(&kvm->slots_lock);
393	kvm_flush_remote_tlbs_range(kvm, gfn: memslot->base_gfn, nr_pages: memslot->npages);
394	}
395
/*
 * Invoke the arch "flush all shadow state" hook, followed by the
 * guest-memory-reclaimed hook.
 */
static void kvm_flush_shadow_all(struct kvm *kvm)
{
	kvm_arch_flush_shadow_all(kvm);
	kvm_arch_guest_memory_reclaimed(kvm);
}
401
402	#ifdef KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE
403	static inline void mmu_memory_cache_alloc_obj(struct* kvm_mmu_memory_cache *mc,
404	gfp_t gfp_flags)
405	{
406	gfp_flags \|= mc->gfp_zero;
407
408	if (mc->kmem_cache)
409	return kmem_cache_alloc(cachep: mc->kmem_cache, flags: gfp_flags);
410	else
411	return (void *)__get_free_page(gfp_flags);
412	}
413
414	int __kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache mc, int* capacity, int min)
415	{
416	gfp_t gfp = mc->gfp_custom ? mc->gfp_custom : GFP_KERNEL_ACCOUNT;
417	void *obj;
418
419	if (mc->nobjs >= min)
420	return `0`;
421
422	if (unlikely(!mc->objects)) {
423	if (WARN_ON_ONCE(!capacity))
424	return -EIO;
425
426	mc->objects = kvmalloc_array(n: sizeof(void *), size: capacity, flags: gfp);
427	if (!mc->objects)
428	return -ENOMEM;
429
430	mc->capacity = capacity;
431	}
432
433	/ It is illegal to request a different capacity across topups. /
434	if (WARN_ON_ONCE(mc->capacity != capacity))
435	return -EIO;
436
437	while (mc->nobjs < mc->capacity) {
438	obj = mmu_memory_cache_alloc_obj(mc, gfp_flags: gfp);
439	if (!obj)
440	return mc->nobjs >= min ? `0` : -ENOMEM;
441	mc->objects[mc->nobjs++] = obj;
442	}
443	return `0`;
444	}
445
446	int kvm_mmu_topup_memory_cache(struct kvm_mmu_memory_cache mc, int* min)
447	{
448	return __kvm_mmu_topup_memory_cache(mc, KVM_ARCH_NR_OBJS_PER_MEMORY_CACHE, min);
449	}
450
451	int kvm_mmu_memory_cache_nr_free_objects(struct kvm_mmu_memory_cache *mc)
452	{
453	return mc->nobjs;
454	}
455
456	void kvm_mmu_free_memory_cache(struct kvm_mmu_memory_cache *mc)
457	{
458	while (mc->nobjs) {
459	if (mc->kmem_cache)
460	kmem_cache_free(s: mc->kmem_cache, objp: mc->objects[--mc->nobjs]);
461	else
462	free_page((unsigned long)mc->objects[--mc->nobjs]);
463	}
464
465	kvfree(addr: mc->objects);
466
467	mc->objects = NULL;
468	mc->capacity = `0`;
469	}
470
471	void kvm_mmu_memory_cache_alloc(struct* kvm_mmu_memory_cache *mc)
472	{
473	void *p;
474
475	if (WARN_ON(!mc->nobjs))
476	p = mmu_memory_cache_alloc_obj(mc, GFP_ATOMIC \| __GFP_ACCOUNT);
477	else
478	p = mc->objects[--mc->nobjs];
479	BUG_ON(!p);
480	return p;
481	}
482	#endif
483
484	static void kvm_vcpu_init(struct kvm_vcpu vcpu, struct* kvm kvm, unsigned* id)
485	{
486	mutex_init(&vcpu->mutex);
487	vcpu->cpu = -`1`;
488	vcpu->kvm = kvm;
489	vcpu->vcpu_id = id;
490	vcpu->pid = NULL;
491	#ifndef __KVM_HAVE_ARCH_WQP
492	rcuwait_init(w: &vcpu->wait);
493	#endif
494	kvm_async_pf_vcpu_init(vcpu);
495
496	kvm_vcpu_set_in_spin_loop(vcpu, val: false);
497	kvm_vcpu_set_dy_eligible(vcpu, val: false);
498	vcpu->preempted = false;
499	vcpu->ready = false;
500	preempt_notifier_init(notifier: &vcpu->preempt_notifier, ops: &kvm_preempt_ops);
501	vcpu->last_used_slot = NULL;
502
503	/ Fill the stats id string for the vcpu /
504	snprintf(buf: vcpu->stats_id, size: sizeof(vcpu->stats_id), fmt: "kvm-%d/vcpu-%d",
505	task_pid_nr(current), id);
506	}
507
508	static void kvm_vcpu_destroy(struct kvm_vcpu *vcpu)
509	{
510	kvm_arch_vcpu_destroy(vcpu);
511	kvm_dirty_ring_free(ring: &vcpu->dirty_ring);
512
513	/*
514	* No need for rcu_read_lock as VCPU_RUN is the only place that changes
515	* the vcpu->pid pointer, and at destruction time all file descriptors
516	* are already gone.
517	*/
518	put_pid(rcu_dereference_protected(vcpu->pid, `1`));
519
520	free_page((unsigned long)vcpu->run);
521	kmem_cache_free(s: kvm_vcpu_cache, objp: vcpu);
522	}
523
524	void kvm_destroy_vcpus(struct kvm *kvm)
525	{
526	unsigned long i;
527	struct kvm_vcpu *vcpu;
528
529	kvm_for_each_vcpu(i, vcpu, kvm) {
530	kvm_vcpu_destroy(vcpu);
531	xa_erase(&kvm->vcpu_array, index: i);
532	}
533
534	atomic_set(v: &kvm->online_vcpus, i: `0`);
535	}
536	EXPORT_SYMBOL_GPL(kvm_destroy_vcpus);
537
538	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
539	static inline struct kvm mmu_notifier_to_kvm(struct* mmu_notifier *mn)
540	{
541	return container_of(mn, struct kvm, mmu_notifier);
542	}
543
544	typedef bool (hva_handler_t)(struct* kvm kvm, struct* kvm_gfn_range *range);
545
546	typedef void (on_lock_fn_t)(struct* kvm kvm, unsigned* long start,
547	unsigned long end);
548
549	typedef void (on_unlock_fn_t)(struct* kvm *kvm);
550
551	struct kvm_hva_range {
552	unsigned long start;
553	unsigned long end;
554	union kvm_mmu_notifier_arg arg;
555	hva_handler_t handler;
556	on_lock_fn_t on_lock;
557	on_unlock_fn_t on_unlock;
558	bool flush_on_ret;
559	bool may_block;
560	};
561
/*
 * Callers that have no callback function/handler pass this dedicated stub
 * rather than NULL.  The compiler technically can't guarantee that a real
 * function will have a non-zero address, and so it will generate code to
 * check for !NULL, whereas comparing against a stub will be elided at compile
 * time (unless the compiler is getting long in the tooth, e.g. gcc 4.9).
 */
static void kvm_null_fn(void)
{

}
/* True if @fn is the "no callback" stub above. */
#define IS_KVM_NULL_FN(fn) ((fn) == (void *)kvm_null_fn)
574
575	static const union kvm_mmu_notifier_arg KVM_MMU_NOTIFIER_NO_ARG;
576
577	/ Iterate over each memslot intersecting [start, last] (inclusive) range /
578	#define kvm_for_each_memslot_in_hva_range(node, slots, start, last) \
579	for (node = interval_tree_iter_first(&slots->hva_tree, start, last); \
580	node; \
581	node = interval_tree_iter_next(node, start, last)) \
582
583	static __always_inline int __kvm_handle_hva_range(struct kvm *kvm,
584	const struct kvm_hva_range *range)
585	{
586	bool ret = false, locked = false;
587	struct kvm_gfn_range gfn_range;
588	struct kvm_memory_slot *slot;
589	struct kvm_memslots *slots;
590	int i, idx;
591
592	if (WARN_ON_ONCE(range->end <= range->start))
593	return `0`;
594
595	/ A null handler is allowed if and only if on_lock() is provided. /
596	if (WARN_ON_ONCE(IS_KVM_NULL_FN(range->on_lock) &&
597	IS_KVM_NULL_FN(range->handler)))
598	return `0`;
599
600	idx = srcu_read_lock(ssp: &kvm->srcu);
601
602	for (i = `0`; i < KVM_ADDRESS_SPACE_NUM; i++) {
603	struct interval_tree_node *node;
604
605	slots = __kvm_memslots(kvm, as_id: i);
606	kvm_for_each_memslot_in_hva_range(node, slots,
607	range->start, range->end - `1`) {
608	unsigned long hva_start, hva_end;
609
610	slot = container_of(node, struct kvm_memory_slot, hva_node[slots->node_idx]);
611	hva_start = max(range->start, slot->userspace_addr);
612	hva_end = min(range->end, slot->userspace_addr +
613	(slot->npages << PAGE_SHIFT));
614
615	/*
616	* To optimize for the likely case where the address
617	* range is covered by zero or one memslots, don't
618	* bother making these conditional (to avoid writes on
619	* the second or later invocation of the handler).
620	*/
621	gfn_range.arg = range->arg;
622	gfn_range.may_block = range->may_block;
623
624	/*
625	* {gfn(page) \| page intersects with [hva_start, hva_end)} =
626	* {gfn_start, gfn_start+1, ..., gfn_end-1}.
627	*/
628	gfn_range.start = hva_to_gfn_memslot(hva: hva_start, slot);
629	gfn_range.end = hva_to_gfn_memslot(hva: hva_end + PAGE_SIZE - `1`, slot);
630	gfn_range.slot = slot;
631
632	if (!locked) {
633	locked = true;
634	KVM_MMU_LOCK(kvm);
635	if (!IS_KVM_NULL_FN(range->on_lock))
636	range->on_lock(kvm, range->start, range->end);
637	if (IS_KVM_NULL_FN(range->handler))
638	break;
639	}
640	ret \|= range->handler(kvm, &gfn_range);
641	}
642	}
643
644	if (range->flush_on_ret && ret)
645	kvm_flush_remote_tlbs(kvm);
646
647	if (locked) {
648	KVM_MMU_UNLOCK(kvm);
649	if (!IS_KVM_NULL_FN(range->on_unlock))
650	range->on_unlock(kvm);
651	}
652
653	srcu_read_unlock(ssp: &kvm->srcu, idx);
654
655	/ The notifiers are averse to booleans. :-( /
656	return (int)ret;
657	}
658
659	static __always_inline int kvm_handle_hva_range(struct mmu_notifier *mn,
660	unsigned long start,
661	unsigned long end,
662	union kvm_mmu_notifier_arg arg,
663	hva_handler_t handler)
664	{
665	struct kvm *kvm = mmu_notifier_to_kvm(mn);
666	const struct kvm_hva_range range = {
667	.start = start,
668	.end = end,
669	.arg = arg,
670	.handler = handler,
671	.on_lock = (void *)kvm_null_fn,
672	.on_unlock = (void *)kvm_null_fn,
673	.flush_on_ret = true,
674	.may_block = false,
675	};
676
677	return __kvm_handle_hva_range(kvm, range: &range);
678	}
679
680	static __always_inline int kvm_handle_hva_range_no_flush(struct mmu_notifier *mn,
681	unsigned long start,
682	unsigned long end,
683	hva_handler_t handler)
684	{
685	struct kvm *kvm = mmu_notifier_to_kvm(mn);
686	const struct kvm_hva_range range = {
687	.start = start,
688	.end = end,
689	.handler = handler,
690	.on_lock = (void *)kvm_null_fn,
691	.on_unlock = (void *)kvm_null_fn,
692	.flush_on_ret = false,
693	.may_block = false,
694	};
695
696	return __kvm_handle_hva_range(kvm, range: &range);
697	}
698
699	static bool kvm_change_spte_gfn(struct kvm kvm, struct* kvm_gfn_range *range)
700	{
701	/*
702	* Skipping invalid memslots is correct if and only change_pte() is
703	* surrounded by invalidate_range_{start,end}(), which is currently
704	* guaranteed by the primary MMU. If that ever changes, KVM needs to
705	* unmap the memslot instead of skipping the memslot to ensure that KVM
706	* doesn't hold references to the old PFN.
707	*/
708	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
709
710	if (range->slot->flags & KVM_MEMSLOT_INVALID)
711	return false;
712
713	return kvm_set_spte_gfn(kvm, range);
714	}
715
716	static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
717	struct mm_struct *mm,
718	unsigned long address,
719	pte_t pte)
720	{
721	struct kvm *kvm = mmu_notifier_to_kvm(mn);
722	const union kvm_mmu_notifier_arg arg = { .pte = pte };
723
724	trace_kvm_set_spte_hva(hva: address);
725
726	/*
727	* .change_pte() must be surrounded by .invalidate_range_{start,end}().
728	* If mmu_invalidate_in_progress is zero, then no in-progress
729	* invalidations, including this one, found a relevant memslot at
730	* start(); rechecking memslots here is unnecessary. Note, a false
731	* positive (count elevated by a different invalidation) is sub-optimal
732	* but functionally ok.
733	*/
734	WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
735	if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
736	return;
737
738	kvm_handle_hva_range(mn, start: address, end: address + `1`, arg, handler: kvm_change_spte_gfn);
739	}
740
741	void kvm_mmu_invalidate_begin(struct kvm kvm, unsigned* long start,
742	unsigned long end)
743	{
744	/*
745	* The count increase must become visible at unlock time as no
746	* spte can be established without taking the mmu_lock and
747	* count is also read inside the mmu_lock critical section.
748	*/
749	kvm->mmu_invalidate_in_progress++;
750	if (likely(kvm->mmu_invalidate_in_progress == `1`)) {
751	kvm->mmu_invalidate_range_start = start;
752	kvm->mmu_invalidate_range_end = end;
753	} else {
754	/*
755	* Fully tracking multiple concurrent ranges has diminishing
756	* returns. Keep things simple and just find the minimal range
757	* which includes the current and new ranges. As there won't be
758	* enough information to subtract a range after its invalidate
759	* completes, any ranges invalidated concurrently will
760	* accumulate and persist until all outstanding invalidates
761	* complete.
762	*/
763	kvm->mmu_invalidate_range_start =
764	min(kvm->mmu_invalidate_range_start, start);
765	kvm->mmu_invalidate_range_end =
766	max(kvm->mmu_invalidate_range_end, end);
767	}
768	}
769
770	static int kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
771	const struct mmu_notifier_range *range)
772	{
773	struct kvm *kvm = mmu_notifier_to_kvm(mn);
774	const struct kvm_hva_range hva_range = {
775	.start = range->start,
776	.end = range->end,
777	.handler = kvm_unmap_gfn_range,
778	.on_lock = kvm_mmu_invalidate_begin,
779	.on_unlock = kvm_arch_guest_memory_reclaimed,
780	.flush_on_ret = true,
781	.may_block = mmu_notifier_range_blockable(range),
782	};
783
784	trace_kvm_unmap_hva_range(start: range->start, end: range->end);
785
786	/*
787	* Prevent memslot modification between range_start() and range_end()
788	* so that conditionally locking provides the same result in both
789	* functions. Without that guarantee, the mmu_invalidate_in_progress
790	* adjustments will be imbalanced.
791	*
792	* Pairs with the decrement in range_end().
793	*/
794	spin_lock(lock: &kvm->mn_invalidate_lock);
795	kvm->mn_active_invalidate_count++;
796	spin_unlock(lock: &kvm->mn_invalidate_lock);
797
798	/*
799	* Invalidate pfn caches _before_ invalidating the secondary MMUs, i.e.
800	* before acquiring mmu_lock, to avoid holding mmu_lock while acquiring
801	* each cache's lock. There are relatively few caches in existence at
802	* any given time, and the caches themselves can check for hva overlap,
803	* i.e. don't need to rely on memslot overlap checks for performance.
804	* Because this runs without holding mmu_lock, the pfn caches must use
805	* mn_active_invalidate_count (see above) instead of
806	* mmu_invalidate_in_progress.
807	*/
808	gfn_to_pfn_cache_invalidate_start(kvm, start: range->start, end: range->end,
809	may_block: hva_range.may_block);
810
811	__kvm_handle_hva_range(kvm, range: &hva_range);
812
813	return `0`;
814	}
815
816	void kvm_mmu_invalidate_end(struct kvm kvm, unsigned* long start,
817	unsigned long end)
818	{
819	/*
820	* This sequence increase will notify the kvm page fault that
821	* the page that is going to be mapped in the spte could have
822	* been freed.
823	*/
824	kvm->mmu_invalidate_seq++;
825	smp_wmb();
826	/*
827	* The above sequence increase must be visible before the
828	* below count decrease, which is ensured by the smp_wmb above
829	* in conjunction with the smp_rmb in mmu_invalidate_retry().
830	*/
831	kvm->mmu_invalidate_in_progress--;
832	}
833
834	static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
835	const struct mmu_notifier_range *range)
836	{
837	struct kvm *kvm = mmu_notifier_to_kvm(mn);
838	const struct kvm_hva_range hva_range = {
839	.start = range->start,
840	.end = range->end,
841	.handler = (void *)kvm_null_fn,
842	.on_lock = kvm_mmu_invalidate_end,
843	.on_unlock = (void *)kvm_null_fn,
844	.flush_on_ret = false,
845	.may_block = mmu_notifier_range_blockable(range),
846	};
847	bool wake;
848
849	__kvm_handle_hva_range(kvm, range: &hva_range);
850
851	/ Pairs with the increment in range_start(). /
852	spin_lock(lock: &kvm->mn_invalidate_lock);
853	wake = (--kvm->mn_active_invalidate_count == `0`);
854	spin_unlock(lock: &kvm->mn_invalidate_lock);
855
856	/*
857	* There can only be one waiter, since the wait happens under
858	* slots_lock.
859	*/
860	if (wake)
861	rcuwait_wake_up(w: &kvm->mn_memslots_update_rcuwait);
862
863	BUG_ON(kvm->mmu_invalidate_in_progress < `0`);
864	}
865
866	static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
867	struct mm_struct *mm,
868	unsigned long start,
869	unsigned long end)
870	{
871	trace_kvm_age_hva(start, end);
872
873	return kvm_handle_hva_range(mn, start, end, arg: KVM_MMU_NOTIFIER_NO_ARG,
874	handler: kvm_age_gfn);
875	}
876
877	static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
878	struct mm_struct *mm,
879	unsigned long start,
880	unsigned long end)
881	{
882	trace_kvm_age_hva(start, end);
883
884	/*
885	* Even though we do not flush TLB, this will still adversely
886	* affect performance on pre-Haswell Intel EPT, where there is
887	* no EPT Access Bit to clear so that we have to tear down EPT
888	* tables instead. If we find this unacceptable, we can always
889	* add a parameter to kvm_age_hva so that it effectively doesn't
890	* do anything on clear_young.
891	*
892	* Also note that currently we never issue secondary TLB flushes
893	* from clear_young, leaving this job up to the regular system
894	* cadence. If we find this inaccurate, we might come up with a
895	* more sophisticated heuristic later.
896	*/
897	return kvm_handle_hva_range_no_flush(mn, start, end, handler: kvm_age_gfn);
898	}
899
900	static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
901	struct mm_struct *mm,
902	unsigned long address)
903	{
904	trace_kvm_test_age_hva(hva: address);
905
906	return kvm_handle_hva_range_no_flush(mn, start: address, end: address + `1`,
907	handler: kvm_test_age_gfn);
908	}
909
910	static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
911	struct mm_struct *mm)
912	{
913	struct kvm *kvm = mmu_notifier_to_kvm(mn);
914	int idx;
915
916	idx = srcu_read_lock(ssp: &kvm->srcu);
917	kvm_flush_shadow_all(kvm);
918	srcu_read_unlock(ssp: &kvm->srcu, idx);
919	}
920
921	static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
922	.invalidate_range_start = kvm_mmu_notifier_invalidate_range_start,
923	.invalidate_range_end = kvm_mmu_notifier_invalidate_range_end,
924	.clear_flush_young = kvm_mmu_notifier_clear_flush_young,
925	.clear_young = kvm_mmu_notifier_clear_young,
926	.test_young = kvm_mmu_notifier_test_young,
927	.change_pte = kvm_mmu_notifier_change_pte,
928	.release = kvm_mmu_notifier_release,
929	};
930
931	static int kvm_init_mmu_notifier(struct kvm *kvm)
932	{
933	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
934	return mmu_notifier_register(subscription: &kvm->mmu_notifier, current->mm);
935	}
936
937	#else /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */
938
/* Stub for builds without MMU notifier support: nothing to set up. */
static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}
943
944	#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */
945
946	#ifdef CONFIG_HAVE_KVM_PM_NOTIFIER
947	static int kvm_pm_notifier_call(struct notifier_block *bl,
948	unsigned long state,
949	void *unused)
950	{
951	struct kvm kvm = container_of(bl, struct* kvm, pm_notifier);
952
953	return kvm_arch_pm_notifier(kvm, state);
954	}
955
956	static void kvm_init_pm_notifier(struct kvm *kvm)
957	{
958	kvm->pm_notifier.notifier_call = kvm_pm_notifier_call;
959	/ Suspend KVM before we suspend ftrace, RCU, etc. /
960	kvm->pm_notifier.priority = INT_MAX;
961	register_pm_notifier(nb: &kvm->pm_notifier);
962	}
963
964	static void kvm_destroy_pm_notifier(struct kvm *kvm)
965	{
966	unregister_pm_notifier(nb: &kvm->pm_notifier);
967	}
968	#else /* !CONFIG_HAVE_KVM_PM_NOTIFIER */
/* No-op variant used when CONFIG_HAVE_KVM_PM_NOTIFIER is not set. */
static void kvm_init_pm_notifier(struct kvm *kvm)
{
}
972
/* No-op variant used when CONFIG_HAVE_KVM_PM_NOTIFIER is not set. */
static void kvm_destroy_pm_notifier(struct kvm *kvm)
{
}
976	#endif /* CONFIG_HAVE_KVM_PM_NOTIFIER */
977
978	static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
979	{
980	if (!memslot->dirty_bitmap)
981	return;
982
983	kvfree(addr: memslot->dirty_bitmap);
984	memslot->dirty_bitmap = NULL;
985	}
986
/*
 * Free @slot together with its dirty bitmap and arch-specific state.
 *
 * This does not remove the slot from struct kvm_memslots data structures
 */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *slot)
{
	kvm_destroy_dirty_bitmap(slot);

	kvm_arch_free_memslot(kvm, slot);

	kfree(slot);
}
996
997	static void kvm_free_memslots(struct kvm kvm, struct* kvm_memslots *slots)
998	{
999	struct hlist_node *idnode;
1000	struct kvm_memory_slot *memslot;
1001	int bkt;
1002
1003	/*
1004	* The same memslot objects live in both active and inactive sets,
1005	* arbitrarily free using index '1' so the second invocation of this
1006	* function isn't operating over a structure with dangling pointers
1007	* (even though this function isn't actually touching them).
1008	*/
1009	if (!slots->node_idx)
1010	return;
1011
1012	hash_for_each_safe(slots->id_hash, bkt, idnode, memslot, id_node[`1`])
1013	kvm_free_memslot(kvm, slot: memslot);
1014	}
1015
1016	static umode_t kvm_stats_debugfs_mode(const struct _kvm_stats_desc *pdesc)
1017	{
1018	switch (pdesc->desc.flags & KVM_STATS_TYPE_MASK) {
1019	case KVM_STATS_TYPE_INSTANT:
1020	return `0444`;
1021	case KVM_STATS_TYPE_CUMULATIVE:
1022	case KVM_STATS_TYPE_PEAK:
1023	default:
1024	return `0644`;
1025	}
1026	}
1027
1028
1029	static void kvm_destroy_vm_debugfs(struct kvm *kvm)
1030	{
1031	int i;
1032	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1033	kvm_vcpu_stats_header.num_desc;
1034
1035	if (IS_ERR(ptr: kvm->debugfs_dentry))
1036	return;
1037
1038	debugfs_remove_recursive(dentry: kvm->debugfs_dentry);
1039
1040	if (kvm->debugfs_stat_data) {
1041	for (i = `0`; i < kvm_debugfs_num_entries; i++)
1042	kfree(objp: kvm->debugfs_stat_data[i]);
1043	kfree(objp: kvm->debugfs_stat_data);
1044	}
1045	}
1046
1047	static int kvm_create_vm_debugfs(struct kvm kvm, const* char *fdname)
1048	{
1049	static DEFINE_MUTEX(kvm_debugfs_lock);
1050	struct dentry *dent;
1051	char dir_name[ITOA_MAX_LEN * `2`];
1052	struct kvm_stat_data *stat_data;
1053	const struct _kvm_stats_desc *pdesc;
1054	int i, ret = -ENOMEM;
1055	int kvm_debugfs_num_entries = kvm_vm_stats_header.num_desc +
1056	kvm_vcpu_stats_header.num_desc;
1057
1058	if (!debugfs_initialized())
1059	return `0`;
1060
1061	snprintf(buf: dir_name, size: sizeof(dir_name), fmt: "%d-%s", task_pid_nr(current), fdname);
1062	mutex_lock(&kvm_debugfs_lock);
1063	dent = debugfs_lookup(name: dir_name, parent: kvm_debugfs_dir);
1064	if (dent) {
1065	pr_warn_ratelimited("KVM: debugfs: duplicate directory %s\n", dir_name);
1066	dput(dent);
1067	mutex_unlock(lock: &kvm_debugfs_lock);
1068	return `0`;
1069	}
1070	dent = debugfs_create_dir(name: dir_name, parent: kvm_debugfs_dir);
1071	mutex_unlock(lock: &kvm_debugfs_lock);
1072	if (IS_ERR(ptr: dent))
1073	return `0`;
1074
1075	kvm->debugfs_dentry = dent;
1076	kvm->debugfs_stat_data = kcalloc(n: kvm_debugfs_num_entries,
1077	size: sizeof(*kvm->debugfs_stat_data),
1078	GFP_KERNEL_ACCOUNT);
1079	if (!kvm->debugfs_stat_data)
1080	goto out_err;
1081
1082	for (i = `0`; i < kvm_vm_stats_header.num_desc; ++i) {
1083	pdesc = &kvm_vm_stats_desc[i];
1084	stat_data = kzalloc(size: sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1085	if (!stat_data)
1086	goto out_err;
1087
1088	stat_data->kvm = kvm;
1089	stat_data->desc = pdesc;
1090	stat_data->kind = KVM_STAT_VM;
1091	kvm->debugfs_stat_data[i] = stat_data;
1092	debugfs_create_file(name: pdesc->name, mode: kvm_stats_debugfs_mode(pdesc),
1093	parent: kvm->debugfs_dentry, data: stat_data,
1094	fops: &stat_fops_per_vm);
1095	}
1096
1097	for (i = `0`; i < kvm_vcpu_stats_header.num_desc; ++i) {
1098	pdesc = &kvm_vcpu_stats_desc[i];
1099	stat_data = kzalloc(size: sizeof(*stat_data), GFP_KERNEL_ACCOUNT);
1100	if (!stat_data)
1101	goto out_err;
1102
1103	stat_data->kvm = kvm;
1104	stat_data->desc = pdesc;
1105	stat_data->kind = KVM_STAT_VCPU;
1106	kvm->debugfs_stat_data[i + kvm_vm_stats_header.num_desc] = stat_data;
1107	debugfs_create_file(name: pdesc->name, mode: kvm_stats_debugfs_mode(pdesc),
1108	parent: kvm->debugfs_dentry, data: stat_data,
1109	fops: &stat_fops_per_vm);
1110	}
1111
1112	ret = kvm_arch_create_vm_debugfs(kvm);
1113	if (ret)
1114	goto out_err;
1115
1116	return `0`;
1117	out_err:
1118	kvm_destroy_vm_debugfs(kvm);
1119	return ret;
1120	}
1121
1122	/*
1123	* Called after the VM is otherwise initialized, but just before adding it to
1124	* the vm_list.
1125	*/
1126	int __weak kvm_arch_post_init_vm(struct kvm *kvm)
1127	{
1128	return `0`;
1129	}
1130
1131	/*
1132	* Called just after removing the VM from the vm_list, but before doing any
1133	* other destruction.
1134	*/
1135	void __weak kvm_arch_pre_destroy_vm(struct kvm *kvm)
1136	{
1137	}
1138
1139	/*
1140	* Called after per-vm debugfs created. When called kvm->debugfs_dentry should
1141	* be setup already, so we can create arch-specific debugfs entries under it.
1142	* Cleanup should be automatic done in kvm_destroy_vm_debugfs() recursively, so
1143	* a per-arch destroy interface is not needed.
1144	*/
1145	int __weak kvm_arch_create_vm_debugfs(struct kvm *kvm)
1146	{
1147	return `0`;
1148	}
1149
1150	static struct kvm kvm_create_vm(unsigned* long type, const char *fdname)
1151	{
1152	struct kvm *kvm = kvm_arch_alloc_vm();
1153	struct kvm_memslots *slots;
1154	int r = -ENOMEM;
1155	int i, j;
1156
1157	if (!kvm)
1158	return ERR_PTR(error: -ENOMEM);
1159
1160	/ KVM is pinned via open("/dev/kvm"), the fd passed to this ioctl(). /
1161	__module_get(module: kvm_chardev_ops.owner);
1162
1163	KVM_MMU_LOCK_INIT(kvm);
1164	mmgrab(current->mm);
1165	kvm->mm = current->mm;
1166	kvm_eventfd_init(kvm);
1167	mutex_init(&kvm->lock);
1168	mutex_init(&kvm->irq_lock);
1169	mutex_init(&kvm->slots_lock);
1170	mutex_init(&kvm->slots_arch_lock);
1171	spin_lock_init(&kvm->mn_invalidate_lock);
1172	rcuwait_init(w: &kvm->mn_memslots_update_rcuwait);
1173	xa_init(xa: &kvm->vcpu_array);
1174
1175	INIT_LIST_HEAD(list: &kvm->gpc_list);
1176	spin_lock_init(&kvm->gpc_lock);
1177
1178	INIT_LIST_HEAD(list: &kvm->devices);
1179	kvm->max_vcpus = KVM_MAX_VCPUS;
1180
1181	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);
1182
1183	/*
1184	* Force subsequent debugfs file creations to fail if the VM directory
1185	* is not created (by kvm_create_vm_debugfs()).
1186	*/
1187	kvm->debugfs_dentry = ERR_PTR(error: -ENOENT);
1188
1189	snprintf(buf: kvm->stats_id, size: sizeof(kvm->stats_id), fmt: "kvm-%d",
1190	task_pid_nr(current));
1191
1192	if (init_srcu_struct(&kvm->srcu))
1193	goto out_err_no_srcu;
1194	if (init_srcu_struct(&kvm->irq_srcu))
1195	goto out_err_no_irq_srcu;
1196
1197	refcount_set(r: &kvm->users_count, n: `1`);
1198	for (i = `0`; i < KVM_ADDRESS_SPACE_NUM; i++) {
1199	for (j = `0`; j < `2`; j++) {
1200	slots = &kvm->__memslots[i][j];
1201
1202	atomic_long_set(v: &slots->last_used_slot, i: (unsigned long)NULL);
1203	slots->hva_tree = RB_ROOT_CACHED;
1204	slots->gfn_tree = RB_ROOT;
1205	hash_init(slots->id_hash);
1206	slots->node_idx = j;
1207
1208	/ Generations must be different for each address space. /
1209	slots->generation = i;
1210	}
1211
1212	rcu_assign_pointer(kvm->memslots[i], &kvm->__memslots[i][`0`]);
1213	}
1214
1215	for (i = `0`; i < KVM_NR_BUSES; i++) {
1216	rcu_assign_pointer(kvm->buses[i],
1217	kzalloc(sizeof(struct kvm_io_bus), GFP_KERNEL_ACCOUNT));
1218	if (!kvm->buses[i])
1219	goto out_err_no_arch_destroy_vm;
1220	}
1221
1222	r = kvm_arch_init_vm(kvm, type);
1223	if (r)
1224	goto out_err_no_arch_destroy_vm;
1225
1226	r = hardware_enable_all();
1227	if (r)
1228	goto out_err_no_disable;
1229
1230	#ifdef CONFIG_HAVE_KVM_IRQFD
1231	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
1232	#endif
1233
1234	r = kvm_init_mmu_notifier(kvm);
1235	if (r)
1236	goto out_err_no_mmu_notifier;
1237
1238	r = kvm_coalesced_mmio_init(kvm);
1239	if (r < `0`)
1240	goto out_no_coalesced_mmio;
1241
1242	r = kvm_create_vm_debugfs(kvm, fdname);
1243	if (r)
1244	goto out_err_no_debugfs;
1245
1246	r = kvm_arch_post_init_vm(kvm);
1247	if (r)
1248	goto out_err;
1249
1250	mutex_lock(&kvm_lock);
1251	list_add(new: &kvm->vm_list, head: &vm_list);
1252	mutex_unlock(lock: &kvm_lock);
1253
1254	preempt_notifier_inc();
1255	kvm_init_pm_notifier(kvm);
1256
1257	return kvm;
1258
1259	out_err:
1260	kvm_destroy_vm_debugfs(kvm);
1261	out_err_no_debugfs:
1262	kvm_coalesced_mmio_free(kvm);
1263	out_no_coalesced_mmio:
1264	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1265	if (kvm->mmu_notifier.ops)
1266	mmu_notifier_unregister(subscription: &kvm->mmu_notifier, current->mm);
1267	#endif
1268	out_err_no_mmu_notifier:
1269	hardware_disable_all();
1270	out_err_no_disable:
1271	kvm_arch_destroy_vm(kvm);
1272	out_err_no_arch_destroy_vm:
1273	WARN_ON_ONCE(!refcount_dec_and_test(&kvm->users_count));
1274	for (i = `0`; i < KVM_NR_BUSES; i++)
1275	kfree(objp: kvm_get_bus(kvm, idx: i));
1276	cleanup_srcu_struct(ssp: &kvm->irq_srcu);
1277	out_err_no_irq_srcu:
1278	cleanup_srcu_struct(ssp: &kvm->srcu);
1279	out_err_no_srcu:
1280	kvm_arch_free_vm(kvm);
1281	mmdrop(current->mm);
1282	module_put(module: kvm_chardev_ops.owner);
1283	return ERR_PTR(error: r);
1284	}
1285
1286	static void kvm_destroy_devices(struct kvm *kvm)
1287	{
1288	struct kvm_device dev, tmp;
1289
1290	/*
1291	* We do not need to take the kvm->lock here, because nobody else
1292	* has a reference to the struct kvm at this point and therefore
1293	* cannot access the devices list anyhow.
1294	*/
1295	list_for_each_entry_safe(dev, tmp, &kvm->devices, vm_node) {
1296	list_del(entry: &dev->vm_node);
1297	dev->ops->destroy(dev);
1298	}
1299	}
1300
1301	static void kvm_destroy_vm(struct kvm *kvm)
1302	{
1303	int i;
1304	struct mm_struct *mm = kvm->mm;
1305
1306	kvm_destroy_pm_notifier(kvm);
1307	kvm_uevent_notify_change(KVM_EVENT_DESTROY_VM, kvm);
1308	kvm_destroy_vm_debugfs(kvm);
1309	kvm_arch_sync_events(kvm);
1310	mutex_lock(&kvm_lock);
1311	list_del(entry: &kvm->vm_list);
1312	mutex_unlock(lock: &kvm_lock);
1313	kvm_arch_pre_destroy_vm(kvm);
1314
1315	kvm_free_irq_routing(kvm);
1316	for (i = `0`; i < KVM_NR_BUSES; i++) {
1317	struct kvm_io_bus *bus = kvm_get_bus(kvm, idx: i);
1318
1319	if (bus)
1320	kvm_io_bus_destroy(bus);
1321	kvm->buses[i] = NULL;
1322	}
1323	kvm_coalesced_mmio_free(kvm);
1324	#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
1325	mmu_notifier_unregister(subscription: &kvm->mmu_notifier, mm: kvm->mm);
1326	/*
1327	* At this point, pending calls to invalidate_range_start()
1328	* have completed but no more MMU notifiers will run, so
1329	* mn_active_invalidate_count may remain unbalanced.
1330	* No threads can be waiting in kvm_swap_active_memslots() as the
1331	* last reference on KVM has been dropped, but freeing
1332	* memslots would deadlock without this manual intervention.
1333	*/
1334	WARN_ON(rcuwait_active(&kvm->mn_memslots_update_rcuwait));
1335	kvm->mn_active_invalidate_count = `0`;
1336	#else
1337	kvm_flush_shadow_all(kvm);
1338	#endif
1339	kvm_arch_destroy_vm(kvm);
1340	kvm_destroy_devices(kvm);
1341	for (i = `0`; i < KVM_ADDRESS_SPACE_NUM; i++) {
1342	kvm_free_memslots(kvm, slots: &kvm->__memslots[i][`0`]);
1343	kvm_free_memslots(kvm, slots: &kvm->__memslots[i][`1`]);
1344	}
1345	cleanup_srcu_struct(ssp: &kvm->irq_srcu);
1346	cleanup_srcu_struct(ssp: &kvm->srcu);
1347	kvm_arch_free_vm(kvm);
1348	preempt_notifier_dec();
1349	hardware_disable_all();
1350	mmdrop(mm);
1351	module_put(module: kvm_chardev_ops.owner);
1352	}
1353
1354	void kvm_get_kvm(struct kvm *kvm)
1355	{
1356	refcount_inc(r: &kvm->users_count);
1357	}
1358	EXPORT_SYMBOL_GPL(kvm_get_kvm);
1359
1360	/*
1361	* Make sure the vm is not during destruction, which is a safe version of
1362	* kvm_get_kvm(). Return true if kvm referenced successfully, false otherwise.
1363	*/
1364	bool kvm_get_kvm_safe(struct kvm *kvm)
1365	{
1366	return refcount_inc_not_zero(r: &kvm->users_count);
1367	}
1368	EXPORT_SYMBOL_GPL(kvm_get_kvm_safe);
1369
1370	void kvm_put_kvm(struct kvm *kvm)
1371	{
1372	if (refcount_dec_and_test(r: &kvm->users_count))
1373	kvm_destroy_vm(kvm);
1374	}
1375	EXPORT_SYMBOL_GPL(kvm_put_kvm);
1376
1377	/*
1378	* Used to put a reference that was taken on behalf of an object associated
1379	* with a user-visible file descriptor, e.g. a vcpu or device, if installation
1380	* of the new file descriptor fails and the reference cannot be transferred to
1381	* its final owner. In such cases, the caller is still actively using @kvm and
1382	* will fail miserably if the refcount unexpectedly hits zero.
1383	*/
1384	void kvm_put_kvm_no_destroy(struct kvm *kvm)
1385	{
1386	WARN_ON(refcount_dec_and_test(&kvm->users_count));
1387	}
1388	EXPORT_SYMBOL_GPL(kvm_put_kvm_no_destroy);
1389
1390	static int kvm_vm_release(struct inode inode, struct* file *filp)
1391	{
1392	struct kvm *kvm = filp->private_data;
1393
1394	kvm_irqfd_release(kvm);
1395
1396	kvm_put_kvm(kvm);
1397	return `0`;
1398	}
1399
1400	/*
1401	* Allocation size is twice as large as the actual dirty bitmap size.
1402	* See kvm_vm_ioctl_get_dirty_log() why this is needed.
1403	*/
1404	static int kvm_alloc_dirty_bitmap(struct kvm_memory_slot *memslot)
1405	{
1406	unsigned long dirty_bytes = kvm_dirty_bitmap_bytes(memslot);
1407
1408	memslot->dirty_bitmap = __vcalloc(n: `2`, size: dirty_bytes, GFP_KERNEL_ACCOUNT);
1409	if (!memslot->dirty_bitmap)
1410	return -ENOMEM;
1411
1412	return `0`;
1413	}
1414
1415	static struct kvm_memslots kvm_get_inactive_memslots(struct* kvm kvm, int* as_id)
1416	{
1417	struct kvm_memslots *active = __kvm_memslots(kvm, as_id);
1418	int node_idx_inactive = active->node_idx ^ `1`;
1419
1420	return &kvm->__memslots[as_id][node_idx_inactive];
1421	}
1422
1423	/*
1424	* Helper to get the address space ID when one of memslot pointers may be NULL.
1425	* This also serves as a sanity that at least one of the pointers is non-NULL,
1426	* and that their address space IDs don't diverge.
1427	*/
1428	static int kvm_memslots_get_as_id(struct kvm_memory_slot *a,
1429	struct kvm_memory_slot *b)
1430	{
1431	if (WARN_ON_ONCE(!a && !b))
1432	return `0`;
1433
1434	if (!a)
1435	return b->as_id;
1436	if (!b)
1437	return a->as_id;
1438
1439	WARN_ON_ONCE(a->as_id != b->as_id);
1440	return a->as_id;
1441	}
1442
1443	static void kvm_insert_gfn_node(struct kvm_memslots *slots,
1444	struct kvm_memory_slot *slot)
1445	{
1446	struct rb_root *gfn_tree = &slots->gfn_tree;
1447	struct rb_node *node, parent;
1448	int idx = slots->node_idx;
1449
1450	parent = NULL;
1451	for (node = &gfn_tree->rb_node; *node; ) {
1452	struct kvm_memory_slot *tmp;
1453
1454	tmp = container_of(node, struct* kvm_memory_slot, gfn_node[idx]);
1455	parent = *node;
1456	if (slot->base_gfn < tmp->base_gfn)
1457	node = &(*node)->rb_left;
1458	else if (slot->base_gfn > tmp->base_gfn)
1459	node = &(*node)->rb_right;
1460	else
1461	BUG();
1462	}
1463
1464	rb_link_node(node: &slot->gfn_node[idx], parent, rb_link: node);
1465	rb_insert_color(&slot->gfn_node[idx], gfn_tree);
1466	}
1467
1468	static void kvm_erase_gfn_node(struct kvm_memslots *slots,
1469	struct kvm_memory_slot *slot)
1470	{
1471	rb_erase(&slot->gfn_node[slots->node_idx], &slots->gfn_tree);
1472	}
1473
1474	static void kvm_replace_gfn_node(struct kvm_memslots *slots,
1475	struct kvm_memory_slot *old,
1476	struct kvm_memory_slot *new)
1477	{
1478	int idx = slots->node_idx;
1479
1480	WARN_ON_ONCE(old->base_gfn != new->base_gfn);
1481
1482	rb_replace_node(victim: &old->gfn_node[idx], new: &new->gfn_node[idx],
1483	root: &slots->gfn_tree);
1484	}
1485
1486	/*
1487	* Replace @old with @new in the inactive memslots.
1488	*
1489	* With NULL @old this simply adds @new.
1490	* With NULL @new this simply removes @old.
1491	*
1492	* If @new is non-NULL its hva_node[slots_idx] range has to be set
1493	* appropriately.
1494	*/
1495	static void kvm_replace_memslot(struct kvm *kvm,
1496	struct kvm_memory_slot *old,
1497	struct kvm_memory_slot *new)
1498	{
1499	int as_id = kvm_memslots_get_as_id(a: old, b: new);
1500	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1501	int idx = slots->node_idx;
1502
1503	if (old) {
1504	hash_del(node: &old->id_node[idx]);
1505	interval_tree_remove(node: &old->hva_node[idx], root: &slots->hva_tree);
1506
1507	if ((long)old == atomic_long_read(v: &slots->last_used_slot))
1508	atomic_long_set(v: &slots->last_used_slot, i: (long)new);
1509
1510	if (!new) {
1511	kvm_erase_gfn_node(slots, slot: old);
1512	return;
1513	}
1514	}
1515
1516	/*
1517	* Initialize @new's hva range. Do this even when replacing an @old
1518	* slot, kvm_copy_memslot() deliberately does not touch node data.
1519	*/
1520	new->hva_node[idx].start = new->userspace_addr;
1521	new->hva_node[idx].last = new->userspace_addr +
1522	(new->npages << PAGE_SHIFT) - `1`;
1523
1524	/*
1525	* (Re)Add the new memslot. There is no O(1) interval_tree_replace(),
1526	* hva_node needs to be swapped with remove+insert even though hva can't
1527	* change when replacing an existing slot.
1528	*/
1529	hash_add(slots->id_hash, &new->id_node[idx], new->id);
1530	interval_tree_insert(node: &new->hva_node[idx], root: &slots->hva_tree);
1531
1532	/*
1533	* If the memslot gfn is unchanged, rb_replace_node() can be used to
1534	* switch the node in the gfn tree instead of removing the old and
1535	* inserting the new as two separate operations. Replacement is a
1536	* single O(1) operation versus two O(log(n)) operations for
1537	* remove+insert.
1538	*/
1539	if (old && old->base_gfn == new->base_gfn) {
1540	kvm_replace_gfn_node(slots, old, new);
1541	} else {
1542	if (old)
1543	kvm_erase_gfn_node(slots, slot: old);
1544	kvm_insert_gfn_node(slots, slot: new);
1545	}
1546	}
1547
1548	static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
1549	{
1550	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;
1551
1552	#ifdef __KVM_HAVE_READONLY_MEM
1553	valid_flags \|= KVM_MEM_READONLY;
1554	#endif
1555
1556	if (mem->flags & ~valid_flags)
1557	return -EINVAL;
1558
1559	return `0`;
1560	}
1561
1562	static void kvm_swap_active_memslots(struct kvm kvm, int* as_id)
1563	{
1564	struct kvm_memslots *slots = kvm_get_inactive_memslots(kvm, as_id);
1565
1566	/ Grab the generation from the activate memslots. /
1567	u64 gen = __kvm_memslots(kvm, as_id)->generation;
1568
1569	WARN_ON(gen & KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS);
1570	slots->generation = gen \| KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1571
1572	/*
1573	* Do not store the new memslots while there are invalidations in
1574	* progress, otherwise the locking in invalidate_range_start and
1575	* invalidate_range_end will be unbalanced.
1576	*/
1577	spin_lock(lock: &kvm->mn_invalidate_lock);
1578	prepare_to_rcuwait(w: &kvm->mn_memslots_update_rcuwait);
1579	while (kvm->mn_active_invalidate_count) {
1580	set_current_state(TASK_UNINTERRUPTIBLE);
1581	spin_unlock(lock: &kvm->mn_invalidate_lock);
1582	schedule();
1583	spin_lock(lock: &kvm->mn_invalidate_lock);
1584	}
1585	finish_rcuwait(w: &kvm->mn_memslots_update_rcuwait);
1586	rcu_assign_pointer(kvm->memslots[as_id], slots);
1587	spin_unlock(lock: &kvm->mn_invalidate_lock);
1588
1589	/*
1590	* Acquired in kvm_set_memslot. Must be released before synchronize
1591	* SRCU below in order to avoid deadlock with another thread
1592	* acquiring the slots_arch_lock in an srcu critical section.
1593	*/
1594	mutex_unlock(lock: &kvm->slots_arch_lock);
1595
1596	synchronize_srcu_expedited(ssp: &kvm->srcu);
1597
1598	/*
1599	* Increment the new memslot generation a second time, dropping the
1600	* update in-progress flag and incrementing the generation based on
1601	* the number of address spaces. This provides a unique and easily
1602	* identifiable generation number while the memslots are in flux.
1603	*/
1604	gen = slots->generation & ~KVM_MEMSLOT_GEN_UPDATE_IN_PROGRESS;
1605
1606	/*
1607	* Generations must be unique even across address spaces. We do not need
1608	* a global counter for that, instead the generation space is evenly split
1609	* across address spaces. For example, with two address spaces, address
1610	* space 0 will use generations 0, 2, 4, ... while address space 1 will
1611	* use generations 1, 3, 5, ...
1612	*/
1613	gen += KVM_ADDRESS_SPACE_NUM;
1614
1615	kvm_arch_memslots_updated(kvm, gen);
1616
1617	slots->generation = gen;
1618	}
1619
1620	static int kvm_prepare_memory_region(struct kvm *kvm,
1621	const struct kvm_memory_slot *old,
1622	struct kvm_memory_slot *new,
1623	enum kvm_mr_change change)
1624	{
1625	int r;
1626
1627	/*
1628	* If dirty logging is disabled, nullify the bitmap; the old bitmap
1629	* will be freed on "commit". If logging is enabled in both old and
1630	* new, reuse the existing bitmap. If logging is enabled only in the
1631	* new and KVM isn't using a ring buffer, allocate and initialize a
1632	* new bitmap.
1633	*/
1634	if (change != KVM_MR_DELETE) {
1635	if (!(new->flags & KVM_MEM_LOG_DIRTY_PAGES))
1636	new->dirty_bitmap = NULL;
1637	else if (old && old->dirty_bitmap)
1638	new->dirty_bitmap = old->dirty_bitmap;
1639	else if (kvm_use_dirty_bitmap(kvm)) {
1640	r = kvm_alloc_dirty_bitmap(memslot: new);
1641	if (r)
1642	return r;
1643
1644	if (kvm_dirty_log_manual_protect_and_init_set(kvm))
1645	bitmap_set(map: new->dirty_bitmap, start: `0`, nbits: new->npages);
1646	}
1647	}
1648
1649	r = kvm_arch_prepare_memory_region(kvm, old, new, change);
1650
1651	/ Free the bitmap on failure if it was allocated above. /
1652	if (r && new && new->dirty_bitmap && (!old \|\| !old->dirty_bitmap))
1653	kvm_destroy_dirty_bitmap(memslot: new);
1654
1655	return r;
1656	}
1657
1658	static void kvm_commit_memory_region(struct kvm *kvm,
1659	struct kvm_memory_slot *old,
1660	const struct kvm_memory_slot *new,
1661	enum kvm_mr_change change)
1662	{
1663	int old_flags = old ? old->flags : `0`;
1664	int new_flags = new ? new->flags : `0`;
1665	/*
1666	* Update the total number of memslot pages before calling the arch
1667	* hook so that architectures can consume the result directly.
1668	*/
1669	if (change == KVM_MR_DELETE)
1670	kvm->nr_memslot_pages -= old->npages;
1671	else if (change == KVM_MR_CREATE)
1672	kvm->nr_memslot_pages += new->npages;
1673
1674	if ((old_flags ^ new_flags) & KVM_MEM_LOG_DIRTY_PAGES) {
1675	int change = (new_flags & KVM_MEM_LOG_DIRTY_PAGES) ? `1` : -`1`;
1676	atomic_set(v: &kvm->nr_memslots_dirty_logging,
1677	i: atomic_read(v: &kvm->nr_memslots_dirty_logging) + change);
1678	}
1679
1680	kvm_arch_commit_memory_region(kvm, old, new, change);
1681
1682	switch (change) {
1683	case KVM_MR_CREATE:
1684	/ Nothing more to do. /
1685	break;
1686	case KVM_MR_DELETE:
1687	/ Free the old memslot and all its metadata. /
1688	kvm_free_memslot(kvm, slot: old);
1689	break;
1690	case KVM_MR_MOVE:
1691	case KVM_MR_FLAGS_ONLY:
1692	/*
1693	* Free the dirty bitmap as needed; the below check encompasses
1694	* both the flags and whether a ring buffer is being used)
1695	*/
1696	if (old->dirty_bitmap && !new->dirty_bitmap)
1697	kvm_destroy_dirty_bitmap(memslot: old);
1698
1699	/*
1700	* The final quirk. Free the detached, old slot, but only its
1701	* memory, not any metadata. Metadata, including arch specific
1702	* data, may be reused by @new.
1703	*/
1704	kfree(objp: old);
1705	break;
1706	default:
1707	BUG();
1708	}
1709	}
1710
/*
 * Activate @new, which must be installed in the inactive slots by the caller,
 * by swapping the active slots and then propagating @new to @old once @old is
 * unreachable and can be safely modified.
 *
 * With NULL @old this simply adds @new to @active (while swapping the sets).
 * With NULL @new this simply removes @old from @active and frees it
 * (while also swapping the sets).
 */
static void kvm_activate_memslot(struct kvm *kvm,
				 struct kvm_memory_slot *old,
				 struct kvm_memory_slot *new)
{
	int as_id = kvm_memslots_get_as_id(old, new);

	kvm_swap_active_memslots(kvm, as_id);

	/* Propagate the new memslot to the now inactive memslots. */
	kvm_replace_memslot(kvm, old, new);
}
1731
1732	static void kvm_copy_memslot(struct kvm_memory_slot *dest,
1733	const struct kvm_memory_slot *src)
1734	{
1735	dest->base_gfn = src->base_gfn;
1736	dest->npages = src->npages;
1737	dest->dirty_bitmap = src->dirty_bitmap;
1738	dest->arch = src->arch;
1739	dest->userspace_addr = src->userspace_addr;
1740	dest->flags = src->flags;
1741	dest->id = src->id;
1742	dest->as_id = src->as_id;
1743	}
1744
1745	static void kvm_invalidate_memslot(struct kvm *kvm,
1746	struct kvm_memory_slot *old,
1747	struct kvm_memory_slot *invalid_slot)
1748	{
1749	/*
1750	* Mark the current slot INVALID. As with all memslot modifications,
1751	* this must be done on an unreachable slot to avoid modifying the
1752	* current slot in the active tree.
1753	*/
1754	kvm_copy_memslot(dest: invalid_slot, src: old);
1755	invalid_slot->flags \|= KVM_MEMSLOT_INVALID;
1756	kvm_replace_memslot(kvm, old, new: invalid_slot);
1757
1758	/*
1759	* Activate the slot that is now marked INVALID, but don't propagate
1760	* the slot to the now inactive slots. The slot is either going to be
1761	* deleted or recreated as a new slot.
1762	*/
1763	kvm_swap_active_memslots(kvm, as_id: old->as_id);
1764
1765	/*
1766	* From this point no new shadow pages pointing to a deleted, or moved,
1767	* memslot will be created. Validation of sp->gfn happens in:
1768	* - gfn_to_hva (kvm_read_guest, gfn_to_pfn)
1769	* - kvm_is_visible_gfn (mmu_check_root)
1770	*/
1771	kvm_arch_flush_shadow_memslot(kvm, slot: old);
1772	kvm_arch_guest_memory_reclaimed(kvm);
1773
1774	/ Was released by kvm_swap_active_memslots(), reacquire. /
1775	mutex_lock(&kvm->slots_arch_lock);
1776
1777	/*
1778	* Copy the arch-specific field of the newly-installed slot back to the
1779	* old slot as the arch data could have changed between releasing
1780	* slots_arch_lock in kvm_swap_active_memslots() and re-acquiring the lock
1781	* above. Writers are required to retrieve memslots after acquiring
1782	* slots_arch_lock, thus the active slot's data is guaranteed to be fresh.
1783	*/
1784	old->arch = invalid_slot->arch;
1785	}
1786
1787	static void kvm_create_memslot(struct kvm *kvm,
1788	struct kvm_memory_slot *new)
1789	{
1790	/ Add the new memslot to the inactive set and activate. /
1791	kvm_replace_memslot(kvm, NULL, new);
1792	kvm_activate_memslot(kvm, NULL, new);
1793	}
1794
1795	static void kvm_delete_memslot(struct kvm *kvm,
1796	struct kvm_memory_slot *old,
1797	struct kvm_memory_slot *invalid_slot)
1798	{
1799	/*
1800	* Remove the old memslot (in the inactive memslots) by passing NULL as
1801	* the "new" slot, and for the invalid version in the active slots.
1802	*/
1803	kvm_replace_memslot(kvm, old, NULL);
1804	kvm_activate_memslot(kvm, old: invalid_slot, NULL);
1805	}
1806
/*
 * Replace @old with the relocated slot @new in both memslot sets.
 * @invalid_slot is the INVALID placeholder for @old that is currently in the
 * active set.
 */
static void kvm_move_memslot(struct kvm *kvm,
			     struct kvm_memory_slot *old,
			     struct kvm_memory_slot *new,
			     struct kvm_memory_slot *invalid_slot)
{
	/*
	 * Replace the old memslot in the inactive slots, and then swap slots
	 * and replace the current INVALID with the new as well.
	 */
	kvm_replace_memslot(kvm, old, new);
	kvm_activate_memslot(kvm, invalid_slot, new);
}
1819
/* Replace @old with @new, a copy that differs only in its flags. */
static void kvm_update_flags_memslot(struct kvm *kvm,
				     struct kvm_memory_slot *old,
				     struct kvm_memory_slot *new)
{
	/*
	 * Similar to the MOVE case, but the slot doesn't need to be zapped as
	 * an intermediate step. Instead, the old memslot is simply replaced
	 * with a new, updated copy in both memslot sets.
	 */
	kvm_replace_memslot(kvm, old, new);
	kvm_activate_memslot(kvm, old, new);
}
1832
1833	static int kvm_set_memslot(struct kvm *kvm,
1834	struct kvm_memory_slot *old,
1835	struct kvm_memory_slot *new,
1836	enum kvm_mr_change change)
1837	{
1838	struct kvm_memory_slot *invalid_slot;
1839	int r;
1840
1841	/*
1842	* Released in kvm_swap_active_memslots().
1843	*
1844	* Must be held from before the current memslots are copied until after
1845	* the new memslots are installed with rcu_assign_pointer, then
1846	* released before the synchronize srcu in kvm_swap_active_memslots().
1847	*
1848	* When modifying memslots outside of the slots_lock, must be held
1849	* before reading the pointer to the current memslots until after all
1850	* changes to those memslots are complete.
1851	*
1852	* These rules ensure that installing new memslots does not lose
1853	* changes made to the previous memslots.
1854	*/
1855	mutex_lock(&kvm->slots_arch_lock);
1856
1857	/*
1858	* Invalidate the old slot if it's being deleted or moved. This is
1859	* done prior to actually deleting/moving the memslot to allow vCPUs to
1860	* continue running by ensuring there are no mappings or shadow pages
1861	* for the memslot when it is deleted/moved. Without pre-invalidation
1862	* (and without a lock), a window would exist between effecting the
1863	* delete/move and committing the changes in arch code where KVM or a
1864	* guest could access a non-existent memslot.
1865	*
1866	* Modifications are done on a temporary, unreachable slot. The old
1867	* slot needs to be preserved in case a later step fails and the
1868	* invalidation needs to be reverted.
1869	*/
1870	if (change == KVM_MR_DELETE \|\| change == KVM_MR_MOVE) {
1871	invalid_slot = kzalloc(size: sizeof(*invalid_slot), GFP_KERNEL_ACCOUNT);
1872	if (!invalid_slot) {
1873	mutex_unlock(lock: &kvm->slots_arch_lock);
1874	return -ENOMEM;
1875	}
1876	kvm_invalidate_memslot(kvm, old, invalid_slot);
1877	}
1878
1879	r = kvm_prepare_memory_region(kvm, old, new, change);
1880	if (r) {
1881	/*
1882	* For DELETE/MOVE, revert the above INVALID change. No
1883	* modifications required since the original slot was preserved
1884	* in the inactive slots. Changing the active memslots also
1885	* release slots_arch_lock.
1886	*/
1887	if (change == KVM_MR_DELETE \|\| change == KVM_MR_MOVE) {
1888	kvm_activate_memslot(kvm, old: invalid_slot, new: old);
1889	kfree(objp: invalid_slot);
1890	} else {
1891	mutex_unlock(lock: &kvm->slots_arch_lock);
1892	}
1893	return r;
1894	}
1895
1896	/*
1897	* For DELETE and MOVE, the working slot is now active as the INVALID
1898	* version of the old slot. MOVE is particularly special as it reuses
1899	* the old slot and returns a copy of the old slot (in working_slot).
1900	* For CREATE, there is no old slot. For DELETE and FLAGS_ONLY, the
1901	* old slot is detached but otherwise preserved.
1902	*/
1903	if (change == KVM_MR_CREATE)
1904	kvm_create_memslot(kvm, new);
1905	else if (change == KVM_MR_DELETE)
1906	kvm_delete_memslot(kvm, old, invalid_slot);
1907	else if (change == KVM_MR_MOVE)
1908	kvm_move_memslot(kvm, old, new, invalid_slot);
1909	else if (change == KVM_MR_FLAGS_ONLY)
1910	kvm_update_flags_memslot(kvm, old, new);
1911	else
1912	BUG();
1913
1914	/ Free the temporary INVALID slot used for DELETE and MOVE. /
1915	if (change == KVM_MR_DELETE \|\| change == KVM_MR_MOVE)
1916	kfree(objp: invalid_slot);
1917
1918	/*
1919	* No need to refresh new->arch, changes after dropping slots_arch_lock
1920	* will directly hit the final, active memslot. Architectures are
1921	* responsible for knowing that new->arch may be stale.
1922	*/
1923	kvm_commit_memory_region(kvm, old, new, change);
1924
1925	return `0`;
1926	}
1927
1928	static bool kvm_check_memslot_overlap(struct kvm_memslots slots, int* id,
1929	gfn_t start, gfn_t end)
1930	{
1931	struct kvm_memslot_iter iter;
1932
1933	kvm_for_each_memslot_in_gfn_range(&iter, slots, start, end) {
1934	if (iter.slot->id != id)
1935	return true;
1936	}
1937
1938	return false;
1939	}
1940
1941	/*
1942	* Allocate some memory and give it an address in the guest physical address
1943	* space.
1944	*
1945	* Discontiguous memory is allowed, mostly for framebuffers.
1946	*
1947	* Must be called holding kvm->slots_lock for write.
1948	*/
1949	int __kvm_set_memory_region(struct kvm *kvm,
1950	const struct kvm_userspace_memory_region *mem)
1951	{
1952	struct kvm_memory_slot old, new;
1953	struct kvm_memslots *slots;
1954	enum kvm_mr_change change;
1955	unsigned long npages;
1956	gfn_t base_gfn;
1957	int as_id, id;
1958	int r;
1959
1960	r = check_memory_region_flags(mem);
1961	if (r)
1962	return r;
1963
1964	as_id = mem->slot >> `16`;
1965	id = (u16)mem->slot;
1966
1967	/ General sanity checks /
1968	if ((mem->memory_size & (PAGE_SIZE - `1`)) \|\|
1969	(mem->memory_size != (unsigned long)mem->memory_size))
1970	return -EINVAL;
1971	if (mem->guest_phys_addr & (PAGE_SIZE - `1`))
1972	return -EINVAL;
1973	/ We can read the guest memory with __xxx_user() later on. /
1974	if ((mem->userspace_addr & (PAGE_SIZE - `1`)) \|\|
1975	(mem->userspace_addr != untagged_addr(mem->userspace_addr)) \|\|
1976	!access_ok((void __user )(unsigned* long)mem->userspace_addr,
1977	mem->memory_size))
1978	return -EINVAL;
1979	if (as_id >= KVM_ADDRESS_SPACE_NUM \|\| id >= KVM_MEM_SLOTS_NUM)
1980	return -EINVAL;
1981	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
1982	return -EINVAL;
1983	if ((mem->memory_size >> PAGE_SHIFT) > KVM_MEM_MAX_NR_PAGES)
1984	return -EINVAL;
1985
1986	slots = __kvm_memslots(kvm, as_id);
1987
1988	/*
1989	* Note, the old memslot (and the pointer itself!) may be invalidated
1990	* and/or destroyed by kvm_set_memslot().
1991	*/
1992	old = id_to_memslot(slots, id);
1993
1994	if (!mem->memory_size) {
1995	if (!old \|\| !old->npages)
1996	return -EINVAL;
1997
1998	if (WARN_ON_ONCE(kvm->nr_memslot_pages < old->npages))
1999	return -EIO;
2000
2001	return kvm_set_memslot(kvm, old, NULL, change: KVM_MR_DELETE);
2002	}
2003
2004	base_gfn = (mem->guest_phys_addr >> PAGE_SHIFT);
2005	npages = (mem->memory_size >> PAGE_SHIFT);
2006
2007	if (!old \|\| !old->npages) {
2008	change = KVM_MR_CREATE;
2009
2010	/*
2011	* To simplify KVM internals, the total number of pages across
2012	* all memslots must fit in an unsigned long.
2013	*/
2014	if ((kvm->nr_memslot_pages + npages) < kvm->nr_memslot_pages)
2015	return -EINVAL;
2016	} else { / Modify an existing slot. /
2017	if ((mem->userspace_addr != old->userspace_addr) \|\|
2018	(npages != old->npages) \|\|
2019	((mem->flags ^ old->flags) & KVM_MEM_READONLY))
2020	return -EINVAL;
2021
2022	if (base_gfn != old->base_gfn)
2023	change = KVM_MR_MOVE;
2024	else if (mem->flags != old->flags)
2025	change = KVM_MR_FLAGS_ONLY;
2026	else / Nothing to change. /
2027	return `0`;
2028	}
2029
2030	if ((change == KVM_MR_CREATE \|\| change == KVM_MR_MOVE) &&
2031	kvm_check_memslot_overlap(slots, id, start: base_gfn, end: base_gfn + npages))
2032	return -EEXIST;
2033
2034	/ Allocate a slot that will persist in the memslot. /
2035	new = kzalloc(size: sizeof(*new), GFP_KERNEL_ACCOUNT);
2036	if (!new)
2037	return -ENOMEM;
2038
2039	new->as_id = as_id;
2040	new->id = id;
2041	new->base_gfn = base_gfn;
2042	new->npages = npages;
2043	new->flags = mem->flags;
2044	new->userspace_addr = mem->userspace_addr;
2045
2046	r = kvm_set_memslot(kvm, old, new, change);
2047	if (r)
2048	kfree(objp: new);
2049	return r;
2050	}
2051	EXPORT_SYMBOL_GPL(__kvm_set_memory_region);
2052
2053	int kvm_set_memory_region(struct kvm *kvm,
2054	const struct kvm_userspace_memory_region *mem)
2055	{
2056	int r;
2057
2058	mutex_lock(&kvm->slots_lock);
2059	r = __kvm_set_memory_region(kvm, mem);
2060	mutex_unlock(lock: &kvm->slots_lock);
2061	return r;
2062	}
2063	EXPORT_SYMBOL_GPL(kvm_set_memory_region);
2064
2065	static int kvm_vm_ioctl_set_memory_region(struct kvm *kvm,
2066	struct kvm_userspace_memory_region *mem)
2067	{
2068	if ((u16)mem->slot >= KVM_USER_MEM_SLOTS)
2069	return -EINVAL;
2070
2071	return kvm_set_memory_region(kvm, mem);
2072	}
2073
2074	#ifndef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
2075	/**
2076	* kvm_get_dirty_log - get a snapshot of dirty pages
2077	* @kvm: pointer to kvm instance
2078	* @log: slot id and address to which we copy the log
2079	* @is_dirty: set to '1' if any dirty pages were found
2080	* @memslot: set to the associated memslot, always valid on success
2081	*/
2082	int kvm_get_dirty_log(struct kvm kvm, struct* kvm_dirty_log *log,
2083	int is_dirty, struct* kvm_memory_slot **memslot)
2084	{
2085	struct kvm_memslots *slots;
2086	int i, as_id, id;
2087	unsigned long n;
2088	unsigned long any = `0`;
2089
2090	/ Dirty ring tracking may be exclusive to dirty log tracking /
2091	if (!kvm_use_dirty_bitmap(kvm))
2092	return -ENXIO;
2093
2094	*memslot = NULL;
2095	*is_dirty = `0`;
2096
2097	as_id = log->slot >> `16`;
2098	id = (u16)log->slot;
2099	if (as_id >= KVM_ADDRESS_SPACE_NUM \|\| id >= KVM_USER_MEM_SLOTS)
2100	return -EINVAL;
2101
2102	slots = __kvm_memslots(kvm, as_id);
2103	*memslot = id_to_memslot(slots, id);
2104	if (!(memslot) \|\| !(memslot)->dirty_bitmap)
2105	return -ENOENT;
2106
2107	kvm_arch_sync_dirty_log(kvm, *memslot);
2108
2109	n = kvm_dirty_bitmap_bytes(*memslot);
2110
2111	for (i = `0`; !any && i < n/sizeof(long); ++i)
2112	any = (*memslot)->dirty_bitmap[i];
2113
2114	if (copy_to_user(log->dirty_bitmap, (*memslot)->dirty_bitmap, n))
2115	return -EFAULT;
2116
2117	if (any)
2118	*is_dirty = `1`;
2119	return `0`;
2120	}
2121	EXPORT_SYMBOL_GPL(kvm_get_dirty_log);
2122
2123	#else /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2124	/**
2125	* kvm_get_dirty_log_protect - get a snapshot of dirty pages
2126	* and reenable dirty page tracking for the corresponding pages.
2127	* @kvm: pointer to kvm instance
2128	* @log: slot id and address to which we copy the log
2129	*
2130	* We need to keep it in mind that VCPU threads can write to the bitmap
2131	* concurrently. So, to avoid losing track of dirty pages we keep the
2132	* following order:
2133	*
2134	* 1. Take a snapshot of the bit and clear it if needed.
2135	* 2. Write protect the corresponding page.
2136	* 3. Copy the snapshot to the userspace.
2137	* 4. Upon return caller flushes TLB's if needed.
2138	*
2139	* Between 2 and 4, the guest may write to the page using the remaining TLB
2140	* entry. This is not a problem because the page is reported dirty using
2141	* the snapshot taken before and step 4 ensures that writes done after
2142	* exiting to userspace will be logged for the next call.
2143	*
2144	*/
2145	static int kvm_get_dirty_log_protect(struct kvm kvm, struct* kvm_dirty_log *log)
2146	{
2147	struct kvm_memslots *slots;
2148	struct kvm_memory_slot *memslot;
2149	int i, as_id, id;
2150	unsigned long n;
2151	unsigned long *dirty_bitmap;
2152	unsigned long *dirty_bitmap_buffer;
2153	bool flush;
2154
2155	/ Dirty ring tracking may be exclusive to dirty log tracking /
2156	if (!kvm_use_dirty_bitmap(kvm))
2157	return -ENXIO;
2158
2159	as_id = log->slot >> `16`;
2160	id = (u16)log->slot;
2161	if (as_id >= KVM_ADDRESS_SPACE_NUM \|\| id >= KVM_USER_MEM_SLOTS)
2162	return -EINVAL;
2163
2164	slots = __kvm_memslots(kvm, as_id);
2165	memslot = id_to_memslot(slots, id);
2166	if (!memslot \|\| !memslot->dirty_bitmap)
2167	return -ENOENT;
2168
2169	dirty_bitmap = memslot->dirty_bitmap;
2170
2171	kvm_arch_sync_dirty_log(kvm, memslot);
2172
2173	n = kvm_dirty_bitmap_bytes(memslot);
2174	flush = false;
2175	if (kvm->manual_dirty_log_protect) {
2176	/*
2177	* Unlike kvm_get_dirty_log, we always return false in *flush,
2178	* because no flush is needed until KVM_CLEAR_DIRTY_LOG. There
2179	* is some code duplication between this function and
2180	* kvm_get_dirty_log, but hopefully all architecture
2181	* transition to kvm_get_dirty_log_protect and kvm_get_dirty_log
2182	* can be eliminated.
2183	*/
2184	dirty_bitmap_buffer = dirty_bitmap;
2185	} else {
2186	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2187	memset(dirty_bitmap_buffer, `0`, n);
2188
2189	KVM_MMU_LOCK(kvm);
2190	for (i = `0`; i < n / sizeof(long); i++) {
2191	unsigned long mask;
2192	gfn_t offset;
2193
2194	if (!dirty_bitmap[i])
2195	continue;
2196
2197	flush = true;
2198	mask = xchg(&dirty_bitmap[i], `0`);
2199	dirty_bitmap_buffer[i] = mask;
2200
2201	offset = i * BITS_PER_LONG;
2202	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, slot: memslot,
2203	gfn_offset: offset, mask);
2204	}
2205	KVM_MMU_UNLOCK(kvm);
2206	}
2207
2208	if (flush)
2209	kvm_flush_remote_tlbs_memslot(kvm, memslot);
2210
2211	if (copy_to_user(to: log->dirty_bitmap, from: dirty_bitmap_buffer, n))
2212	return -EFAULT;
2213	return `0`;
2214	}
2215
2216
2217	/**
2218	* kvm_vm_ioctl_get_dirty_log - get and clear the log of dirty pages in a slot
2219	* @kvm: kvm instance
2220	* @log: slot id and address to which we copy the log
2221	*
2222	* Steps 1-4 below provide general overview of dirty page logging. See
2223	* kvm_get_dirty_log_protect() function description for additional details.
2224	*
2225	* We call kvm_get_dirty_log_protect() to handle steps 1-3, upon return we
2226	* always flush the TLB (step 4) even if previous step failed and the dirty
2227	* bitmap may be corrupt. Regardless of previous outcome the KVM logging API
2228	* does not preclude user space subsequent dirty log read. Flushing TLB ensures
2229	* writes will be marked dirty for next log read.
2230	*
2231	* 1. Take a snapshot of the bit and clear it if needed.
2232	* 2. Write protect the corresponding page.
2233	* 3. Copy the snapshot to the userspace.
2234	* 4. Flush TLB's if needed.
2235	*/
2236	static int kvm_vm_ioctl_get_dirty_log(struct kvm *kvm,
2237	struct kvm_dirty_log *log)
2238	{
2239	int r;
2240
2241	mutex_lock(&kvm->slots_lock);
2242
2243	r = kvm_get_dirty_log_protect(kvm, log);
2244
2245	mutex_unlock(lock: &kvm->slots_lock);
2246	return r;
2247	}
2248
2249	/**
2250	* kvm_clear_dirty_log_protect - clear dirty bits in the bitmap
2251	* and reenable dirty page tracking for the corresponding pages.
2252	* @kvm: pointer to kvm instance
2253	* @log: slot id and address from which to fetch the bitmap of dirty pages
2254	*/
2255	static int kvm_clear_dirty_log_protect(struct kvm *kvm,
2256	struct kvm_clear_dirty_log *log)
2257	{
2258	struct kvm_memslots *slots;
2259	struct kvm_memory_slot *memslot;
2260	int as_id, id;
2261	gfn_t offset;
2262	unsigned long i, n;
2263	unsigned long *dirty_bitmap;
2264	unsigned long *dirty_bitmap_buffer;
2265	bool flush;
2266
2267	/ Dirty ring tracking may be exclusive to dirty log tracking /
2268	if (!kvm_use_dirty_bitmap(kvm))
2269	return -ENXIO;
2270
2271	as_id = log->slot >> `16`;
2272	id = (u16)log->slot;
2273	if (as_id >= KVM_ADDRESS_SPACE_NUM \|\| id >= KVM_USER_MEM_SLOTS)
2274	return -EINVAL;
2275
2276	if (log->first_page & `63`)
2277	return -EINVAL;
2278
2279	slots = __kvm_memslots(kvm, as_id);
2280	memslot = id_to_memslot(slots, id);
2281	if (!memslot \|\| !memslot->dirty_bitmap)
2282	return -ENOENT;
2283
2284	dirty_bitmap = memslot->dirty_bitmap;
2285
2286	n = ALIGN(log->num_pages, BITS_PER_LONG) / `8`;
2287
2288	if (log->first_page > memslot->npages \|\|
2289	log->num_pages > memslot->npages - log->first_page \|\|
2290	(log->num_pages < memslot->npages - log->first_page && (log->num_pages & `63`)))
2291	return -EINVAL;
2292
2293	kvm_arch_sync_dirty_log(kvm, memslot);
2294
2295	flush = false;
2296	dirty_bitmap_buffer = kvm_second_dirty_bitmap(memslot);
2297	if (copy_from_user(to: dirty_bitmap_buffer, from: log->dirty_bitmap, n))
2298	return -EFAULT;
2299
2300	KVM_MMU_LOCK(kvm);
2301	for (offset = log->first_page, i = offset / BITS_PER_LONG,
2302	n = DIV_ROUND_UP(log->num_pages, BITS_PER_LONG); n--;
2303	i++, offset += BITS_PER_LONG) {
2304	unsigned long mask = *dirty_bitmap_buffer++;
2305	atomic_long_t p = (atomic_long_t ) &dirty_bitmap[i];
2306	if (!mask)
2307	continue;
2308
2309	mask &= atomic_long_fetch_andnot(i: mask, v: p);
2310
2311	/*
2312	* mask contains the bits that really have been cleared. This
2313	* never includes any bits beyond the length of the memslot (if
2314	* the length is not aligned to 64 pages), therefore it is not
2315	* a problem if userspace sets them in log->dirty_bitmap.
2316	*/
2317	if (mask) {
2318	flush = true;
2319	kvm_arch_mmu_enable_log_dirty_pt_masked(kvm, slot: memslot,
2320	gfn_offset: offset, mask);
2321	}
2322	}
2323	KVM_MMU_UNLOCK(kvm);
2324
2325	if (flush)
2326	kvm_flush_remote_tlbs_memslot(kvm, memslot);
2327
2328	return `0`;
2329	}
2330
2331	static int kvm_vm_ioctl_clear_dirty_log(struct kvm *kvm,
2332	struct kvm_clear_dirty_log *log)
2333	{
2334	int r;
2335
2336	mutex_lock(&kvm->slots_lock);
2337
2338	r = kvm_clear_dirty_log_protect(kvm, log);
2339
2340	mutex_unlock(lock: &kvm->slots_lock);
2341	return r;
2342	}
2343	#endif /* CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT */
2344
2345	struct kvm_memory_slot gfn_to_memslot(struct* kvm *kvm, gfn_t gfn)
2346	{
2347	return __gfn_to_memslot(slots: kvm_memslots(kvm), gfn);
2348	}
2349	EXPORT_SYMBOL_GPL(gfn_to_memslot);
2350
2351	struct kvm_memory_slot kvm_vcpu_gfn_to_memslot(struct* kvm_vcpu *vcpu, gfn_t gfn)
2352	{
2353	struct kvm_memslots *slots = kvm_vcpu_memslots(vcpu);
2354	u64 gen = slots->generation;
2355	struct kvm_memory_slot *slot;
2356
2357	/*
2358	* This also protects against using a memslot from a different address space,
2359	* since different address spaces have different generation numbers.
2360	*/
2361	if (unlikely(gen != vcpu->last_used_slot_gen)) {
2362	vcpu->last_used_slot = NULL;
2363	vcpu->last_used_slot_gen = gen;
2364	}
2365
2366	slot = try_get_memslot(slot: vcpu->last_used_slot, gfn);
2367	if (slot)
2368	return slot;
2369
2370	/*
2371	* Fall back to searching all memslots. We purposely use
2372	* search_memslots() instead of __gfn_to_memslot() to avoid
2373	* thrashing the VM-wide last_used_slot in kvm_memslots.
2374	*/
2375	slot = search_memslots(slots, gfn, approx: false);
2376	if (slot) {
2377	vcpu->last_used_slot = slot;
2378	return slot;
2379	}
2380
2381	return NULL;
2382	}
2383
2384	bool kvm_is_visible_gfn(struct kvm *kvm, gfn_t gfn)
2385	{
2386	struct kvm_memory_slot *memslot = gfn_to_memslot(kvm, gfn);
2387
2388	return kvm_is_visible_memslot(memslot);
2389	}
2390	EXPORT_SYMBOL_GPL(kvm_is_visible_gfn);
2391
2392	bool kvm_vcpu_is_visible_gfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2393	{
2394	struct kvm_memory_slot *memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2395
2396	return kvm_is_visible_memslot(memslot);
2397	}
2398	EXPORT_SYMBOL_GPL(kvm_vcpu_is_visible_gfn);
2399
2400	unsigned long kvm_host_page_size(struct kvm_vcpu *vcpu, gfn_t gfn)
2401	{
2402	struct vm_area_struct *vma;
2403	unsigned long addr, size;
2404
2405	size = PAGE_SIZE;
2406
2407	addr = kvm_vcpu_gfn_to_hva_prot(vcpu, gfn, NULL);
2408	if (kvm_is_error_hva(addr))
2409	return PAGE_SIZE;
2410
2411	mmap_read_lock(current->mm);
2412	vma = find_vma(current->mm, addr);
2413	if (!vma)
2414	goto out;
2415
2416	size = vma_kernel_pagesize(vma);
2417
2418	out:
2419	mmap_read_unlock(current->mm);
2420
2421	return size;
2422	}
2423
2424	static bool memslot_is_readonly(const struct kvm_memory_slot *slot)
2425	{
2426	return slot->flags & KVM_MEM_READONLY;
2427	}
2428
2429	static unsigned long __gfn_to_hva_many(const struct kvm_memory_slot *slot, gfn_t gfn,
2430	gfn_t *nr_pages, bool write)
2431	{
2432	if (!slot \|\| slot->flags & KVM_MEMSLOT_INVALID)
2433	return KVM_HVA_ERR_BAD;
2434
2435	if (memslot_is_readonly(slot) && write)
2436	return KVM_HVA_ERR_RO_BAD;
2437
2438	if (nr_pages)
2439	*nr_pages = slot->npages - (gfn - slot->base_gfn);
2440
2441	return __gfn_to_hva_memslot(slot, gfn);
2442	}
2443
2444	static unsigned long gfn_to_hva_many(struct kvm_memory_slot *slot, gfn_t gfn,
2445	gfn_t *nr_pages)
2446	{
2447	return __gfn_to_hva_many(slot, gfn, nr_pages, write: true);
2448	}
2449
2450	unsigned long gfn_to_hva_memslot(struct kvm_memory_slot *slot,
2451	gfn_t gfn)
2452	{
2453	return gfn_to_hva_many(slot, gfn, NULL);
2454	}
2455	EXPORT_SYMBOL_GPL(gfn_to_hva_memslot);
2456
2457	unsigned long gfn_to_hva(struct kvm *kvm, gfn_t gfn)
2458	{
2459	return gfn_to_hva_many(slot: gfn_to_memslot(kvm, gfn), gfn, NULL);
2460	}
2461	EXPORT_SYMBOL_GPL(gfn_to_hva);
2462
2463	unsigned long kvm_vcpu_gfn_to_hva(struct kvm_vcpu *vcpu, gfn_t gfn)
2464	{
2465	return gfn_to_hva_many(slot: kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn, NULL);
2466	}
2467	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_hva);
2468
2469	/*
2470	* Return the hva of a @gfn and the R/W attribute if possible.
2471	*
2472	* @slot: the kvm_memory_slot which contains @gfn
2473	* @gfn: the gfn to be translated
2474	* @writable: used to return the read/write attribute of the @slot if the hva
2475	* is valid and @writable is not NULL
2476	*/
2477	unsigned long gfn_to_hva_memslot_prot(struct kvm_memory_slot *slot,
2478	gfn_t gfn, bool *writable)
2479	{
2480	unsigned long hva = __gfn_to_hva_many(slot, gfn, NULL, write: false);
2481
2482	if (!kvm_is_error_hva(addr: hva) && writable)
2483	*writable = !memslot_is_readonly(slot);
2484
2485	return hva;
2486	}
2487
2488	unsigned long gfn_to_hva_prot(struct kvm kvm, gfn_t gfn, bool writable)
2489	{
2490	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
2491
2492	return gfn_to_hva_memslot_prot(slot, gfn, writable);
2493	}
2494
2495	unsigned long kvm_vcpu_gfn_to_hva_prot(struct kvm_vcpu vcpu, gfn_t gfn, bool writable)
2496	{
2497	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
2498
2499	return gfn_to_hva_memslot_prot(slot, gfn, writable);
2500	}
2501
2502	static inline int check_user_page_hwpoison(unsigned long addr)
2503	{
2504	int rc, flags = FOLL_HWPOISON \| FOLL_WRITE;
2505
2506	rc = get_user_pages(start: addr, nr_pages: `1`, gup_flags: flags, NULL);
2507	return rc == -EHWPOISON;
2508	}
2509
2510	/*
2511	* The fast path to get the writable pfn which will be stored in @pfn,
2512	* true indicates success, otherwise false is returned. It's also the
2513	* only part that runs if we can in atomic context.
2514	*/
2515	static bool hva_to_pfn_fast(unsigned long addr, bool write_fault,
2516	bool writable, kvm_pfn_t pfn)
2517	{
2518	struct page *page[`1`];
2519
2520	/*
2521	* Fast pin a writable pfn only if it is a write fault request
2522	* or the caller allows to map a writable pfn for a read fault
2523	* request.
2524	*/
2525	if (!(write_fault \|\| writable))
2526	return false;
2527
2528	if (get_user_page_fast_only(addr, gup_flags: FOLL_WRITE, pagep: page)) {
2529	*pfn = page_to_pfn(page[`0`]);
2530
2531	if (writable)
2532	*writable = true;
2533	return true;
2534	}
2535
2536	return false;
2537	}
2538
2539	/*
2540	* The slow path to get the pfn of the specified host virtual address,
2541	* 1 indicates success, -errno is returned if error is detected.
2542	*/
2543	static int hva_to_pfn_slow(unsigned long addr, bool *async, bool write_fault,
2544	bool interruptible, bool writable, kvm_pfn_t pfn)
2545	{
2546	/*
2547	* When a VCPU accesses a page that is not mapped into the secondary
2548	* MMU, we lookup the page using GUP to map it, so the guest VCPU can
2549	* make progress. We always want to honor NUMA hinting faults in that
2550	* case, because GUP usage corresponds to memory accesses from the VCPU.
2551	* Otherwise, we'd not trigger NUMA hinting faults once a page is
2552	* mapped into the secondary MMU and gets accessed by a VCPU.
2553	*
2554	* Note that get_user_page_fast_only() and FOLL_WRITE for now
2555	* implicitly honor NUMA hinting faults and don't need this flag.
2556	*/
2557	unsigned int flags = FOLL_HWPOISON \| FOLL_HONOR_NUMA_FAULT;
2558	struct page *page;
2559	int npages;
2560
2561	might_sleep();
2562
2563	if (writable)
2564	*writable = write_fault;
2565
2566	if (write_fault)
2567	flags \|= FOLL_WRITE;
2568	if (async)
2569	flags \|= FOLL_NOWAIT;
2570	if (interruptible)
2571	flags \|= FOLL_INTERRUPTIBLE;
2572
2573	npages = get_user_pages_unlocked(start: addr, nr_pages: `1`, pages: &page, gup_flags: flags);
2574	if (npages != `1`)
2575	return npages;
2576
2577	/ map read fault as writable if possible /
2578	if (unlikely(!write_fault) && writable) {
2579	struct page *wpage;
2580
2581	if (get_user_page_fast_only(addr, gup_flags: FOLL_WRITE, pagep: &wpage)) {
2582	*writable = true;
2583	put_page(page);
2584	page = wpage;
2585	}
2586	}
2587	*pfn = page_to_pfn(page);
2588	return npages;
2589	}
2590
2591	static bool vma_is_valid(struct vm_area_struct *vma, bool write_fault)
2592	{
2593	if (unlikely(!(vma->vm_flags & VM_READ)))
2594	return false;
2595
2596	if (write_fault && (unlikely(!(vma->vm_flags & VM_WRITE))))
2597	return false;
2598
2599	return true;
2600	}
2601
2602	static int kvm_try_get_pfn(kvm_pfn_t pfn)
2603	{
2604	struct page *page = kvm_pfn_to_refcounted_page(pfn);
2605
2606	if (!page)
2607	return `1`;
2608
2609	return get_page_unless_zero(page);
2610	}
2611
2612	static int hva_to_pfn_remapped(struct vm_area_struct *vma,
2613	unsigned long addr, bool write_fault,
2614	bool writable, kvm_pfn_t p_pfn)
2615	{
2616	kvm_pfn_t pfn;
2617	pte_t *ptep;
2618	pte_t pte;
2619	spinlock_t *ptl;
2620	int r;
2621
2622	r = follow_pte(mm: vma->vm_mm, address: addr, ptepp: &ptep, ptlp: &ptl);
2623	if (r) {
2624	/*
2625	* get_user_pages fails for VM_IO and VM_PFNMAP vmas and does
2626	* not call the fault handler, so do it here.
2627	*/
2628	bool unlocked = false;
2629	r = fixup_user_fault(current->mm, address: addr,
2630	fault_flags: (write_fault ? FAULT_FLAG_WRITE : `0`),
2631	unlocked: &unlocked);
2632	if (unlocked)
2633	return -EAGAIN;
2634	if (r)
2635	return r;
2636
2637	r = follow_pte(mm: vma->vm_mm, address: addr, ptepp: &ptep, ptlp: &ptl);
2638	if (r)
2639	return r;
2640	}
2641
2642	pte = ptep_get(ptep);
2643
2644	if (write_fault && !pte_write(pte)) {
2645	pfn = KVM_PFN_ERR_RO_FAULT;
2646	goto out;
2647	}
2648
2649	if (writable)
2650	*writable = pte_write(pte);
2651	pfn = pte_pfn(pte);
2652
2653	/*
2654	* Get a reference here because callers of hva_to_pfn and
2655	* gfn_to_pfn ultimately call kvm_release_pfn_clean on the
2656	* returned pfn. This is only needed if the VMA has VM_MIXEDMAP
2657	* set, but the kvm_try_get_pfn/kvm_release_pfn_clean pair will
2658	* simply do nothing for reserved pfns.
2659	*
2660	* Whoever called remap_pfn_range is also going to call e.g.
2661	* unmap_mapping_range before the underlying pages are freed,
2662	* causing a call to our MMU notifier.
2663	*
2664	* Certain IO or PFNMAP mappings can be backed with valid
2665	* struct pages, but be allocated without refcounting e.g.,
2666	* tail pages of non-compound higher order allocations, which
2667	* would then underflow the refcount when the caller does the
2668	* required put_page. Don't allow those pages here.
2669	*/
2670	if (!kvm_try_get_pfn(pfn))
2671	r = -EFAULT;
2672
2673	out:
2674	pte_unmap_unlock(ptep, ptl);
2675	*p_pfn = pfn;
2676
2677	return r;
2678	}
2679
2680	/*
2681	* Pin guest page in memory and return its pfn.
2682	* @addr: host virtual address which maps memory to the guest
2683	* @atomic: whether this function can sleep
2684	* @interruptible: whether the process can be interrupted by non-fatal signals
2685	* @async: whether this function need to wait IO complete if the
2686	* host page is not in the memory
2687	* @write_fault: whether we should get a writable host page
2688	* @writable: whether it allows to map a writable host page for !@write_fault
2689	*
2690	* The function will map a writable host page for these two cases:
2691	* 1): @write_fault = true
2692	* 2): @write_fault = false && @writable, @writable will tell the caller
2693	* whether the mapping is writable.
2694	*/
2695	kvm_pfn_t hva_to_pfn(unsigned long addr, bool atomic, bool interruptible,
2696	bool async, bool write_fault, bool writable)
2697	{
2698	struct vm_area_struct *vma;
2699	kvm_pfn_t pfn;
2700	int npages, r;
2701
2702	/ we can do it either atomically or asynchronously, not both /
2703	BUG_ON(atomic && async);
2704
2705	if (hva_to_pfn_fast(addr, write_fault, writable, pfn: &pfn))
2706	return pfn;
2707
2708	if (atomic)
2709	return KVM_PFN_ERR_FAULT;
2710
2711	npages = hva_to_pfn_slow(addr, async, write_fault, interruptible,
2712	writable, pfn: &pfn);
2713	if (npages == `1`)
2714	return pfn;
2715	if (npages == -EINTR)
2716	return KVM_PFN_ERR_SIGPENDING;
2717
2718	mmap_read_lock(current->mm);
2719	if (npages == -EHWPOISON \|\|
2720	(!async && check_user_page_hwpoison(addr))) {
2721	pfn = KVM_PFN_ERR_HWPOISON;
2722	goto exit;
2723	}
2724
2725	retry:
2726	vma = vma_lookup(current->mm, addr);
2727
2728	if (vma == NULL)
2729	pfn = KVM_PFN_ERR_FAULT;
2730	else if (vma->vm_flags & (VM_IO \| VM_PFNMAP)) {
2731	r = hva_to_pfn_remapped(vma, addr, write_fault, writable, p_pfn: &pfn);
2732	if (r == -EAGAIN)
2733	goto retry;
2734	if (r < `0`)
2735	pfn = KVM_PFN_ERR_FAULT;
2736	} else {
2737	if (async && vma_is_valid(vma, write_fault))
2738	*async = true;
2739	pfn = KVM_PFN_ERR_FAULT;
2740	}
2741	exit:
2742	mmap_read_unlock(current->mm);
2743	return pfn;
2744	}
2745
2746	kvm_pfn_t __gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn,
2747	bool atomic, bool interruptible, bool *async,
2748	bool write_fault, bool writable, hva_t hva)
2749	{
2750	unsigned long addr = __gfn_to_hva_many(slot, gfn, NULL, write: write_fault);
2751
2752	if (hva)
2753	*hva = addr;
2754
2755	if (addr == KVM_HVA_ERR_RO_BAD) {
2756	if (writable)
2757	*writable = false;
2758	return KVM_PFN_ERR_RO_FAULT;
2759	}
2760
2761	if (kvm_is_error_hva(addr)) {
2762	if (writable)
2763	*writable = false;
2764	return KVM_PFN_NOSLOT;
2765	}
2766
2767	/ Do not map writable pfn in the readonly memslot. /
2768	if (writable && memslot_is_readonly(slot)) {
2769	*writable = false;
2770	writable = NULL;
2771	}
2772
2773	return hva_to_pfn(addr, atomic, interruptible, async, write_fault,
2774	writable);
2775	}
2776	EXPORT_SYMBOL_GPL(__gfn_to_pfn_memslot);
2777
2778	kvm_pfn_t gfn_to_pfn_prot(struct kvm *kvm, gfn_t gfn, bool write_fault,
2779	bool *writable)
2780	{
2781	return __gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn, false, false,
2782	NULL, write_fault, writable, NULL);
2783	}
2784	EXPORT_SYMBOL_GPL(gfn_to_pfn_prot);
2785
2786	kvm_pfn_t gfn_to_pfn_memslot(const struct kvm_memory_slot *slot, gfn_t gfn)
2787	{
2788	return __gfn_to_pfn_memslot(slot, gfn, false, false, NULL, true,
2789	NULL, NULL);
2790	}
2791	EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot);
2792
2793	kvm_pfn_t gfn_to_pfn_memslot_atomic(const struct kvm_memory_slot *slot, gfn_t gfn)
2794	{
2795	return __gfn_to_pfn_memslot(slot, gfn, true, false, NULL, true,
2796	NULL, NULL);
2797	}
2798	EXPORT_SYMBOL_GPL(gfn_to_pfn_memslot_atomic);
2799
2800	kvm_pfn_t kvm_vcpu_gfn_to_pfn_atomic(struct kvm_vcpu *vcpu, gfn_t gfn)
2801	{
2802	return gfn_to_pfn_memslot_atomic(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2803	}
2804	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn_atomic);
2805
2806	kvm_pfn_t gfn_to_pfn(struct kvm *kvm, gfn_t gfn)
2807	{
2808	return gfn_to_pfn_memslot(gfn_to_memslot(kvm, gfn), gfn);
2809	}
2810	EXPORT_SYMBOL_GPL(gfn_to_pfn);
2811
2812	kvm_pfn_t kvm_vcpu_gfn_to_pfn(struct kvm_vcpu *vcpu, gfn_t gfn)
2813	{
2814	return gfn_to_pfn_memslot(kvm_vcpu_gfn_to_memslot(vcpu, gfn), gfn);
2815	}
2816	EXPORT_SYMBOL_GPL(kvm_vcpu_gfn_to_pfn);
2817
2818	int gfn_to_page_many_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
2819	struct page *pages, int* nr_pages)
2820	{
2821	unsigned long addr;
2822	gfn_t entry = `0`;
2823
2824	addr = gfn_to_hva_many(slot, gfn, nr_pages: &entry);
2825	if (kvm_is_error_hva(addr))
2826	return -`1`;
2827
2828	if (entry < nr_pages)
2829	return `0`;
2830
2831	return get_user_pages_fast_only(start: addr, nr_pages, gup_flags: FOLL_WRITE, pages);
2832	}
2833	EXPORT_SYMBOL_GPL(gfn_to_page_many_atomic);
2834
2835	/*
2836	* Do not use this helper unless you are absolutely certain the gfn _must_ be
2837	* backed by 'struct page'. A valid example is if the backing memslot is
2838	* controlled by KVM. Note, if the returned page is valid, it's refcount has
2839	* been elevated by gfn_to_pfn().
2840	*/
2841	struct page gfn_to_page(struct* kvm *kvm, gfn_t gfn)
2842	{
2843	struct page *page;
2844	kvm_pfn_t pfn;
2845
2846	pfn = gfn_to_pfn(kvm, gfn);
2847
2848	if (is_error_noslot_pfn(pfn))
2849	return KVM_ERR_PTR_BAD_PAGE;
2850
2851	page = kvm_pfn_to_refcounted_page(pfn);
2852	if (!page)
2853	return KVM_ERR_PTR_BAD_PAGE;
2854
2855	return page;
2856	}
2857	EXPORT_SYMBOL_GPL(gfn_to_page);
2858
2859	void kvm_release_pfn(kvm_pfn_t pfn, bool dirty)
2860	{
2861	if (dirty)
2862	kvm_release_pfn_dirty(pfn);
2863	else
2864	kvm_release_pfn_clean(pfn);
2865	}
2866
2867	int kvm_vcpu_map(struct kvm_vcpu vcpu, gfn_t gfn, struct* kvm_host_map *map)
2868	{
2869	kvm_pfn_t pfn;
2870	void *hva = NULL;
2871	struct page *page = KVM_UNMAPPED_PAGE;
2872
2873	if (!map)
2874	return -EINVAL;
2875
2876	pfn = gfn_to_pfn(vcpu->kvm, gfn);
2877	if (is_error_noslot_pfn(pfn))
2878	return -EINVAL;
2879
2880	if (pfn_valid(pfn)) {
2881	page = pfn_to_page(pfn);
2882	hva = kmap(page);
2883	#ifdef CONFIG_HAS_IOMEM
2884	} else {
2885	hva = memremap(offset: pfn_to_hpa(pfn), PAGE_SIZE, flags: MEMREMAP_WB);
2886	#endif
2887	}
2888
2889	if (!hva)
2890	return -EFAULT;
2891
2892	map->page = page;
2893	map->hva = hva;
2894	map->pfn = pfn;
2895	map->gfn = gfn;
2896
2897	return `0`;
2898	}
2899	EXPORT_SYMBOL_GPL(kvm_vcpu_map);
2900
2901	void kvm_vcpu_unmap(struct kvm_vcpu vcpu, struct* kvm_host_map *map, bool dirty)
2902	{
2903	if (!map)
2904	return;
2905
2906	if (!map->hva)
2907	return;
2908
2909	if (map->page != KVM_UNMAPPED_PAGE)
2910	kunmap(page: map->page);
2911	#ifdef CONFIG_HAS_IOMEM
2912	else
2913	memunmap(addr: map->hva);
2914	#endif
2915
2916	if (dirty)
2917	kvm_vcpu_mark_page_dirty(vcpu, gfn: map->gfn);
2918
2919	kvm_release_pfn(pfn: map->pfn, dirty);
2920
2921	map->hva = NULL;
2922	map->page = NULL;
2923	}
2924	EXPORT_SYMBOL_GPL(kvm_vcpu_unmap);
2925
2926	static bool kvm_is_ad_tracked_page(struct page *page)
2927	{
2928	/*
2929	* Per page-flags.h, pages tagged PG_reserved "should in general not be
2930	* touched (e.g. set dirty) except by its owner".
2931	*/
2932	return !PageReserved(page);
2933	}
2934
/* Mark @page dirty unless kvm_is_ad_tracked_page() rules it out. */
static void kvm_set_page_dirty(struct page *page)
{
	if (!kvm_is_ad_tracked_page(page))
		return;

	SetPageDirty(page);
}
2940
/* Mark @page accessed unless kvm_is_ad_tracked_page() rules it out. */
static void kvm_set_page_accessed(struct page *page)
{
	if (!kvm_is_ad_tracked_page(page))
		return;

	mark_page_accessed(page);
}
2946
/*
 * Drop a reference on @page after marking it accessed.  Warns if @page is
 * an error page, but still proceeds.
 */
void kvm_release_page_clean(struct page *page)
{
	WARN_ON(is_error_page(page));

	/* Record the access first, then give up the reference. */
	kvm_set_page_accessed(page);
	put_page(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_clean);
2955
2956	void kvm_release_pfn_clean(kvm_pfn_t pfn)
2957	{
2958	struct page *page;
2959
2960	if (is_error_noslot_pfn(pfn))
2961	return;
2962
2963	page = kvm_pfn_to_refcounted_page(pfn);
2964	if (!page)
2965	return;
2966
2967	kvm_release_page_clean(page);
2968	}
2969	EXPORT_SYMBOL_GPL(kvm_release_pfn_clean);
2970
/*
 * Mark @page dirty, then release it exactly as kvm_release_page_clean()
 * does.  Warns if @page is an error page, but still proceeds.
 */
void kvm_release_page_dirty(struct page *page)
{
	WARN_ON(is_error_page(page));

	/* Dirty state must be recorded before the reference is dropped. */
	kvm_set_page_dirty(page);
	kvm_release_page_clean(page);
}
EXPORT_SYMBOL_GPL(kvm_release_page_dirty);
2979
2980	void kvm_release_pfn_dirty(kvm_pfn_t pfn)
2981	{
2982	struct page *page;
2983
2984	if (is_error_noslot_pfn(pfn))
2985	return;
2986
2987	page = kvm_pfn_to_refcounted_page(pfn);
2988	if (!page)
2989	return;
2990
2991	kvm_release_page_dirty(page);
2992	}
2993	EXPORT_SYMBOL_GPL(kvm_release_pfn_dirty);
2994
2995	/*
2996	* Note, checking for an error/noslot pfn is the caller's responsibility when
2997	* directly marking a page dirty/accessed. Unlike the "release" helpers, the
2998	* "set" helpers are not to be used when the pfn might point at garbage.
2999	*/
3000	void kvm_set_pfn_dirty(kvm_pfn_t pfn)
3001	{
3002	if (WARN_ON(is_error_noslot_pfn(pfn)))
3003	return;
3004
3005	if (pfn_valid(pfn))
3006	kvm_set_page_dirty(pfn_to_page(pfn));
3007	}
3008	EXPORT_SYMBOL_GPL(kvm_set_pfn_dirty);
3009
3010	void kvm_set_pfn_accessed(kvm_pfn_t pfn)
3011	{
3012	if (WARN_ON(is_error_noslot_pfn(pfn)))
3013	return;
3014
3015	if (pfn_valid(pfn))
3016	kvm_set_page_accessed(pfn_to_page(pfn));
3017	}
3018	EXPORT_SYMBOL_GPL(kvm_set_pfn_accessed);
3019
3020	static int next_segment(unsigned long len, int offset)
3021	{
3022	if (len > PAGE_SIZE - offset)
3023	return PAGE_SIZE - offset;
3024	else
3025	return len;
3026	}
3027
3028	static int __kvm_read_guest_page(struct kvm_memory_slot *slot, gfn_t gfn,
3029	void data, int* offset, int len)
3030	{
3031	int r;
3032	unsigned long addr;
3033
3034	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3035	if (kvm_is_error_hva(addr))
3036	return -EFAULT;
3037	r = __copy_from_user(to: data, from: (void __user *)addr + offset, n: len);
3038	if (r)
3039	return -EFAULT;
3040	return `0`;
3041	}
3042
3043	int kvm_read_guest_page(struct kvm kvm, gfn_t gfn, void* data, int* offset,
3044	int len)
3045	{
3046	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3047
3048	return __kvm_read_guest_page(slot, gfn, data, offset, len);
3049	}
3050	EXPORT_SYMBOL_GPL(kvm_read_guest_page);
3051
3052	int kvm_vcpu_read_guest_page(struct kvm_vcpu vcpu, gfn_t gfn, void* *data,
3053	int offset, int len)
3054	{
3055	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3056
3057	return __kvm_read_guest_page(slot, gfn, data, offset, len);
3058	}
3059	EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_page);
3060
3061	int kvm_read_guest(struct kvm kvm, gpa_t gpa, void* data, unsigned* long len)
3062	{
3063	gfn_t gfn = gpa >> PAGE_SHIFT;
3064	int seg;
3065	int offset = offset_in_page(gpa);
3066	int ret;
3067
3068	while ((seg = next_segment(len, offset)) != `0`) {
3069	ret = kvm_read_guest_page(kvm, gfn, data, offset, seg);
3070	if (ret < `0`)
3071	return ret;
3072	offset = `0`;
3073	len -= seg;
3074	data += seg;
3075	++gfn;
3076	}
3077	return `0`;
3078	}
3079	EXPORT_SYMBOL_GPL(kvm_read_guest);
3080
3081	int kvm_vcpu_read_guest(struct kvm_vcpu vcpu, gpa_t gpa, void* data, unsigned* long len)
3082	{
3083	gfn_t gfn = gpa >> PAGE_SHIFT;
3084	int seg;
3085	int offset = offset_in_page(gpa);
3086	int ret;
3087
3088	while ((seg = next_segment(len, offset)) != `0`) {
3089	ret = kvm_vcpu_read_guest_page(vcpu, gfn, data, offset, seg);
3090	if (ret < `0`)
3091	return ret;
3092	offset = `0`;
3093	len -= seg;
3094	data += seg;
3095	++gfn;
3096	}
3097	return `0`;
3098	}
3099	EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest);
3100
3101	static int __kvm_read_guest_atomic(struct kvm_memory_slot *slot, gfn_t gfn,
3102	void data, int* offset, unsigned long len)
3103	{
3104	int r;
3105	unsigned long addr;
3106
3107	addr = gfn_to_hva_memslot_prot(slot, gfn, NULL);
3108	if (kvm_is_error_hva(addr))
3109	return -EFAULT;
3110	pagefault_disable();
3111	r = __copy_from_user_inatomic(to: data, from: (void __user *)addr + offset, n: len);
3112	pagefault_enable();
3113	if (r)
3114	return -EFAULT;
3115	return `0`;
3116	}
3117
3118	int kvm_vcpu_read_guest_atomic(struct kvm_vcpu *vcpu, gpa_t gpa,
3119	void data, unsigned* long len)
3120	{
3121	gfn_t gfn = gpa >> PAGE_SHIFT;
3122	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3123	int offset = offset_in_page(gpa);
3124
3125	return __kvm_read_guest_atomic(slot, gfn, data, offset, len);
3126	}
3127	EXPORT_SYMBOL_GPL(kvm_vcpu_read_guest_atomic);
3128
3129	static int __kvm_write_guest_page(struct kvm *kvm,
3130	struct kvm_memory_slot *memslot, gfn_t gfn,
3131	const void data, int* offset, int len)
3132	{
3133	int r;
3134	unsigned long addr;
3135
3136	addr = gfn_to_hva_memslot(memslot, gfn);
3137	if (kvm_is_error_hva(addr))
3138	return -EFAULT;
3139	r = __copy_to_user(to: (void __user *)addr + offset, from: data, n: len);
3140	if (r)
3141	return -EFAULT;
3142	mark_page_dirty_in_slot(kvm, memslot, gfn);
3143	return `0`;
3144	}
3145
3146	int kvm_write_guest_page(struct kvm *kvm, gfn_t gfn,
3147	const void data, int* offset, int len)
3148	{
3149	struct kvm_memory_slot *slot = gfn_to_memslot(kvm, gfn);
3150
3151	return __kvm_write_guest_page(kvm, memslot: slot, gfn, data, offset, len);
3152	}
3153	EXPORT_SYMBOL_GPL(kvm_write_guest_page);
3154
3155	int kvm_vcpu_write_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
3156	const void data, int* offset, int len)
3157	{
3158	struct kvm_memory_slot *slot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3159
3160	return __kvm_write_guest_page(kvm: vcpu->kvm, memslot: slot, gfn, data, offset, len);
3161	}
3162	EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest_page);
3163
3164	int kvm_write_guest(struct kvm kvm, gpa_t gpa, const* void *data,
3165	unsigned long len)
3166	{
3167	gfn_t gfn = gpa >> PAGE_SHIFT;
3168	int seg;
3169	int offset = offset_in_page(gpa);
3170	int ret;
3171
3172	while ((seg = next_segment(len, offset)) != `0`) {
3173	ret = kvm_write_guest_page(kvm, gfn, data, offset, seg);
3174	if (ret < `0`)
3175	return ret;
3176	offset = `0`;
3177	len -= seg;
3178	data += seg;
3179	++gfn;
3180	}
3181	return `0`;
3182	}
3183	EXPORT_SYMBOL_GPL(kvm_write_guest);
3184
3185	int kvm_vcpu_write_guest(struct kvm_vcpu vcpu, gpa_t gpa, const* void *data,
3186	unsigned long len)
3187	{
3188	gfn_t gfn = gpa >> PAGE_SHIFT;
3189	int seg;
3190	int offset = offset_in_page(gpa);
3191	int ret;
3192
3193	while ((seg = next_segment(len, offset)) != `0`) {
3194	ret = kvm_vcpu_write_guest_page(vcpu, gfn, data, offset, seg);
3195	if (ret < `0`)
3196	return ret;
3197	offset = `0`;
3198	len -= seg;
3199	data += seg;
3200	++gfn;
3201	}
3202	return `0`;
3203	}
3204	EXPORT_SYMBOL_GPL(kvm_vcpu_write_guest);
3205
3206	static int __kvm_gfn_to_hva_cache_init(struct kvm_memslots *slots,
3207	struct gfn_to_hva_cache *ghc,
3208	gpa_t gpa, unsigned long len)
3209	{
3210	int offset = offset_in_page(gpa);
3211	gfn_t start_gfn = gpa >> PAGE_SHIFT;
3212	gfn_t end_gfn = (gpa + len - `1`) >> PAGE_SHIFT;
3213	gfn_t nr_pages_needed = end_gfn - start_gfn + `1`;
3214	gfn_t nr_pages_avail;
3215
3216	/ Update ghc->generation before performing any error checks. /
3217	ghc->generation = slots->generation;
3218
3219	if (start_gfn > end_gfn) {
3220	ghc->hva = KVM_HVA_ERR_BAD;
3221	return -EINVAL;
3222	}
3223
3224	/*
3225	* If the requested region crosses two memslots, we still
3226	* verify that the entire region is valid here.
3227	*/
3228	for ( ; start_gfn <= end_gfn; start_gfn += nr_pages_avail) {
3229	ghc->memslot = __gfn_to_memslot(slots, gfn: start_gfn);
3230	ghc->hva = gfn_to_hva_many(slot: ghc->memslot, gfn: start_gfn,
3231	nr_pages: &nr_pages_avail);
3232	if (kvm_is_error_hva(addr: ghc->hva))
3233	return -EFAULT;
3234	}
3235
3236	/ Use the slow path for cross page reads and writes. /
3237	if (nr_pages_needed == `1`)
3238	ghc->hva += offset;
3239	else
3240	ghc->memslot = NULL;
3241
3242	ghc->gpa = gpa;
3243	ghc->len = len;
3244	return `0`;
3245	}
3246
3247	int kvm_gfn_to_hva_cache_init(struct kvm kvm, struct* gfn_to_hva_cache *ghc,
3248	gpa_t gpa, unsigned long len)
3249	{
3250	struct kvm_memslots *slots = kvm_memslots(kvm);
3251	return __kvm_gfn_to_hva_cache_init(slots, ghc, gpa, len);
3252	}
3253	EXPORT_SYMBOL_GPL(kvm_gfn_to_hva_cache_init);
3254
3255	int kvm_write_guest_offset_cached(struct kvm kvm, struct* gfn_to_hva_cache *ghc,
3256	void data, unsigned* int offset,
3257	unsigned long len)
3258	{
3259	struct kvm_memslots *slots = kvm_memslots(kvm);
3260	int r;
3261	gpa_t gpa = ghc->gpa + offset;
3262
3263	if (WARN_ON_ONCE(len + offset > ghc->len))
3264	return -EINVAL;
3265
3266	if (slots->generation != ghc->generation) {
3267	if (__kvm_gfn_to_hva_cache_init(slots, ghc, gpa: ghc->gpa, len: ghc->len))
3268	return -EFAULT;
3269	}
3270
3271	if (kvm_is_error_hva(addr: ghc->hva))
3272	return -EFAULT;
3273
3274	if (unlikely(!ghc->memslot))
3275	return kvm_write_guest(kvm, gpa, data, len);
3276
3277	r = __copy_to_user(to: (void __user *)ghc->hva + offset, from: data, n: len);
3278	if (r)
3279	return -EFAULT;
3280	mark_page_dirty_in_slot(kvm, memslot: ghc->memslot, gfn: gpa >> PAGE_SHIFT);
3281
3282	return `0`;
3283	}
3284	EXPORT_SYMBOL_GPL(kvm_write_guest_offset_cached);
3285
/*
 * Write @len bytes from @data to the start of the guest range described by
 * @ghc; shorthand for kvm_write_guest_offset_cached() with offset 0.
 */
int kvm_write_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			   void *data, unsigned long len)
{
	return kvm_write_guest_offset_cached(kvm, ghc, data, 0, len);
}
EXPORT_SYMBOL_GPL(kvm_write_guest_cached);
3292
3293	int kvm_read_guest_offset_cached(struct kvm kvm, struct* gfn_to_hva_cache *ghc,
3294	void data, unsigned* int offset,
3295	unsigned long len)
3296	{
3297	struct kvm_memslots *slots = kvm_memslots(kvm);
3298	int r;
3299	gpa_t gpa = ghc->gpa + offset;
3300
3301	if (WARN_ON_ONCE(len + offset > ghc->len))
3302	return -EINVAL;
3303
3304	if (slots->generation != ghc->generation) {
3305	if (__kvm_gfn_to_hva_cache_init(slots, ghc, gpa: ghc->gpa, len: ghc->len))
3306	return -EFAULT;
3307	}
3308
3309	if (kvm_is_error_hva(addr: ghc->hva))
3310	return -EFAULT;
3311
3312	if (unlikely(!ghc->memslot))
3313	return kvm_read_guest(kvm, gpa, data, len);
3314
3315	r = __copy_from_user(to: data, from: (void __user *)ghc->hva + offset, n: len);
3316	if (r)
3317	return -EFAULT;
3318
3319	return `0`;
3320	}
3321	EXPORT_SYMBOL_GPL(kvm_read_guest_offset_cached);
3322
/*
 * Read @len bytes into @data from the start of the guest range described by
 * @ghc; shorthand for kvm_read_guest_offset_cached() with offset 0.
 */
int kvm_read_guest_cached(struct kvm *kvm, struct gfn_to_hva_cache *ghc,
			  void *data, unsigned long len)
{
	return kvm_read_guest_offset_cached(kvm, ghc, data, 0, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_cached);
3329
3330	int kvm_clear_guest(struct kvm kvm, gpa_t gpa, unsigned* long len)
3331	{
3332	const void zero_page = (const* void *) __va(page_to_phys(ZERO_PAGE(`0`)));
3333	gfn_t gfn = gpa >> PAGE_SHIFT;
3334	int seg;
3335	int offset = offset_in_page(gpa);
3336	int ret;
3337
3338	while ((seg = next_segment(len, offset)) != `0`) {
3339	ret = kvm_write_guest_page(kvm, gfn, zero_page, offset, len);
3340	if (ret < `0`)
3341	return ret;
3342	offset = `0`;
3343	len -= seg;
3344	++gfn;
3345	}
3346	return `0`;
3347	}
3348	EXPORT_SYMBOL_GPL(kvm_clear_guest);
3349
3350	void mark_page_dirty_in_slot(struct kvm *kvm,
3351	const struct kvm_memory_slot *memslot,
3352	gfn_t gfn)
3353	{
3354	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
3355
3356	#ifdef CONFIG_HAVE_KVM_DIRTY_RING
3357	if (WARN_ON_ONCE(vcpu && vcpu->kvm != kvm))
3358	return;
3359
3360	WARN_ON_ONCE(!vcpu && !kvm_arch_allow_write_without_running_vcpu(kvm));
3361	#endif
3362
3363	if (memslot && kvm_slot_dirty_track_enabled(slot: memslot)) {
3364	unsigned long rel_gfn = gfn - memslot->base_gfn;
3365	u32 slot = (memslot->as_id << `16`) \| memslot->id;
3366
3367	if (kvm->dirty_ring_size && vcpu)
3368	kvm_dirty_ring_push(vcpu, slot, offset: rel_gfn);
3369	else if (memslot->dirty_bitmap)
3370	set_bit_le(nr: rel_gfn, addr: memslot->dirty_bitmap);
3371	}
3372	}
3373	EXPORT_SYMBOL_GPL(mark_page_dirty_in_slot);
3374
3375	void mark_page_dirty(struct kvm *kvm, gfn_t gfn)
3376	{
3377	struct kvm_memory_slot *memslot;
3378
3379	memslot = gfn_to_memslot(kvm, gfn);
3380	mark_page_dirty_in_slot(kvm, memslot, gfn);
3381	}
3382	EXPORT_SYMBOL_GPL(mark_page_dirty);
3383
3384	void kvm_vcpu_mark_page_dirty(struct kvm_vcpu *vcpu, gfn_t gfn)
3385	{
3386	struct kvm_memory_slot *memslot;
3387
3388	memslot = kvm_vcpu_gfn_to_memslot(vcpu, gfn);
3389	mark_page_dirty_in_slot(vcpu->kvm, memslot, gfn);
3390	}
3391	EXPORT_SYMBOL_GPL(kvm_vcpu_mark_page_dirty);
3392
3393	void kvm_sigset_activate(struct kvm_vcpu *vcpu)
3394	{
3395	if (!vcpu->sigset_active)
3396	return;
3397
3398	/*
3399	* This does a lockless modification of ->real_blocked, which is fine
3400	* because, only current can change ->real_blocked and all readers of
3401	* ->real_blocked don't care as long ->real_blocked is always a subset
3402	* of ->blocked.
3403	*/
3404	sigprocmask(SIG_SETMASK, &vcpu->sigset, &current->real_blocked);
3405	}
3406
3407	void kvm_sigset_deactivate(struct kvm_vcpu *vcpu)
3408	{
3409	if (!vcpu->sigset_active)
3410	return;
3411
3412	sigprocmask(SIG_SETMASK, &current->real_blocked, NULL);
3413	sigemptyset(set: &current->real_blocked);
3414	}
3415
3416	static void grow_halt_poll_ns(struct kvm_vcpu *vcpu)
3417	{
3418	unsigned int old, val, grow, grow_start;
3419
3420	old = val = vcpu->halt_poll_ns;
3421	grow_start = READ_ONCE(halt_poll_ns_grow_start);
3422	grow = READ_ONCE(halt_poll_ns_grow);
3423	if (!grow)
3424	goto out;
3425
3426	val *= grow;
3427	if (val < grow_start)
3428	val = grow_start;
3429
3430	vcpu->halt_poll_ns = val;
3431	out:
3432	trace_kvm_halt_poll_ns_grow(vcpu->vcpu_id, val, old);
3433	}
3434
3435	static void shrink_halt_poll_ns(struct kvm_vcpu *vcpu)
3436	{
3437	unsigned int old, val, shrink, grow_start;
3438
3439	old = val = vcpu->halt_poll_ns;
3440	shrink = READ_ONCE(halt_poll_ns_shrink);
3441	grow_start = READ_ONCE(halt_poll_ns_grow_start);
3442	if (shrink == `0`)
3443	val = `0`;
3444	else
3445	val /= shrink;
3446
3447	if (val < grow_start)
3448	val = `0`;
3449
3450	vcpu->halt_poll_ns = val;
3451	trace_kvm_halt_poll_ns_shrink(vcpu->vcpu_id, val, old);
3452	}
3453
3454	static int kvm_vcpu_check_block(struct kvm_vcpu *vcpu)
3455	{
3456	int ret = -EINTR;
3457	int idx = srcu_read_lock(ssp: &vcpu->kvm->srcu);
3458
3459	if (kvm_arch_vcpu_runnable(vcpu))
3460	goto out;
3461	if (kvm_cpu_has_pending_timer(vcpu))
3462	goto out;
3463	if (signal_pending(current))
3464	goto out;
3465	if (kvm_check_request(KVM_REQ_UNBLOCK, vcpu))
3466	goto out;
3467
3468	ret = `0`;
3469	out:
3470	srcu_read_unlock(ssp: &vcpu->kvm->srcu, idx);
3471	return ret;
3472	}
3473
3474	/*
3475	* Block the vCPU until the vCPU is runnable, an event arrives, or a signal is
3476	* pending. This is mostly used when halting a vCPU, but may also be used
3477	* directly for other vCPU non-runnable states, e.g. x86's Wait-For-SIPI.
3478	*/
3479	bool kvm_vcpu_block(struct kvm_vcpu *vcpu)
3480	{
3481	struct rcuwait *wait = kvm_arch_vcpu_get_wait(vcpu);
3482	bool waited = false;
3483
3484	vcpu->stat.generic.blocking = `1`;
3485
3486	preempt_disable();
3487	kvm_arch_vcpu_blocking(vcpu);
3488	prepare_to_rcuwait(w: wait);
3489	preempt_enable();
3490
3491	for (;;) {
3492	set_current_state(TASK_INTERRUPTIBLE);
3493
3494	if (kvm_vcpu_check_block(vcpu) < `0`)
3495	break;
3496
3497	waited = true;
3498	schedule();
3499	}
3500
3501	preempt_disable();
3502	finish_rcuwait(w: wait);
3503	kvm_arch_vcpu_unblocking(vcpu);
3504	preempt_enable();
3505
3506	vcpu->stat.generic.blocking = `0`;
3507
3508	return waited;
3509	}
3510
3511	static inline void update_halt_poll_stats(struct kvm_vcpu *vcpu, ktime_t start,
3512	ktime_t end, bool success)
3513	{
3514	struct kvm_vcpu_stat_generic *stats = &vcpu->stat.generic;
3515	u64 poll_ns = ktime_to_ns(ktime_sub(end, start));
3516
3517	++vcpu->stat.generic.halt_attempted_poll;
3518
3519	if (success) {
3520	++vcpu->stat.generic.halt_successful_poll;
3521
3522	if (!vcpu_valid_wakeup(vcpu))
3523	++vcpu->stat.generic.halt_poll_invalid;
3524
3525	stats->halt_poll_success_ns += poll_ns;
3526	KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_success_hist, poll_ns);
3527	} else {
3528	stats->halt_poll_fail_ns += poll_ns;
3529	KVM_STATS_LOG_HIST_UPDATE(stats->halt_poll_fail_hist, poll_ns);
3530	}
3531	}
3532
3533	static unsigned int kvm_vcpu_max_halt_poll_ns(struct kvm_vcpu *vcpu)
3534	{
3535	struct kvm *kvm = vcpu->kvm;
3536
3537	if (kvm->override_halt_poll_ns) {
3538	/*
3539	* Ensure kvm->max_halt_poll_ns is not read before
3540	* kvm->override_halt_poll_ns.
3541	*
3542	* Pairs with the smp_wmb() when enabling KVM_CAP_HALT_POLL.
3543	*/
3544	smp_rmb();
3545	return READ_ONCE(kvm->max_halt_poll_ns);
3546	}
3547
3548	return READ_ONCE(halt_poll_ns);
3549	}
3550
3551	/*
3552	* Emulate a vCPU halt condition, e.g. HLT on x86, WFI on arm, etc... If halt
3553	* polling is enabled, busy wait for a short time before blocking to avoid the
3554	* expensive block+unblock sequence if a wake event arrives soon after the vCPU
3555	* is halted.
3556	*/
3557	void kvm_vcpu_halt(struct kvm_vcpu *vcpu)
3558	{
3559	unsigned int max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3560	bool halt_poll_allowed = !kvm_arch_no_poll(vcpu);
3561	ktime_t start, cur, poll_end;
3562	bool waited = false;
3563	bool do_halt_poll;
3564	u64 halt_ns;
3565
3566	if (vcpu->halt_poll_ns > max_halt_poll_ns)
3567	vcpu->halt_poll_ns = max_halt_poll_ns;
3568
3569	do_halt_poll = halt_poll_allowed && vcpu->halt_poll_ns;
3570
3571	start = cur = poll_end = ktime_get();
3572	if (do_halt_poll) {
3573	ktime_t stop = ktime_add_ns(start, vcpu->halt_poll_ns);
3574
3575	do {
3576	if (kvm_vcpu_check_block(vcpu) < `0`)
3577	goto out;
3578	cpu_relax();
3579	poll_end = cur = ktime_get();
3580	} while (kvm_vcpu_can_poll(cur, stop));
3581	}
3582
3583	waited = kvm_vcpu_block(vcpu);
3584
3585	cur = ktime_get();
3586	if (waited) {
3587	vcpu->stat.generic.halt_wait_ns +=
3588	ktime_to_ns(kt: cur) - ktime_to_ns(kt: poll_end);
3589	KVM_STATS_LOG_HIST_UPDATE(vcpu->stat.generic.halt_wait_hist,
3590	ktime_to_ns(cur) - ktime_to_ns(poll_end));
3591	}
3592	out:
3593	/ The total time the vCPU was "halted", including polling time. /
3594	halt_ns = ktime_to_ns(kt: cur) - ktime_to_ns(kt: start);
3595
3596	/*
3597	* Note, halt-polling is considered successful so long as the vCPU was
3598	* never actually scheduled out, i.e. even if the wake event arrived
3599	* after of the halt-polling loop itself, but before the full wait.
3600	*/
3601	if (do_halt_poll)
3602	update_halt_poll_stats(vcpu, start, end: poll_end, success: !waited);
3603
3604	if (halt_poll_allowed) {
3605	/ Recompute the max halt poll time in case it changed. /
3606	max_halt_poll_ns = kvm_vcpu_max_halt_poll_ns(vcpu);
3607
3608	if (!vcpu_valid_wakeup(vcpu)) {
3609	shrink_halt_poll_ns(vcpu);
3610	} else if (max_halt_poll_ns) {
3611	if (halt_ns <= vcpu->halt_poll_ns)
3612	;
3613	/ we had a long block, shrink polling /
3614	else if (vcpu->halt_poll_ns &&
3615	halt_ns > max_halt_poll_ns)
3616	shrink_halt_poll_ns(vcpu);
3617	/ we had a short halt and our poll time is too small /
3618	else if (vcpu->halt_poll_ns < max_halt_poll_ns &&
3619	halt_ns < max_halt_poll_ns)
3620	grow_halt_poll_ns(vcpu);
3621	} else {
3622	vcpu->halt_poll_ns = `0`;
3623	}
3624	}
3625
3626	trace_kvm_vcpu_wakeup(ns: halt_ns, waited, valid: vcpu_valid_wakeup(vcpu));
3627	}
3628	EXPORT_SYMBOL_GPL(kvm_vcpu_halt);
3629
3630	bool kvm_vcpu_wake_up(struct kvm_vcpu *vcpu)
3631	{
3632	if (__kvm_vcpu_wake_up(vcpu)) {
3633	WRITE_ONCE(vcpu->ready, true);
3634	++vcpu->stat.generic.halt_wakeup;
3635	return true;
3636	}
3637
3638	return false;
3639	}
3640	EXPORT_SYMBOL_GPL(kvm_vcpu_wake_up);
3641
3642	#ifndef CONFIG_S390
3643	/*
3644	* Kick a sleeping VCPU, or a guest VCPU in guest mode, into host kernel mode.
3645	*/
3646	void kvm_vcpu_kick(struct kvm_vcpu *vcpu)
3647	{
3648	int me, cpu;
3649
3650	if (kvm_vcpu_wake_up(vcpu))
3651	return;
3652
3653	me = get_cpu();
3654	/*
3655	* The only state change done outside the vcpu mutex is IN_GUEST_MODE
3656	* to EXITING_GUEST_MODE. Therefore the moderately expensive "should
3657	* kick" check does not need atomic operations if kvm_vcpu_kick is used
3658	* within the vCPU thread itself.
3659	*/
3660	if (vcpu == __this_cpu_read(kvm_running_vcpu)) {
3661	if (vcpu->mode == IN_GUEST_MODE)
3662	WRITE_ONCE(vcpu->mode, EXITING_GUEST_MODE);
3663	goto out;
3664	}
3665
3666	/*
3667	* Note, the vCPU could get migrated to a different pCPU at any point
3668	* after kvm_arch_vcpu_should_kick(), which could result in sending an
3669	* IPI to the previous pCPU. But, that's ok because the purpose of the
3670	* IPI is to force the vCPU to leave IN_GUEST_MODE, and migrating the
3671	* vCPU also requires it to leave IN_GUEST_MODE.
3672	*/
3673	if (kvm_arch_vcpu_should_kick(vcpu)) {
3674	cpu = READ_ONCE(vcpu->cpu);
3675	if (cpu != me && (unsigned)cpu < nr_cpu_ids && cpu_online(cpu))
3676	smp_send_reschedule(cpu);
3677	}
3678	out:
3679	put_cpu();
3680	}
3681	EXPORT_SYMBOL_GPL(kvm_vcpu_kick);
3682	#endif /* !CONFIG_S390 */
3683
3684	int kvm_vcpu_yield_to(struct kvm_vcpu *target)
3685	{
3686	struct pid *pid;
3687	struct task_struct *task = NULL;
3688	int ret = `0`;
3689
3690	rcu_read_lock();
3691	pid = rcu_dereference(target->pid);
3692	if (pid)
3693	task = get_pid_task(pid, PIDTYPE_PID);
3694	rcu_read_unlock();
3695	if (!task)
3696	return ret;
3697	ret = yield_to(p: task, preempt: `1`);
3698	put_task_struct(t: task);
3699
3700	return ret;
3701	}
3702	EXPORT_SYMBOL_GPL(kvm_vcpu_yield_to);
3703
3704	/*
3705	* Helper that checks whether a VCPU is eligible for directed yield.
3706	* Most eligible candidate to yield is decided by following heuristics:
3707	*
3708	* (a) VCPU which has not done pl-exit or cpu relax intercepted recently
3709	* (preempted lock holder), indicated by @in_spin_loop.
3710	* Set at the beginning and cleared at the end of interception/PLE handler.
3711	*
3712	* (b) VCPU which has done pl-exit/ cpu relax intercepted but did not get
3713	* chance last time (mostly it has become eligible now since we have probably
3714	* yielded to lockholder in last iteration. This is done by toggling
3715	* @dy_eligible each time a VCPU checked for eligibility.)
3716	*
3717	* Yielding to a recently pl-exited/cpu relax intercepted VCPU before yielding
3718	* to preempted lock-holder could result in wrong VCPU selection and CPU
3719	* burning. Giving priority for a potential lock-holder increases lock
3720	* progress.
3721	*
3722	* Since algorithm is based on heuristics, accessing another VCPU data without
3723	* locking does not harm. It may result in trying to yield to same VCPU, fail
3724	* and continue with next VCPU and so on.
3725	*/
3726	static bool kvm_vcpu_eligible_for_directed_yield(struct kvm_vcpu *vcpu)
3727	{
3728	#ifdef CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT
3729	bool eligible;
3730
3731	eligible = !vcpu->spin_loop.in_spin_loop \|\|
3732	vcpu->spin_loop.dy_eligible;
3733
3734	if (vcpu->spin_loop.in_spin_loop)
3735	kvm_vcpu_set_dy_eligible(vcpu, val: !vcpu->spin_loop.dy_eligible);
3736
3737	return eligible;
3738	#else
3739	return true;
3740	#endif
3741	}
3742
3743	/*
3744	* Unlike kvm_arch_vcpu_runnable, this function is called outside
3745	* a vcpu_load/vcpu_put pair. However, for most architectures
3746	* kvm_arch_vcpu_runnable does not require vcpu_load.
3747	*/
3748	bool __weak kvm_arch_dy_runnable(struct kvm_vcpu *vcpu)
3749	{
3750	return kvm_arch_vcpu_runnable(vcpu);
3751	}
3752
3753	static bool vcpu_dy_runnable(struct kvm_vcpu *vcpu)
3754	{
3755	if (kvm_arch_dy_runnable(vcpu))
3756	return true;
3757
3758	#ifdef CONFIG_KVM_ASYNC_PF
3759	if (!list_empty_careful(head: &vcpu->async_pf.done))
3760	return true;
3761	#endif
3762
3763	return false;
3764	}
3765
3766	bool __weak kvm_arch_dy_has_pending_interrupt(struct kvm_vcpu *vcpu)
3767	{
3768	return false;
3769	}
3770
3771	void kvm_vcpu_on_spin(struct kvm_vcpu *me, bool yield_to_kernel_mode)
3772	{
3773	struct kvm *kvm = me->kvm;
3774	struct kvm_vcpu *vcpu;
3775	int last_boosted_vcpu = me->kvm->last_boosted_vcpu;
3776	unsigned long i;
3777	int yielded = `0`;
3778	int try = `3`;
3779	int pass;
3780
3781	kvm_vcpu_set_in_spin_loop(vcpu: me, val: true);
3782	/*
3783	* We boost the priority of a VCPU that is runnable but not
3784	* currently running, because it got preempted by something
3785	* else and called schedule in __vcpu_run. Hopefully that
3786	* VCPU is holding the lock that we need and will release it.
3787	* We approximate round-robin by starting at the last boosted VCPU.
3788	*/
3789	for (pass = `0`; pass < `2` && !yielded && try; pass++) {
3790	kvm_for_each_vcpu(i, vcpu, kvm) {
3791	if (!pass && i <= last_boosted_vcpu) {
3792	i = last_boosted_vcpu;
3793	continue;
3794	} else if (pass && i > last_boosted_vcpu)
3795	break;
3796	if (!READ_ONCE(vcpu->ready))
3797	continue;
3798	if (vcpu == me)
3799	continue;
3800	if (kvm_vcpu_is_blocking(vcpu) && !vcpu_dy_runnable(vcpu))
3801	continue;
3802	if (READ_ONCE(vcpu->preempted) && yield_to_kernel_mode &&
3803	!kvm_arch_dy_has_pending_interrupt(vcpu) &&
3804	!kvm_arch_vcpu_in_kernel(vcpu))
3805	continue;
3806	if (!kvm_vcpu_eligible_for_directed_yield(vcpu))
3807	continue;
3808
3809	yielded = kvm_vcpu_yield_to(vcpu);
3810	if (yielded > `0`) {
3811	kvm->last_boosted_vcpu = i;
3812	break;
3813	} else if (yielded < `0`) {
3814	try--;
3815	if (!try)
3816	break;
3817	}
3818	}
3819	}
3820	kvm_vcpu_set_in_spin_loop(vcpu: me, val: false);
3821
3822	/ Ensure vcpu is not eligible during next spinloop /
3823	kvm_vcpu_set_dy_eligible(vcpu: me, val: false);
3824	}
3825	EXPORT_SYMBOL_GPL(kvm_vcpu_on_spin);
3826
3827	static bool kvm_page_in_dirty_ring(struct kvm kvm, unsigned* long pgoff)
3828	{
3829	#ifdef CONFIG_HAVE_KVM_DIRTY_RING
3830	return (pgoff >= KVM_DIRTY_LOG_PAGE_OFFSET) &&
3831	(pgoff < KVM_DIRTY_LOG_PAGE_OFFSET +
3832	kvm->dirty_ring_size / PAGE_SIZE);
3833	#else
3834	return false;
3835	#endif
3836	}
3837
3838	static vm_fault_t kvm_vcpu_fault(struct vm_fault *vmf)
3839	{
3840	struct kvm_vcpu *vcpu = vmf->vma->vm_file->private_data;
3841	struct page *page;
3842
3843	if (vmf->pgoff == `0`)
3844	page = virt_to_page(vcpu->run);
3845	#ifdef CONFIG_X86
3846	else if (vmf->pgoff == KVM_PIO_PAGE_OFFSET)
3847	page = virt_to_page(vcpu->arch.pio_data);
3848	#endif
3849	#ifdef CONFIG_KVM_MMIO
3850	else if (vmf->pgoff == KVM_COALESCED_MMIO_PAGE_OFFSET)
3851	page = virt_to_page(vcpu->kvm->coalesced_mmio_ring);
3852	#endif
3853	else if (kvm_page_in_dirty_ring(kvm: vcpu->kvm, pgoff: vmf->pgoff))
3854	page = kvm_dirty_ring_get_page(
3855	ring: &vcpu->dirty_ring,
3856	offset: vmf->pgoff - KVM_DIRTY_LOG_PAGE_OFFSET);
3857	else
3858	return kvm_arch_vcpu_fault(vcpu, vmf);
3859	get_page(page);
3860	vmf->page = page;
3861	return `0`;
3862	}
3863
3864	static const struct vm_operations_struct kvm_vcpu_vm_ops = {
3865	.fault = kvm_vcpu_fault,
3866	};
3867
3868	static int kvm_vcpu_mmap(struct file file, struct* vm_area_struct *vma)
3869	{
3870	struct kvm_vcpu *vcpu = file->private_data;
3871	unsigned long pages = vma_pages(vma);
3872
3873	if ((kvm_page_in_dirty_ring(kvm: vcpu->kvm, pgoff: vma->vm_pgoff) \|\|
3874	kvm_page_in_dirty_ring(kvm: vcpu->kvm, pgoff: vma->vm_pgoff + pages - `1`)) &&
3875	((vma->vm_flags & VM_EXEC) \|\| !(vma->vm_flags & VM_SHARED)))
3876	return -EINVAL;
3877
3878	vma->vm_ops = &kvm_vcpu_vm_ops;
3879	return `0`;
3880	}
3881
3882	static int kvm_vcpu_release(struct inode inode, struct* file *filp)
3883	{
3884	struct kvm_vcpu *vcpu = filp->private_data;
3885
3886	kvm_put_kvm(vcpu->kvm);
3887	return `0`;
3888	}
3889
3890	static const struct file_operations kvm_vcpu_fops = {
3891	.release = kvm_vcpu_release,
3892	.unlocked_ioctl = kvm_vcpu_ioctl,
3893	.mmap = kvm_vcpu_mmap,
3894	.llseek = noop_llseek,
3895	KVM_COMPAT(kvm_vcpu_compat_ioctl),
3896	};
3897
3898	/*
3899	* Allocates an inode for the vcpu.
3900	*/
3901	static int create_vcpu_fd(struct kvm_vcpu *vcpu)
3902	{
3903	char name[`8` + `1` + ITOA_MAX_LEN + `1`];
3904
3905	snprintf(buf: name, size: sizeof(name), fmt: "kvm-vcpu:%d", vcpu->vcpu_id);
3906	return anon_inode_getfd(name, fops: &kvm_vcpu_fops, priv: vcpu, O_RDWR \| O_CLOEXEC);
3907	}
3908
3909	#ifdef __KVM_HAVE_ARCH_VCPU_DEBUGFS
3910	static int vcpu_get_pid(void data, u64 val)
3911	{
3912	struct kvm_vcpu *vcpu = data;
3913
3914	rcu_read_lock();
3915	*val = pid_nr(rcu_dereference(vcpu->pid));
3916	rcu_read_unlock();
3917	return `0`;
3918	}
3919
3920	DEFINE_SIMPLE_ATTRIBUTE(vcpu_get_pid_fops, vcpu_get_pid, NULL, "%llu\n");
3921
3922	static void kvm_create_vcpu_debugfs(struct kvm_vcpu *vcpu)
3923	{
3924	struct dentry *debugfs_dentry;
3925	char dir_name[ITOA_MAX_LEN * `2`];
3926
3927	if (!debugfs_initialized())
3928	return;
3929
3930	snprintf(buf: dir_name, size: sizeof(dir_name), fmt: "vcpu%d", vcpu->vcpu_id);
3931	debugfs_dentry = debugfs_create_dir(name: dir_name,
3932	parent: vcpu->kvm->debugfs_dentry);
3933	debugfs_create_file(name: "pid", mode: `0444`, parent: debugfs_dentry, data: vcpu,
3934	fops: &vcpu_get_pid_fops);
3935
3936	kvm_arch_create_vcpu_debugfs(vcpu, debugfs_dentry);
3937	}
3938	#endif
3939
3940	/*
3941	* Creates some virtual cpus. Good luck creating more than one.
3942	*/
3943	static int kvm_vm_ioctl_create_vcpu(struct kvm *kvm, u32 id)
3944	{
3945	int r;
3946	struct kvm_vcpu *vcpu;
3947	struct page *page;
3948
3949	if (id >= KVM_MAX_VCPU_IDS)
3950	return -EINVAL;
3951
3952	mutex_lock(&kvm->lock);
3953	if (kvm->created_vcpus >= kvm->max_vcpus) {
3954	mutex_unlock(lock: &kvm->lock);
3955	return -EINVAL;
3956	}
3957
3958	r = kvm_arch_vcpu_precreate(kvm, id);
3959	if (r) {
3960	mutex_unlock(lock: &kvm->lock);
3961	return r;
3962	}
3963
3964	kvm->created_vcpus++;
3965	mutex_unlock(lock: &kvm->lock);
3966
3967	vcpu = kmem_cache_zalloc(k: kvm_vcpu_cache, GFP_KERNEL_ACCOUNT);
3968	if (!vcpu) {
3969	r = -ENOMEM;
3970	goto vcpu_decrement;
3971	}
3972
3973	BUILD_BUG_ON(sizeof(struct kvm_run) > PAGE_SIZE);
3974	page = alloc_page(GFP_KERNEL_ACCOUNT \| __GFP_ZERO);
3975	if (!page) {
3976	r = -ENOMEM;
3977	goto vcpu_free;
3978	}
3979	vcpu->run = page_address(page);
3980
3981	kvm_vcpu_init(vcpu, kvm, id);
3982
3983	r = kvm_arch_vcpu_create(vcpu);
3984	if (r)
3985	goto vcpu_free_run_page;
3986
3987	if (kvm->dirty_ring_size) {
3988	r = kvm_dirty_ring_alloc(ring: &vcpu->dirty_ring,
3989	index: id, size: kvm->dirty_ring_size);
3990	if (r)
3991	goto arch_vcpu_destroy;
3992	}
3993
3994	mutex_lock(&kvm->lock);
3995
3996	#ifdef CONFIG_LOCKDEP
3997	/ Ensure that lockdep knows vcpu->mutex is taken inside kvm->lock /
3998	mutex_lock(&vcpu->mutex);
3999	mutex_unlock(lock: &vcpu->mutex);
4000	#endif
4001
4002	if (kvm_get_vcpu_by_id(kvm, id)) {
4003	r = -EEXIST;
4004	goto unlock_vcpu_destroy;
4005	}
4006
4007	vcpu->vcpu_idx = atomic_read(v: &kvm->online_vcpus);
4008	r = xa_reserve(xa: &kvm->vcpu_array, index: vcpu->vcpu_idx, GFP_KERNEL_ACCOUNT);
4009	if (r)
4010	goto unlock_vcpu_destroy;
4011
4012	/ Now it's all set up, let userspace reach it /
4013	kvm_get_kvm(kvm);
4014	r = create_vcpu_fd(vcpu);
4015	if (r < `0`)
4016	goto kvm_put_xa_release;
4017
4018	if (KVM_BUG_ON(xa_store(&kvm->vcpu_array, vcpu->vcpu_idx, vcpu, `0`), kvm)) {
4019	r = -EINVAL;
4020	goto kvm_put_xa_release;
4021	}
4022
4023	/*
4024	* Pairs with smp_rmb() in kvm_get_vcpu. Store the vcpu
4025	* pointer before kvm->online_vcpu's incremented value.
4026	*/
4027	smp_wmb();
4028	atomic_inc(v: &kvm->online_vcpus);
4029
4030	mutex_unlock(lock: &kvm->lock);
4031	kvm_arch_vcpu_postcreate(vcpu);
4032	kvm_create_vcpu_debugfs(vcpu);
4033	return r;
4034
4035	kvm_put_xa_release:
4036	kvm_put_kvm_no_destroy(kvm);
4037	xa_release(xa: &kvm->vcpu_array, index: vcpu->vcpu_idx);
4038	unlock_vcpu_destroy:
4039	mutex_unlock(lock: &kvm->lock);
4040	kvm_dirty_ring_free(ring: &vcpu->dirty_ring);
4041	arch_vcpu_destroy:
4042	kvm_arch_vcpu_destroy(vcpu);
4043	vcpu_free_run_page:
4044	free_page((unsigned long)vcpu->run);
4045	vcpu_free:
4046	kmem_cache_free(s: kvm_vcpu_cache, objp: vcpu);
4047	vcpu_decrement:
4048	mutex_lock(&kvm->lock);
4049	kvm->created_vcpus--;
4050	mutex_unlock(lock: &kvm->lock);
4051	return r;
4052	}
4053
4054	static int kvm_vcpu_ioctl_set_sigmask(struct kvm_vcpu vcpu, sigset_t sigset)
4055	{
4056	if (sigset) {
4057	sigdelsetmask(set: sigset, sigmask(SIGKILL)\|sigmask(SIGSTOP));
4058	vcpu->sigset_active = `1`;
4059	vcpu->sigset = *sigset;
4060	} else
4061	vcpu->sigset_active = `0`;
4062	return `0`;
4063	}
4064
4065	static ssize_t kvm_vcpu_stats_read(struct file file, char* __user *user_buffer,
4066	size_t size, loff_t *offset)
4067	{
4068	struct kvm_vcpu *vcpu = file->private_data;
4069
4070	return kvm_stats_read(id: vcpu->stats_id, header: &kvm_vcpu_stats_header,
4071	desc: &kvm_vcpu_stats_desc[`0`], stats: &vcpu->stat,
4072	size_stats: sizeof(vcpu->stat), user_buffer, size, offset);
4073	}
4074
4075	static int kvm_vcpu_stats_release(struct inode inode, struct* file *file)
4076	{
4077	struct kvm_vcpu *vcpu = file->private_data;
4078
4079	kvm_put_kvm(vcpu->kvm);
4080	return `0`;
4081	}
4082
4083	static const struct file_operations kvm_vcpu_stats_fops = {
4084	.read = kvm_vcpu_stats_read,
4085	.release = kvm_vcpu_stats_release,
4086	.llseek = noop_llseek,
4087	};
4088
4089	static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
4090	{
4091	int fd;
4092	struct file *file;
4093	char name[`15` + ITOA_MAX_LEN + `1`];
4094
4095	snprintf(buf: name, size: sizeof(name), fmt: "kvm-vcpu-stats:%d", vcpu->vcpu_id);
4096
4097	fd = get_unused_fd_flags(O_CLOEXEC);
4098	if (fd < `0`)
4099	return fd;
4100
4101	file = anon_inode_getfile(name, fops: &kvm_vcpu_stats_fops, priv: vcpu, O_RDONLY);
4102	if (IS_ERR(ptr: file)) {
4103	put_unused_fd(fd);
4104	return PTR_ERR(ptr: file);
4105	}
4106
4107	kvm_get_kvm(vcpu->kvm);
4108
4109	file->f_mode \|= FMODE_PREAD;
4110	fd_install(fd, file);
4111
4112	return fd;
4113	}
4114
4115	static long kvm_vcpu_ioctl(struct file *filp,
4116	unsigned int ioctl, unsigned long arg)
4117	{
4118	struct kvm_vcpu *vcpu = filp->private_data;
4119	void __user argp = (void* __user *)arg;
4120	int r;
4121	struct kvm_fpu *fpu = NULL;
4122	struct kvm_sregs *kvm_sregs = NULL;
4123
4124	if (vcpu->kvm->mm != current->mm \|\| vcpu->kvm->vm_dead)
4125	return -EIO;
4126
4127	if (unlikely(_IOC_TYPE(ioctl) != KVMIO))
4128	return -EINVAL;
4129
4130	/*
4131	* Some architectures have vcpu ioctls that are asynchronous to vcpu
4132	* execution; mutex_lock() would break them.
4133	*/
4134	r = kvm_arch_vcpu_async_ioctl(filp, ioctl, arg);
4135	if (r != -ENOIOCTLCMD)
4136	return r;
4137
4138	if (mutex_lock_killable(&vcpu->mutex))
4139	return -EINTR;
4140	switch (ioctl) {
4141	case KVM_RUN: {
4142	struct pid *oldpid;
4143	r = -EINVAL;
4144	if (arg)
4145	goto out;
4146	oldpid = rcu_access_pointer(vcpu->pid);
4147	if (unlikely(oldpid != task_pid(current))) {
4148	/ The thread running this VCPU changed. /
4149	struct pid *newpid;
4150
4151	r = kvm_arch_vcpu_run_pid_change(vcpu);
4152	if (r)
4153	break;
4154
4155	newpid = get_task_pid(current, type: PIDTYPE_PID);
4156	rcu_assign_pointer(vcpu->pid, newpid);
4157	if (oldpid)
4158	synchronize_rcu();
4159	put_pid(pid: oldpid);
4160	}
4161	r = kvm_arch_vcpu_ioctl_run(vcpu);
4162	trace_kvm_userspace_exit(reason: vcpu->run->exit_reason, errno: r);
4163	break;
4164	}
4165	case KVM_GET_REGS: {
4166	struct kvm_regs *kvm_regs;
4167
4168	r = -ENOMEM;
4169	kvm_regs = kzalloc(size: sizeof(struct kvm_regs), GFP_KERNEL_ACCOUNT);
4170	if (!kvm_regs)
4171	goto out;
4172	r = kvm_arch_vcpu_ioctl_get_regs(vcpu, regs: kvm_regs);
4173	if (r)
4174	goto out_free1;
4175	r = -EFAULT;
4176	if (copy_to_user(to: argp, from: kvm_regs, n: sizeof(struct kvm_regs)))
4177	goto out_free1;
4178	r = `0`;
4179	out_free1:
4180	kfree(objp: kvm_regs);
4181	break;
4182	}
4183	case KVM_SET_REGS: {
4184	struct kvm_regs *kvm_regs;
4185
4186	kvm_regs = memdup_user(argp, sizeof(*kvm_regs));
4187	if (IS_ERR(ptr: kvm_regs)) {
4188	r = PTR_ERR(ptr: kvm_regs);
4189	goto out;
4190	}
4191	r = kvm_arch_vcpu_ioctl_set_regs(vcpu, regs: kvm_regs);
4192	kfree(objp: kvm_regs);
4193	break;
4194	}
4195	case KVM_GET_SREGS: {
4196	kvm_sregs = kzalloc(size: sizeof(struct kvm_sregs),
4197	GFP_KERNEL_ACCOUNT);
4198	r = -ENOMEM;
4199	if (!kvm_sregs)
4200	goto out;
4201	r = kvm_arch_vcpu_ioctl_get_sregs(vcpu, sregs: kvm_sregs);
4202	if (r)
4203	goto out;
4204	r = -EFAULT;
4205	if (copy_to_user(to: argp, from: kvm_sregs, n: sizeof(struct kvm_sregs)))
4206	goto out;
4207	r = `0`;
4208	break;
4209	}
4210	case KVM_SET_SREGS: {
4211	kvm_sregs = memdup_user(argp, sizeof(*kvm_sregs));
4212	if (IS_ERR(ptr: kvm_sregs)) {
4213	r = PTR_ERR(ptr: kvm_sregs);
4214	kvm_sregs = NULL;
4215	goto out;
4216	}
4217	r = kvm_arch_vcpu_ioctl_set_sregs(vcpu, sregs: kvm_sregs);
4218	break;
4219	}
4220	case KVM_GET_MP_STATE: {
4221	struct kvm_mp_state mp_state;
4222
4223	r = kvm_arch_vcpu_ioctl_get_mpstate(vcpu, mp_state: &mp_state);
4224	if (r)
4225	goto out;
4226	r = -EFAULT;
4227	if (copy_to_user(to: argp, from: &mp_state, n: sizeof(mp_state)))
4228	goto out;
4229	r = `0`;
4230	break;
4231	}
4232	case KVM_SET_MP_STATE: {
4233	struct kvm_mp_state mp_state;
4234
4235	r = -EFAULT;
4236	if (copy_from_user(to: &mp_state, from: argp, n: sizeof(mp_state)))
4237	goto out;
4238	r = kvm_arch_vcpu_ioctl_set_mpstate(vcpu, mp_state: &mp_state);
4239	break;
4240	}
4241	case KVM_TRANSLATE: {
4242	struct kvm_translation tr;
4243
4244	r = -EFAULT;
4245	if (copy_from_user(to: &tr, from: argp, n: sizeof(tr)))
4246	goto out;
4247	r = kvm_arch_vcpu_ioctl_translate(vcpu, tr: &tr);
4248	if (r)
4249	goto out;
4250	r = -EFAULT;
4251	if (copy_to_user(to: argp, from: &tr, n: sizeof(tr)))
4252	goto out;
4253	r = `0`;
4254	break;
4255	}
4256	case KVM_SET_GUEST_DEBUG: {
4257	struct kvm_guest_debug dbg;
4258
4259	r = -EFAULT;
4260	if (copy_from_user(to: &dbg, from: argp, n: sizeof(dbg)))
4261	goto out;
4262	r = kvm_arch_vcpu_ioctl_set_guest_debug(vcpu, dbg: &dbg);
4263	break;
4264	}
4265	case KVM_SET_SIGNAL_MASK: {
4266	struct kvm_signal_mask __user *sigmask_arg = argp;
4267	struct kvm_signal_mask kvm_sigmask;
4268	sigset_t sigset, *p;
4269
4270	p = NULL;
4271	if (argp) {
4272	r = -EFAULT;
4273	if (copy_from_user(to: &kvm_sigmask, from: argp,
4274	n: sizeof(kvm_sigmask)))
4275	goto out;
4276	r = -EINVAL;
4277	if (kvm_sigmask.len != sizeof(sigset))
4278	goto out;
4279	r = -EFAULT;
4280	if (copy_from_user(to: &sigset, from: sigmask_arg->sigset,
4281	n: sizeof(sigset)))
4282	goto out;
4283	p = &sigset;
4284	}
4285	r = kvm_vcpu_ioctl_set_sigmask(vcpu, sigset: p);
4286	break;
4287	}
4288	case KVM_GET_FPU: {
4289	fpu = kzalloc(size: sizeof(struct kvm_fpu), GFP_KERNEL_ACCOUNT);
4290	r = -ENOMEM;
4291	if (!fpu)
4292	goto out;
4293	r = kvm_arch_vcpu_ioctl_get_fpu(vcpu, fpu);
4294	if (r)
4295	goto out;
4296	r = -EFAULT;
4297	if (copy_to_user(to: argp, from: fpu, n: sizeof(struct kvm_fpu)))
4298	goto out;
4299	r = `0`;
4300	break;
4301	}
4302	case KVM_SET_FPU: {
4303	fpu = memdup_user(argp, sizeof(*fpu));
4304	if (IS_ERR(ptr: fpu)) {
4305	r = PTR_ERR(ptr: fpu);
4306	fpu = NULL;
4307	goto out;
4308	}
4309	r = kvm_arch_vcpu_ioctl_set_fpu(vcpu, fpu);
4310	break;
4311	}
4312	case KVM_GET_STATS_FD: {
4313	r = kvm_vcpu_ioctl_get_stats_fd(vcpu);
4314	break;
4315	}
4316	default:
4317	r = kvm_arch_vcpu_ioctl(filp, ioctl, arg);
4318	}
4319	out:
4320	mutex_unlock(lock: &vcpu->mutex);
4321	kfree(objp: fpu);
4322	kfree(objp: kvm_sregs);
4323	return r;
4324	}
4325
4326	#ifdef CONFIG_KVM_COMPAT
4327	static long kvm_vcpu_compat_ioctl(struct file *filp,
4328	unsigned int ioctl, unsigned long arg)
4329	{
4330	struct kvm_vcpu *vcpu = filp->private_data;
4331	void __user *argp = compat_ptr(uptr: arg);
4332	int r;
4333
4334	if (vcpu->kvm->mm != current->mm \|\| vcpu->kvm->vm_dead)
4335	return -EIO;
4336
4337	switch (ioctl) {
4338	case KVM_SET_SIGNAL_MASK: {
4339	struct kvm_signal_mask __user *sigmask_arg = argp;
4340	struct kvm_signal_mask kvm_sigmask;
4341	sigset_t sigset;
4342
4343	if (argp) {
4344	r = -EFAULT;
4345	if (copy_from_user(to: &kvm_sigmask, from: argp,
4346	n: sizeof(kvm_sigmask)))
4347	goto out;
4348	r = -EINVAL;
4349	if (kvm_sigmask.len != sizeof(compat_sigset_t))
4350	goto out;
4351	r = -EFAULT;
4352	if (get_compat_sigset(set: &sigset,
4353	compat: (compat_sigset_t __user *)sigmask_arg->sigset))
4354	goto out;
4355	r = kvm_vcpu_ioctl_set_sigmask(vcpu, sigset: &sigset);
4356	} else
4357	r = kvm_vcpu_ioctl_set_sigmask(vcpu, NULL);
4358	break;
4359	}
4360	default:
4361	r = kvm_vcpu_ioctl(filp, ioctl, arg);
4362	}
4363
4364	out:
4365	return r;
4366	}
4367	#endif
4368
4369	static int kvm_device_mmap(struct file filp, struct* vm_area_struct *vma)
4370	{
4371	struct kvm_device *dev = filp->private_data;
4372
4373	if (dev->ops->mmap)
4374	return dev->ops->mmap(dev, vma);
4375
4376	return -ENODEV;
4377	}
4378
4379	static int kvm_device_ioctl_attr(struct kvm_device *dev,
4380	int (accessor)(struct* kvm_device *dev,
4381	struct kvm_device_attr *attr),
4382	unsigned long arg)
4383	{
4384	struct kvm_device_attr attr;
4385
4386	if (!accessor)
4387	return -EPERM;
4388
4389	if (copy_from_user(to: &attr, from: (void __user )arg, n: sizeof*(attr)))
4390	return -EFAULT;
4391
4392	return accessor(dev, &attr);
4393	}
4394
4395	static long kvm_device_ioctl(struct file filp, unsigned* int ioctl,
4396	unsigned long arg)
4397	{
4398	struct kvm_device *dev = filp->private_data;
4399
4400	if (dev->kvm->mm != current->mm \|\| dev->kvm->vm_dead)
4401	return -EIO;
4402
4403	switch (ioctl) {
4404	case KVM_SET_DEVICE_ATTR:
4405	return kvm_device_ioctl_attr(dev, accessor: dev->ops->set_attr, arg);
4406	case KVM_GET_DEVICE_ATTR:
4407	return kvm_device_ioctl_attr(dev, accessor: dev->ops->get_attr, arg);
4408	case KVM_HAS_DEVICE_ATTR:
4409	return kvm_device_ioctl_attr(dev, accessor: dev->ops->has_attr, arg);
4410	default:
4411	if (dev->ops->ioctl)
4412	return dev->ops->ioctl(dev, ioctl, arg);
4413
4414	return -ENOTTY;
4415	}
4416	}
4417
4418	static int kvm_device_release(struct inode inode, struct* file *filp)
4419	{
4420	struct kvm_device *dev = filp->private_data;
4421	struct kvm *kvm = dev->kvm;
4422
4423	if (dev->ops->release) {
4424	mutex_lock(&kvm->lock);
4425	list_del(entry: &dev->vm_node);
4426	dev->ops->release(dev);
4427	mutex_unlock(lock: &kvm->lock);
4428	}
4429
4430	kvm_put_kvm(kvm);
4431	return `0`;
4432	}
4433
4434	static const struct file_operations kvm_device_fops = {
4435	.unlocked_ioctl = kvm_device_ioctl,
4436	.release = kvm_device_release,
4437	KVM_COMPAT(kvm_device_ioctl),
4438	.mmap = kvm_device_mmap,
4439	};
4440
4441	struct kvm_device kvm_device_from_filp(struct* file *filp)
4442	{
4443	if (filp->f_op != &kvm_device_fops)
4444	return NULL;
4445
4446	return filp->private_data;
4447	}
4448
4449	static const struct kvm_device_ops *kvm_device_ops_table[KVM_DEV_TYPE_MAX] = {
4450	#ifdef CONFIG_KVM_MPIC
4451	[KVM_DEV_TYPE_FSL_MPIC_20] = &kvm_mpic_ops,
4452	[KVM_DEV_TYPE_FSL_MPIC_42] = &kvm_mpic_ops,
4453	#endif
4454	};
4455
4456	int kvm_register_device_ops(const struct kvm_device_ops *ops, u32 type)
4457	{
4458	if (type >= ARRAY_SIZE(kvm_device_ops_table))
4459	return -ENOSPC;
4460
4461	if (kvm_device_ops_table[type] != NULL)
4462	return -EEXIST;
4463
4464	kvm_device_ops_table[type] = ops;
4465	return `0`;
4466	}
4467
4468	void kvm_unregister_device_ops(u32 type)
4469	{
4470	if (kvm_device_ops_table[type] != NULL)
4471	kvm_device_ops_table[type] = NULL;
4472	}
4473
4474	static int kvm_ioctl_create_device(struct kvm *kvm,
4475	struct kvm_create_device *cd)
4476	{
4477	const struct kvm_device_ops *ops;
4478	struct kvm_device *dev;
4479	bool test = cd->flags & KVM_CREATE_DEVICE_TEST;
4480	int type;
4481	int ret;
4482
4483	if (cd->type >= ARRAY_SIZE(kvm_device_ops_table))
4484	return -ENODEV;
4485
4486	type = array_index_nospec(cd->type, ARRAY_SIZE(kvm_device_ops_table));
4487	ops = kvm_device_ops_table[type];
4488	if (ops == NULL)
4489	return -ENODEV;
4490
4491	if (test)
4492	return `0`;
4493
4494	dev = kzalloc(size: sizeof(*dev), GFP_KERNEL_ACCOUNT);
4495	if (!dev)
4496	return -ENOMEM;
4497
4498	dev->ops = ops;
4499	dev->kvm = kvm;
4500
4501	mutex_lock(&kvm->lock);
4502	ret = ops->create(dev, type);
4503	if (ret < `0`) {
4504	mutex_unlock(lock: &kvm->lock);
4505	kfree(objp: dev);
4506	return ret;
4507	}
4508	list_add(new: &dev->vm_node, head: &kvm->devices);
4509	mutex_unlock(lock: &kvm->lock);
4510
4511	if (ops->init)
4512	ops->init(dev);
4513
4514	kvm_get_kvm(kvm);
4515	ret = anon_inode_getfd(name: ops->name, fops: &kvm_device_fops, priv: dev, O_RDWR \| O_CLOEXEC);
4516	if (ret < `0`) {
4517	kvm_put_kvm_no_destroy(kvm);
4518	mutex_lock(&kvm->lock);
4519	list_del(entry: &dev->vm_node);
4520	if (ops->release)
4521	ops->release(dev);
4522	mutex_unlock(lock: &kvm->lock);
4523	if (ops->destroy)
4524	ops->destroy(dev);
4525	return ret;
4526	}
4527
4528	cd->fd = ret;
4529	return `0`;
4530	}
4531
4532	static int kvm_vm_ioctl_check_extension_generic(struct kvm kvm, long* arg)
4533	{
4534	switch (arg) {
4535	case KVM_CAP_USER_MEMORY:
4536	case KVM_CAP_DESTROY_MEMORY_REGION_WORKS:
4537	case KVM_CAP_JOIN_MEMORY_REGIONS_WORKS:
4538	case KVM_CAP_INTERNAL_ERROR_DATA:
4539	#ifdef CONFIG_HAVE_KVM_MSI
4540	case KVM_CAP_SIGNAL_MSI:
4541	#endif
4542	#ifdef CONFIG_HAVE_KVM_IRQFD
4543	case KVM_CAP_IRQFD:
4544	#endif
4545	case KVM_CAP_IOEVENTFD_ANY_LENGTH:
4546	case KVM_CAP_CHECK_EXTENSION_VM:
4547	case KVM_CAP_ENABLE_CAP_VM:
4548	case KVM_CAP_HALT_POLL:
4549	return `1`;
4550	#ifdef CONFIG_KVM_MMIO
4551	case KVM_CAP_COALESCED_MMIO:
4552	return KVM_COALESCED_MMIO_PAGE_OFFSET;
4553	case KVM_CAP_COALESCED_PIO:
4554	return `1`;
4555	#endif
4556	#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4557	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2:
4558	return KVM_DIRTY_LOG_MANUAL_CAPS;
4559	#endif
4560	#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4561	case KVM_CAP_IRQ_ROUTING:
4562	return KVM_MAX_IRQ_ROUTES;
4563	#endif
4564	#if KVM_ADDRESS_SPACE_NUM > 1
4565	case KVM_CAP_MULTI_ADDRESS_SPACE:
4566	return KVM_ADDRESS_SPACE_NUM;
4567	#endif
4568	case KVM_CAP_NR_MEMSLOTS:
4569	return KVM_USER_MEM_SLOTS;
4570	case KVM_CAP_DIRTY_LOG_RING:
4571	#ifdef CONFIG_HAVE_KVM_DIRTY_RING_TSO
4572	return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4573	#else
4574	return `0`;
4575	#endif
4576	case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4577	#ifdef CONFIG_HAVE_KVM_DIRTY_RING_ACQ_REL
4578	return KVM_DIRTY_RING_MAX_ENTRIES * sizeof(struct kvm_dirty_gfn);
4579	#else
4580	return `0`;
4581	#endif
4582	#ifdef CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP
4583	case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP:
4584	#endif
4585	case KVM_CAP_BINARY_STATS_FD:
4586	case KVM_CAP_SYSTEM_EVENT_DATA:
4587	return `1`;
4588	default:
4589	break;
4590	}
4591	return kvm_vm_ioctl_check_extension(kvm, ext: arg);
4592	}
4593
4594	static int kvm_vm_ioctl_enable_dirty_log_ring(struct kvm *kvm, u32 size)
4595	{
4596	int r;
4597
4598	if (!KVM_DIRTY_LOG_PAGE_OFFSET)
4599	return -EINVAL;
4600
4601	/ the size should be power of 2 /
4602	if (!size \|\| (size & (size - `1`)))
4603	return -EINVAL;
4604
4605	/ Should be bigger to keep the reserved entries, or a page /
4606	if (size < kvm_dirty_ring_get_rsvd_entries() *
4607	sizeof(struct kvm_dirty_gfn) \|\| size < PAGE_SIZE)
4608	return -EINVAL;
4609
4610	if (size > KVM_DIRTY_RING_MAX_ENTRIES *
4611	sizeof(struct kvm_dirty_gfn))
4612	return -E2BIG;
4613
4614	/ We only allow it to set once /
4615	if (kvm->dirty_ring_size)
4616	return -EINVAL;
4617
4618	mutex_lock(&kvm->lock);
4619
4620	if (kvm->created_vcpus) {
4621	/ We don't allow to change this value after vcpu created /
4622	r = -EINVAL;
4623	} else {
4624	kvm->dirty_ring_size = size;
4625	r = `0`;
4626	}
4627
4628	mutex_unlock(lock: &kvm->lock);
4629	return r;
4630	}
4631
4632	static int kvm_vm_ioctl_reset_dirty_pages(struct kvm *kvm)
4633	{
4634	unsigned long i;
4635	struct kvm_vcpu *vcpu;
4636	int cleared = `0`;
4637
4638	if (!kvm->dirty_ring_size)
4639	return -EINVAL;
4640
4641	mutex_lock(&kvm->slots_lock);
4642
4643	kvm_for_each_vcpu(i, vcpu, kvm)
4644	cleared += kvm_dirty_ring_reset(kvm: vcpu->kvm, ring: &vcpu->dirty_ring);
4645
4646	mutex_unlock(lock: &kvm->slots_lock);
4647
4648	if (cleared)
4649	kvm_flush_remote_tlbs(kvm);
4650
4651	return cleared;
4652	}
4653
4654	int __attribute__((weak)) kvm_vm_ioctl_enable_cap(struct kvm *kvm,
4655	struct kvm_enable_cap *cap)
4656	{
4657	return -EINVAL;
4658	}
4659
4660	bool kvm_are_all_memslots_empty(struct kvm *kvm)
4661	{
4662	int i;
4663
4664	lockdep_assert_held(&kvm->slots_lock);
4665
4666	for (i = `0`; i < KVM_ADDRESS_SPACE_NUM; i++) {
4667	if (!kvm_memslots_empty(slots: __kvm_memslots(kvm, as_id: i)))
4668	return false;
4669	}
4670
4671	return true;
4672	}
4673	EXPORT_SYMBOL_GPL(kvm_are_all_memslots_empty);
4674
4675	static int kvm_vm_ioctl_enable_cap_generic(struct kvm *kvm,
4676	struct kvm_enable_cap *cap)
4677	{
4678	switch (cap->cap) {
4679	#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4680	case KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2: {
4681	u64 allowed_options = KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE;
4682
4683	if (cap->args[`0`] & KVM_DIRTY_LOG_MANUAL_PROTECT_ENABLE)
4684	allowed_options = KVM_DIRTY_LOG_MANUAL_CAPS;
4685
4686	if (cap->flags \|\| (cap->args[`0`] & ~allowed_options))
4687	return -EINVAL;
4688	kvm->manual_dirty_log_protect = cap->args[`0`];
4689	return `0`;
4690	}
4691	#endif
4692	case KVM_CAP_HALT_POLL: {
4693	if (cap->flags \|\| cap->args[`0`] != (unsigned int)cap->args[`0`])
4694	return -EINVAL;
4695
4696	kvm->max_halt_poll_ns = cap->args[`0`];
4697
4698	/*
4699	* Ensure kvm->override_halt_poll_ns does not become visible
4700	* before kvm->max_halt_poll_ns.
4701	*
4702	* Pairs with the smp_rmb() in kvm_vcpu_max_halt_poll_ns().
4703	*/
4704	smp_wmb();
4705	kvm->override_halt_poll_ns = true;
4706
4707	return `0`;
4708	}
4709	case KVM_CAP_DIRTY_LOG_RING:
4710	case KVM_CAP_DIRTY_LOG_RING_ACQ_REL:
4711	if (!kvm_vm_ioctl_check_extension_generic(kvm, arg: cap->cap))
4712	return -EINVAL;
4713
4714	return kvm_vm_ioctl_enable_dirty_log_ring(kvm, size: cap->args[`0`]);
4715	case KVM_CAP_DIRTY_LOG_RING_WITH_BITMAP: {
4716	int r = -EINVAL;
4717
4718	if (!IS_ENABLED(CONFIG_NEED_KVM_DIRTY_RING_WITH_BITMAP) \|\|
4719	!kvm->dirty_ring_size \|\| cap->flags)
4720	return r;
4721
4722	mutex_lock(&kvm->slots_lock);
4723
4724	/*
4725	* For simplicity, allow enabling ring+bitmap if and only if
4726	* there are no memslots, e.g. to ensure all memslots allocate
4727	* a bitmap after the capability is enabled.
4728	*/
4729	if (kvm_are_all_memslots_empty(kvm)) {
4730	kvm->dirty_ring_with_bitmap = true;
4731	r = `0`;
4732	}
4733
4734	mutex_unlock(lock: &kvm->slots_lock);
4735
4736	return r;
4737	}
4738	default:
4739	return kvm_vm_ioctl_enable_cap(kvm, cap);
4740	}
4741	}
4742
4743	static ssize_t kvm_vm_stats_read(struct file file, char* __user *user_buffer,
4744	size_t size, loff_t *offset)
4745	{
4746	struct kvm *kvm = file->private_data;
4747
4748	return kvm_stats_read(id: kvm->stats_id, header: &kvm_vm_stats_header,
4749	desc: &kvm_vm_stats_desc[`0`], stats: &kvm->stat,
4750	size_stats: sizeof(kvm->stat), user_buffer, size, offset);
4751	}
4752
4753	static int kvm_vm_stats_release(struct inode inode, struct* file *file)
4754	{
4755	struct kvm *kvm = file->private_data;
4756
4757	kvm_put_kvm(kvm);
4758	return `0`;
4759	}
4760
4761	static const struct file_operations kvm_vm_stats_fops = {
4762	.read = kvm_vm_stats_read,
4763	.release = kvm_vm_stats_release,
4764	.llseek = noop_llseek,
4765	};
4766
4767	static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
4768	{
4769	int fd;
4770	struct file *file;
4771
4772	fd = get_unused_fd_flags(O_CLOEXEC);
4773	if (fd < `0`)
4774	return fd;
4775
4776	file = anon_inode_getfile(name: "kvm-vm-stats",
4777	fops: &kvm_vm_stats_fops, priv: kvm, O_RDONLY);
4778	if (IS_ERR(ptr: file)) {
4779	put_unused_fd(fd);
4780	return PTR_ERR(ptr: file);
4781	}
4782
4783	kvm_get_kvm(kvm);
4784
4785	file->f_mode \|= FMODE_PREAD;
4786	fd_install(fd, file);
4787
4788	return fd;
4789	}
4790
4791	static long kvm_vm_ioctl(struct file *filp,
4792	unsigned int ioctl, unsigned long arg)
4793	{
4794	struct kvm *kvm = filp->private_data;
4795	void __user argp = (void* __user *)arg;
4796	int r;
4797
4798	if (kvm->mm != current->mm \|\| kvm->vm_dead)
4799	return -EIO;
4800	switch (ioctl) {
4801	case KVM_CREATE_VCPU:
4802	r = kvm_vm_ioctl_create_vcpu(kvm, id: arg);
4803	break;
4804	case KVM_ENABLE_CAP: {
4805	struct kvm_enable_cap cap;
4806
4807	r = -EFAULT;
4808	if (copy_from_user(to: &cap, from: argp, n: sizeof(cap)))
4809	goto out;
4810	r = kvm_vm_ioctl_enable_cap_generic(kvm, cap: &cap);
4811	break;
4812	}
4813	case KVM_SET_USER_MEMORY_REGION: {
4814	struct kvm_userspace_memory_region kvm_userspace_mem;
4815
4816	r = -EFAULT;
4817	if (copy_from_user(to: &kvm_userspace_mem, from: argp,
4818	n: sizeof(kvm_userspace_mem)))
4819	goto out;
4820
4821	r = kvm_vm_ioctl_set_memory_region(kvm, mem: &kvm_userspace_mem);
4822	break;
4823	}
4824	case KVM_GET_DIRTY_LOG: {
4825	struct kvm_dirty_log log;
4826
4827	r = -EFAULT;
4828	if (copy_from_user(to: &log, from: argp, n: sizeof(log)))
4829	goto out;
4830	r = kvm_vm_ioctl_get_dirty_log(kvm, log: &log);
4831	break;
4832	}
4833	#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
4834	case KVM_CLEAR_DIRTY_LOG: {
4835	struct kvm_clear_dirty_log log;
4836
4837	r = -EFAULT;
4838	if (copy_from_user(to: &log, from: argp, n: sizeof(log)))
4839	goto out;
4840	r = kvm_vm_ioctl_clear_dirty_log(kvm, log: &log);
4841	break;
4842	}
4843	#endif
4844	#ifdef CONFIG_KVM_MMIO
4845	case KVM_REGISTER_COALESCED_MMIO: {
4846	struct kvm_coalesced_mmio_zone zone;
4847
4848	r = -EFAULT;
4849	if (copy_from_user(to: &zone, from: argp, n: sizeof(zone)))
4850	goto out;
4851	r = kvm_vm_ioctl_register_coalesced_mmio(kvm, zone: &zone);
4852	break;
4853	}
4854	case KVM_UNREGISTER_COALESCED_MMIO: {
4855	struct kvm_coalesced_mmio_zone zone;
4856
4857	r = -EFAULT;
4858	if (copy_from_user(to: &zone, from: argp, n: sizeof(zone)))
4859	goto out;
4860	r = kvm_vm_ioctl_unregister_coalesced_mmio(kvm, zone: &zone);
4861	break;
4862	}
4863	#endif
4864	case KVM_IRQFD: {
4865	struct kvm_irqfd data;
4866
4867	r = -EFAULT;
4868	if (copy_from_user(to: &data, from: argp, n: sizeof(data)))
4869	goto out;
4870	r = kvm_irqfd(kvm, args: &data);
4871	break;
4872	}
4873	case KVM_IOEVENTFD: {
4874	struct kvm_ioeventfd data;
4875
4876	r = -EFAULT;
4877	if (copy_from_user(to: &data, from: argp, n: sizeof(data)))
4878	goto out;
4879	r = kvm_ioeventfd(kvm, args: &data);
4880	break;
4881	}
4882	#ifdef CONFIG_HAVE_KVM_MSI
4883	case KVM_SIGNAL_MSI: {
4884	struct kvm_msi msi;
4885
4886	r = -EFAULT;
4887	if (copy_from_user(to: &msi, from: argp, n: sizeof(msi)))
4888	goto out;
4889	r = kvm_send_userspace_msi(kvm, msi: &msi);
4890	break;
4891	}
4892	#endif
4893	#ifdef __KVM_HAVE_IRQ_LINE
4894	case KVM_IRQ_LINE_STATUS:
4895	case KVM_IRQ_LINE: {
4896	struct kvm_irq_level irq_event;
4897
4898	r = -EFAULT;
4899	if (copy_from_user(to: &irq_event, from: argp, n: sizeof(irq_event)))
4900	goto out;
4901
4902	r = kvm_vm_ioctl_irq_line(kvm, irq_level: &irq_event,
4903	line_status: ioctl == KVM_IRQ_LINE_STATUS);
4904	if (r)
4905	goto out;
4906
4907	r = -EFAULT;
4908	if (ioctl == KVM_IRQ_LINE_STATUS) {
4909	if (copy_to_user(to: argp, from: &irq_event, n: sizeof(irq_event)))
4910	goto out;
4911	}
4912
4913	r = `0`;
4914	break;
4915	}
4916	#endif
4917	#ifdef CONFIG_HAVE_KVM_IRQ_ROUTING
4918	case KVM_SET_GSI_ROUTING: {
4919	struct kvm_irq_routing routing;
4920	struct kvm_irq_routing __user *urouting;
4921	struct kvm_irq_routing_entry *entries = NULL;
4922
4923	r = -EFAULT;
4924	if (copy_from_user(to: &routing, from: argp, n: sizeof(routing)))
4925	goto out;
4926	r = -EINVAL;
4927	if (!kvm_arch_can_set_irq_routing(kvm))
4928	goto out;
4929	if (routing.nr > KVM_MAX_IRQ_ROUTES)
4930	goto out;
4931	if (routing.flags)
4932	goto out;
4933	if (routing.nr) {
4934	urouting = argp;
4935	entries = vmemdup_user(urouting->entries,
4936	array_size(sizeof(*entries),
4937	routing.nr));
4938	if (IS_ERR(ptr: entries)) {
4939	r = PTR_ERR(ptr: entries);
4940	goto out;
4941	}
4942	}
4943	r = kvm_set_irq_routing(kvm, entries, nr: routing.nr,
4944	flags: routing.flags);
4945	kvfree(addr: entries);
4946	break;
4947	}
4948	#endif /* CONFIG_HAVE_KVM_IRQ_ROUTING */
4949	case KVM_CREATE_DEVICE: {
4950	struct kvm_create_device cd;
4951
4952	r = -EFAULT;
4953	if (copy_from_user(to: &cd, from: argp, n: sizeof(cd)))
4954	goto out;
4955
4956	r = kvm_ioctl_create_device(kvm, cd: &cd);
4957	if (r)
4958	goto out;
4959
4960	r = -EFAULT;
4961	if (copy_to_user(to: argp, from: &cd, n: sizeof(cd)))
4962	goto out;
4963
4964	r = `0`;
4965	break;
4966	}
4967	case KVM_CHECK_EXTENSION:
4968	r = kvm_vm_ioctl_check_extension_generic(kvm, arg);
4969	break;
4970	case KVM_RESET_DIRTY_RINGS:
4971	r = kvm_vm_ioctl_reset_dirty_pages(kvm);
4972	break;
4973	case KVM_GET_STATS_FD:
4974	r = kvm_vm_ioctl_get_stats_fd(kvm);
4975	break;
4976	default:
4977	r = kvm_arch_vm_ioctl(filp, ioctl, arg);
4978	}
4979	out:
4980	return r;
4981	}
4982
4983	#ifdef CONFIG_KVM_COMPAT
4984	struct compat_kvm_dirty_log {
4985	__u32 slot;
4986	__u32 padding1;
4987	union {
4988	compat_uptr_t dirty_bitmap; / one bit per page /
4989	__u64 padding2;
4990	};
4991	};
4992
4993	struct compat_kvm_clear_dirty_log {
4994	__u32 slot;
4995	__u32 num_pages;
4996	__u64 first_page;
4997	union {
4998	compat_uptr_t dirty_bitmap; / one bit per page /
4999	__u64 padding2;
5000	};
5001	};
5002
5003	long __weak kvm_arch_vm_compat_ioctl(struct file filp, unsigned* int ioctl,
5004	unsigned long arg)
5005	{
5006	return -ENOTTY;
5007	}
5008
5009	static long kvm_vm_compat_ioctl(struct file *filp,
5010	unsigned int ioctl, unsigned long arg)
5011	{
5012	struct kvm *kvm = filp->private_data;
5013	int r;
5014
5015	if (kvm->mm != current->mm \|\| kvm->vm_dead)
5016	return -EIO;
5017
5018	r = kvm_arch_vm_compat_ioctl(filp, ioctl, arg);
5019	if (r != -ENOTTY)
5020	return r;
5021
5022	switch (ioctl) {
5023	#ifdef CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT
5024	case KVM_CLEAR_DIRTY_LOG: {
5025	struct compat_kvm_clear_dirty_log compat_log;
5026	struct kvm_clear_dirty_log log;
5027
5028	if (copy_from_user(to: &compat_log, from: (void __user *)arg,
5029	n: sizeof(compat_log)))
5030	return -EFAULT;
5031	log.slot = compat_log.slot;
5032	log.num_pages = compat_log.num_pages;
5033	log.first_page = compat_log.first_page;
5034	log.padding2 = compat_log.padding2;
5035	log.dirty_bitmap = compat_ptr(uptr: compat_log.dirty_bitmap);
5036
5037	r = kvm_vm_ioctl_clear_dirty_log(kvm, log: &log);
5038	break;
5039	}
5040	#endif
5041	case KVM_GET_DIRTY_LOG: {
5042	struct compat_kvm_dirty_log compat_log;
5043	struct kvm_dirty_log log;
5044
5045	if (copy_from_user(to: &compat_log, from: (void __user *)arg,
5046	n: sizeof(compat_log)))
5047	return -EFAULT;
5048	log.slot = compat_log.slot;
5049	log.padding1 = compat_log.padding1;
5050	log.padding2 = compat_log.padding2;
5051	log.dirty_bitmap = compat_ptr(uptr: compat_log.dirty_bitmap);
5052
5053	r = kvm_vm_ioctl_get_dirty_log(kvm, log: &log);
5054	break;
5055	}
5056	default:
5057	r = kvm_vm_ioctl(filp, ioctl, arg);
5058	}
5059	return r;
5060	}
5061	#endif
5062
5063	static const struct file_operations kvm_vm_fops = {
5064	.release = kvm_vm_release,
5065	.unlocked_ioctl = kvm_vm_ioctl,
5066	.llseek = noop_llseek,
5067	KVM_COMPAT(kvm_vm_compat_ioctl),
5068	};
5069
5070	bool file_is_kvm(struct file *file)
5071	{
5072	return file && file->f_op == &kvm_vm_fops;
5073	}
5074	EXPORT_SYMBOL_GPL(file_is_kvm);
5075
5076	static int kvm_dev_ioctl_create_vm(unsigned long type)
5077	{
5078	char fdname[ITOA_MAX_LEN + `1`];
5079	int r, fd;
5080	struct kvm *kvm;
5081	struct file *file;
5082
5083	fd = get_unused_fd_flags(O_CLOEXEC);
5084	if (fd < `0`)
5085	return fd;
5086
5087	snprintf(buf: fdname, size: sizeof(fdname), fmt: "%d", fd);
5088
5089	kvm = kvm_create_vm(type, fdname);
5090	if (IS_ERR(ptr: kvm)) {
5091	r = PTR_ERR(ptr: kvm);
5092	goto put_fd;
5093	}
5094
5095	file = anon_inode_getfile(name: "kvm-vm", fops: &kvm_vm_fops, priv: kvm, O_RDWR);
5096	if (IS_ERR(ptr: file)) {
5097	r = PTR_ERR(ptr: file);
5098	goto put_kvm;
5099	}
5100
5101	/*
5102	* Don't call kvm_put_kvm anymore at this point; file->f_op is
5103	* already set, with ->release() being kvm_vm_release(). In error
5104	* cases it will be called by the final fput(file) and will take
5105	* care of doing kvm_put_kvm(kvm).
5106	*/
5107	kvm_uevent_notify_change(KVM_EVENT_CREATE_VM, kvm);
5108
5109	fd_install(fd, file);
5110	return fd;
5111
5112	put_kvm:
5113	kvm_put_kvm(kvm);
5114	put_fd:
5115	put_unused_fd(fd);
5116	return r;
5117	}
5118
5119	static long kvm_dev_ioctl(struct file *filp,
5120	unsigned int ioctl, unsigned long arg)
5121	{
5122	int r = -EINVAL;
5123
5124	switch (ioctl) {
5125	case KVM_GET_API_VERSION:
5126	if (arg)
5127	goto out;
5128	r = KVM_API_VERSION;
5129	break;
5130	case KVM_CREATE_VM:
5131	r = kvm_dev_ioctl_create_vm(type: arg);
5132	break;
5133	case KVM_CHECK_EXTENSION:
5134	r = kvm_vm_ioctl_check_extension_generic(NULL, arg);
5135	break;
5136	case KVM_GET_VCPU_MMAP_SIZE:
5137	if (arg)
5138	goto out;
5139	r = PAGE_SIZE; / struct kvm_run /
5140	#ifdef CONFIG_X86
5141	r += PAGE_SIZE; / pio data page /
5142	#endif
5143	#ifdef CONFIG_KVM_MMIO
5144	r += PAGE_SIZE; / coalesced mmio ring page /
5145	#endif
5146	break;
5147	case KVM_TRACE_ENABLE:
5148	case KVM_TRACE_PAUSE:
5149	case KVM_TRACE_DISABLE:
5150	r = -EOPNOTSUPP;
5151	break;
5152	default:
5153	return kvm_arch_dev_ioctl(filp, ioctl, arg);
5154	}
5155	out:
5156	return r;
5157	}
5158
5159	static struct file_operations kvm_chardev_ops = {
5160	.unlocked_ioctl = kvm_dev_ioctl,
5161	.llseek = noop_llseek,
5162	KVM_COMPAT(kvm_dev_ioctl),
5163	};
5164
5165	static struct miscdevice kvm_dev = {
5166	KVM_MINOR,
5167	"kvm",
5168	&kvm_chardev_ops,
5169	};
5170
5171	#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
5172	__visible bool kvm_rebooting;
5173	EXPORT_SYMBOL_GPL(kvm_rebooting);
5174
5175	static DEFINE_PER_CPU(bool, hardware_enabled);
5176	static int kvm_usage_count;
5177
5178	static int __hardware_enable_nolock(void)
5179	{
5180	if (__this_cpu_read(hardware_enabled))
5181	return `0`;
5182
5183	if (kvm_arch_hardware_enable()) {
5184	pr_info("kvm: enabling virtualization on CPU%d failed\n",
5185	raw_smp_processor_id());
5186	return -EIO;
5187	}
5188
5189	__this_cpu_write(hardware_enabled, true);
5190	return `0`;
5191	}
5192
/*
 * on_each_cpu() callback: enable virtualization on this CPU and bump the
 * atomic_t failure counter passed in @failed if that does not succeed.
 */
static void hardware_enable_nolock(void *failed)
{
	int err = __hardware_enable_nolock();

	if (err)
		atomic_inc(failed);
}
5198
5199	static int kvm_online_cpu(unsigned int cpu)
5200	{
5201	int ret = `0`;
5202
5203	/*
5204	* Abort the CPU online process if hardware virtualization cannot
5205	* be enabled. Otherwise running VMs would encounter unrecoverable
5206	* errors when scheduled to this CPU.
5207	*/
5208	mutex_lock(&kvm_lock);
5209	if (kvm_usage_count)
5210	ret = __hardware_enable_nolock();
5211	mutex_unlock(lock: &kvm_lock);
5212	return ret;
5213	}
5214
5215	static void hardware_disable_nolock(void *junk)
5216	{
5217	/*
5218	* Note, hardware_disable_all_nolock() tells all online CPUs to disable
5219	* hardware, not just CPUs that successfully enabled hardware!
5220	*/
5221	if (!__this_cpu_read(hardware_enabled))
5222	return;
5223
5224	kvm_arch_hardware_disable();
5225
5226	__this_cpu_write(hardware_enabled, false);
5227	}
5228
5229	static int kvm_offline_cpu(unsigned int cpu)
5230	{
5231	mutex_lock(&kvm_lock);
5232	if (kvm_usage_count)
5233	hardware_disable_nolock(NULL);
5234	mutex_unlock(lock: &kvm_lock);
5235	return `0`;
5236	}
5237
5238	static void hardware_disable_all_nolock(void)
5239	{
5240	BUG_ON(!kvm_usage_count);
5241
5242	kvm_usage_count--;
5243	if (!kvm_usage_count)
5244	on_each_cpu(func: hardware_disable_nolock, NULL, wait: `1`);
5245	}
5246
5247	static void hardware_disable_all(void)
5248	{
5249	cpus_read_lock();
5250	mutex_lock(&kvm_lock);
5251	hardware_disable_all_nolock();
5252	mutex_unlock(lock: &kvm_lock);
5253	cpus_read_unlock();
5254	}
5255
5256	static int hardware_enable_all(void)
5257	{
5258	atomic_t failed = ATOMIC_INIT(`0`);
5259	int r;
5260
5261	/*
5262	* Do not enable hardware virtualization if the system is going down.
5263	* If userspace initiated a forced reboot, e.g. reboot -f, then it's
5264	* possible for an in-flight KVM_CREATE_VM to trigger hardware enabling
5265	* after kvm_reboot() is called. Note, this relies on system_state
5266	* being set _before_ kvm_reboot(), which is why KVM uses a syscore ops
5267	* hook instead of registering a dedicated reboot notifier (the latter
5268	* runs before system_state is updated).
5269	*/
5270	if (system_state == SYSTEM_HALT \|\| system_state == SYSTEM_POWER_OFF \|\|
5271	system_state == SYSTEM_RESTART)
5272	return -EBUSY;
5273
5274	/*
5275	* When onlining a CPU, cpu_online_mask is set before kvm_online_cpu()
5276	* is called, and so on_each_cpu() between them includes the CPU that
5277	* is being onlined. As a result, hardware_enable_nolock() may get
5278	* invoked before kvm_online_cpu(), which also enables hardware if the
5279	* usage count is non-zero. Disable CPU hotplug to avoid attempting to
5280	* enable hardware multiple times.
5281	*/
5282	cpus_read_lock();
5283	mutex_lock(&kvm_lock);
5284
5285	r = `0`;
5286
5287	kvm_usage_count++;
5288	if (kvm_usage_count == `1`) {
5289	on_each_cpu(func: hardware_enable_nolock, info: &failed, wait: `1`);
5290
5291	if (atomic_read(v: &failed)) {
5292	hardware_disable_all_nolock();
5293	r = -EBUSY;
5294	}
5295	}
5296
5297	mutex_unlock(lock: &kvm_lock);
5298	cpus_read_unlock();
5299
5300	return r;
5301	}
5302
5303	static void kvm_shutdown(void)
5304	{
5305	/*
5306	* Disable hardware virtualization and set kvm_rebooting to indicate
5307	* that KVM has asynchronously disabled hardware virtualization, i.e.
5308	* that relevant errors and exceptions aren't entirely unexpected.
5309	* Some flavors of hardware virtualization need to be disabled before
5310	* transferring control to firmware (to perform shutdown/reboot), e.g.
5311	* on x86, virtualization can block INIT interrupts, which are used by
5312	* firmware to pull APs back under firmware control. Note, this path
5313	* is used for both shutdown and reboot scenarios, i.e. neither name is
5314	* 100% comprehensive.
5315	*/
5316	pr_info("kvm: exiting hardware virtualization\n");
5317	kvm_rebooting = true;
5318	on_each_cpu(func: hardware_disable_nolock, NULL, wait: `1`);
5319	}
5320
5321	static int kvm_suspend(void)
5322	{
5323	/*
5324	* Secondary CPUs and CPU hotplug are disabled across the suspend/resume
5325	* callbacks, i.e. no need to acquire kvm_lock to ensure the usage count
5326	* is stable. Assert that kvm_lock is not held to ensure the system
5327	* isn't suspended while KVM is enabling hardware. Hardware enabling
5328	* can be preempted, but the task cannot be frozen until it has dropped
5329	* all locks (userspace tasks are frozen via a fake signal).
5330	*/
5331	lockdep_assert_not_held(&kvm_lock);
5332	lockdep_assert_irqs_disabled();
5333
5334	if (kvm_usage_count)
5335	hardware_disable_nolock(NULL);
5336	return `0`;
5337	}
5338
5339	static void kvm_resume(void)
5340	{
5341	lockdep_assert_not_held(&kvm_lock);
5342	lockdep_assert_irqs_disabled();
5343
5344	if (kvm_usage_count)
5345	WARN_ON_ONCE(__hardware_enable_nolock());
5346	}
5347
5348	static struct syscore_ops kvm_syscore_ops = {
5349	.suspend = kvm_suspend,
5350	.resume = kvm_resume,
5351	.shutdown = kvm_shutdown,
5352	};
5353	#else /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
/*
 * Stub for builds without CONFIG_KVM_GENERIC_HARDWARE_ENABLING: there is
 * nothing to enable here, so always report success.
 */
static int hardware_enable_all(void)
{
	return 0;
}
5358
/* Stub for builds without CONFIG_KVM_GENERIC_HARDWARE_ENABLING: no-op. */
static void hardware_disable_all(void)
{
}
5363	#endif /* CONFIG_KVM_GENERIC_HARDWARE_ENABLING */
5364
5365	static void kvm_iodevice_destructor(struct kvm_io_device *dev)
5366	{
5367	if (dev->ops->destructor)
5368	dev->ops->destructor(dev);
5369	}
5370
5371	static void kvm_io_bus_destroy(struct kvm_io_bus *bus)
5372	{
5373	int i;
5374
5375	for (i = `0`; i < bus->dev_count; i++) {
5376	struct kvm_io_device *pos = bus->range[i].dev;
5377
5378	kvm_iodevice_destructor(dev: pos);
5379	}
5380	kfree(objp: bus);
5381	}
5382
5383	static inline int kvm_io_bus_cmp(const struct kvm_io_range *r1,
5384	const struct kvm_io_range *r2)
5385	{
5386	gpa_t addr1 = r1->addr;
5387	gpa_t addr2 = r2->addr;
5388
5389	if (addr1 < addr2)
5390	return -`1`;
5391
5392	/ If r2->len == 0, match the exact address. If r2->len != 0,*
5393	* accept any overlapping write. Any order is acceptable for
5394	* overlapping ranges, because kvm_io_bus_get_first_dev ensures
5395	* we process all of them.
5396	*/
5397	if (r2->len) {
5398	addr1 += r1->len;
5399	addr2 += r2->len;
5400	}
5401
5402	if (addr1 > addr2)
5403	return `1`;
5404
5405	return `0`;
5406	}
5407
/*
 * Adapter giving kvm_io_bus_cmp() the untyped comparator signature expected
 * by bsearch(); both arguments point to struct kvm_io_range.
 */
static int kvm_io_bus_sort_cmp(const void *p1, const void *p2)
{
	return kvm_io_bus_cmp(p1, p2);
}
5412
5413	static int kvm_io_bus_get_first_dev(struct kvm_io_bus *bus,
5414	gpa_t addr, int len)
5415	{
5416	struct kvm_io_range *range, key;
5417	int off;
5418
5419	key = (struct kvm_io_range) {
5420	.addr = addr,
5421	.len = len,
5422	};
5423
5424	range = bsearch(key: &key, base: bus->range, num: bus->dev_count,
5425	size: sizeof(struct kvm_io_range), cmp: kvm_io_bus_sort_cmp);
5426	if (range == NULL)
5427	return -ENOENT;
5428
5429	off = range - bus->range;
5430
5431	while (off > `0` && kvm_io_bus_cmp(r1: &key, r2: &bus->range[off-`1`]) == `0`)
5432	off--;
5433
5434	return off;
5435	}
5436
5437	static int __kvm_io_bus_write(struct kvm_vcpu vcpu, struct* kvm_io_bus *bus,
5438	struct kvm_io_range range, const* void *val)
5439	{
5440	int idx;
5441
5442	idx = kvm_io_bus_get_first_dev(bus, addr: range->addr, len: range->len);
5443	if (idx < `0`)
5444	return -EOPNOTSUPP;
5445
5446	while (idx < bus->dev_count &&
5447	kvm_io_bus_cmp(r1: range, r2: &bus->range[idx]) == `0`) {
5448	if (!kvm_iodevice_write(vcpu, dev: bus->range[idx].dev, addr: range->addr,
5449	l: range->len, v: val))
5450	return idx;
5451	idx++;
5452	}
5453
5454	return -EOPNOTSUPP;
5455	}
5456
5457	/ kvm_io_bus_write - called under kvm->slots_lock /
5458	int kvm_io_bus_write(struct kvm_vcpu vcpu, enum* kvm_bus bus_idx, gpa_t addr,
5459	int len, const void *val)
5460	{
5461	struct kvm_io_bus *bus;
5462	struct kvm_io_range range;
5463	int r;
5464
5465	range = (struct kvm_io_range) {
5466	.addr = addr,
5467	.len = len,
5468	};
5469
5470	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5471	if (!bus)
5472	return -ENOMEM;
5473	r = __kvm_io_bus_write(vcpu, bus, range: &range, val);
5474	return r < `0` ? r : `0`;
5475	}
5476	EXPORT_SYMBOL_GPL(kvm_io_bus_write);
5477
5478	/ kvm_io_bus_write_cookie - called under kvm->slots_lock /
5479	int kvm_io_bus_write_cookie(struct kvm_vcpu vcpu, enum* kvm_bus bus_idx,
5480	gpa_t addr, int len, const void val, long* cookie)
5481	{
5482	struct kvm_io_bus *bus;
5483	struct kvm_io_range range;
5484
5485	range = (struct kvm_io_range) {
5486	.addr = addr,
5487	.len = len,
5488	};
5489
5490	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5491	if (!bus)
5492	return -ENOMEM;
5493
5494	/ First try the device referenced by cookie. /
5495	if ((cookie >= `0`) && (cookie < bus->dev_count) &&
5496	(kvm_io_bus_cmp(r1: &range, r2: &bus->range[cookie]) == `0`))
5497	if (!kvm_iodevice_write(vcpu, dev: bus->range[cookie].dev, addr, l: len,
5498	v: val))
5499	return cookie;
5500
5501	/*
5502	* cookie contained garbage; fall back to search and return the
5503	* correct cookie value.
5504	*/
5505	return __kvm_io_bus_write(vcpu, bus, range: &range, val);
5506	}
5507
5508	static int __kvm_io_bus_read(struct kvm_vcpu vcpu, struct* kvm_io_bus *bus,
5509	struct kvm_io_range range, void* *val)
5510	{
5511	int idx;
5512
5513	idx = kvm_io_bus_get_first_dev(bus, addr: range->addr, len: range->len);
5514	if (idx < `0`)
5515	return -EOPNOTSUPP;
5516
5517	while (idx < bus->dev_count &&
5518	kvm_io_bus_cmp(r1: range, r2: &bus->range[idx]) == `0`) {
5519	if (!kvm_iodevice_read(vcpu, dev: bus->range[idx].dev, addr: range->addr,
5520	l: range->len, v: val))
5521	return idx;
5522	idx++;
5523	}
5524
5525	return -EOPNOTSUPP;
5526	}
5527
5528	/ kvm_io_bus_read - called under kvm->slots_lock /
5529	int kvm_io_bus_read(struct kvm_vcpu vcpu, enum* kvm_bus bus_idx, gpa_t addr,
5530	int len, void *val)
5531	{
5532	struct kvm_io_bus *bus;
5533	struct kvm_io_range range;
5534	int r;
5535
5536	range = (struct kvm_io_range) {
5537	.addr = addr,
5538	.len = len,
5539	};
5540
5541	bus = srcu_dereference(vcpu->kvm->buses[bus_idx], &vcpu->kvm->srcu);
5542	if (!bus)
5543	return -ENOMEM;
5544	r = __kvm_io_bus_read(vcpu, bus, range: &range, val);
5545	return r < `0` ? r : `0`;
5546	}
5547
5548	/ Caller must hold slots_lock. /
5549	int kvm_io_bus_register_dev(struct kvm kvm, enum* kvm_bus bus_idx, gpa_t addr,
5550	int len, struct kvm_io_device *dev)
5551	{
5552	int i;
5553	struct kvm_io_bus new_bus, bus;
5554	struct kvm_io_range range;
5555
5556	bus = kvm_get_bus(kvm, idx: bus_idx);
5557	if (!bus)
5558	return -ENOMEM;
5559
5560	/ exclude ioeventfd which is limited by maximum fd /
5561	if (bus->dev_count - bus->ioeventfd_count > NR_IOBUS_DEVS - `1`)
5562	return -ENOSPC;
5563
5564	new_bus = kmalloc(struct_size(bus, range, bus->dev_count + `1`),
5565	GFP_KERNEL_ACCOUNT);
5566	if (!new_bus)
5567	return -ENOMEM;
5568
5569	range = (struct kvm_io_range) {
5570	.addr = addr,
5571	.len = len,
5572	.dev = dev,
5573	};
5574
5575	for (i = `0`; i < bus->dev_count; i++)
5576	if (kvm_io_bus_cmp(r1: &bus->range[i], r2: &range) > `0`)
5577	break;
5578
5579	memcpy(new_bus, bus, sizeof(bus) + i sizeof(struct kvm_io_range));
5580	new_bus->dev_count++;
5581	new_bus->range[i] = range;
5582	memcpy(new_bus->range + i + `1`, bus->range + i,
5583	(bus->dev_count - i) * sizeof(struct kvm_io_range));
5584	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5585	synchronize_srcu_expedited(ssp: &kvm->srcu);
5586	kfree(objp: bus);
5587
5588	return `0`;
5589	}
5590
5591	int kvm_io_bus_unregister_dev(struct kvm kvm, enum* kvm_bus bus_idx,
5592	struct kvm_io_device *dev)
5593	{
5594	int i;
5595	struct kvm_io_bus new_bus, bus;
5596
5597	lockdep_assert_held(&kvm->slots_lock);
5598
5599	bus = kvm_get_bus(kvm, idx: bus_idx);
5600	if (!bus)
5601	return `0`;
5602
5603	for (i = `0`; i < bus->dev_count; i++) {
5604	if (bus->range[i].dev == dev) {
5605	break;
5606	}
5607	}
5608
5609	if (i == bus->dev_count)
5610	return `0`;
5611
5612	new_bus = kmalloc(struct_size(bus, range, bus->dev_count - `1`),
5613	GFP_KERNEL_ACCOUNT);
5614	if (new_bus) {
5615	memcpy(new_bus, bus, struct_size(bus, range, i));
5616	new_bus->dev_count--;
5617	memcpy(new_bus->range + i, bus->range + i + `1`,
5618	flex_array_size(new_bus, range, new_bus->dev_count - i));
5619	}
5620
5621	rcu_assign_pointer(kvm->buses[bus_idx], new_bus);
5622	synchronize_srcu_expedited(ssp: &kvm->srcu);
5623
5624	/*
5625	* If NULL bus is installed, destroy the old bus, including all the
5626	* attached devices. Otherwise, destroy the caller's device only.
5627	*/
5628	if (!new_bus) {
5629	pr_err("kvm: failed to shrink bus, removing it completely\n");
5630	kvm_io_bus_destroy(bus);
5631	return -ENOMEM;
5632	}
5633
5634	kvm_iodevice_destructor(dev);
5635	kfree(objp: bus);
5636	return `0`;
5637	}
5638
5639	struct kvm_io_device kvm_io_bus_get_dev(struct* kvm kvm, enum* kvm_bus bus_idx,
5640	gpa_t addr)
5641	{
5642	struct kvm_io_bus *bus;
5643	int dev_idx, srcu_idx;
5644	struct kvm_io_device *iodev = NULL;
5645
5646	srcu_idx = srcu_read_lock(ssp: &kvm->srcu);
5647
5648	bus = srcu_dereference(kvm->buses[bus_idx], &kvm->srcu);
5649	if (!bus)
5650	goto out_unlock;
5651
5652	dev_idx = kvm_io_bus_get_first_dev(bus, addr, len: `1`);
5653	if (dev_idx < `0`)
5654	goto out_unlock;
5655
5656	iodev = bus->range[dev_idx].dev;
5657
5658	out_unlock:
5659	srcu_read_unlock(ssp: &kvm->srcu, idx: srcu_idx);
5660
5661	return iodev;
5662	}
5663	EXPORT_SYMBOL_GPL(kvm_io_bus_get_dev);
5664
5665	static int kvm_debugfs_open(struct inode inode, struct* file *file,
5666	int (get)(void* , u64 ), int (set)(void* *, u64),
5667	const char *fmt)
5668	{
5669	int ret;
5670	struct kvm_stat_data *stat_data = inode->i_private;
5671
5672	/*
5673	* The debugfs files are a reference to the kvm struct which
5674	* is still valid when kvm_destroy_vm is called. kvm_get_kvm_safe
5675	* avoids the race between open and the removal of the debugfs directory.
5676	*/
5677	if (!kvm_get_kvm_safe(stat_data->kvm))
5678	return -ENOENT;
5679
5680	ret = simple_attr_open(inode, file, get,
5681	set: kvm_stats_debugfs_mode(pdesc: stat_data->desc) & `0222`
5682	? set : NULL, fmt);
5683	if (ret)
5684	kvm_put_kvm(stat_data->kvm);
5685
5686	return ret;
5687	}
5688
5689	static int kvm_debugfs_release(struct inode inode, struct* file *file)
5690	{
5691	struct kvm_stat_data *stat_data = inode->i_private;
5692
5693	simple_attr_release(inode, file);
5694	kvm_put_kvm(stat_data->kvm);
5695
5696	return `0`;
5697	}
5698
5699	static int kvm_get_stat_per_vm(struct kvm kvm, size_t offset, u64 val)
5700	{
5701	val = (u64 )((void* *)(&kvm->stat) + offset);
5702
5703	return `0`;
5704	}
5705
5706	static int kvm_clear_stat_per_vm(struct kvm *kvm, size_t offset)
5707	{
5708	(u64 )((void *)(&kvm->stat) + offset) = `0`;
5709
5710	return `0`;
5711	}
5712
5713	static int kvm_get_stat_per_vcpu(struct kvm kvm, size_t offset, u64 val)
5714	{
5715	unsigned long i;
5716	struct kvm_vcpu *vcpu;
5717
5718	*val = `0`;
5719
5720	kvm_for_each_vcpu(i, vcpu, kvm)
5721	val += (u64 )((void* *)(&vcpu->stat) + offset);
5722
5723	return `0`;
5724	}
5725
5726	static int kvm_clear_stat_per_vcpu(struct kvm *kvm, size_t offset)
5727	{
5728	unsigned long i;
5729	struct kvm_vcpu *vcpu;
5730
5731	kvm_for_each_vcpu(i, vcpu, kvm)
5732	(u64 )((void *)(&vcpu->stat) + offset) = `0`;
5733
5734	return `0`;
5735	}
5736
5737	static int kvm_stat_data_get(void data, u64 val)
5738	{
5739	int r = -EFAULT;
5740	struct kvm_stat_data *stat_data = data;
5741
5742	switch (stat_data->kind) {
5743	case KVM_STAT_VM:
5744	r = kvm_get_stat_per_vm(kvm: stat_data->kvm,
5745	offset: stat_data->desc->desc.offset, val);
5746	break;
5747	case KVM_STAT_VCPU:
5748	r = kvm_get_stat_per_vcpu(kvm: stat_data->kvm,
5749	offset: stat_data->desc->desc.offset, val);
5750	break;
5751	}
5752
5753	return r;
5754	}
5755
5756	static int kvm_stat_data_clear(void *data, u64 val)
5757	{
5758	int r = -EFAULT;
5759	struct kvm_stat_data *stat_data = data;
5760
5761	if (val)
5762	return -EINVAL;
5763
5764	switch (stat_data->kind) {
5765	case KVM_STAT_VM:
5766	r = kvm_clear_stat_per_vm(kvm: stat_data->kvm,
5767	offset: stat_data->desc->desc.offset);
5768	break;
5769	case KVM_STAT_VCPU:
5770	r = kvm_clear_stat_per_vcpu(kvm: stat_data->kvm,
5771	offset: stat_data->desc->desc.offset);
5772	break;
5773	}
5774
5775	return r;
5776	}
5777
5778	static int kvm_stat_data_open(struct inode inode, struct* file *file)
5779	{
5780	__simple_attr_check_format(fmt: "%llu\n", `0ull`);
5781	return kvm_debugfs_open(inode, file, get: kvm_stat_data_get,
5782	set: kvm_stat_data_clear, fmt: "%llu\n");
5783	}
5784
5785	static const struct file_operations stat_fops_per_vm = {
5786	.owner = THIS_MODULE,
5787	.open = kvm_stat_data_open,
5788	.release = kvm_debugfs_release,
5789	.read = simple_attr_read,
5790	.write = simple_attr_write,
5791	.llseek = no_llseek,
5792	};
5793
5794	static int vm_stat_get(void _offset, u64 val)
5795	{
5796	unsigned offset = (long)_offset;
5797	struct kvm *kvm;
5798	u64 tmp_val;
5799
5800	*val = `0`;
5801	mutex_lock(&kvm_lock);
5802	list_for_each_entry(kvm, &vm_list, vm_list) {
5803	kvm_get_stat_per_vm(kvm, offset, val: &tmp_val);
5804	*val += tmp_val;
5805	}
5806	mutex_unlock(lock: &kvm_lock);
5807	return `0`;
5808	}
5809
5810	static int vm_stat_clear(void *_offset, u64 val)
5811	{
5812	unsigned offset = (long)_offset;
5813	struct kvm *kvm;
5814
5815	if (val)
5816	return -EINVAL;
5817
5818	mutex_lock(&kvm_lock);
5819	list_for_each_entry(kvm, &vm_list, vm_list) {
5820	kvm_clear_stat_per_vm(kvm, offset);
5821	}
5822	mutex_unlock(lock: &kvm_lock);
5823
5824	return `0`;
5825	}
5826
5827	DEFINE_SIMPLE_ATTRIBUTE(vm_stat_fops, vm_stat_get, vm_stat_clear, "%llu\n");
5828	DEFINE_SIMPLE_ATTRIBUTE(vm_stat_readonly_fops, vm_stat_get, NULL, "%llu\n");
5829
5830	static int vcpu_stat_get(void _offset, u64 val)
5831	{
5832	unsigned offset = (long)_offset;
5833	struct kvm *kvm;
5834	u64 tmp_val;
5835
5836	*val = `0`;
5837	mutex_lock(&kvm_lock);
5838	list_for_each_entry(kvm, &vm_list, vm_list) {
5839	kvm_get_stat_per_vcpu(kvm, offset, val: &tmp_val);
5840	*val += tmp_val;
5841	}
5842	mutex_unlock(lock: &kvm_lock);
5843	return `0`;
5844	}
5845
5846	static int vcpu_stat_clear(void *_offset, u64 val)
5847	{
5848	unsigned offset = (long)_offset;
5849	struct kvm *kvm;
5850
5851	if (val)
5852	return -EINVAL;
5853
5854	mutex_lock(&kvm_lock);
5855	list_for_each_entry(kvm, &vm_list, vm_list) {
5856	kvm_clear_stat_per_vcpu(kvm, offset);
5857	}
5858	mutex_unlock(lock: &kvm_lock);
5859
5860	return `0`;
5861	}
5862
5863	DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_fops, vcpu_stat_get, vcpu_stat_clear,
5864	"%llu\n");
5865	DEFINE_SIMPLE_ATTRIBUTE(vcpu_stat_readonly_fops, vcpu_stat_get, NULL, "%llu\n");
5866
5867	static void kvm_uevent_notify_change(unsigned int type, struct kvm *kvm)
5868	{
5869	struct kobj_uevent_env *env;
5870	unsigned long long created, active;
5871
5872	if (!kvm_dev.this_device \|\| !kvm)
5873	return;
5874
5875	mutex_lock(&kvm_lock);
5876	if (type == KVM_EVENT_CREATE_VM) {
5877	kvm_createvm_count++;
5878	kvm_active_vms++;
5879	} else if (type == KVM_EVENT_DESTROY_VM) {
5880	kvm_active_vms--;
5881	}
5882	created = kvm_createvm_count;
5883	active = kvm_active_vms;
5884	mutex_unlock(lock: &kvm_lock);
5885
5886	env = kzalloc(size: sizeof(*env), GFP_KERNEL_ACCOUNT);
5887	if (!env)
5888	return;
5889
5890	add_uevent_var(env, format: "CREATED=%llu", created);
5891	add_uevent_var(env, format: "COUNT=%llu", active);
5892
5893	if (type == KVM_EVENT_CREATE_VM) {
5894	add_uevent_var(env, format: "EVENT=create");
5895	kvm->userspace_pid = task_pid_nr(current);
5896	} else if (type == KVM_EVENT_DESTROY_VM) {
5897	add_uevent_var(env, format: "EVENT=destroy");
5898	}
5899	add_uevent_var(env, format: "PID=%d", kvm->userspace_pid);
5900
5901	if (!IS_ERR(ptr: kvm->debugfs_dentry)) {
5902	char tmp, p = kmalloc(PATH_MAX, GFP_KERNEL_ACCOUNT);
5903
5904	if (p) {
5905	tmp = dentry_path_raw(kvm->debugfs_dentry, p, PATH_MAX);
5906	if (!IS_ERR(ptr: tmp))
5907	add_uevent_var(env, format: "STATS_PATH=%s", tmp);
5908	kfree(objp: p);
5909	}
5910	}
5911	/ no need for checks, since we are adding at most only 5 keys /
5912	env->envp[env->envp_idx++] = NULL;
5913	kobject_uevent_env(kobj: &kvm_dev.this_device->kobj, action: KOBJ_CHANGE, envp: env->envp);
5914	kfree(objp: env);
5915	}
5916
5917	static void kvm_init_debug(void)
5918	{
5919	const struct file_operations *fops;
5920	const struct _kvm_stats_desc *pdesc;
5921	int i;
5922
5923	kvm_debugfs_dir = debugfs_create_dir(name: "kvm", NULL);
5924
5925	for (i = `0`; i < kvm_vm_stats_header.num_desc; ++i) {
5926	pdesc = &kvm_vm_stats_desc[i];
5927	if (kvm_stats_debugfs_mode(pdesc) & `0222`)
5928	fops = &vm_stat_fops;
5929	else
5930	fops = &vm_stat_readonly_fops;
5931	debugfs_create_file(name: pdesc->name, mode: kvm_stats_debugfs_mode(pdesc),
5932	parent: kvm_debugfs_dir,
5933	data: (void )(long*)pdesc->desc.offset, fops);
5934	}
5935
5936	for (i = `0`; i < kvm_vcpu_stats_header.num_desc; ++i) {
5937	pdesc = &kvm_vcpu_stats_desc[i];
5938	if (kvm_stats_debugfs_mode(pdesc) & `0222`)
5939	fops = &vcpu_stat_fops;
5940	else
5941	fops = &vcpu_stat_readonly_fops;
5942	debugfs_create_file(name: pdesc->name, mode: kvm_stats_debugfs_mode(pdesc),
5943	parent: kvm_debugfs_dir,
5944	data: (void )(long*)pdesc->desc.offset, fops);
5945	}
5946	}
5947
5948	static inline
5949	struct kvm_vcpu preempt_notifier_to_vcpu(struct* preempt_notifier *pn)
5950	{
5951	return container_of(pn, struct kvm_vcpu, preempt_notifier);
5952	}
5953
5954	static void kvm_sched_in(struct preempt_notifier pn, int* cpu)
5955	{
5956	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5957
5958	WRITE_ONCE(vcpu->preempted, false);
5959	WRITE_ONCE(vcpu->ready, false);
5960
5961	__this_cpu_write(kvm_running_vcpu, vcpu);
5962	kvm_arch_sched_in(vcpu, cpu);
5963	kvm_arch_vcpu_load(vcpu, cpu);
5964	}
5965
5966	static void kvm_sched_out(struct preempt_notifier *pn,
5967	struct task_struct *next)
5968	{
5969	struct kvm_vcpu *vcpu = preempt_notifier_to_vcpu(pn);
5970
5971	if (current->on_rq) {
5972	WRITE_ONCE(vcpu->preempted, true);
5973	WRITE_ONCE(vcpu->ready, true);
5974	}
5975	kvm_arch_vcpu_put(vcpu);
5976	__this_cpu_write(kvm_running_vcpu, NULL);
5977	}
5978
5979	/**
5980	* kvm_get_running_vcpu - get the vcpu running on the current CPU.
5981	*
5982	* We can disable preemption locally around accessing the per-CPU variable,
5983	* and use the resolved vcpu pointer after enabling preemption again,
5984	* because even if the current thread is migrated to another CPU, reading
5985	* the per-CPU value later will give us the same value as we update the
5986	* per-CPU variable in the preempt notifier handlers.
5987	*/
5988	struct kvm_vcpu kvm_get_running_vcpu(void*)
5989	{
5990	struct kvm_vcpu *vcpu;
5991
5992	preempt_disable();
5993	vcpu = __this_cpu_read(kvm_running_vcpu);
5994	preempt_enable();
5995
5996	return vcpu;
5997	}
5998	EXPORT_SYMBOL_GPL(kvm_get_running_vcpu);
5999
6000	/**
6001	* kvm_get_running_vcpus - get the per-CPU array of currently running vcpus.
6002	*/
6003	struct kvm_vcpu * __percpu kvm_get_running_vcpus(void*)
6004	{
6005	return &kvm_running_vcpu;
6006	}
6007
6008	#ifdef CONFIG_GUEST_PERF_EVENTS
6009	static unsigned int kvm_guest_state(void)
6010	{
6011	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();
6012	unsigned int state;
6013
6014	if (!kvm_arch_pmi_in_guest(vcpu))
6015	return `0`;
6016
6017	state = PERF_GUEST_ACTIVE;
6018	if (!kvm_arch_vcpu_in_kernel(vcpu))
6019	state \|= PERF_GUEST_USER;
6020
6021	return state;
6022	}
6023
/*
 * perf ->get_ip callback: the instruction pointer reported by
 * kvm_arch_vcpu_get_ip() for the running vCPU, or 0 (with a one-time warning)
 * if kvm_arch_pmi_in_guest() is false.
 */
static unsigned long kvm_guest_get_ip(void)
{
	struct kvm_vcpu *vcpu = kvm_get_running_vcpu();

	/* Retrieving the IP must be guarded by a call to kvm_guest_state(). */
	if (WARN_ON_ONCE(!kvm_arch_pmi_in_guest(vcpu)))
		return 0;

	return kvm_arch_vcpu_get_ip(vcpu);
}
6034
6035	static struct perf_guest_info_callbacks kvm_guest_cbs = {
6036	.state = kvm_guest_state,
6037	.get_ip = kvm_guest_get_ip,
6038	.handle_intel_pt_intr = NULL,
6039	};
6040
6041	void kvm_register_perf_callbacks(unsigned int (pt_intr_handler)(void*))
6042	{
6043	kvm_guest_cbs.handle_intel_pt_intr = pt_intr_handler;
6044	perf_register_guest_info_callbacks(cbs: &kvm_guest_cbs);
6045	}
6046	void kvm_unregister_perf_callbacks(void)
6047	{
6048	perf_unregister_guest_info_callbacks(cbs: &kvm_guest_cbs);
6049	}
6050	#endif
6051
6052	int kvm_init(unsigned vcpu_size, unsigned vcpu_align, struct module *module)
6053	{
6054	int r;
6055	int cpu;
6056
6057	#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6058	r = cpuhp_setup_state_nocalls(state: CPUHP_AP_KVM_ONLINE, name: "kvm/cpu:online",
6059	startup: kvm_online_cpu, teardown: kvm_offline_cpu);
6060	if (r)
6061	return r;
6062
6063	register_syscore_ops(ops: &kvm_syscore_ops);
6064	#endif
6065
6066	/ A kmem cache lets us meet the alignment requirements of fx_save. /
6067	if (!vcpu_align)
6068	vcpu_align = __alignof__(struct kvm_vcpu);
6069	kvm_vcpu_cache =
6070	kmem_cache_create_usercopy(name: "kvm_vcpu", size: vcpu_size, align: vcpu_align,
6071	SLAB_ACCOUNT,
6072	offsetof(struct kvm_vcpu, arch),
6073	offsetofend(struct kvm_vcpu, stats_id)
6074	- offsetof(struct kvm_vcpu, arch),
6075	NULL);
6076	if (!kvm_vcpu_cache) {
6077	r = -ENOMEM;
6078	goto err_vcpu_cache;
6079	}
6080
6081	for_each_possible_cpu(cpu) {
6082	if (!alloc_cpumask_var_node(mask: &per_cpu(cpu_kick_mask, cpu),
6083	GFP_KERNEL, cpu_to_node(cpu))) {
6084	r = -ENOMEM;
6085	goto err_cpu_kick_mask;
6086	}
6087	}
6088
6089	r = kvm_irqfd_init();
6090	if (r)
6091	goto err_irqfd;
6092
6093	r = kvm_async_pf_init();
6094	if (r)
6095	goto err_async_pf;
6096
6097	kvm_chardev_ops.owner = module;
6098
6099	kvm_preempt_ops.sched_in = kvm_sched_in;
6100	kvm_preempt_ops.sched_out = kvm_sched_out;
6101
6102	kvm_init_debug();
6103
6104	r = kvm_vfio_ops_init();
6105	if (WARN_ON_ONCE(r))
6106	goto err_vfio;
6107
6108	/*
6109	* Registration _must_ be the very last thing done, as this exposes
6110	* /dev/kvm to userspace, i.e. all infrastructure must be setup!
6111	*/
6112	r = misc_register(misc: &kvm_dev);
6113	if (r) {
6114	pr_err("kvm: misc device register failed\n");
6115	goto err_register;
6116	}
6117
6118	return `0`;
6119
6120	err_register:
6121	kvm_vfio_ops_exit();
6122	err_vfio:
6123	kvm_async_pf_deinit();
6124	err_async_pf:
6125	kvm_irqfd_exit();
6126	err_irqfd:
6127	err_cpu_kick_mask:
6128	for_each_possible_cpu(cpu)
6129	free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6130	kmem_cache_destroy(s: kvm_vcpu_cache);
6131	err_vcpu_cache:
6132	#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6133	unregister_syscore_ops(ops: &kvm_syscore_ops);
6134	cpuhp_remove_state_nocalls(state: CPUHP_AP_KVM_ONLINE);
6135	#endif
6136	return r;
6137	}
6138	EXPORT_SYMBOL_GPL(kvm_init);
6139
6140	void kvm_exit(void)
6141	{
6142	int cpu;
6143
6144	/*
6145	* Note, unregistering /dev/kvm doesn't strictly need to come first,
6146	* fops_get(), a.k.a. try_module_get(), prevents acquiring references
6147	* to KVM while the module is being stopped.
6148	*/
6149	misc_deregister(misc: &kvm_dev);
6150
6151	debugfs_remove_recursive(dentry: kvm_debugfs_dir);
6152	for_each_possible_cpu(cpu)
6153	free_cpumask_var(per_cpu(cpu_kick_mask, cpu));
6154	kmem_cache_destroy(s: kvm_vcpu_cache);
6155	kvm_vfio_ops_exit();
6156	kvm_async_pf_deinit();
6157	#ifdef CONFIG_KVM_GENERIC_HARDWARE_ENABLING
6158	unregister_syscore_ops(ops: &kvm_syscore_ops);
6159	cpuhp_remove_state_nocalls(state: CPUHP_AP_KVM_ONLINE);
6160	#endif
6161	kvm_irqfd_exit();
6162	}
6163	EXPORT_SYMBOL_GPL(kvm_exit);
6164
6165	struct kvm_vm_worker_thread_context {
6166	struct kvm *kvm;
6167	struct task_struct *parent;
6168	struct completion init_done;
6169	kvm_vm_thread_fn_t thread_fn;
6170	uintptr_t data;
6171	int err;
6172	};
6173
6174	static int kvm_vm_worker_thread(void *context)
6175	{
6176	/*
6177	* The init_context is allocated on the stack of the parent thread, so
6178	* we have to locally copy anything that is needed beyond initialization
6179	*/
6180	struct kvm_vm_worker_thread_context *init_context = context;
6181	struct task_struct *parent;
6182	struct kvm *kvm = init_context->kvm;
6183	kvm_vm_thread_fn_t thread_fn = init_context->thread_fn;
6184	uintptr_t data = init_context->data;
6185	int err;
6186
6187	err = kthread_park(current);
6188	/ kthread_park(current) is never supposed to return an error /
6189	WARN_ON(err != `0`);
6190	if (err)
6191	goto init_complete;
6192
6193	err = cgroup_attach_task_all(from: init_context->parent, current);
6194	if (err) {
6195	kvm_err("%s: cgroup_attach_task_all failed with err %d\n",
6196	__func__, err);
6197	goto init_complete;
6198	}
6199
6200	set_user_nice(current, nice: task_nice(p: init_context->parent));
6201
6202	init_complete:
6203	init_context->err = err;
6204	complete(&init_context->init_done);
6205	init_context = NULL;
6206
6207	if (err)
6208	goto out;
6209
6210	/ Wait to be woken up by the spawner before proceeding. /
6211	kthread_parkme();
6212
6213	if (!kthread_should_stop())
6214	err = thread_fn(kvm, data);
6215
6216	out:
6217	/*
6218	* Move kthread back to its original cgroup to prevent it lingering in
6219	* the cgroup of the VM process, after the latter finishes its
6220	* execution.
6221	*
6222	* kthread_stop() waits on the 'exited' completion condition which is
6223	* set in exit_mm(), via mm_release(), in do_exit(). However, the
6224	* kthread is removed from the cgroup in the cgroup_exit() which is
6225	* called after the exit_mm(). This causes the kthread_stop() to return
6226	* before the kthread actually quits the cgroup.
6227	*/
6228	rcu_read_lock();
6229	parent = rcu_dereference(current->real_parent);
6230	get_task_struct(t: parent);
6231	rcu_read_unlock();
6232	cgroup_attach_task_all(from: parent, current);
6233	put_task_struct(t: parent);
6234
6235	return err;
6236	}
6237
6238	int kvm_vm_create_worker_thread(struct kvm *kvm, kvm_vm_thread_fn_t thread_fn,
6239	uintptr_t data, const char *name,
6240	struct task_struct **thread_ptr)
6241	{
6242	struct kvm_vm_worker_thread_context init_context = {};
6243	struct task_struct *thread;
6244
6245	*thread_ptr = NULL;
6246	init_context.kvm = kvm;
6247	init_context.parent = current;
6248	init_context.thread_fn = thread_fn;
6249	init_context.data = data;
6250	init_completion(x: &init_context.init_done);
6251
6252	thread = kthread_run(kvm_vm_worker_thread, &init_context,
6253	"%s-%d", name, task_pid_nr(current));
6254	if (IS_ERR(ptr: thread))
6255	return PTR_ERR(ptr: thread);
6256
6257	/ kthread_run is never supposed to return NULL /
6258	WARN_ON(thread == NULL);
6259
6260	wait_for_completion(&init_context.init_done);
6261
6262	if (!init_context.err)
6263	*thread_ptr = thread;
6264
6265	return init_context.err;
6266	}
6267

source code of linux/virt/kvm/kvm_main.c