xen.c source code [linux/arch/x86/kvm/xen.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
4	* Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
5	*
6	* KVM Xen emulation
7	*/
8	#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
9
10	#include "x86.h"
11	#include "xen.h"
12	#include "hyperv.h"
13	#include "irq.h"
14
15	#include <linux/eventfd.h>
16	#include <linux/kvm_host.h>
17	#include <linux/sched/stat.h>
18
19	#include <trace/events/kvm.h>
20	#include <xen/interface/xen.h>
21	#include <xen/interface/vcpu.h>
22	#include <xen/interface/version.h>
23	#include <xen/interface/event_channel.h>
24	#include <xen/interface/sched.h>
25
26	#include <asm/xen/cpuid.h>
27	#include <asm/pvclock.h>
28
29	#include "cpuid.h"
30	#include "trace.h"
31
32	static int kvm_xen_set_evtchn(struct kvm_xen_evtchn xe, struct* kvm *kvm);
33	static int kvm_xen_setattr_evtchn(struct kvm kvm, struct* kvm_xen_hvm_attr *data);
34	static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu vcpu, u64 param, u64 r);
35
36	DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);
37
38	static int kvm_xen_shared_info_init(struct kvm *kvm)
39	{
40	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
41	struct pvclock_wall_clock *wc;
42	u32 *wc_sec_hi;
43	u32 wc_version;
44	u64 wall_nsec;
45	int ret = `0`;
46	int idx = srcu_read_lock(ssp: &kvm->srcu);
47
48	read_lock_irq(&gpc->lock);
49	while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
50	read_unlock_irq(&gpc->lock);
51
52	ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
53	if (ret)
54	goto out;
55
56	read_lock_irq(&gpc->lock);
57	}
58
59	/*
60	* This code mirrors kvm_write_wall_clock() except that it writes
61	* directly through the pfn cache and doesn't mark the page dirty.
62	*/
63	wall_nsec = kvm_get_wall_clock_epoch(kvm);
64
65	/ Paranoia checks on the 32-bit struct layout /
66	BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != `0x900`);
67	BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != `0x924`);
68	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != `0`);
69
70	#ifdef CONFIG_X86_64
71	/ Paranoia checks on the 64-bit struct layout /
72	BUILD_BUG_ON(offsetof(struct shared_info, wc) != `0xc00`);
73	BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != `0xc0c`);
74
75	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
76	struct shared_info *shinfo = gpc->khva;
77
78	wc_sec_hi = &shinfo->wc_sec_hi;
79	wc = &shinfo->wc;
80	} else
81	#endif
82	{
83	struct compat_shared_info *shinfo = gpc->khva;
84
85	wc_sec_hi = &shinfo->arch.wc_sec_hi;
86	wc = &shinfo->wc;
87	}
88
89	/ Increment and ensure an odd value /
90	wc_version = wc->version = (wc->version + `1`) \| `1`;
91	smp_wmb();
92
93	wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
94	wc->sec = (u32)wall_nsec;
95	*wc_sec_hi = wall_nsec >> `32`;
96	smp_wmb();
97
98	wc->version = wc_version + `1`;
99	read_unlock_irq(&gpc->lock);
100
101	kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);
102
103	out:
104	srcu_read_unlock(ssp: &kvm->srcu, idx);
105	return ret;
106	}
107
108	void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
109	{
110	if (atomic_read(v: &vcpu->arch.xen.timer_pending) > `0`) {
111	struct kvm_xen_evtchn e;
112
113	e.vcpu_id = vcpu->vcpu_id;
114	e.vcpu_idx = vcpu->vcpu_idx;
115	e.port = vcpu->arch.xen.timer_virq;
116	e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
117
118	kvm_xen_set_evtchn(xe: &e, kvm: vcpu->kvm);
119
120	vcpu->arch.xen.timer_expires = `0`;
121	atomic_set(v: &vcpu->arch.xen.timer_pending, i: `0`);
122	}
123	}
124
125	static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
126	{
127	struct kvm_vcpu vcpu = container_of(timer, struct* kvm_vcpu,
128	arch.xen.timer);
129	struct kvm_xen_evtchn e;
130	int rc;
131
132	if (atomic_read(v: &vcpu->arch.xen.timer_pending))
133	return HRTIMER_NORESTART;
134
135	e.vcpu_id = vcpu->vcpu_id;
136	e.vcpu_idx = vcpu->vcpu_idx;
137	e.port = vcpu->arch.xen.timer_virq;
138	e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
139
140	rc = kvm_xen_set_evtchn_fast(xe: &e, kvm: vcpu->kvm);
141	if (rc != -EWOULDBLOCK) {
142	vcpu->arch.xen.timer_expires = `0`;
143	return HRTIMER_NORESTART;
144	}
145
146	atomic_inc(v: &vcpu->arch.xen.timer_pending);
147	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
148	kvm_vcpu_kick(vcpu);
149
150	return HRTIMER_NORESTART;
151	}
152
153	static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
154	bool linux_wa)
155	{
156	int64_t kernel_now, delta;
157	uint64_t guest_now;
158
159	/*
160	* The guest provides the requested timeout in absolute nanoseconds
161	* of the KVM clock — as it sees it, based on the scaled TSC and
162	* the pvclock information provided by KVM.
163	*
164	* The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
165	* so use CLOCK_MONOTONIC. In the timescales covered by timers, the
166	* difference won't matter much as there is no cumulative effect.
167	*
168	* Calculate the time for some arbitrary point in time around "now"
169	* in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
170	* delta between the kvmclock "now" value and the guest's requested
171	* timeout, apply the "Linux workaround" described below, and add
172	* the resulting delta to the CLOCK_MONOTONIC "now" value, to get
173	* the absolute CLOCK_MONOTONIC time at which the timer should
174	* fire.
175	*/
176	if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
177	static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
178	uint64_t host_tsc, guest_tsc;
179
180	if (!IS_ENABLED(CONFIG_64BIT) \|\|
181	!kvm_get_monotonic_and_clockread(kernel_ns: &kernel_now, tsc_timestamp: &host_tsc)) {
182	/*
183	* Don't fall back to get_kvmclock_ns() because it's
184	* broken; it has a systemic error in its results
185	* because it scales directly from host TSC to
186	* nanoseconds, and doesn't scale first to guest TSC
187	* and then to nanoseconds as the guest does.
188	*
189	* There is a small error introduced here because time
190	* continues to elapse between the ktime_get() and the
191	* subsequent rdtsc(). But not the systemic drift due
192	* to get_kvmclock_ns().
193	*/
194	kernel_now = ktime_get(); / This is CLOCK_MONOTONIC /
195	host_tsc = rdtsc();
196	}
197
198	/ Calculate the guest kvmclock as the guest would do it. /
199	guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
200	guest_now = __pvclock_read_cycles(src: &vcpu->arch.hv_clock,
201	tsc: guest_tsc);
202	} else {
203	/*
204	* Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
205	*
206	* Also if the guest PV clock hasn't been set up yet, as is
207	* likely to be the case during migration when the vCPU has
208	* not been run yet. It would be possible to calculate the
209	* scaling factors properly in that case but there's not much
210	* point in doing so. The get_kvmclock_ns() drift accumulates
211	* over time, so it's OK to use it at startup. Besides, on
212	* migration there's going to be a little bit of skew in the
213	* precise moment at which timers fire anyway. Often they'll
214	* be in the "past" by the time the VM is running again after
215	* migration.
216	*/
217	guest_now = get_kvmclock_ns(kvm: vcpu->kvm);
218	kernel_now = ktime_get();
219	}
220
221	delta = guest_abs - guest_now;
222
223	/*
224	* Xen has a 'Linux workaround' in do_set_timer_op() which checks for
225	* negative absolute timeout values (caused by integer overflow), and
226	* for values about 13 days in the future (2^50ns) which would be
227	* caused by jiffies overflow. For those cases, Xen sets the timeout
228	* 100ms in the future (not too soon, since if a guest really did
229	* set a long timeout on purpose we don't want to keep churning CPU
230	* time by waking it up). Emulate Xen's workaround when starting the
231	* timer in response to __HYPERVISOR_set_timer_op.
232	*/
233	if (linux_wa &&
234	unlikely((int64_t)guest_abs < `0` \|\|
235	(delta > `0` && (uint32_t) (delta >> `50`) != `0`))) {
236	delta = `100` * NSEC_PER_MSEC;
237	guest_abs = guest_now + delta;
238	}
239
240	/*
241	* Avoid races with the old timer firing. Checking timer_expires
242	* to avoid calling hrtimer_cancel() will only have false positives
243	* so is fine.
244	*/
245	if (vcpu->arch.xen.timer_expires)
246	hrtimer_cancel(timer: &vcpu->arch.xen.timer);
247
248	atomic_set(v: &vcpu->arch.xen.timer_pending, i: `0`);
249	vcpu->arch.xen.timer_expires = guest_abs;
250
251	if (delta <= `0`)
252	xen_timer_callback(timer: &vcpu->arch.xen.timer);
253	else
254	hrtimer_start(timer: &vcpu->arch.xen.timer,
255	ktime_add_ns(kernel_now, delta),
256	mode: HRTIMER_MODE_ABS_HARD);
257	}
258
259	static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
260	{
261	hrtimer_cancel(timer: &vcpu->arch.xen.timer);
262	vcpu->arch.xen.timer_expires = `0`;
263	atomic_set(v: &vcpu->arch.xen.timer_pending, i: `0`);
264	}
265
266	static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
267	{
268	hrtimer_init(timer: &vcpu->arch.xen.timer, CLOCK_MONOTONIC,
269	mode: HRTIMER_MODE_ABS_HARD);
270	vcpu->arch.xen.timer.function = xen_timer_callback;
271	}
272
273	static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
274	{
275	struct kvm_vcpu_xen *vx = &v->arch.xen;
276	struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
277	struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
278	size_t user_len, user_len1, user_len2;
279	struct vcpu_runstate_info rs;
280	unsigned long flags;
281	size_t times_ofs;
282	uint8_t *update_bit = NULL;
283	uint64_t entry_time;
284	uint64_t *rs_times;
285	int *rs_state;
286
287	/*
288	* The only difference between 32-bit and 64-bit versions of the
289	* runstate struct is the alignment of uint64_t in 32-bit, which
290	* means that the 64-bit version has an additional 4 bytes of
291	* padding after the first field 'state'. Let's be really really
292	* paranoid about that, and matching it with our internal data
293	* structures that we memcpy into it...
294	*/
295	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != `0`);
296	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != `0`);
297	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != `0x2c`);
298	#ifdef CONFIG_X86_64
299	/*
300	* The 64-bit structure has 4 bytes of padding before 'state_entry_time'
301	* so each subsequent field is shifted by 4, and it's 4 bytes longer.
302	*/
303	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
304	offsetof(struct compat_vcpu_runstate_info, state_entry_time) + `4`);
305	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
306	offsetof(struct compat_vcpu_runstate_info, time) + `4`);
307	BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != `0x2c` + `4`);
308	#endif
309	/*
310	* The state field is in the same place at the start of both structs,
311	* and is the same size (int) as vx->current_runstate.
312	*/
313	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
314	offsetof(struct compat_vcpu_runstate_info, state));
315	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
316	sizeof(vx->current_runstate));
317	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
318	sizeof(vx->current_runstate));
319
320	/*
321	* The state_entry_time field is 64 bits in both versions, and the
322	* XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
323	* is little-endian means that it's in the last byte of the word.
324	* That detail is important later.
325	*/
326	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
327	sizeof(uint64_t));
328	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
329	sizeof(uint64_t));
330	BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> `56`) != `0x80`);
331
332	/*
333	* The time array is four 64-bit quantities in both versions, matching
334	* the vx->runstate_times and immediately following state_entry_time.
335	*/
336	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
337	offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
338	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
339	offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
340	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
341	sizeof_field(struct compat_vcpu_runstate_info, time));
342	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
343	sizeof(vx->runstate_times));
344
345	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
346	user_len = sizeof(struct vcpu_runstate_info);
347	times_ofs = offsetof(struct vcpu_runstate_info,
348	state_entry_time);
349	} else {
350	user_len = sizeof(struct compat_vcpu_runstate_info);
351	times_ofs = offsetof(struct compat_vcpu_runstate_info,
352	state_entry_time);
353	}
354
355	/*
356	* There are basically no alignment constraints. The guest can set it
357	* up so it crosses from one page to the next, and at arbitrary byte
358	* alignment (and the 32-bit ABI doesn't align the 64-bit integers
359	* anyway, even if the overall struct had been 64-bit aligned).
360	*/
361	if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
362	user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
363	user_len2 = user_len - user_len1;
364	} else {
365	user_len1 = user_len;
366	user_len2 = `0`;
367	}
368	BUG_ON(user_len1 + user_len2 != user_len);
369
370	retry:
371	/*
372	* Attempt to obtain the GPC lock on both (if there are two)
373	* gfn_to_pfn caches that cover the region.
374	*/
375	if (atomic) {
376	local_irq_save(flags);
377	if (!read_trylock(&gpc1->lock)) {
378	local_irq_restore(flags);
379	return;
380	}
381	} else {
382	read_lock_irqsave(&gpc1->lock, flags);
383	}
384	while (!kvm_gpc_check(gpc: gpc1, len: user_len1)) {
385	read_unlock_irqrestore(&gpc1->lock, flags);
386
387	/ When invoked from kvm_sched_out() we cannot sleep /
388	if (atomic)
389	return;
390
391	if (kvm_gpc_refresh(gpc: gpc1, len: user_len1))
392	return;
393
394	read_lock_irqsave(&gpc1->lock, flags);
395	}
396
397	if (likely(!user_len2)) {
398	/*
399	* Set up three pointers directly to the runstate_info
400	* struct in the guest (via the GPC).
401	*
402	* • @rs_state → state field
403	* • @rs_times → state_entry_time field.
404	* • @update_bit → last byte of state_entry_time, which
405	* contains the XEN_RUNSTATE_UPDATE bit.
406	*/
407	rs_state = gpc1->khva;
408	rs_times = gpc1->khva + times_ofs;
409	if (v->kvm->arch.xen.runstate_update_flag)
410	update_bit = ((void *)(&rs_times[`1`])) - `1`;
411	} else {
412	/*
413	* The guest's runstate_info is split across two pages and we
414	* need to hold and validate both GPCs simultaneously. We can
415	* declare a lock ordering GPC1 > GPC2 because nothing else
416	* takes them more than one at a time. Set a subclass on the
417	* gpc1 lock to make lockdep shut up about it.
418	*/
419	lock_set_subclass(lock: &gpc1->lock.dep_map, subclass: `1`, _THIS_IP_);
420	if (atomic) {
421	if (!read_trylock(&gpc2->lock)) {
422	read_unlock_irqrestore(&gpc1->lock, flags);
423	return;
424	}
425	} else {
426	read_lock(&gpc2->lock);
427	}
428
429	if (!kvm_gpc_check(gpc: gpc2, len: user_len2)) {
430	read_unlock(&gpc2->lock);
431	read_unlock_irqrestore(&gpc1->lock, flags);
432
433	/ When invoked from kvm_sched_out() we cannot sleep /
434	if (atomic)
435	return;
436
437	/*
438	* Use kvm_gpc_activate() here because if the runstate
439	* area was configured in 32-bit mode and only extends
440	* to the second page now because the guest changed to
441	* 64-bit mode, the second GPC won't have been set up.
442	*/
443	if (kvm_gpc_activate(gpc: gpc2, gpa: gpc1->gpa + user_len1,
444	len: user_len2))
445	return;
446
447	/*
448	* We dropped the lock on GPC1 so we have to go all the
449	* way back and revalidate that too.
450	*/
451	goto retry;
452	}
453
454	/*
455	* In this case, the runstate_info struct will be assembled on
456	* the kernel stack (compat or not as appropriate) and will
457	* be copied to GPC1/GPC2 with a dual memcpy. Set up the three
458	* rs pointers accordingly.
459	*/
460	rs_times = &rs.state_entry_time;
461
462	/*
463	* The rs_state pointer points to the start of what we'll
464	* copy to the guest, which in the case of a compat guest
465	* is the 32-bit field that the compiler thinks is padding.
466	*/
467	rs_state = ((void *)rs_times) - times_ofs;
468
469	/*
470	* The update_bit is still directly in the guest memory,
471	* via one GPC or the other.
472	*/
473	if (v->kvm->arch.xen.runstate_update_flag) {
474	if (user_len1 >= times_ofs + sizeof(uint64_t))
475	update_bit = gpc1->khva + times_ofs +
476	sizeof(uint64_t) - `1`;
477	else
478	update_bit = gpc2->khva + times_ofs +
479	sizeof(uint64_t) - `1` - user_len1;
480	}
481
482	#ifdef CONFIG_X86_64
483	/*
484	* Don't leak kernel memory through the padding in the 64-bit
485	* version of the struct.
486	*/
487	memset(&rs, `0`, offsetof(struct vcpu_runstate_info, state_entry_time));
488	#endif
489	}
490
491	/*
492	* First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the
493	* state_entry_time field, directly in the guest. We need to set
494	* that (and write-barrier) before writing to the rest of the
495	* structure, and clear it last. Just as Xen does, we address the
496	* single byte in which it resides because it might be in a
497	* different cache line to the rest of the 64-bit word, due to
498	* the (lack of) alignment constraints.
499	*/
500	entry_time = vx->runstate_entry_time;
501	if (update_bit) {
502	entry_time \|= XEN_RUNSTATE_UPDATE;
503	*update_bit = (vx->runstate_entry_time \| XEN_RUNSTATE_UPDATE) >> `56`;
504	smp_wmb();
505	}
506
507	/*
508	* Now assemble the actual structure, either on our kernel stack
509	* or directly in the guest according to how the rs_state and
510	* rs_times pointers were set up above.
511	*/
512	*rs_state = vx->current_runstate;
513	rs_times[`0`] = entry_time;
514	memcpy(rs_times + `1`, vx->runstate_times, sizeof(vx->runstate_times));
515
516	/ For the split case, we have to then copy it to the guest. /
517	if (user_len2) {
518	memcpy(gpc1->khva, rs_state, user_len1);
519	memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2);
520	}
521	smp_wmb();
522
523	/ Finally, clear the XEN_RUNSTATE_UPDATE bit. /
524	if (update_bit) {
525	entry_time &= ~XEN_RUNSTATE_UPDATE;
526	*update_bit = entry_time >> `56`;
527	smp_wmb();
528	}
529
530	if (user_len2) {
531	kvm_gpc_mark_dirty_in_slot(gpc: gpc2);
532	read_unlock(&gpc2->lock);
533	}
534
535	kvm_gpc_mark_dirty_in_slot(gpc: gpc1);
536	read_unlock_irqrestore(&gpc1->lock, flags);
537	}
538
539	void kvm_xen_update_runstate(struct kvm_vcpu v, int* state)
540	{
541	struct kvm_vcpu_xen *vx = &v->arch.xen;
542	u64 now = get_kvmclock_ns(kvm: v->kvm);
543	u64 delta_ns = now - vx->runstate_entry_time;
544	u64 run_delay = current->sched_info.run_delay;
545
546	if (unlikely(!vx->runstate_entry_time))
547	vx->current_runstate = RUNSTATE_offline;
548
549	/*
550	* Time waiting for the scheduler isn't "stolen" if the
551	* vCPU wasn't running anyway.
552	*/
553	if (vx->current_runstate == RUNSTATE_running) {
554	u64 steal_ns = run_delay - vx->last_steal;
555
556	delta_ns -= steal_ns;
557
558	vx->runstate_times[RUNSTATE_runnable] += steal_ns;
559	}
560	vx->last_steal = run_delay;
561
562	vx->runstate_times[vx->current_runstate] += delta_ns;
563	vx->current_runstate = state;
564	vx->runstate_entry_time = now;
565
566	if (vx->runstate_cache.active)
567	kvm_xen_update_runstate_guest(v, atomic: state == RUNSTATE_runnable);
568	}
569
570	void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
571	{
572	struct kvm_lapic_irq irq = { };
573
574	irq.dest_id = v->vcpu_id;
575	irq.vector = v->arch.xen.upcall_vector;
576	irq.dest_mode = APIC_DEST_PHYSICAL;
577	irq.shorthand = APIC_DEST_NOSHORT;
578	irq.delivery_mode = APIC_DM_FIXED;
579	irq.level = `1`;
580
581	kvm_irq_delivery_to_apic(kvm: v->kvm, NULL, irq: &irq, NULL);
582	}
583
584	/*
585	* On event channel delivery, the vcpu_info may not have been accessible.
586	* In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
587	* need to be marked into the vcpu_info (and evtchn_upcall_pending set).
588	* Do so now that we can sleep in the context of the vCPU to bring the
589	* page in, and refresh the pfn cache for it.
590	*/
591	void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
592	{
593	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
594	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
595	unsigned long flags;
596
597	if (!evtchn_pending_sel)
598	return;
599
600	/*
601	* Yes, this is an open-coded loop. But that's just what put_user()
602	* does anyway. Page it in and retry the instruction. We're just a
603	* little more honest about it.
604	*/
605	read_lock_irqsave(&gpc->lock, flags);
606	while (!kvm_gpc_check(gpc, len: sizeof(struct vcpu_info))) {
607	read_unlock_irqrestore(&gpc->lock, flags);
608
609	if (kvm_gpc_refresh(gpc, len: sizeof(struct vcpu_info)))
610	return;
611
612	read_lock_irqsave(&gpc->lock, flags);
613	}
614
615	/ Now gpc->khva is a valid kernel address for the vcpu_info /
616	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
617	struct vcpu_info *vi = gpc->khva;
618
619	asm volatile(LOCK_PREFIX "orq %0, %1\n"
620	"notq %0\n"
621	LOCK_PREFIX "andq %0, %2\n"
622	: "=r" (evtchn_pending_sel),
623	"+m" (vi->evtchn_pending_sel),
624	"+m" (v->arch.xen.evtchn_pending_sel)
625	: "0" (evtchn_pending_sel));
626	WRITE_ONCE(vi->evtchn_upcall_pending, `1`);
627	} else {
628	u32 evtchn_pending_sel32 = evtchn_pending_sel;
629	struct compat_vcpu_info *vi = gpc->khva;
630
631	asm volatile(LOCK_PREFIX "orl %0, %1\n"
632	"notl %0\n"
633	LOCK_PREFIX "andl %0, %2\n"
634	: "=r" (evtchn_pending_sel32),
635	"+m" (vi->evtchn_pending_sel),
636	"+m" (v->arch.xen.evtchn_pending_sel)
637	: "0" (evtchn_pending_sel32));
638	WRITE_ONCE(vi->evtchn_upcall_pending, `1`);
639	}
640
641	kvm_gpc_mark_dirty_in_slot(gpc);
642	read_unlock_irqrestore(&gpc->lock, flags);
643
644	/ For the per-vCPU lapic vector, deliver it as MSI. /
645	if (v->arch.xen.upcall_vector)
646	kvm_xen_inject_vcpu_vector(v);
647	}
648
649	int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
650	{
651	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
652	unsigned long flags;
653	u8 rc = `0`;
654
655	/*
656	* If the global upcall vector (HVMIRQ_callback_vector) is set and
657	* the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
658	*/
659
660	/ No need for compat handling here /
661	BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
662	offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
663	BUILD_BUG_ON(sizeof(rc) !=
664	sizeof_field(struct vcpu_info, evtchn_upcall_pending));
665	BUILD_BUG_ON(sizeof(rc) !=
666	sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));
667
668	read_lock_irqsave(&gpc->lock, flags);
669	while (!kvm_gpc_check(gpc, len: sizeof(struct vcpu_info))) {
670	read_unlock_irqrestore(&gpc->lock, flags);
671
672	/*
673	* This function gets called from kvm_vcpu_block() after setting the
674	* task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
675	* from a HLT. So we really mustn't sleep. If the page ended up absent
676	* at that point, just return 1 in order to trigger an immediate wake,
677	* and we'll end up getting called again from a context where we can
678	* fault in the page and wait for it.
679	*/
680	if (in_atomic() \|\| !task_is_running(current))
681	return `1`;
682
683	if (kvm_gpc_refresh(gpc, len: sizeof(struct vcpu_info))) {
684	/*
685	* If this failed, userspace has screwed up the
686	* vcpu_info mapping. No interrupts for you.
687	*/
688	return `0`;
689	}
690	read_lock_irqsave(&gpc->lock, flags);
691	}
692
693	rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
694	read_unlock_irqrestore(&gpc->lock, flags);
695	return rc;
696	}
697
698	int kvm_xen_hvm_set_attr(struct kvm kvm, struct* kvm_xen_hvm_attr *data)
699	{
700	int r = -ENOENT;
701
702
703	switch (data->type) {
704	case KVM_XEN_ATTR_TYPE_LONG_MODE:
705	if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
706	r = -EINVAL;
707	} else {
708	mutex_lock(&kvm->arch.xen.xen_lock);
709	kvm->arch.xen.long_mode = !!data->u.long_mode;
710
711	/*
712	* Re-initialize shared_info to put the wallclock in the
713	* correct place. Whilst it's not necessary to do this
714	* unless the mode is actually changed, it does no harm
715	* to make the call anyway.
716	*/
717	r = kvm->arch.xen.shinfo_cache.active ?
718	kvm_xen_shared_info_init(kvm) : `0`;
719	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
720	}
721	break;
722
723	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
724	case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: {
725	int idx;
726
727	mutex_lock(&kvm->arch.xen.xen_lock);
728
729	idx = srcu_read_lock(ssp: &kvm->srcu);
730
731	if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) {
732	gfn_t gfn = data->u.shared_info.gfn;
733
734	if (gfn == KVM_XEN_INVALID_GFN) {
735	kvm_gpc_deactivate(gpc: &kvm->arch.xen.shinfo_cache);
736	r = `0`;
737	} else {
738	r = kvm_gpc_activate(gpc: &kvm->arch.xen.shinfo_cache,
739	gpa: gfn_to_gpa(gfn), PAGE_SIZE);
740	}
741	} else {
742	void __user * hva = u64_to_user_ptr(data->u.shared_info.hva);
743
744	if (!PAGE_ALIGNED(hva) \|\| !access_ok(hva, PAGE_SIZE)) {
745	r = -EINVAL;
746	} else if (!hva) {
747	kvm_gpc_deactivate(gpc: &kvm->arch.xen.shinfo_cache);
748	r = `0`;
749	} else {
750	r = kvm_gpc_activate_hva(gpc: &kvm->arch.xen.shinfo_cache,
751	hva: (unsigned long)hva, PAGE_SIZE);
752	}
753	}
754
755	srcu_read_unlock(ssp: &kvm->srcu, idx);
756
757	if (!r && kvm->arch.xen.shinfo_cache.active)
758	r = kvm_xen_shared_info_init(kvm);
759
760	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
761	break;
762	}
763	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
764	if (data->u.vector && data->u.vector < `0x10`)
765	r = -EINVAL;
766	else {
767	mutex_lock(&kvm->arch.xen.xen_lock);
768	kvm->arch.xen.upcall_vector = data->u.vector;
769	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
770	r = `0`;
771	}
772	break;
773
774	case KVM_XEN_ATTR_TYPE_EVTCHN:
775	r = kvm_xen_setattr_evtchn(kvm, data);
776	break;
777
778	case KVM_XEN_ATTR_TYPE_XEN_VERSION:
779	mutex_lock(&kvm->arch.xen.xen_lock);
780	kvm->arch.xen.xen_version = data->u.xen_version;
781	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
782	r = `0`;
783	break;
784
785	case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
786	if (!sched_info_on()) {
787	r = -EOPNOTSUPP;
788	break;
789	}
790	mutex_lock(&kvm->arch.xen.xen_lock);
791	kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
792	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
793	r = `0`;
794	break;
795
796	default:
797	break;
798	}
799
800	return r;
801	}
802
803	int kvm_xen_hvm_get_attr(struct kvm kvm, struct* kvm_xen_hvm_attr *data)
804	{
805	int r = -ENOENT;
806
807	mutex_lock(&kvm->arch.xen.xen_lock);
808
809	switch (data->type) {
810	case KVM_XEN_ATTR_TYPE_LONG_MODE:
811	data->u.long_mode = kvm->arch.xen.long_mode;
812	r = `0`;
813	break;
814
815	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
816	if (kvm_gpc_is_gpa_active(gpc: &kvm->arch.xen.shinfo_cache))
817	data->u.shared_info.gfn = gpa_to_gfn(gpa: kvm->arch.xen.shinfo_cache.gpa);
818	else
819	data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
820	r = `0`;
821	break;
822
823	case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA:
824	if (kvm_gpc_is_hva_active(gpc: &kvm->arch.xen.shinfo_cache))
825	data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva;
826	else
827	data->u.shared_info.hva = `0`;
828	r = `0`;
829	break;
830
831	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
832	data->u.vector = kvm->arch.xen.upcall_vector;
833	r = `0`;
834	break;
835
836	case KVM_XEN_ATTR_TYPE_XEN_VERSION:
837	data->u.xen_version = kvm->arch.xen.xen_version;
838	r = `0`;
839	break;
840
841	case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
842	if (!sched_info_on()) {
843	r = -EOPNOTSUPP;
844	break;
845	}
846	data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag;
847	r = `0`;
848	break;
849
850	default:
851	break;
852	}
853
854	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
855	return r;
856	}
857
858	int kvm_xen_vcpu_set_attr(struct kvm_vcpu vcpu, struct* kvm_xen_vcpu_attr *data)
859	{
860	int idx, r = -ENOENT;
861
862	mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
863	idx = srcu_read_lock(ssp: &vcpu->kvm->srcu);
864
865	switch (data->type) {
866	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
867	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
868	/ No compat necessary here. /
869	BUILD_BUG_ON(sizeof(struct vcpu_info) !=
870	sizeof(struct compat_vcpu_info));
871	BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
872	offsetof(struct compat_vcpu_info, time));
873
874	if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) {
875	if (data->u.gpa == KVM_XEN_INVALID_GPA) {
876	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.vcpu_info_cache);
877	r = `0`;
878	break;
879	}
880
881	r = kvm_gpc_activate(gpc: &vcpu->arch.xen.vcpu_info_cache,
882	gpa: data->u.gpa, len: sizeof(struct vcpu_info));
883	} else {
884	if (data->u.hva == `0`) {
885	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.vcpu_info_cache);
886	r = `0`;
887	break;
888	}
889
890	r = kvm_gpc_activate_hva(gpc: &vcpu->arch.xen.vcpu_info_cache,
891	hva: data->u.hva, len: sizeof(struct vcpu_info));
892	}
893
894	if (!r)
895	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
896
897	break;
898
899	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
900	if (data->u.gpa == KVM_XEN_INVALID_GPA) {
901	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.vcpu_time_info_cache);
902	r = `0`;
903	break;
904	}
905
906	r = kvm_gpc_activate(gpc: &vcpu->arch.xen.vcpu_time_info_cache,
907	gpa: data->u.gpa,
908	len: sizeof(struct pvclock_vcpu_time_info));
909	if (!r)
910	kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
911	break;
912
913	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
914	size_t sz, sz1, sz2;
915
916	if (!sched_info_on()) {
917	r = -EOPNOTSUPP;
918	break;
919	}
920	if (data->u.gpa == KVM_XEN_INVALID_GPA) {
921	r = `0`;
922	deactivate_out:
923	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.runstate_cache);
924	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.runstate2_cache);
925	break;
926	}
927
928	/*
929	* If the guest switches to 64-bit mode after setting the runstate
930	* address, that's actually OK. kvm_xen_update_runstate_guest()
931	* will cope.
932	*/
933	if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
934	sz = sizeof(struct vcpu_runstate_info);
935	else
936	sz = sizeof(struct compat_vcpu_runstate_info);
937
938	/ How much fits in the (first) page? /
939	sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
940	r = kvm_gpc_activate(gpc: &vcpu->arch.xen.runstate_cache,
941	gpa: data->u.gpa, len: sz1);
942	if (r)
943	goto deactivate_out;
944
945	/ Either map the second page, or deactivate the second GPC /
946	if (sz1 >= sz) {
947	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.runstate2_cache);
948	} else {
949	sz2 = sz - sz1;
950	BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK);
951	r = kvm_gpc_activate(gpc: &vcpu->arch.xen.runstate2_cache,
952	gpa: data->u.gpa + sz1, len: sz2);
953	if (r)
954	goto deactivate_out;
955	}
956
957	kvm_xen_update_runstate_guest(v: vcpu, atomic: false);
958	break;
959	}
960	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
961	if (!sched_info_on()) {
962	r = -EOPNOTSUPP;
963	break;
964	}
965	if (data->u.runstate.state > RUNSTATE_offline) {
966	r = -EINVAL;
967	break;
968	}
969
970	kvm_xen_update_runstate(v: vcpu, state: data->u.runstate.state);
971	r = `0`;
972	break;
973
974	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
975	if (!sched_info_on()) {
976	r = -EOPNOTSUPP;
977	break;
978	}
979	if (data->u.runstate.state > RUNSTATE_offline) {
980	r = -EINVAL;
981	break;
982	}
983	if (data->u.runstate.state_entry_time !=
984	(data->u.runstate.time_running +
985	data->u.runstate.time_runnable +
986	data->u.runstate.time_blocked +
987	data->u.runstate.time_offline)) {
988	r = -EINVAL;
989	break;
990	}
991	if (get_kvmclock_ns(kvm: vcpu->kvm) <
992	data->u.runstate.state_entry_time) {
993	r = -EINVAL;
994	break;
995	}
996
997	vcpu->arch.xen.current_runstate = data->u.runstate.state;
998	vcpu->arch.xen.runstate_entry_time =
999	data->u.runstate.state_entry_time;
1000	vcpu->arch.xen.runstate_times[RUNSTATE_running] =
1001	data->u.runstate.time_running;
1002	vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
1003	data->u.runstate.time_runnable;
1004	vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
1005	data->u.runstate.time_blocked;
1006	vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
1007	data->u.runstate.time_offline;
1008	vcpu->arch.xen.last_steal = current->sched_info.run_delay;
1009	r = `0`;
1010	break;
1011
1012	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
1013	if (!sched_info_on()) {
1014	r = -EOPNOTSUPP;
1015	break;
1016	}
1017	if (data->u.runstate.state > RUNSTATE_offline &&
1018	data->u.runstate.state != (u64)-`1`) {
1019	r = -EINVAL;
1020	break;
1021	}
1022	/ The adjustment must add up /
1023	if (data->u.runstate.state_entry_time !=
1024	(data->u.runstate.time_running +
1025	data->u.runstate.time_runnable +
1026	data->u.runstate.time_blocked +
1027	data->u.runstate.time_offline)) {
1028	r = -EINVAL;
1029	break;
1030	}
1031
1032	if (get_kvmclock_ns(kvm: vcpu->kvm) <
1033	(vcpu->arch.xen.runstate_entry_time +
1034	data->u.runstate.state_entry_time)) {
1035	r = -EINVAL;
1036	break;
1037	}
1038
1039	vcpu->arch.xen.runstate_entry_time +=
1040	data->u.runstate.state_entry_time;
1041	vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
1042	data->u.runstate.time_running;
1043	vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
1044	data->u.runstate.time_runnable;
1045	vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
1046	data->u.runstate.time_blocked;
1047	vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
1048	data->u.runstate.time_offline;
1049
1050	if (data->u.runstate.state <= RUNSTATE_offline)
1051	kvm_xen_update_runstate(v: vcpu, state: data->u.runstate.state);
1052	else if (vcpu->arch.xen.runstate_cache.active)
1053	kvm_xen_update_runstate_guest(v: vcpu, atomic: false);
1054	r = `0`;
1055	break;
1056
1057	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
1058	if (data->u.vcpu_id >= KVM_MAX_VCPUS)
1059	r = -EINVAL;
1060	else {
1061	vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
1062	r = `0`;
1063	}
1064	break;
1065
1066	case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
1067	if (data->u.timer.port &&
1068	data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
1069	r = -EINVAL;
1070	break;
1071	}
1072
1073	if (!vcpu->arch.xen.timer.function)
1074	kvm_xen_init_timer(vcpu);
1075
1076	/ Stop the timer (if it's running) before changing the vector /
1077	kvm_xen_stop_timer(vcpu);
1078	vcpu->arch.xen.timer_virq = data->u.timer.port;
1079
1080	/ Start the timer if the new value has a valid vector+expiry. /
1081	if (data->u.timer.port && data->u.timer.expires_ns)
1082	kvm_xen_start_timer(vcpu, guest_abs: data->u.timer.expires_ns, linux_wa: false);
1083
1084	r = `0`;
1085	break;
1086
1087	case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
1088	if (data->u.vector && data->u.vector < `0x10`)
1089	r = -EINVAL;
1090	else {
1091	vcpu->arch.xen.upcall_vector = data->u.vector;
1092	r = `0`;
1093	}
1094	break;
1095
1096	default:
1097	break;
1098	}
1099
1100	srcu_read_unlock(ssp: &vcpu->kvm->srcu, idx);
1101	mutex_unlock(lock: &vcpu->kvm->arch.xen.xen_lock);
1102	return r;
1103	}
1104
1105	int kvm_xen_vcpu_get_attr(struct kvm_vcpu vcpu, struct* kvm_xen_vcpu_attr *data)
1106	{
1107	int r = -ENOENT;
1108
1109	mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
1110
1111	switch (data->type) {
1112	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
1113	if (kvm_gpc_is_gpa_active(gpc: &vcpu->arch.xen.vcpu_info_cache))
1114	data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
1115	else
1116	data->u.gpa = KVM_XEN_INVALID_GPA;
1117	r = `0`;
1118	break;
1119
1120	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
1121	if (kvm_gpc_is_hva_active(gpc: &vcpu->arch.xen.vcpu_info_cache))
1122	data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva;
1123	else
1124	data->u.hva = `0`;
1125	r = `0`;
1126	break;
1127
1128	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
1129	if (vcpu->arch.xen.vcpu_time_info_cache.active)
1130	data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
1131	else
1132	data->u.gpa = KVM_XEN_INVALID_GPA;
1133	r = `0`;
1134	break;
1135
1136	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
1137	if (!sched_info_on()) {
1138	r = -EOPNOTSUPP;
1139	break;
1140	}
1141	if (vcpu->arch.xen.runstate_cache.active) {
1142	data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
1143	r = `0`;
1144	}
1145	break;
1146
1147	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
1148	if (!sched_info_on()) {
1149	r = -EOPNOTSUPP;
1150	break;
1151	}
1152	data->u.runstate.state = vcpu->arch.xen.current_runstate;
1153	r = `0`;
1154	break;
1155
1156	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
1157	if (!sched_info_on()) {
1158	r = -EOPNOTSUPP;
1159	break;
1160	}
1161	data->u.runstate.state = vcpu->arch.xen.current_runstate;
1162	data->u.runstate.state_entry_time =
1163	vcpu->arch.xen.runstate_entry_time;
1164	data->u.runstate.time_running =
1165	vcpu->arch.xen.runstate_times[RUNSTATE_running];
1166	data->u.runstate.time_runnable =
1167	vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
1168	data->u.runstate.time_blocked =
1169	vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
1170	data->u.runstate.time_offline =
1171	vcpu->arch.xen.runstate_times[RUNSTATE_offline];
1172	r = `0`;
1173	break;
1174
1175	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
1176	r = -EINVAL;
1177	break;
1178
1179	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
1180	data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
1181	r = `0`;
1182	break;
1183
1184	case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
1185	/*
1186	* Ensure a consistent snapshot of state is captured, with a
1187	* timer either being pending, or the event channel delivered
1188	* to the corresponding bit in the shared_info. Not still
1189	* lurking in the timer_pending flag for deferred delivery.
1190	* Purely as an optimisation, if the timer_expires field is
1191	* zero, that means the timer isn't active (or even in the
1192	* timer_pending flag) and there is no need to cancel it.
1193	*/
1194	if (vcpu->arch.xen.timer_expires) {
1195	hrtimer_cancel(timer: &vcpu->arch.xen.timer);
1196	kvm_xen_inject_timer_irqs(vcpu);
1197	}
1198
1199	data->u.timer.port = vcpu->arch.xen.timer_virq;
1200	data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
1201	data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;
1202
1203	/*
1204	* The hrtimer may trigger and raise the IRQ immediately,
1205	* while the returned state causes it to be set up and
1206	* raised again on the destination system after migration.
1207	* That's fine, as the guest won't even have had a chance
1208	* to run and handle the interrupt. Asserting an already
1209	* pending event channel is idempotent.
1210	*/
1211	if (vcpu->arch.xen.timer_expires)
1212	hrtimer_start_expires(timer: &vcpu->arch.xen.timer,
1213	mode: HRTIMER_MODE_ABS_HARD);
1214
1215	r = `0`;
1216	break;
1217
1218	case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
1219	data->u.vector = vcpu->arch.xen.upcall_vector;
1220	r = `0`;
1221	break;
1222
1223	default:
1224	break;
1225	}
1226
1227	mutex_unlock(lock: &vcpu->kvm->arch.xen.xen_lock);
1228	return r;
1229	}
1230
1231	int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
1232	{
1233	struct kvm *kvm = vcpu->kvm;
1234	u32 page_num = data & ~PAGE_MASK;
1235	u64 page_addr = data & PAGE_MASK;
1236	bool lm = is_long_mode(vcpu);
1237	int r = `0`;
1238
1239	mutex_lock(&kvm->arch.xen.xen_lock);
1240	if (kvm->arch.xen.long_mode != lm) {
1241	kvm->arch.xen.long_mode = lm;
1242
1243	/*
1244	* Re-initialize shared_info to put the wallclock in the
1245	* correct place.
1246	*/
1247	if (kvm->arch.xen.shinfo_cache.active &&
1248	kvm_xen_shared_info_init(kvm))
1249	r = `1`;
1250	}
1251	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
1252
1253	if (r)
1254	return r;
1255
1256	/*
1257	* If Xen hypercall intercept is enabled, fill the hypercall
1258	* page with VMCALL/VMMCALL instructions since that's what
1259	* we catch. Else the VMM has provided the hypercall pages
1260	* with instructions of its own choosing, so use those.
1261	*/
1262	if (kvm_xen_hypercall_enabled(kvm)) {
1263	u8 instructions[`32`];
1264	int i;
1265
1266	if (page_num)
1267	return `1`;
1268
1269	/ mov imm32, %eax /
1270	instructions[`0`] = `0xb8`;
1271
1272	/ vmcall / vmmcall /
1273	static_call(kvm_x86_patch_hypercall)(vcpu, instructions + `5`);
1274
1275	/ ret /
1276	instructions[`8`] = `0xc3`;
1277
1278	/ int3 to pad /
1279	memset(instructions + `9`, `0xcc`, sizeof(instructions) - `9`);
1280
1281	for (i = `0`; i < PAGE_SIZE / sizeof(instructions); i++) {
1282	(u32 )&instructions[`1`] = i;
1283	if (kvm_vcpu_write_guest(vcpu,
1284	gpa: page_addr + (i * sizeof(instructions)),
1285	data: instructions, len: sizeof(instructions)))
1286	return `1`;
1287	}
1288	} else {
1289	/*
1290	* Note, truncation is a non-issue as 'lm' is guaranteed to be
1291	* false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
1292	*/
1293	hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
1294	: kvm->arch.xen_hvm_config.blob_addr_32;
1295	u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
1296	: kvm->arch.xen_hvm_config.blob_size_32;
1297	u8 *page;
1298	int ret;
1299
1300	if (page_num >= blob_size)
1301	return `1`;
1302
1303	blob_addr += page_num * PAGE_SIZE;
1304
1305	page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
1306	if (IS_ERR(ptr: page))
1307	return PTR_ERR(ptr: page);
1308
1309	ret = kvm_vcpu_write_guest(vcpu, gpa: page_addr, data: page, PAGE_SIZE);
1310	kfree(objp: page);
1311	if (ret)
1312	return `1`;
1313	}
1314	return `0`;
1315	}
1316
1317	int kvm_xen_hvm_config(struct kvm kvm, struct* kvm_xen_hvm_config *xhc)
1318	{
1319	/ Only some feature flags need to be enabled by userspace /
1320	u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL \|
1321	KVM_XEN_HVM_CONFIG_EVTCHN_SEND \|
1322	KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
1323	u32 old_flags;
1324
1325	if (xhc->flags & ~permitted_flags)
1326	return -EINVAL;
1327
1328	/*
1329	* With hypercall interception the kernel generates its own
1330	* hypercall page so it must not be provided.
1331	*/
1332	if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
1333	(xhc->blob_addr_32 \|\| xhc->blob_addr_64 \|\|
1334	xhc->blob_size_32 \|\| xhc->blob_size_64))
1335	return -EINVAL;
1336
1337	mutex_lock(&kvm->arch.xen.xen_lock);
1338
1339	if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
1340	static_branch_inc(&kvm_xen_enabled.key);
1341	else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
1342	static_branch_slow_dec_deferred(&kvm_xen_enabled);
1343
1344	old_flags = kvm->arch.xen_hvm_config.flags;
1345	memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));
1346
1347	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
1348
1349	if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
1350	kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);
1351
1352	return `0`;
1353	}
1354
1355	static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
1356	{
1357	kvm_rax_write(vcpu, val: result);
1358	return kvm_skip_emulated_instruction(vcpu);
1359	}
1360
1361	static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
1362	{
1363	struct kvm_run *run = vcpu->run;
1364
1365	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
1366	return `1`;
1367
1368	return kvm_xen_hypercall_set_result(vcpu, result: run->xen.u.hcall.result);
1369	}
1370
1371	static inline int max_evtchn_port(struct kvm *kvm)
1372	{
1373	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
1374	return EVTCHN_2L_NR_CHANNELS;
1375	else
1376	return COMPAT_EVTCHN_2L_NR_CHANNELS;
1377	}
1378
1379	static bool wait_pending_event(struct kvm_vcpu vcpu, int* nr_ports,
1380	evtchn_port_t *ports)
1381	{
1382	struct kvm *kvm = vcpu->kvm;
1383	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1384	unsigned long *pending_bits;
1385	unsigned long flags;
1386	bool ret = true;
1387	int idx, i;
1388
1389	idx = srcu_read_lock(ssp: &kvm->srcu);
1390	read_lock_irqsave(&gpc->lock, flags);
1391	if (!kvm_gpc_check(gpc, PAGE_SIZE))
1392	goto out_rcu;
1393
1394	ret = false;
1395	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1396	struct shared_info *shinfo = gpc->khva;
1397	pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1398	} else {
1399	struct compat_shared_info *shinfo = gpc->khva;
1400	pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1401	}
1402
1403	for (i = `0`; i < nr_ports; i++) {
1404	if (test_bit(ports[i], pending_bits)) {
1405	ret = true;
1406	break;
1407	}
1408	}
1409
1410	out_rcu:
1411	read_unlock_irqrestore(&gpc->lock, flags);
1412	srcu_read_unlock(ssp: &kvm->srcu, idx);
1413
1414	return ret;
1415	}
1416
1417	static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
1418	u64 param, u64 *r)
1419	{
1420	struct sched_poll sched_poll;
1421	evtchn_port_t port, *ports;
1422	struct x86_exception e;
1423	int i;
1424
1425	if (!lapic_in_kernel(vcpu) \|\|
1426	!(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
1427	return false;
1428
1429	if (IS_ENABLED(CONFIG_64BIT) && !longmode) {
1430	struct compat_sched_poll sp32;
1431
1432	/ Sanity check that the compat struct definition is correct /
1433	BUILD_BUG_ON(sizeof(sp32) != `16`);
1434
1435	if (kvm_read_guest_virt(vcpu, addr: param, val: &sp32, bytes: sizeof(sp32), exception: &e)) {
1436	*r = -EFAULT;
1437	return true;
1438	}
1439
1440	/*
1441	* This is a 32-bit pointer to an array of evtchn_port_t which
1442	* are uint32_t, so once it's converted no further compat
1443	* handling is needed.
1444	*/
1445	sched_poll.ports = (void )(unsigned* long)(sp32.ports);
1446	sched_poll.nr_ports = sp32.nr_ports;
1447	sched_poll.timeout = sp32.timeout;
1448	} else {
1449	if (kvm_read_guest_virt(vcpu, addr: param, val: &sched_poll,
1450	bytes: sizeof(sched_poll), exception: &e)) {
1451	*r = -EFAULT;
1452	return true;
1453	}
1454	}
1455
1456	if (unlikely(sched_poll.nr_ports > `1`)) {
1457	/ Xen (unofficially) limits number of pollers to 128 /
1458	if (sched_poll.nr_ports > `128`) {
1459	*r = -EINVAL;
1460	return true;
1461	}
1462
1463	ports = kmalloc_array(n: sched_poll.nr_ports,
1464	size: sizeof(*ports), GFP_KERNEL);
1465	if (!ports) {
1466	*r = -ENOMEM;
1467	return true;
1468	}
1469	} else
1470	ports = &port;
1471
1472	if (kvm_read_guest_virt(vcpu, addr: (gva_t)sched_poll.ports, val: ports,
1473	bytes: sched_poll.nr_ports * sizeof(*ports), exception: &e)) {
1474	*r = -EFAULT;
1475	return true;
1476	}
1477
1478	for (i = `0`; i < sched_poll.nr_ports; i++) {
1479	if (ports[i] >= max_evtchn_port(kvm: vcpu->kvm)) {
1480	*r = -EINVAL;
1481	goto out;
1482	}
1483	}
1484
1485	if (sched_poll.nr_ports == `1`)
1486	vcpu->arch.xen.poll_evtchn = port;
1487	else
1488	vcpu->arch.xen.poll_evtchn = -`1`;
1489
1490	set_bit(nr: vcpu->vcpu_idx, addr: vcpu->kvm->arch.xen.poll_mask);
1491
1492	if (!wait_pending_event(vcpu, nr_ports: sched_poll.nr_ports, ports)) {
1493	vcpu->arch.mp_state = KVM_MP_STATE_HALTED;
1494
1495	if (sched_poll.timeout)
1496	mod_timer(timer: &vcpu->arch.xen.poll_timer,
1497	expires: jiffies + nsecs_to_jiffies(n: sched_poll.timeout));
1498
1499	kvm_vcpu_halt(vcpu);
1500
1501	if (sched_poll.timeout)
1502	del_timer(timer: &vcpu->arch.xen.poll_timer);
1503
1504	vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
1505	}
1506
1507	vcpu->arch.xen.poll_evtchn = `0`;
1508	*r = `0`;
1509	out:
1510	/ Really, this is only needed in case of timeout /
1511	clear_bit(nr: vcpu->vcpu_idx, addr: vcpu->kvm->arch.xen.poll_mask);
1512
1513	if (unlikely(sched_poll.nr_ports > `1`))
1514	kfree(objp: ports);
1515	return true;
1516	}
1517
1518	static void cancel_evtchn_poll(struct timer_list *t)
1519	{
1520	struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);
1521
1522	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1523	kvm_vcpu_kick(vcpu);
1524	}
1525
1526	static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
1527	int cmd, u64 param, u64 *r)
1528	{
1529	switch (cmd) {
1530	case SCHEDOP_poll:
1531	if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
1532	return true;
1533	fallthrough;
1534	case SCHEDOP_yield:
1535	kvm_vcpu_on_spin(vcpu, yield_to_kernel_mode: true);
1536	*r = `0`;
1537	return true;
1538	default:
1539	break;
1540	}
1541
1542	return false;
1543	}
1544
/*
 * 32-bit guest layout of the VCPUOP_set_singleshot_timer argument. Packed,
 * so it is 12 bytes with no trailing padding after 'flags'.
 */
struct compat_vcpu_set_singleshot_timer {
	uint64_t timeout_abs_ns;
	uint32_t flags;
} __attribute__((packed));
1549
1550	static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu vcpu, bool longmode, int* cmd,
1551	int vcpu_id, u64 param, u64 *r)
1552	{
1553	struct vcpu_set_singleshot_timer oneshot;
1554	struct x86_exception e;
1555
1556	if (!kvm_xen_timer_enabled(vcpu))
1557	return false;
1558
1559	switch (cmd) {
1560	case VCPUOP_set_singleshot_timer:
1561	if (vcpu->arch.xen.vcpu_id != vcpu_id) {
1562	*r = -EINVAL;
1563	return true;
1564	}
1565
1566	/*
1567	* The only difference for 32-bit compat is the 4 bytes of
1568	* padding after the interesting part of the structure. So
1569	* for a faithful emulation of Xen we have to try to copy
1570	* the padding and return -EFAULT if we can't. Otherwise we
1571	* might as well just have copied the 12-byte 32-bit struct.
1572	*/
1573	BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
1574	offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns));
1575	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) !=
1576	sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns));
1577	BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) !=
1578	offsetof(struct vcpu_set_singleshot_timer, flags));
1579	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) !=
1580	sizeof_field(struct vcpu_set_singleshot_timer, flags));
1581
1582	if (kvm_read_guest_virt(vcpu, addr: param, val: &oneshot, bytes: longmode ? sizeof(oneshot) :
1583	sizeof(struct compat_vcpu_set_singleshot_timer), exception: &e)) {
1584	*r = -EFAULT;
1585	return true;
1586	}
1587
1588	kvm_xen_start_timer(vcpu, guest_abs: oneshot.timeout_abs_ns, linux_wa: false);
1589	*r = `0`;
1590	return true;
1591
1592	case VCPUOP_stop_singleshot_timer:
1593	if (vcpu->arch.xen.vcpu_id != vcpu_id) {
1594	*r = -EINVAL;
1595	return true;
1596	}
1597	kvm_xen_stop_timer(vcpu);
1598	*r = `0`;
1599	return true;
1600	}
1601
1602	return false;
1603	}
1604
1605	static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout,
1606	u64 *r)
1607	{
1608	if (!kvm_xen_timer_enabled(vcpu))
1609	return false;
1610
1611	if (timeout)
1612	kvm_xen_start_timer(vcpu, guest_abs: timeout, linux_wa: true);
1613	else
1614	kvm_xen_stop_timer(vcpu);
1615
1616	*r = `0`;
1617	return true;
1618	}
1619
1620	int kvm_xen_hypercall(struct kvm_vcpu *vcpu)
1621	{
1622	bool longmode;
1623	u64 input, params[`6`], r = -ENOSYS;
1624	bool handled = false;
1625	u8 cpl;
1626
1627	input = (u64)kvm_register_read(vcpu, reg: VCPU_REGS_RAX);
1628
1629	/ Hyper-V hypercalls get bit 31 set in EAX /
1630	if ((input & `0x80000000`) &&
1631	kvm_hv_hypercall_enabled(vcpu))
1632	return kvm_hv_hypercall(vcpu);
1633
1634	longmode = is_64_bit_hypercall(vcpu);
1635	if (!longmode) {
1636	params[`0`] = (u32)kvm_rbx_read(vcpu);
1637	params[`1`] = (u32)kvm_rcx_read(vcpu);
1638	params[`2`] = (u32)kvm_rdx_read(vcpu);
1639	params[`3`] = (u32)kvm_rsi_read(vcpu);
1640	params[`4`] = (u32)kvm_rdi_read(vcpu);
1641	params[`5`] = (u32)kvm_rbp_read(vcpu);
1642	}
1643	#ifdef CONFIG_X86_64
1644	else {
1645	params[`0`] = (u64)kvm_rdi_read(vcpu);
1646	params[`1`] = (u64)kvm_rsi_read(vcpu);
1647	params[`2`] = (u64)kvm_rdx_read(vcpu);
1648	params[`3`] = (u64)kvm_r10_read(vcpu);
1649	params[`4`] = (u64)kvm_r8_read(vcpu);
1650	params[`5`] = (u64)kvm_r9_read(vcpu);
1651	}
1652	#endif
1653	cpl = static_call(kvm_x86_get_cpl)(vcpu);
1654	trace_kvm_xen_hypercall(cpl, nr: input, a0: params[`0`], a1: params[`1`], a2: params[`2`],
1655	a3: params[`3`], a4: params[`4`], a5: params[`5`]);
1656
1657	/*
1658	* Only allow hypercall acceleration for CPL0. The rare hypercalls that
1659	* are permitted in guest userspace can be handled by the VMM.
1660	*/
1661	if (unlikely(cpl > `0`))
1662	goto handle_in_userspace;
1663
1664	switch (input) {
1665	case __HYPERVISOR_xen_version:
1666	if (params[`0`] == XENVER_version && vcpu->kvm->arch.xen.xen_version) {
1667	r = vcpu->kvm->arch.xen.xen_version;
1668	handled = true;
1669	}
1670	break;
1671	case __HYPERVISOR_event_channel_op:
1672	if (params[`0`] == EVTCHNOP_send)
1673	handled = kvm_xen_hcall_evtchn_send(vcpu, param: params[`1`], r: &r);
1674	break;
1675	case __HYPERVISOR_sched_op:
1676	handled = kvm_xen_hcall_sched_op(vcpu, longmode, cmd: params[`0`],
1677	param: params[`1`], r: &r);
1678	break;
1679	case __HYPERVISOR_vcpu_op:
1680	handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, cmd: params[`0`], vcpu_id: params[`1`],
1681	param: params[`2`], r: &r);
1682	break;
1683	case __HYPERVISOR_set_timer_op: {
1684	u64 timeout = params[`0`];
1685	/ In 32-bit mode, the 64-bit timeout is in two 32-bit params. /
1686	if (!longmode)
1687	timeout \|= params[`1`] << `32`;
1688	handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, r: &r);
1689	break;
1690	}
1691	default:
1692	break;
1693	}
1694
1695	if (handled)
1696	return kvm_xen_hypercall_set_result(vcpu, result: r);
1697
1698	handle_in_userspace:
1699	vcpu->run->exit_reason = KVM_EXIT_XEN;
1700	vcpu->run->xen.type = KVM_EXIT_XEN_HCALL;
1701	vcpu->run->xen.u.hcall.longmode = longmode;
1702	vcpu->run->xen.u.hcall.cpl = cpl;
1703	vcpu->run->xen.u.hcall.input = input;
1704	vcpu->run->xen.u.hcall.params[`0`] = params[`0`];
1705	vcpu->run->xen.u.hcall.params[`1`] = params[`1`];
1706	vcpu->run->xen.u.hcall.params[`2`] = params[`2`];
1707	vcpu->run->xen.u.hcall.params[`3`] = params[`3`];
1708	vcpu->run->xen.u.hcall.params[`4`] = params[`4`];
1709	vcpu->run->xen.u.hcall.params[`5`] = params[`5`];
1710	vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu);
1711	vcpu->arch.complete_userspace_io =
1712	kvm_xen_hypercall_complete_userspace;
1713
1714	return `0`;
1715	}
1716
1717	static void kvm_xen_check_poller(struct kvm_vcpu vcpu, int* port)
1718	{
1719	int poll_evtchn = vcpu->arch.xen.poll_evtchn;
1720
1721	if ((poll_evtchn == port \|\| poll_evtchn == -`1`) &&
1722	test_and_clear_bit(nr: vcpu->vcpu_idx, addr: vcpu->kvm->arch.xen.poll_mask)) {
1723	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1724	kvm_vcpu_kick(vcpu);
1725	}
1726	}
1727
1728	/*
1729	* The return value from this function is propagated to kvm_set_irq() API,
1730	* so it returns:
1731	* < 0 Interrupt was ignored (masked or not delivered for other reasons)
1732	* = 0 Interrupt was coalesced (previous irq is still pending)
1733	* > 0 Number of CPUs interrupt was delivered to
1734	*
1735	* It is also called directly from kvm_arch_set_irq_inatomic(), where the
1736	* only check on its return value is a comparison with -EWOULDBLOCK'.
1737	*/
1738	int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn xe, struct* kvm *kvm)
1739	{
1740	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1741	struct kvm_vcpu *vcpu;
1742	unsigned long pending_bits, mask_bits;
1743	unsigned long flags;
1744	int port_word_bit;
1745	bool kick_vcpu = false;
1746	int vcpu_idx, idx, rc;
1747
1748	vcpu_idx = READ_ONCE(xe->vcpu_idx);
1749	if (vcpu_idx >= `0`)
1750	vcpu = kvm_get_vcpu(kvm, i: vcpu_idx);
1751	else {
1752	vcpu = kvm_get_vcpu_by_id(kvm, id: xe->vcpu_id);
1753	if (!vcpu)
1754	return -EINVAL;
1755	WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx);
1756	}
1757
1758	if (xe->port >= max_evtchn_port(kvm))
1759	return -EINVAL;
1760
1761	rc = -EWOULDBLOCK;
1762
1763	idx = srcu_read_lock(ssp: &kvm->srcu);
1764
1765	read_lock_irqsave(&gpc->lock, flags);
1766	if (!kvm_gpc_check(gpc, PAGE_SIZE))
1767	goto out_rcu;
1768
1769	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1770	struct shared_info *shinfo = gpc->khva;
1771	pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1772	mask_bits = (unsigned long *)&shinfo->evtchn_mask;
1773	port_word_bit = xe->port / `64`;
1774	} else {
1775	struct compat_shared_info *shinfo = gpc->khva;
1776	pending_bits = (unsigned long *)&shinfo->evtchn_pending;
1777	mask_bits = (unsigned long *)&shinfo->evtchn_mask;
1778	port_word_bit = xe->port / `32`;
1779	}
1780
1781	/*
1782	* If this port wasn't already set, and if it isn't masked, then
1783	* we try to set the corresponding bit in the in-kernel shadow of
1784	* evtchn_pending_sel for the target vCPU. And if that wasn't
1785	* already set, then we kick the vCPU in question to write to the
1786	* real evtchn_pending_sel in its own guest vcpu_info struct.
1787	*/
1788	if (test_and_set_bit(nr: xe->port, addr: pending_bits)) {
1789	rc = `0`; / It was already raised /
1790	} else if (test_bit(xe->port, mask_bits)) {
1791	rc = -ENOTCONN; / Masked /
1792	kvm_xen_check_poller(vcpu, port: xe->port);
1793	} else {
1794	rc = `1`; / Delivered to the bitmap in shared_info. /
1795	/ Now switch to the vCPU's vcpu_info to set the index and pending_sel /
1796	read_unlock_irqrestore(&gpc->lock, flags);
1797	gpc = &vcpu->arch.xen.vcpu_info_cache;
1798
1799	read_lock_irqsave(&gpc->lock, flags);
1800	if (!kvm_gpc_check(gpc, len: sizeof(struct vcpu_info))) {
1801	/*
1802	* Could not access the vcpu_info. Set the bit in-kernel
1803	* and prod the vCPU to deliver it for itself.
1804	*/
1805	if (!test_and_set_bit(nr: port_word_bit, addr: &vcpu->arch.xen.evtchn_pending_sel))
1806	kick_vcpu = true;
1807	goto out_rcu;
1808	}
1809
1810	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
1811	struct vcpu_info *vcpu_info = gpc->khva;
1812	if (!test_and_set_bit(nr: port_word_bit, addr: &vcpu_info->evtchn_pending_sel)) {
1813	WRITE_ONCE(vcpu_info->evtchn_upcall_pending, `1`);
1814	kick_vcpu = true;
1815	}
1816	} else {
1817	struct compat_vcpu_info *vcpu_info = gpc->khva;
1818	if (!test_and_set_bit(nr: port_word_bit,
1819	addr: (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
1820	WRITE_ONCE(vcpu_info->evtchn_upcall_pending, `1`);
1821	kick_vcpu = true;
1822	}
1823	}
1824
1825	/ For the per-vCPU lapic vector, deliver it as MSI. /
1826	if (kick_vcpu && vcpu->arch.xen.upcall_vector) {
1827	kvm_xen_inject_vcpu_vector(v: vcpu);
1828	kick_vcpu = false;
1829	}
1830	}
1831
1832	out_rcu:
1833	read_unlock_irqrestore(&gpc->lock, flags);
1834	srcu_read_unlock(ssp: &kvm->srcu, idx);
1835
1836	if (kick_vcpu) {
1837	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
1838	kvm_vcpu_kick(vcpu);
1839	}
1840
1841	return rc;
1842	}
1843
1844	static int kvm_xen_set_evtchn(struct kvm_xen_evtchn xe, struct* kvm *kvm)
1845	{
1846	bool mm_borrowed = false;
1847	int rc;
1848
1849	rc = kvm_xen_set_evtchn_fast(xe, kvm);
1850	if (rc != -EWOULDBLOCK)
1851	return rc;
1852
1853	if (current->mm != kvm->mm) {
1854	/*
1855	* If not on a thread which already belongs to this KVM,
1856	* we'd better be in the irqfd workqueue.
1857	*/
1858	if (WARN_ON_ONCE(current->mm))
1859	return -EINVAL;
1860
1861	kthread_use_mm(mm: kvm->mm);
1862	mm_borrowed = true;
1863	}
1864
1865	/*
1866	* It is theoretically possible for the page to be unmapped
1867	* and the MMU notifier to invalidate the shared_info before
1868	* we even get to use it. In that case, this looks like an
1869	* infinite loop. It was tempting to do it via the userspace
1870	* HVA instead... but that just hides the fact that it's
1871	* an infinite loop, because if a fault occurs and it waits
1872	* for the page to come back, it can still immediately
1873	* fault and have to wait again, repeatedly.
1874	*
1875	* Conversely, the page could also have been reinstated by
1876	* another thread before we even obtain the mutex above, so
1877	* check again first before remapping it.
1878	*/
1879	do {
1880	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
1881	int idx;
1882
1883	rc = kvm_xen_set_evtchn_fast(xe, kvm);
1884	if (rc != -EWOULDBLOCK)
1885	break;
1886
1887	idx = srcu_read_lock(ssp: &kvm->srcu);
1888	rc = kvm_gpc_refresh(gpc, PAGE_SIZE);
1889	srcu_read_unlock(ssp: &kvm->srcu, idx);
1890	} while(!rc);
1891
1892	if (mm_borrowed)
1893	kthread_unuse_mm(mm: kvm->mm);
1894
1895	return rc;
1896	}
1897
1898	/ This is the version called from kvm_set_irq() as the .set function /
1899	static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry e, struct* kvm *kvm,
1900	int irq_source_id, int level, bool line_status)
1901	{
1902	if (!level)
1903	return -EINVAL;
1904
1905	return kvm_xen_set_evtchn(xe: &e->xen_evtchn, kvm);
1906	}
1907
1908	/*
1909	* Set up an event channel interrupt from the KVM IRQ routing table.
1910	* Used for e.g. PIRQ from passed through physical devices.
1911	*/
1912	int kvm_xen_setup_evtchn(struct kvm *kvm,
1913	struct kvm_kernel_irq_routing_entry *e,
1914	const struct kvm_irq_routing_entry *ue)
1915
1916	{
1917	struct kvm_vcpu *vcpu;
1918
1919	if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm))
1920	return -EINVAL;
1921
1922	/ We only support 2 level event channels for now /
1923	if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
1924	return -EINVAL;
1925
1926	/*
1927	* Xen gives us interesting mappings from vCPU index to APIC ID,
1928	* which means kvm_get_vcpu_by_id() has to iterate over all vCPUs
1929	* to find it. Do that once at setup time, instead of every time.
1930	* But beware that on live update / live migration, the routing
1931	* table might be reinstated before the vCPU threads have finished
1932	* recreating their vCPUs.
1933	*/
1934	vcpu = kvm_get_vcpu_by_id(kvm, id: ue->u.xen_evtchn.vcpu);
1935	if (vcpu)
1936	e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx;
1937	else
1938	e->xen_evtchn.vcpu_idx = -`1`;
1939
1940	e->xen_evtchn.port = ue->u.xen_evtchn.port;
1941	e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu;
1942	e->xen_evtchn.priority = ue->u.xen_evtchn.priority;
1943	e->set = evtchn_set_fn;
1944
1945	return `0`;
1946	}
1947
1948	/*
1949	* Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl.
1950	*/
1951	int kvm_xen_hvm_evtchn_send(struct kvm kvm, struct* kvm_irq_routing_xen_evtchn *uxe)
1952	{
1953	struct kvm_xen_evtchn e;
1954	int ret;
1955
1956	if (!uxe->port \|\| uxe->port >= max_evtchn_port(kvm))
1957	return -EINVAL;
1958
1959	/ We only support 2 level event channels for now /
1960	if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
1961	return -EINVAL;
1962
1963	e.port = uxe->port;
1964	e.vcpu_id = uxe->vcpu;
1965	e.vcpu_idx = -`1`;
1966	e.priority = uxe->priority;
1967
1968	ret = kvm_xen_set_evtchn(xe: &e, kvm);
1969
1970	/*
1971	* None of that 'return 1 if it actually got delivered' nonsense.
1972	* We don't care if it was masked (-ENOTCONN) either.
1973	*/
1974	if (ret > `0` \|\| ret == -ENOTCONN)
1975	ret = `0`;
1976
1977	return ret;
1978	}
1979
1980	/*
1981	* Support for outbound event channel events via the EVTCHNOP_send hypercall.
1982	*/
1983	struct evtchnfd {
1984	u32 send_port;
1985	u32 type;
1986	union {
1987	struct kvm_xen_evtchn port;
1988	struct {
1989	u32 port; / zero /
1990	struct eventfd_ctx *ctx;
1991	} eventfd;
1992	} deliver;
1993	};
1994
1995	/*
1996	* Update target vCPU or priority for a registered sending channel.
1997	*/
1998	static int kvm_xen_eventfd_update(struct kvm *kvm,
1999	struct kvm_xen_hvm_attr *data)
2000	{
2001	u32 port = data->u.evtchn.send_port;
2002	struct evtchnfd *evtchnfd;
2003	int ret;
2004
2005	/ Protect writes to evtchnfd as well as the idr lookup. /
2006	mutex_lock(&kvm->arch.xen.xen_lock);
2007	evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, id: port);
2008
2009	ret = -ENOENT;
2010	if (!evtchnfd)
2011	goto out_unlock;
2012
2013	/ For an UPDATE, nothing may change except the priority/vcpu /
2014	ret = -EINVAL;
2015	if (evtchnfd->type != data->u.evtchn.type)
2016	goto out_unlock;
2017
2018	/*
2019	* Port cannot change, and if it's zero that was an eventfd
2020	* which can't be changed either.
2021	*/
2022	if (!evtchnfd->deliver.port.port \|\|
2023	evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port)
2024	goto out_unlock;
2025
2026	/ We only support 2 level event channels for now /
2027	if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
2028	goto out_unlock;
2029
2030	evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
2031	if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) {
2032	evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
2033	evtchnfd->deliver.port.vcpu_idx = -`1`;
2034	}
2035	ret = `0`;
2036	out_unlock:
2037	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
2038	return ret;
2039	}
2040
2041	/*
2042	* Configure the target (eventfd or local port delivery) for sending on
2043	* a given event channel.
2044	*/
2045	static int kvm_xen_eventfd_assign(struct kvm *kvm,
2046	struct kvm_xen_hvm_attr *data)
2047	{
2048	u32 port = data->u.evtchn.send_port;
2049	struct eventfd_ctx *eventfd = NULL;
2050	struct evtchnfd *evtchnfd;
2051	int ret = -EINVAL;
2052
2053	evtchnfd = kzalloc(size: sizeof(struct evtchnfd), GFP_KERNEL);
2054	if (!evtchnfd)
2055	return -ENOMEM;
2056
2057	switch(data->u.evtchn.type) {
2058	case EVTCHNSTAT_ipi:
2059	/ IPI must map back to the same port# /
2060	if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port)
2061	goto out_noeventfd; / -EINVAL /
2062	break;
2063
2064	case EVTCHNSTAT_interdomain:
2065	if (data->u.evtchn.deliver.port.port) {
2066	if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm))
2067	goto out_noeventfd; / -EINVAL /
2068	} else {
2069	eventfd = eventfd_ctx_fdget(fd: data->u.evtchn.deliver.eventfd.fd);
2070	if (IS_ERR(ptr: eventfd)) {
2071	ret = PTR_ERR(ptr: eventfd);
2072	goto out_noeventfd;
2073	}
2074	}
2075	break;
2076
2077	case EVTCHNSTAT_virq:
2078	case EVTCHNSTAT_closed:
2079	case EVTCHNSTAT_unbound:
2080	case EVTCHNSTAT_pirq:
2081	default: / Unknown event channel type /
2082	goto out; / -EINVAL /
2083	}
2084
2085	evtchnfd->send_port = data->u.evtchn.send_port;
2086	evtchnfd->type = data->u.evtchn.type;
2087	if (eventfd) {
2088	evtchnfd->deliver.eventfd.ctx = eventfd;
2089	} else {
2090	/ We only support 2 level event channels for now /
2091	if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL)
2092	goto out; / -EINVAL; /
2093
2094	evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port;
2095	evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu;
2096	evtchnfd->deliver.port.vcpu_idx = -`1`;
2097	evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority;
2098	}
2099
2100	mutex_lock(&kvm->arch.xen.xen_lock);
2101	ret = idr_alloc(&kvm->arch.xen.evtchn_ports, ptr: evtchnfd, start: port, end: port + `1`,
2102	GFP_KERNEL);
2103	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
2104	if (ret >= `0`)
2105	return `0`;
2106
2107	if (ret == -ENOSPC)
2108	ret = -EEXIST;
2109	out:
2110	if (eventfd)
2111	eventfd_ctx_put(ctx: eventfd);
2112	out_noeventfd:
2113	kfree(objp: evtchnfd);
2114	return ret;
2115	}
2116
2117	static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port)
2118	{
2119	struct evtchnfd *evtchnfd;
2120
2121	mutex_lock(&kvm->arch.xen.xen_lock);
2122	evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, id: port);
2123	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
2124
2125	if (!evtchnfd)
2126	return -ENOENT;
2127
2128	synchronize_srcu(ssp: &kvm->srcu);
2129	if (!evtchnfd->deliver.port.port)
2130	eventfd_ctx_put(ctx: evtchnfd->deliver.eventfd.ctx);
2131	kfree(objp: evtchnfd);
2132	return `0`;
2133	}
2134
2135	static int kvm_xen_eventfd_reset(struct kvm *kvm)
2136	{
2137	struct evtchnfd evtchnfd, *all_evtchnfds;
2138	int i;
2139	int n = `0`;
2140
2141	mutex_lock(&kvm->arch.xen.xen_lock);
2142
2143	/*
2144	* Because synchronize_srcu() cannot be called inside the
2145	* critical section, first collect all the evtchnfd objects
2146	* in an array as they are removed from evtchn_ports.
2147	*/
2148	idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i)
2149	n++;
2150
2151	all_evtchnfds = kmalloc_array(n, size: sizeof(struct evtchnfd *), GFP_KERNEL);
2152	if (!all_evtchnfds) {
2153	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
2154	return -ENOMEM;
2155	}
2156
2157	n = `0`;
2158	idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
2159	all_evtchnfds[n++] = evtchnfd;
2160	idr_remove(&kvm->arch.xen.evtchn_ports, id: evtchnfd->send_port);
2161	}
2162	mutex_unlock(lock: &kvm->arch.xen.xen_lock);
2163
2164	synchronize_srcu(ssp: &kvm->srcu);
2165
2166	while (n--) {
2167	evtchnfd = all_evtchnfds[n];
2168	if (!evtchnfd->deliver.port.port)
2169	eventfd_ctx_put(ctx: evtchnfd->deliver.eventfd.ctx);
2170	kfree(objp: evtchnfd);
2171	}
2172	kfree(objp: all_evtchnfds);
2173
2174	return `0`;
2175	}
2176
2177	static int kvm_xen_setattr_evtchn(struct kvm kvm, struct* kvm_xen_hvm_attr *data)
2178	{
2179	u32 port = data->u.evtchn.send_port;
2180
2181	if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET)
2182	return kvm_xen_eventfd_reset(kvm);
2183
2184	if (!port \|\| port >= max_evtchn_port(kvm))
2185	return -EINVAL;
2186
2187	if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN)
2188	return kvm_xen_eventfd_deassign(kvm, port);
2189	if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE)
2190	return kvm_xen_eventfd_update(kvm, data);
2191	if (data->u.evtchn.flags)
2192	return -EINVAL;
2193
2194	return kvm_xen_eventfd_assign(kvm, data);
2195	}
2196
2197	static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu vcpu, u64 param, u64 r)
2198	{
2199	struct evtchnfd *evtchnfd;
2200	struct evtchn_send send;
2201	struct x86_exception e;
2202
2203	/ Sanity check: this structure is the same for 32-bit and 64-bit /
2204	BUILD_BUG_ON(sizeof(send) != `4`);
2205	if (kvm_read_guest_virt(vcpu, addr: param, val: &send, bytes: sizeof(send), exception: &e)) {
2206	*r = -EFAULT;
2207	return true;
2208	}
2209
2210	/*
2211	* evtchnfd is protected by kvm->srcu; the idr lookup instead
2212	* is protected by RCU.
2213	*/
2214	rcu_read_lock();
2215	evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, id: send.port);
2216	rcu_read_unlock();
2217	if (!evtchnfd)
2218	return false;
2219
2220	if (evtchnfd->deliver.port.port) {
2221	int ret = kvm_xen_set_evtchn(xe: &evtchnfd->deliver.port, kvm: vcpu->kvm);
2222	if (ret < `0` && ret != -ENOTCONN)
2223	return false;
2224	} else {
2225	eventfd_signal(ctx: evtchnfd->deliver.eventfd.ctx);
2226	}
2227
2228	*r = `0`;
2229	return true;
2230	}
2231
2232	void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu)
2233	{
2234	vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx;
2235	vcpu->arch.xen.poll_evtchn = `0`;
2236
2237	timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, `0`);
2238
2239	kvm_gpc_init(gpc: &vcpu->arch.xen.runstate_cache, kvm: vcpu->kvm);
2240	kvm_gpc_init(gpc: &vcpu->arch.xen.runstate2_cache, kvm: vcpu->kvm);
2241	kvm_gpc_init(gpc: &vcpu->arch.xen.vcpu_info_cache, kvm: vcpu->kvm);
2242	kvm_gpc_init(gpc: &vcpu->arch.xen.vcpu_time_info_cache, kvm: vcpu->kvm);
2243	}
2244
2245	void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu)
2246	{
2247	if (kvm_xen_timer_enabled(vcpu))
2248	kvm_xen_stop_timer(vcpu);
2249
2250	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.runstate_cache);
2251	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.runstate2_cache);
2252	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.vcpu_info_cache);
2253	kvm_gpc_deactivate(gpc: &vcpu->arch.xen.vcpu_time_info_cache);
2254
2255	del_timer_sync(timer: &vcpu->arch.xen.poll_timer);
2256	}
2257
2258	void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu)
2259	{
2260	struct kvm_cpuid_entry2 *entry;
2261	u32 function;
2262
2263	if (!vcpu->arch.xen.cpuid.base)
2264	return;
2265
2266	function = vcpu->arch.xen.cpuid.base \| XEN_CPUID_LEAF(`3`);
2267	if (function > vcpu->arch.xen.cpuid.limit)
2268	return;
2269
2270	entry = kvm_find_cpuid_entry_index(vcpu, function, index: `1`);
2271	if (entry) {
2272	entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul;
2273	entry->edx = vcpu->arch.hv_clock.tsc_shift;
2274	}
2275
2276	entry = kvm_find_cpuid_entry_index(vcpu, function, index: `2`);
2277	if (entry)
2278	entry->eax = vcpu->arch.hw_tsc_khz;
2279	}
2280
2281	void kvm_xen_init_vm(struct kvm *kvm)
2282	{
2283	mutex_init(&kvm->arch.xen.xen_lock);
2284	idr_init(idr: &kvm->arch.xen.evtchn_ports);
2285	kvm_gpc_init(gpc: &kvm->arch.xen.shinfo_cache, kvm);
2286	}
2287
2288	void kvm_xen_destroy_vm(struct kvm *kvm)
2289	{
2290	struct evtchnfd *evtchnfd;
2291	int i;
2292
2293	kvm_gpc_deactivate(gpc: &kvm->arch.xen.shinfo_cache);
2294
2295	idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
2296	if (!evtchnfd->deliver.port.port)
2297	eventfd_ctx_put(ctx: evtchnfd->deliver.eventfd.ctx);
2298	kfree(objp: evtchnfd);
2299	}
2300	idr_destroy(&kvm->arch.xen.evtchn_ports);
2301
2302	if (kvm->arch.xen_hvm_config.msr)
2303	static_branch_slow_dec_deferred(&kvm_xen_enabled);
2304	}
2305

source code of linux/arch/x86/kvm/xen.c