// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright © 2019 Oracle and/or its affiliates. All rights reserved.
 * Copyright © 2020 Amazon.com, Inc. or its affiliates. All Rights Reserved.
 *
 * KVM Xen emulation
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include "x86.h"
#include "xen.h"
#include "hyperv.h"
#include "irq.h"

#include <linux/eventfd.h>
#include <linux/kvm_host.h>
#include <linux/sched/stat.h>

#include <trace/events/kvm.h>
#include <xen/interface/xen.h>
#include <xen/interface/vcpu.h>
#include <xen/interface/version.h>
#include <xen/interface/event_channel.h>
#include <xen/interface/sched.h>

#include <asm/xen/cpuid.h>
#include <asm/pvclock.h>

#include "cpuid.h"
#include "trace.h"

static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm);
static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data);
static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r);

DEFINE_STATIC_KEY_DEFERRED_FALSE(kvm_xen_enabled, HZ);

static int kvm_xen_shared_info_init(struct kvm *kvm)
{
	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
	struct pvclock_wall_clock *wc;
	u32 *wc_sec_hi;
	u32 wc_version;
	u64 wall_nsec;
	int ret = 0;
	int idx = srcu_read_lock(&kvm->srcu);

	read_lock_irq(&gpc->lock);
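	/*
	 * The gfn_to_pfn cache may have been invalidated (e.g. by memslot
	 * changes) since it was last used. kvm_gpc_check() validates it
	 * under the read lock; if that fails, drop the lock, call
	 * kvm_gpc_refresh() (which may sleep to remap the page), then
	 * re-take the lock and check again. The same idiom is used for
	 * every pfncache access in this file.
	 */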
	while (!kvm_gpc_check(gpc, PAGE_SIZE)) {
		read_unlock_irq(&gpc->lock);

		ret = kvm_gpc_refresh(gpc, PAGE_SIZE);
		if (ret)
			goto out;

		read_lock_irq(&gpc->lock);
	}

	/*
	 * This code mirrors kvm_write_wall_clock() except that it writes
	 * directly through the pfn cache and doesn't mark the page dirty.
	 */
	wall_nsec = kvm_get_wall_clock_epoch(kvm);

	/* Paranoia checks on the 32-bit struct layout */
	BUILD_BUG_ON(offsetof(struct compat_shared_info, wc) != 0x900);
	BUILD_BUG_ON(offsetof(struct compat_shared_info, arch.wc_sec_hi) != 0x924);
	BUILD_BUG_ON(offsetof(struct pvclock_vcpu_time_info, version) != 0);

#ifdef CONFIG_X86_64
	/* Paranoia checks on the 64-bit struct layout */
	BUILD_BUG_ON(offsetof(struct shared_info, wc) != 0xc00);
	BUILD_BUG_ON(offsetof(struct shared_info, wc_sec_hi) != 0xc0c);

	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
		struct shared_info *shinfo = gpc->khva;

		wc_sec_hi = &shinfo->wc_sec_hi;
		wc = &shinfo->wc;
	} else
#endif
	{
		struct compat_shared_info *shinfo = gpc->khva;

		wc_sec_hi = &shinfo->arch.wc_sec_hi;
		wc = &shinfo->wc;
	}
	/* Increment and ensure an odd value */
	wc_version = wc->version = (wc->version + 1) | 1;
	smp_wmb();

	wc->nsec = do_div(wall_nsec, NSEC_PER_SEC);
	wc->sec = (u32)wall_nsec;
	*wc_sec_hi = wall_nsec >> 32;
	smp_wmb();

	wc->version = wc_version + 1;
	read_unlock_irq(&gpc->lock);

	kvm_make_all_cpus_request(kvm, KVM_REQ_MASTERCLOCK_UPDATE);

out:
	srcu_read_unlock(&kvm->srcu, idx);
	return ret;
}

void kvm_xen_inject_timer_irqs(struct kvm_vcpu *vcpu)
{
	if (atomic_read(&vcpu->arch.xen.timer_pending) > 0) {
		struct kvm_xen_evtchn e;

		e.vcpu_id = vcpu->vcpu_id;
		e.vcpu_idx = vcpu->vcpu_idx;
		e.port = vcpu->arch.xen.timer_virq;
		e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;

		kvm_xen_set_evtchn(&e, vcpu->kvm);

		vcpu->arch.xen.timer_expires = 0;
		atomic_set(&vcpu->arch.xen.timer_pending, 0);
	}
}

static enum hrtimer_restart xen_timer_callback(struct hrtimer *timer)
{
	struct kvm_vcpu *vcpu = container_of(timer, struct kvm_vcpu,
					     arch.xen.timer);
	struct kvm_xen_evtchn e;
	int rc;

	if (atomic_read(&vcpu->arch.xen.timer_pending))
		return HRTIMER_NORESTART;

	e.vcpu_id = vcpu->vcpu_id;
	e.vcpu_idx = vcpu->vcpu_idx;
	e.port = vcpu->arch.xen.timer_virq;
	e.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;

	rc = kvm_xen_set_evtchn_fast(&e, vcpu->kvm);
	if (rc != -EWOULDBLOCK) {
		vcpu->arch.xen.timer_expires = 0;
		return HRTIMER_NORESTART;
	}
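	/*
	 * -EWOULDBLOCK means the event channel couldn't be delivered
	 * without sleeping (e.g. a pfn cache needs refreshing), which is
	 * not possible in hrtimer context. Record the pending timer and
	 * kick the vCPU so that delivery happens from
	 * kvm_xen_inject_timer_irqs() in vCPU context instead.
	 */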

	atomic_inc(&vcpu->arch.xen.timer_pending);
	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
	kvm_vcpu_kick(vcpu);

	return HRTIMER_NORESTART;
}

static void kvm_xen_start_timer(struct kvm_vcpu *vcpu, u64 guest_abs,
				bool linux_wa)
{
	int64_t kernel_now, delta;
	uint64_t guest_now;

	/*
	 * The guest provides the requested timeout in absolute nanoseconds
	 * of the KVM clock — as *it* sees it, based on the scaled TSC and
	 * the pvclock information provided by KVM.
	 *
	 * The kernel doesn't support hrtimers based on CLOCK_MONOTONIC_RAW
	 * so use CLOCK_MONOTONIC. In the timescales covered by timers, the
	 * difference won't matter much as there is no cumulative effect.
	 *
	 * Calculate the time for some arbitrary point in time around "now"
	 * in terms of both kvmclock and CLOCK_MONOTONIC. Calculate the
	 * delta between the kvmclock "now" value and the guest's requested
	 * timeout, apply the "Linux workaround" described below, and add
	 * the resulting delta to the CLOCK_MONOTONIC "now" value, to get
	 * the absolute CLOCK_MONOTONIC time at which the timer should
	 * fire.
	 */
	if (vcpu->arch.hv_clock.version && vcpu->kvm->arch.use_master_clock &&
	    static_cpu_has(X86_FEATURE_CONSTANT_TSC)) {
		uint64_t host_tsc, guest_tsc;

		if (!IS_ENABLED(CONFIG_64BIT) ||
		    !kvm_get_monotonic_and_clockread(&kernel_now, &host_tsc)) {
			/*
			 * Don't fall back to get_kvmclock_ns() because it's
			 * broken; it has a systemic error in its results
			 * because it scales directly from host TSC to
			 * nanoseconds, and doesn't scale first to guest TSC
			 * and *then* to nanoseconds as the guest does.
			 *
			 * There is a small error introduced here because time
			 * continues to elapse between the ktime_get() and the
			 * subsequent rdtsc(). But not the systemic drift due
			 * to get_kvmclock_ns().
			 */
			kernel_now = ktime_get(); /* This is CLOCK_MONOTONIC */
			host_tsc = rdtsc();
		}

		/* Calculate the guest kvmclock as the guest would do it. */
		guest_tsc = kvm_read_l1_tsc(vcpu, host_tsc);
		guest_now = __pvclock_read_cycles(&vcpu->arch.hv_clock,
						  guest_tsc);
	} else {
		/*
		 * Without CONSTANT_TSC, get_kvmclock_ns() is the only option.
		 *
		 * Also if the guest PV clock hasn't been set up yet, as is
		 * likely to be the case during migration when the vCPU has
		 * not been run yet. It would be possible to calculate the
		 * scaling factors properly in that case but there's not much
		 * point in doing so. The get_kvmclock_ns() drift accumulates
		 * over time, so it's OK to use it at startup. Besides, on
		 * migration there's going to be a little bit of skew in the
		 * precise moment at which timers fire anyway. Often they'll
		 * be in the "past" by the time the VM is running again after
		 * migration.
		 */
		guest_now = get_kvmclock_ns(vcpu->kvm);
		kernel_now = ktime_get();
	}

	delta = guest_abs - guest_now;

	/*
	 * Xen has a 'Linux workaround' in do_set_timer_op() which checks for
	 * negative absolute timeout values (caused by integer overflow), and
	 * for values about 13 days in the future (2^50ns) which would be
	 * caused by jiffies overflow. For those cases, Xen sets the timeout
	 * 100ms in the future (not *too* soon, since if a guest really did
	 * set a long timeout on purpose we don't want to keep churning CPU
	 * time by waking it up). Emulate Xen's workaround when starting the
	 * timer in response to __HYPERVISOR_set_timer_op.
	 */
	if (linux_wa &&
	    unlikely((int64_t)guest_abs < 0 ||
		     (delta > 0 && (uint32_t) (delta >> 50) != 0))) {
		delta = 100 * NSEC_PER_MSEC;
		guest_abs = guest_now + delta;
	}

	/*
	 * Avoid races with the old timer firing. Checking timer_expires
	 * to avoid calling hrtimer_cancel() will only have false positives
	 * so is fine.
	 */
	if (vcpu->arch.xen.timer_expires)
		hrtimer_cancel(&vcpu->arch.xen.timer);

	atomic_set(&vcpu->arch.xen.timer_pending, 0);
	vcpu->arch.xen.timer_expires = guest_abs;

	if (delta <= 0)
		xen_timer_callback(&vcpu->arch.xen.timer);
	else
		hrtimer_start(&vcpu->arch.xen.timer,
			      ktime_add_ns(kernel_now, delta),
			      HRTIMER_MODE_ABS_HARD);
}

static void kvm_xen_stop_timer(struct kvm_vcpu *vcpu)
{
	hrtimer_cancel(&vcpu->arch.xen.timer);
	vcpu->arch.xen.timer_expires = 0;
	atomic_set(&vcpu->arch.xen.timer_pending, 0);
}

static void kvm_xen_init_timer(struct kvm_vcpu *vcpu)
{
	hrtimer_init(&vcpu->arch.xen.timer, CLOCK_MONOTONIC,
		     HRTIMER_MODE_ABS_HARD);
	vcpu->arch.xen.timer.function = xen_timer_callback;
}

static void kvm_xen_update_runstate_guest(struct kvm_vcpu *v, bool atomic)
{
	struct kvm_vcpu_xen *vx = &v->arch.xen;
	struct gfn_to_pfn_cache *gpc1 = &vx->runstate_cache;
	struct gfn_to_pfn_cache *gpc2 = &vx->runstate2_cache;
	size_t user_len, user_len1, user_len2;
	struct vcpu_runstate_info rs;
	unsigned long flags;
	size_t times_ofs;
	uint8_t *update_bit = NULL;
	uint64_t entry_time;
	uint64_t *rs_times;
	int *rs_state;

	/*
	 * The only difference between 32-bit and 64-bit versions of the
	 * runstate struct is the alignment of uint64_t in 32-bit, which
	 * means that the 64-bit version has an additional 4 bytes of
	 * padding after the first field 'state'. Let's be really really
	 * paranoid about that, and matching it with our internal data
	 * structures that we memcpy into it...
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) != 0);
	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state) != 0);
	BUILD_BUG_ON(sizeof(struct compat_vcpu_runstate_info) != 0x2c);
#ifdef CONFIG_X86_64
	/*
	 * The 64-bit structure has 4 bytes of padding before 'state_entry_time'
	 * so each subsequent field is shifted by 4, and it's 4 bytes longer.
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
		     offsetof(struct compat_vcpu_runstate_info, state_entry_time) + 4);
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, time) !=
		     offsetof(struct compat_vcpu_runstate_info, time) + 4);
	BUILD_BUG_ON(sizeof(struct vcpu_runstate_info) != 0x2c + 4);
#endif
	/*
	 * The state field is in the same place at the start of both structs,
	 * and is the same size (int) as vx->current_runstate.
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state) !=
		     offsetof(struct compat_vcpu_runstate_info, state));
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state) !=
		     sizeof(vx->current_runstate));
	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state) !=
		     sizeof(vx->current_runstate));

	/*
	 * The state_entry_time field is 64 bits in both versions, and the
	 * XEN_RUNSTATE_UPDATE flag is in the top bit, which given that x86
	 * is little-endian means that it's in the last *byte* of the word.
	 * That detail is important later.
	 */
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, state_entry_time) !=
		     sizeof(uint64_t));
	BUILD_BUG_ON(sizeof_field(struct compat_vcpu_runstate_info, state_entry_time) !=
		     sizeof(uint64_t));
	BUILD_BUG_ON((XEN_RUNSTATE_UPDATE >> 56) != 0x80);

	/*
	 * The time array is four 64-bit quantities in both versions, matching
	 * the vx->runstate_times and immediately following state_entry_time.
	 */
	BUILD_BUG_ON(offsetof(struct vcpu_runstate_info, state_entry_time) !=
		     offsetof(struct vcpu_runstate_info, time) - sizeof(uint64_t));
	BUILD_BUG_ON(offsetof(struct compat_vcpu_runstate_info, state_entry_time) !=
		     offsetof(struct compat_vcpu_runstate_info, time) - sizeof(uint64_t));
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
		     sizeof_field(struct compat_vcpu_runstate_info, time));
	BUILD_BUG_ON(sizeof_field(struct vcpu_runstate_info, time) !=
		     sizeof(vx->runstate_times));

	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
		user_len = sizeof(struct vcpu_runstate_info);
		times_ofs = offsetof(struct vcpu_runstate_info,
				     state_entry_time);
	} else {
		user_len = sizeof(struct compat_vcpu_runstate_info);
		times_ofs = offsetof(struct compat_vcpu_runstate_info,
				     state_entry_time);
	}

	/*
	 * There are basically no alignment constraints. The guest can set it
	 * up so it crosses from one page to the next, and at arbitrary byte
	 * alignment (and the 32-bit ABI doesn't align the 64-bit integers
	 * anyway, even if the overall struct had been 64-bit aligned).
	 */
	if ((gpc1->gpa & ~PAGE_MASK) + user_len >= PAGE_SIZE) {
		user_len1 = PAGE_SIZE - (gpc1->gpa & ~PAGE_MASK);
		user_len2 = user_len - user_len1;
	} else {
		user_len1 = user_len;
		user_len2 = 0;
	}
	BUG_ON(user_len1 + user_len2 != user_len);
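	/*
	 * For example, a 0x2c-byte compat struct placed at page offset
	 * 0xff0 gives user_len1 = 0x10 on the first page and
	 * user_len2 = 0x1c spilling onto the second page, and the second
	 * GPC (gpc2) is then needed to cover the remainder.
	 */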

 retry:
	/*
	 * Attempt to obtain the GPC lock on *both* (if there are two)
	 * gfn_to_pfn caches that cover the region.
	 */
	if (atomic) {
		local_irq_save(flags);
		if (!read_trylock(&gpc1->lock)) {
			local_irq_restore(flags);
			return;
		}
	} else {
		read_lock_irqsave(&gpc1->lock, flags);
	}
	while (!kvm_gpc_check(gpc1, user_len1)) {
		read_unlock_irqrestore(&gpc1->lock, flags);

		/* When invoked from kvm_sched_out() we cannot sleep */
		if (atomic)
			return;

		if (kvm_gpc_refresh(gpc1, user_len1))
			return;

		read_lock_irqsave(&gpc1->lock, flags);
	}

	if (likely(!user_len2)) {
		/*
		 * Set up three pointers directly to the runstate_info
		 * struct in the guest (via the GPC).
		 *
		 *  • @rs_state → state field
		 *  • @rs_times → state_entry_time field.
		 *  • @update_bit → last byte of state_entry_time, which
		 *    contains the XEN_RUNSTATE_UPDATE bit.
		 */
		rs_state = gpc1->khva;
		rs_times = gpc1->khva + times_ofs;
		if (v->kvm->arch.xen.runstate_update_flag)
			update_bit = ((void *)(&rs_times[1])) - 1;
	} else {
		/*
		 * The guest's runstate_info is split across two pages and we
		 * need to hold and validate both GPCs simultaneously. We can
		 * declare a lock ordering GPC1 > GPC2 because nothing else
		 * takes them more than one at a time. Set a subclass on the
		 * gpc1 lock to make lockdep shut up about it.
		 */
		lock_set_subclass(&gpc1->lock.dep_map, 1, _THIS_IP_);
		if (atomic) {
			if (!read_trylock(&gpc2->lock)) {
				read_unlock_irqrestore(&gpc1->lock, flags);
				return;
			}
		} else {
			read_lock(&gpc2->lock);
		}

		if (!kvm_gpc_check(gpc2, user_len2)) {
			read_unlock(&gpc2->lock);
			read_unlock_irqrestore(&gpc1->lock, flags);

			/* When invoked from kvm_sched_out() we cannot sleep */
			if (atomic)
				return;

			/*
			 * Use kvm_gpc_activate() here because if the runstate
			 * area was configured in 32-bit mode and only extends
			 * to the second page now because the guest changed to
			 * 64-bit mode, the second GPC won't have been set up.
			 */
			if (kvm_gpc_activate(gpc2, gpc1->gpa + user_len1,
					     user_len2))
				return;

			/*
			 * We dropped the lock on GPC1 so we have to go all the
			 * way back and revalidate that too.
			 */
			goto retry;
		}

		/*
		 * In this case, the runstate_info struct will be assembled on
		 * the kernel stack (compat or not as appropriate) and will
		 * be copied to GPC1/GPC2 with a dual memcpy. Set up the three
		 * rs pointers accordingly.
		 */
		rs_times = &rs.state_entry_time;

		/*
		 * The rs_state pointer points to the start of what we'll
		 * copy to the guest, which in the case of a compat guest
		 * is the 32-bit field that the compiler thinks is padding.
		 */
		rs_state = ((void *)rs_times) - times_ofs;

		/*
		 * The update_bit is still directly in the guest memory,
		 * via one GPC or the other.
		 */
		if (v->kvm->arch.xen.runstate_update_flag) {
			if (user_len1 >= times_ofs + sizeof(uint64_t))
				update_bit = gpc1->khva + times_ofs +
					sizeof(uint64_t) - 1;
			else
				update_bit = gpc2->khva + times_ofs +
					sizeof(uint64_t) - 1 - user_len1;
		}

#ifdef CONFIG_X86_64
		/*
		 * Don't leak kernel memory through the padding in the 64-bit
		 * version of the struct.
		 */
		memset(&rs, 0, offsetof(struct vcpu_runstate_info, state_entry_time));
#endif
	}

	/*
	 * First, set the XEN_RUNSTATE_UPDATE bit in the top bit of the
	 * state_entry_time field, directly in the guest. We need to set
	 * that (and write-barrier) before writing to the rest of the
	 * structure, and clear it last. Just as Xen does, we address the
	 * single *byte* in which it resides because it might be in a
	 * different cache line to the rest of the 64-bit word, due to
	 * the (lack of) alignment constraints.
	 */
	entry_time = vx->runstate_entry_time;
	if (update_bit) {
		entry_time |= XEN_RUNSTATE_UPDATE;
		*update_bit = (vx->runstate_entry_time | XEN_RUNSTATE_UPDATE) >> 56;
		smp_wmb();
	}

	/*
	 * Now assemble the actual structure, either on our kernel stack
	 * or directly in the guest according to how the rs_state and
	 * rs_times pointers were set up above.
	 */
	*rs_state = vx->current_runstate;
	rs_times[0] = entry_time;
	memcpy(rs_times + 1, vx->runstate_times, sizeof(vx->runstate_times));

	/* For the split case, we have to then copy it to the guest. */
	if (user_len2) {
		memcpy(gpc1->khva, rs_state, user_len1);
		memcpy(gpc2->khva, ((void *)rs_state) + user_len1, user_len2);
	}
	smp_wmb();

	/* Finally, clear the XEN_RUNSTATE_UPDATE bit. */
	if (update_bit) {
		entry_time &= ~XEN_RUNSTATE_UPDATE;
		*update_bit = entry_time >> 56;
		smp_wmb();
	}

	if (user_len2) {
		kvm_gpc_mark_dirty_in_slot(gpc2);
		read_unlock(&gpc2->lock);
	}

	kvm_gpc_mark_dirty_in_slot(gpc1);
	read_unlock_irqrestore(&gpc1->lock, flags);
}

void kvm_xen_update_runstate(struct kvm_vcpu *v, int state)
{
	struct kvm_vcpu_xen *vx = &v->arch.xen;
	u64 now = get_kvmclock_ns(v->kvm);
	u64 delta_ns = now - vx->runstate_entry_time;
	u64 run_delay = current->sched_info.run_delay;

	if (unlikely(!vx->runstate_entry_time))
		vx->current_runstate = RUNSTATE_offline;

	/*
	 * Time waiting for the scheduler isn't "stolen" if the
	 * vCPU wasn't running anyway.
	 */
	if (vx->current_runstate == RUNSTATE_running) {
		u64 steal_ns = run_delay - vx->last_steal;

		delta_ns -= steal_ns;

		vx->runstate_times[RUNSTATE_runnable] += steal_ns;
	}
	vx->last_steal = run_delay;

	vx->runstate_times[vx->current_runstate] += delta_ns;
	vx->current_runstate = state;
	vx->runstate_entry_time = now;

	if (vx->runstate_cache.active)
		kvm_xen_update_runstate_guest(v, state == RUNSTATE_runnable);
}

void kvm_xen_inject_vcpu_vector(struct kvm_vcpu *v)
{
	struct kvm_lapic_irq irq = { };

	irq.dest_id = v->vcpu_id;
	irq.vector = v->arch.xen.upcall_vector;
	irq.dest_mode = APIC_DEST_PHYSICAL;
	irq.shorthand = APIC_DEST_NOSHORT;
	irq.delivery_mode = APIC_DM_FIXED;
	irq.level = 1;

	kvm_irq_delivery_to_apic(v->kvm, NULL, &irq, NULL);
}

/*
 * On event channel delivery, the vcpu_info may not have been accessible.
 * In that case, there are bits in vcpu->arch.xen.evtchn_pending_sel which
 * need to be marked into the vcpu_info (and evtchn_upcall_pending set).
 * Do so now that we can sleep in the context of the vCPU to bring the
 * page in, and refresh the pfn cache for it.
 */
void kvm_xen_inject_pending_events(struct kvm_vcpu *v)
{
	unsigned long evtchn_pending_sel = READ_ONCE(v->arch.xen.evtchn_pending_sel);
	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
	unsigned long flags;

	if (!evtchn_pending_sel)
		return;

	/*
	 * Yes, this is an open-coded loop. But that's just what put_user()
	 * does anyway. Page it in and retry the instruction. We're just a
	 * little more honest about it.
	 */
	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
		read_unlock_irqrestore(&gpc->lock, flags);

		if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info)))
			return;

		read_lock_irqsave(&gpc->lock, flags);
	}

	/* Now gpc->khva is a valid kernel address for the vcpu_info */
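	/*
	 * Atomically transfer the pending bits: OR the locally cached
	 * selector bits into the guest's evtchn_pending_sel, then clear
	 * exactly those bits from the kernel-side shadow so they can't be
	 * delivered a second time. The or/not/and sequences below do this
	 * with LOCK-prefixed instructions, in the width appropriate to the
	 * guest ABI.
	 */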
	if (IS_ENABLED(CONFIG_64BIT) && v->kvm->arch.xen.long_mode) {
		struct vcpu_info *vi = gpc->khva;

		asm volatile(LOCK_PREFIX "orq %0, %1\n"
			     "notq %0\n"
			     LOCK_PREFIX "andq %0, %2\n"
			     : "=r" (evtchn_pending_sel),
			       "+m" (vi->evtchn_pending_sel),
			       "+m" (v->arch.xen.evtchn_pending_sel)
			     : "0" (evtchn_pending_sel));
		WRITE_ONCE(vi->evtchn_upcall_pending, 1);
	} else {
		u32 evtchn_pending_sel32 = evtchn_pending_sel;
		struct compat_vcpu_info *vi = gpc->khva;

		asm volatile(LOCK_PREFIX "orl %0, %1\n"
			     "notl %0\n"
			     LOCK_PREFIX "andl %0, %2\n"
			     : "=r" (evtchn_pending_sel32),
			       "+m" (vi->evtchn_pending_sel),
			       "+m" (v->arch.xen.evtchn_pending_sel)
			     : "0" (evtchn_pending_sel32));
		WRITE_ONCE(vi->evtchn_upcall_pending, 1);
	}

	kvm_gpc_mark_dirty_in_slot(gpc);
	read_unlock_irqrestore(&gpc->lock, flags);

	/* For the per-vCPU lapic vector, deliver it as MSI. */
	if (v->arch.xen.upcall_vector)
		kvm_xen_inject_vcpu_vector(v);
}

int __kvm_xen_has_interrupt(struct kvm_vcpu *v)
{
	struct gfn_to_pfn_cache *gpc = &v->arch.xen.vcpu_info_cache;
	unsigned long flags;
	u8 rc = 0;

	/*
	 * If the global upcall vector (HVMIRQ_callback_vector) is set and
	 * the vCPU's evtchn_upcall_pending flag is set, the IRQ is pending.
	 */

	/* No need for compat handling here */
	BUILD_BUG_ON(offsetof(struct vcpu_info, evtchn_upcall_pending) !=
		     offsetof(struct compat_vcpu_info, evtchn_upcall_pending));
	BUILD_BUG_ON(sizeof(rc) !=
		     sizeof_field(struct vcpu_info, evtchn_upcall_pending));
	BUILD_BUG_ON(sizeof(rc) !=
		     sizeof_field(struct compat_vcpu_info, evtchn_upcall_pending));

	read_lock_irqsave(&gpc->lock, flags);
	while (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
		read_unlock_irqrestore(&gpc->lock, flags);

		/*
		 * This function gets called from kvm_vcpu_block() after setting the
		 * task to TASK_INTERRUPTIBLE, to see if it needs to wake immediately
		 * from a HLT. So we really mustn't sleep. If the page ended up absent
		 * at that point, just return 1 in order to trigger an immediate wake,
		 * and we'll end up getting called again from a context where we *can*
		 * fault in the page and wait for it.
		 */
		if (in_atomic() || !task_is_running(current))
			return 1;

		if (kvm_gpc_refresh(gpc, sizeof(struct vcpu_info))) {
			/*
			 * If this failed, userspace has screwed up the
			 * vcpu_info mapping. No interrupts for you.
			 */
			return 0;
		}
		read_lock_irqsave(&gpc->lock, flags);
	}

	rc = ((struct vcpu_info *)gpc->khva)->evtchn_upcall_pending;
	read_unlock_irqrestore(&gpc->lock, flags);
	return rc;
}

int kvm_xen_hvm_set_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
	int r = -ENOENT;

	switch (data->type) {
	case KVM_XEN_ATTR_TYPE_LONG_MODE:
		if (!IS_ENABLED(CONFIG_64BIT) && data->u.long_mode) {
			r = -EINVAL;
		} else {
			mutex_lock(&kvm->arch.xen.xen_lock);
			kvm->arch.xen.long_mode = !!data->u.long_mode;

			/*
			 * Re-initialize shared_info to put the wallclock in the
			 * correct place. Whilst it's not necessary to do this
			 * unless the mode is actually changed, it does no harm
			 * to make the call anyway.
			 */
			r = kvm->arch.xen.shinfo_cache.active ?
				kvm_xen_shared_info_init(kvm) : 0;
			mutex_unlock(&kvm->arch.xen.xen_lock);
		}
		break;

	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
	case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA: {
		int idx;

		mutex_lock(&kvm->arch.xen.xen_lock);

		idx = srcu_read_lock(&kvm->srcu);

		if (data->type == KVM_XEN_ATTR_TYPE_SHARED_INFO) {
			gfn_t gfn = data->u.shared_info.gfn;

			if (gfn == KVM_XEN_INVALID_GFN) {
				kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
				r = 0;
			} else {
				r = kvm_gpc_activate(&kvm->arch.xen.shinfo_cache,
						     gfn_to_gpa(gfn), PAGE_SIZE);
			}
		} else {
			void __user *hva = u64_to_user_ptr(data->u.shared_info.hva);

			if (!PAGE_ALIGNED(hva) || !access_ok(hva, PAGE_SIZE)) {
				r = -EINVAL;
			} else if (!hva) {
				kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);
				r = 0;
			} else {
				r = kvm_gpc_activate_hva(&kvm->arch.xen.shinfo_cache,
							 (unsigned long)hva, PAGE_SIZE);
			}
		}

		srcu_read_unlock(&kvm->srcu, idx);

		if (!r && kvm->arch.xen.shinfo_cache.active)
			r = kvm_xen_shared_info_init(kvm);

		mutex_unlock(&kvm->arch.xen.xen_lock);
		break;
	}
	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
		if (data->u.vector && data->u.vector < 0x10)
			r = -EINVAL;
		else {
			mutex_lock(&kvm->arch.xen.xen_lock);
			kvm->arch.xen.upcall_vector = data->u.vector;
			mutex_unlock(&kvm->arch.xen.xen_lock);
			r = 0;
		}
		break;

	case KVM_XEN_ATTR_TYPE_EVTCHN:
		r = kvm_xen_setattr_evtchn(kvm, data);
		break;

	case KVM_XEN_ATTR_TYPE_XEN_VERSION:
		mutex_lock(&kvm->arch.xen.xen_lock);
		kvm->arch.xen.xen_version = data->u.xen_version;
		mutex_unlock(&kvm->arch.xen.xen_lock);
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		mutex_lock(&kvm->arch.xen.xen_lock);
		kvm->arch.xen.runstate_update_flag = !!data->u.runstate_update_flag;
		mutex_unlock(&kvm->arch.xen.xen_lock);
		r = 0;
		break;

	default:
		break;
	}

	return r;
}

int kvm_xen_hvm_get_attr(struct kvm *kvm, struct kvm_xen_hvm_attr *data)
{
	int r = -ENOENT;

	mutex_lock(&kvm->arch.xen.xen_lock);

	switch (data->type) {
	case KVM_XEN_ATTR_TYPE_LONG_MODE:
		data->u.long_mode = kvm->arch.xen.long_mode;
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_SHARED_INFO:
		if (kvm_gpc_is_gpa_active(&kvm->arch.xen.shinfo_cache))
			data->u.shared_info.gfn = gpa_to_gfn(kvm->arch.xen.shinfo_cache.gpa);
		else
			data->u.shared_info.gfn = KVM_XEN_INVALID_GFN;
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_SHARED_INFO_HVA:
		if (kvm_gpc_is_hva_active(&kvm->arch.xen.shinfo_cache))
			data->u.shared_info.hva = kvm->arch.xen.shinfo_cache.uhva;
		else
			data->u.shared_info.hva = 0;
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_UPCALL_VECTOR:
		data->u.vector = kvm->arch.xen.upcall_vector;
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_XEN_VERSION:
		data->u.xen_version = kvm->arch.xen.xen_version;
		r = 0;
		break;

	case KVM_XEN_ATTR_TYPE_RUNSTATE_UPDATE_FLAG:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		data->u.runstate_update_flag = kvm->arch.xen.runstate_update_flag;
		r = 0;
		break;

	default:
		break;
	}

	mutex_unlock(&kvm->arch.xen.xen_lock);
	return r;
}

int kvm_xen_vcpu_set_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
	int idx, r = -ENOENT;

	mutex_lock(&vcpu->kvm->arch.xen.xen_lock);
	idx = srcu_read_lock(&vcpu->kvm->srcu);

	switch (data->type) {
	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
		/* No compat necessary here. */
		BUILD_BUG_ON(sizeof(struct vcpu_info) !=
			     sizeof(struct compat_vcpu_info));
		BUILD_BUG_ON(offsetof(struct vcpu_info, time) !=
			     offsetof(struct compat_vcpu_info, time));

		if (data->type == KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO) {
			if (data->u.gpa == KVM_XEN_INVALID_GPA) {
				kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
				r = 0;
				break;
			}

			r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_info_cache,
					     data->u.gpa, sizeof(struct vcpu_info));
		} else {
			if (data->u.hva == 0) {
				kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
				r = 0;
				break;
			}

			r = kvm_gpc_activate_hva(&vcpu->arch.xen.vcpu_info_cache,
						 data->u.hva, sizeof(struct vcpu_info));
		}

		if (!r)
			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);

		break;

	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
		if (data->u.gpa == KVM_XEN_INVALID_GPA) {
			kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);
			r = 0;
			break;
		}

		r = kvm_gpc_activate(&vcpu->arch.xen.vcpu_time_info_cache,
				     data->u.gpa,
				     sizeof(struct pvclock_vcpu_time_info));
		if (!r)
			kvm_make_request(KVM_REQ_CLOCK_UPDATE, vcpu);
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR: {
		size_t sz, sz1, sz2;

		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (data->u.gpa == KVM_XEN_INVALID_GPA) {
			r = 0;
		deactivate_out:
			kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
			kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
			break;
		}

		/*
		 * If the guest switches to 64-bit mode after setting the runstate
		 * address, that's actually OK. kvm_xen_update_runstate_guest()
		 * will cope.
		 */
		if (IS_ENABLED(CONFIG_64BIT) && vcpu->kvm->arch.xen.long_mode)
			sz = sizeof(struct vcpu_runstate_info);
		else
			sz = sizeof(struct compat_vcpu_runstate_info);

		/* How much fits in the (first) page? */
		sz1 = PAGE_SIZE - (data->u.gpa & ~PAGE_MASK);
		r = kvm_gpc_activate(&vcpu->arch.xen.runstate_cache,
				     data->u.gpa, sz1);
		if (r)
			goto deactivate_out;

		/* Either map the second page, or deactivate the second GPC */
		if (sz1 >= sz) {
			kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
		} else {
			sz2 = sz - sz1;
			BUG_ON((data->u.gpa + sz1) & ~PAGE_MASK);
			r = kvm_gpc_activate(&vcpu->arch.xen.runstate2_cache,
					     data->u.gpa + sz1, sz2);
			if (r)
				goto deactivate_out;
		}

		kvm_xen_update_runstate_guest(vcpu, false);
		break;
	}
	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (data->u.runstate.state > RUNSTATE_offline) {
			r = -EINVAL;
			break;
		}

		kvm_xen_update_runstate(vcpu, data->u.runstate.state);
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (data->u.runstate.state > RUNSTATE_offline) {
			r = -EINVAL;
			break;
		}
		if (data->u.runstate.state_entry_time !=
		    (data->u.runstate.time_running +
		     data->u.runstate.time_runnable +
		     data->u.runstate.time_blocked +
		     data->u.runstate.time_offline)) {
			r = -EINVAL;
			break;
		}
		if (get_kvmclock_ns(vcpu->kvm) <
		    data->u.runstate.state_entry_time) {
			r = -EINVAL;
			break;
		}

		vcpu->arch.xen.current_runstate = data->u.runstate.state;
		vcpu->arch.xen.runstate_entry_time =
			data->u.runstate.state_entry_time;
		vcpu->arch.xen.runstate_times[RUNSTATE_running] =
			data->u.runstate.time_running;
		vcpu->arch.xen.runstate_times[RUNSTATE_runnable] =
			data->u.runstate.time_runnable;
		vcpu->arch.xen.runstate_times[RUNSTATE_blocked] =
			data->u.runstate.time_blocked;
		vcpu->arch.xen.runstate_times[RUNSTATE_offline] =
			data->u.runstate.time_offline;
		vcpu->arch.xen.last_steal = current->sched_info.run_delay;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (data->u.runstate.state > RUNSTATE_offline &&
		    data->u.runstate.state != (u64)-1) {
			r = -EINVAL;
			break;
		}
		/* The adjustment must add up */
		if (data->u.runstate.state_entry_time !=
		    (data->u.runstate.time_running +
		     data->u.runstate.time_runnable +
		     data->u.runstate.time_blocked +
		     data->u.runstate.time_offline)) {
			r = -EINVAL;
			break;
		}

		if (get_kvmclock_ns(vcpu->kvm) <
		    (vcpu->arch.xen.runstate_entry_time +
		     data->u.runstate.state_entry_time)) {
			r = -EINVAL;
			break;
		}

		vcpu->arch.xen.runstate_entry_time +=
			data->u.runstate.state_entry_time;
		vcpu->arch.xen.runstate_times[RUNSTATE_running] +=
			data->u.runstate.time_running;
		vcpu->arch.xen.runstate_times[RUNSTATE_runnable] +=
			data->u.runstate.time_runnable;
		vcpu->arch.xen.runstate_times[RUNSTATE_blocked] +=
			data->u.runstate.time_blocked;
		vcpu->arch.xen.runstate_times[RUNSTATE_offline] +=
			data->u.runstate.time_offline;

		if (data->u.runstate.state <= RUNSTATE_offline)
			kvm_xen_update_runstate(vcpu, data->u.runstate.state);
		else if (vcpu->arch.xen.runstate_cache.active)
			kvm_xen_update_runstate_guest(vcpu, false);
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
		if (data->u.vcpu_id >= KVM_MAX_VCPUS)
			r = -EINVAL;
		else {
			vcpu->arch.xen.vcpu_id = data->u.vcpu_id;
			r = 0;
		}
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
		if (data->u.timer.port &&
		    data->u.timer.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) {
			r = -EINVAL;
			break;
		}

		if (!vcpu->arch.xen.timer.function)
			kvm_xen_init_timer(vcpu);

		/* Stop the timer (if it's running) before changing the vector */
		kvm_xen_stop_timer(vcpu);
		vcpu->arch.xen.timer_virq = data->u.timer.port;

		/* Start the timer if the new value has a valid vector+expiry. */
		if (data->u.timer.port && data->u.timer.expires_ns)
			kvm_xen_start_timer(vcpu, data->u.timer.expires_ns, false);

		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
		if (data->u.vector && data->u.vector < 0x10)
			r = -EINVAL;
		else {
			vcpu->arch.xen.upcall_vector = data->u.vector;
			r = 0;
		}
		break;

	default:
		break;
	}

	srcu_read_unlock(&vcpu->kvm->srcu, idx);
	mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
	return r;
}

int kvm_xen_vcpu_get_attr(struct kvm_vcpu *vcpu, struct kvm_xen_vcpu_attr *data)
{
	int r = -ENOENT;

	mutex_lock(&vcpu->kvm->arch.xen.xen_lock);

	switch (data->type) {
	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO:
		if (kvm_gpc_is_gpa_active(&vcpu->arch.xen.vcpu_info_cache))
			data->u.gpa = vcpu->arch.xen.vcpu_info_cache.gpa;
		else
			data->u.gpa = KVM_XEN_INVALID_GPA;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_INFO_HVA:
		if (kvm_gpc_is_hva_active(&vcpu->arch.xen.vcpu_info_cache))
			data->u.hva = vcpu->arch.xen.vcpu_info_cache.uhva;
		else
			data->u.hva = 0;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_TIME_INFO:
		if (vcpu->arch.xen.vcpu_time_info_cache.active)
			data->u.gpa = vcpu->arch.xen.vcpu_time_info_cache.gpa;
		else
			data->u.gpa = KVM_XEN_INVALID_GPA;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADDR:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		if (vcpu->arch.xen.runstate_cache.active) {
			data->u.gpa = vcpu->arch.xen.runstate_cache.gpa;
			r = 0;
		}
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_CURRENT:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		data->u.runstate.state = vcpu->arch.xen.current_runstate;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_DATA:
		if (!sched_info_on()) {
			r = -EOPNOTSUPP;
			break;
		}
		data->u.runstate.state = vcpu->arch.xen.current_runstate;
		data->u.runstate.state_entry_time =
			vcpu->arch.xen.runstate_entry_time;
		data->u.runstate.time_running =
			vcpu->arch.xen.runstate_times[RUNSTATE_running];
		data->u.runstate.time_runnable =
			vcpu->arch.xen.runstate_times[RUNSTATE_runnable];
		data->u.runstate.time_blocked =
			vcpu->arch.xen.runstate_times[RUNSTATE_blocked];
		data->u.runstate.time_offline =
			vcpu->arch.xen.runstate_times[RUNSTATE_offline];
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_RUNSTATE_ADJUST:
		r = -EINVAL;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_VCPU_ID:
		data->u.vcpu_id = vcpu->arch.xen.vcpu_id;
		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_TIMER:
		/*
		 * Ensure a consistent snapshot of state is captured, with a
		 * timer either being pending, or the event channel delivered
		 * to the corresponding bit in the shared_info. Not still
		 * lurking in the timer_pending flag for deferred delivery.
		 * Purely as an optimisation, if the timer_expires field is
		 * zero, that means the timer isn't active (or even in the
		 * timer_pending flag) and there is no need to cancel it.
		 */
		if (vcpu->arch.xen.timer_expires) {
			hrtimer_cancel(&vcpu->arch.xen.timer);
			kvm_xen_inject_timer_irqs(vcpu);
		}

		data->u.timer.port = vcpu->arch.xen.timer_virq;
		data->u.timer.priority = KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL;
		data->u.timer.expires_ns = vcpu->arch.xen.timer_expires;

		/*
		 * The hrtimer may trigger and raise the IRQ immediately,
		 * while the returned state causes it to be set up and
		 * raised again on the destination system after migration.
		 * That's fine, as the guest won't even have had a chance
		 * to run and handle the interrupt. Asserting an already
		 * pending event channel is idempotent.
		 */
		if (vcpu->arch.xen.timer_expires)
			hrtimer_start_expires(&vcpu->arch.xen.timer,
					      HRTIMER_MODE_ABS_HARD);

		r = 0;
		break;

	case KVM_XEN_VCPU_ATTR_TYPE_UPCALL_VECTOR:
		data->u.vector = vcpu->arch.xen.upcall_vector;
		r = 0;
		break;

	default:
		break;
	}

	mutex_unlock(&vcpu->kvm->arch.xen.xen_lock);
	return r;
}

int kvm_xen_write_hypercall_page(struct kvm_vcpu *vcpu, u64 data)
{
	struct kvm *kvm = vcpu->kvm;
	u32 page_num = data & ~PAGE_MASK;
	u64 page_addr = data & PAGE_MASK;
	bool lm = is_long_mode(vcpu);
	int r = 0;

	mutex_lock(&kvm->arch.xen.xen_lock);
	if (kvm->arch.xen.long_mode != lm) {
		kvm->arch.xen.long_mode = lm;

		/*
		 * Re-initialize shared_info to put the wallclock in the
		 * correct place.
		 */
		if (kvm->arch.xen.shinfo_cache.active &&
		    kvm_xen_shared_info_init(kvm))
			r = 1;
	}
	mutex_unlock(&kvm->arch.xen.xen_lock);

	if (r)
		return r;

	/*
	 * If Xen hypercall intercept is enabled, fill the hypercall
	 * page with VMCALL/VMMCALL instructions since that's what
	 * we catch. Else the VMM has provided the hypercall pages
	 * with instructions of its own choosing, so use those.
	 */
	if (kvm_xen_hypercall_enabled(kvm)) {
		u8 instructions[32];
		int i;

		if (page_num)
			return 1;

		/* mov imm32, %eax */
		instructions[0] = 0xb8;

		/* vmcall / vmmcall */
		static_call(kvm_x86_patch_hypercall)(vcpu, instructions + 5);

		/* ret */
		instructions[8] = 0xc3;

		/* int3 to pad */
		memset(instructions + 9, 0xcc, sizeof(instructions) - 9);
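		/*
		 * Each slot in the hypercall page is thus a 32-byte stub at
		 * offset i * 32: 0xb8 followed by the 32-bit hypercall
		 * number i (mov $i, %eax), a 3-byte VMCALL/VMMCALL, 0xc3
		 * (ret), padded to 32 bytes with 0xcc (int3). The loop below
		 * writes one such stub for each of the PAGE_SIZE / 32 = 128
		 * slots.
		 */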

		for (i = 0; i < PAGE_SIZE / sizeof(instructions); i++) {
			*(u32 *)&instructions[1] = i;
			if (kvm_vcpu_write_guest(vcpu,
						 page_addr + (i * sizeof(instructions)),
						 instructions, sizeof(instructions)))
				return 1;
		}
	} else {
		/*
		 * Note, truncation is a non-issue as 'lm' is guaranteed to be
		 * false for a 32-bit kernel, i.e. when hva_t is only 4 bytes.
		 */
		hva_t blob_addr = lm ? kvm->arch.xen_hvm_config.blob_addr_64
				     : kvm->arch.xen_hvm_config.blob_addr_32;
		u8 blob_size = lm ? kvm->arch.xen_hvm_config.blob_size_64
				  : kvm->arch.xen_hvm_config.blob_size_32;
		u8 *page;
		int ret;

		if (page_num >= blob_size)
			return 1;

		blob_addr += page_num * PAGE_SIZE;

		page = memdup_user((u8 __user *)blob_addr, PAGE_SIZE);
		if (IS_ERR(page))
			return PTR_ERR(page);

		ret = kvm_vcpu_write_guest(vcpu, page_addr, page, PAGE_SIZE);
		kfree(page);
		if (ret)
			return 1;
	}
	return 0;
}

int kvm_xen_hvm_config(struct kvm *kvm, struct kvm_xen_hvm_config *xhc)
{
	/* Only some feature flags need to be *enabled* by userspace */
	u32 permitted_flags = KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL |
			      KVM_XEN_HVM_CONFIG_EVTCHN_SEND |
			      KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE;
	u32 old_flags;

	if (xhc->flags & ~permitted_flags)
		return -EINVAL;

	/*
	 * With hypercall interception the kernel generates its own
	 * hypercall page so it must not be provided.
	 */
	if ((xhc->flags & KVM_XEN_HVM_CONFIG_INTERCEPT_HCALL) &&
	    (xhc->blob_addr_32 || xhc->blob_addr_64 ||
	     xhc->blob_size_32 || xhc->blob_size_64))
		return -EINVAL;

	mutex_lock(&kvm->arch.xen.xen_lock);

	if (xhc->msr && !kvm->arch.xen_hvm_config.msr)
		static_branch_inc(&kvm_xen_enabled.key);
	else if (!xhc->msr && kvm->arch.xen_hvm_config.msr)
		static_branch_slow_dec_deferred(&kvm_xen_enabled);

	old_flags = kvm->arch.xen_hvm_config.flags;
	memcpy(&kvm->arch.xen_hvm_config, xhc, sizeof(*xhc));

	mutex_unlock(&kvm->arch.xen.xen_lock);

	if ((old_flags ^ xhc->flags) & KVM_XEN_HVM_CONFIG_PVCLOCK_TSC_UNSTABLE)
		kvm_make_all_cpus_request(kvm, KVM_REQ_CLOCK_UPDATE);

	return 0;
}

static int kvm_xen_hypercall_set_result(struct kvm_vcpu *vcpu, u64 result)
{
	kvm_rax_write(vcpu, result);
	return kvm_skip_emulated_instruction(vcpu);
}

static int kvm_xen_hypercall_complete_userspace(struct kvm_vcpu *vcpu)
{
	struct kvm_run *run = vcpu->run;

	if (unlikely(!kvm_is_linear_rip(vcpu, vcpu->arch.xen.hypercall_rip)))
		return 1;

	return kvm_xen_hypercall_set_result(vcpu, run->xen.u.hcall.result);
}

static inline int max_evtchn_port(struct kvm *kvm)
{
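	/*
	 * The 2-level event channel ABI tracks "selector bits" × "bits per
	 * word" channels: with 64-bit guest longs that is 64 * 64 = 4096
	 * ports (EVTCHN_2L_NR_CHANNELS), and with 32-bit guest longs it is
	 * 32 * 32 = 1024 (COMPAT_EVTCHN_2L_NR_CHANNELS).
	 */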
	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode)
		return EVTCHN_2L_NR_CHANNELS;
	else
		return COMPAT_EVTCHN_2L_NR_CHANNELS;
}

static bool wait_pending_event(struct kvm_vcpu *vcpu, int nr_ports,
			       evtchn_port_t *ports)
{
	struct kvm *kvm = vcpu->kvm;
	struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache;
	unsigned long *pending_bits;
	unsigned long flags;
	bool ret = true;
	int idx, i;

	idx = srcu_read_lock(&kvm->srcu);
	read_lock_irqsave(&gpc->lock, flags);
	if (!kvm_gpc_check(gpc, PAGE_SIZE))
		goto out_rcu;

	ret = false;
	if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) {
		struct shared_info *shinfo = gpc->khva;
		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
	} else {
		struct compat_shared_info *shinfo = gpc->khva;
		pending_bits = (unsigned long *)&shinfo->evtchn_pending;
	}

	for (i = 0; i < nr_ports; i++) {
		if (test_bit(ports[i], pending_bits)) {
			ret = true;
			break;
		}
	}

 out_rcu:
	read_unlock_irqrestore(&gpc->lock, flags);
	srcu_read_unlock(&kvm->srcu, idx);

	return ret;
}

static bool kvm_xen_schedop_poll(struct kvm_vcpu *vcpu, bool longmode,
				 u64 param, u64 *r)
{
	struct sched_poll sched_poll;
	evtchn_port_t port, *ports;
	struct x86_exception e;
	int i;

	if (!lapic_in_kernel(vcpu) ||
	    !(vcpu->kvm->arch.xen_hvm_config.flags & KVM_XEN_HVM_CONFIG_EVTCHN_SEND))
		return false;

	if (IS_ENABLED(CONFIG_64BIT) && !longmode) {
		struct compat_sched_poll sp32;

		/* Sanity check that the compat struct definition is correct */
		BUILD_BUG_ON(sizeof(sp32) != 16);

		if (kvm_read_guest_virt(vcpu, param, &sp32, sizeof(sp32), &e)) {
			*r = -EFAULT;
			return true;
		}

		/*
		 * This is a 32-bit pointer to an array of evtchn_port_t which
		 * are uint32_t, so once it's converted no further compat
		 * handling is needed.
		 */
		sched_poll.ports = (void *)(unsigned long)(sp32.ports);
		sched_poll.nr_ports = sp32.nr_ports;
		sched_poll.timeout = sp32.timeout;
	} else {
		if (kvm_read_guest_virt(vcpu, param, &sched_poll,
					sizeof(sched_poll), &e)) {
			*r = -EFAULT;
			return true;
		}
	}

	if (unlikely(sched_poll.nr_ports > 1)) {
		/* Xen (unofficially) limits number of pollers to 128 */
		if (sched_poll.nr_ports > 128) {
			*r = -EINVAL;
			return true;
		}

		ports = kmalloc_array(sched_poll.nr_ports,
				      sizeof(*ports), GFP_KERNEL);
		if (!ports) {
			*r = -ENOMEM;
			return true;
		}
	} else
		ports = &port;

	if (kvm_read_guest_virt(vcpu, (gva_t)sched_poll.ports, ports,
				sched_poll.nr_ports * sizeof(*ports), &e)) {
		*r = -EFAULT;
		return true;
	}

	for (i = 0; i < sched_poll.nr_ports; i++) {
		if (ports[i] >= max_evtchn_port(vcpu->kvm)) {
			*r = -EINVAL;
			goto out;
		}
	}

	if (sched_poll.nr_ports == 1)
		vcpu->arch.xen.poll_evtchn = port;
	else
		vcpu->arch.xen.poll_evtchn = -1;

	set_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);

	if (!wait_pending_event(vcpu, sched_poll.nr_ports, ports)) {
		vcpu->arch.mp_state = KVM_MP_STATE_HALTED;

		if (sched_poll.timeout)
			mod_timer(&vcpu->arch.xen.poll_timer,
				  jiffies + nsecs_to_jiffies(sched_poll.timeout));

		kvm_vcpu_halt(vcpu);

		if (sched_poll.timeout)
			del_timer(&vcpu->arch.xen.poll_timer);

		vcpu->arch.mp_state = KVM_MP_STATE_RUNNABLE;
	}

	vcpu->arch.xen.poll_evtchn = 0;
	*r = 0;
out:
	/* Really, this is only needed in case of timeout */
	clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask);

	if (unlikely(sched_poll.nr_ports > 1))
		kfree(ports);
	return true;
}

static void cancel_evtchn_poll(struct timer_list *t)
{
	struct kvm_vcpu *vcpu = from_timer(vcpu, t, arch.xen.poll_timer);

	kvm_make_request(KVM_REQ_UNBLOCK, vcpu);
	kvm_vcpu_kick(vcpu);
}

static bool kvm_xen_hcall_sched_op(struct kvm_vcpu *vcpu, bool longmode,
				   int cmd, u64 param, u64 *r)
{
	switch (cmd) {
	case SCHEDOP_poll:
		if (kvm_xen_schedop_poll(vcpu, longmode, param, r))
			return true;
		fallthrough;
	case SCHEDOP_yield:
		kvm_vcpu_on_spin(vcpu, true);
		*r = 0;
		return true;
	default:
		break;
	}

	return false;
}

struct compat_vcpu_set_singleshot_timer {
	uint64_t timeout_abs_ns;
	uint32_t flags;
} __attribute__((packed));
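
/*
 * Note: the packed compat struct above is 12 bytes, while the native
 * struct vcpu_set_singleshot_timer is 16 bytes because uint64_t alignment
 * adds 4 bytes of trailing padding. The copy-in below reads whichever
 * size matches the guest's mode.
 */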
1549 | |
1550 | static bool kvm_xen_hcall_vcpu_op(struct kvm_vcpu *vcpu, bool longmode, int cmd, |
1551 | int vcpu_id, u64 param, u64 *r) |
1552 | { |
1553 | struct vcpu_set_singleshot_timer oneshot; |
1554 | struct x86_exception e; |
1555 | |
1556 | if (!kvm_xen_timer_enabled(vcpu)) |
1557 | return false; |
1558 | |
1559 | switch (cmd) { |
1560 | case VCPUOP_set_singleshot_timer: |
1561 | if (vcpu->arch.xen.vcpu_id != vcpu_id) { |
1562 | *r = -EINVAL; |
1563 | return true; |
1564 | } |
1565 | |
1566 | /* |
1567 | * The only difference for 32-bit compat is the 4 bytes of |
1568 | * padding after the interesting part of the structure. So |
1569 | * for a faithful emulation of Xen we have to *try* to copy |
1570 | * the padding and return -EFAULT if we can't. Otherwise we |
1571 | * might as well just have copied the 12-byte 32-bit struct. |
1572 | */ |
1573 | BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) != |
1574 | offsetof(struct vcpu_set_singleshot_timer, timeout_abs_ns)); |
1575 | BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, timeout_abs_ns) != |
1576 | sizeof_field(struct vcpu_set_singleshot_timer, timeout_abs_ns)); |
1577 | BUILD_BUG_ON(offsetof(struct compat_vcpu_set_singleshot_timer, flags) != |
1578 | offsetof(struct vcpu_set_singleshot_timer, flags)); |
1579 | BUILD_BUG_ON(sizeof_field(struct compat_vcpu_set_singleshot_timer, flags) != |
1580 | sizeof_field(struct vcpu_set_singleshot_timer, flags)); |
1581 | |
1582 | if (kvm_read_guest_virt(vcpu, addr: param, val: &oneshot, bytes: longmode ? sizeof(oneshot) : |
1583 | sizeof(struct compat_vcpu_set_singleshot_timer), exception: &e)) { |
1584 | *r = -EFAULT; |
1585 | return true; |
1586 | } |
1587 | |
1588 | kvm_xen_start_timer(vcpu, guest_abs: oneshot.timeout_abs_ns, linux_wa: false); |
1589 | *r = 0; |
1590 | return true; |
1591 | |
1592 | case VCPUOP_stop_singleshot_timer: |
1593 | if (vcpu->arch.xen.vcpu_id != vcpu_id) { |
1594 | *r = -EINVAL; |
1595 | return true; |
1596 | } |
1597 | kvm_xen_stop_timer(vcpu); |
1598 | *r = 0; |
1599 | return true; |
1600 | } |
1601 | |
1602 | return false; |
1603 | } |
1604 | |
1605 | static bool kvm_xen_hcall_set_timer_op(struct kvm_vcpu *vcpu, uint64_t timeout, |
1606 | u64 *r) |
1607 | { |
1608 | if (!kvm_xen_timer_enabled(vcpu)) |
1609 | return false; |
1610 | |
1611 | if (timeout) |
1612 | kvm_xen_start_timer(vcpu, guest_abs: timeout, linux_wa: true); |
1613 | else |
1614 | kvm_xen_stop_timer(vcpu); |
1615 | |
1616 | *r = 0; |
1617 | return true; |
1618 | } |
1619 | |
1620 | int kvm_xen_hypercall(struct kvm_vcpu *vcpu) |
1621 | { |
1622 | bool longmode; |
1623 | u64 input, params[6], r = -ENOSYS; |
1624 | bool handled = false; |
1625 | u8 cpl; |
1626 | |
	input = (u64)kvm_register_read(vcpu, VCPU_REGS_RAX);
1628 | |
1629 | /* Hyper-V hypercalls get bit 31 set in EAX */ |
1630 | if ((input & 0x80000000) && |
1631 | kvm_hv_hypercall_enabled(vcpu)) |
1632 | return kvm_hv_hypercall(vcpu); |
1633 | |
1634 | longmode = is_64_bit_hypercall(vcpu); |
1635 | if (!longmode) { |
1636 | params[0] = (u32)kvm_rbx_read(vcpu); |
1637 | params[1] = (u32)kvm_rcx_read(vcpu); |
1638 | params[2] = (u32)kvm_rdx_read(vcpu); |
1639 | params[3] = (u32)kvm_rsi_read(vcpu); |
1640 | params[4] = (u32)kvm_rdi_read(vcpu); |
1641 | params[5] = (u32)kvm_rbp_read(vcpu); |
1642 | } |
1643 | #ifdef CONFIG_X86_64 |
1644 | else { |
1645 | params[0] = (u64)kvm_rdi_read(vcpu); |
1646 | params[1] = (u64)kvm_rsi_read(vcpu); |
1647 | params[2] = (u64)kvm_rdx_read(vcpu); |
1648 | params[3] = (u64)kvm_r10_read(vcpu); |
1649 | params[4] = (u64)kvm_r8_read(vcpu); |
1650 | params[5] = (u64)kvm_r9_read(vcpu); |
1651 | } |
1652 | #endif |
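	/*
	 * Summary of the reads above (descriptive only, nothing new): a 32-bit
	 * guest passes the six hypercall arguments in EBX, ECX, EDX, ESI, EDI
	 * and EBP, while a 64-bit guest uses RDI, RSI, RDX, R10, R8 and R9.
	 */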
1653 | cpl = static_call(kvm_x86_get_cpl)(vcpu); |
	trace_kvm_xen_hypercall(cpl, input, params[0], params[1], params[2],
				params[3], params[4], params[5]);
1656 | |
1657 | /* |
1658 | * Only allow hypercall acceleration for CPL0. The rare hypercalls that |
1659 | * are permitted in guest userspace can be handled by the VMM. |
1660 | */ |
1661 | if (unlikely(cpl > 0)) |
1662 | goto handle_in_userspace; |
1663 | |
1664 | switch (input) { |
1665 | case __HYPERVISOR_xen_version: |
1666 | if (params[0] == XENVER_version && vcpu->kvm->arch.xen.xen_version) { |
1667 | r = vcpu->kvm->arch.xen.xen_version; |
1668 | handled = true; |
1669 | } |
1670 | break; |
1671 | case __HYPERVISOR_event_channel_op: |
1672 | if (params[0] == EVTCHNOP_send) |
			handled = kvm_xen_hcall_evtchn_send(vcpu, params[1], &r);
1674 | break; |
1675 | case __HYPERVISOR_sched_op: |
		handled = kvm_xen_hcall_sched_op(vcpu, longmode, params[0],
						 params[1], &r);
		break;
	case __HYPERVISOR_vcpu_op:
		handled = kvm_xen_hcall_vcpu_op(vcpu, longmode, params[0], params[1],
						params[2], &r);
1682 | break; |
1683 | case __HYPERVISOR_set_timer_op: { |
1684 | u64 timeout = params[0]; |
1685 | /* In 32-bit mode, the 64-bit timeout is in two 32-bit params. */ |
1686 | if (!longmode) |
1687 | timeout |= params[1] << 32; |
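		/*
		 * Illustration of the composition above (hypothetical values):
		 * params[0] = 0x00000000 and params[1] = 0x00000002 yield an
		 * absolute timeout of 0x200000000 ns, i.e. params[1] supplies
		 * bits 63:32 and params[0] bits 31:0.
		 */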
		handled = kvm_xen_hcall_set_timer_op(vcpu, timeout, &r);
1689 | break; |
1690 | } |
1691 | default: |
1692 | break; |
1693 | } |
1694 | |
1695 | if (handled) |
		return kvm_xen_hypercall_set_result(vcpu, r);
1697 | |
1698 | handle_in_userspace: |
1699 | vcpu->run->exit_reason = KVM_EXIT_XEN; |
1700 | vcpu->run->xen.type = KVM_EXIT_XEN_HCALL; |
1701 | vcpu->run->xen.u.hcall.longmode = longmode; |
1702 | vcpu->run->xen.u.hcall.cpl = cpl; |
1703 | vcpu->run->xen.u.hcall.input = input; |
1704 | vcpu->run->xen.u.hcall.params[0] = params[0]; |
1705 | vcpu->run->xen.u.hcall.params[1] = params[1]; |
1706 | vcpu->run->xen.u.hcall.params[2] = params[2]; |
1707 | vcpu->run->xen.u.hcall.params[3] = params[3]; |
1708 | vcpu->run->xen.u.hcall.params[4] = params[4]; |
1709 | vcpu->run->xen.u.hcall.params[5] = params[5]; |
1710 | vcpu->arch.xen.hypercall_rip = kvm_get_linear_rip(vcpu); |
1711 | vcpu->arch.complete_userspace_io = |
1712 | kvm_xen_hypercall_complete_userspace; |
1713 | |
1714 | return 0; |
1715 | } |
1716 | |
1717 | static void kvm_xen_check_poller(struct kvm_vcpu *vcpu, int port) |
1718 | { |
1719 | int poll_evtchn = vcpu->arch.xen.poll_evtchn; |
1720 | |
1721 | if ((poll_evtchn == port || poll_evtchn == -1) && |
	    test_and_clear_bit(vcpu->vcpu_idx, vcpu->kvm->arch.xen.poll_mask)) {
1723 | kvm_make_request(KVM_REQ_UNBLOCK, vcpu); |
1724 | kvm_vcpu_kick(vcpu); |
1725 | } |
1726 | } |
1727 | |
1728 | /* |
1729 | * The return value from this function is propagated to kvm_set_irq() API, |
1730 | * so it returns: |
1731 | * < 0 Interrupt was ignored (masked or not delivered for other reasons) |
1732 | * = 0 Interrupt was coalesced (previous irq is still pending) |
1733 | * > 0 Number of CPUs interrupt was delivered to |
1734 | * |
1735 | * It is also called directly from kvm_arch_set_irq_inatomic(), where the |
 * only check on its return value is a comparison with -EWOULDBLOCK.
1737 | */ |
1738 | int kvm_xen_set_evtchn_fast(struct kvm_xen_evtchn *xe, struct kvm *kvm) |
1739 | { |
1740 | struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; |
1741 | struct kvm_vcpu *vcpu; |
1742 | unsigned long *pending_bits, *mask_bits; |
1743 | unsigned long flags; |
1744 | int port_word_bit; |
1745 | bool kick_vcpu = false; |
1746 | int vcpu_idx, idx, rc; |
1747 | |
1748 | vcpu_idx = READ_ONCE(xe->vcpu_idx); |
1749 | if (vcpu_idx >= 0) |
		vcpu = kvm_get_vcpu(kvm, vcpu_idx);
	else {
		vcpu = kvm_get_vcpu_by_id(kvm, xe->vcpu_id);
1753 | if (!vcpu) |
1754 | return -EINVAL; |
1755 | WRITE_ONCE(xe->vcpu_idx, vcpu->vcpu_idx); |
1756 | } |
1757 | |
1758 | if (xe->port >= max_evtchn_port(kvm)) |
1759 | return -EINVAL; |
1760 | |
1761 | rc = -EWOULDBLOCK; |
1762 | |
	idx = srcu_read_lock(&kvm->srcu);
1764 | |
1765 | read_lock_irqsave(&gpc->lock, flags); |
1766 | if (!kvm_gpc_check(gpc, PAGE_SIZE)) |
1767 | goto out_rcu; |
1768 | |
1769 | if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { |
1770 | struct shared_info *shinfo = gpc->khva; |
1771 | pending_bits = (unsigned long *)&shinfo->evtchn_pending; |
1772 | mask_bits = (unsigned long *)&shinfo->evtchn_mask; |
1773 | port_word_bit = xe->port / 64; |
1774 | } else { |
1775 | struct compat_shared_info *shinfo = gpc->khva; |
1776 | pending_bits = (unsigned long *)&shinfo->evtchn_pending; |
1777 | mask_bits = (unsigned long *)&shinfo->evtchn_mask; |
1778 | port_word_bit = xe->port / 32; |
1779 | } |
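	/*
	 * Worked example for the two layouts above (illustrative values only):
	 * port 130 is bit 130 of evtchn_pending either way, but port_word_bit
	 * is 130 / 64 = 2 for the 64-bit layout and 130 / 32 = 4 for compat.
	 * That word index is what gets set in the vCPU's evtchn_pending_sel
	 * below.
	 */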
1780 | |
1781 | /* |
1782 | * If this port wasn't already set, and if it isn't masked, then |
1783 | * we try to set the corresponding bit in the in-kernel shadow of |
1784 | * evtchn_pending_sel for the target vCPU. And if *that* wasn't |
1785 | * already set, then we kick the vCPU in question to write to the |
1786 | * *real* evtchn_pending_sel in its own guest vcpu_info struct. |
1787 | */ |
	if (test_and_set_bit(xe->port, pending_bits)) {
		rc = 0; /* It was already raised */
	} else if (test_bit(xe->port, mask_bits)) {
		rc = -ENOTCONN; /* Masked */
		kvm_xen_check_poller(vcpu, xe->port);
1793 | } else { |
1794 | rc = 1; /* Delivered to the bitmap in shared_info. */ |
1795 | /* Now switch to the vCPU's vcpu_info to set the index and pending_sel */ |
1796 | read_unlock_irqrestore(&gpc->lock, flags); |
1797 | gpc = &vcpu->arch.xen.vcpu_info_cache; |
1798 | |
1799 | read_lock_irqsave(&gpc->lock, flags); |
		if (!kvm_gpc_check(gpc, sizeof(struct vcpu_info))) {
1801 | /* |
1802 | * Could not access the vcpu_info. Set the bit in-kernel |
1803 | * and prod the vCPU to deliver it for itself. |
1804 | */ |
			if (!test_and_set_bit(port_word_bit, &vcpu->arch.xen.evtchn_pending_sel))
1806 | kick_vcpu = true; |
1807 | goto out_rcu; |
1808 | } |
1809 | |
1810 | if (IS_ENABLED(CONFIG_64BIT) && kvm->arch.xen.long_mode) { |
1811 | struct vcpu_info *vcpu_info = gpc->khva; |
			if (!test_and_set_bit(port_word_bit, &vcpu_info->evtchn_pending_sel)) {
1813 | WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); |
1814 | kick_vcpu = true; |
1815 | } |
1816 | } else { |
1817 | struct compat_vcpu_info *vcpu_info = gpc->khva; |
			if (!test_and_set_bit(port_word_bit,
					      (unsigned long *)&vcpu_info->evtchn_pending_sel)) {
1820 | WRITE_ONCE(vcpu_info->evtchn_upcall_pending, 1); |
1821 | kick_vcpu = true; |
1822 | } |
1823 | } |
1824 | |
1825 | /* For the per-vCPU lapic vector, deliver it as MSI. */ |
1826 | if (kick_vcpu && vcpu->arch.xen.upcall_vector) { |
			kvm_xen_inject_vcpu_vector(vcpu);
1828 | kick_vcpu = false; |
1829 | } |
1830 | } |
1831 | |
1832 | out_rcu: |
1833 | read_unlock_irqrestore(&gpc->lock, flags); |
	srcu_read_unlock(&kvm->srcu, idx);
1835 | |
1836 | if (kick_vcpu) { |
1837 | kvm_make_request(KVM_REQ_UNBLOCK, vcpu); |
1838 | kvm_vcpu_kick(vcpu); |
1839 | } |
1840 | |
1841 | return rc; |
1842 | } |
1843 | |
1844 | static int kvm_xen_set_evtchn(struct kvm_xen_evtchn *xe, struct kvm *kvm) |
1845 | { |
1846 | bool mm_borrowed = false; |
1847 | int rc; |
1848 | |
1849 | rc = kvm_xen_set_evtchn_fast(xe, kvm); |
1850 | if (rc != -EWOULDBLOCK) |
1851 | return rc; |
1852 | |
1853 | if (current->mm != kvm->mm) { |
1854 | /* |
1855 | * If not on a thread which already belongs to this KVM, |
1856 | * we'd better be in the irqfd workqueue. |
1857 | */ |
1858 | if (WARN_ON_ONCE(current->mm)) |
1859 | return -EINVAL; |
1860 | |
		kthread_use_mm(kvm->mm);
1862 | mm_borrowed = true; |
1863 | } |
1864 | |
1865 | /* |
1866 | * It is theoretically possible for the page to be unmapped |
1867 | * and the MMU notifier to invalidate the shared_info before |
1868 | * we even get to use it. In that case, this looks like an |
1869 | * infinite loop. It was tempting to do it via the userspace |
1870 | * HVA instead... but that just *hides* the fact that it's |
1871 | * an infinite loop, because if a fault occurs and it waits |
1872 | * for the page to come back, it can *still* immediately |
1873 | * fault and have to wait again, repeatedly. |
1874 | * |
1875 | * Conversely, the page could also have been reinstated by |
1876 | * another thread before we even obtain the mutex above, so |
1877 | * check again *first* before remapping it. |
1878 | */ |
1879 | do { |
1880 | struct gfn_to_pfn_cache *gpc = &kvm->arch.xen.shinfo_cache; |
1881 | int idx; |
1882 | |
1883 | rc = kvm_xen_set_evtchn_fast(xe, kvm); |
1884 | if (rc != -EWOULDBLOCK) |
1885 | break; |
1886 | |
		idx = srcu_read_lock(&kvm->srcu);
		rc = kvm_gpc_refresh(gpc, PAGE_SIZE);
		srcu_read_unlock(&kvm->srcu, idx);
1890 | } while(!rc); |
1891 | |
1892 | if (mm_borrowed) |
		kthread_unuse_mm(kvm->mm);
1894 | |
1895 | return rc; |
1896 | } |
1897 | |
1898 | /* This is the version called from kvm_set_irq() as the .set function */ |
1899 | static int evtchn_set_fn(struct kvm_kernel_irq_routing_entry *e, struct kvm *kvm, |
1900 | int irq_source_id, int level, bool line_status) |
1901 | { |
1902 | if (!level) |
1903 | return -EINVAL; |
1904 | |
	return kvm_xen_set_evtchn(&e->xen_evtchn, kvm);
1906 | } |
1907 | |
1908 | /* |
1909 | * Set up an event channel interrupt from the KVM IRQ routing table. |
1910 | * Used for e.g. PIRQ from passed through physical devices. |
1911 | */ |
1912 | int kvm_xen_setup_evtchn(struct kvm *kvm, |
1913 | struct kvm_kernel_irq_routing_entry *e, |
1914 | const struct kvm_irq_routing_entry *ue) |
1915 | |
1916 | { |
1917 | struct kvm_vcpu *vcpu; |
1918 | |
1919 | if (ue->u.xen_evtchn.port >= max_evtchn_port(kvm)) |
1920 | return -EINVAL; |
1921 | |
1922 | /* We only support 2 level event channels for now */ |
1923 | if (ue->u.xen_evtchn.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) |
1924 | return -EINVAL; |
1925 | |
1926 | /* |
1927 | * Xen gives us interesting mappings from vCPU index to APIC ID, |
1928 | * which means kvm_get_vcpu_by_id() has to iterate over all vCPUs |
1929 | * to find it. Do that once at setup time, instead of every time. |
1930 | * But beware that on live update / live migration, the routing |
1931 | * table might be reinstated before the vCPU threads have finished |
1932 | * recreating their vCPUs. |
1933 | */ |
	vcpu = kvm_get_vcpu_by_id(kvm, ue->u.xen_evtchn.vcpu);
1935 | if (vcpu) |
1936 | e->xen_evtchn.vcpu_idx = vcpu->vcpu_idx; |
1937 | else |
1938 | e->xen_evtchn.vcpu_idx = -1; |
1939 | |
1940 | e->xen_evtchn.port = ue->u.xen_evtchn.port; |
1941 | e->xen_evtchn.vcpu_id = ue->u.xen_evtchn.vcpu; |
1942 | e->xen_evtchn.priority = ue->u.xen_evtchn.priority; |
1943 | e->set = evtchn_set_fn; |
1944 | |
1945 | return 0; |
1946 | } |
1947 | |
1948 | /* |
1949 | * Explicit event sending from userspace with KVM_XEN_HVM_EVTCHN_SEND ioctl. |
1950 | */ |
1951 | int kvm_xen_hvm_evtchn_send(struct kvm *kvm, struct kvm_irq_routing_xen_evtchn *uxe) |
1952 | { |
1953 | struct kvm_xen_evtchn e; |
1954 | int ret; |
1955 | |
1956 | if (!uxe->port || uxe->port >= max_evtchn_port(kvm)) |
1957 | return -EINVAL; |
1958 | |
1959 | /* We only support 2 level event channels for now */ |
1960 | if (uxe->priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) |
1961 | return -EINVAL; |
1962 | |
1963 | e.port = uxe->port; |
1964 | e.vcpu_id = uxe->vcpu; |
1965 | e.vcpu_idx = -1; |
1966 | e.priority = uxe->priority; |
1967 | |
	ret = kvm_xen_set_evtchn(&e, kvm);
1969 | |
1970 | /* |
1971 | * None of that 'return 1 if it actually got delivered' nonsense. |
1972 | * We don't care if it was masked (-ENOTCONN) either. |
1973 | */ |
1974 | if (ret > 0 || ret == -ENOTCONN) |
1975 | ret = 0; |
1976 | |
1977 | return ret; |
1978 | } |
1979 | |
1980 | /* |
1981 | * Support for *outbound* event channel events via the EVTCHNOP_send hypercall. |
1982 | */ |
1983 | struct evtchnfd { |
1984 | u32 send_port; |
1985 | u32 type; |
1986 | union { |
1987 | struct kvm_xen_evtchn port; |
1988 | struct { |
1989 | u32 port; /* zero */ |
1990 | struct eventfd_ctx *ctx; |
1991 | } eventfd; |
1992 | } deliver; |
1993 | }; |
1994 | |
1995 | /* |
1996 | * Update target vCPU or priority for a registered sending channel. |
1997 | */ |
1998 | static int kvm_xen_eventfd_update(struct kvm *kvm, |
1999 | struct kvm_xen_hvm_attr *data) |
2000 | { |
2001 | u32 port = data->u.evtchn.send_port; |
2002 | struct evtchnfd *evtchnfd; |
2003 | int ret; |
2004 | |
2005 | /* Protect writes to evtchnfd as well as the idr lookup. */ |
2006 | mutex_lock(&kvm->arch.xen.xen_lock); |
	evtchnfd = idr_find(&kvm->arch.xen.evtchn_ports, port);
2008 | |
2009 | ret = -ENOENT; |
2010 | if (!evtchnfd) |
2011 | goto out_unlock; |
2012 | |
2013 | /* For an UPDATE, nothing may change except the priority/vcpu */ |
2014 | ret = -EINVAL; |
2015 | if (evtchnfd->type != data->u.evtchn.type) |
2016 | goto out_unlock; |
2017 | |
2018 | /* |
2019 | * Port cannot change, and if it's zero that was an eventfd |
2020 | * which can't be changed either. |
2021 | */ |
2022 | if (!evtchnfd->deliver.port.port || |
2023 | evtchnfd->deliver.port.port != data->u.evtchn.deliver.port.port) |
2024 | goto out_unlock; |
2025 | |
2026 | /* We only support 2 level event channels for now */ |
2027 | if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) |
2028 | goto out_unlock; |
2029 | |
2030 | evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; |
2031 | if (evtchnfd->deliver.port.vcpu_id != data->u.evtchn.deliver.port.vcpu) { |
2032 | evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; |
2033 | evtchnfd->deliver.port.vcpu_idx = -1; |
2034 | } |
2035 | ret = 0; |
2036 | out_unlock: |
	mutex_unlock(&kvm->arch.xen.xen_lock);
2038 | return ret; |
2039 | } |
2040 | |
2041 | /* |
2042 | * Configure the target (eventfd or local port delivery) for sending on |
2043 | * a given event channel. |
2044 | */ |
2045 | static int kvm_xen_eventfd_assign(struct kvm *kvm, |
2046 | struct kvm_xen_hvm_attr *data) |
2047 | { |
2048 | u32 port = data->u.evtchn.send_port; |
2049 | struct eventfd_ctx *eventfd = NULL; |
2050 | struct evtchnfd *evtchnfd; |
2051 | int ret = -EINVAL; |
2052 | |
	evtchnfd = kzalloc(sizeof(struct evtchnfd), GFP_KERNEL);
2054 | if (!evtchnfd) |
2055 | return -ENOMEM; |
2056 | |
2057 | switch(data->u.evtchn.type) { |
2058 | case EVTCHNSTAT_ipi: |
2059 | /* IPI must map back to the same port# */ |
2060 | if (data->u.evtchn.deliver.port.port != data->u.evtchn.send_port) |
2061 | goto out_noeventfd; /* -EINVAL */ |
2062 | break; |
2063 | |
2064 | case EVTCHNSTAT_interdomain: |
2065 | if (data->u.evtchn.deliver.port.port) { |
2066 | if (data->u.evtchn.deliver.port.port >= max_evtchn_port(kvm)) |
2067 | goto out_noeventfd; /* -EINVAL */ |
2068 | } else { |
			eventfd = eventfd_ctx_fdget(data->u.evtchn.deliver.eventfd.fd);
			if (IS_ERR(eventfd)) {
				ret = PTR_ERR(eventfd);
2072 | goto out_noeventfd; |
2073 | } |
2074 | } |
2075 | break; |
2076 | |
2077 | case EVTCHNSTAT_virq: |
2078 | case EVTCHNSTAT_closed: |
2079 | case EVTCHNSTAT_unbound: |
2080 | case EVTCHNSTAT_pirq: |
2081 | default: /* Unknown event channel type */ |
2082 | goto out; /* -EINVAL */ |
2083 | } |
2084 | |
2085 | evtchnfd->send_port = data->u.evtchn.send_port; |
2086 | evtchnfd->type = data->u.evtchn.type; |
2087 | if (eventfd) { |
2088 | evtchnfd->deliver.eventfd.ctx = eventfd; |
2089 | } else { |
2090 | /* We only support 2 level event channels for now */ |
2091 | if (data->u.evtchn.deliver.port.priority != KVM_IRQ_ROUTING_XEN_EVTCHN_PRIO_2LEVEL) |
2092 | goto out; /* -EINVAL; */ |
2093 | |
2094 | evtchnfd->deliver.port.port = data->u.evtchn.deliver.port.port; |
2095 | evtchnfd->deliver.port.vcpu_id = data->u.evtchn.deliver.port.vcpu; |
2096 | evtchnfd->deliver.port.vcpu_idx = -1; |
2097 | evtchnfd->deliver.port.priority = data->u.evtchn.deliver.port.priority; |
2098 | } |
2099 | |
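	/*
	 * Note on the allocation below: idr_alloc() with the range
	 * [port, port + 1) reserves exactly this port number, so -ENOSPC
	 * means the port is already bound and is reported as -EEXIST.
	 */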
2100 | mutex_lock(&kvm->arch.xen.xen_lock); |
	ret = idr_alloc(&kvm->arch.xen.evtchn_ports, evtchnfd, port, port + 1,
			GFP_KERNEL);
	mutex_unlock(&kvm->arch.xen.xen_lock);
2104 | if (ret >= 0) |
2105 | return 0; |
2106 | |
2107 | if (ret == -ENOSPC) |
2108 | ret = -EEXIST; |
2109 | out: |
2110 | if (eventfd) |
		eventfd_ctx_put(eventfd);
out_noeventfd:
	kfree(evtchnfd);
2114 | return ret; |
2115 | } |
2116 | |
2117 | static int kvm_xen_eventfd_deassign(struct kvm *kvm, u32 port) |
2118 | { |
2119 | struct evtchnfd *evtchnfd; |
2120 | |
2121 | mutex_lock(&kvm->arch.xen.xen_lock); |
	evtchnfd = idr_remove(&kvm->arch.xen.evtchn_ports, port);
	mutex_unlock(&kvm->arch.xen.xen_lock);
2124 | |
2125 | if (!evtchnfd) |
2126 | return -ENOENT; |
2127 | |
	synchronize_srcu(&kvm->srcu);
	if (!evtchnfd->deliver.port.port)
		eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
	kfree(evtchnfd);
2132 | return 0; |
2133 | } |
2134 | |
2135 | static int kvm_xen_eventfd_reset(struct kvm *kvm) |
2136 | { |
2137 | struct evtchnfd *evtchnfd, **all_evtchnfds; |
2138 | int i; |
2139 | int n = 0; |
2140 | |
2141 | mutex_lock(&kvm->arch.xen.xen_lock); |
2142 | |
2143 | /* |
2144 | * Because synchronize_srcu() cannot be called inside the |
2145 | * critical section, first collect all the evtchnfd objects |
2146 | * in an array as they are removed from evtchn_ports. |
2147 | */ |
2148 | idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) |
2149 | n++; |
2150 | |
	all_evtchnfds = kmalloc_array(n, sizeof(struct evtchnfd *), GFP_KERNEL);
	if (!all_evtchnfds) {
		mutex_unlock(&kvm->arch.xen.xen_lock);
2154 | return -ENOMEM; |
2155 | } |
2156 | |
2157 | n = 0; |
2158 | idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) { |
2159 | all_evtchnfds[n++] = evtchnfd; |
		idr_remove(&kvm->arch.xen.evtchn_ports, evtchnfd->send_port);
	}
	mutex_unlock(&kvm->arch.xen.xen_lock);

	synchronize_srcu(&kvm->srcu);
2165 | |
2166 | while (n--) { |
2167 | evtchnfd = all_evtchnfds[n]; |
2168 | if (!evtchnfd->deliver.port.port) |
			eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
		kfree(evtchnfd);
	}
	kfree(all_evtchnfds);
2173 | |
2174 | return 0; |
2175 | } |
2176 | |
2177 | static int kvm_xen_setattr_evtchn(struct kvm *kvm, struct kvm_xen_hvm_attr *data) |
2178 | { |
2179 | u32 port = data->u.evtchn.send_port; |
2180 | |
2181 | if (data->u.evtchn.flags == KVM_XEN_EVTCHN_RESET) |
2182 | return kvm_xen_eventfd_reset(kvm); |
2183 | |
2184 | if (!port || port >= max_evtchn_port(kvm)) |
2185 | return -EINVAL; |
2186 | |
2187 | if (data->u.evtchn.flags == KVM_XEN_EVTCHN_DEASSIGN) |
2188 | return kvm_xen_eventfd_deassign(kvm, port); |
2189 | if (data->u.evtchn.flags == KVM_XEN_EVTCHN_UPDATE) |
2190 | return kvm_xen_eventfd_update(kvm, data); |
2191 | if (data->u.evtchn.flags) |
2192 | return -EINVAL; |
2193 | |
2194 | return kvm_xen_eventfd_assign(kvm, data); |
2195 | } |
2196 | |
2197 | static bool kvm_xen_hcall_evtchn_send(struct kvm_vcpu *vcpu, u64 param, u64 *r) |
2198 | { |
2199 | struct evtchnfd *evtchnfd; |
2200 | struct evtchn_send send; |
2201 | struct x86_exception e; |
2202 | |
2203 | /* Sanity check: this structure is the same for 32-bit and 64-bit */ |
2204 | BUILD_BUG_ON(sizeof(send) != 4); |
	if (kvm_read_guest_virt(vcpu, param, &send, sizeof(send), &e)) {
2206 | *r = -EFAULT; |
2207 | return true; |
2208 | } |
2209 | |
2210 | /* |
2211 | * evtchnfd is protected by kvm->srcu; the idr lookup instead |
2212 | * is protected by RCU. |
2213 | */ |
2214 | rcu_read_lock(); |
	evtchnfd = idr_find(&vcpu->kvm->arch.xen.evtchn_ports, send.port);
2216 | rcu_read_unlock(); |
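	/*
	 * As noted above, the evtchnfd itself is protected by kvm->srcu, so it
	 * may still be used after rcu_read_unlock(): deassign/reset call
	 * synchronize_srcu() before freeing it.
	 */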
2217 | if (!evtchnfd) |
2218 | return false; |
2219 | |
2220 | if (evtchnfd->deliver.port.port) { |
		int ret = kvm_xen_set_evtchn(&evtchnfd->deliver.port, vcpu->kvm);
2222 | if (ret < 0 && ret != -ENOTCONN) |
2223 | return false; |
2224 | } else { |
		eventfd_signal(evtchnfd->deliver.eventfd.ctx);
2226 | } |
2227 | |
2228 | *r = 0; |
2229 | return true; |
2230 | } |
2231 | |
2232 | void kvm_xen_init_vcpu(struct kvm_vcpu *vcpu) |
2233 | { |
2234 | vcpu->arch.xen.vcpu_id = vcpu->vcpu_idx; |
2235 | vcpu->arch.xen.poll_evtchn = 0; |
2236 | |
2237 | timer_setup(&vcpu->arch.xen.poll_timer, cancel_evtchn_poll, 0); |
2238 | |
	kvm_gpc_init(&vcpu->arch.xen.runstate_cache, vcpu->kvm);
	kvm_gpc_init(&vcpu->arch.xen.runstate2_cache, vcpu->kvm);
	kvm_gpc_init(&vcpu->arch.xen.vcpu_info_cache, vcpu->kvm);
	kvm_gpc_init(&vcpu->arch.xen.vcpu_time_info_cache, vcpu->kvm);
2243 | } |
2244 | |
2245 | void kvm_xen_destroy_vcpu(struct kvm_vcpu *vcpu) |
2246 | { |
2247 | if (kvm_xen_timer_enabled(vcpu)) |
2248 | kvm_xen_stop_timer(vcpu); |
2249 | |
	kvm_gpc_deactivate(&vcpu->arch.xen.runstate_cache);
	kvm_gpc_deactivate(&vcpu->arch.xen.runstate2_cache);
	kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_info_cache);
	kvm_gpc_deactivate(&vcpu->arch.xen.vcpu_time_info_cache);

	del_timer_sync(&vcpu->arch.xen.poll_timer);
2256 | } |
2257 | |
2258 | void kvm_xen_update_tsc_info(struct kvm_vcpu *vcpu) |
2259 | { |
2260 | struct kvm_cpuid_entry2 *entry; |
2261 | u32 function; |
2262 | |
2263 | if (!vcpu->arch.xen.cpuid.base) |
2264 | return; |
2265 | |
2266 | function = vcpu->arch.xen.cpuid.base | XEN_CPUID_LEAF(3); |
2267 | if (function > vcpu->arch.xen.cpuid.limit) |
2268 | return; |
2269 | |
	entry = kvm_find_cpuid_entry_index(vcpu, function, 1);
2271 | if (entry) { |
2272 | entry->ecx = vcpu->arch.hv_clock.tsc_to_system_mul; |
2273 | entry->edx = vcpu->arch.hv_clock.tsc_shift; |
2274 | } |
2275 | |
	entry = kvm_find_cpuid_entry_index(vcpu, function, 2);
2277 | if (entry) |
2278 | entry->eax = vcpu->arch.hw_tsc_khz; |
2279 | } |
2280 | |
2281 | void kvm_xen_init_vm(struct kvm *kvm) |
2282 | { |
2283 | mutex_init(&kvm->arch.xen.xen_lock); |
	idr_init(&kvm->arch.xen.evtchn_ports);
	kvm_gpc_init(&kvm->arch.xen.shinfo_cache, kvm);
2286 | } |
2287 | |
2288 | void kvm_xen_destroy_vm(struct kvm *kvm) |
2289 | { |
2290 | struct evtchnfd *evtchnfd; |
2291 | int i; |
2292 | |
	kvm_gpc_deactivate(&kvm->arch.xen.shinfo_cache);

	idr_for_each_entry(&kvm->arch.xen.evtchn_ports, evtchnfd, i) {
		if (!evtchnfd->deliver.port.port)
			eventfd_ctx_put(evtchnfd->deliver.eventfd.ctx);
		kfree(evtchnfd);
2299 | } |
2300 | idr_destroy(&kvm->arch.xen.evtchn_ports); |
2301 | |
2302 | if (kvm->arch.xen_hvm_config.msr) |
2303 | static_branch_slow_dec_deferred(&kvm_xen_enabled); |
2304 | } |
2305 | |