1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
3 | |
4 | #include <linux/kvm_host.h> |
5 | |
6 | #include <asm/irq_remapping.h> |
7 | #include <asm/cpu.h> |
8 | |
9 | #include "lapic.h" |
10 | #include "irq.h" |
11 | #include "posted_intr.h" |
12 | #include "trace.h" |
13 | #include "vmx.h" |
14 | |
15 | /* |
16 | * Maintain a per-CPU list of vCPUs that need to be awakened by wakeup_handler() |
17 | * when a WAKEUP_VECTOR interrupted is posted. vCPUs are added to the list when |
18 | * the vCPU is scheduled out and is blocking (e.g. in HLT) with IRQs enabled. |
19 | * The vCPUs posted interrupt descriptor is updated at the same time to set its |
20 | * notification vector to WAKEUP_VECTOR, so that posted interrupt from devices |
21 | * wake the target vCPUs. vCPUs are removed from the list and the notification |
22 | * vector is reset when the vCPU is scheduled in. |
23 | */ |
24 | static DEFINE_PER_CPU(struct list_head, wakeup_vcpus_on_cpu); |
25 | /* |
26 | * Protect the per-CPU list with a per-CPU spinlock to handle task migration. |
27 | * When a blocking vCPU is awakened _and_ migrated to a different pCPU, the |
28 | * ->sched_in() path will need to take the vCPU off the list of the _previous_ |
29 | * CPU. IRQs must be disabled when taking this lock, otherwise deadlock will |
30 | * occur if a wakeup IRQ arrives and attempts to acquire the lock. |
31 | */ |
32 | static DEFINE_PER_CPU(raw_spinlock_t, wakeup_vcpus_on_cpu_lock); |
33 | |
34 | static inline struct pi_desc *vcpu_to_pi_desc(struct kvm_vcpu *vcpu) |
35 | { |
36 | return &(to_vmx(vcpu)->pi_desc); |
37 | } |
38 | |
39 | static int pi_try_set_control(struct pi_desc *pi_desc, u64 *pold, u64 new) |
40 | { |
41 | /* |
42 | * PID.ON can be set at any time by a different vCPU or by hardware, |
43 | * e.g. a device. PID.control must be written atomically, and the |
44 | * update must be retried with a fresh snapshot an ON change causes |
45 | * the cmpxchg to fail. |
46 | */ |
47 | if (!try_cmpxchg64(&pi_desc->control, pold, new)) |
48 | return -EBUSY; |
49 | |
50 | return 0; |
51 | } |
52 | |
53 | void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) |
54 | { |
55 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); |
56 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
57 | struct pi_desc old, new; |
58 | unsigned long flags; |
59 | unsigned int dest; |
60 | |
61 | /* |
62 | * To simplify hot-plug and dynamic toggling of APICv, keep PI.NDST and |
63 | * PI.SN up-to-date even if there is no assigned device or if APICv is |
64 | * deactivated due to a dynamic inhibit bit, e.g. for Hyper-V's SyncIC. |
65 | */ |
66 | if (!enable_apicv || !lapic_in_kernel(vcpu)) |
67 | return; |
68 | |
69 | /* |
70 | * If the vCPU wasn't on the wakeup list and wasn't migrated, then the |
71 | * full update can be skipped as neither the vector nor the destination |
72 | * needs to be changed. |
73 | */ |
74 | if (pi_desc->nv != POSTED_INTR_WAKEUP_VECTOR && vcpu->cpu == cpu) { |
75 | /* |
76 | * Clear SN if it was set due to being preempted. Again, do |
77 | * this even if there is no assigned device for simplicity. |
78 | */ |
79 | if (pi_test_and_clear_sn(pi_desc)) |
80 | goto after_clear_sn; |
81 | return; |
82 | } |
83 | |
84 | local_irq_save(flags); |
85 | |
86 | /* |
87 | * If the vCPU was waiting for wakeup, remove the vCPU from the wakeup |
88 | * list of the _previous_ pCPU, which will not be the same as the |
89 | * current pCPU if the task was migrated. |
90 | */ |
91 | if (pi_desc->nv == POSTED_INTR_WAKEUP_VECTOR) { |
92 | raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); |
93 | list_del(entry: &vmx->pi_wakeup_list); |
94 | raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); |
95 | } |
96 | |
97 | dest = cpu_physical_id(cpu); |
98 | if (!x2apic_mode) |
99 | dest = (dest << 8) & 0xFF00; |
100 | |
101 | old.control = READ_ONCE(pi_desc->control); |
102 | do { |
103 | new.control = old.control; |
104 | |
105 | /* |
106 | * Clear SN (as above) and refresh the destination APIC ID to |
107 | * handle task migration (@cpu != vcpu->cpu). |
108 | */ |
109 | new.ndst = dest; |
110 | new.sn = 0; |
111 | |
112 | /* |
113 | * Restore the notification vector; in the blocking case, the |
114 | * descriptor was modified on "put" to use the wakeup vector. |
115 | */ |
116 | new.nv = POSTED_INTR_VECTOR; |
117 | } while (pi_try_set_control(pi_desc, pold: &old.control, new: new.control)); |
118 | |
119 | local_irq_restore(flags); |
120 | |
121 | after_clear_sn: |
122 | |
123 | /* |
124 | * Clear SN before reading the bitmap. The VT-d firmware |
125 | * writes the bitmap and reads SN atomically (5.2.3 in the |
126 | * spec), so it doesn't really have a memory barrier that |
127 | * pairs with this, but we cannot do that and we need one. |
128 | */ |
129 | smp_mb__after_atomic(); |
130 | |
131 | if (!pi_is_pir_empty(pi_desc)) |
132 | pi_set_on(pi_desc); |
133 | } |
134 | |
135 | static bool vmx_can_use_vtd_pi(struct kvm *kvm) |
136 | { |
137 | return irqchip_in_kernel(kvm) && enable_apicv && |
138 | kvm_arch_has_assigned_device(kvm) && |
139 | irq_remapping_cap(cap: IRQ_POSTING_CAP); |
140 | } |
141 | |
142 | /* |
143 | * Put the vCPU on this pCPU's list of vCPUs that needs to be awakened and set |
144 | * WAKEUP as the notification vector in the PI descriptor. |
145 | */ |
146 | static void pi_enable_wakeup_handler(struct kvm_vcpu *vcpu) |
147 | { |
148 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); |
149 | struct vcpu_vmx *vmx = to_vmx(vcpu); |
150 | struct pi_desc old, new; |
151 | unsigned long flags; |
152 | |
153 | local_irq_save(flags); |
154 | |
155 | raw_spin_lock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); |
156 | list_add_tail(new: &vmx->pi_wakeup_list, |
157 | head: &per_cpu(wakeup_vcpus_on_cpu, vcpu->cpu)); |
158 | raw_spin_unlock(&per_cpu(wakeup_vcpus_on_cpu_lock, vcpu->cpu)); |
159 | |
160 | WARN(pi_desc->sn, "PI descriptor SN field set before blocking" ); |
161 | |
162 | old.control = READ_ONCE(pi_desc->control); |
163 | do { |
164 | /* set 'NV' to 'wakeup vector' */ |
165 | new.control = old.control; |
166 | new.nv = POSTED_INTR_WAKEUP_VECTOR; |
167 | } while (pi_try_set_control(pi_desc, pold: &old.control, new: new.control)); |
168 | |
169 | /* |
170 | * Send a wakeup IPI to this CPU if an interrupt may have been posted |
171 | * before the notification vector was updated, in which case the IRQ |
172 | * will arrive on the non-wakeup vector. An IPI is needed as calling |
173 | * try_to_wake_up() from ->sched_out() isn't allowed (IRQs are not |
174 | * enabled until it is safe to call try_to_wake_up() on the task being |
175 | * scheduled out). |
176 | */ |
177 | if (pi_test_on(pi_desc: &new)) |
178 | __apic_send_IPI_self(POSTED_INTR_WAKEUP_VECTOR); |
179 | |
180 | local_irq_restore(flags); |
181 | } |
182 | |
183 | static bool vmx_needs_pi_wakeup(struct kvm_vcpu *vcpu) |
184 | { |
185 | /* |
186 | * The default posted interrupt vector does nothing when |
187 | * invoked outside guest mode. Return whether a blocked vCPU |
188 | * can be the target of posted interrupts, as is the case when |
189 | * using either IPI virtualization or VT-d PI, so that the |
190 | * notification vector is switched to the one that calls |
191 | * back to the pi_wakeup_handler() function. |
192 | */ |
193 | return vmx_can_use_ipiv(vcpu) || vmx_can_use_vtd_pi(kvm: vcpu->kvm); |
194 | } |
195 | |
196 | void vmx_vcpu_pi_put(struct kvm_vcpu *vcpu) |
197 | { |
198 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); |
199 | |
200 | if (!vmx_needs_pi_wakeup(vcpu)) |
201 | return; |
202 | |
203 | if (kvm_vcpu_is_blocking(vcpu) && !vmx_interrupt_blocked(vcpu)) |
204 | pi_enable_wakeup_handler(vcpu); |
205 | |
206 | /* |
207 | * Set SN when the vCPU is preempted. Note, the vCPU can both be seen |
208 | * as blocking and preempted, e.g. if it's preempted between setting |
209 | * its wait state and manually scheduling out. |
210 | */ |
211 | if (vcpu->preempted) |
212 | pi_set_sn(pi_desc); |
213 | } |
214 | |
215 | /* |
216 | * Handler for POSTED_INTERRUPT_WAKEUP_VECTOR. |
217 | */ |
218 | void pi_wakeup_handler(void) |
219 | { |
220 | int cpu = smp_processor_id(); |
221 | struct list_head *wakeup_list = &per_cpu(wakeup_vcpus_on_cpu, cpu); |
222 | raw_spinlock_t *spinlock = &per_cpu(wakeup_vcpus_on_cpu_lock, cpu); |
223 | struct vcpu_vmx *vmx; |
224 | |
225 | raw_spin_lock(spinlock); |
226 | list_for_each_entry(vmx, wakeup_list, pi_wakeup_list) { |
227 | |
228 | if (pi_test_on(pi_desc: &vmx->pi_desc)) |
229 | kvm_vcpu_wake_up(vcpu: &vmx->vcpu); |
230 | } |
231 | raw_spin_unlock(spinlock); |
232 | } |
233 | |
234 | void __init pi_init_cpu(int cpu) |
235 | { |
236 | INIT_LIST_HEAD(list: &per_cpu(wakeup_vcpus_on_cpu, cpu)); |
237 | raw_spin_lock_init(&per_cpu(wakeup_vcpus_on_cpu_lock, cpu)); |
238 | } |
239 | |
240 | bool pi_has_pending_interrupt(struct kvm_vcpu *vcpu) |
241 | { |
242 | struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu); |
243 | |
244 | return pi_test_on(pi_desc) || |
245 | (pi_test_sn(pi_desc) && !pi_is_pir_empty(pi_desc)); |
246 | } |
247 | |
248 | |
249 | /* |
250 | * Bail out of the block loop if the VM has an assigned |
251 | * device, but the blocking vCPU didn't reconfigure the |
252 | * PI.NV to the wakeup vector, i.e. the assigned device |
253 | * came along after the initial check in vmx_vcpu_pi_put(). |
254 | */ |
255 | void vmx_pi_start_assignment(struct kvm *kvm) |
256 | { |
257 | if (!irq_remapping_cap(cap: IRQ_POSTING_CAP)) |
258 | return; |
259 | |
260 | kvm_make_all_cpus_request(kvm, KVM_REQ_UNBLOCK); |
261 | } |
262 | |
263 | /* |
264 | * vmx_pi_update_irte - set IRTE for Posted-Interrupts |
265 | * |
266 | * @kvm: kvm |
267 | * @host_irq: host irq of the interrupt |
268 | * @guest_irq: gsi of the interrupt |
269 | * @set: set or unset PI |
270 | * returns 0 on success, < 0 on failure |
271 | */ |
272 | int vmx_pi_update_irte(struct kvm *kvm, unsigned int host_irq, |
273 | uint32_t guest_irq, bool set) |
274 | { |
275 | struct kvm_kernel_irq_routing_entry *e; |
276 | struct kvm_irq_routing_table *irq_rt; |
277 | struct kvm_lapic_irq irq; |
278 | struct kvm_vcpu *vcpu; |
279 | struct vcpu_data vcpu_info; |
280 | int idx, ret = 0; |
281 | |
282 | if (!vmx_can_use_vtd_pi(kvm)) |
283 | return 0; |
284 | |
285 | idx = srcu_read_lock(ssp: &kvm->irq_srcu); |
286 | irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); |
287 | if (guest_irq >= irq_rt->nr_rt_entries || |
288 | hlist_empty(h: &irq_rt->map[guest_irq])) { |
289 | pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n" , |
290 | guest_irq, irq_rt->nr_rt_entries); |
291 | goto out; |
292 | } |
293 | |
294 | hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { |
295 | if (e->type != KVM_IRQ_ROUTING_MSI) |
296 | continue; |
297 | /* |
298 | * VT-d PI cannot support posting multicast/broadcast |
299 | * interrupts to a vCPU, we still use interrupt remapping |
300 | * for these kind of interrupts. |
301 | * |
302 | * For lowest-priority interrupts, we only support |
303 | * those with single CPU as the destination, e.g. user |
304 | * configures the interrupts via /proc/irq or uses |
305 | * irqbalance to make the interrupts single-CPU. |
306 | * |
307 | * We will support full lowest-priority interrupt later. |
308 | * |
309 | * In addition, we can only inject generic interrupts using |
310 | * the PI mechanism, refuse to route others through it. |
311 | */ |
312 | |
313 | kvm_set_msi_irq(kvm, e, irq: &irq); |
314 | if (!kvm_intr_is_single_vcpu(kvm, irq: &irq, dest_vcpu: &vcpu) || |
315 | !kvm_irq_is_postable(irq: &irq)) { |
316 | /* |
317 | * Make sure the IRTE is in remapped mode if |
318 | * we don't handle it in posted mode. |
319 | */ |
320 | ret = irq_set_vcpu_affinity(irq: host_irq, NULL); |
321 | if (ret < 0) { |
322 | printk(KERN_INFO |
323 | "failed to back to remapped mode, irq: %u\n" , |
324 | host_irq); |
325 | goto out; |
326 | } |
327 | |
328 | continue; |
329 | } |
330 | |
331 | vcpu_info.pi_desc_addr = __pa(vcpu_to_pi_desc(vcpu)); |
332 | vcpu_info.vector = irq.vector; |
333 | |
334 | trace_kvm_pi_irte_update(host_irq, vcpu->vcpu_id, e->gsi, |
335 | vcpu_info.vector, vcpu_info.pi_desc_addr, set); |
336 | |
337 | if (set) |
338 | ret = irq_set_vcpu_affinity(irq: host_irq, vcpu_info: &vcpu_info); |
339 | else |
340 | ret = irq_set_vcpu_affinity(irq: host_irq, NULL); |
341 | |
342 | if (ret < 0) { |
343 | printk(KERN_INFO "%s: failed to update PI IRTE\n" , |
344 | __func__); |
345 | goto out; |
346 | } |
347 | } |
348 | |
349 | ret = 0; |
350 | out: |
351 | srcu_read_unlock(ssp: &kvm->irq_srcu, idx); |
352 | return ret; |
353 | } |
354 | |