1 | // SPDX-License-Identifier: GPL-2.0-only |
2 | /* |
3 | * Kernel-based Virtual Machine driver for Linux |
4 | * |
5 | * AMD SVM support |
6 | * |
7 | * Copyright (C) 2006 Qumranet, Inc. |
8 | * Copyright 2010 Red Hat, Inc. and/or its affiliates. |
9 | * |
10 | * Authors: |
11 | * Yaniv Kamay <yaniv@qumranet.com> |
12 | * Avi Kivity <avi@qumranet.com> |
13 | */ |
14 | |
15 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
16 | |
17 | #include <linux/kvm_types.h> |
18 | #include <linux/hashtable.h> |
19 | #include <linux/amd-iommu.h> |
20 | #include <linux/kvm_host.h> |
21 | |
22 | #include <asm/irq_remapping.h> |
23 | |
24 | #include "trace.h" |
25 | #include "lapic.h" |
26 | #include "x86.h" |
27 | #include "irq.h" |
28 | #include "svm.h" |
29 | |
30 | /* |
31 | * Encode the arbitrary VM ID and the vCPU's default APIC ID, i.e. the vCPU ID, |
32 | * into the GATag so that KVM can retrieve the correct vCPU from a GALog entry |
33 | * if an interrupt can't be delivered, e.g. because the vCPU isn't running. |
34 | * |
35 | * For the vCPU ID, use however many bits are currently allowed for the max |
36 | * guest physical APIC ID (limited by the size of the physical ID table), and |
37 | * use whatever bits remain to assign arbitrary AVIC IDs to VMs. Note, the |
38 | * size of the GATag is defined by hardware (32 bits), but is an opaque value |
39 | * as far as hardware is concerned. |
40 | */ |
41 | #define AVIC_VCPU_ID_MASK AVIC_PHYSICAL_MAX_INDEX_MASK |
42 | |
43 | #define AVIC_VM_ID_SHIFT HWEIGHT32(AVIC_PHYSICAL_MAX_INDEX_MASK) |
44 | #define AVIC_VM_ID_MASK (GENMASK(31, AVIC_VM_ID_SHIFT) >> AVIC_VM_ID_SHIFT) |
45 | |
46 | #define AVIC_GATAG_TO_VMID(x) ((x >> AVIC_VM_ID_SHIFT) & AVIC_VM_ID_MASK) |
47 | #define AVIC_GATAG_TO_VCPUID(x) (x & AVIC_VCPU_ID_MASK) |
48 | |
49 | #define __AVIC_GATAG(vm_id, vcpu_id) ((((vm_id) & AVIC_VM_ID_MASK) << AVIC_VM_ID_SHIFT) | \ |
50 | ((vcpu_id) & AVIC_VCPU_ID_MASK)) |
51 | #define AVIC_GATAG(vm_id, vcpu_id) \ |
52 | ({ \ |
53 | u32 ga_tag = __AVIC_GATAG(vm_id, vcpu_id); \ |
54 | \ |
55 | WARN_ON_ONCE(AVIC_GATAG_TO_VCPUID(ga_tag) != (vcpu_id)); \ |
56 | WARN_ON_ONCE(AVIC_GATAG_TO_VMID(ga_tag) != (vm_id)); \ |
57 | ga_tag; \ |
58 | }) |
59 | |
60 | static_assert(__AVIC_GATAG(AVIC_VM_ID_MASK, AVIC_VCPU_ID_MASK) == -1u); |
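| |
| /* |
|  * Worked example (illustrative only; assumes the physical index mask is |
|  * the 9-bit value 0x1ff, i.e. AVIC_VM_ID_SHIFT == 9): |
|  *   GATag = (vm_id << 9) + vcpu_id |
|  *   vm_id = 0x2a, vcpu_id = 3   =>   GATag = 0x5403 |
|  *   AVIC_GATAG_TO_VMID(0x5403) == 0x2a, AVIC_GATAG_TO_VCPUID(0x5403) == 3 |
|  */ |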
61 | |
62 | static bool force_avic; |
63 | module_param_unsafe(force_avic, bool, 0444); |
64 | |
65 | /* Note: |
66 | * This hash table is used to map a VM ID to a struct kvm_svm |
67 | * when handling an AMD IOMMU GALog notification, in order to |
68 | * schedule in a particular vCPU. |
69 | */ |
70 | #define SVM_VM_DATA_HASH_BITS 8 |
71 | static DEFINE_HASHTABLE(svm_vm_data_hash, SVM_VM_DATA_HASH_BITS); |
72 | static u32 next_vm_id = 0; |
73 | static bool next_vm_id_wrapped = 0; |
74 | static DEFINE_SPINLOCK(svm_vm_data_hash_lock); |
75 | bool x2avic_enabled; |
76 | |
77 | /* |
78 | * This is a wrapper of struct amd_iommu_ir_data. |
79 | */ |
80 | struct amd_svm_iommu_ir { |
81 | struct list_head node; /* Used by SVM for per-vcpu ir_list */ |
82 | void *data; /* Storing pointer to struct amd_ir_data */ |
83 | }; |
84 | |
85 | static void avic_activate_vmcb(struct vcpu_svm *svm) |
86 | { |
87 | struct vmcb *vmcb = svm->vmcb01.ptr; |
88 | |
89 | vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); |
90 | vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; |
91 | |
92 | vmcb->control.int_ctl |= AVIC_ENABLE_MASK; |
93 | |
94 | /* |
95 | * Note: KVM supports hybrid-AVIC mode, where KVM emulates x2APIC MSR |
96 | * accesses, while interrupt injection to a running vCPU can be |
97 | * achieved using AVIC doorbell. KVM disables the APIC access page |
98 | * (deletes the memslot) if any vCPU has x2APIC enabled, thus enabling |
99 | * AVIC in hybrid mode activates only the doorbell mechanism. |
100 | */ |
101 | if (x2avic_enabled && apic_x2apic_mode(svm->vcpu.arch.apic)) { |
102 | vmcb->control.int_ctl |= X2APIC_MODE_MASK; |
103 | vmcb->control.avic_physical_id |= X2AVIC_MAX_PHYSICAL_ID; |
104 | /* Disabling MSR intercept for x2APIC registers */ |
105 | svm_set_x2apic_msr_interception(svm, false); |
106 | } else { |
107 | /* |
108 | * Flush the TLB, the guest may have inserted a non-APIC |
109 | * mapping into the TLB while AVIC was disabled. |
110 | */ |
111 | kvm_make_request(KVM_REQ_TLB_FLUSH_CURRENT, &svm->vcpu); |
112 | |
113 | /* For xAVIC and hybrid-xAVIC modes */ |
114 | vmcb->control.avic_physical_id |= AVIC_MAX_PHYSICAL_ID; |
115 | /* Enabling MSR intercept for x2APIC registers */ |
116 | svm_set_x2apic_msr_interception(svm, true); |
117 | } |
118 | } |
119 | |
120 | static void avic_deactivate_vmcb(struct vcpu_svm *svm) |
121 | { |
122 | struct vmcb *vmcb = svm->vmcb01.ptr; |
123 | |
124 | vmcb->control.int_ctl &= ~(AVIC_ENABLE_MASK | X2APIC_MODE_MASK); |
125 | vmcb->control.avic_physical_id &= ~AVIC_PHYSICAL_MAX_INDEX_MASK; |
126 | |
127 | /* |
128 | * If running nested and the guest uses its own MSR bitmap, there |
129 | * is no need to update L0's msr bitmap |
130 | */ |
131 | if (is_guest_mode(&svm->vcpu) && |
132 | vmcb12_is_intercept(&svm->nested.ctl, INTERCEPT_MSR_PROT)) |
133 | return; |
134 | |
135 | /* Enabling MSR intercept for x2APIC registers */ |
136 | svm_set_x2apic_msr_interception(svm, true); |
137 | } |
138 | |
139 | /* Note: |
140 | * This function is called from IOMMU driver to notify |
141 | * SVM to schedule in a particular vCPU of a particular VM. |
142 | */ |
143 | int avic_ga_log_notifier(u32 ga_tag) |
144 | { |
145 | unsigned long flags; |
146 | struct kvm_svm *kvm_svm; |
147 | struct kvm_vcpu *vcpu = NULL; |
148 | u32 vm_id = AVIC_GATAG_TO_VMID(ga_tag); |
149 | u32 vcpu_id = AVIC_GATAG_TO_VCPUID(ga_tag); |
150 | |
151 | pr_debug("SVM: %s: vm_id=%#x, vcpu_id=%#x\n", __func__, vm_id, vcpu_id); |
152 | trace_kvm_avic_ga_log(vm_id, vcpu_id); |
153 | |
154 | spin_lock_irqsave(&svm_vm_data_hash_lock, flags); |
155 | hash_for_each_possible(svm_vm_data_hash, kvm_svm, hnode, vm_id) { |
156 | if (kvm_svm->avic_vm_id != vm_id) |
157 | continue; |
158 | vcpu = kvm_get_vcpu_by_id(&kvm_svm->kvm, vcpu_id); |
159 | break; |
160 | } |
161 | spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); |
162 | |
163 | /* Note: |
164 | * At this point, the IOMMU should have already set the pending |
165 | * bit in the vAPIC backing page. So, we just need to schedule |
166 | * in the vcpu. |
167 | */ |
168 | if (vcpu) |
169 | kvm_vcpu_wake_up(vcpu); |
170 | |
171 | return 0; |
172 | } |
173 | |
174 | void avic_vm_destroy(struct kvm *kvm) |
175 | { |
176 | unsigned long flags; |
177 | struct kvm_svm *kvm_svm = to_kvm_svm(kvm); |
178 | |
179 | if (!enable_apicv) |
180 | return; |
181 | |
182 | if (kvm_svm->avic_logical_id_table_page) |
183 | __free_page(kvm_svm->avic_logical_id_table_page); |
184 | if (kvm_svm->avic_physical_id_table_page) |
185 | __free_page(kvm_svm->avic_physical_id_table_page); |
186 | |
187 | spin_lock_irqsave(&svm_vm_data_hash_lock, flags); |
188 | hash_del(&kvm_svm->hnode); |
189 | spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); |
190 | } |
191 | |
192 | int avic_vm_init(struct kvm *kvm) |
193 | { |
194 | unsigned long flags; |
195 | int err = -ENOMEM; |
196 | struct kvm_svm *kvm_svm = to_kvm_svm(kvm); |
197 | struct kvm_svm *k2; |
198 | struct page *p_page; |
199 | struct page *l_page; |
200 | u32 vm_id; |
201 | |
202 | if (!enable_apicv) |
203 | return 0; |
204 | |
205 | /* Allocating physical APIC ID table (4KB) */ |
206 | p_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); |
207 | if (!p_page) |
208 | goto free_avic; |
209 | |
210 | kvm_svm->avic_physical_id_table_page = p_page; |
211 | |
212 | /* Allocating logical APIC ID table (4KB) */ |
213 | l_page = alloc_page(GFP_KERNEL_ACCOUNT | __GFP_ZERO); |
214 | if (!l_page) |
215 | goto free_avic; |
216 | |
217 | kvm_svm->avic_logical_id_table_page = l_page; |
218 | |
219 | spin_lock_irqsave(&svm_vm_data_hash_lock, flags); |
220 | again: |
221 | vm_id = next_vm_id = (next_vm_id + 1) & AVIC_VM_ID_MASK; |
222 | if (vm_id == 0) { /* id is 1-based, zero is not okay */ |
223 | next_vm_id_wrapped = 1; |
224 | goto again; |
225 | } |
226 | /* Is it still in use? Only possible if wrapped at least once */ |
227 | if (next_vm_id_wrapped) { |
228 | hash_for_each_possible(svm_vm_data_hash, k2, hnode, vm_id) { |
229 | if (k2->avic_vm_id == vm_id) |
230 | goto again; |
231 | } |
232 | } |
233 | kvm_svm->avic_vm_id = vm_id; |
234 | hash_add(svm_vm_data_hash, &kvm_svm->hnode, kvm_svm->avic_vm_id); |
235 | spin_unlock_irqrestore(&svm_vm_data_hash_lock, flags); |
236 | |
237 | return 0; |
238 | |
239 | free_avic: |
240 | avic_vm_destroy(kvm); |
241 | return err; |
242 | } |
243 | |
244 | void avic_init_vmcb(struct vcpu_svm *svm, struct vmcb *vmcb) |
245 | { |
246 | struct kvm_svm *kvm_svm = to_kvm_svm(svm->vcpu.kvm); |
247 | phys_addr_t bpa = __sme_set(page_to_phys(svm->avic_backing_page)); |
248 | phys_addr_t lpa = __sme_set(page_to_phys(kvm_svm->avic_logical_id_table_page)); |
249 | phys_addr_t ppa = __sme_set(page_to_phys(kvm_svm->avic_physical_id_table_page)); |
250 | |
251 | vmcb->control.avic_backing_page = bpa & AVIC_HPA_MASK; |
252 | vmcb->control.avic_logical_id = lpa & AVIC_HPA_MASK; |
253 | vmcb->control.avic_physical_id = ppa & AVIC_HPA_MASK; |
254 | vmcb->control.avic_vapic_bar = APIC_DEFAULT_PHYS_BASE & VMCB_AVIC_APIC_BAR_MASK; |
255 | |
256 | if (kvm_apicv_activated(svm->vcpu.kvm)) |
257 | avic_activate_vmcb(svm); |
258 | else |
259 | avic_deactivate_vmcb(svm); |
260 | } |
261 | |
262 | static u64 *avic_get_physical_id_entry(struct kvm_vcpu *vcpu, |
263 | unsigned int index) |
264 | { |
265 | u64 *avic_physical_id_table; |
266 | struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); |
267 | |
268 | if ((!x2avic_enabled && index > AVIC_MAX_PHYSICAL_ID) || |
269 | (index > X2AVIC_MAX_PHYSICAL_ID)) |
270 | return NULL; |
271 | |
272 | avic_physical_id_table = page_address(kvm_svm->avic_physical_id_table_page); |
273 | |
274 | return &avic_physical_id_table[index]; |
275 | } |
276 | |
277 | static int avic_init_backing_page(struct kvm_vcpu *vcpu) |
278 | { |
279 | u64 *entry, new_entry; |
280 | int id = vcpu->vcpu_id; |
281 | struct vcpu_svm *svm = to_svm(vcpu); |
282 | |
283 | if ((!x2avic_enabled && id > AVIC_MAX_PHYSICAL_ID) || |
284 | (id > X2AVIC_MAX_PHYSICAL_ID)) |
285 | return -EINVAL; |
286 | |
287 | if (!vcpu->arch.apic->regs) |
288 | return -EINVAL; |
289 | |
290 | if (kvm_apicv_activated(vcpu->kvm)) { |
291 | int ret; |
292 | |
293 | /* |
294 | * Note, AVIC hardware walks the nested page table to check |
295 | * permissions, but does not use the SPA address specified in |
296 | * the leaf SPTE since it uses the address in the AVIC_BACKING_PAGE |
297 | * pointer field of the VMCB. |
298 | */ |
299 | ret = kvm_alloc_apic_access_page(vcpu->kvm); |
300 | if (ret) |
301 | return ret; |
302 | } |
303 | |
304 | svm->avic_backing_page = virt_to_page(vcpu->arch.apic->regs); |
305 | |
306 | /* Setting AVIC backing page address in the phy APIC ID table */ |
307 | entry = avic_get_physical_id_entry(vcpu, id); |
308 | if (!entry) |
309 | return -EINVAL; |
310 | |
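| /* |
|  * The entry written below holds the (possibly SME-tagged) physical |
|  * address of the vAPIC backing page plus the "valid" bit. The |
|  * "is running" bit and the host physical APIC ID are filled in |
|  * later, by avic_vcpu_load(). |
|  */ |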
311 | new_entry = __sme_set((page_to_phys(svm->avic_backing_page) & |
312 | AVIC_PHYSICAL_ID_ENTRY_BACKING_PAGE_MASK) | |
313 | AVIC_PHYSICAL_ID_ENTRY_VALID_MASK); |
314 | WRITE_ONCE(*entry, new_entry); |
315 | |
316 | svm->avic_physical_id_cache = entry; |
317 | |
318 | return 0; |
319 | } |
320 | |
321 | void avic_ring_doorbell(struct kvm_vcpu *vcpu) |
322 | { |
323 | /* |
324 | * Note, the vCPU could get migrated to a different pCPU at any point, |
325 | * which could result in signalling the wrong/previous pCPU. But if |
326 | * that happens the vCPU is guaranteed to do a VMRUN (after being |
327 | * migrated) and thus will process pending interrupts, i.e. a doorbell |
328 | * is not needed (and the spurious one is harmless). |
329 | */ |
330 | int cpu = READ_ONCE(vcpu->cpu); |
331 | |
332 | if (cpu != get_cpu()) { |
333 | wrmsrl(MSR_AMD64_SVM_AVIC_DOORBELL, kvm_cpu_get_apicid(cpu)); |
334 | trace_kvm_avic_doorbell(vcpu->vcpu_id, kvm_cpu_get_apicid(cpu)); |
335 | } |
336 | put_cpu(); |
337 | } |
338 | |
339 | |
340 | static void avic_kick_vcpu(struct kvm_vcpu *vcpu, u32 icrl) |
341 | { |
342 | vcpu->arch.apic->irr_pending = true; |
343 | svm_complete_interrupt_delivery(vcpu, |
344 | icrl & APIC_MODE_MASK, |
345 | icrl & APIC_INT_LEVELTRIG, |
346 | icrl & APIC_VECTOR_MASK); |
347 | } |
348 | |
349 | static void avic_kick_vcpu_by_physical_id(struct kvm *kvm, u32 physical_id, |
350 | u32 icrl) |
351 | { |
352 | /* |
353 | * KVM inhibits AVIC if any vCPU ID diverges from the vCPU's APIC ID, |
354 | * i.e. APIC ID == vCPU ID. |
355 | */ |
356 | struct kvm_vcpu *target_vcpu = kvm_get_vcpu_by_id(kvm, physical_id); |
357 | |
358 | /* Once again, nothing to do if the target vCPU doesn't exist. */ |
359 | if (unlikely(!target_vcpu)) |
360 | return; |
361 | |
362 | avic_kick_vcpu(target_vcpu, icrl); |
363 | } |
364 | |
365 | static void avic_kick_vcpu_by_logical_id(struct kvm *kvm, u32 *avic_logical_id_table, |
366 | u32 logid_index, u32 icrl) |
367 | { |
368 | u32 physical_id; |
369 | |
370 | if (avic_logical_id_table) { |
371 | u32 logid_entry = avic_logical_id_table[logid_index]; |
372 | |
373 | /* Nothing to do if the logical destination is invalid. */ |
374 | if (unlikely(!(logid_entry & AVIC_LOGICAL_ID_ENTRY_VALID_MASK))) |
375 | return; |
376 | |
377 | physical_id = logid_entry & |
378 | AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; |
379 | } else { |
380 | /* |
381 | * For x2APIC, the logical APIC ID is a read-only value that is |
382 | * derived from the x2APIC ID, thus the x2APIC ID can be found |
383 | * by reversing the calculation (stored in logid_index). Note, |
384 | * bits 31:20 of the x2APIC ID aren't propagated to the logical |
385 | * ID, but KVM limits the x2APIC ID to KVM_MAX_VCPU_IDS. |
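| * |
| * Illustrative example: per the x2APIC logical ID derivation, |
| * logical ID = ((id >> 4) << 16) + (1 << (id & 0xf)), so x2APIC ID |
| * 0x23 yields logical ID 0x20008, i.e. cluster 0x2, bit 3, and |
| * logid_index 0x23 recovers the x2APIC ID directly. |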
386 | */ |
387 | physical_id = logid_index; |
388 | } |
389 | |
390 | avic_kick_vcpu_by_physical_id(kvm, physical_id, icrl); |
391 | } |
392 | |
393 | /* |
394 | * A fast-path version of avic_kick_target_vcpus(), which attempts to match |
395 | * destination APIC ID to vCPU without looping through all vCPUs. |
396 | */ |
397 | static int avic_kick_target_vcpus_fast(struct kvm *kvm, struct kvm_lapic *source, |
398 | u32 icrl, u32 icrh, u32 index) |
399 | { |
400 | int dest_mode = icrl & APIC_DEST_MASK; |
401 | int shorthand = icrl & APIC_SHORT_MASK; |
402 | struct kvm_svm *kvm_svm = to_kvm_svm(kvm); |
403 | u32 dest; |
404 | |
405 | if (shorthand != APIC_DEST_NOSHORT) |
406 | return -EINVAL; |
407 | |
408 | if (apic_x2apic_mode(source)) |
409 | dest = icrh; |
410 | else |
411 | dest = GET_XAPIC_DEST_FIELD(icrh); |
412 | |
413 | if (dest_mode == APIC_DEST_PHYSICAL) { |
414 | /* broadcast destination, use slow path */ |
415 | if (apic_x2apic_mode(source) && dest == X2APIC_BROADCAST) |
416 | return -EINVAL; |
417 | if (!apic_x2apic_mode(source) && dest == APIC_BROADCAST) |
418 | return -EINVAL; |
419 | |
420 | if (WARN_ON_ONCE(dest != index)) |
421 | return -EINVAL; |
422 | |
423 | avic_kick_vcpu_by_physical_id(kvm, dest, icrl); |
424 | } else { |
425 | u32 *avic_logical_id_table; |
426 | unsigned long bitmap, i; |
427 | u32 cluster; |
428 | |
429 | if (apic_x2apic_mode(source)) { |
430 | /* 16 bit dest mask, 16 bit cluster id */ |
431 | bitmap = dest & 0xFFFF; |
432 | cluster = (dest >> 16) << 4; |
433 | } else if (kvm_lapic_get_reg(source, APIC_DFR) == APIC_DFR_FLAT) { |
434 | /* 8 bit dest mask */ |
435 | bitmap = dest; |
436 | cluster = 0; |
437 | } else { |
438 | /* 4 bit dest mask, 4 bit cluster id */ |
439 | bitmap = dest & 0xF; |
440 | cluster = (dest >> 4) << 2; |
441 | } |
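| /* |
|  * Illustrative examples of the decode above: an x2APIC dest of |
|  * 0x0002000c gives bitmap 0x000c and cluster 0x20 (logical table |
|  * entries 0x22 and 0x23); an xAPIC cluster-mode dest of 0x53 gives |
|  * bitmap 0x3 and cluster 0x14 (entries 0x14 and 0x15). |
|  */ |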
442 | |
443 | /* Nothing to do if there are no destinations in the cluster. */ |
444 | if (unlikely(!bitmap)) |
445 | return 0; |
446 | |
447 | if (apic_x2apic_mode(source)) |
448 | avic_logical_id_table = NULL; |
449 | else |
450 | avic_logical_id_table = page_address(kvm_svm->avic_logical_id_table_page); |
451 | |
452 | /* |
453 | * AVIC is inhibited if vCPUs aren't mapped 1:1 with logical |
454 | * IDs, thus each bit in the destination is guaranteed to map |
455 | * to at most one vCPU. |
456 | */ |
457 | for_each_set_bit(i, &bitmap, 16) |
458 | avic_kick_vcpu_by_logical_id(kvm, avic_logical_id_table, |
459 | cluster + i, icrl); |
460 | } |
461 | |
462 | return 0; |
463 | } |
464 | |
465 | static void avic_kick_target_vcpus(struct kvm *kvm, struct kvm_lapic *source, |
466 | u32 icrl, u32 icrh, u32 index) |
467 | { |
468 | u32 dest = apic_x2apic_mode(source) ? icrh : GET_XAPIC_DEST_FIELD(icrh); |
469 | unsigned long i; |
470 | struct kvm_vcpu *vcpu; |
471 | |
472 | if (!avic_kick_target_vcpus_fast(kvm, source, icrl, icrh, index)) |
473 | return; |
474 | |
475 | trace_kvm_avic_kick_vcpu_slowpath(icrh, icrl, index); |
476 | |
477 | /* |
478 | * Wake any target vCPUs that are blocking, i.e. waiting for a wake |
479 | * event. There's no need to signal doorbells, as hardware has handled |
480 | * vCPUs that were in guest at the time of the IPI, and vCPUs that have |
481 | * since entered the guest will have processed pending IRQs at VMRUN. |
482 | */ |
483 | kvm_for_each_vcpu(i, vcpu, kvm) { |
484 | if (kvm_apic_match_dest(vcpu, source, icrl & APIC_SHORT_MASK, |
485 | dest, icrl & APIC_DEST_MASK)) |
486 | avic_kick_vcpu(vcpu, icrl); |
487 | } |
488 | } |
489 | |
490 | int avic_incomplete_ipi_interception(struct kvm_vcpu *vcpu) |
491 | { |
492 | struct vcpu_svm *svm = to_svm(vcpu); |
493 | u32 icrh = svm->vmcb->control.exit_info_1 >> 32; |
494 | u32 icrl = svm->vmcb->control.exit_info_1; |
495 | u32 id = svm->vmcb->control.exit_info_2 >> 32; |
496 | u32 index = svm->vmcb->control.exit_info_2 & 0x1FF; |
497 | struct kvm_lapic *apic = vcpu->arch.apic; |
498 | |
499 | trace_kvm_avic_incomplete_ipi(vcpu->vcpu_id, icrh, icrl, id, index); |
500 | |
501 | switch (id) { |
502 | case AVIC_IPI_FAILURE_INVALID_TARGET: |
503 | case AVIC_IPI_FAILURE_INVALID_INT_TYPE: |
504 | /* |
505 | * Emulate IPIs that are not handled by AVIC hardware, which |
506 | * only virtualizes Fixed, Edge-Triggered INTRs, and falls over |
507 | * if _any_ targets are invalid, e.g. if the logical mode mask |
508 | * is a superset of running vCPUs. |
509 | * |
510 | * The exit is a trap, e.g. ICR holds the correct value and RIP |
511 | * has been advanced, KVM is responsible only for emulating the |
512 | * IPI. Sadly, hardware may sometimes leave the BUSY flag set, |
513 | * in which case KVM needs to emulate the ICR write as well in |
514 | * order to clear the BUSY flag. |
515 | */ |
516 | if (icrl & APIC_ICR_BUSY) |
517 | kvm_apic_write_nodecode(vcpu, APIC_ICR); |
518 | else |
519 | kvm_apic_send_ipi(apic, icrl, icrh); |
520 | break; |
521 | case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: |
522 | /* |
523 | * At this point, we expect that the AVIC HW has already |
524 | * set the appropriate IRR bits on the valid target |
525 | * vcpus. So, we just need to kick the appropriate vcpu. |
526 | */ |
527 | avic_kick_target_vcpus(vcpu->kvm, apic, icrl, icrh, index); |
528 | break; |
529 | case AVIC_IPI_FAILURE_INVALID_BACKING_PAGE: |
530 | WARN_ONCE(1, "Invalid backing page\n"); |
531 | break; |
532 | case AVIC_IPI_FAILURE_INVALID_IPI_VECTOR: |
533 | /* Invalid IPI with vector < 16 */ |
534 | break; |
535 | default: |
536 | vcpu_unimpl(vcpu, "Unknown avic incomplete IPI interception\n"); |
537 | } |
538 | |
539 | return 1; |
540 | } |
541 | |
542 | unsigned long avic_vcpu_get_apicv_inhibit_reasons(struct kvm_vcpu *vcpu) |
543 | { |
544 | if (is_guest_mode(vcpu)) |
545 | return APICV_INHIBIT_REASON_NESTED; |
546 | return 0; |
547 | } |
548 | |
549 | static u32 *avic_get_logical_id_entry(struct kvm_vcpu *vcpu, u32 ldr, bool flat) |
550 | { |
551 | struct kvm_svm *kvm_svm = to_kvm_svm(vcpu->kvm); |
552 | u32 *logical_apic_id_table; |
553 | u32 cluster, index; |
554 | |
555 | ldr = GET_APIC_LOGICAL_ID(ldr); |
556 | |
557 | if (flat) { |
558 | cluster = 0; |
559 | } else { |
560 | cluster = (ldr >> 4); |
561 | if (cluster >= 0xf) |
562 | return NULL; |
563 | ldr &= 0xf; |
564 | } |
565 | if (!ldr || !is_power_of_2(ldr)) |
566 | return NULL; |
567 | |
568 | index = __ffs(ldr); |
569 | if (WARN_ON_ONCE(index > 7)) |
570 | return NULL; |
571 | index += (cluster << 2); |
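| /* |
|  * Illustrative example: in cluster mode, logical APIC ID 0x21 |
|  * (cluster 2, bit 0) maps to index 8; in flat mode, logical ID 0x10 |
|  * maps to index 4. |
|  */ |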
572 | |
573 | logical_apic_id_table = (u32 *) page_address(kvm_svm->avic_logical_id_table_page); |
574 | |
575 | return &logical_apic_id_table[index]; |
576 | } |
577 | |
578 | static void avic_ldr_write(struct kvm_vcpu *vcpu, u8 g_physical_id, u32 ldr) |
579 | { |
580 | bool flat; |
581 | u32 *entry, new_entry; |
582 | |
583 | flat = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR) == APIC_DFR_FLAT; |
584 | entry = avic_get_logical_id_entry(vcpu, ldr, flat); |
585 | if (!entry) |
586 | return; |
587 | |
588 | new_entry = READ_ONCE(*entry); |
589 | new_entry &= ~AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK; |
590 | new_entry |= (g_physical_id & AVIC_LOGICAL_ID_ENTRY_GUEST_PHYSICAL_ID_MASK); |
591 | new_entry |= AVIC_LOGICAL_ID_ENTRY_VALID_MASK; |
592 | WRITE_ONCE(*entry, new_entry); |
593 | } |
594 | |
595 | static void avic_invalidate_logical_id_entry(struct kvm_vcpu *vcpu) |
596 | { |
597 | struct vcpu_svm *svm = to_svm(vcpu); |
598 | bool flat = svm->dfr_reg == APIC_DFR_FLAT; |
599 | u32 *entry; |
600 | |
601 | /* Note: x2AVIC does not use logical APIC ID table */ |
602 | if (apic_x2apic_mode(vcpu->arch.apic)) |
603 | return; |
604 | |
605 | entry = avic_get_logical_id_entry(vcpu, svm->ldr_reg, flat); |
606 | if (entry) |
607 | clear_bit(AVIC_LOGICAL_ID_ENTRY_VALID_BIT, (unsigned long *)entry); |
608 | } |
609 | |
610 | static void avic_handle_ldr_update(struct kvm_vcpu *vcpu) |
611 | { |
612 | struct vcpu_svm *svm = to_svm(vcpu); |
613 | u32 ldr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_LDR); |
614 | u32 id = kvm_xapic_id(vcpu->arch.apic); |
615 | |
616 | /* AVIC does not support LDR update for x2APIC */ |
617 | if (apic_x2apic_mode(vcpu->arch.apic)) |
618 | return; |
619 | |
620 | if (ldr == svm->ldr_reg) |
621 | return; |
622 | |
623 | avic_invalidate_logical_id_entry(vcpu); |
624 | |
625 | svm->ldr_reg = ldr; |
626 | avic_ldr_write(vcpu, id, ldr); |
627 | } |
628 | |
629 | static void avic_handle_dfr_update(struct kvm_vcpu *vcpu) |
630 | { |
631 | struct vcpu_svm *svm = to_svm(vcpu); |
632 | u32 dfr = kvm_lapic_get_reg(vcpu->arch.apic, APIC_DFR); |
633 | |
634 | if (svm->dfr_reg == dfr) |
635 | return; |
636 | |
637 | avic_invalidate_logical_id_entry(vcpu); |
638 | svm->dfr_reg = dfr; |
639 | } |
640 | |
641 | static int avic_unaccel_trap_write(struct kvm_vcpu *vcpu) |
642 | { |
643 | u32 offset = to_svm(vcpu)->vmcb->control.exit_info_1 & |
644 | AVIC_UNACCEL_ACCESS_OFFSET_MASK; |
645 | |
646 | switch (offset) { |
647 | case APIC_LDR: |
648 | avic_handle_ldr_update(vcpu); |
649 | break; |
650 | case APIC_DFR: |
651 | avic_handle_dfr_update(vcpu); |
652 | break; |
653 | case APIC_RRR: |
654 | /* Ignore writes to Read Remote Data, it's read-only. */ |
655 | return 1; |
656 | default: |
657 | break; |
658 | } |
659 | |
660 | kvm_apic_write_nodecode(vcpu, offset); |
661 | return 1; |
662 | } |
663 | |
664 | static bool is_avic_unaccelerated_access_trap(u32 offset) |
665 | { |
666 | bool ret = false; |
667 | |
668 | switch (offset) { |
669 | case APIC_ID: |
670 | case APIC_EOI: |
671 | case APIC_RRR: |
672 | case APIC_LDR: |
673 | case APIC_DFR: |
674 | case APIC_SPIV: |
675 | case APIC_ESR: |
676 | case APIC_ICR: |
677 | case APIC_LVTT: |
678 | case APIC_LVTTHMR: |
679 | case APIC_LVTPC: |
680 | case APIC_LVT0: |
681 | case APIC_LVT1: |
682 | case APIC_LVTERR: |
683 | case APIC_TMICT: |
684 | case APIC_TDCR: |
685 | ret = true; |
686 | break; |
687 | default: |
688 | break; |
689 | } |
690 | return ret; |
691 | } |
692 | |
693 | int avic_unaccelerated_access_interception(struct kvm_vcpu *vcpu) |
694 | { |
695 | struct vcpu_svm *svm = to_svm(vcpu); |
696 | int ret = 0; |
697 | u32 offset = svm->vmcb->control.exit_info_1 & |
698 | AVIC_UNACCEL_ACCESS_OFFSET_MASK; |
699 | u32 vector = svm->vmcb->control.exit_info_2 & |
700 | AVIC_UNACCEL_ACCESS_VECTOR_MASK; |
701 | bool write = (svm->vmcb->control.exit_info_1 >> 32) & |
702 | AVIC_UNACCEL_ACCESS_WRITE_MASK; |
703 | bool trap = is_avic_unaccelerated_access_trap(offset); |
704 | |
705 | trace_kvm_avic_unaccelerated_access(vcpu->vcpu_id, offset, |
706 | trap, write, vector); |
707 | if (trap) { |
708 | /* Handling Trap */ |
709 | WARN_ONCE(!write, "svm: Handling trap read.\n"); |
710 | ret = avic_unaccel_trap_write(vcpu); |
711 | } else { |
712 | /* Handling Fault */ |
713 | ret = kvm_emulate_instruction(vcpu, 0); |
714 | } |
715 | |
716 | return ret; |
717 | } |
718 | |
719 | int avic_init_vcpu(struct vcpu_svm *svm) |
720 | { |
721 | int ret; |
722 | struct kvm_vcpu *vcpu = &svm->vcpu; |
723 | |
724 | if (!enable_apicv || !irqchip_in_kernel(vcpu->kvm)) |
725 | return 0; |
726 | |
727 | ret = avic_init_backing_page(vcpu); |
728 | if (ret) |
729 | return ret; |
730 | |
731 | INIT_LIST_HEAD(&svm->ir_list); |
732 | spin_lock_init(&svm->ir_list_lock); |
733 | svm->dfr_reg = APIC_DFR_FLAT; |
734 | |
735 | return ret; |
736 | } |
737 | |
738 | void avic_apicv_post_state_restore(struct kvm_vcpu *vcpu) |
739 | { |
740 | avic_handle_dfr_update(vcpu); |
741 | avic_handle_ldr_update(vcpu); |
742 | } |
743 | |
744 | static int avic_set_pi_irte_mode(struct kvm_vcpu *vcpu, bool activate) |
745 | { |
746 | int ret = 0; |
747 | unsigned long flags; |
748 | struct amd_svm_iommu_ir *ir; |
749 | struct vcpu_svm *svm = to_svm(vcpu); |
750 | |
751 | if (!kvm_arch_has_assigned_device(vcpu->kvm)) |
752 | return 0; |
753 | |
754 | /* |
755 | * Here, we go through the per-vcpu ir_list to update all existing |
756 | * interrupt remapping table entries targeting this vcpu. |
757 | */ |
758 | spin_lock_irqsave(&svm->ir_list_lock, flags); |
759 | |
760 | if (list_empty(&svm->ir_list)) |
761 | goto out; |
762 | |
763 | list_for_each_entry(ir, &svm->ir_list, node) { |
764 | if (activate) |
765 | ret = amd_iommu_activate_guest_mode(ir->data); |
766 | else |
767 | ret = amd_iommu_deactivate_guest_mode(ir->data); |
768 | if (ret) |
769 | break; |
770 | } |
771 | out: |
772 | spin_unlock_irqrestore(&svm->ir_list_lock, flags); |
773 | return ret; |
774 | } |
775 | |
776 | static void svm_ir_list_del(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) |
777 | { |
778 | unsigned long flags; |
779 | struct amd_svm_iommu_ir *cur; |
780 | |
781 | spin_lock_irqsave(&svm->ir_list_lock, flags); |
782 | list_for_each_entry(cur, &svm->ir_list, node) { |
783 | if (cur->data != pi->ir_data) |
784 | continue; |
785 | list_del(&cur->node); |
786 | kfree(cur); |
787 | break; |
788 | } |
789 | spin_unlock_irqrestore(&svm->ir_list_lock, flags); |
790 | } |
791 | |
792 | static int svm_ir_list_add(struct vcpu_svm *svm, struct amd_iommu_pi_data *pi) |
793 | { |
794 | int ret = 0; |
795 | unsigned long flags; |
796 | struct amd_svm_iommu_ir *ir; |
797 | u64 entry; |
798 | |
799 | /** |
800 | * In some cases, the existing irte is updated and re-set, |
801 | * so we need to check here if it has already been added |
802 | * to the ir_list. |
803 | */ |
804 | if (pi->ir_data && (pi->prev_ga_tag != 0)) { |
805 | struct kvm *kvm = svm->vcpu.kvm; |
806 | u32 vcpu_id = AVIC_GATAG_TO_VCPUID(pi->prev_ga_tag); |
807 | struct kvm_vcpu *prev_vcpu = kvm_get_vcpu_by_id(kvm, vcpu_id); |
808 | struct vcpu_svm *prev_svm; |
809 | |
810 | if (!prev_vcpu) { |
811 | ret = -EINVAL; |
812 | goto out; |
813 | } |
814 | |
815 | prev_svm = to_svm(prev_vcpu); |
816 | svm_ir_list_del(prev_svm, pi); |
817 | } |
818 | |
819 | /** |
820 | * Allocate a new amd_svm_iommu_ir entry, which will be |
821 | * added to the per-vcpu ir_list. |
822 | */ |
823 | ir = kzalloc(sizeof(struct amd_svm_iommu_ir), GFP_KERNEL_ACCOUNT); |
824 | if (!ir) { |
825 | ret = -ENOMEM; |
826 | goto out; |
827 | } |
828 | ir->data = pi->ir_data; |
829 | |
830 | spin_lock_irqsave(&svm->ir_list_lock, flags); |
831 | |
832 | /* |
833 | * Update the target pCPU for IOMMU doorbells if the vCPU is running. |
834 | * If the vCPU is NOT running, i.e. is blocking or scheduled out, KVM |
835 | * will update the pCPU info when the vCPU is awakened and/or scheduled in. |
836 | * See also avic_vcpu_load(). |
837 | */ |
838 | entry = READ_ONCE(*(svm->avic_physical_id_cache)); |
839 | if (entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK) |
840 | amd_iommu_update_ga(entry & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK, |
841 | true, pi->ir_data); |
842 | |
843 | list_add(&ir->node, &svm->ir_list); |
844 | spin_unlock_irqrestore(&svm->ir_list_lock, flags); |
845 | out: |
846 | return ret; |
847 | } |
848 | |
849 | /* |
850 | * Note: |
851 | * The HW cannot support posting multicast/broadcast |
852 | * interrupts to a vCPU. So, we still use legacy interrupt |
853 | * remapping for these kinds of interrupts. |
854 | * |
855 | * For lowest-priority interrupts, we only support |
856 | * those with a single CPU as the destination, e.g. the user |
857 | * configures the interrupts via /proc/irq or uses |
858 | * irqbalance to make the interrupts single-CPU. |
859 | */ |
860 | static int |
861 | get_pi_vcpu_info(struct kvm *kvm, struct kvm_kernel_irq_routing_entry *e, |
862 | struct vcpu_data *vcpu_info, struct vcpu_svm **svm) |
863 | { |
864 | struct kvm_lapic_irq irq; |
865 | struct kvm_vcpu *vcpu = NULL; |
866 | |
867 | kvm_set_msi_irq(kvm, e, &irq); |
868 | |
869 | if (!kvm_intr_is_single_vcpu(kvm, &irq, &vcpu) || |
870 | !kvm_irq_is_postable(&irq)) { |
871 | pr_debug("SVM: %s: use legacy intr remap mode for irq %u\n", |
872 | __func__, irq.vector); |
873 | return -1; |
874 | } |
875 | |
876 | pr_debug("SVM: %s: use GA mode for irq %u\n", __func__, |
877 | irq.vector); |
878 | *svm = to_svm(vcpu); |
879 | vcpu_info->pi_desc_addr = __sme_set(page_to_phys((*svm)->avic_backing_page)); |
880 | vcpu_info->vector = irq.vector; |
881 | |
882 | return 0; |
883 | } |
884 | |
885 | /* |
886 | * avic_pi_update_irte - set IRTE for Posted-Interrupts |
887 | * |
888 | * @kvm: kvm |
889 | * @host_irq: host irq of the interrupt |
890 | * @guest_irq: gsi of the interrupt |
891 | * @set: set or unset PI |
892 | * returns 0 on success, < 0 on failure |
893 | */ |
894 | int avic_pi_update_irte(struct kvm *kvm, unsigned int host_irq, |
895 | uint32_t guest_irq, bool set) |
896 | { |
897 | struct kvm_kernel_irq_routing_entry *e; |
898 | struct kvm_irq_routing_table *irq_rt; |
899 | int idx, ret = 0; |
900 | |
901 | if (!kvm_arch_has_assigned_device(kvm) || |
902 | !irq_remapping_cap(IRQ_POSTING_CAP)) |
903 | return 0; |
904 | |
905 | pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", |
906 | __func__, host_irq, guest_irq, set); |
907 | |
908 | idx = srcu_read_lock(&kvm->irq_srcu); |
909 | irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); |
910 | |
911 | if (guest_irq >= irq_rt->nr_rt_entries || |
912 | hlist_empty(&irq_rt->map[guest_irq])) { |
913 | pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", |
914 | guest_irq, irq_rt->nr_rt_entries); |
915 | goto out; |
916 | } |
917 | |
918 | hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { |
919 | struct vcpu_data vcpu_info; |
920 | struct vcpu_svm *svm = NULL; |
921 | |
922 | if (e->type != KVM_IRQ_ROUTING_MSI) |
923 | continue; |
924 | |
925 | /** |
926 | * Here, we set up legacy mode in the following cases: |
927 | * 1. The interrupt cannot be targeted to a specific vcpu. |
928 | * 2. The posted interrupt is being unset. |
929 | * 3. APIC virtualization is disabled for the vcpu. |
930 | * 4. The IRQ has an incompatible delivery mode (SMI, INIT, etc). |
931 | */ |
932 | if (!get_pi_vcpu_info(kvm, e, &vcpu_info, &svm) && set && |
933 | kvm_vcpu_apicv_active(&svm->vcpu)) { |
934 | struct amd_iommu_pi_data pi; |
935 | |
936 | /* Try to enable guest_mode in IRTE */ |
937 | pi.base = __sme_set(page_to_phys(svm->avic_backing_page) & |
938 | AVIC_HPA_MASK); |
939 | pi.ga_tag = AVIC_GATAG(to_kvm_svm(kvm)->avic_vm_id, |
940 | svm->vcpu.vcpu_id); |
941 | pi.is_guest_mode = true; |
942 | pi.vcpu_data = &vcpu_info; |
943 | ret = irq_set_vcpu_affinity(host_irq, &pi); |
944 | |
945 | /** |
946 | * Here, we have successfully set up vcpu affinity in |
947 | * IOMMU guest mode. Now, we need to store the posted |
948 | * interrupt information in a per-vcpu ir_list so that |
949 | * we can reference it directly when we update the vcpu |
950 | * scheduling information in the IOMMU irte. |
951 | */ |
952 | if (!ret && pi.is_guest_mode) |
953 | svm_ir_list_add(svm, &pi); |
954 | } else { |
955 | /* Use legacy mode in IRTE */ |
956 | struct amd_iommu_pi_data pi; |
957 | |
958 | /** |
959 | * Here, pi is used to: |
960 | * - Tell IOMMU to use legacy mode for this interrupt. |
961 | * - Retrieve ga_tag of prior interrupt remapping data. |
962 | */ |
963 | pi.prev_ga_tag = 0; |
964 | pi.is_guest_mode = false; |
965 | ret = irq_set_vcpu_affinity(host_irq, &pi); |
966 | |
967 | /** |
968 | * Check if the posted interrupt was previously |
969 | * set up in guest_mode by checking if the ga_tag |
970 | * was cached. If so, we need to clean up the per-vcpu |
971 | * ir_list. |
972 | */ |
973 | if (!ret && pi.prev_ga_tag) { |
974 | int id = AVIC_GATAG_TO_VCPUID(pi.prev_ga_tag); |
975 | struct kvm_vcpu *vcpu; |
976 | |
977 | vcpu = kvm_get_vcpu_by_id(kvm, id); |
978 | if (vcpu) |
979 | svm_ir_list_del(to_svm(vcpu), &pi); |
980 | } |
981 | } |
982 | |
983 | if (!ret && svm) { |
984 | trace_kvm_pi_irte_update(host_irq, svm->vcpu.vcpu_id, |
985 | e->gsi, vcpu_info.vector, |
986 | vcpu_info.pi_desc_addr, set); |
987 | } |
988 | |
989 | if (ret < 0) { |
990 | pr_err("%s: failed to update PI IRTE\n", __func__); |
991 | goto out; |
992 | } |
993 | } |
994 | |
995 | ret = 0; |
996 | out: |
997 | srcu_read_unlock(&kvm->irq_srcu, idx); |
998 | return ret; |
999 | } |
1000 | |
1001 | static inline int |
1002 | avic_update_iommu_vcpu_affinity(struct kvm_vcpu *vcpu, int cpu, bool r) |
1003 | { |
1004 | int ret = 0; |
1005 | struct amd_svm_iommu_ir *ir; |
1006 | struct vcpu_svm *svm = to_svm(vcpu); |
1007 | |
1008 | lockdep_assert_held(&svm->ir_list_lock); |
1009 | |
1010 | if (!kvm_arch_has_assigned_device(vcpu->kvm)) |
1011 | return 0; |
1012 | |
1013 | /* |
1014 | * Here, we go through the per-vcpu ir_list to update all existing |
1015 | * interrupt remapping table entries targeting this vcpu. |
1016 | */ |
1017 | if (list_empty(&svm->ir_list)) |
1018 | return 0; |
1019 | |
1020 | list_for_each_entry(ir, &svm->ir_list, node) { |
1021 | ret = amd_iommu_update_ga(cpu, r, ir->data); |
1022 | if (ret) |
1023 | return ret; |
1024 | } |
1025 | return 0; |
1026 | } |
1027 | |
1028 | void avic_vcpu_load(struct kvm_vcpu *vcpu, int cpu) |
1029 | { |
1030 | u64 entry; |
1031 | int h_physical_id = kvm_cpu_get_apicid(cpu); |
1032 | struct vcpu_svm *svm = to_svm(vcpu); |
1033 | unsigned long flags; |
1034 | |
1035 | lockdep_assert_preemption_disabled(); |
1036 | |
1037 | if (WARN_ON(h_physical_id & ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK)) |
1038 | return; |
1039 | |
1040 | /* |
1041 | * No need to update anything if the vCPU is blocking, i.e. if the vCPU |
1042 | * is being scheduled in after being preempted. The CPU entries in the |
1043 | * Physical APIC table and IRTE are consumed iff IsRun{ning} is '1'. |
1044 | * If the vCPU was migrated, its new CPU value will be stuffed when the |
1045 | * vCPU unblocks. |
1046 | */ |
1047 | if (kvm_vcpu_is_blocking(vcpu)) |
1048 | return; |
1049 | |
1050 | /* |
1051 | * Grab the per-vCPU interrupt remapping lock even if the VM doesn't |
1052 | * _currently_ have assigned devices, as that can change. Holding |
1053 | * ir_list_lock ensures that either svm_ir_list_add() will consume |
1054 | * up-to-date entry information, or that this task will wait until |
1055 | * svm_ir_list_add() completes to set the new target pCPU. |
1056 | */ |
1057 | spin_lock_irqsave(&svm->ir_list_lock, flags); |
1058 | |
1059 | entry = READ_ONCE(*(svm->avic_physical_id_cache)); |
1060 | WARN_ON_ONCE(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK); |
1061 | |
1062 | entry &= ~AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK; |
1063 | entry |= (h_physical_id & AVIC_PHYSICAL_ID_ENTRY_HOST_PHYSICAL_ID_MASK); |
1064 | entry |= AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; |
1065 | |
1066 | WRITE_ONCE(*(svm->avic_physical_id_cache), entry); |
1067 | avic_update_iommu_vcpu_affinity(vcpu, h_physical_id, true); |
1068 | |
1069 | spin_unlock_irqrestore(&svm->ir_list_lock, flags); |
1070 | } |
1071 | |
1072 | void avic_vcpu_put(struct kvm_vcpu *vcpu) |
1073 | { |
1074 | u64 entry; |
1075 | struct vcpu_svm *svm = to_svm(vcpu); |
1076 | unsigned long flags; |
1077 | |
1078 | lockdep_assert_preemption_disabled(); |
1079 | |
1080 | /* |
1081 | * Note, reading the Physical ID entry outside of ir_list_lock is safe |
1082 | * as only the pCPU that has loaded (or is loading) the vCPU is allowed |
1083 | * to modify the entry, and preemption is disabled. I.e. the vCPU |
1084 | * can't be scheduled out and thus avic_vcpu_{put,load}() can't run |
1085 | * recursively. |
1086 | */ |
1087 | entry = READ_ONCE(*(svm->avic_physical_id_cache)); |
1088 | |
1089 | /* Nothing to do if IsRunning == '0' due to vCPU blocking. */ |
1090 | if (!(entry & AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK)) |
1091 | return; |
1092 | |
1093 | /* |
1094 | * Take and hold the per-vCPU interrupt remapping lock while updating |
1095 | * the Physical ID entry even though the lock doesn't protect against |
1096 | * multiple writers (see above). Holding ir_list_lock ensures that |
1097 | * either svm_ir_list_add() will consume up-to-date entry information, |
1098 | * or that this task will wait until svm_ir_list_add() completes to |
1099 | * mark the vCPU as not running. |
1100 | */ |
1101 | spin_lock_irqsave(&svm->ir_list_lock, flags); |
1102 | |
1103 | avic_update_iommu_vcpu_affinity(vcpu, -1, 0); |
1104 | |
1105 | entry &= ~AVIC_PHYSICAL_ID_ENTRY_IS_RUNNING_MASK; |
1106 | WRITE_ONCE(*(svm->avic_physical_id_cache), entry); |
1107 | |
1108 | spin_unlock_irqrestore(&svm->ir_list_lock, flags); |
1110 | } |
1111 | |
1112 | void avic_refresh_virtual_apic_mode(struct kvm_vcpu *vcpu) |
1113 | { |
1114 | struct vcpu_svm *svm = to_svm(vcpu); |
1115 | struct vmcb *vmcb = svm->vmcb01.ptr; |
1116 | |
1117 | if (!lapic_in_kernel(vcpu) || !enable_apicv) |
1118 | return; |
1119 | |
1120 | if (kvm_vcpu_apicv_active(vcpu)) { |
1121 | /** |
1122 | * During temporary AVIC deactivation, the guest could update |
1123 | * APIC ID, DFR and LDR registers, which would not be trapped |
1124 | * by avic_unaccelerated_access_interception(). In this case, |
1125 | * we need to check and update the AVIC logical APIC ID table |
1126 | * accordingly before re-activating. |
1127 | */ |
1128 | avic_apicv_post_state_restore(vcpu); |
1129 | avic_activate_vmcb(svm); |
1130 | } else { |
1131 | avic_deactivate_vmcb(svm); |
1132 | } |
1133 | vmcb_mark_dirty(vmcb, VMCB_AVIC); |
1134 | } |
1135 | |
1136 | void avic_refresh_apicv_exec_ctrl(struct kvm_vcpu *vcpu) |
1137 | { |
1138 | bool activated = kvm_vcpu_apicv_active(vcpu); |
1139 | |
1140 | if (!enable_apicv) |
1141 | return; |
1142 | |
1143 | avic_refresh_virtual_apic_mode(vcpu); |
1144 | |
1145 | if (activated) |
1146 | avic_vcpu_load(vcpu, vcpu->cpu); |
1147 | else |
1148 | avic_vcpu_put(vcpu); |
1149 | |
1150 | avic_set_pi_irte_mode(vcpu, activated); |
1151 | } |
1152 | |
1153 | void avic_vcpu_blocking(struct kvm_vcpu *vcpu) |
1154 | { |
1155 | if (!kvm_vcpu_apicv_active(vcpu)) |
1156 | return; |
1157 | |
1158 | /* |
1159 | * Unload the AVIC when the vCPU is about to block, _before_ |
1160 | * the vCPU actually blocks. |
1161 | * |
1162 | * Any IRQs that arrive before IsRunning=0 will not cause an |
1163 | * incomplete IPI vmexit on the source, therefore vIRR will also |
1164 | * be checked by kvm_vcpu_check_block() before blocking. The |
1165 | * memory barrier implicit in set_current_state orders writing |
1166 | * IsRunning=0 before reading the vIRR. The processor needs a |
1167 | * matching memory barrier on interrupt delivery between writing |
1168 | * IRR and reading IsRunning; the lack of this barrier might be |
1169 | * the cause of erratum #1235. |
1170 | */ |
1171 | avic_vcpu_put(vcpu); |
1172 | } |
1173 | |
1174 | void avic_vcpu_unblocking(struct kvm_vcpu *vcpu) |
1175 | { |
1176 | if (!kvm_vcpu_apicv_active(vcpu)) |
1177 | return; |
1178 | |
1179 | avic_vcpu_load(vcpu, vcpu->cpu); |
1180 | } |
1181 | |
1182 | /* |
1183 | * Note: |
1184 | * - The module param avic enables both xAPIC and x2APIC mode. |
1185 | * - The hypervisor can support both xAVIC and x2AVIC in the same guest. |
1186 | * - The mode can be switched at run-time. |
1187 | */ |
1188 | bool avic_hardware_setup(void) |
1189 | { |
1190 | if (!npt_enabled) |
1191 | return false; |
1192 | |
1193 | /* AVIC is a prerequisite for x2AVIC. */ |
1194 | if (!boot_cpu_has(X86_FEATURE_AVIC) && !force_avic) { |
1195 | if (boot_cpu_has(X86_FEATURE_X2AVIC)) { |
1196 | pr_warn(FW_BUG "Cannot support x2AVIC because AVIC is disabled"); |
1197 | pr_warn(FW_BUG "Try enabling AVIC using the force_avic option"); |
1198 | } |
1199 | return false; |
1200 | } |
1201 | |
1202 | if (boot_cpu_has(X86_FEATURE_AVIC)) { |
1203 | pr_info("AVIC enabled\n"); |
1204 | } else if (force_avic) { |
1205 | /* |
1206 | * Some older systems do not advertise AVIC support. |
1207 | * See the Revision Guide for the specific AMD processor for more detail. |
1208 | */ |
1209 | pr_warn("AVIC is not supported in CPUID but force enabled"); |
1210 | pr_warn("Your system might crash and burn"); |
1211 | } |
1212 | |
1213 | /* AVIC is a prerequisite for x2AVIC. */ |
1214 | x2avic_enabled = boot_cpu_has(X86_FEATURE_X2AVIC); |
1215 | if (x2avic_enabled) |
1216 | pr_info("x2AVIC enabled\n"); |
1217 | |
1218 | amd_iommu_register_ga_log_notifier(&avic_ga_log_notifier); |
1219 | |
1220 | return true; |
1221 | } |
1222 | |