// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright IBM Corporation, 2018
 * Authors Suraj Jitindar Singh <sjitindarsingh@gmail.com>
 *	   Paul Mackerras <paulus@ozlabs.org>
 *
 * Description: KVM functions specific to running nested KVM-HV guests
 * on Book3S processors (specifically POWER9 and later).
 */

#include <linux/kernel.h>
#include <linux/kvm_host.h>
#include <linux/llist.h>
#include <linux/pgtable.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/reg.h>
#include <asm/plpar_wrappers.h>
#include <asm/firmware.h>

static struct patb_entry *pseries_partition_tb;

static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp);
static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free);

void kvmhv_save_hv_regs(struct kvm_vcpu *vcpu, struct hv_guest_state *hr)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;

	hr->pcr = vc->pcr | PCR_MASK;
	hr->dpdes = vc->dpdes;
	hr->hfscr = vcpu->arch.hfscr;
	hr->tb_offset = vc->tb_offset;
	hr->dawr0 = vcpu->arch.dawr0;
	hr->dawrx0 = vcpu->arch.dawrx0;
	hr->ciabr = vcpu->arch.ciabr;
	hr->purr = vcpu->arch.purr;
	hr->spurr = vcpu->arch.spurr;
	hr->ic = vcpu->arch.ic;
	hr->vtb = vc->vtb;
	hr->srr0 = vcpu->arch.shregs.srr0;
	hr->srr1 = vcpu->arch.shregs.srr1;
	hr->sprg[0] = vcpu->arch.shregs.sprg0;
	hr->sprg[1] = vcpu->arch.shregs.sprg1;
	hr->sprg[2] = vcpu->arch.shregs.sprg2;
	hr->sprg[3] = vcpu->arch.shregs.sprg3;
	hr->pidr = vcpu->arch.pid;
	hr->cfar = vcpu->arch.cfar;
	hr->ppr = vcpu->arch.ppr;
	hr->dawr1 = vcpu->arch.dawr1;
	hr->dawrx1 = vcpu->arch.dawrx1;
}

/* Use noinline_for_stack due to https://llvm.org/pr49610 */
static noinline_for_stack void byteswap_pt_regs(struct pt_regs *regs)
{
	unsigned long *addr = (unsigned long *) regs;

	for (; addr < ((unsigned long *) (regs + 1)); addr++)
		*addr = swab64(*addr);
}

static void byteswap_hv_regs(struct hv_guest_state *hr)
{
	hr->version = swab64(hr->version);
	hr->lpid = swab32(hr->lpid);
	hr->vcpu_token = swab32(hr->vcpu_token);
	hr->lpcr = swab64(hr->lpcr);
	hr->pcr = swab64(hr->pcr) | PCR_MASK;
	hr->amor = swab64(hr->amor);
	hr->dpdes = swab64(hr->dpdes);
	hr->hfscr = swab64(hr->hfscr);
	hr->tb_offset = swab64(hr->tb_offset);
	hr->dawr0 = swab64(hr->dawr0);
	hr->dawrx0 = swab64(hr->dawrx0);
	hr->ciabr = swab64(hr->ciabr);
	hr->hdec_expiry = swab64(hr->hdec_expiry);
	hr->purr = swab64(hr->purr);
	hr->spurr = swab64(hr->spurr);
	hr->ic = swab64(hr->ic);
	hr->vtb = swab64(hr->vtb);
	hr->hdar = swab64(hr->hdar);
	hr->hdsisr = swab64(hr->hdsisr);
	hr->heir = swab64(hr->heir);
	hr->asdr = swab64(hr->asdr);
	hr->srr0 = swab64(hr->srr0);
	hr->srr1 = swab64(hr->srr1);
	hr->sprg[0] = swab64(hr->sprg[0]);
	hr->sprg[1] = swab64(hr->sprg[1]);
	hr->sprg[2] = swab64(hr->sprg[2]);
	hr->sprg[3] = swab64(hr->sprg[3]);
	hr->pidr = swab64(hr->pidr);
	hr->cfar = swab64(hr->cfar);
	hr->ppr = swab64(hr->ppr);
	hr->dawr1 = swab64(hr->dawr1);
	hr->dawrx1 = swab64(hr->dawrx1);
}

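/*
 * Copy the exit state that running the L2 modified back into the
 * hv_guest_state to be returned to L1, including the fault details
 * for the interrupts that carry them (HDSI, HISI, facility
 * unavailable, emulation assist).
 */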
static void save_hv_return_state(struct kvm_vcpu *vcpu,
				 struct hv_guest_state *hr)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;

	hr->dpdes = vc->dpdes;
	hr->purr = vcpu->arch.purr;
	hr->spurr = vcpu->arch.spurr;
	hr->ic = vcpu->arch.ic;
	hr->vtb = vc->vtb;
	hr->srr0 = vcpu->arch.shregs.srr0;
	hr->srr1 = vcpu->arch.shregs.srr1;
	hr->sprg[0] = vcpu->arch.shregs.sprg0;
	hr->sprg[1] = vcpu->arch.shregs.sprg1;
	hr->sprg[2] = vcpu->arch.shregs.sprg2;
	hr->sprg[3] = vcpu->arch.shregs.sprg3;
	hr->pidr = vcpu->arch.pid;
	hr->cfar = vcpu->arch.cfar;
	hr->ppr = vcpu->arch.ppr;
	switch (vcpu->arch.trap) {
	case BOOK3S_INTERRUPT_H_DATA_STORAGE:
		hr->hdar = vcpu->arch.fault_dar;
		hr->hdsisr = vcpu->arch.fault_dsisr;
		hr->asdr = vcpu->arch.fault_gpa;
		break;
	case BOOK3S_INTERRUPT_H_INST_STORAGE:
		hr->asdr = vcpu->arch.fault_gpa;
		break;
	case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
		hr->hfscr = ((~HFSCR_INTR_CAUSE & hr->hfscr) |
			     (HFSCR_INTR_CAUSE & vcpu->arch.hfscr));
		break;
	case BOOK3S_INTERRUPT_H_EMUL_ASSIST:
		hr->heir = vcpu->arch.emul_inst;
		break;
	}
}

static void restore_hv_regs(struct kvm_vcpu *vcpu, const struct hv_guest_state *hr)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;

	vc->pcr = hr->pcr | PCR_MASK;
	vc->dpdes = hr->dpdes;
	vcpu->arch.hfscr = hr->hfscr;
	vcpu->arch.dawr0 = hr->dawr0;
	vcpu->arch.dawrx0 = hr->dawrx0;
	vcpu->arch.ciabr = hr->ciabr;
	vcpu->arch.purr = hr->purr;
	vcpu->arch.spurr = hr->spurr;
	vcpu->arch.ic = hr->ic;
	vc->vtb = hr->vtb;
	vcpu->arch.shregs.srr0 = hr->srr0;
	vcpu->arch.shregs.srr1 = hr->srr1;
	vcpu->arch.shregs.sprg0 = hr->sprg[0];
	vcpu->arch.shregs.sprg1 = hr->sprg[1];
	vcpu->arch.shregs.sprg2 = hr->sprg[2];
	vcpu->arch.shregs.sprg3 = hr->sprg[3];
	vcpu->arch.pid = hr->pidr;
	vcpu->arch.cfar = hr->cfar;
	vcpu->arch.ppr = hr->ppr;
	vcpu->arch.dawr1 = hr->dawr1;
	vcpu->arch.dawrx1 = hr->dawrx1;
}

void kvmhv_restore_hv_return_state(struct kvm_vcpu *vcpu,
				   struct hv_guest_state *hr)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;

	vc->dpdes = hr->dpdes;
	vcpu->arch.hfscr = hr->hfscr;
	vcpu->arch.purr = hr->purr;
	vcpu->arch.spurr = hr->spurr;
	vcpu->arch.ic = hr->ic;
	vc->vtb = hr->vtb;
	vcpu->arch.fault_dar = hr->hdar;
	vcpu->arch.fault_dsisr = hr->hdsisr;
	vcpu->arch.fault_gpa = hr->asdr;
	vcpu->arch.emul_inst = hr->heir;
	vcpu->arch.shregs.srr0 = hr->srr0;
	vcpu->arch.shregs.srr1 = hr->srr1;
	vcpu->arch.shregs.sprg0 = hr->sprg[0];
	vcpu->arch.shregs.sprg1 = hr->sprg[1];
	vcpu->arch.shregs.sprg2 = hr->sprg[2];
	vcpu->arch.shregs.sprg3 = hr->sprg[3];
	vcpu->arch.pid = hr->pidr;
	vcpu->arch.cfar = hr->cfar;
	vcpu->arch.ppr = hr->ppr;
}

static void kvmhv_nested_mmio_needed(struct kvm_vcpu *vcpu, u64 regs_ptr)
{
	/* No need to reflect the page fault to L1, we've handled it */
	vcpu->arch.trap = 0;

	/*
	 * Since the L2 gprs have already been written back into L1 memory when
	 * we complete the mmio, store the L1 memory location of the L2 gpr
	 * being loaded into by the mmio so that the loaded value can be
	 * written there in kvmppc_complete_mmio_load()
	 */
	if (((vcpu->arch.io_gpr & KVM_MMIO_REG_EXT_MASK) == KVM_MMIO_REG_GPR)
	    && (vcpu->mmio_is_write == 0)) {
		vcpu->arch.nested_io_gpr = (gpa_t) regs_ptr +
					   offsetof(struct pt_regs,
						    gpr[vcpu->arch.io_gpr]);
		vcpu->arch.io_gpr = KVM_MMIO_REG_NESTED_GPR;
	}
}

static int kvmhv_read_guest_state_and_regs(struct kvm_vcpu *vcpu,
					   struct hv_guest_state *l2_hv,
					   struct pt_regs *l2_regs,
					   u64 hv_ptr, u64 regs_ptr)
{
	int size;

	if (kvm_vcpu_read_guest(vcpu, hv_ptr, &l2_hv->version,
				sizeof(l2_hv->version)))
		return -1;

	if (kvmppc_need_byteswap(vcpu))
		l2_hv->version = swab64(l2_hv->version);

	size = hv_guest_state_size(l2_hv->version);
	if (size < 0)
		return -1;

	return kvm_vcpu_read_guest(vcpu, hv_ptr, l2_hv, size) ||
		kvm_vcpu_read_guest(vcpu, regs_ptr, l2_regs,
				    sizeof(struct pt_regs));
}

static int kvmhv_write_guest_state_and_regs(struct kvm_vcpu *vcpu,
					    struct hv_guest_state *l2_hv,
					    struct pt_regs *l2_regs,
					    u64 hv_ptr, u64 regs_ptr)
{
	int size;

	size = hv_guest_state_size(l2_hv->version);
	if (size < 0)
		return -1;

	return kvm_vcpu_write_guest(vcpu, hv_ptr, l2_hv, size) ||
		kvm_vcpu_write_guest(vcpu, regs_ptr, l2_regs,
				     sizeof(struct pt_regs));
}

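/*
 * Load the L2 HV state requested by L1 into the vcpu, filtering out
 * anything L1 is not allowed to set: restricted LPCR bits, HFSCR
 * facilities not permitted for L1 itself, and watchpoint/breakpoint
 * settings that would match in hypervisor state.
 */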
static void load_l2_hv_regs(struct kvm_vcpu *vcpu,
			    const struct hv_guest_state *l2_hv,
			    const struct hv_guest_state *l1_hv, u64 *lpcr)
{
	struct kvmppc_vcore *vc = vcpu->arch.vcore;
	u64 mask;

	restore_hv_regs(vcpu, l2_hv);

	/*
	 * Don't let L1 change LPCR bits for the L2 except these:
	 */
	mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | LPCR_MER;

	/*
	 * Additional filtering is required depending on hardware
	 * and configuration.
	 */
	*lpcr = kvmppc_filter_lpcr_hv(vcpu->kvm,
				      (vc->lpcr & ~mask) | (*lpcr & mask));

	/*
	 * Don't let L1 enable features for L2 which we don't allow for L1,
	 * but preserve the interrupt cause field.
	 */
	vcpu->arch.hfscr = l2_hv->hfscr & (HFSCR_INTR_CAUSE | vcpu->arch.hfscr_permitted);

	/* Don't let data address watchpoint match in hypervisor state */
	vcpu->arch.dawrx0 = l2_hv->dawrx0 & ~DAWRX_HYP;
	vcpu->arch.dawrx1 = l2_hv->dawrx1 & ~DAWRX_HYP;

	/* Don't let completed instruction address breakpt match in HV state */
	if ((l2_hv->ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
		vcpu->arch.ciabr = l2_hv->ciabr & ~CIABR_PRIV;
}

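/*
 * Handle the H_ENTER_NESTED hcall.
 * r4 = L1 guest real address of the hv_guest_state for the L2
 * r5 = L1 guest real address of the pt_regs for the L2
 * Switches the vcpu into L2 context, runs it until it exits, writes the
 * resulting state back to L1 memory, and returns the trap number to L1
 * (or an error code if the parameters are bad).
 */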
long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
{
	long int err, r;
	struct kvm_nested_guest *l2;
	struct pt_regs l2_regs, saved_l1_regs;
	struct hv_guest_state l2_hv = {0}, saved_l1_hv;
	struct kvmppc_vcore *vc = vcpu->arch.vcore;
	u64 hv_ptr, regs_ptr;
	u64 hdec_exp, lpcr;
	s64 delta_purr, delta_spurr, delta_ic, delta_vtb;

	if (vcpu->kvm->arch.l1_ptcr == 0)
		return H_NOT_AVAILABLE;

	if (MSR_TM_TRANSACTIONAL(vcpu->arch.shregs.msr))
		return H_BAD_MODE;

	/* copy parameters in */
	hv_ptr = kvmppc_get_gpr(vcpu, 4);
	regs_ptr = kvmppc_get_gpr(vcpu, 5);
	kvm_vcpu_srcu_read_lock(vcpu);
	err = kvmhv_read_guest_state_and_regs(vcpu, &l2_hv, &l2_regs,
					      hv_ptr, regs_ptr);
	kvm_vcpu_srcu_read_unlock(vcpu);
	if (err)
		return H_PARAMETER;

	if (kvmppc_need_byteswap(vcpu))
		byteswap_hv_regs(&l2_hv);
	if (l2_hv.version > HV_GUEST_STATE_VERSION)
		return H_P2;

	if (kvmppc_need_byteswap(vcpu))
		byteswap_pt_regs(&l2_regs);
	if (l2_hv.vcpu_token >= NR_CPUS)
		return H_PARAMETER;

	/*
	 * L1 must have set up a suspended state to enter the L2 in a
	 * transactional state, and only in that case. These have to be
	 * filtered out here to prevent causing a TM Bad Thing in the
	 * host HRFID. We could synthesize a TM Bad Thing back to the L1
	 * here but there doesn't seem like much point.
	 */
	if (MSR_TM_SUSPENDED(vcpu->arch.shregs.msr)) {
		if (!MSR_TM_ACTIVE(l2_regs.msr))
			return H_BAD_MODE;
	} else {
		if (l2_regs.msr & MSR_TS_MASK)
			return H_BAD_MODE;
		if (WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_TS_MASK))
			return H_BAD_MODE;
	}

	/* translate lpid */
	l2 = kvmhv_get_nested(vcpu->kvm, l2_hv.lpid, true);
	if (!l2)
		return H_PARAMETER;
	if (!l2->l1_gr_to_hr) {
		mutex_lock(&l2->tlb_lock);
		kvmhv_update_ptbl_cache(l2);
		mutex_unlock(&l2->tlb_lock);
	}

	/* save l1 values of things */
	vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
	saved_l1_regs = vcpu->arch.regs;
	kvmhv_save_hv_regs(vcpu, &saved_l1_hv);

	/* convert TB values/offsets to host (L0) values */
	hdec_exp = l2_hv.hdec_expiry - vc->tb_offset;
	vc->tb_offset += l2_hv.tb_offset;
	vcpu->arch.dec_expires += l2_hv.tb_offset;

	/* set L1 state to L2 state */
	vcpu->arch.nested = l2;
	vcpu->arch.nested_vcpu_id = l2_hv.vcpu_token;
	vcpu->arch.nested_hfscr = l2_hv.hfscr;
	vcpu->arch.regs = l2_regs;

	/* Guest must always run with ME enabled, HV disabled. */
	vcpu->arch.shregs.msr = (vcpu->arch.regs.msr | MSR_ME) & ~MSR_HV;

	lpcr = l2_hv.lpcr;
	load_l2_hv_regs(vcpu, &l2_hv, &saved_l1_hv, &lpcr);

	vcpu->arch.ret = RESUME_GUEST;
	vcpu->arch.trap = 0;
	do {
		r = kvmhv_run_single_vcpu(vcpu, hdec_exp, lpcr);
	} while (is_kvmppc_resume_guest(r));

	/* save L2 state for return */
	l2_regs = vcpu->arch.regs;
	l2_regs.msr = vcpu->arch.shregs.msr;
	delta_purr = vcpu->arch.purr - l2_hv.purr;
	delta_spurr = vcpu->arch.spurr - l2_hv.spurr;
	delta_ic = vcpu->arch.ic - l2_hv.ic;
	delta_vtb = vc->vtb - l2_hv.vtb;
	save_hv_return_state(vcpu, &l2_hv);

	/* restore L1 state */
	vcpu->arch.nested = NULL;
	vcpu->arch.regs = saved_l1_regs;
	vcpu->arch.shregs.msr = saved_l1_regs.msr & ~MSR_TS_MASK;
	/* set L1 MSR TS field according to L2 transaction state */
	if (l2_regs.msr & MSR_TS_MASK)
		vcpu->arch.shregs.msr |= MSR_TS_S;
	vc->tb_offset = saved_l1_hv.tb_offset;
	/* XXX: is this always the same delta as saved_l1_hv.tb_offset? */
	vcpu->arch.dec_expires -= l2_hv.tb_offset;
	restore_hv_regs(vcpu, &saved_l1_hv);
	vcpu->arch.purr += delta_purr;
	vcpu->arch.spurr += delta_spurr;
	vcpu->arch.ic += delta_ic;
	vc->vtb += delta_vtb;

	kvmhv_put_nested(l2);

	/* copy l2_hv_state and regs back to guest */
	if (kvmppc_need_byteswap(vcpu)) {
		byteswap_hv_regs(&l2_hv);
		byteswap_pt_regs(&l2_regs);
	}
	kvm_vcpu_srcu_read_lock(vcpu);
	err = kvmhv_write_guest_state_and_regs(vcpu, &l2_hv, &l2_regs,
					       hv_ptr, regs_ptr);
	kvm_vcpu_srcu_read_unlock(vcpu);
	if (err)
		return H_AUTHORITY;

	if (r == -EINTR)
		return H_INTERRUPT;

	if (vcpu->mmio_needed) {
		kvmhv_nested_mmio_needed(vcpu, regs_ptr);
		return H_TOO_HARD;
	}

	return vcpu->arch.trap;
}

unsigned long nested_capabilities;

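/*
 * Initialise nested virtualisation support when we are ourselves
 * running under a hypervisor: first try the nestedv2 guest-state API,
 * then fall back to registering a partition table with the parent
 * hypervisor (nestedv1).
 */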
long kvmhv_nested_init(void)
{
	long int ptb_order;
	unsigned long ptcr, host_capabilities;
	long rc;

	if (!kvmhv_on_pseries())
		return 0;
	if (!radix_enabled())
		return -ENODEV;

	rc = plpar_guest_get_capabilities(0, &host_capabilities);
	if (rc == H_SUCCESS) {
		unsigned long capabilities = 0;

		if (cpu_has_feature(CPU_FTR_ARCH_31))
			capabilities |= H_GUEST_CAP_POWER10;
		if (cpu_has_feature(CPU_FTR_ARCH_300))
			capabilities |= H_GUEST_CAP_POWER9;

		nested_capabilities = capabilities & host_capabilities;
		rc = plpar_guest_set_capabilities(0, nested_capabilities);
		if (rc != H_SUCCESS) {
			pr_err("kvm-hv: Could not configure parent hypervisor capabilities (rc=%ld)\n",
			       rc);
			return -ENODEV;
		}

		static_branch_enable(&__kvmhv_is_nestedv2);
		return 0;
	}

	pr_info("kvm-hv: nestedv2 get capabilities hcall failed, falling back to nestedv1 (rc=%ld)\n",
		rc);
	/* Partition table entry is 1<<4 bytes in size, hence the 4. */
	ptb_order = KVM_MAX_NESTED_GUESTS_SHIFT + 4;
	/* Minimum partition table size is 1<<12 bytes */
	if (ptb_order < 12)
		ptb_order = 12;
	pseries_partition_tb = kmalloc(sizeof(struct patb_entry) << ptb_order,
				       GFP_KERNEL);
	if (!pseries_partition_tb) {
		pr_err("kvm-hv: failed to allocate nested partition table\n");
		return -ENOMEM;
	}

	ptcr = __pa(pseries_partition_tb) | (ptb_order - 12);
	rc = plpar_hcall_norets(H_SET_PARTITION_TABLE, ptcr);
	if (rc != H_SUCCESS) {
		pr_err("kvm-hv: Parent hypervisor does not support nesting (rc=%ld)\n",
		       rc);
		kfree(pseries_partition_tb);
		pseries_partition_tb = NULL;
		return -ENODEV;
	}

	return 0;
}

void kvmhv_nested_exit(void)
{
	/*
	 * N.B. the kvmhv_on_pseries() test is there because it enables
	 * the compiler to remove the call to plpar_hcall_norets()
	 * when CONFIG_PPC_PSERIES=n.
	 */
	if (kvmhv_on_pseries() && pseries_partition_tb) {
		plpar_hcall_norets(H_SET_PARTITION_TABLE, 0);
		kfree(pseries_partition_tb);
		pseries_partition_tb = NULL;
	}
}

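/*
 * Invalidate all partition-scoped translations for the given (shadow)
 * LPID: directly on bare metal, or via hcall when running under a
 * parent hypervisor.
 */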
void kvmhv_flush_lpid(u64 lpid)
{
	long rc;

	if (!kvmhv_on_pseries()) {
		radix__flush_all_lpid(lpid);
		return;
	}

	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(2, 0, 1),
					lpid, TLBIEL_INVAL_SET_LPID);
	else
		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
					    H_RPTI_TYPE_NESTED |
					    H_RPTI_TYPE_TLB | H_RPTI_TYPE_PWC |
					    H_RPTI_TYPE_PAT,
					    H_RPTI_PAGE_ALL, 0, -1UL);
	if (rc)
		pr_err("KVM: TLB LPID invalidation hcall failed, rc=%ld\n", rc);
}

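/*
 * Set a partition table entry for the given LPID: directly on bare
 * metal, via our copy of the parent's partition table for nestedv1,
 * or via the guest-state API for nestedv2.
 */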
void kvmhv_set_ptbl_entry(u64 lpid, u64 dw0, u64 dw1)
{
	if (!kvmhv_on_pseries()) {
		mmu_partition_table_set_entry(lpid, dw0, dw1, true);
		return;
	}

	if (kvmhv_is_nestedv1()) {
		pseries_partition_tb[lpid].patb0 = cpu_to_be64(dw0);
		pseries_partition_tb[lpid].patb1 = cpu_to_be64(dw1);
		/* L0 will do the necessary barriers */
		kvmhv_flush_lpid(lpid);
	}

	if (kvmhv_is_nestedv2())
		kvmhv_nestedv2_set_ptbl_entry(lpid, dw0, dw1);
}

static void kvmhv_set_nested_ptbl(struct kvm_nested_guest *gp)
{
	unsigned long dw0;

	dw0 = PATB_HR | radix__get_tree_size() |
		__pa(gp->shadow_pgtable) | RADIX_PGD_INDEX_SIZE;
	kvmhv_set_ptbl_entry(gp->shadow_lpid, dw0, gp->process_table);
}

/*
 * Handle the H_SET_PARTITION_TABLE hcall.
 * r4 = guest real address of partition table + log_2(size) - 12
 * (formatted as for the PTCR).
 */
long kvmhv_set_partition_table(struct kvm_vcpu *vcpu)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long ptcr = kvmppc_get_gpr(vcpu, 4);
	int srcu_idx;
	long ret = H_SUCCESS;

	srcu_idx = srcu_read_lock(&kvm->srcu);
	/* Check partition size and base address. */
	if ((ptcr & PRTS_MASK) + 12 - 4 > KVM_MAX_NESTED_GUESTS_SHIFT ||
	    !kvm_is_visible_gfn(vcpu->kvm, (ptcr & PRTB_MASK) >> PAGE_SHIFT))
		ret = H_PARAMETER;
	srcu_read_unlock(&kvm->srcu, srcu_idx);
	if (ret == H_SUCCESS)
		kvm->arch.l1_ptcr = ptcr;

	return ret;
}

/*
 * Handle the H_COPY_TOFROM_GUEST hcall.
 * r4 = L1 lpid of nested guest
 * r5 = pid
 * r6 = eaddr to access
 * r7 = to buffer (L1 gpa)
 * r8 = from buffer (L1 gpa)
 * r9 = n bytes to copy
 */
long kvmhv_copy_tofrom_guest_nested(struct kvm_vcpu *vcpu)
{
	struct kvm_nested_guest *gp;
	int l1_lpid = kvmppc_get_gpr(vcpu, 4);
	int pid = kvmppc_get_gpr(vcpu, 5);
	gva_t eaddr = kvmppc_get_gpr(vcpu, 6);
	gpa_t gp_to = (gpa_t) kvmppc_get_gpr(vcpu, 7);
	gpa_t gp_from = (gpa_t) kvmppc_get_gpr(vcpu, 8);
	void *buf;
	unsigned long n = kvmppc_get_gpr(vcpu, 9);
	bool is_load = !!gp_to;
	long rc;

	if (gp_to && gp_from) /* One must be NULL to determine the direction */
		return H_PARAMETER;

	if (eaddr & (0xFFFUL << 52))
		return H_PARAMETER;

	buf = kzalloc(n, GFP_KERNEL | __GFP_NOWARN);
	if (!buf)
		return H_NO_MEM;

	gp = kvmhv_get_nested(vcpu->kvm, l1_lpid, false);
	if (!gp) {
		rc = H_PARAMETER;
		goto out_free;
	}

	mutex_lock(&gp->tlb_lock);

	if (is_load) {
		/* Load from the nested guest into our buffer */
		rc = __kvmhv_copy_tofrom_guest_radix(gp->shadow_lpid, pid,
						     eaddr, buf, NULL, n);
		if (rc)
			goto not_found;

		/* Write what was loaded into our buffer back to the L1 guest */
		kvm_vcpu_srcu_read_lock(vcpu);
		rc = kvm_vcpu_write_guest(vcpu, gp_to, buf, n);
		kvm_vcpu_srcu_read_unlock(vcpu);
		if (rc)
			goto not_found;
	} else {
		/* Load the data to be stored from the L1 guest into our buf */
		kvm_vcpu_srcu_read_lock(vcpu);
		rc = kvm_vcpu_read_guest(vcpu, gp_from, buf, n);
		kvm_vcpu_srcu_read_unlock(vcpu);
		if (rc)
			goto not_found;

		/* Store from our buffer into the nested guest */
		rc = __kvmhv_copy_tofrom_guest_radix(gp->shadow_lpid, pid,
						     eaddr, NULL, buf, n);
		if (rc)
			goto not_found;
	}

out_unlock:
	mutex_unlock(&gp->tlb_lock);
	kvmhv_put_nested(gp);
out_free:
	kfree(buf);
	return rc;
not_found:
	rc = H_NOT_FOUND;
	goto out_unlock;
}

/*
 * Reload the partition table entry for a guest.
 * Caller must hold gp->tlb_lock.
 */
static void kvmhv_update_ptbl_cache(struct kvm_nested_guest *gp)
{
	int ret;
	struct patb_entry ptbl_entry;
	unsigned long ptbl_addr;
	struct kvm *kvm = gp->l1_host;

	ret = -EFAULT;
	ptbl_addr = (kvm->arch.l1_ptcr & PRTB_MASK) + (gp->l1_lpid << 4);
	if (gp->l1_lpid < (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4))) {
		int srcu_idx = srcu_read_lock(&kvm->srcu);
		ret = kvm_read_guest(kvm, ptbl_addr,
				     &ptbl_entry, sizeof(ptbl_entry));
		srcu_read_unlock(&kvm->srcu, srcu_idx);
	}
	if (ret) {
		gp->l1_gr_to_hr = 0;
		gp->process_table = 0;
	} else {
		gp->l1_gr_to_hr = be64_to_cpu(ptbl_entry.patb0);
		gp->process_table = be64_to_cpu(ptbl_entry.patb1);
	}
	kvmhv_set_nested_ptbl(gp);
}

void kvmhv_vm_nested_init(struct kvm *kvm)
{
	idr_init(&kvm->arch.kvm_nested_guest_idr);
}

static struct kvm_nested_guest *__find_nested(struct kvm *kvm, int lpid)
{
	return idr_find(&kvm->arch.kvm_nested_guest_idr, lpid);
}

static bool __prealloc_nested(struct kvm *kvm, int lpid)
{
	if (idr_alloc(&kvm->arch.kvm_nested_guest_idr,
		      NULL, lpid, lpid + 1, GFP_KERNEL) != lpid)
		return false;
	return true;
}

static void __add_nested(struct kvm *kvm, int lpid, struct kvm_nested_guest *gp)
{
	if (idr_replace(&kvm->arch.kvm_nested_guest_idr, gp, lpid))
		WARN_ON(1);
}

static void __remove_nested(struct kvm *kvm, int lpid)
{
	idr_remove(&kvm->arch.kvm_nested_guest_idr, lpid);
}

static struct kvm_nested_guest *kvmhv_alloc_nested(struct kvm *kvm, unsigned int lpid)
{
	struct kvm_nested_guest *gp;
	long shadow_lpid;

	gp = kzalloc(sizeof(*gp), GFP_KERNEL);
	if (!gp)
		return NULL;
	gp->l1_host = kvm;
	gp->l1_lpid = lpid;
	mutex_init(&gp->tlb_lock);
	gp->shadow_pgtable = pgd_alloc(kvm->mm);
	if (!gp->shadow_pgtable)
		goto out_free;
	shadow_lpid = kvmppc_alloc_lpid();
	if (shadow_lpid < 0)
		goto out_free2;
	gp->shadow_lpid = shadow_lpid;
	gp->radix = 1;

	memset(gp->prev_cpu, -1, sizeof(gp->prev_cpu));

	return gp;

out_free2:
	pgd_free(kvm->mm, gp->shadow_pgtable);
out_free:
	kfree(gp);
	return NULL;
}

/*
 * Free up any resources allocated for a nested guest.
 */
static void kvmhv_release_nested(struct kvm_nested_guest *gp)
{
	struct kvm *kvm = gp->l1_host;

	if (gp->shadow_pgtable) {
		/*
		 * No vcpu is using this struct and no call to
		 * kvmhv_get_nested can find this struct,
		 * so we don't need to hold kvm->mmu_lock.
		 */
		kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
					  gp->shadow_lpid);
		pgd_free(kvm->mm, gp->shadow_pgtable);
	}
	kvmhv_set_ptbl_entry(gp->shadow_lpid, 0, 0);
	kvmppc_free_lpid(gp->shadow_lpid);
	kfree(gp);
}

static void kvmhv_remove_nested(struct kvm_nested_guest *gp)
{
	struct kvm *kvm = gp->l1_host;
	int lpid = gp->l1_lpid;
	long ref;

	spin_lock(&kvm->mmu_lock);
	if (gp == __find_nested(kvm, lpid)) {
		__remove_nested(kvm, lpid);
		--gp->refcnt;
	}
	ref = gp->refcnt;
	spin_unlock(&kvm->mmu_lock);
	if (ref == 0)
		kvmhv_release_nested(gp);
}

/*
 * Free up all nested resources allocated for this guest.
 * This is called with no vcpus of the guest running, when
 * switching the guest to HPT mode or when destroying the
 * guest.
 */
void kvmhv_release_all_nested(struct kvm *kvm)
{
	int lpid;
	struct kvm_nested_guest *gp;
	struct kvm_nested_guest *freelist = NULL;
	struct kvm_memory_slot *memslot;
	int srcu_idx, bkt;

	spin_lock(&kvm->mmu_lock);
	idr_for_each_entry(&kvm->arch.kvm_nested_guest_idr, gp, lpid) {
		__remove_nested(kvm, lpid);
		if (--gp->refcnt == 0) {
			gp->next = freelist;
			freelist = gp;
		}
	}
	idr_destroy(&kvm->arch.kvm_nested_guest_idr);
	/* idr is empty and may be reused at this point */
	spin_unlock(&kvm->mmu_lock);
	while ((gp = freelist) != NULL) {
		freelist = gp->next;
		kvmhv_release_nested(gp);
	}

	srcu_idx = srcu_read_lock(&kvm->srcu);
	kvm_for_each_memslot(memslot, bkt, kvm_memslots(kvm))
		kvmhv_free_memslot_nest_rmap(memslot);
	srcu_read_unlock(&kvm->srcu, srcu_idx);
}

/* caller must hold gp->tlb_lock */
static void kvmhv_flush_nested(struct kvm_nested_guest *gp)
{
	struct kvm *kvm = gp->l1_host;

	spin_lock(&kvm->mmu_lock);
	kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable, gp->shadow_lpid);
	spin_unlock(&kvm->mmu_lock);
	kvmhv_flush_lpid(gp->shadow_lpid);
	kvmhv_update_ptbl_cache(gp);
	if (gp->l1_gr_to_hr == 0)
		kvmhv_remove_nested(gp);
}

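/*
 * Look up the kvm_nested_guest for an L1 lpid and take a reference on
 * it, optionally creating it if it doesn't exist yet.  Allocation is
 * done outside the mmu_lock, so we may race with a concurrent creator
 * and have to release our copy.
 */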
struct kvm_nested_guest *kvmhv_get_nested(struct kvm *kvm, int l1_lpid,
					  bool create)
{
	struct kvm_nested_guest *gp, *newgp;

	if (l1_lpid >= (1ul << ((kvm->arch.l1_ptcr & PRTS_MASK) + 12 - 4)))
		return NULL;

	spin_lock(&kvm->mmu_lock);
	gp = __find_nested(kvm, l1_lpid);
	if (gp)
		++gp->refcnt;
	spin_unlock(&kvm->mmu_lock);

	if (gp || !create)
		return gp;

	newgp = kvmhv_alloc_nested(kvm, l1_lpid);
	if (!newgp)
		return NULL;

	if (!__prealloc_nested(kvm, l1_lpid)) {
		kvmhv_release_nested(newgp);
		return NULL;
	}

	spin_lock(&kvm->mmu_lock);
	gp = __find_nested(kvm, l1_lpid);
	if (!gp) {
		__add_nested(kvm, l1_lpid, newgp);
		++newgp->refcnt;
		gp = newgp;
		newgp = NULL;
	}
	++gp->refcnt;
	spin_unlock(&kvm->mmu_lock);

	if (newgp)
		kvmhv_release_nested(newgp);

	return gp;
}

void kvmhv_put_nested(struct kvm_nested_guest *gp)
{
	struct kvm *kvm = gp->l1_host;
	long ref;

	spin_lock(&kvm->mmu_lock);
	ref = --gp->refcnt;
	spin_unlock(&kvm->mmu_lock);
	if (ref == 0)
		kvmhv_release_nested(gp);
}

pte_t *find_kvm_nested_guest_pte(struct kvm *kvm, unsigned long lpid,
				 unsigned long ea, unsigned *hshift)
{
	struct kvm_nested_guest *gp;
	pte_t *pte;

	gp = __find_nested(kvm, lpid);
	if (!gp)
		return NULL;

	VM_WARN(!spin_is_locked(&kvm->mmu_lock),
		"%s called with kvm mmu_lock not held\n", __func__);
	pte = __find_linux_pte(gp->shadow_pgtable, ea, NULL, hshift);

	return pte;
}

static inline bool kvmhv_n_rmap_is_equal(u64 rmap_1, u64 rmap_2)
{
	return !((rmap_1 ^ rmap_2) & (RMAP_NESTED_LPID_MASK |
				      RMAP_NESTED_GPA_MASK));
}

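/*
 * Insert a nested rmap entry.  An rmap word either holds a single entry
 * inline (flagged with RMAP_NESTED_IS_SINGLE_ENTRY) or acts as the head
 * of an llist of rmap_nested structs once there is more than one entry.
 */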
void kvmhv_insert_nest_rmap(struct kvm *kvm, unsigned long *rmapp,
			    struct rmap_nested **n_rmap)
{
	struct llist_node *entry = ((struct llist_head *) rmapp)->first;
	struct rmap_nested *cursor;
	u64 rmap, new_rmap = (*n_rmap)->rmap;

	/* Are there any existing entries? */
	if (!(*rmapp)) {
		/* No -> use the rmap as a single entry */
		*rmapp = new_rmap | RMAP_NESTED_IS_SINGLE_ENTRY;
		return;
	}

	/* Do any entries match what we're trying to insert? */
	for_each_nest_rmap_safe(cursor, entry, &rmap) {
		if (kvmhv_n_rmap_is_equal(rmap, new_rmap))
			return;
	}

	/* Do we need to create a list or just add the new entry? */
	rmap = *rmapp;
	if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
		*rmapp = 0UL;
	llist_add(&((*n_rmap)->list), (struct llist_head *) rmapp);
	if (rmap & RMAP_NESTED_IS_SINGLE_ENTRY) /* Not previously a list */
		(*n_rmap)->list.next = (struct llist_node *) rmap;

	/* Set NULL so not freed by caller */
	*n_rmap = NULL;
}

static void kvmhv_update_nest_rmap_rc(struct kvm *kvm, u64 n_rmap,
				      unsigned long clr, unsigned long set,
				      unsigned long hpa, unsigned long mask)
{
	unsigned long gpa;
	unsigned int shift, lpid;
	pte_t *ptep;

	gpa = n_rmap & RMAP_NESTED_GPA_MASK;
	lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;

	/* Find the pte */
	ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
	/*
	 * If the pte is present and the pfn is still the same, update the pte.
	 * If the pfn has changed then this is a stale rmap entry, the nested
	 * gpa actually points somewhere else now, and there is nothing to do.
	 * XXX A future optimisation would be to remove the rmap entry here.
	 */
	if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa)) {
		__radix_pte_update(ptep, clr, set);
		kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);
	}
}

/*
 * For a given list of rmap entries, update the rc bits in all ptes in shadow
 * page tables for nested guests which are referenced by the rmap list.
 */
void kvmhv_update_nest_rmap_rc_list(struct kvm *kvm, unsigned long *rmapp,
				    unsigned long clr, unsigned long set,
				    unsigned long hpa, unsigned long nbytes)
{
	struct llist_node *entry = ((struct llist_head *) rmapp)->first;
	struct rmap_nested *cursor;
	unsigned long rmap, mask;

	if ((clr | set) & ~(_PAGE_DIRTY | _PAGE_ACCESSED))
		return;

	mask = PTE_RPN_MASK & ~(nbytes - 1);
	hpa &= mask;

	for_each_nest_rmap_safe(cursor, entry, &rmap)
		kvmhv_update_nest_rmap_rc(kvm, rmap, clr, set, hpa, mask);
}

static void kvmhv_remove_nest_rmap(struct kvm *kvm, u64 n_rmap,
				   unsigned long hpa, unsigned long mask)
{
	struct kvm_nested_guest *gp;
	unsigned long gpa;
	unsigned int shift, lpid;
	pte_t *ptep;

	gpa = n_rmap & RMAP_NESTED_GPA_MASK;
	lpid = (n_rmap & RMAP_NESTED_LPID_MASK) >> RMAP_NESTED_LPID_SHIFT;
	gp = __find_nested(kvm, lpid);
	if (!gp)
		return;

	/* Find and invalidate the pte */
	ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
	/* Don't spuriously invalidate ptes if the pfn has changed */
	if (ptep && pte_present(*ptep) && ((pte_val(*ptep) & mask) == hpa))
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
}

static void kvmhv_remove_nest_rmap_list(struct kvm *kvm, unsigned long *rmapp,
					unsigned long hpa, unsigned long mask)
{
	struct llist_node *entry = llist_del_all((struct llist_head *) rmapp);
	struct rmap_nested *cursor;
	unsigned long rmap;

	for_each_nest_rmap_safe(cursor, entry, &rmap) {
		kvmhv_remove_nest_rmap(kvm, rmap, hpa, mask);
		kfree(cursor);
	}
}

/* called with kvm->mmu_lock held */
void kvmhv_remove_nest_rmap_range(struct kvm *kvm,
				  const struct kvm_memory_slot *memslot,
				  unsigned long gpa, unsigned long hpa,
				  unsigned long nbytes)
{
	unsigned long gfn, end_gfn;
	unsigned long addr_mask;

	if (!memslot)
		return;
	gfn = (gpa >> PAGE_SHIFT) - memslot->base_gfn;
	end_gfn = gfn + (nbytes >> PAGE_SHIFT);

	addr_mask = PTE_RPN_MASK & ~(nbytes - 1);
	hpa &= addr_mask;

	for (; gfn < end_gfn; gfn++) {
		unsigned long *rmap = &memslot->arch.rmap[gfn];
		kvmhv_remove_nest_rmap_list(kvm, rmap, hpa, addr_mask);
	}
}

static void kvmhv_free_memslot_nest_rmap(struct kvm_memory_slot *free)
{
	unsigned long page;

	for (page = 0; page < free->npages; page++) {
		unsigned long rmap, *rmapp = &free->arch.rmap[page];
		struct rmap_nested *cursor;
		struct llist_node *entry;

		entry = llist_del_all((struct llist_head *) rmapp);
		for_each_nest_rmap_safe(cursor, entry, &rmap)
			kfree(cursor);
	}
}

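/*
 * Remove the shadow pte (if any) for a given nested guest real address,
 * returning whether a pte was present and, optionally, its page shift.
 */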
static bool kvmhv_invalidate_shadow_pte(struct kvm_vcpu *vcpu,
					struct kvm_nested_guest *gp,
					long gpa, int *shift_ret)
{
	struct kvm *kvm = vcpu->kvm;
	bool ret = false;
	pte_t *ptep;
	int shift;

	spin_lock(&kvm->mmu_lock);
	ptep = find_kvm_nested_guest_pte(kvm, gp->l1_lpid, gpa, &shift);
	if (!shift)
		shift = PAGE_SHIFT;
	if (ptep && pte_present(*ptep)) {
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, NULL, gp->shadow_lpid);
		ret = true;
	}
	spin_unlock(&kvm->mmu_lock);

	if (shift_ret)
		*shift_ret = shift;
	return ret;
}

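/*
 * Helpers to decode the fields of a tlbie instruction and its RS/RB
 * register operands, used when emulating a privileged tlbie on behalf
 * of a nested guest.
 */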
static inline int get_ric(unsigned int instr)
{
	return (instr >> 18) & 0x3;
}

static inline int get_prs(unsigned int instr)
{
	return (instr >> 17) & 0x1;
}

static inline int get_r(unsigned int instr)
{
	return (instr >> 16) & 0x1;
}

static inline int get_lpid(unsigned long r_val)
{
	return r_val & 0xffffffff;
}

static inline int get_is(unsigned long r_val)
{
	return (r_val >> 10) & 0x3;
}

static inline int get_ap(unsigned long r_val)
{
	return (r_val >> 5) & 0x7;
}

static inline long get_epn(unsigned long r_val)
{
	return r_val >> 12;
}

static int kvmhv_emulate_tlbie_tlb_addr(struct kvm_vcpu *vcpu, int lpid,
					int ap, long epn)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_nested_guest *gp;
	long npages;
	int shift, shadow_shift;
	unsigned long addr;

	shift = ap_to_shift(ap);
	addr = epn << 12;
	if (shift < 0)
		/* Invalid ap encoding */
		return -EINVAL;

	addr &= ~((1UL << shift) - 1);
	npages = 1UL << (shift - PAGE_SHIFT);

	gp = kvmhv_get_nested(kvm, lpid, false);
	if (!gp) /* No such guest -> nothing to do */
		return 0;
	mutex_lock(&gp->tlb_lock);

	/* There may be more than one host page backing this single guest pte */
	do {
		kvmhv_invalidate_shadow_pte(vcpu, gp, addr, &shadow_shift);

		npages -= 1UL << (shadow_shift - PAGE_SHIFT);
		addr += 1UL << shadow_shift;
	} while (npages > 0);

	mutex_unlock(&gp->tlb_lock);
	kvmhv_put_nested(gp);
	return 0;
}

static void kvmhv_emulate_tlbie_lpid(struct kvm_vcpu *vcpu,
				     struct kvm_nested_guest *gp, int ric)
{
	struct kvm *kvm = vcpu->kvm;

	mutex_lock(&gp->tlb_lock);
	switch (ric) {
	case 0:
		/* Invalidate TLB */
		spin_lock(&kvm->mmu_lock);
		kvmppc_free_pgtable_radix(kvm, gp->shadow_pgtable,
					  gp->shadow_lpid);
		kvmhv_flush_lpid(gp->shadow_lpid);
		spin_unlock(&kvm->mmu_lock);
		break;
	case 1:
		/*
		 * Invalidate PWC
		 * We don't cache this -> nothing to do
		 */
		break;
	case 2:
		/* Invalidate TLB, PWC and caching of partition table entries */
		kvmhv_flush_nested(gp);
		break;
	default:
		break;
	}
	mutex_unlock(&gp->tlb_lock);
}

static void kvmhv_emulate_tlbie_all_lpid(struct kvm_vcpu *vcpu, int ric)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_nested_guest *gp;
	int lpid;

	spin_lock(&kvm->mmu_lock);
	idr_for_each_entry(&kvm->arch.kvm_nested_guest_idr, gp, lpid) {
		spin_unlock(&kvm->mmu_lock);
		kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
		spin_lock(&kvm->mmu_lock);
	}
	spin_unlock(&kvm->mmu_lock);
}

static int kvmhv_emulate_priv_tlbie(struct kvm_vcpu *vcpu, unsigned int instr,
				    unsigned long rsval, unsigned long rbval)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_nested_guest *gp;
	int r, ric, prs, is, ap;
	int lpid;
	long epn;
	int ret = 0;

	ric = get_ric(instr);
	prs = get_prs(instr);
	r = get_r(instr);
	lpid = get_lpid(rsval);
	is = get_is(rbval);

	/*
	 * These cases are invalid and are not handled:
	 * r != 1 -> Only radix supported
	 * prs == 1 -> Not HV privileged
	 * ric == 3 -> No cluster bombs for radix
	 * is == 1 -> Partition scoped translations not associated with pid
	 * (!is) && (ric == 1 || ric == 2) -> Not supported by ISA
	 */
	if ((!r) || (prs) || (ric == 3) || (is == 1) ||
	    ((!is) && (ric == 1 || ric == 2)))
		return -EINVAL;

	switch (is) {
	case 0:
		/*
		 * We know ric == 0
		 * Invalidate TLB for a given target address
		 */
		epn = get_epn(rbval);
		ap = get_ap(rbval);
		ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap, epn);
		break;
	case 2:
		/* Invalidate matching LPID */
		gp = kvmhv_get_nested(kvm, lpid, false);
		if (gp) {
			kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
			kvmhv_put_nested(gp);
		}
		break;
	case 3:
		/* Invalidate ALL LPIDs */
		kvmhv_emulate_tlbie_all_lpid(vcpu, ric);
		break;
	default:
		ret = -EINVAL;
		break;
	}

	return ret;
}

/*
 * This handles the H_TLB_INVALIDATE hcall.
 * Parameters are (r4) tlbie instruction code, (r5) rS contents,
 * (r6) rB contents.
 */
long kvmhv_do_nested_tlbie(struct kvm_vcpu *vcpu)
{
	int ret;

	ret = kvmhv_emulate_priv_tlbie(vcpu, kvmppc_get_gpr(vcpu, 4),
			kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6));
	if (ret)
		return H_PARAMETER;
	return H_SUCCESS;
}

static long do_tlb_invalidate_nested_all(struct kvm_vcpu *vcpu,
					 unsigned long lpid, unsigned long ric)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_nested_guest *gp;

	gp = kvmhv_get_nested(kvm, lpid, false);
	if (gp) {
		kvmhv_emulate_tlbie_lpid(vcpu, gp, ric);
		kvmhv_put_nested(gp);
	}
	return H_SUCCESS;
}

/*
 * Number of pages above which we invalidate the entire LPID rather than
 * flush individual pages.
 */
static unsigned long tlb_range_flush_page_ceiling __read_mostly = 33;

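/*
 * Flush a range of addresses for each requested page size, falling back
 * to a full LPID flush once the number of pages exceeds
 * tlb_range_flush_page_ceiling.
 */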
static long do_tlb_invalidate_nested_tlb(struct kvm_vcpu *vcpu,
					 unsigned long lpid,
					 unsigned long pg_sizes,
					 unsigned long start,
					 unsigned long end)
{
	int ret = H_P4;
	unsigned long addr, nr_pages;
	struct mmu_psize_def *def;
	unsigned long psize, ap, page_size;
	bool flush_lpid;

	for (psize = 0; psize < MMU_PAGE_COUNT; psize++) {
		def = &mmu_psize_defs[psize];
		if (!(pg_sizes & def->h_rpt_pgsize))
			continue;

		nr_pages = (end - start) >> def->shift;
		flush_lpid = nr_pages > tlb_range_flush_page_ceiling;
		if (flush_lpid)
			return do_tlb_invalidate_nested_all(vcpu, lpid,
							    RIC_FLUSH_TLB);
		addr = start;
		ap = mmu_get_ap(psize);
		page_size = 1UL << def->shift;
		do {
			ret = kvmhv_emulate_tlbie_tlb_addr(vcpu, lpid, ap,
							   get_epn(addr));
			if (ret)
				return H_P4;
			addr += page_size;
		} while (addr < end);
	}
	return ret;
}

/*
 * Performs partition-scoped invalidations for nested guests
 * as part of H_RPT_INVALIDATE hcall.
 */
long do_h_rpt_invalidate_pat(struct kvm_vcpu *vcpu, unsigned long lpid,
			     unsigned long type, unsigned long pg_sizes,
			     unsigned long start, unsigned long end)
{
	/*
	 * If L2 lpid isn't valid, we need to return H_PARAMETER.
	 *
	 * However, nested KVM issues a L2 lpid flush call when creating
	 * partition table entries for L2. This happens even before the
	 * corresponding shadow lpid is created in HV which happens in
	 * H_ENTER_NESTED call. Since we can't differentiate this case from
	 * the invalid case, we ignore such flush requests and return success.
	 */
	if (!__find_nested(vcpu->kvm, lpid))
		return H_SUCCESS;

	/*
	 * A flush all request can be handled by a full lpid flush only.
	 */
	if ((type & H_RPTI_TYPE_NESTED_ALL) == H_RPTI_TYPE_NESTED_ALL)
		return do_tlb_invalidate_nested_all(vcpu, lpid, RIC_FLUSH_ALL);

	/*
	 * We don't need to handle a PWC flush like the process table case,
	 * because the intermediate partition-scoped table in the nested
	 * guest doesn't really have a PWC. The only level at which we have
	 * a PWC is in L0, and for a nested invalidate at L0 we always do
	 * kvmhv_flush_lpid(), which does radix__flush_all_lpid(). For a
	 * range invalidate at any level, we are not removing the higher
	 * level page tables and hence there is no PWC invalidate needed.
	 *
	 * if (type & H_RPTI_TYPE_PWC) {
	 *	ret = do_tlb_invalidate_nested_all(vcpu, lpid, RIC_FLUSH_PWC);
	 *	if (ret)
	 *		return H_P4;
	 * }
	 */

	if (start == 0 && end == -1)
		return do_tlb_invalidate_nested_all(vcpu, lpid, RIC_FLUSH_TLB);

	if (type & H_RPTI_TYPE_TLB)
		return do_tlb_invalidate_nested_tlb(vcpu, lpid, pg_sizes,
						    start, end);
	return H_SUCCESS;
}

/* Used to convert a nested guest real address to a L1 guest real address */
static int kvmhv_translate_addr_nested(struct kvm_vcpu *vcpu,
				       struct kvm_nested_guest *gp,
				       unsigned long n_gpa, unsigned long dsisr,
				       struct kvmppc_pte *gpte_p)
{
	u64 fault_addr, flags = dsisr & DSISR_ISSTORE;
	int ret;

	ret = kvmppc_mmu_walk_radix_tree(vcpu, n_gpa, gpte_p, gp->l1_gr_to_hr,
					 &fault_addr);

	if (ret) {
		/* We didn't find a pte */
		if (ret == -EINVAL) {
			/* Unsupported mmu config */
			flags |= DSISR_UNSUPP_MMU;
		} else if (ret == -ENOENT) {
			/* No translation found */
			flags |= DSISR_NOHPTE;
		} else if (ret == -EFAULT) {
			/* Couldn't access L1 real address */
			flags |= DSISR_PRTABLE_FAULT;
			vcpu->arch.fault_gpa = fault_addr;
		} else {
			/* Unknown error */
			return ret;
		}
		goto forward_to_l1;
	} else {
		/* We found a pte -> check permissions */
		if (dsisr & DSISR_ISSTORE) {
			/* Can we write? */
			if (!gpte_p->may_write) {
				flags |= DSISR_PROTFAULT;
				goto forward_to_l1;
			}
		} else if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
			/* Can we execute? */
			if (!gpte_p->may_execute) {
				flags |= SRR1_ISI_N_G_OR_CIP;
				goto forward_to_l1;
			}
		} else {
			/* Can we read? */
			if (!gpte_p->may_read && !gpte_p->may_write) {
				flags |= DSISR_PROTFAULT;
				goto forward_to_l1;
			}
		}
	}

	return 0;

forward_to_l1:
	vcpu->arch.fault_dsisr = flags;
	if (vcpu->arch.trap == BOOK3S_INTERRUPT_H_INST_STORAGE) {
		vcpu->arch.shregs.msr &= SRR1_MSR_BITS;
		vcpu->arch.shregs.msr |= flags;
	}
	return RESUME_HOST;
}

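/*
 * Handle a fault that was due to reference/change bits needing to be
 * set.  The rc bits must already be set in the L1 partition-scoped pte;
 * otherwise the fault is reflected to L1.
 */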
static long kvmhv_handle_nested_set_rc(struct kvm_vcpu *vcpu,
				       struct kvm_nested_guest *gp,
				       unsigned long n_gpa,
				       struct kvmppc_pte gpte,
				       unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	bool writing = !!(dsisr & DSISR_ISSTORE);
	u64 pgflags;
	long ret;

	/* Are the rc bits set in the L1 partition scoped pte? */
	pgflags = _PAGE_ACCESSED;
	if (writing)
		pgflags |= _PAGE_DIRTY;
	if (pgflags & ~gpte.rc)
		return RESUME_HOST;

	spin_lock(&kvm->mmu_lock);
	/* Set the rc bit in the pte of our (L0) pgtable for the L1 guest */
	ret = kvmppc_hv_handle_set_rc(kvm, false, writing,
				      gpte.raddr, kvm->arch.lpid);
	if (!ret) {
		ret = -EINVAL;
		goto out_unlock;
	}

	/* Set the rc bit in the pte of the shadow_pgtable for the nested guest */
	ret = kvmppc_hv_handle_set_rc(kvm, true, writing,
				      n_gpa, gp->l1_lpid);
	if (!ret)
		ret = -EINVAL;
	else
		ret = 0;

out_unlock:
	spin_unlock(&kvm->mmu_lock);
	return ret;
}

static inline int kvmppc_radix_level_to_shift(int level)
{
	switch (level) {
	case 2:
		return PUD_SHIFT;
	case 1:
		return PMD_SHIFT;
	default:
		return PAGE_SHIFT;
	}
}

static inline int kvmppc_radix_shift_to_level(int shift)
{
	if (shift == PUD_SHIFT)
		return 2;
	if (shift == PMD_SHIFT)
		return 1;
	if (shift == PAGE_SHIFT)
		return 0;
	WARN_ON_ONCE(1);
	return 0;
}

/* called with gp->tlb_lock held */
static long int __kvmhv_nested_page_fault(struct kvm_vcpu *vcpu,
					  struct kvm_nested_guest *gp)
{
	struct kvm *kvm = vcpu->kvm;
	struct kvm_memory_slot *memslot;
	struct rmap_nested *n_rmap;
	struct kvmppc_pte gpte;
	pte_t pte, *pte_p;
	unsigned long mmu_seq;
	unsigned long dsisr = vcpu->arch.fault_dsisr;
	unsigned long ea = vcpu->arch.fault_dar;
	unsigned long *rmapp;
	unsigned long n_gpa, gpa, gfn, perm = 0UL;
	unsigned int shift, l1_shift, level;
	bool writing = !!(dsisr & DSISR_ISSTORE);
	bool kvm_ro = false;
	long int ret;

	if (!gp->l1_gr_to_hr) {
		kvmhv_update_ptbl_cache(gp);
		if (!gp->l1_gr_to_hr)
			return RESUME_HOST;
	}

	/* Convert the nested guest real address into a L1 guest real address */

	n_gpa = vcpu->arch.fault_gpa & ~0xF000000000000FFFULL;
	if (!(dsisr & DSISR_PRTABLE_FAULT))
		n_gpa |= ea & 0xFFF;
	ret = kvmhv_translate_addr_nested(vcpu, gp, n_gpa, dsisr, &gpte);

	/*
	 * If the hardware found a translation but we don't now have a usable
	 * translation in the l1 partition-scoped tree, remove the shadow pte
	 * and let the guest retry.
	 */
	if (ret == RESUME_HOST &&
	    (dsisr & (DSISR_PROTFAULT | DSISR_BADACCESS | DSISR_NOEXEC_OR_G |
		      DSISR_BAD_COPYPASTE)))
		goto inval;
	if (ret)
		return ret;

	/* Failed to set the reference/change bits */
	if (dsisr & DSISR_SET_RC) {
		ret = kvmhv_handle_nested_set_rc(vcpu, gp, n_gpa, gpte, dsisr);
		if (ret == RESUME_HOST)
			return ret;
		if (ret)
			goto inval;
		dsisr &= ~DSISR_SET_RC;
		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
			       DSISR_PROTFAULT)))
			return RESUME_GUEST;
	}

	/*
	 * We took an HISI or HDSI while we were running a nested guest which
	 * means we have no partition scoped translation for that. This means
	 * we need to insert a pte for the mapping into our shadow_pgtable.
	 */

	l1_shift = gpte.page_shift;
	if (l1_shift < PAGE_SHIFT) {
		/* We don't support l1 using a page size smaller than our own */
		pr_err("KVM: L1 guest page shift (%d) less than our own (%d)\n",
		       l1_shift, PAGE_SHIFT);
		return -EINVAL;
	}
	gpa = gpte.raddr;
	gfn = gpa >> PAGE_SHIFT;

	/* 1. Get the corresponding host memslot */

	memslot = gfn_to_memslot(kvm, gfn);
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS)) {
			/* unusual error -> reflect to the guest as a DSI */
			kvmppc_core_queue_data_storage(vcpu,
					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
					ea, dsisr);
			return RESUME_GUEST;
		}

		/* passthrough of emulated MMIO case */
		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
	}
	if (memslot->flags & KVM_MEM_READONLY) {
		if (writing) {
			/* Give the guest a DSI */
			kvmppc_core_queue_data_storage(vcpu,
					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
					ea, DSISR_ISSTORE | DSISR_PROTFAULT);
			return RESUME_GUEST;
		}
		kvm_ro = true;
	}

	/* 2. Find the host pte for this L1 guest real address */

	/* Used to check for invalidations in progress */
	mmu_seq = kvm->mmu_invalidate_seq;
	smp_rmb();

	/* See if can find translation in our partition scoped tables for L1 */
	pte = __pte(0);
	spin_lock(&kvm->mmu_lock);
	pte_p = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (!shift)
		shift = PAGE_SHIFT;
	if (pte_p)
		pte = *pte_p;
	spin_unlock(&kvm->mmu_lock);

	if (!pte_present(pte) || (writing && !(pte_val(pte) & _PAGE_WRITE))) {
		/* No suitable pte found -> try to insert a mapping */
		ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot,
					writing, kvm_ro, &pte, &level);
		if (ret == -EAGAIN)
			return RESUME_GUEST;
		else if (ret)
			return ret;
		shift = kvmppc_radix_level_to_shift(level);
	}
	/* Align gfn to the start of the page */
	gfn = (gpa & ~((1UL << shift) - 1)) >> PAGE_SHIFT;

	/* 3. Compute the pte we need to insert for nest_gpa -> host r_addr */

	/* The permissions are the combination of the host and l1 guest ptes */
	perm |= gpte.may_read ? 0UL : _PAGE_READ;
	perm |= gpte.may_write ? 0UL : _PAGE_WRITE;
	perm |= gpte.may_execute ? 0UL : _PAGE_EXEC;
	/* Only set accessed/dirty (rc) bits if set in host and l1 guest ptes */
	perm |= (gpte.rc & _PAGE_ACCESSED) ? 0UL : _PAGE_ACCESSED;
	perm |= ((gpte.rc & _PAGE_DIRTY) && writing) ? 0UL : _PAGE_DIRTY;
	pte = __pte(pte_val(pte) & ~perm);

	/* What size pte can we insert? */
	if (shift > l1_shift) {
		u64 mask;
		unsigned int actual_shift = PAGE_SHIFT;
		if (PMD_SHIFT < l1_shift)
			actual_shift = PMD_SHIFT;
		mask = (1UL << shift) - (1UL << actual_shift);
		pte = __pte(pte_val(pte) | (gpa & mask));
		shift = actual_shift;
	}
	level = kvmppc_radix_shift_to_level(shift);
	n_gpa &= ~((1UL << shift) - 1);

	/* 4. Insert the pte into our shadow_pgtable */

	n_rmap = kzalloc(sizeof(*n_rmap), GFP_KERNEL);
	if (!n_rmap)
		return RESUME_GUEST; /* Let the guest try again */
	n_rmap->rmap = (n_gpa & RMAP_NESTED_GPA_MASK) |
		(((unsigned long) gp->l1_lpid) << RMAP_NESTED_LPID_SHIFT);
	rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
	ret = kvmppc_create_pte(kvm, gp->shadow_pgtable, pte, n_gpa, level,
				mmu_seq, gp->shadow_lpid, rmapp, &n_rmap);
	kfree(n_rmap);
	if (ret == -EAGAIN)
		ret = RESUME_GUEST;	/* Let the guest try again */

	return ret;

inval:
	kvmhv_invalidate_shadow_pte(vcpu, gp, n_gpa, NULL);
	return RESUME_GUEST;
}

long int kvmhv_nested_page_fault(struct kvm_vcpu *vcpu)
{
	struct kvm_nested_guest *gp = vcpu->arch.nested;
	long int ret;

	mutex_lock(&gp->tlb_lock);
	ret = __kvmhv_nested_page_fault(vcpu, gp);
	mutex_unlock(&gp->tlb_lock);
	return ret;
}

int kvmhv_nested_next_lpid(struct kvm *kvm, int lpid)
{
	int ret = lpid + 1;

	spin_lock(&kvm->mmu_lock);
	if (!idr_get_next(&kvm->arch.kvm_nested_guest_idr, &ret))
		ret = -1;
	spin_unlock(&kvm->mmu_lock);

	return ret;
}