// SPDX-License-Identifier: GPL-2.0-only
/*
 *
 * Copyright 2016 Paul Mackerras, IBM Corp. <paulus@au1.ibm.com>
 */

#include <linux/types.h>
#include <linux/string.h>
#include <linux/kvm.h>
#include <linux/kvm_host.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/debugfs.h>
#include <linux/pgtable.h>

#include <asm/kvm_ppc.h>
#include <asm/kvm_book3s.h>
#include "book3s_hv.h"
#include <asm/page.h>
#include <asm/mmu.h>
#include <asm/pgalloc.h>
#include <asm/pte-walk.h>
#include <asm/ultravisor.h>
#include <asm/kvm_book3s_uvmem.h>
#include <asm/plpar_wrappers.h>
#include <asm/firmware.h>

/*
 * Supported radix tree geometry.
 * Like p9, we support either 5 or 9 bits at the first (lowest) level,
 * for a page size of 64k or 4k.
 */
static int p9_supported_radix_bits[4] = { 5, 9, 9, 13 };

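/*
 * Copy to/from guest memory via quadrants 1 and 2 of the effective
 * address space.  In radix mode, EA bits 0:1 select a "quadrant":
 * 0b01 (quadrant 1) is translated with the guest's LPID and PID, and
 * 0b10 (quadrant 2) with the guest's LPID and PID 0, so loading
 * SPRN_LPID/SPRN_PID and OR-ing the quadrant into the top bits of the
 * address lets the hypervisor copy through the guest's own
 * translation.  For example, guest EA 0x1000 with a valid PID is
 * accessed via quadrant 1 as 0x4000000000001000.
 */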
unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int pid,
					      gva_t eaddr, void *to, void *from,
					      unsigned long n)
{
	int old_pid, old_lpid;
	unsigned long quadrant, ret = n;
	bool is_load = !!to;

	if (kvmhv_is_nestedv2())
		return H_UNSUPPORTED;

	/* Can't access quadrants 1 or 2 in non-HV mode, call the HV to do it */
	if (kvmhv_on_pseries())
		return plpar_hcall_norets(H_COPY_TOFROM_GUEST, lpid, pid, eaddr,
					  (to != NULL) ? __pa(to) : 0,
					  (from != NULL) ? __pa(from) : 0, n);

	if (eaddr & (0xFFFUL << 52))
		return ret;

	quadrant = 1;
	if (!pid)
		quadrant = 2;
	if (is_load)
		from = (void *) (eaddr | (quadrant << 62));
	else
		to = (void *) (eaddr | (quadrant << 62));

	preempt_disable();

	asm volatile("hwsync" ::: "memory");
	isync();
	/* switch the lpid first to avoid running host with unallocated pid */
	old_lpid = mfspr(SPRN_LPID);
	if (old_lpid != lpid)
		mtspr(SPRN_LPID, lpid);
	if (quadrant == 1) {
		old_pid = mfspr(SPRN_PID);
		if (old_pid != pid)
			mtspr(SPRN_PID, pid);
	}
	isync();

	pagefault_disable();
	if (is_load)
		ret = __copy_from_user_inatomic(to, (const void __user *)from, n);
	else
		ret = __copy_to_user_inatomic((void __user *)to, from, n);
	pagefault_enable();

	asm volatile("hwsync" ::: "memory");
	isync();
	/* switch the pid first to avoid running host with unallocated pid */
	if (quadrant == 1 && pid != old_pid)
		mtspr(SPRN_PID, old_pid);
	if (lpid != old_lpid)
		mtspr(SPRN_LPID, old_lpid);
	isync();

	preempt_enable();

	return ret;
}

static long kvmhv_copy_tofrom_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr,
					  void *to, void *from, unsigned long n)
{
	int lpid = vcpu->kvm->arch.lpid;
	int pid;

	/* This would cause a data segment interrupt so don't allow the access */
	if (eaddr & (0x3FFUL << 52))
		return -EINVAL;

	/* Should we be using the nested lpid */
	if (vcpu->arch.nested)
		lpid = vcpu->arch.nested->shadow_lpid;

	/* If accessing quadrant 3 then pid is expected to be 0 */
	if (((eaddr >> 62) & 0x3) == 0x3)
		pid = 0;
	else
		pid = kvmppc_get_pid(vcpu);

	eaddr &= ~(0xFFFUL << 52);

	return __kvmhv_copy_tofrom_guest_radix(lpid, pid, eaddr, to, from, n);
}

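/*
 * The copy helpers return the number of bytes NOT copied.  On a
 * partial copy from the guest, kvmhv_copy_from_guest_radix() zeroes
 * the uncopied tail of the destination buffer so callers never see
 * uninitialised data.
 */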
long kvmhv_copy_from_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *to,
				 unsigned long n)
{
	long ret;

	ret = kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, to, NULL, n);
	if (ret > 0)
		memset(to + (n - ret), 0, ret);

	return ret;
}

long kvmhv_copy_to_guest_radix(struct kvm_vcpu *vcpu, gva_t eaddr, void *from,
			       unsigned long n)
{
	return kvmhv_copy_tofrom_guest_radix(vcpu, eaddr, NULL, from, n);
}

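/*
 * Walk the radix tree rooted at 'root' (which has the layout of the
 * first doubleword of a partition- or process-table entry: RTS size
 * field, page-directory base and size) to translate 'eaddr', filling
 * in *gpte on success.  RTS encodes an address-space size of
 * 2^(RTS + 31) bytes; only the 52-bit (RTS = 21) geometry is accepted
 * below.
 */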
int kvmppc_mmu_walk_radix_tree(struct kvm_vcpu *vcpu, gva_t eaddr,
			       struct kvmppc_pte *gpte, u64 root,
			       u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret, level, ps;
	unsigned long rts, bits, offset, index;
	u64 pte, base, gpa;
	__be64 rpte;

	rts = ((root & RTS1_MASK) >> (RTS1_SHIFT - 3)) |
		((root & RTS2_MASK) >> RTS2_SHIFT);
	bits = root & RPDS_MASK;
	base = root & RPDB_MASK;

	offset = rts + 31;

	/* Current implementations only support 52-bit space */
	if (offset != 52)
		return -EINVAL;

	/* Walk each level of the radix tree */
	for (level = 3; level >= 0; --level) {
		u64 addr;
		/* Check a valid size */
		if (level && bits != p9_supported_radix_bits[level])
			return -EINVAL;
		if (level == 0 && !(bits == 5 || bits == 9))
			return -EINVAL;
		offset -= bits;
		index = (eaddr >> offset) & ((1UL << bits) - 1);
		/* Check that low bits of page table base are zero */
		if (base & ((1UL << (bits + 3)) - 1))
			return -EINVAL;
		/* Read the entry from guest memory */
		addr = base + (index * sizeof(rpte));

		kvm_vcpu_srcu_read_lock(vcpu);
		ret = kvm_read_guest(kvm, addr, &rpte, sizeof(rpte));
		kvm_vcpu_srcu_read_unlock(vcpu);
		if (ret) {
			if (pte_ret_p)
				*pte_ret_p = addr;
			return ret;
		}
		pte = __be64_to_cpu(rpte);
		if (!(pte & _PAGE_PRESENT))
			return -ENOENT;
		/* Check if a leaf entry */
		if (pte & _PAGE_PTE)
			break;
		/* Get ready to walk the next level */
		base = pte & RPDB_MASK;
		bits = pte & RPDS_MASK;
	}

	/* Need a leaf at lowest level; 512GB pages not supported */
	if (level < 0 || level == 3)
		return -EINVAL;

	/* We found a valid leaf PTE */
	/* Offset is now log base 2 of the page size */
	gpa = pte & 0x01fffffffffff000ul;
	if (gpa & ((1ul << offset) - 1))
		return -EINVAL;
	gpa |= eaddr & ((1ul << offset) - 1);
	for (ps = MMU_PAGE_4K; ps < MMU_PAGE_COUNT; ++ps)
		if (offset == mmu_psize_defs[ps].shift)
			break;
	gpte->page_size = ps;
	gpte->page_shift = offset;

	gpte->eaddr = eaddr;
	gpte->raddr = gpa;

	/* Work out permissions */
	gpte->may_read = !!(pte & _PAGE_READ);
	gpte->may_write = !!(pte & _PAGE_WRITE);
	gpte->may_execute = !!(pte & _PAGE_EXEC);

	gpte->rc = pte & (_PAGE_ACCESSED | _PAGE_DIRTY);

	if (pte_ret_p)
		*pte_ret_p = pte;

	return 0;
}

/*
 * Used to walk a partition or process table radix tree in guest memory
 * Note: We exploit the fact that a partition table and a process
 * table have the same layout, a partition-scoped page table and a
 * process-scoped page table have the same layout, and the 2nd
 * doubleword of a partition table entry has the same layout as
 * the PTCR register.
 */
int kvmppc_mmu_radix_translate_table(struct kvm_vcpu *vcpu, gva_t eaddr,
				     struct kvmppc_pte *gpte, u64 table,
				     int table_index, u64 *pte_ret_p)
{
	struct kvm *kvm = vcpu->kvm;
	int ret;
	unsigned long size, ptbl, root;
	struct prtb_entry entry;

	if ((table & PRTS_MASK) > 24)
		return -EINVAL;
	size = 1ul << ((table & PRTS_MASK) + 12);

	/* Is the table big enough to contain this entry? */
	if ((table_index * sizeof(entry)) >= size)
		return -EINVAL;

	/* Read the table to find the root of the radix tree */
	ptbl = (table & PRTB_MASK) + (table_index * sizeof(entry));
	kvm_vcpu_srcu_read_lock(vcpu);
	ret = kvm_read_guest(kvm, ptbl, &entry, sizeof(entry));
	kvm_vcpu_srcu_read_unlock(vcpu);
	if (ret)
		return ret;

	/* Root is stored in the first double word */
	root = be64_to_cpu(entry.prtb0);

	return kvmppc_mmu_walk_radix_tree(vcpu, eaddr, gpte, root, pte_ret_p);
}

int kvmppc_mmu_radix_xlate(struct kvm_vcpu *vcpu, gva_t eaddr,
			   struct kvmppc_pte *gpte, bool data, bool iswrite)
{
	u32 pid;
	u64 pte;
	int ret;

	/* Work out effective PID */
	switch (eaddr >> 62) {
	case 0:
		pid = kvmppc_get_pid(vcpu);
		break;
	case 3:
		pid = 0;
		break;
	default:
		return -EINVAL;
	}

	ret = kvmppc_mmu_radix_translate_table(vcpu, eaddr, gpte,
				vcpu->kvm->arch.process_table, pid, &pte);
	if (ret)
		return ret;

	/* Check privilege (applies only to process scoped translations) */
	if (kvmppc_get_msr(vcpu) & MSR_PR) {
		if (pte & _PAGE_PRIVILEGED) {
			gpte->may_read = 0;
			gpte->may_write = 0;
			gpte->may_execute = 0;
		}
	} else {
		if (!(pte & _PAGE_PRIVILEGED)) {
			/* Check AMR/IAMR to see if strict mode is in force */
			if (kvmppc_get_amr_hv(vcpu) & (1ul << 62))
				gpte->may_read = 0;
			if (kvmppc_get_amr_hv(vcpu) & (1ul << 63))
				gpte->may_write = 0;
			if (vcpu->arch.iamr & (1ul << 62))
				gpte->may_execute = 0;
		}
	}

	return 0;
}

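/*
 * Invalidate the TLB for one page of a guest.  On bare metal the host
 * can flush directly; when running as a nested hypervisor under
 * pseries we must ask the L0, preferring the H_RPTI_* interface
 * (pseries_rpt_invalidate()) when firmware advertises it and falling
 * back to the H_TLB_INVALIDATE hcall otherwise.
 */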
void kvmppc_radix_tlbie_page(struct kvm *kvm, unsigned long addr,
			     unsigned int pshift, u64 lpid)
{
	unsigned long psize = PAGE_SIZE;
	int psi;
	long rc;
	unsigned long rb;

	if (pshift)
		psize = 1UL << pshift;
	else
		pshift = PAGE_SHIFT;

	addr &= ~(psize - 1);

	if (!kvmhv_on_pseries()) {
		radix__flush_tlb_lpid_page(lpid, addr, psize);
		return;
	}

	psi = shift_to_mmu_psize(pshift);

	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE)) {
		rb = addr | (mmu_get_ap(psi) << PPC_BITLSHIFT(58));
		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(0, 0, 1),
					lpid, rb);
	} else {
		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
					    H_RPTI_TYPE_NESTED |
					    H_RPTI_TYPE_TLB,
					    psize_to_rpti_pgsize(psi),
					    addr, addr + psize);
	}

	if (rc)
		pr_err("KVM: TLB page invalidation hcall failed, rc=%ld\n", rc);
}

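/* Flush the partition-scoped page-walk cache (PWC) for this LPID. */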
static void kvmppc_radix_flush_pwc(struct kvm *kvm, u64 lpid)
{
	long rc;

	if (!kvmhv_on_pseries()) {
		radix__flush_pwc_lpid(lpid);
		return;
	}

	if (!firmware_has_feature(FW_FEATURE_RPT_INVALIDATE))
		rc = plpar_hcall_norets(H_TLB_INVALIDATE, H_TLBIE_P1_ENC(1, 0, 1),
					lpid, TLBIEL_INVAL_SET_LPID);
	else
		rc = pseries_rpt_invalidate(lpid, H_RPTI_TARGET_CMMU,
					    H_RPTI_TYPE_NESTED |
					    H_RPTI_TYPE_PWC, H_RPTI_PAGE_ALL,
					    0, -1UL);
	if (rc)
		pr_err("KVM: TLB PWC invalidation hcall failed, rc=%ld\n", rc);
}

static unsigned long kvmppc_radix_update_pte(struct kvm *kvm, pte_t *ptep,
					     unsigned long clr, unsigned long set,
					     unsigned long addr, unsigned int shift)
{
	return __radix_pte_update(ptep, clr, set);
}

static void kvmppc_radix_set_pte_at(struct kvm *kvm, unsigned long addr,
				    pte_t *ptep, pte_t pte)
{
	radix__set_pte_at(kvm->mm, addr, ptep, pte, 0);
}

static struct kmem_cache *kvm_pte_cache;
static struct kmem_cache *kvm_pmd_cache;

static pte_t *kvmppc_pte_alloc(void)
{
	pte_t *pte;

	pte = kmem_cache_alloc(kvm_pte_cache, GFP_KERNEL);
	/* pmd_populate() will only reference _pa(pte). */
	kmemleak_ignore(pte);

	return pte;
}

static void kvmppc_pte_free(pte_t *ptep)
{
	kmem_cache_free(kvm_pte_cache, ptep);
}

static pmd_t *kvmppc_pmd_alloc(void)
{
	pmd_t *pmd;

	pmd = kmem_cache_alloc(kvm_pmd_cache, GFP_KERNEL);
	/* pud_populate() will only reference _pa(pmd). */
	kmemleak_ignore(pmd);

	return pmd;
}

static void kvmppc_pmd_free(pmd_t *pmdp)
{
	kmem_cache_free(kvm_pmd_cache, pmdp);
}

/* Called with kvm->mmu_lock held */
void kvmppc_unmap_pte(struct kvm *kvm, pte_t *pte, unsigned long gpa,
		      unsigned int shift,
		      const struct kvm_memory_slot *memslot,
		      u64 lpid)

{
	unsigned long old;
	unsigned long gfn = gpa >> PAGE_SHIFT;
	unsigned long page_size = PAGE_SIZE;
	unsigned long hpa;

	old = kvmppc_radix_update_pte(kvm, pte, ~0UL, 0, gpa, shift);
	kvmppc_radix_tlbie_page(kvm, gpa, shift, lpid);

	/* The following only applies to L1 entries */
	if (lpid != kvm->arch.lpid)
		return;

	if (!memslot) {
		memslot = gfn_to_memslot(kvm, gfn);
		if (!memslot)
			return;
	}
	if (shift) { /* 1GB or 2MB page */
		page_size = 1ul << shift;
		if (shift == PMD_SHIFT)
			kvm->stat.num_2M_pages--;
		else if (shift == PUD_SHIFT)
			kvm->stat.num_1G_pages--;
	}

	gpa &= ~(page_size - 1);
	hpa = old & PTE_RPN_MASK;
	kvmhv_remove_nest_rmap_range(kvm, memslot, gpa, hpa, page_size);

	if ((old & _PAGE_DIRTY) && memslot->dirty_bitmap)
		kvmppc_update_dirty_map(memslot, gfn, page_size);
}

/*
 * kvmppc_free_p?d are used to free existing page tables, and recursively
 * descend and clear and free children.
 * Callers are responsible for flushing the PWC.
 *
 * When page tables are being unmapped/freed as part of page fault path
 * (full == false), valid ptes are generally not expected; however, there
 * is one situation where they arise, which is when dirty page logging is
 * turned off for a memslot while the VM is running. The new memslot
 * becomes visible to page faults before the memslot commit function
 * gets to flush the memslot, which can lead to a 2MB page mapping being
 * installed for a guest physical address where there are already 64kB
 * (or 4kB) mappings (of sub-pages of the same 2MB page).
 */
static void kvmppc_unmap_free_pte(struct kvm *kvm, pte_t *pte, bool full,
				  u64 lpid)
{
	if (full) {
		memset(pte, 0, sizeof(long) << RADIX_PTE_INDEX_SIZE);
	} else {
		pte_t *p = pte;
		unsigned long it;

		for (it = 0; it < PTRS_PER_PTE; ++it, ++p) {
			if (pte_val(*p) == 0)
				continue;
			kvmppc_unmap_pte(kvm, p,
					 pte_pfn(*p) << PAGE_SHIFT,
					 PAGE_SHIFT, NULL, lpid);
		}
	}

	kvmppc_pte_free(pte);
}

static void kvmppc_unmap_free_pmd(struct kvm *kvm, pmd_t *pmd, bool full,
				  u64 lpid)
{
	unsigned long im;
	pmd_t *p = pmd;

	for (im = 0; im < PTRS_PER_PMD; ++im, ++p) {
		if (!pmd_present(*p))
			continue;
		if (pmd_leaf(*p)) {
			if (full) {
				pmd_clear(p);
			} else {
				WARN_ON_ONCE(1);
				kvmppc_unmap_pte(kvm, (pte_t *)p,
						 pte_pfn(*(pte_t *)p) << PAGE_SHIFT,
						 PMD_SHIFT, NULL, lpid);
			}
		} else {
			pte_t *pte;

			pte = pte_offset_kernel(p, 0);
			kvmppc_unmap_free_pte(kvm, pte, full, lpid);
			pmd_clear(p);
		}
	}
	kvmppc_pmd_free(pmd);
}

static void kvmppc_unmap_free_pud(struct kvm *kvm, pud_t *pud,
				  u64 lpid)
{
	unsigned long iu;
	pud_t *p = pud;

	for (iu = 0; iu < PTRS_PER_PUD; ++iu, ++p) {
		if (!pud_present(*p))
			continue;
		if (pud_leaf(*p)) {
			pud_clear(p);
		} else {
			pmd_t *pmd;

			pmd = pmd_offset(p, 0);
			kvmppc_unmap_free_pmd(kvm, pmd, true, lpid);
			pud_clear(p);
		}
	}
	pud_free(kvm->mm, pud);
}

void kvmppc_free_pgtable_radix(struct kvm *kvm, pgd_t *pgd, u64 lpid)
{
	unsigned long ig;

	for (ig = 0; ig < PTRS_PER_PGD; ++ig, ++pgd) {
		p4d_t *p4d = p4d_offset(pgd, 0);
		pud_t *pud;

		if (!p4d_present(*p4d))
			continue;
		pud = pud_offset(p4d, 0);
		kvmppc_unmap_free_pud(kvm, pud, lpid);
		p4d_clear(p4d);
	}
}

void kvmppc_free_radix(struct kvm *kvm)
{
	if (kvm->arch.pgtable) {
		kvmppc_free_pgtable_radix(kvm, kvm->arch.pgtable,
					  kvm->arch.lpid);
		pgd_free(kvm->mm, kvm->arch.pgtable);
		kvm->arch.pgtable = NULL;
	}
}

static void kvmppc_unmap_free_pmd_entry_table(struct kvm *kvm, pmd_t *pmd,
					      unsigned long gpa, u64 lpid)
{
	pte_t *pte = pte_offset_kernel(pmd, 0);

	/*
	 * Clearing the pmd entry then flushing the PWC ensures that the pte
	 * page will no longer be cached by the MMU, so it can be freed
	 * without flushing the PWC again.
	 */
	pmd_clear(pmd);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pte(kvm, pte, false, lpid);
}

static void kvmppc_unmap_free_pud_entry_table(struct kvm *kvm, pud_t *pud,
					      unsigned long gpa, u64 lpid)
{
	pmd_t *pmd = pmd_offset(pud, 0);

	/*
	 * Clearing the pud entry then flushing the PWC ensures that the pmd
	 * page and any child pte pages will no longer be cached by the MMU,
	 * so they can be freed without flushing the PWC again.
	 */
	pud_clear(pud);
	kvmppc_radix_flush_pwc(kvm, lpid);

	kvmppc_unmap_free_pmd(kvm, pmd, false, lpid);
}

/*
 * A number of bits may differ between different faults to the same
 * partition-scoped entry: the RC bits change in the course of cleaning
 * and aging, and the write bit can change because the access was
 * upgraded or because a read fault happened concurrently with a write
 * fault that set those bits first.
 */
#define PTE_BITS_MUST_MATCH (~(_PAGE_WRITE | _PAGE_DIRTY | _PAGE_ACCESSED))

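/*
 * Insert 'pte' into the partition-scoped tree at the given level
 * (0 = PTE, 1 = 2MB PMD leaf, 2 = 1GB PUD leaf).  Any intermediate
 * levels that may be needed are allocated before taking kvm->mmu_lock,
 * so the allocations can sleep; -EAGAIN tells the caller to let the
 * guest retry after a race with an invalidation or with another CPU
 * installing a conflicting mapping.
 */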
int kvmppc_create_pte(struct kvm *kvm, pgd_t *pgtable, pte_t pte,
		      unsigned long gpa, unsigned int level,
		      unsigned long mmu_seq, u64 lpid,
		      unsigned long *rmapp, struct rmap_nested **n_rmap)
{
	pgd_t *pgd;
	p4d_t *p4d;
	pud_t *pud, *new_pud = NULL;
	pmd_t *pmd, *new_pmd = NULL;
	pte_t *ptep, *new_ptep = NULL;
	int ret;

	/* Traverse the guest's 2nd-level tree, allocate new levels needed */
	pgd = pgtable + pgd_index(gpa);
	p4d = p4d_offset(pgd, gpa);

	pud = NULL;
	if (p4d_present(*p4d))
		pud = pud_offset(p4d, gpa);
	else
		new_pud = pud_alloc_one(kvm->mm, gpa);

	pmd = NULL;
	if (pud && pud_present(*pud) && !pud_leaf(*pud))
		pmd = pmd_offset(pud, gpa);
	else if (level <= 1)
		new_pmd = kvmppc_pmd_alloc();

	if (level == 0 && !(pmd && pmd_present(*pmd) && !pmd_leaf(*pmd)))
		new_ptep = kvmppc_pte_alloc();

	/* Check if we might have been invalidated; let the guest retry if so */
	spin_lock(&kvm->mmu_lock);
	ret = -EAGAIN;
	if (mmu_invalidate_retry(kvm, mmu_seq))
		goto out_unlock;

	/* Now traverse again under the lock and change the tree */
	ret = -ENOMEM;
	if (p4d_none(*p4d)) {
		if (!new_pud)
			goto out_unlock;
		p4d_populate(kvm->mm, p4d, new_pud);
		new_pud = NULL;
	}
	pud = pud_offset(p4d, gpa);
	if (pud_leaf(*pud)) {
		unsigned long hgpa = gpa & PUD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 2) {
			if (pud_raw(*pud) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 1GB page here already, add our extra bits */
			WARN_ON_ONCE((pud_val(*pud) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, (pte_t *)pud,
					      0, pte_val(pte), hgpa, PUD_SHIFT);
			ret = 0;
			goto out_unlock;
		}
		/*
		 * If we raced with another CPU which has just put
		 * a 1GB pte in after we saw a pmd page, try again.
		 */
		if (!new_pmd) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 1GB page here already, remove it */
		kvmppc_unmap_pte(kvm, (pte_t *)pud, hgpa, PUD_SHIFT, NULL,
				 lpid);
	}
	if (level == 2) {
		if (!pud_none(*pud)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pud_entry_table(kvm, pud, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, (pte_t *)pud, pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pud_none(*pud)) {
		if (!new_pmd)
			goto out_unlock;
		pud_populate(kvm->mm, pud, new_pmd);
		new_pmd = NULL;
	}
	pmd = pmd_offset(pud, gpa);
	if (pmd_leaf(*pmd)) {
		unsigned long lgpa = gpa & PMD_MASK;

		/* Check if we raced and someone else has set the same thing */
		if (level == 1) {
			if (pmd_raw(*pmd) == pte_raw(pte)) {
				ret = 0;
				goto out_unlock;
			}
			/* Valid 2MB page here already, add our extra bits */
			WARN_ON_ONCE((pmd_val(*pmd) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
			kvmppc_radix_update_pte(kvm, pmdp_ptep(pmd),
					0, pte_val(pte), lgpa, PMD_SHIFT);
			ret = 0;
			goto out_unlock;
		}

		/*
		 * If we raced with another CPU which has just put
		 * a 2MB pte in after we saw a pte page, try again.
		 */
		if (!new_ptep) {
			ret = -EAGAIN;
			goto out_unlock;
		}
		/* Valid 2MB page here already, remove it */
		kvmppc_unmap_pte(kvm, pmdp_ptep(pmd), lgpa, PMD_SHIFT, NULL,
				 lpid);
	}
	if (level == 1) {
		if (!pmd_none(*pmd)) {
			/*
			 * There's a page table page here, but we wanted to
			 * install a large page, so remove and free the page
			 * table page.
			 */
			kvmppc_unmap_free_pmd_entry_table(kvm, pmd, gpa, lpid);
		}
		kvmppc_radix_set_pte_at(kvm, gpa, pmdp_ptep(pmd), pte);
		if (rmapp && n_rmap)
			kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
		ret = 0;
		goto out_unlock;
	}
	if (pmd_none(*pmd)) {
		if (!new_ptep)
			goto out_unlock;
		pmd_populate(kvm->mm, pmd, new_ptep);
		new_ptep = NULL;
	}
	ptep = pte_offset_kernel(pmd, gpa);
	if (pte_present(*ptep)) {
		/* Check if someone else set the same thing */
		if (pte_raw(*ptep) == pte_raw(pte)) {
			ret = 0;
			goto out_unlock;
		}
		/* Valid page here already, add our extra bits */
		WARN_ON_ONCE((pte_val(*ptep) ^ pte_val(pte)) &
							PTE_BITS_MUST_MATCH);
		kvmppc_radix_update_pte(kvm, ptep, 0, pte_val(pte), gpa, 0);
		ret = 0;
		goto out_unlock;
	}
	kvmppc_radix_set_pte_at(kvm, gpa, ptep, pte);
	if (rmapp && n_rmap)
		kvmhv_insert_nest_rmap(kvm, rmapp, n_rmap);
	ret = 0;

 out_unlock:
	spin_unlock(&kvm->mmu_lock);
	if (new_pud)
		pud_free(kvm->mm, new_pud);
	if (new_pmd)
		kvmppc_pmd_free(new_pmd);
	if (new_ptep)
		kvmppc_pte_free(new_ptep);
	return ret;
}

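/*
 * Try to satisfy a fault whose only purpose is to set the R (reference)
 * or C (change) bit in the partition-scoped PTE, mirroring what the
 * hardware would do.  Called with kvm->mmu_lock held.
 */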
bool kvmppc_hv_handle_set_rc(struct kvm *kvm, bool nested, bool writing,
			     unsigned long gpa, u64 lpid)
{
	unsigned long pgflags;
	unsigned int shift;
	pte_t *ptep;

	/*
	 * Need to set an R or C bit in the 2nd-level tables;
	 * since we are just helping out the hardware here,
	 * it is sufficient to do what the hardware does.
	 */
	pgflags = _PAGE_ACCESSED;
	if (writing)
		pgflags |= _PAGE_DIRTY;

	if (nested)
		ptep = find_kvm_nested_guest_pte(kvm, lpid, gpa, &shift);
	else
		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);

	if (ptep && pte_present(*ptep) && (!writing || pte_write(*ptep))) {
		kvmppc_radix_update_pte(kvm, ptep, 0, pgflags, gpa, shift);
		return true;
	}
	return false;
}

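/*
 * Fault in and map one guest page at 'gpa'.  The host PTE backing the
 * page supplies the page size and attribute bits.  Write access is
 * granted either because the fault was a store or because the fast
 * GUP check found the page already writable ("upgrade_write"), which
 * avoids a second fault when the guest later stores to the page.
 */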
int kvmppc_book3s_instantiate_page(struct kvm_vcpu *vcpu,
				   unsigned long gpa,
				   struct kvm_memory_slot *memslot,
				   bool writing, bool kvm_ro,
				   pte_t *inserted_pte, unsigned int *levelp)
{
	struct kvm *kvm = vcpu->kvm;
	struct page *page = NULL;
	unsigned long mmu_seq;
	unsigned long hva, gfn = gpa >> PAGE_SHIFT;
	bool upgrade_write = false;
	bool *upgrade_p = &upgrade_write;
	pte_t pte, *ptep;
	unsigned int shift, level;
	int ret;
	bool large_enable;

	/* used to check for invalidations in progress */
	mmu_seq = kvm->mmu_invalidate_seq;
	smp_rmb();
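	/*
	 * The barrier orders the read of mmu_invalidate_seq before the
	 * host page-table walk below; kvmppc_create_pte() rechecks the
	 * sequence under kvm->mmu_lock and returns -EAGAIN if an
	 * invalidation ran in between.
	 */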

	/*
	 * Do a fast check first, since __gfn_to_pfn_memslot doesn't
	 * do it with !atomic && !async, which is how we call it.
	 * We always ask for write permission since the common case
	 * is that the page is writable.
	 */
	hva = gfn_to_hva_memslot(memslot, gfn);
	if (!kvm_ro && get_user_page_fast_only(hva, FOLL_WRITE, &page)) {
		upgrade_write = true;
	} else {
		unsigned long pfn;

		/* Call KVM generic code to do the slow-path check */
		pfn = __gfn_to_pfn_memslot(memslot, gfn, false, false, NULL,
					   writing, upgrade_p, NULL);
		if (is_error_noslot_pfn(pfn))
			return -EFAULT;
		page = NULL;
		if (pfn_valid(pfn)) {
			page = pfn_to_page(pfn);
			if (PageReserved(page))
				page = NULL;
		}
	}

	/*
	 * Read the PTE from the process' radix tree and use that
	 * so we get the shift and attribute bits.
	 */
	spin_lock(&kvm->mmu_lock);
	ptep = find_kvm_host_pte(kvm, mmu_seq, hva, &shift);
	pte = __pte(0);
	if (ptep)
		pte = READ_ONCE(*ptep);
	spin_unlock(&kvm->mmu_lock);
	/*
	 * If the PTE disappeared temporarily due to a THP
	 * collapse, just return and let the guest try again.
	 */
	if (!pte_present(pte)) {
		if (page)
			put_page(page);
		return RESUME_GUEST;
	}

	/* If we're logging dirty pages, always map single pages */
	large_enable = !(memslot->flags & KVM_MEM_LOG_DIRTY_PAGES);

	/* Get pte level from shift/size */
	if (large_enable && shift == PUD_SHIFT &&
	    (gpa & (PUD_SIZE - PAGE_SIZE)) ==
	    (hva & (PUD_SIZE - PAGE_SIZE))) {
		level = 2;
	} else if (large_enable && shift == PMD_SHIFT &&
		   (gpa & (PMD_SIZE - PAGE_SIZE)) ==
		   (hva & (PMD_SIZE - PAGE_SIZE))) {
		level = 1;
	} else {
		level = 0;
		if (shift > PAGE_SHIFT) {
			/*
			 * If the pte maps more than one page, bring over
			 * bits from the virtual address to get the real
			 * address of the specific single page we want.
			 */
			unsigned long rpnmask = (1ul << shift) - PAGE_SIZE;
			pte = __pte(pte_val(pte) | (hva & rpnmask));
		}
	}

	pte = __pte(pte_val(pte) | _PAGE_EXEC | _PAGE_ACCESSED);
	if (writing || upgrade_write) {
		if (pte_val(pte) & _PAGE_WRITE)
			pte = __pte(pte_val(pte) | _PAGE_DIRTY);
	} else {
		pte = __pte(pte_val(pte) & ~(_PAGE_WRITE | _PAGE_DIRTY));
	}

	/* Allocate space in the tree and write the PTE */
	ret = kvmppc_create_pte(kvm, kvm->arch.pgtable, pte, gpa, level,
				mmu_seq, kvm->arch.lpid, NULL, NULL);
	if (inserted_pte)
		*inserted_pte = pte;
	if (levelp)
		*levelp = level;

	if (page) {
		if (!ret && (pte_val(pte) & _PAGE_WRITE))
			set_page_dirty_lock(page);
		put_page(page);
	}

	/* Increment number of large pages if we (successfully) inserted one */
	if (!ret) {
		if (level == 1)
			kvm->stat.num_2M_pages++;
		else if (level == 2)
			kvm->stat.num_1G_pages++;
	}

	return ret;
}

int kvmppc_book3s_radix_page_fault(struct kvm_vcpu *vcpu,
				   unsigned long ea, unsigned long dsisr)
{
	struct kvm *kvm = vcpu->kvm;
	unsigned long gpa, gfn;
	struct kvm_memory_slot *memslot;
	long ret;
	bool writing = !!(dsisr & DSISR_ISSTORE);
	bool kvm_ro = false;

	/* Check for unusual errors */
	if (dsisr & DSISR_UNSUPP_MMU) {
		pr_err("KVM: Got unsupported MMU fault\n");
		return -EFAULT;
	}
	if (dsisr & DSISR_BADACCESS) {
		/* Reflect to the guest as DSI */
		pr_err("KVM: Got radix HV page fault with DSISR=%lx\n", dsisr);
		kvmppc_core_queue_data_storage(vcpu,
				kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
				ea, dsisr);
		return RESUME_GUEST;
	}

	/* Translate the logical address */
	gpa = vcpu->arch.fault_gpa & ~0xfffUL;
	gpa &= ~0xF000000000000000ul;
	gfn = gpa >> PAGE_SHIFT;
	if (!(dsisr & DSISR_PRTABLE_FAULT))
		gpa |= ea & 0xfff;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return kvmppc_send_page_to_uv(kvm, gfn);

	/* Get the corresponding memslot */
	memslot = gfn_to_memslot(kvm, gfn);

	/* No memslot means it's an emulated MMIO region */
	if (!memslot || (memslot->flags & KVM_MEMSLOT_INVALID)) {
		if (dsisr & (DSISR_PRTABLE_FAULT | DSISR_BADACCESS |
			     DSISR_SET_RC)) {
			/*
			 * Bad address in guest page table tree, or other
			 * unusual error - reflect it to the guest as DSI.
			 */
			kvmppc_core_queue_data_storage(vcpu,
					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
					ea, dsisr);
			return RESUME_GUEST;
		}
		return kvmppc_hv_emulate_mmio(vcpu, gpa, ea, writing);
	}

	if (memslot->flags & KVM_MEM_READONLY) {
		if (writing) {
			/* give the guest a DSI */
			kvmppc_core_queue_data_storage(vcpu,
					kvmppc_get_msr(vcpu) & SRR1_PREFIXED,
					ea, DSISR_ISSTORE | DSISR_PROTFAULT);
			return RESUME_GUEST;
		}
		kvm_ro = true;
	}

	/* Failed to set the reference/change bits */
	if (dsisr & DSISR_SET_RC) {
		spin_lock(&kvm->mmu_lock);
		if (kvmppc_hv_handle_set_rc(kvm, false, writing,
					    gpa, kvm->arch.lpid))
			dsisr &= ~DSISR_SET_RC;
		spin_unlock(&kvm->mmu_lock);

		if (!(dsisr & (DSISR_BAD_FAULT_64S | DSISR_NOHPTE |
			       DSISR_PROTFAULT | DSISR_SET_RC)))
			return RESUME_GUEST;
	}

	/* Try to insert a pte */
	ret = kvmppc_book3s_instantiate_page(vcpu, gpa, memslot, writing,
					     kvm_ro, NULL, NULL);

	if (ret == 0 || ret == -EAGAIN)
		ret = RESUME_GUEST;
	return ret;
}

/* Called with kvm->mmu_lock held */
void kvm_unmap_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		     unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE) {
		uv_page_inval(kvm->arch.lpid, gpa, PAGE_SHIFT);
		return;
	}

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep))
		kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
				 kvm->arch.lpid);
}

/* Called with kvm->mmu_lock held */
bool kvm_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
		   unsigned long gfn)
{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	bool ref = false;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep)) {
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_ACCESSED, 0,
					      gpa, shift);
		/* XXX need to flush tlb here? */
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_ACCESSED, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		ref = true;
	}
	return ref;
}

/* Called with kvm->mmu_lock held */
bool kvm_test_age_radix(struct kvm *kvm, struct kvm_memory_slot *memslot,
			unsigned long gfn)

{
	pte_t *ptep;
	unsigned long gpa = gfn << PAGE_SHIFT;
	unsigned int shift;
	bool ref = false;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ref;

	ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
	if (ptep && pte_present(*ptep) && pte_young(*ptep))
		ref = true;
	return ref;
}

/* Returns the number of PAGE_SIZE pages that are dirty */
static int kvm_radix_test_clear_dirty(struct kvm *kvm,
				struct kvm_memory_slot *memslot, int pagenum)
{
	unsigned long gfn = memslot->base_gfn + pagenum;
	unsigned long gpa = gfn << PAGE_SHIFT;
	pte_t *ptep, pte;
	unsigned int shift;
	int ret = 0;
	unsigned long old, *rmapp;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return ret;

	/*
	 * For performance reasons we don't hold kvm->mmu_lock while walking the
	 * partition scoped table.
	 */
	ptep = find_kvm_secondary_pte_unlocked(kvm, gpa, &shift);
	if (!ptep)
		return 0;

	pte = READ_ONCE(*ptep);
	if (pte_present(pte) && pte_dirty(pte)) {
		spin_lock(&kvm->mmu_lock);
		/*
		 * Recheck the pte again
		 */
		if (pte_val(pte) != pte_val(*ptep)) {
			/*
			 * We have KVM_MEM_LOG_DIRTY_PAGES enabled. Hence we can
			 * only find PAGE_SIZE pte entries here. We can continue
			 * to use the pte addr returned by above page table
			 * walk.
			 */
			if (!pte_present(*ptep) || !pte_dirty(*ptep)) {
				spin_unlock(&kvm->mmu_lock);
				return 0;
			}
		}

		ret = 1;
		VM_BUG_ON(shift);
		old = kvmppc_radix_update_pte(kvm, ptep, _PAGE_DIRTY, 0,
					      gpa, shift);
		kvmppc_radix_tlbie_page(kvm, gpa, shift, kvm->arch.lpid);
		/* Also clear bit in ptes in shadow pgtable for nested guests */
		rmapp = &memslot->arch.rmap[gfn - memslot->base_gfn];
		kvmhv_update_nest_rmap_rc_list(kvm, rmapp, _PAGE_DIRTY, 0,
					       old & PTE_RPN_MASK,
					       1UL << shift);
		spin_unlock(&kvm->mmu_lock);
	}
	return ret;
}

long kvmppc_hv_get_dirty_log_radix(struct kvm *kvm,
			struct kvm_memory_slot *memslot, unsigned long *map)
{
	unsigned long i, j;
	int npages;

	for (i = 0; i < memslot->npages; i = j) {
		npages = kvm_radix_test_clear_dirty(kvm, memslot, i);

		/*
		 * Note that if npages > 0 then i must be a multiple of npages,
		 * since huge pages are only used to back the guest at guest
		 * real addresses that are a multiple of their size.
		 * Since we have at most one PTE covering any given guest
		 * real address, if npages > 1 we can skip to i + npages.
		 */
		j = i + 1;
		if (npages) {
			set_dirty_bits(map, i, npages);
			j = i + npages;
		}
	}
	return 0;
}

void kvmppc_radix_flush_memslot(struct kvm *kvm,
				const struct kvm_memory_slot *memslot)
{
	unsigned long n;
	pte_t *ptep;
	unsigned long gpa;
	unsigned int shift;

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_START)
		kvmppc_uvmem_drop_pages(memslot, kvm, true);

	if (kvm->arch.secure_guest & KVMPPC_SECURE_INIT_DONE)
		return;

	gpa = memslot->base_gfn << PAGE_SHIFT;
	spin_lock(&kvm->mmu_lock);
	for (n = memslot->npages; n; --n) {
		ptep = find_kvm_secondary_pte(kvm, gpa, &shift);
		if (ptep && pte_present(*ptep))
			kvmppc_unmap_pte(kvm, ptep, gpa, shift, memslot,
					 kvm->arch.lpid);
		gpa += PAGE_SIZE;
	}
	/*
	 * Increase the mmu notifier sequence number to prevent any page
	 * fault that read the memslot earlier from writing a PTE.
	 */
	kvm->mmu_invalidate_seq++;
	spin_unlock(&kvm->mmu_lock);
}

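/*
 * Record one supported page size for the KVM_PPC_GET_RMMU_INFO ioctl:
 * the AP (actual page size) field goes in the top three bits of the
 * 32-bit encoding and the page shift in the low bits.
 */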
static void add_rmmu_ap_encoding(struct kvm_ppc_rmmu_info *info,
				 int psize, int *indexp)
{
	if (!mmu_psize_defs[psize].shift)
		return;
	info->ap_encodings[*indexp] = mmu_psize_defs[psize].shift |
		(mmu_psize_defs[psize].ap << 29);
	++(*indexp);
}

int kvmhv_get_rmmu_info(struct kvm *kvm, struct kvm_ppc_rmmu_info *info)
{
	int i;

	if (!radix_enabled())
		return -EINVAL;
	memset(info, 0, sizeof(*info));

	/* 4k page size */
	info->geometries[0].page_shift = 12;
	info->geometries[0].level_bits[0] = 9;
	for (i = 1; i < 4; ++i)
		info->geometries[0].level_bits[i] = p9_supported_radix_bits[i];
	/* 64k page size */
	info->geometries[1].page_shift = 16;
	for (i = 0; i < 4; ++i)
		info->geometries[1].level_bits[i] = p9_supported_radix_bits[i];

	i = 0;
	add_rmmu_ap_encoding(info, MMU_PAGE_4K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_64K, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_2M, &i);
	add_rmmu_ap_encoding(info, MMU_PAGE_1G, &i);

	return 0;
}

int kvmppc_init_vm_radix(struct kvm *kvm)
{
	kvm->arch.pgtable = pgd_alloc(kvm->mm);
	if (!kvm->arch.pgtable)
		return -ENOMEM;
	return 0;
}

static void pte_ctor(void *addr)
{
	memset(addr, 0, RADIX_PTE_TABLE_SIZE);
}

static void pmd_ctor(void *addr)
{
	memset(addr, 0, RADIX_PMD_TABLE_SIZE);
}

struct debugfs_radix_state {
	struct kvm	*kvm;
	struct mutex	mutex;
	unsigned long	gpa;
	int		lpid;
	int		chars_left;
	int		buf_index;
	char		buf[128];
	u8		hdr;
};

static int debugfs_radix_open(struct inode *inode, struct file *file)
{
	struct kvm *kvm = inode->i_private;
	struct debugfs_radix_state *p;

	p = kzalloc(sizeof(*p), GFP_KERNEL);
	if (!p)
		return -ENOMEM;

	kvm_get_kvm(kvm);
	p->kvm = kvm;
	mutex_init(&p->mutex);
	file->private_data = p;

	return nonseekable_open(inode, file);
}

static int debugfs_radix_release(struct inode *inode, struct file *file)
{
	struct debugfs_radix_state *p = file->private_data;

	kvm_put_kvm(p->kvm);
	kfree(p);
	return 0;
}

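/*
 * Dump the partition-scoped tree(s) as one line per valid leaf PTE,
 * "<gpa>: <pte> <shift>", with a "pgdir:" header per tree; the L1
 * guest's own tree (lpid 0 here) is followed by any nested guests'
 * shadow trees.
 */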
static ssize_t debugfs_radix_read(struct file *file, char __user *buf,
				  size_t len, loff_t *ppos)
{
	struct debugfs_radix_state *p = file->private_data;
	ssize_t ret, r;
	unsigned long n;
	struct kvm *kvm;
	unsigned long gpa;
	pgd_t *pgt;
	struct kvm_nested_guest *nested;
	pgd_t *pgdp;
	p4d_t p4d, *p4dp;
	pud_t pud, *pudp;
	pmd_t pmd, *pmdp;
	pte_t *ptep;
	int shift;
	unsigned long pte;

	kvm = p->kvm;
	if (!kvm_is_radix(kvm))
		return 0;

	ret = mutex_lock_interruptible(&p->mutex);
	if (ret)
		return ret;

	if (p->chars_left) {
		n = p->chars_left;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf + p->buf_index, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index += n;
		buf += n;
		len -= n;
		ret = n;
		if (r) {
			if (!n)
				ret = -EFAULT;
			goto out;
		}
	}

	gpa = p->gpa;
	nested = NULL;
	pgt = NULL;
	while (len != 0 && p->lpid >= 0) {
		if (gpa >= RADIX_PGTABLE_RANGE) {
			gpa = 0;
			pgt = NULL;
			if (nested) {
				kvmhv_put_nested(nested);
				nested = NULL;
			}
			p->lpid = kvmhv_nested_next_lpid(kvm, p->lpid);
			p->hdr = 0;
			if (p->lpid < 0)
				break;
		}
		if (!pgt) {
			if (p->lpid == 0) {
				pgt = kvm->arch.pgtable;
			} else {
				nested = kvmhv_get_nested(kvm, p->lpid, false);
				if (!nested) {
					gpa = RADIX_PGTABLE_RANGE;
					continue;
				}
				pgt = nested->shadow_pgtable;
			}
		}
		n = 0;
		if (!p->hdr) {
			if (p->lpid > 0)
				n = scnprintf(p->buf, sizeof(p->buf),
					      "\nNested LPID %d: ", p->lpid);
			n += scnprintf(p->buf + n, sizeof(p->buf) - n,
				       "pgdir: %lx\n", (unsigned long)pgt);
			p->hdr = 1;
			goto copy;
		}

		pgdp = pgt + pgd_index(gpa);
		p4dp = p4d_offset(pgdp, gpa);
		p4d = READ_ONCE(*p4dp);
		if (!(p4d_val(p4d) & _PAGE_PRESENT)) {
			gpa = (gpa & P4D_MASK) + P4D_SIZE;
			continue;
		}

		pudp = pud_offset(&p4d, gpa);
		pud = READ_ONCE(*pudp);
		if (!(pud_val(pud) & _PAGE_PRESENT)) {
			gpa = (gpa & PUD_MASK) + PUD_SIZE;
			continue;
		}
		if (pud_val(pud) & _PAGE_PTE) {
			pte = pud_val(pud);
			shift = PUD_SHIFT;
			goto leaf;
		}

		pmdp = pmd_offset(&pud, gpa);
		pmd = READ_ONCE(*pmdp);
		if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
			gpa = (gpa & PMD_MASK) + PMD_SIZE;
			continue;
		}
		if (pmd_val(pmd) & _PAGE_PTE) {
			pte = pmd_val(pmd);
			shift = PMD_SHIFT;
			goto leaf;
		}

		ptep = pte_offset_kernel(&pmd, gpa);
		pte = pte_val(READ_ONCE(*ptep));
		if (!(pte & _PAGE_PRESENT)) {
			gpa += PAGE_SIZE;
			continue;
		}
		shift = PAGE_SHIFT;
	leaf:
		n = scnprintf(p->buf, sizeof(p->buf),
			      " %lx: %lx %d\n", gpa, pte, shift);
		gpa += 1ul << shift;
	copy:
		p->chars_left = n;
		if (n > len)
			n = len;
		r = copy_to_user(buf, p->buf, n);
		n -= r;
		p->chars_left -= n;
		p->buf_index = n;
		buf += n;
		len -= n;
		ret += n;
		if (r) {
			if (!ret)
				ret = -EFAULT;
			break;
		}
	}
	p->gpa = gpa;
	if (nested)
		kvmhv_put_nested(nested);

 out:
	mutex_unlock(&p->mutex);
	return ret;
}

static ssize_t debugfs_radix_write(struct file *file, const char __user *buf,
				   size_t len, loff_t *ppos)
{
	return -EACCES;
}

static const struct file_operations debugfs_radix_fops = {
	.owner	 = THIS_MODULE,
	.open	 = debugfs_radix_open,
	.release = debugfs_radix_release,
	.read	 = debugfs_radix_read,
	.write	 = debugfs_radix_write,
	.llseek	 = generic_file_llseek,
};

void kvmhv_radix_debugfs_init(struct kvm *kvm)
{
	debugfs_create_file("radix", 0400, kvm->debugfs_dentry, kvm,
			    &debugfs_radix_fops);
}

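/*
 * The fragment caches below are created with align == size so that
 * each table is naturally aligned; a radix page-directory base must
 * have its low (size) bits clear, as the walker above also checks
 * for guest-supplied tables.
 */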
int kvmppc_radix_init(void)
{
	unsigned long size = sizeof(void *) << RADIX_PTE_INDEX_SIZE;

	kvm_pte_cache = kmem_cache_create("kvm-pte", size, size, 0, pte_ctor);
	if (!kvm_pte_cache)
		return -ENOMEM;

	size = sizeof(void *) << RADIX_PMD_INDEX_SIZE;

	kvm_pmd_cache = kmem_cache_create("kvm-pmd", size, size, 0, pmd_ctor);
	if (!kvm_pmd_cache) {
		kmem_cache_destroy(kvm_pte_cache);
		return -ENOMEM;
	}

	return 0;
}

void kvmppc_radix_exit(void)
{
	kmem_cache_destroy(kvm_pte_cache);
	kmem_cache_destroy(kvm_pmd_cache);
}