// SPDX-License-Identifier: GPL-2.0

/*
 * Xen leaves the responsibility for maintaining p2m mappings to the
 * guests themselves, but it must also access and update the p2m array
 * during suspend/resume when all the pages are reallocated.
 *
 * The logical flat p2m table is mapped to a linear kernel memory area.
 * For accesses by Xen a three-level tree linked via mfns only is set up to
 * allow the address space to be sparse.
 *
 *               Xen
 *                |
 *          p2m_top_mfn
 *              /   \
 * p2m_mid_mfn p2m_mid_mfn
 *         /           /
 *  p2m p2m p2m ...
 *
 * The p2m_mid_mfn pages are mapped by p2m_top_mfn_p.
 *
 * The p2m_top_mfn level is limited to 1 page, so the maximum representable
 * pseudo-physical address space is:
 *  P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE pages
 *
 * P2M_PER_PAGE depends on the architecture, as an mfn is always
 * unsigned long (8 bytes on 64-bit, 4 bytes on 32-bit), leading to
 * 512 and 1024 entries respectively.
 *
 * In short, these structures contain the Machine Frame Number (MFN) of the
 * PFN.
 *
 * However, not all entries are filled with MFNs. Any leaf entry, or any top
 * or middle entry, that is void is assumed to be "missing". So (for example)
 *  pfn_to_mfn(0x90909090)=INVALID_P2M_ENTRY.
 * We have a dedicated page p2m_missing with all entries being
 * INVALID_P2M_ENTRY. This page may be referenced multiple times in the p2m
 * list/tree in case there are multiple areas with P2M_PER_PAGE invalid pfns.
 *
 * We also have the possibility of setting 1-1 mappings on certain regions,
 * so that:
 *  pfn_to_mfn(0xc0000)=0xc0000
 *
 * The benefit of this is that for non-RAM regions (think PCI BARs, or ACPI
 * spaces) we can create mappings easily, because we get the PFN value to
 * match the MFN.
 *
 * For this to work efficiently we have one new page p2m_identity. All entries
 * in p2m_identity are set to INVALID_P2M_ENTRY type (the Xen toolstack only
 * recognizes that and MFNs, no other fancy value).
 *
 * On lookup we spot that the entry points to p2m_identity and return the
 * identity value instead of dereferencing and returning INVALID_P2M_ENTRY.
 * If the entry points to an allocated page, we just proceed as before and
 * return the PFN. If the PFN has IDENTITY_FRAME_BIT set we unmask that in
 * appropriate functions (pfn_to_mfn).
 *
 * The reason for having the IDENTITY_FRAME_BIT instead of just returning the
 * PFN is that we could find ourselves where pfn_to_mfn(pfn)==pfn for a
 * non-identity pfn. To protect ourselves against that, we set (and check) the
 * IDENTITY_FRAME_BIT on all identity-mapped PFNs.
 */
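/*
 * Worked example of the limit above (illustrative only, assuming x86-64
 * with 4 KiB pages): P2M_PER_PAGE = 4096 / 8 = 512, and all three levels
 * then hold 512 entries each, so the maximum is 512 * 512 * 512 = 2^27
 * pages, i.e. a 512 GiB pseudo-physical address space.
 */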

#include <linux/init.h>
#include <linux/export.h>
#include <linux/list.h>
#include <linux/hash.h>
#include <linux/sched.h>
#include <linux/seq_file.h>
#include <linux/memblock.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

#include <asm/cache.h>
#include <asm/setup.h>
#include <linux/uaccess.h>

#include <asm/xen/page.h>
#include <asm/xen/hypercall.h>
#include <asm/xen/hypervisor.h>
#include <xen/balloon.h>
#include <xen/grant_table.h>

#include "multicalls.h"
#include "xen-ops.h"

#define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))
#define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))

#define MAX_P2M_PFN	(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)

#define PMDS_PER_MID_PAGE	(P2M_MID_PER_PAGE / PTRS_PER_PTE)
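
/*
 * Concrete values (illustrative, assuming x86-64 with 4 KiB pages and
 * PTRS_PER_PTE = 512): P2M_MID_PER_PAGE = P2M_TOP_PER_PAGE = 512, so
 * MAX_P2M_PFN = 2^27 and PMDS_PER_MID_PAGE = 512 / 512 = 1. P2M_PER_PAGE
 * itself is defined alongside the other p2m helpers in <asm/xen/page.h>.
 */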

unsigned long *xen_p2m_addr __read_mostly;
EXPORT_SYMBOL_GPL(xen_p2m_addr);
unsigned long xen_p2m_size __read_mostly;
EXPORT_SYMBOL_GPL(xen_p2m_size);
unsigned long xen_max_p2m_pfn __read_mostly;
EXPORT_SYMBOL_GPL(xen_max_p2m_pfn);

#ifdef CONFIG_XEN_MEMORY_HOTPLUG_LIMIT
#define P2M_LIMIT CONFIG_XEN_MEMORY_HOTPLUG_LIMIT
#else
#define P2M_LIMIT 0
#endif
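/* P2M_LIMIT is interpreted in GiB; see the sizing in xen_vmalloc_p2m_tree(). */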

static DEFINE_SPINLOCK(p2m_update_lock);

static unsigned long *p2m_mid_missing_mfn;
static unsigned long *p2m_top_mfn;
static unsigned long **p2m_top_mfn_p;
static unsigned long *p2m_missing;
static unsigned long *p2m_identity;
static pte_t *p2m_missing_pte;
static pte_t *p2m_identity_pte;

/*
 * Hint at last populated PFN.
 *
 * Used to set HYPERVISOR_shared_info->arch.max_pfn so the toolstack
 * can avoid scanning the whole P2M (which may be sized to account for
 * hotplugged memory).
 */
static unsigned long xen_p2m_last_pfn;

static inline unsigned p2m_top_index(unsigned long pfn)
{
	BUG_ON(pfn >= MAX_P2M_PFN);
	return pfn / (P2M_MID_PER_PAGE * P2M_PER_PAGE);
}

static inline unsigned p2m_mid_index(unsigned long pfn)
{
	return (pfn / P2M_PER_PAGE) % P2M_MID_PER_PAGE;
}
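
/*
 * Index decomposition example (illustrative, assuming 512 entries per
 * level): for pfn 0x12345 (74565), the leaf slot is pfn % 512 = 325, the
 * mid index is (74565 / 512) % 512 = 145, and the top index is
 * 74565 / (512 * 512) = 0.
 */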

static void p2m_top_mfn_init(unsigned long *top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = virt_to_mfn(p2m_mid_missing_mfn);
}

static void p2m_top_mfn_p_init(unsigned long **top)
{
	unsigned i;

	for (i = 0; i < P2M_TOP_PER_PAGE; i++)
		top[i] = p2m_mid_missing_mfn;
}

static void p2m_mid_mfn_init(unsigned long *mid, unsigned long *leaf)
{
	unsigned i;

	for (i = 0; i < P2M_MID_PER_PAGE; i++)
		mid[i] = virt_to_mfn(leaf);
}

static void p2m_init(unsigned long *p2m)
{
	unsigned i;

	for (i = 0; i < P2M_PER_PAGE; i++)
		p2m[i] = INVALID_P2M_ENTRY;
}

static void p2m_init_identity(unsigned long *p2m, unsigned long pfn)
{
	unsigned i;

	for (i = 0; i < P2M_PER_PAGE; i++)
		p2m[i] = IDENTITY_FRAME(pfn + i);
}

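/*
 * The __ref annotations below tell modpost that referencing memblock
 * routines (which live in init sections) from these helpers is intentional:
 * they are only taken on the !slab_is_available() path during early boot.
 */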
static void * __ref alloc_p2m_page(void)
{
	if (unlikely(!slab_is_available())) {
		void *ptr = memblock_alloc(PAGE_SIZE, PAGE_SIZE);

		if (!ptr)
			panic("%s: Failed to allocate %lu bytes align=0x%lx\n",
			      __func__, PAGE_SIZE, PAGE_SIZE);

		return ptr;
	}

	return (void *)__get_free_page(GFP_KERNEL);
}

static void __ref free_p2m_page(void *p)
{
	if (unlikely(!slab_is_available())) {
		memblock_free(p, PAGE_SIZE);
		return;
	}

	free_page((unsigned long)p);
}

/*
 * Build the parallel p2m_top_mfn and p2m_mid_mfn structures
 *
 * This is called both at boot time, and after resuming from suspend:
 * - At boot time we're called rather early, and must use memblock_alloc()
 *   to allocate memory.
 *
 * - After resume we're called from within stop_machine, but the mfn
 *   tree should already be completely allocated.
 */
void __ref xen_build_mfn_list_list(void)
{
	unsigned long pfn, mfn;
	pte_t *ptep;
	unsigned int level, topidx, mididx;
	unsigned long *mid_mfn_p;

	if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
		return;

	/* Pre-initialize p2m_top_mfn to be completely missing */
	if (p2m_top_mfn == NULL) {
		p2m_mid_missing_mfn = alloc_p2m_page();
		p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);

		p2m_top_mfn_p = alloc_p2m_page();
		p2m_top_mfn_p_init(p2m_top_mfn_p);

		p2m_top_mfn = alloc_p2m_page();
		p2m_top_mfn_init(p2m_top_mfn);
	} else {
		/* Reinitialise, mfn's all change after migration */
		p2m_mid_mfn_init(p2m_mid_missing_mfn, p2m_missing);
	}

	for (pfn = 0; pfn < xen_max_p2m_pfn && pfn < MAX_P2M_PFN;
	     pfn += P2M_PER_PAGE) {
		topidx = p2m_top_index(pfn);
		mididx = p2m_mid_index(pfn);

		mid_mfn_p = p2m_top_mfn_p[topidx];
		ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn),
				      &level);
		BUG_ON(!ptep || level != PG_LEVEL_4K);
		mfn = pte_mfn(*ptep);
		ptep = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));

		/* Don't bother allocating any mfn mid levels if
		 * they're just missing, just update the stored mfn,
		 * since all could have changed over a migrate.
		 */
		if (ptep == p2m_missing_pte || ptep == p2m_identity_pte) {
			BUG_ON(mididx);
			BUG_ON(mid_mfn_p != p2m_mid_missing_mfn);
			p2m_top_mfn[topidx] = virt_to_mfn(p2m_mid_missing_mfn);
			pfn += (P2M_MID_PER_PAGE - 1) * P2M_PER_PAGE;
			continue;
		}

		if (mid_mfn_p == p2m_mid_missing_mfn) {
			mid_mfn_p = alloc_p2m_page();
			p2m_mid_mfn_init(mid_mfn_p, p2m_missing);

			p2m_top_mfn_p[topidx] = mid_mfn_p;
		}

		p2m_top_mfn[topidx] = virt_to_mfn(mid_mfn_p);
		mid_mfn_p[mididx] = mfn;
	}
}

void xen_setup_mfn_list_list(void)
{
	BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info);

	if (xen_start_info->flags & SIF_VIRT_P2M_4TOOLS)
		HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = ~0UL;
	else
		HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list =
			virt_to_mfn(p2m_top_mfn);
	HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
	HYPERVISOR_shared_info->arch.p2m_generation = 0;
	HYPERVISOR_shared_info->arch.p2m_vaddr = (unsigned long)xen_p2m_addr;
	HYPERVISOR_shared_info->arch.p2m_cr3 =
		xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
}

/* Set up p2m_top to point to the domain-builder provided p2m pages */
void __init xen_build_dynamic_phys_to_machine(void)
{
	unsigned long pfn;

	xen_p2m_addr = (unsigned long *)xen_start_info->mfn_list;
	xen_p2m_size = ALIGN(xen_start_info->nr_pages, P2M_PER_PAGE);

	for (pfn = xen_start_info->nr_pages; pfn < xen_p2m_size; pfn++)
		xen_p2m_addr[pfn] = INVALID_P2M_ENTRY;

	xen_max_p2m_pfn = xen_p2m_size;
}

#define P2M_TYPE_IDENTITY	0
#define P2M_TYPE_MISSING	1
#define P2M_TYPE_PFN		2
#define P2M_TYPE_UNKNOWN	3

static int xen_p2m_elem_type(unsigned long pfn)
{
	unsigned long mfn;

	if (pfn >= xen_p2m_size)
		return P2M_TYPE_IDENTITY;

	mfn = xen_p2m_addr[pfn];

	if (mfn == INVALID_P2M_ENTRY)
		return P2M_TYPE_MISSING;

	if (mfn & IDENTITY_FRAME_BIT)
		return P2M_TYPE_IDENTITY;

	return P2M_TYPE_PFN;
}

static void __init xen_rebuild_p2m_list(unsigned long *p2m)
{
	unsigned int i, chunk;
	unsigned long pfn;
	unsigned long *mfns;
	pte_t *ptep;
	pmd_t *pmdp;
	int type;

	p2m_missing = alloc_p2m_page();
	p2m_init(p2m_missing);
	p2m_identity = alloc_p2m_page();
	p2m_init(p2m_identity);

	p2m_missing_pte = alloc_p2m_page();
	paravirt_alloc_pte(&init_mm, __pa(p2m_missing_pte) >> PAGE_SHIFT);
	p2m_identity_pte = alloc_p2m_page();
	paravirt_alloc_pte(&init_mm, __pa(p2m_identity_pte) >> PAGE_SHIFT);
	for (i = 0; i < PTRS_PER_PTE; i++) {
		set_pte(p2m_missing_pte + i,
			pfn_pte(PFN_DOWN(__pa(p2m_missing)), PAGE_KERNEL_RO));
		set_pte(p2m_identity_pte + i,
			pfn_pte(PFN_DOWN(__pa(p2m_identity)), PAGE_KERNEL_RO));
	}

	for (pfn = 0; pfn < xen_max_p2m_pfn; pfn += chunk) {
		/*
		 * Try to map missing/identity PMDs or p2m-pages if possible.
		 * We have to respect the structure of the mfn_list_list
		 * which will be built just afterwards.
		 * Chunk size to test is one p2m page if we are in the middle
		 * of a mfn_list_list mid page and the complete mid page area
		 * if we are at index 0 of the mid page. Please note that a
		 * mid page might cover more than one PMD, e.g. on 32 bit PAE
		 * kernels.
		 */
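		/*
		 * Illustrative numbers (assuming x86-64): a mid page covers
		 * P2M_PER_PAGE * P2M_MID_PER_PAGE = 512 * 512 = 262144 pfns
		 * (1 GiB), while a single p2m page covers 512 pfns (2 MiB),
		 * so chunk below is either 262144 or 512.
		 */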
		chunk = (pfn & (P2M_PER_PAGE * P2M_MID_PER_PAGE - 1)) ?
			P2M_PER_PAGE : P2M_PER_PAGE * P2M_MID_PER_PAGE;

		type = xen_p2m_elem_type(pfn);
		i = 0;
		if (type != P2M_TYPE_PFN)
			for (i = 1; i < chunk; i++)
				if (xen_p2m_elem_type(pfn + i) != type)
					break;
		if (i < chunk)
			/* Reset to minimal chunk size. */
			chunk = P2M_PER_PAGE;

		if (type == P2M_TYPE_PFN || i < chunk) {
			/* Use initial p2m page contents. */
			mfns = alloc_p2m_page();
			copy_page(mfns, xen_p2m_addr + pfn);
			ptep = populate_extra_pte((unsigned long)(p2m + pfn));
			set_pte(ptep,
				pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL));
			continue;
		}

		if (chunk == P2M_PER_PAGE) {
			/* Map complete missing or identity p2m-page. */
			mfns = (type == P2M_TYPE_MISSING) ?
				p2m_missing : p2m_identity;
			ptep = populate_extra_pte((unsigned long)(p2m + pfn));
			set_pte(ptep,
				pfn_pte(PFN_DOWN(__pa(mfns)), PAGE_KERNEL_RO));
			continue;
		}

		/* Complete missing or identity PMD(s) can be mapped. */
		ptep = (type == P2M_TYPE_MISSING) ?
			p2m_missing_pte : p2m_identity_pte;
		for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
			pmdp = populate_extra_pmd(
				(unsigned long)(p2m + pfn) + i * PMD_SIZE);
			set_pmd(pmdp, __pmd(__pa(ptep) | _KERNPG_TABLE));
		}
	}
}

void __init xen_vmalloc_p2m_tree(void)
{
	static struct vm_struct vm;
	unsigned long p2m_limit;

	xen_p2m_last_pfn = xen_max_p2m_pfn;

	p2m_limit = (phys_addr_t)P2M_LIMIT * 1024 * 1024 * 1024 / PAGE_SIZE;
	vm.flags = VM_ALLOC;
	vm.size = ALIGN(sizeof(unsigned long) * max(xen_max_p2m_pfn, p2m_limit),
			PMD_SIZE * PMDS_PER_MID_PAGE);
	vm_area_register_early(&vm, PMD_SIZE * PMDS_PER_MID_PAGE);
	pr_notice("p2m virtual area at %p, size is %lx\n", vm.addr, vm.size);

	xen_max_p2m_pfn = vm.size / sizeof(unsigned long);

	xen_rebuild_p2m_list(vm.addr);

	xen_p2m_addr = vm.addr;
	xen_p2m_size = xen_max_p2m_pfn;

	xen_inv_extra_mem();
}

unsigned long get_phys_to_machine(unsigned long pfn)
{
	pte_t *ptep;
	unsigned int level;

	if (unlikely(pfn >= xen_p2m_size)) {
		if (pfn < xen_max_p2m_pfn)
			return xen_chk_extra_mem(pfn);

		return IDENTITY_FRAME(pfn);
	}

	ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
	BUG_ON(!ptep || level != PG_LEVEL_4K);

	/*
	 * The INVALID_P2M_ENTRY is filled in both p2m_*identity
	 * and in p2m_*missing, so returning the INVALID_P2M_ENTRY
	 * would be wrong.
	 */
	if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
		return IDENTITY_FRAME(pfn);

	return xen_p2m_addr[pfn];
}
EXPORT_SYMBOL_GPL(get_phys_to_machine);
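
/*
 * Note: callers wanting a plain frame number rather than the flagged value
 * go through pfn_to_mfn() (<asm/xen/page.h>), which strips
 * IDENTITY_FRAME_BIT/FOREIGN_FRAME_BIT from non-invalid entries, per the
 * identity-map discussion at the top of this file.
 */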

/*
 * Allocate new pmd(s). It is checked whether the old pmd is still in place.
 * If not, nothing is changed. This is okay as the only reason for allocating
 * a new pmd is to replace p2m_missing_pte or p2m_identity_pte by an
 * individual pmd.
 */
static pte_t *alloc_p2m_pmd(unsigned long addr, pte_t *pte_pg)
{
	pte_t *ptechk;
	pte_t *pte_newpg[PMDS_PER_MID_PAGE];
	pmd_t *pmdp;
	unsigned int level;
	unsigned long flags;
	unsigned long vaddr;
	int i;

	/* Do all allocations first to bail out in error case. */
	for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
		pte_newpg[i] = alloc_p2m_page();
		if (!pte_newpg[i]) {
			for (i--; i >= 0; i--)
				free_p2m_page(pte_newpg[i]);

			return NULL;
		}
	}

	vaddr = addr & ~(PMD_SIZE * PMDS_PER_MID_PAGE - 1);

	for (i = 0; i < PMDS_PER_MID_PAGE; i++) {
		copy_page(pte_newpg[i], pte_pg);
		paravirt_alloc_pte(&init_mm, __pa(pte_newpg[i]) >> PAGE_SHIFT);

		pmdp = lookup_pmd_address(vaddr);
		BUG_ON(!pmdp);

		spin_lock_irqsave(&p2m_update_lock, flags);

		ptechk = lookup_address(vaddr, &level);
		if (ptechk == pte_pg) {
			HYPERVISOR_shared_info->arch.p2m_generation++;
			wmb(); /* Tools are synchronizing via p2m_generation. */
			set_pmd(pmdp,
				__pmd(__pa(pte_newpg[i]) | _KERNPG_TABLE));
			wmb(); /* Tools are synchronizing via p2m_generation. */
			HYPERVISOR_shared_info->arch.p2m_generation++;
			pte_newpg[i] = NULL;
		}

		spin_unlock_irqrestore(&p2m_update_lock, flags);

		if (pte_newpg[i]) {
			paravirt_release_pte(__pa(pte_newpg[i]) >> PAGE_SHIFT);
			free_p2m_page(pte_newpg[i]);
		}

		vaddr += PMD_SIZE;
	}

	return lookup_address(addr, &level);
}

/*
 * Fully allocate the p2m structure for a given pfn. We need to check
 * that both the top and mid levels are allocated, and make sure the
 * parallel mfn tree is kept in sync. We may race with other cpus, so
 * the new pages are installed with cmpxchg; if we lose the race then
 * simply free the page we allocated and use the one that's there.
 */
int xen_alloc_p2m_entry(unsigned long pfn)
{
	unsigned topidx;
	unsigned long *top_mfn_p, *mid_mfn;
	pte_t *ptep, *pte_pg;
	unsigned int level;
	unsigned long flags;
	unsigned long addr = (unsigned long)(xen_p2m_addr + pfn);
	unsigned long p2m_pfn;

	ptep = lookup_address(addr, &level);
	BUG_ON(!ptep || level != PG_LEVEL_4K);
	pte_pg = (pte_t *)((unsigned long)ptep & ~(PAGE_SIZE - 1));

	if (pte_pg == p2m_missing_pte || pte_pg == p2m_identity_pte) {
		/* PMD level is missing, allocate a new one */
		ptep = alloc_p2m_pmd(addr, pte_pg);
		if (!ptep)
			return -ENOMEM;
	}

	if (p2m_top_mfn && pfn < MAX_P2M_PFN) {
		topidx = p2m_top_index(pfn);
		top_mfn_p = &p2m_top_mfn[topidx];
		mid_mfn = READ_ONCE(p2m_top_mfn_p[topidx]);

		BUG_ON(virt_to_mfn(mid_mfn) != *top_mfn_p);

		if (mid_mfn == p2m_mid_missing_mfn) {
			/* Separately check the mid mfn level */
			unsigned long missing_mfn;
			unsigned long mid_mfn_mfn;
			unsigned long old_mfn;

			mid_mfn = alloc_p2m_page();
			if (!mid_mfn)
				return -ENOMEM;

			p2m_mid_mfn_init(mid_mfn, p2m_missing);

			missing_mfn = virt_to_mfn(p2m_mid_missing_mfn);
			mid_mfn_mfn = virt_to_mfn(mid_mfn);
			old_mfn = cmpxchg(top_mfn_p, missing_mfn, mid_mfn_mfn);
			if (old_mfn != missing_mfn) {
				free_p2m_page(mid_mfn);
				mid_mfn = mfn_to_virt(old_mfn);
			} else {
				p2m_top_mfn_p[topidx] = mid_mfn;
			}
		}
	} else {
		mid_mfn = NULL;
	}

	p2m_pfn = pte_pfn(READ_ONCE(*ptep));
	if (p2m_pfn == PFN_DOWN(__pa(p2m_identity)) ||
	    p2m_pfn == PFN_DOWN(__pa(p2m_missing))) {
		/* p2m leaf page is missing */
		unsigned long *p2m;

		p2m = alloc_p2m_page();
		if (!p2m)
			return -ENOMEM;

		if (p2m_pfn == PFN_DOWN(__pa(p2m_missing)))
			p2m_init(p2m);
		else
			p2m_init_identity(p2m, pfn & ~(P2M_PER_PAGE - 1));

		spin_lock_irqsave(&p2m_update_lock, flags);

		if (pte_pfn(*ptep) == p2m_pfn) {
			HYPERVISOR_shared_info->arch.p2m_generation++;
			wmb(); /* Tools are synchronizing via p2m_generation. */
			set_pte(ptep,
				pfn_pte(PFN_DOWN(__pa(p2m)), PAGE_KERNEL));
			wmb(); /* Tools are synchronizing via p2m_generation. */
			HYPERVISOR_shared_info->arch.p2m_generation++;
			if (mid_mfn)
				mid_mfn[p2m_mid_index(pfn)] = virt_to_mfn(p2m);
			p2m = NULL;
		}

		spin_unlock_irqrestore(&p2m_update_lock, flags);

		if (p2m)
			free_p2m_page(p2m);
	}

	/* Expanded the p2m? */
	if (pfn >= xen_p2m_last_pfn) {
		xen_p2m_last_pfn = ALIGN(pfn + 1, P2M_PER_PAGE);
		HYPERVISOR_shared_info->arch.max_pfn = xen_p2m_last_pfn;
	}

	return 0;
}
EXPORT_SYMBOL(xen_alloc_p2m_entry);

unsigned long __init set_phys_range_identity(unsigned long pfn_s,
					     unsigned long pfn_e)
{
	unsigned long pfn;

	if (unlikely(pfn_s >= xen_p2m_size))
		return 0;

	if (pfn_s > pfn_e)
		return 0;

	if (pfn_e > xen_p2m_size)
		pfn_e = xen_p2m_size;

	for (pfn = pfn_s; pfn < pfn_e; pfn++)
		xen_p2m_addr[pfn] = IDENTITY_FRAME(pfn);

	return pfn - pfn_s;
}

bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	pte_t *ptep;
	unsigned int level;

	/* Only invalid entries allowed above the highest p2m covered frame. */
	if (unlikely(pfn >= xen_p2m_size))
		return mfn == INVALID_P2M_ENTRY;

	/*
	 * The interface requires atomic updates on p2m elements.
	 * xen_safe_write_ulong() is using an atomic store via asm().
	 */
	if (likely(!xen_safe_write_ulong(xen_p2m_addr + pfn, mfn)))
		return true;

	ptep = lookup_address((unsigned long)(xen_p2m_addr + pfn), &level);
	BUG_ON(!ptep || level != PG_LEVEL_4K);

	if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_missing)))
		return mfn == INVALID_P2M_ENTRY;

	if (pte_pfn(*ptep) == PFN_DOWN(__pa(p2m_identity)))
		return mfn == IDENTITY_FRAME(pfn);

	return false;
}

bool set_phys_to_machine(unsigned long pfn, unsigned long mfn)
{
	if (unlikely(!__set_phys_to_machine(pfn, mfn))) {
		int ret;

		ret = xen_alloc_p2m_entry(pfn);
		if (ret < 0)
			return false;

		return __set_phys_to_machine(pfn, mfn);
	}

	return true;
}

int set_foreign_p2m_mapping(struct gnttab_map_grant_ref *map_ops,
			    struct gnttab_map_grant_ref *kmap_ops,
			    struct page **pages, unsigned int count)
{
	int i, ret = 0;
	pte_t *pte;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	if (kmap_ops) {
		ret = HYPERVISOR_grant_table_op(GNTTABOP_map_grant_ref,
						kmap_ops, count);
		if (ret)
			goto out;
	}

	for (i = 0; i < count; i++) {
		unsigned long mfn, pfn;
		struct gnttab_unmap_grant_ref unmap[2];
		int rc;

		/* Do not add to override if the map failed. */
		if (map_ops[i].status != GNTST_okay ||
		    (kmap_ops && kmap_ops[i].status != GNTST_okay))
			continue;

		if (map_ops[i].flags & GNTMAP_contains_pte) {
			pte = (pte_t *)(mfn_to_virt(PFN_DOWN(map_ops[i].host_addr)) +
				(map_ops[i].host_addr & ~PAGE_MASK));
			mfn = pte_mfn(*pte);
		} else {
			mfn = PFN_DOWN(map_ops[i].dev_bus_addr);
		}
		pfn = page_to_pfn(pages[i]);

		WARN(pfn_to_mfn(pfn) != INVALID_P2M_ENTRY, "page must be ballooned");

		if (likely(set_phys_to_machine(pfn, FOREIGN_FRAME(mfn))))
			continue;

		/*
		 * Signal an error for this slot. This in turn requires
		 * immediate unmapping.
		 */
		map_ops[i].status = GNTST_general_error;
		unmap[0].host_addr = map_ops[i].host_addr;
		unmap[0].handle = map_ops[i].handle;
		map_ops[i].handle = INVALID_GRANT_HANDLE;
		if (map_ops[i].flags & GNTMAP_device_map)
			unmap[0].dev_bus_addr = map_ops[i].dev_bus_addr;
		else
			unmap[0].dev_bus_addr = 0;

		if (kmap_ops) {
			kmap_ops[i].status = GNTST_general_error;
			unmap[1].host_addr = kmap_ops[i].host_addr;
			unmap[1].handle = kmap_ops[i].handle;
			kmap_ops[i].handle = INVALID_GRANT_HANDLE;
			if (kmap_ops[i].flags & GNTMAP_device_map)
				unmap[1].dev_bus_addr = kmap_ops[i].dev_bus_addr;
			else
				unmap[1].dev_bus_addr = 0;
		}

		/*
		 * Pre-populate both status fields, to be recognizable in
		 * the log message below.
		 */
		unmap[0].status = 1;
		unmap[1].status = 1;

		rc = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
					       unmap, 1 + !!kmap_ops);
		if (rc || unmap[0].status != GNTST_okay ||
		    unmap[1].status != GNTST_okay)
			pr_err_once("gnttab unmap failed: rc=%d st0=%d st1=%d\n",
				    rc, unmap[0].status, unmap[1].status);
	}

out:
	return ret;
}

int clear_foreign_p2m_mapping(struct gnttab_unmap_grant_ref *unmap_ops,
			      struct gnttab_unmap_grant_ref *kunmap_ops,
			      struct page **pages, unsigned int count)
{
	int i, ret = 0;

	if (xen_feature(XENFEAT_auto_translated_physmap))
		return 0;

	for (i = 0; i < count; i++) {
		unsigned long mfn = __pfn_to_mfn(page_to_pfn(pages[i]));
		unsigned long pfn = page_to_pfn(pages[i]);

		if (mfn != INVALID_P2M_ENTRY && (mfn & FOREIGN_FRAME_BIT))
			set_phys_to_machine(pfn, INVALID_P2M_ENTRY);
		else
			ret = -EINVAL;
	}
	if (kunmap_ops)
		ret = HYPERVISOR_grant_table_op(GNTTABOP_unmap_grant_ref,
						kunmap_ops, count) ?: ret;

	return ret;
}

#ifdef CONFIG_XEN_DEBUG_FS
#include <linux/debugfs.h>
#include "debugfs.h"
static int p2m_dump_show(struct seq_file *m, void *v)
{
	static const char * const type_name[] = {
		[P2M_TYPE_IDENTITY] = "identity",
		[P2M_TYPE_MISSING] = "missing",
		[P2M_TYPE_PFN] = "pfn",
		[P2M_TYPE_UNKNOWN] = "abnormal"};
	unsigned long pfn, first_pfn;
	int type, prev_type;

	prev_type = xen_p2m_elem_type(0);
	first_pfn = 0;

	for (pfn = 0; pfn < xen_p2m_size; pfn++) {
		type = xen_p2m_elem_type(pfn);
		if (type != prev_type) {
			seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn,
				   type_name[prev_type]);
			prev_type = type;
			first_pfn = pfn;
		}
	}
	seq_printf(m, " [0x%lx->0x%lx] %s\n", first_pfn, pfn,
		   type_name[prev_type]);
	return 0;
}

DEFINE_SHOW_ATTRIBUTE(p2m_dump);

static struct dentry *d_mmu_debug;

static int __init xen_p2m_debugfs(void)
{
	struct dentry *d_xen = xen_init_debugfs();

	d_mmu_debug = debugfs_create_dir("mmu", d_xen);

	debugfs_create_file("p2m", 0600, d_mmu_debug, NULL, &p2m_dump_fops);
	return 0;
}
fs_initcall(xen_p2m_debugfs);
#endif /* CONFIG_XEN_DEBUG_FS */