1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | /* |
4 | * Xen mmu operations |
5 | * |
6 | * This file contains the various mmu fetch and update operations. |
7 | * The most important job they must perform is the mapping between the |
8 | * domain's pfn and the overall machine mfns. |
9 | * |
10 | * Xen allows guests to directly update the pagetable, in a controlled |
11 | * fashion. In other words, the guest modifies the same pagetable |
12 | * that the CPU actually uses, which eliminates the overhead of having |
13 | * a separate shadow pagetable. |
14 | * |
15 | * In order to allow this, it falls on the guest domain to map its |
16 | * notion of a "physical" pfn - which is just a domain-local linear |
17 | * address - into a real "machine address" which the CPU's MMU can |
18 | * use. |
19 | * |
20 | * A pgd_t/pmd_t/pte_t will typically contain an mfn, and so can be |
21 | * inserted directly into the pagetable. When creating a new |
22 | * pte/pmd/pgd, it converts the passed pfn into an mfn. Conversely, |
23 | * when reading the content back with __(pgd|pmd|pte)_val, it converts |
24 | * the mfn back into a pfn. |
25 | * |
26 | * The other constraint is that all pages which make up a pagetable |
27 | * must be mapped read-only in the guest. This prevents uncontrolled |
28 | * guest updates to the pagetable. Xen strictly enforces this, and |
29 | * will disallow any pagetable update which will end up mapping a |
30 | * pagetable page RW, and will disallow using any writable page as a |
31 | * pagetable. |
32 | * |
33 | * Naively, when loading %cr3 with the base of a new pagetable, Xen |
34 | * would need to validate the whole pagetable before going on. |
35 | * Naturally, this is quite slow. The solution is to "pin" a |
36 | * pagetable, which enforces all the constraints on the pagetable even |
37 | * when it is not actively in use. This means that Xen can be assured |
38 | * that it is still valid when you do load it into %cr3, and doesn't |
39 | * need to revalidate it. |
40 | * |
41 | * Jeremy Fitzhardinge <jeremy@xensource.com>, XenSource Inc, 2007 |
42 | */ |
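/*
 * Illustrative sketch (comment only, not part of the build): a present pte
 * holds an mfn in hardware while the rest of the kernel reasons in pfns.
 * Using the conversion helpers defined later in this file, and assuming the
 * pfn has a valid p2m entry:
 *
 *	pteval_t hw = pte_pfn_to_mfn((pfn << PAGE_SHIFT) | pgprot_val(PAGE_KERNEL));
 *	pteval_t sw = pte_mfn_to_pfn(hw);	// recovers the pfn-based value
 *
 * xen_make_pte()/xen_pte_val() below are thin wrappers around these
 * conversions.
 */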
43 | #include <linux/sched/mm.h> |
44 | #include <linux/debugfs.h> |
45 | #include <linux/bug.h> |
46 | #include <linux/vmalloc.h> |
47 | #include <linux/export.h> |
48 | #include <linux/init.h> |
49 | #include <linux/gfp.h> |
50 | #include <linux/memblock.h> |
51 | #include <linux/seq_file.h> |
52 | #include <linux/crash_dump.h> |
53 | #include <linux/pgtable.h> |
54 | #ifdef CONFIG_KEXEC_CORE |
55 | #include <linux/kexec.h> |
56 | #endif |
57 | |
58 | #include <trace/events/xen.h> |
59 | |
60 | #include <asm/tlbflush.h> |
61 | #include <asm/fixmap.h> |
62 | #include <asm/mmu_context.h> |
63 | #include <asm/setup.h> |
64 | #include <asm/paravirt.h> |
65 | #include <asm/e820/api.h> |
66 | #include <asm/linkage.h> |
67 | #include <asm/page.h> |
68 | #include <asm/init.h> |
69 | #include <asm/memtype.h> |
70 | #include <asm/smp.h> |
71 | #include <asm/tlb.h> |
72 | |
73 | #include <asm/xen/hypercall.h> |
74 | #include <asm/xen/hypervisor.h> |
75 | |
76 | #include <xen/xen.h> |
77 | #include <xen/page.h> |
78 | #include <xen/interface/xen.h> |
79 | #include <xen/interface/hvm/hvm_op.h> |
80 | #include <xen/interface/version.h> |
81 | #include <xen/interface/memory.h> |
82 | #include <xen/hvc-console.h> |
83 | #include <xen/swiotlb-xen.h> |
84 | |
85 | #include "multicalls.h" |
86 | #include "mmu.h" |
87 | #include "debugfs.h" |
88 | |
89 | /* |
90 | * Prototypes for functions called via PV_CALLEE_SAVE_REGS_THUNK() in order |
91 | * to avoid warnings with "-Wmissing-prototypes". |
92 | */ |
93 | pteval_t xen_pte_val(pte_t pte); |
94 | pgdval_t xen_pgd_val(pgd_t pgd); |
95 | pmdval_t xen_pmd_val(pmd_t pmd); |
96 | pudval_t xen_pud_val(pud_t pud); |
97 | p4dval_t xen_p4d_val(p4d_t p4d); |
98 | pte_t xen_make_pte(pteval_t pte); |
99 | pgd_t xen_make_pgd(pgdval_t pgd); |
100 | pmd_t xen_make_pmd(pmdval_t pmd); |
101 | pud_t xen_make_pud(pudval_t pud); |
102 | p4d_t xen_make_p4d(p4dval_t p4d); |
103 | pte_t xen_make_pte_init(pteval_t pte); |
104 | |
105 | #ifdef CONFIG_X86_VSYSCALL_EMULATION |
106 | /* l3 pud for userspace vsyscall mapping */ |
107 | static pud_t level3_user_vsyscall[PTRS_PER_PUD] __page_aligned_bss; |
108 | #endif |
109 | |
110 | /* |
111 | * Protects atomic reservation decrease/increase against concurrent increases. |
112 | * Also protects non-atomic updates of current_pages and balloon lists. |
113 | */ |
114 | static DEFINE_SPINLOCK(xen_reservation_lock); |
115 | |
116 | /* |
117 | * Note about cr3 (pagetable base) values: |
118 | * |
119 | * xen_cr3 contains the current logical cr3 value; it contains the |
120 | * last set cr3. This may not be the current effective cr3, because |
121 | * its update may be being lazily deferred. However, a vcpu looking |
122 | * at its own cr3 can use this value knowing that everything will |
123 | * be self-consistent. |
124 | * |
125 | * xen_current_cr3 contains the actual vcpu cr3; it is set once the |
126 | * hypercall to set the vcpu cr3 is complete (so it may be a little |
127 | * out of date, but it will never be set early). If one vcpu is |
128 | * looking at another vcpu's cr3 value, it should use this variable. |
129 | */ |
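/*
 * For example (illustration only): xen_drop_mm_ref() below decides whether a
 * remote vcpu may still be using a pagetable by comparing that cpu's
 * per_cpu(xen_current_cr3, cpu) against __pa(mm->pgd); the logical xen_cr3
 * is only trustworthy on the cpu that owns it.
 */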
130 | DEFINE_PER_CPU(unsigned long, xen_cr3); /* cr3 stored as physaddr */ |
131 | DEFINE_PER_CPU(unsigned long, xen_current_cr3); /* actual vcpu cr3 */ |
132 | |
133 | static phys_addr_t xen_pt_base, xen_pt_size __initdata; |
134 | |
135 | static DEFINE_STATIC_KEY_FALSE(xen_struct_pages_ready); |
136 | |
137 | /* |
138 | * Just beyond the highest usermode address. STACK_TOP_MAX has a |
139 | * redzone above it, so round it up to a PGD boundary. |
140 | */ |
141 | #define USER_LIMIT ((STACK_TOP_MAX + PGDIR_SIZE - 1) & PGDIR_MASK) |
142 | |
143 | void make_lowmem_page_readonly(void *vaddr) |
144 | { |
145 | pte_t *pte, ptev; |
146 | unsigned long address = (unsigned long)vaddr; |
147 | unsigned int level; |
148 | |
149 | pte = lookup_address(address, &level); |
150 | if (pte == NULL) |
151 | return; /* vaddr missing */ |
152 | |
153 | ptev = pte_wrprotect(*pte); |
154 | |
155 | if (HYPERVISOR_update_va_mapping(address, ptev, 0)) |
156 | BUG(); |
157 | } |
158 | |
159 | void make_lowmem_page_readwrite(void *vaddr) |
160 | { |
161 | pte_t *pte, ptev; |
162 | unsigned long address = (unsigned long)vaddr; |
163 | unsigned int level; |
164 | |
165 | pte = lookup_address(address, &level); |
166 | if (pte == NULL) |
167 | return; /* vaddr missing */ |
168 | |
169 | ptev = pte_mkwrite_novma(*pte); |
170 | |
171 | if (HYPERVISOR_update_va_mapping(address, ptev, 0)) |
172 | BUG(); |
173 | } |
174 | |
175 | |
176 | /* |
177 | * During early boot all page table pages are pinned, but we do not have struct |
178 | * pages, so return true until struct pages are ready. |
179 | */ |
180 | static bool xen_page_pinned(void *ptr) |
181 | { |
182 | if (static_branch_likely(&xen_struct_pages_ready)) { |
183 | struct page *page = virt_to_page(ptr); |
184 | |
185 | return PagePinned(page); |
186 | } |
187 | return true; |
188 | } |
189 | |
190 | static void xen_extend_mmu_update(const struct mmu_update *update) |
191 | { |
192 | struct multicall_space mcs; |
193 | struct mmu_update *u; |
194 | |
195 | mcs = xen_mc_extend_args(__HYPERVISOR_mmu_update, sizeof(*u)); |
196 | |
197 | if (mcs.mc != NULL) { |
198 | mcs.mc->args[1]++; |
199 | } else { |
200 | mcs = __xen_mc_entry(sizeof(*u)); |
201 | MULTI_mmu_update(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); |
202 | } |
203 | |
204 | u = mcs.args; |
205 | *u = *update; |
206 | } |
207 | |
208 | static void xen_extend_mmuext_op(const struct mmuext_op *op) |
209 | { |
210 | struct multicall_space mcs; |
211 | struct mmuext_op *u; |
212 | |
213 | mcs = xen_mc_extend_args(__HYPERVISOR_mmuext_op, sizeof(*u)); |
214 | |
215 | if (mcs.mc != NULL) { |
216 | mcs.mc->args[1]++; |
217 | } else { |
218 | mcs = __xen_mc_entry(sizeof(*u)); |
219 | MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); |
220 | } |
221 | |
222 | u = mcs.args; |
223 | *u = *op; |
224 | } |
225 | |
226 | static void xen_set_pmd_hyper(pmd_t *ptr, pmd_t val) |
227 | { |
228 | struct mmu_update u; |
229 | |
230 | preempt_disable(); |
231 | |
232 | xen_mc_batch(); |
233 | |
234 | /* ptr may be ioremapped for 64-bit pagetable setup */ |
235 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; |
236 | u.val = pmd_val_ma(val); |
237 | xen_extend_mmu_update(&u); |
238 | |
239 | xen_mc_issue(XEN_LAZY_MMU); |
240 | |
241 | preempt_enable(); |
242 | } |
243 | |
244 | static void xen_set_pmd(pmd_t *ptr, pmd_t val) |
245 | { |
246 | trace_xen_mmu_set_pmd(ptr, val); |
247 | |
248 | /* If page is not pinned, we can just update the entry |
249 | directly */ |
250 | if (!xen_page_pinned(ptr)) { |
251 | *ptr = val; |
252 | return; |
253 | } |
254 | |
255 | xen_set_pmd_hyper(ptr, val); |
256 | } |
257 | |
258 | /* |
259 | * Associate a virtual page frame with a given physical page frame |
260 | * and protection flags for that frame. |
261 | */ |
262 | void __init set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) |
263 | { |
264 | if (HYPERVISOR_update_va_mapping(vaddr, mfn_pte(mfn, flags), |
265 | UVMF_INVLPG)) |
266 | BUG(); |
267 | } |
268 | |
269 | static bool xen_batched_set_pte(pte_t *ptep, pte_t pteval) |
270 | { |
271 | struct mmu_update u; |
272 | |
273 | if (xen_get_lazy_mode() != XEN_LAZY_MMU) |
274 | return false; |
275 | |
276 | xen_mc_batch(); |
277 | |
278 | u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; |
279 | u.val = pte_val_ma(pteval); |
280 | xen_extend_mmu_update(&u); |
281 | |
282 | xen_mc_issue(XEN_LAZY_MMU); |
283 | |
284 | return true; |
285 | } |
286 | |
287 | static inline void __xen_set_pte(pte_t *ptep, pte_t pteval) |
288 | { |
289 | if (!xen_batched_set_pte(ptep, pteval)) { |
290 | /* |
291 | * Could call native_set_pte() here and trap and |
292 | * emulate the PTE write, but a hypercall is much cheaper. |
293 | */ |
294 | struct mmu_update u; |
295 | |
296 | u.ptr = virt_to_machine(ptep).maddr | MMU_NORMAL_PT_UPDATE; |
297 | u.val = pte_val_ma(pteval); |
298 | HYPERVISOR_mmu_update(&u, 1, NULL, DOMID_SELF); |
299 | } |
300 | } |
301 | |
302 | static void xen_set_pte(pte_t *ptep, pte_t pteval) |
303 | { |
304 | trace_xen_mmu_set_pte(ptep, pteval); |
305 | __xen_set_pte(ptep, pteval); |
306 | } |
307 | |
308 | pte_t xen_ptep_modify_prot_start(struct vm_area_struct *vma, |
309 | unsigned long addr, pte_t *ptep) |
310 | { |
311 | /* Just return the pte as-is. We preserve the bits on commit */ |
312 | trace_xen_mmu_ptep_modify_prot_start(vma->vm_mm, addr, ptep, *ptep); |
313 | return *ptep; |
314 | } |
315 | |
316 | void xen_ptep_modify_prot_commit(struct vm_area_struct *vma, unsigned long addr, |
317 | pte_t *ptep, pte_t pte) |
318 | { |
319 | struct mmu_update u; |
320 | |
321 | trace_xen_mmu_ptep_modify_prot_commit(vma->vm_mm, addr, ptep, pte); |
322 | xen_mc_batch(); |
323 | |
324 | u.ptr = virt_to_machine(ptep).maddr | MMU_PT_UPDATE_PRESERVE_AD; |
325 | u.val = pte_val_ma(pte); |
326 | xen_extend_mmu_update(&u); |
327 | |
328 | xen_mc_issue(XEN_LAZY_MMU); |
329 | } |
330 | |
331 | /* Assume pteval_t is equivalent to all the other *val_t types. */ |
332 | static pteval_t pte_mfn_to_pfn(pteval_t val) |
333 | { |
334 | if (val & _PAGE_PRESENT) { |
335 | unsigned long mfn = (val & XEN_PTE_MFN_MASK) >> PAGE_SHIFT; |
336 | unsigned long pfn = mfn_to_pfn(mfn); |
337 | |
338 | pteval_t flags = val & PTE_FLAGS_MASK; |
339 | if (unlikely(pfn == ~0)) |
340 | val = flags & ~_PAGE_PRESENT; |
341 | else |
342 | val = ((pteval_t)pfn << PAGE_SHIFT) | flags; |
343 | } |
344 | |
345 | return val; |
346 | } |
347 | |
348 | static pteval_t pte_pfn_to_mfn(pteval_t val) |
349 | { |
350 | if (val & _PAGE_PRESENT) { |
351 | unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT; |
352 | pteval_t flags = val & PTE_FLAGS_MASK; |
353 | unsigned long mfn; |
354 | |
355 | mfn = __pfn_to_mfn(pfn); |
356 | |
357 | /* |
358 | * If there's no mfn for the pfn, then just create an |
359 | * empty non-present pte. Unfortunately this loses |
360 | * information about the original pfn, so |
361 | * pte_mfn_to_pfn is asymmetric. |
362 | */ |
363 | if (unlikely(mfn == INVALID_P2M_ENTRY)) { |
364 | mfn = 0; |
365 | flags = 0; |
366 | } else |
367 | mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT); |
368 | val = ((pteval_t)mfn << PAGE_SHIFT) | flags; |
369 | } |
370 | |
371 | return val; |
372 | } |
373 | |
374 | __visible pteval_t xen_pte_val(pte_t pte) |
375 | { |
376 | pteval_t pteval = pte.pte; |
377 | |
378 | return pte_mfn_to_pfn(pteval); |
379 | } |
380 | PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val); |
381 | |
382 | __visible pgdval_t xen_pgd_val(pgd_t pgd) |
383 | { |
384 | return pte_mfn_to_pfn(pgd.pgd); |
385 | } |
386 | PV_CALLEE_SAVE_REGS_THUNK(xen_pgd_val); |
387 | |
388 | __visible pte_t xen_make_pte(pteval_t pte) |
389 | { |
390 | pte = pte_pfn_to_mfn(pte); |
391 | |
392 | return native_make_pte(pte); |
393 | } |
394 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte); |
395 | |
396 | __visible pgd_t xen_make_pgd(pgdval_t pgd) |
397 | { |
398 | pgd = pte_pfn_to_mfn(pgd); |
399 | return native_make_pgd(pgd); |
400 | } |
401 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pgd); |
402 | |
403 | __visible pmdval_t xen_pmd_val(pmd_t pmd) |
404 | { |
405 | return pte_mfn_to_pfn(pmd.pmd); |
406 | } |
407 | PV_CALLEE_SAVE_REGS_THUNK(xen_pmd_val); |
408 | |
409 | static void xen_set_pud_hyper(pud_t *ptr, pud_t val) |
410 | { |
411 | struct mmu_update u; |
412 | |
413 | preempt_disable(); |
414 | |
415 | xen_mc_batch(); |
416 | |
417 | /* ptr may be ioremapped for 64-bit pagetable setup */ |
418 | u.ptr = arbitrary_virt_to_machine(ptr).maddr; |
419 | u.val = pud_val_ma(val); |
420 | xen_extend_mmu_update(&u); |
421 | |
422 | xen_mc_issue(XEN_LAZY_MMU); |
423 | |
424 | preempt_enable(); |
425 | } |
426 | |
427 | static void xen_set_pud(pud_t *ptr, pud_t val) |
428 | { |
429 | trace_xen_mmu_set_pud(ptr, val); |
430 | |
431 | /* If page is not pinned, we can just update the entry |
432 | directly */ |
433 | if (!xen_page_pinned(ptr)) { |
434 | *ptr = val; |
435 | return; |
436 | } |
437 | |
438 | xen_set_pud_hyper(ptr, val); |
439 | } |
440 | |
441 | __visible pmd_t xen_make_pmd(pmdval_t pmd) |
442 | { |
443 | pmd = pte_pfn_to_mfn(pmd); |
444 | return native_make_pmd(pmd); |
445 | } |
446 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd); |
447 | |
448 | __visible pudval_t xen_pud_val(pud_t pud) |
449 | { |
450 | return pte_mfn_to_pfn(pud.pud); |
451 | } |
452 | PV_CALLEE_SAVE_REGS_THUNK(xen_pud_val); |
453 | |
454 | __visible pud_t xen_make_pud(pudval_t pud) |
455 | { |
456 | pud = pte_pfn_to_mfn(pud); |
457 | |
458 | return native_make_pud(pud); |
459 | } |
460 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pud); |
461 | |
462 | static pgd_t *xen_get_user_pgd(pgd_t *pgd) |
463 | { |
464 | pgd_t *pgd_page = (pgd_t *)(((unsigned long)pgd) & PAGE_MASK); |
465 | unsigned offset = pgd - pgd_page; |
466 | pgd_t *user_ptr = NULL; |
467 | |
468 | if (offset < pgd_index(USER_LIMIT)) { |
469 | struct page *page = virt_to_page(pgd_page); |
470 | user_ptr = (pgd_t *)page->private; |
471 | if (user_ptr) |
472 | user_ptr += offset; |
473 | } |
474 | |
475 | return user_ptr; |
476 | } |
477 | |
478 | static void __xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) |
479 | { |
480 | struct mmu_update u; |
481 | |
482 | u.ptr = virt_to_machine(ptr).maddr; |
483 | u.val = p4d_val_ma(val); |
484 | xen_extend_mmu_update(&u); |
485 | } |
486 | |
487 | /* |
488 | * Raw hypercall-based set_p4d, intended for use in early boot before |
489 | * there's a page structure. This implies: |
490 | * 1. The only existing pagetable is the kernel's |
491 | * 2. It is always pinned |
492 | * 3. It has no user pagetable attached to it |
493 | */ |
494 | static void __init xen_set_p4d_hyper(p4d_t *ptr, p4d_t val) |
495 | { |
496 | preempt_disable(); |
497 | |
498 | xen_mc_batch(); |
499 | |
500 | __xen_set_p4d_hyper(ptr, val); |
501 | |
502 | xen_mc_issue(XEN_LAZY_MMU); |
503 | |
504 | preempt_enable(); |
505 | } |
506 | |
507 | static void xen_set_p4d(p4d_t *ptr, p4d_t val) |
508 | { |
509 | pgd_t *user_ptr = xen_get_user_pgd((pgd_t *)ptr); |
510 | pgd_t pgd_val; |
511 | |
512 | trace_xen_mmu_set_p4d(ptr, (p4d_t *)user_ptr, val); |
513 | |
514 | /* If page is not pinned, we can just update the entry |
515 | directly */ |
516 | if (!xen_page_pinned(ptr)) { |
517 | *ptr = val; |
518 | if (user_ptr) { |
519 | WARN_ON(xen_page_pinned(user_ptr)); |
520 | pgd_val.pgd = p4d_val_ma(val); |
521 | *user_ptr = pgd_val; |
522 | } |
523 | return; |
524 | } |
525 | |
526 | /* If it's pinned, then we can at least batch the kernel and |
527 | user updates together. */ |
528 | xen_mc_batch(); |
529 | |
530 | __xen_set_p4d_hyper(ptr, val); |
531 | if (user_ptr) |
532 | __xen_set_p4d_hyper((p4d_t *)user_ptr, val); |
533 | |
534 | xen_mc_issue(XEN_LAZY_MMU); |
535 | } |
536 | |
537 | #if CONFIG_PGTABLE_LEVELS >= 5 |
538 | __visible p4dval_t xen_p4d_val(p4d_t p4d) |
539 | { |
540 | return pte_mfn_to_pfn(p4d.p4d); |
541 | } |
542 | PV_CALLEE_SAVE_REGS_THUNK(xen_p4d_val); |
543 | |
544 | __visible p4d_t xen_make_p4d(p4dval_t p4d) |
545 | { |
546 | p4d = pte_pfn_to_mfn(p4d); |
547 | |
548 | return native_make_p4d(p4d); |
549 | } |
550 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_p4d); |
551 | #endif /* CONFIG_PGTABLE_LEVELS >= 5 */ |
552 | |
553 | static void xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd, |
554 | void (*func)(struct mm_struct *mm, struct page *, |
555 | enum pt_level), |
556 | bool last, unsigned long limit) |
557 | { |
558 | int i, nr; |
559 | |
560 | nr = last ? pmd_index(limit) + 1 : PTRS_PER_PMD; |
561 | for (i = 0; i < nr; i++) { |
562 | if (!pmd_none(pmd[i])) |
563 | (*func)(mm, pmd_page(pmd[i]), PT_PTE); |
564 | } |
565 | } |
566 | |
567 | static void xen_pud_walk(struct mm_struct *mm, pud_t *pud, |
568 | void (*func)(struct mm_struct *mm, struct page *, |
569 | enum pt_level), |
570 | bool last, unsigned long limit) |
571 | { |
572 | int i, nr; |
573 | |
574 | nr = last ? pud_index(limit) + 1 : PTRS_PER_PUD; |
575 | for (i = 0; i < nr; i++) { |
576 | pmd_t *pmd; |
577 | |
578 | if (pud_none(pud[i])) |
579 | continue; |
580 | |
581 | pmd = pmd_offset(&pud[i], 0); |
582 | if (PTRS_PER_PMD > 1) |
583 | (*func)(mm, virt_to_page(pmd), PT_PMD); |
584 | xen_pmd_walk(mm, pmd, func, last && i == nr - 1, limit); |
585 | } |
586 | } |
587 | |
588 | static void xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d, |
589 | void (*func)(struct mm_struct *mm, struct page *, |
590 | enum pt_level), |
591 | bool last, unsigned long limit) |
592 | { |
593 | pud_t *pud; |
594 | |
595 | |
596 | if (p4d_none(*p4d)) |
597 | return; |
598 | |
599 | pud = pud_offset(p4d, 0); |
600 | if (PTRS_PER_PUD > 1) |
601 | (*func)(mm, virt_to_page(pud), PT_PUD); |
602 | xen_pud_walk(mm, pud, func, last, limit); |
603 | } |
604 | |
605 | /* |
606 | * (Yet another) pagetable walker. This one is intended for pinning a |
607 | * pagetable. This means that it walks a pagetable and calls the |
608 | * callback function on each page it finds making up the page table, |
609 | * at every level. It walks the entire pagetable, but it only bothers |
610 | * pinning pte pages which are below limit. In the normal case this |
611 | * will be STACK_TOP_MAX, but at boot we need to pin up to |
612 | * FIXADDR_TOP. |
613 | * |
614 | * We must skip the Xen hole in the middle of the address space, just after |
615 | * the big x86-64 virtual hole. |
616 | */ |
617 | static void __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd, |
618 | void (*func)(struct mm_struct *mm, struct page *, |
619 | enum pt_level), |
620 | unsigned long limit) |
621 | { |
622 | int i, nr; |
623 | unsigned hole_low = 0, hole_high = 0; |
624 | |
625 | /* The limit is the last byte to be touched */ |
626 | limit--; |
627 | BUG_ON(limit >= FIXADDR_TOP); |
628 | |
629 | /* |
630 | * 64-bit has a great big hole in the middle of the address |
631 | * space, which contains the Xen mappings. |
632 | */ |
633 | hole_low = pgd_index(GUARD_HOLE_BASE_ADDR); |
634 | hole_high = pgd_index(GUARD_HOLE_END_ADDR); |
635 | |
636 | nr = pgd_index(limit) + 1; |
637 | for (i = 0; i < nr; i++) { |
638 | p4d_t *p4d; |
639 | |
640 | if (i >= hole_low && i < hole_high) |
641 | continue; |
642 | |
643 | if (pgd_none(pgd[i])) |
644 | continue; |
645 | |
646 | p4d = p4d_offset(&pgd[i], 0); |
647 | xen_p4d_walk(mm, p4d, func, i == nr - 1, limit); |
648 | } |
649 | |
650 | /* Do the top level last, so that the callbacks can use it as |
651 | a cue to do final things like tlb flushes. */ |
652 | (*func)(mm, virt_to_page(pgd), PT_PGD); |
653 | } |
654 | |
655 | static void xen_pgd_walk(struct mm_struct *mm, |
656 | void (*func)(struct mm_struct *mm, struct page *, |
657 | enum pt_level), |
658 | unsigned long limit) |
659 | { |
660 | __xen_pgd_walk(mm, mm->pgd, func, limit); |
661 | } |
662 | |
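/*
 * Example use of the walkers above (see __xen_pgd_pin()/__xen_pgd_unpin()
 * below): pin every page backing a pagetable, up to the user/kernel split:
 *
 *	__xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT);
 */
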
663 | /* If we're using split pte locks, then take the page's lock and |
664 | return a pointer to it. Otherwise return NULL. */ |
665 | static spinlock_t *xen_pte_lock(struct page *page, struct mm_struct *mm) |
666 | { |
667 | spinlock_t *ptl = NULL; |
668 | |
669 | #if USE_SPLIT_PTE_PTLOCKS |
670 | ptl = ptlock_ptr(page_ptdesc(page)); |
671 | spin_lock_nest_lock(ptl, &mm->page_table_lock); |
672 | #endif |
673 | |
674 | return ptl; |
675 | } |
676 | |
677 | static void xen_pte_unlock(void *v) |
678 | { |
679 | spinlock_t *ptl = v; |
680 | spin_unlock(ptl); |
681 | } |
682 | |
683 | static void xen_do_pin(unsigned level, unsigned long pfn) |
684 | { |
685 | struct mmuext_op op; |
686 | |
687 | op.cmd = level; |
688 | op.arg1.mfn = pfn_to_mfn(pfn); |
689 | |
690 | xen_extend_mmuext_op(&op); |
691 | } |
692 | |
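/*
 * The command passed to xen_do_pin() matches the pagetable level being
 * (un)pinned: in this file, xen_pin_page() issues MMUEXT_PIN_L1_TABLE for pte
 * pages and __xen_pgd_pin() issues MMUEXT_PIN_L4_TABLE for the top-level pgd,
 * while unpinning always uses MMUEXT_UNPIN_TABLE regardless of level.
 */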
693 | static void xen_pin_page(struct mm_struct *mm, struct page *page, |
694 | enum pt_level level) |
695 | { |
696 | unsigned pgfl = TestSetPagePinned(page); |
697 | |
698 | if (!pgfl) { |
699 | void *pt = lowmem_page_address(page); |
700 | unsigned long pfn = page_to_pfn(page); |
701 | struct multicall_space mcs = __xen_mc_entry(0); |
702 | spinlock_t *ptl; |
703 | |
704 | /* |
705 | * We need to hold the pagetable lock between the time |
706 | * we make the pagetable RO and when we actually pin |
707 | * it. If we don't, then other users may come in and |
708 | * attempt to update the pagetable by writing it, |
709 | * which will fail because the memory is RO but not |
710 | * pinned, so Xen won't do the trap'n'emulate. |
711 | * |
712 | * If we're using split pte locks, we can't hold the |
713 | * entire pagetable's worth of locks during the |
714 | * traverse, because we may wrap the preempt count (8 |
715 | * bits). The solution is to mark RO and pin each PTE |
716 | * page while holding the lock. This means the number |
717 | * of locks we end up holding is never more than a |
718 | * batch size (~32 entries, at present). |
719 | * |
720 | * If we're not using split pte locks, we needn't pin |
721 | * the PTE pages independently, because we're |
722 | * protected by the overall pagetable lock. |
723 | */ |
724 | ptl = NULL; |
725 | if (level == PT_PTE) |
726 | ptl = xen_pte_lock(page, mm); |
727 | |
728 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, |
729 | pfn_pte(pfn, PAGE_KERNEL_RO), |
730 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); |
731 | |
732 | if (ptl) { |
733 | xen_do_pin(MMUEXT_PIN_L1_TABLE, pfn); |
734 | |
735 | /* Queue a deferred unlock for when this batch |
736 | is completed. */ |
737 | xen_mc_callback(xen_pte_unlock, ptl); |
738 | } |
739 | } |
740 | } |
741 | |
742 | /* This is called just after a mm has been created, but it has not |
743 | been used yet. We need to make sure that its pagetable is all |
744 | read-only, and can be pinned. */ |
745 | static void __xen_pgd_pin(struct mm_struct *mm, pgd_t *pgd) |
746 | { |
747 | pgd_t *user_pgd = xen_get_user_pgd(pgd); |
748 | |
749 | trace_xen_mmu_pgd_pin(mm, pgd); |
750 | |
751 | xen_mc_batch(); |
752 | |
753 | __xen_pgd_walk(mm, pgd, xen_pin_page, USER_LIMIT); |
754 | |
755 | xen_do_pin(MMUEXT_PIN_L4_TABLE, PFN_DOWN(__pa(pgd))); |
756 | |
757 | if (user_pgd) { |
758 | xen_pin_page(mm, virt_to_page(user_pgd), PT_PGD); |
759 | xen_do_pin(MMUEXT_PIN_L4_TABLE, |
760 | PFN_DOWN(__pa(user_pgd))); |
761 | } |
762 | |
763 | xen_mc_issue(0); |
764 | } |
765 | |
766 | static void xen_pgd_pin(struct mm_struct *mm) |
767 | { |
768 | __xen_pgd_pin(mm, mm->pgd); |
769 | } |
770 | |
771 | /* |
772 | * On save, we need to pin all pagetables to make sure they get their |
773 | * mfns turned into pfns. Search the list for any unpinned pgds and pin |
774 | * them (unpinned pgds are not currently in use, probably because the |
775 | * process is under construction or destruction). |
776 | * |
777 | * Expected to be called in stop_machine() ("equivalent to taking |
778 | * every spinlock in the system"), so the locking doesn't really |
779 | * matter all that much. |
780 | */ |
781 | void xen_mm_pin_all(void) |
782 | { |
783 | struct page *page; |
784 | |
785 | spin_lock(&pgd_lock); |
786 | |
787 | list_for_each_entry(page, &pgd_list, lru) { |
788 | if (!PagePinned(page)) { |
789 | __xen_pgd_pin(&init_mm, (pgd_t *)page_address(page)); |
790 | SetPageSavePinned(page); |
791 | } |
792 | } |
793 | |
794 | spin_unlock(&pgd_lock); |
795 | } |
796 | |
797 | static void __init xen_mark_pinned(struct mm_struct *mm, struct page *page, |
798 | enum pt_level level) |
799 | { |
800 | SetPagePinned(page); |
801 | } |
802 | |
803 | /* |
804 | * The init_mm pagetable is really pinned as soon as it's created, but |
805 | * that's before we have page structures to store the bits. So do all |
806 | * the book-keeping now once struct pages for allocated pages are |
807 | * initialized. This happens only after memblock_free_all() is called. |
808 | */ |
809 | static void __init xen_after_bootmem(void) |
810 | { |
811 | static_branch_enable(&xen_struct_pages_ready); |
812 | #ifdef CONFIG_X86_VSYSCALL_EMULATION |
813 | SetPagePinned(virt_to_page(level3_user_vsyscall)); |
814 | #endif |
815 | xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); |
816 | } |
817 | |
818 | static void xen_unpin_page(struct mm_struct *mm, struct page *page, |
819 | enum pt_level level) |
820 | { |
821 | unsigned pgfl = TestClearPagePinned(page); |
822 | |
823 | if (pgfl) { |
824 | void *pt = lowmem_page_address(page); |
825 | unsigned long pfn = page_to_pfn(page); |
826 | spinlock_t *ptl = NULL; |
827 | struct multicall_space mcs; |
828 | |
829 | /* |
830 | * Do the converse to pin_page. If we're using split |
831 | * pte locks, we must be holding the lock while |
832 | * the pte page is unpinned but still RO to prevent |
833 | * concurrent updates from seeing it in this |
834 | * partially-pinned state. |
835 | */ |
836 | if (level == PT_PTE) { |
837 | ptl = xen_pte_lock(page, mm); |
838 | |
839 | if (ptl) |
840 | xen_do_pin(MMUEXT_UNPIN_TABLE, pfn); |
841 | } |
842 | |
843 | mcs = __xen_mc_entry(0); |
844 | |
845 | MULTI_update_va_mapping(mcs.mc, (unsigned long)pt, |
846 | pfn_pte(pfn, PAGE_KERNEL), |
847 | level == PT_PGD ? UVMF_TLB_FLUSH : 0); |
848 | |
849 | if (ptl) { |
850 | /* unlock when batch completed */ |
851 | xen_mc_callback(xen_pte_unlock, ptl); |
852 | } |
853 | } |
854 | } |
855 | |
856 | /* Release a pagetable's pages back as normal RW */ |
857 | static void __xen_pgd_unpin(struct mm_struct *mm, pgd_t *pgd) |
858 | { |
859 | pgd_t *user_pgd = xen_get_user_pgd(pgd); |
860 | |
861 | trace_xen_mmu_pgd_unpin(mm, pgd); |
862 | |
863 | xen_mc_batch(); |
864 | |
865 | xen_do_pin(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd))); |
866 | |
867 | if (user_pgd) { |
868 | xen_do_pin(MMUEXT_UNPIN_TABLE, |
869 | PFN_DOWN(__pa(user_pgd))); |
870 | xen_unpin_page(mm, virt_to_page(user_pgd), PT_PGD); |
871 | } |
872 | |
873 | __xen_pgd_walk(mm, pgd, xen_unpin_page, USER_LIMIT); |
874 | |
875 | xen_mc_issue(0); |
876 | } |
877 | |
878 | static void xen_pgd_unpin(struct mm_struct *mm) |
879 | { |
880 | __xen_pgd_unpin(mm, mm->pgd); |
881 | } |
882 | |
883 | /* |
884 | * On resume, undo any pinning done at save, so that the rest of the |
885 | * kernel doesn't see any unexpected pinned pagetables. |
886 | */ |
887 | void xen_mm_unpin_all(void) |
888 | { |
889 | struct page *page; |
890 | |
891 | spin_lock(&pgd_lock); |
892 | |
893 | list_for_each_entry(page, &pgd_list, lru) { |
894 | if (PageSavePinned(page)) { |
895 | BUG_ON(!PagePinned(page)); |
896 | __xen_pgd_unpin(&init_mm, (pgd_t *)page_address(page)); |
897 | ClearPageSavePinned(page); |
898 | } |
899 | } |
900 | |
901 | spin_unlock(&pgd_lock); |
902 | } |
903 | |
904 | static void xen_enter_mmap(struct mm_struct *mm) |
905 | { |
906 | spin_lock(&mm->page_table_lock); |
907 | xen_pgd_pin(mm); |
908 | spin_unlock(&mm->page_table_lock); |
909 | } |
910 | |
911 | static void drop_mm_ref_this_cpu(void *info) |
912 | { |
913 | struct mm_struct *mm = info; |
914 | |
915 | if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm) |
916 | leave_mm(smp_processor_id()); |
917 | |
918 | /* |
919 | * If this cpu still has a stale cr3 reference, then make sure |
920 | * it has been flushed. |
921 | */ |
922 | if (this_cpu_read(xen_current_cr3) == __pa(mm->pgd)) |
923 | xen_mc_flush(); |
924 | } |
925 | |
926 | #ifdef CONFIG_SMP |
927 | /* |
928 | * Another cpu may still have their %cr3 pointing at the pagetable, so |
929 | * we need to repoint it somewhere else before we can unpin it. |
930 | */ |
931 | static void xen_drop_mm_ref(struct mm_struct *mm) |
932 | { |
933 | cpumask_var_t mask; |
934 | unsigned cpu; |
935 | |
936 | drop_mm_ref_this_cpu(mm); |
937 | |
938 | /* Get the "official" set of cpus referring to our pagetable. */ |
939 | if (!alloc_cpumask_var(&mask, GFP_ATOMIC)) { |
940 | for_each_online_cpu(cpu) { |
941 | if (per_cpu(xen_current_cr3, cpu) != __pa(mm->pgd)) |
942 | continue; |
943 | smp_call_function_single(cpu, drop_mm_ref_this_cpu, mm, 1); |
944 | } |
945 | return; |
946 | } |
947 | |
948 | /* |
949 | * It's possible that a vcpu may have a stale reference to our |
950 | * cr3, because it's in lazy mode and it hasn't yet flushed |
951 | * its set of pending hypercalls. In this case, we can |
952 | * look at its actual current cr3 value, and force it to flush |
953 | * if needed. |
954 | */ |
955 | cpumask_clear(mask); |
956 | for_each_online_cpu(cpu) { |
957 | if (per_cpu(xen_current_cr3, cpu) == __pa(mm->pgd)) |
958 | cpumask_set_cpu(cpu, mask); |
959 | } |
960 | |
961 | smp_call_function_many(mask, drop_mm_ref_this_cpu, mm, 1); |
962 | free_cpumask_var(mask); |
963 | } |
964 | #else |
965 | static void xen_drop_mm_ref(struct mm_struct *mm) |
966 | { |
967 | drop_mm_ref_this_cpu(mm); |
968 | } |
969 | #endif |
970 | |
971 | /* |
972 | * While a process runs, Xen pins its pagetables, which means that the |
973 | * hypervisor forces it to be read-only, and it controls all updates |
974 | * to it. This means that all pagetable updates have to go via the |
975 | * hypervisor, which is moderately expensive. |
976 | * |
977 | * Since we're pulling the pagetable down, we switch to use init_mm, |
978 | * unpin old process pagetable and mark it all read-write, which |
979 | * allows further operations on it to be simple memory accesses. |
980 | * |
981 | * The only subtle point is that another CPU may be still using the |
982 | * pagetable because of lazy tlb flushing. This means we need to |
983 | * switch all CPUs off this pagetable before we can unpin it. |
984 | */ |
985 | static void xen_exit_mmap(struct mm_struct *mm) |
986 | { |
987 | get_cpu(); /* make sure we don't move around */ |
988 | xen_drop_mm_ref(mm); |
989 | put_cpu(); |
990 | |
991 | spin_lock(&mm->page_table_lock); |
992 | |
993 | /* pgd may not be pinned in the error exit path of execve */ |
994 | if (xen_page_pinned(mm->pgd)) |
995 | xen_pgd_unpin(mm); |
996 | |
997 | spin_unlock(&mm->page_table_lock); |
998 | } |
999 | |
1000 | static void xen_post_allocator_init(void); |
1001 | |
1002 | static void __init pin_pagetable_pfn(unsigned cmd, unsigned long pfn) |
1003 | { |
1004 | struct mmuext_op op; |
1005 | |
1006 | op.cmd = cmd; |
1007 | op.arg1.mfn = pfn_to_mfn(pfn); |
1008 | if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) |
1009 | BUG(); |
1010 | } |
1011 | |
1012 | static void __init xen_cleanhighmap(unsigned long vaddr, |
1013 | unsigned long vaddr_end) |
1014 | { |
1015 | unsigned long kernel_end = roundup((unsigned long)_brk_end, PMD_SIZE) - 1; |
1016 | pmd_t *pmd = level2_kernel_pgt + pmd_index(vaddr); |
1017 | |
1018 | /* NOTE: The loop is more greedy than the cleanup_highmap variant. |
1019 | * We include the PMD passed in on _both_ boundaries. */ |
1020 | for (; vaddr <= vaddr_end && (pmd < (level2_kernel_pgt + PTRS_PER_PMD)); |
1021 | pmd++, vaddr += PMD_SIZE) { |
1022 | if (pmd_none(*pmd)) |
1023 | continue; |
1024 | if (vaddr < (unsigned long) _text || vaddr > kernel_end) |
1025 | set_pmd(pmd, __pmd(0)); |
1026 | } |
1027 | /* In case we did something silly, we should crash in this function |
1028 | * instead of somewhere later and be confusing. */ |
1029 | xen_mc_flush(); |
1030 | } |
1031 | |
1032 | /* |
1033 | * Make a page range writeable and free it. |
1034 | */ |
1035 | static void __init xen_free_ro_pages(unsigned long paddr, unsigned long size) |
1036 | { |
1037 | void *vaddr = __va(paddr); |
1038 | void *vaddr_end = vaddr + size; |
1039 | |
1040 | for (; vaddr < vaddr_end; vaddr += PAGE_SIZE) |
1041 | make_lowmem_page_readwrite(vaddr); |
1042 | |
1043 | memblock_phys_free(paddr, size); |
1044 | } |
1045 | |
1046 | static void __init xen_cleanmfnmap_free_pgtbl(void *pgtbl, bool unpin) |
1047 | { |
1048 | unsigned long pa = __pa(pgtbl) & PHYSICAL_PAGE_MASK; |
1049 | |
1050 | if (unpin) |
1051 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(pa)); |
1052 | ClearPagePinned(virt_to_page(__va(pa))); |
1053 | xen_free_ro_pages(pa, PAGE_SIZE); |
1054 | } |
1055 | |
1056 | static void __init xen_cleanmfnmap_pmd(pmd_t *pmd, bool unpin) |
1057 | { |
1058 | unsigned long pa; |
1059 | pte_t *pte_tbl; |
1060 | int i; |
1061 | |
1062 | if (pmd_large(*pmd)) { |
1063 | pa = pmd_val(*pmd) & PHYSICAL_PAGE_MASK; |
1064 | xen_free_ro_pages(pa, PMD_SIZE); |
1065 | return; |
1066 | } |
1067 | |
1068 | pte_tbl = pte_offset_kernel(pmd, 0); |
1069 | for (i = 0; i < PTRS_PER_PTE; i++) { |
1070 | if (pte_none(pte_tbl[i])) |
1071 | continue; |
1072 | pa = pte_pfn(pte_tbl[i]) << PAGE_SHIFT; |
1073 | xen_free_ro_pages(pa, PAGE_SIZE); |
1074 | } |
1075 | set_pmd(pmd, __pmd(0)); |
1076 | xen_cleanmfnmap_free_pgtbl(pte_tbl, unpin); |
1077 | } |
1078 | |
1079 | static void __init xen_cleanmfnmap_pud(pud_t *pud, bool unpin) |
1080 | { |
1081 | unsigned long pa; |
1082 | pmd_t *pmd_tbl; |
1083 | int i; |
1084 | |
1085 | if (pud_large(*pud)) { |
1086 | pa = pud_val(*pud) & PHYSICAL_PAGE_MASK; |
1087 | xen_free_ro_pages(pa, PUD_SIZE); |
1088 | return; |
1089 | } |
1090 | |
1091 | pmd_tbl = pmd_offset(pud, 0); |
1092 | for (i = 0; i < PTRS_PER_PMD; i++) { |
1093 | if (pmd_none(pmd_tbl[i])) |
1094 | continue; |
1095 | xen_cleanmfnmap_pmd(pmd_tbl + i, unpin); |
1096 | } |
1097 | set_pud(pud, __pud(0)); |
1098 | xen_cleanmfnmap_free_pgtbl(pmd_tbl, unpin); |
1099 | } |
1100 | |
1101 | static void __init xen_cleanmfnmap_p4d(p4d_t *p4d, bool unpin) |
1102 | { |
1103 | unsigned long pa; |
1104 | pud_t *pud_tbl; |
1105 | int i; |
1106 | |
1107 | if (p4d_large(*p4d)) { |
1108 | pa = p4d_val(*p4d) & PHYSICAL_PAGE_MASK; |
1109 | xen_free_ro_pages(pa, P4D_SIZE); |
1110 | return; |
1111 | } |
1112 | |
1113 | pud_tbl = pud_offset(p4d, 0); |
1114 | for (i = 0; i < PTRS_PER_PUD; i++) { |
1115 | if (pud_none(pud_tbl[i])) |
1116 | continue; |
1117 | xen_cleanmfnmap_pud(pud_tbl + i, unpin); |
1118 | } |
1119 | set_p4d(p4d, __p4d(0)); |
1120 | xen_cleanmfnmap_free_pgtbl(pud_tbl, unpin); |
1121 | } |
1122 | |
1123 | /* |
1124 | * Since it is well isolated we can (and since it is perhaps large we should) |
1125 | * also free the page tables mapping the initial P->M table. |
1126 | */ |
1127 | static void __init xen_cleanmfnmap(unsigned long vaddr) |
1128 | { |
1129 | pgd_t *pgd; |
1130 | p4d_t *p4d; |
1131 | bool unpin; |
1132 | |
1133 | unpin = (vaddr == 2 * PGDIR_SIZE); |
1134 | vaddr &= PMD_MASK; |
1135 | pgd = pgd_offset_k(vaddr); |
1136 | p4d = p4d_offset(pgd, 0); |
1137 | if (!p4d_none(*p4d)) |
1138 | xen_cleanmfnmap_p4d(p4d, unpin); |
1139 | } |
1140 | |
1141 | static void __init xen_pagetable_p2m_free(void) |
1142 | { |
1143 | unsigned long size; |
1144 | unsigned long addr; |
1145 | |
1146 | size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long)); |
1147 | |
1148 | /* No memory or already called. */ |
1149 | if ((unsigned long)xen_p2m_addr == xen_start_info->mfn_list) |
1150 | return; |
1151 | |
1152 | /* using __ka address and sticking INVALID_P2M_ENTRY! */ |
1153 | memset((void *)xen_start_info->mfn_list, 0xff, size); |
1154 | |
1155 | addr = xen_start_info->mfn_list; |
1156 | /* |
1157 | * We could be in __ka space. |
1158 | * We roundup to the PMD, which means that if anybody at this stage is |
1159 | * using the __ka address of xen_start_info or |
1160 | * xen_start_info->shared_info they are going to crash. Fortunately |
1161 | * we have already revectored in xen_setup_kernel_pagetable. |
1162 | */ |
1163 | size = roundup(size, PMD_SIZE); |
1164 | |
1165 | if (addr >= __START_KERNEL_map) { |
1166 | xen_cleanhighmap(addr, addr + size); |
1167 | size = PAGE_ALIGN(xen_start_info->nr_pages * |
1168 | sizeof(unsigned long)); |
1169 | memblock_free((void *)addr, size); |
1170 | } else { |
1171 | xen_cleanmfnmap(addr); |
1172 | } |
1173 | } |
1174 | |
1175 | static void __init xen_pagetable_cleanhighmap(void) |
1176 | { |
1177 | unsigned long size; |
1178 | unsigned long addr; |
1179 | |
1180 | /* At this stage, cleanup_highmap has already cleaned __ka space |
1181 | * from _brk_limit way up to the max_pfn_mapped (which is the end of |
1182 | * the ramdisk). We continue on, erasing PMD entries that point to page |
1183 | * tables - do note that they are accessible at this stage via __va. |
1184 | * As Xen is aligning the memory end to a 4MB boundary, for good |
1185 | * measure we also round up to PMD_SIZE * 2 - which means that if |
1186 | * anybody is using __ka address to the initial boot-stack - and try |
1187 | * to use it - they are going to crash. The xen_start_info has been |
1188 | * taken care of already in xen_setup_kernel_pagetable. */ |
1189 | addr = xen_start_info->pt_base; |
1190 | size = xen_start_info->nr_pt_frames * PAGE_SIZE; |
1191 | |
1192 | xen_cleanhighmap(addr, roundup(addr + size, PMD_SIZE * 2)); |
1193 | xen_start_info->pt_base = (unsigned long)__va(__pa(xen_start_info->pt_base)); |
1194 | } |
1195 | |
1196 | static void __init xen_pagetable_p2m_setup(void) |
1197 | { |
1198 | xen_vmalloc_p2m_tree(); |
1199 | |
1200 | xen_pagetable_p2m_free(); |
1201 | |
1202 | xen_pagetable_cleanhighmap(); |
1203 | |
1204 | /* And revector! Bye bye old array */ |
1205 | xen_start_info->mfn_list = (unsigned long)xen_p2m_addr; |
1206 | } |
1207 | |
1208 | static void __init xen_pagetable_init(void) |
1209 | { |
1210 | /* |
1211 | * The majority of further PTE writes is to pagetables already |
1212 | * announced as such to Xen. Hence it is more efficient to use |
1213 | * hypercalls for these updates. |
1214 | */ |
1215 | pv_ops.mmu.set_pte = __xen_set_pte; |
1216 | |
1217 | paging_init(); |
1218 | xen_post_allocator_init(); |
1219 | |
1220 | xen_pagetable_p2m_setup(); |
1221 | |
1222 | /* Allocate and initialize top and mid mfn levels for p2m structure */ |
1223 | xen_build_mfn_list_list(); |
1224 | |
1225 | /* Remap memory freed due to conflicts with E820 map */ |
1226 | xen_remap_memory(); |
1227 | xen_setup_mfn_list_list(); |
1228 | } |
1229 | |
1230 | static noinstr void xen_write_cr2(unsigned long cr2) |
1231 | { |
1232 | this_cpu_read(xen_vcpu)->arch.cr2 = cr2; |
1233 | } |
1234 | |
1235 | static noinline void xen_flush_tlb(void) |
1236 | { |
1237 | struct mmuext_op *op; |
1238 | struct multicall_space mcs; |
1239 | |
1240 | preempt_disable(); |
1241 | |
1242 | mcs = xen_mc_entry(sizeof(*op)); |
1243 | |
1244 | op = mcs.args; |
1245 | op->cmd = MMUEXT_TLB_FLUSH_LOCAL; |
1246 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); |
1247 | |
1248 | xen_mc_issue(XEN_LAZY_MMU); |
1249 | |
1250 | preempt_enable(); |
1251 | } |
1252 | |
1253 | static void xen_flush_tlb_one_user(unsigned long addr) |
1254 | { |
1255 | struct mmuext_op *op; |
1256 | struct multicall_space mcs; |
1257 | |
1258 | trace_xen_mmu_flush_tlb_one_user(addr); |
1259 | |
1260 | preempt_disable(); |
1261 | |
1262 | mcs = xen_mc_entry(sizeof(*op)); |
1263 | op = mcs.args; |
1264 | op->cmd = MMUEXT_INVLPG_LOCAL; |
1265 | op->arg1.linear_addr = addr & PAGE_MASK; |
1266 | MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF); |
1267 | |
1268 | xen_mc_issue(XEN_LAZY_MMU); |
1269 | |
1270 | preempt_enable(); |
1271 | } |
1272 | |
1273 | static void xen_flush_tlb_multi(const struct cpumask *cpus, |
1274 | const struct flush_tlb_info *info) |
1275 | { |
1276 | struct { |
1277 | struct mmuext_op op; |
1278 | DECLARE_BITMAP(mask, NR_CPUS); |
1279 | } *args; |
1280 | struct multicall_space mcs; |
1281 | const size_t mc_entry_size = sizeof(args->op) + |
1282 | sizeof(args->mask[0]) * BITS_TO_LONGS(num_possible_cpus()); |
1283 | |
1284 | trace_xen_mmu_flush_tlb_multi(cpus, info->mm, info->start, info->end); |
1285 | |
1286 | if (cpumask_empty(cpus)) |
1287 | return; /* nothing to do */ |
1288 | |
1289 | mcs = xen_mc_entry(mc_entry_size); |
1290 | args = mcs.args; |
1291 | args->op.arg2.vcpumask = to_cpumask(args->mask); |
1292 | |
1293 | /* Remove any offline CPUs */ |
1294 | cpumask_and(to_cpumask(args->mask), cpus, cpu_online_mask); |
1295 | |
1296 | args->op.cmd = MMUEXT_TLB_FLUSH_MULTI; |
1297 | if (info->end != TLB_FLUSH_ALL && |
1298 | (info->end - info->start) <= PAGE_SIZE) { |
1299 | args->op.cmd = MMUEXT_INVLPG_MULTI; |
1300 | args->op.arg1.linear_addr = info->start; |
1301 | } |
1302 | |
1303 | MULTI_mmuext_op(mcs.mc, &args->op, 1, NULL, DOMID_SELF); |
1304 | |
1305 | xen_mc_issue(XEN_LAZY_MMU); |
1306 | } |
1307 | |
1308 | static unsigned long xen_read_cr3(void) |
1309 | { |
1310 | return this_cpu_read(xen_cr3); |
1311 | } |
1312 | |
1313 | static void set_current_cr3(void *v) |
1314 | { |
1315 | this_cpu_write(xen_current_cr3, (unsigned long)v); |
1316 | } |
1317 | |
1318 | static void __xen_write_cr3(bool kernel, unsigned long cr3) |
1319 | { |
1320 | struct mmuext_op op; |
1321 | unsigned long mfn; |
1322 | |
1323 | trace_xen_mmu_write_cr3(kernel, cr3); |
1324 | |
1325 | if (cr3) |
1326 | mfn = pfn_to_mfn(PFN_DOWN(cr3)); |
1327 | else |
1328 | mfn = 0; |
1329 | |
1330 | WARN_ON(mfn == 0 && kernel); |
1331 | |
1332 | op.cmd = kernel ? MMUEXT_NEW_BASEPTR : MMUEXT_NEW_USER_BASEPTR; |
1333 | op.arg1.mfn = mfn; |
1334 | |
1335 | xen_extend_mmuext_op(&op); |
1336 | |
1337 | if (kernel) { |
1338 | this_cpu_write(xen_cr3, cr3); |
1339 | |
1340 | /* Update xen_current_cr3 once the batch has actually |
1341 | been submitted. */ |
1342 | xen_mc_callback(set_current_cr3, (void *)cr3); |
1343 | } |
1344 | } |
1345 | static void xen_write_cr3(unsigned long cr3) |
1346 | { |
1347 | pgd_t *user_pgd = xen_get_user_pgd(__va(cr3)); |
1348 | |
1349 | BUG_ON(preemptible()); |
1350 | |
1351 | xen_mc_batch(); /* disables interrupts */ |
1352 | |
1353 | /* Update while interrupts are disabled, so it's atomic with |
1354 | respect to ipis */ |
1355 | this_cpu_write(xen_cr3, cr3); |
1356 | |
1357 | __xen_write_cr3(true, cr3); |
1358 | |
1359 | if (user_pgd) |
1360 | __xen_write_cr3(false, __pa(user_pgd)); |
1361 | else |
1362 | __xen_write_cr3(false, 0); |
1363 | |
1364 | xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */ |
1365 | } |
1366 | |
1367 | /* |
1368 | * At the start of the day - when Xen launches a guest, it has already |
1369 | * built pagetables for the guest. We diligently look over them |
1370 | * in xen_setup_kernel_pagetable and graft as appropriate them in the |
1371 | * init_top_pgt and its friends. Then when we are happy we load |
1372 | * the new init_top_pgt - and continue on. |
1373 | * |
1374 | * The generic code starts (start_kernel) and 'init_mem_mapping' sets |
1375 | * up the rest of the pagetables. When it has completed it loads the cr3. |
1376 | * N.B. that baremetal would start at 'start_kernel' (and the early |
1377 | * #PF handler would create bootstrap pagetables) - so we are running |
1378 | * with the same assumptions as what to do when write_cr3 is executed |
1379 | * at this point. |
1380 | * |
1381 | * Since there are no user-page tables at all, we have two variants |
1382 | * of xen_write_cr3 - the early bootup (this one), and the late one |
1383 | * (xen_write_cr3). The reason we have to do that is that in 64-bit |
1384 | * the Linux kernel and user-space are both in ring 3 while the |
1385 | * hypervisor is in ring 0. |
1386 | */ |
1387 | static void __init xen_write_cr3_init(unsigned long cr3) |
1388 | { |
1389 | BUG_ON(preemptible()); |
1390 | |
1391 | xen_mc_batch(); /* disables interrupts */ |
1392 | |
1393 | /* Update while interrupts are disabled, so it's atomic with |
1394 | respect to ipis */ |
1395 | this_cpu_write(xen_cr3, cr3); |
1396 | |
1397 | __xen_write_cr3(true, cr3); |
1398 | |
1399 | xen_mc_issue(XEN_LAZY_CPU); /* interrupts restored */ |
1400 | } |
1401 | |
1402 | static int xen_pgd_alloc(struct mm_struct *mm) |
1403 | { |
1404 | pgd_t *pgd = mm->pgd; |
1405 | struct page *page = virt_to_page(pgd); |
1406 | pgd_t *user_pgd; |
1407 | int ret = -ENOMEM; |
1408 | |
1409 | BUG_ON(PagePinned(virt_to_page(pgd))); |
1410 | BUG_ON(page->private != 0); |
1411 | |
1412 | user_pgd = (pgd_t *)__get_free_page(GFP_KERNEL | __GFP_ZERO); |
1413 | page->private = (unsigned long)user_pgd; |
1414 | |
1415 | if (user_pgd != NULL) { |
1416 | #ifdef CONFIG_X86_VSYSCALL_EMULATION |
1417 | user_pgd[pgd_index(VSYSCALL_ADDR)] = |
1418 | __pgd(__pa(level3_user_vsyscall) | _PAGE_TABLE); |
1419 | #endif |
1420 | ret = 0; |
1421 | } |
1422 | |
1423 | BUG_ON(PagePinned(virt_to_page(xen_get_user_pgd(pgd)))); |
1424 | |
1425 | return ret; |
1426 | } |
1427 | |
1428 | static void xen_pgd_free(struct mm_struct *mm, pgd_t *pgd) |
1429 | { |
1430 | pgd_t *user_pgd = xen_get_user_pgd(pgd); |
1431 | |
1432 | if (user_pgd) |
1433 | free_page((unsigned long)user_pgd); |
1434 | } |
1435 | |
1436 | /* |
1437 | * Init-time set_pte while constructing initial pagetables, which |
1438 | * doesn't allow RO page table pages to be remapped RW. |
1439 | * |
1440 | * If there is no MFN for this PFN then this page is initially |
1441 | * ballooned out so clear the PTE (as in decrease_reservation() in |
1442 | * drivers/xen/balloon.c). |
1443 | * |
1444 | * Many of these PTE updates are done on unpinned and writable pages |
1445 | * and doing a hypercall for these is unnecessary and expensive. At |
1446 | * this point it is rarely possible to tell if a page is pinned, so |
1447 | * mostly write the PTE directly and rely on Xen trapping and |
1448 | * emulating any updates as necessary. |
1449 | */ |
1450 | static void __init xen_set_pte_init(pte_t *ptep, pte_t pte) |
1451 | { |
1452 | if (unlikely(is_early_ioremap_ptep(ptep))) |
1453 | __xen_set_pte(ptep, pte); |
1454 | else |
1455 | native_set_pte(ptep, pte); |
1456 | } |
1457 | |
1458 | __visible pte_t xen_make_pte_init(pteval_t pte) |
1459 | { |
1460 | unsigned long pfn; |
1461 | |
1462 | /* |
1463 | * Pages belonging to the initial p2m list mapped outside the default |
1464 | * address range must be mapped read-only. This region contains the |
1465 | * page tables for mapping the p2m list, too, and page tables MUST be |
1466 | * mapped read-only. |
1467 | */ |
1468 | pfn = (pte & PTE_PFN_MASK) >> PAGE_SHIFT; |
1469 | if (xen_start_info->mfn_list < __START_KERNEL_map && |
1470 | pfn >= xen_start_info->first_p2m_pfn && |
1471 | pfn < xen_start_info->first_p2m_pfn + xen_start_info->nr_p2m_frames) |
1472 | pte &= ~_PAGE_RW; |
1473 | |
1474 | pte = pte_pfn_to_mfn(pte); |
1475 | return native_make_pte(pte); |
1476 | } |
1477 | PV_CALLEE_SAVE_REGS_THUNK(xen_make_pte_init); |
1478 | |
1479 | /* Early in boot, while setting up the initial pagetable, assume |
1480 | everything is pinned. */ |
1481 | static void __init xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) |
1482 | { |
1483 | #ifdef CONFIG_FLATMEM |
1484 | BUG_ON(mem_map); /* should only be used early */ |
1485 | #endif |
1486 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); |
1487 | pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); |
1488 | } |
1489 | |
1490 | /* Used for pmd and pud */ |
1491 | static void __init xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) |
1492 | { |
1493 | #ifdef CONFIG_FLATMEM |
1494 | BUG_ON(mem_map); /* should only be used early */ |
1495 | #endif |
1496 | make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); |
1497 | } |
1498 | |
1499 | /* Early release_pte assumes that all pts are pinned, since there's |
1500 | only init_mm and anything attached to that is pinned. */ |
1501 | static void __init xen_release_pte_init(unsigned long pfn) |
1502 | { |
1503 | pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); |
1504 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); |
1505 | } |
1506 | |
1507 | static void __init xen_release_pmd_init(unsigned long pfn) |
1508 | { |
1509 | make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); |
1510 | } |
1511 | |
1512 | static inline void __pin_pagetable_pfn(unsigned cmd, unsigned long pfn) |
1513 | { |
1514 | struct multicall_space mcs; |
1515 | struct mmuext_op *op; |
1516 | |
1517 | mcs = __xen_mc_entry(sizeof(*op)); |
1518 | op = mcs.args; |
1519 | op->cmd = cmd; |
1520 | op->arg1.mfn = pfn_to_mfn(pfn); |
1521 | |
1522 | MULTI_mmuext_op(mcs.mc, mcs.args, 1, NULL, DOMID_SELF); |
1523 | } |
1524 | |
1525 | static inline void __set_pfn_prot(unsigned long pfn, pgprot_t prot) |
1526 | { |
1527 | struct multicall_space mcs; |
1528 | unsigned long addr = (unsigned long)__va(pfn << PAGE_SHIFT); |
1529 | |
1530 | mcs = __xen_mc_entry(0); |
1531 | MULTI_update_va_mapping(mcs.mc, (unsigned long)addr, |
1532 | pfn_pte(pfn, prot), 0); |
1533 | } |
1534 | |
1535 | /* This needs to make sure the new pte page is pinned iff it's being |
1536 | attached to a pinned pagetable. */ |
1537 | static inline void xen_alloc_ptpage(struct mm_struct *mm, unsigned long pfn, |
1538 | unsigned level) |
1539 | { |
1540 | bool pinned = xen_page_pinned(mm->pgd); |
1541 | |
1542 | trace_xen_mmu_alloc_ptpage(mm, pfn, level, pinned); |
1543 | |
1544 | if (pinned) { |
1545 | struct page *page = pfn_to_page(pfn); |
1546 | |
1547 | pinned = false; |
1548 | if (static_branch_likely(&xen_struct_pages_ready)) { |
1549 | pinned = PagePinned(page); |
1550 | SetPagePinned(page); |
1551 | } |
1552 | |
1553 | xen_mc_batch(); |
1554 | |
1555 | __set_pfn_prot(pfn, PAGE_KERNEL_RO); |
1556 | |
1557 | if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS && !pinned) |
1558 | __pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); |
1559 | |
1560 | xen_mc_issue(XEN_LAZY_MMU); |
1561 | } |
1562 | } |
1563 | |
1564 | static void xen_alloc_pte(struct mm_struct *mm, unsigned long pfn) |
1565 | { |
1566 | xen_alloc_ptpage(mm, pfn, PT_PTE); |
1567 | } |
1568 | |
1569 | static void xen_alloc_pmd(struct mm_struct *mm, unsigned long pfn) |
1570 | { |
1571 | xen_alloc_ptpage(mm, pfn, PT_PMD); |
1572 | } |
1573 | |
1574 | /* This should never happen until we're OK to use struct page */ |
1575 | static inline void xen_release_ptpage(unsigned long pfn, unsigned level) |
1576 | { |
1577 | struct page *page = pfn_to_page(pfn); |
1578 | bool pinned = PagePinned(page); |
1579 | |
1580 | trace_xen_mmu_release_ptpage(pfn, level, pinned); |
1581 | |
1582 | if (pinned) { |
1583 | xen_mc_batch(); |
1584 | |
1585 | if (level == PT_PTE && USE_SPLIT_PTE_PTLOCKS) |
1586 | __pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); |
1587 | |
1588 | __set_pfn_prot(pfn, PAGE_KERNEL); |
1589 | |
1590 | xen_mc_issue(XEN_LAZY_MMU); |
1591 | |
1592 | ClearPagePinned(page); |
1593 | } |
1594 | } |
1595 | |
1596 | static void xen_release_pte(unsigned long pfn) |
1597 | { |
1598 | xen_release_ptpage(pfn, PT_PTE); |
1599 | } |
1600 | |
1601 | static void xen_release_pmd(unsigned long pfn) |
1602 | { |
1603 | xen_release_ptpage(pfn, PT_PMD); |
1604 | } |
1605 | |
1606 | static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn) |
1607 | { |
1608 | xen_alloc_ptpage(mm, pfn, PT_PUD); |
1609 | } |
1610 | |
1611 | static void xen_release_pud(unsigned long pfn) |
1612 | { |
1613 | xen_release_ptpage(pfn, PT_PUD); |
1614 | } |
1615 | |
1616 | /* |
1617 | * Like __va(), but returns address in the kernel mapping (which is |
1618 | * all we have until the physical memory mapping has been set up). |
1619 | */ |
1620 | static void * __init __ka(phys_addr_t paddr) |
1621 | { |
1622 | return (void *)(paddr + __START_KERNEL_map); |
1623 | } |
1624 | |
1625 | /* Convert a machine address to physical address */ |
1626 | static unsigned long __init m2p(phys_addr_t maddr) |
1627 | { |
1628 | phys_addr_t paddr; |
1629 | |
1630 | maddr &= XEN_PTE_MFN_MASK; |
1631 | paddr = mfn_to_pfn(maddr >> PAGE_SHIFT) << PAGE_SHIFT; |
1632 | |
1633 | return paddr; |
1634 | } |
1635 | |
1636 | /* Convert a machine address to kernel virtual */ |
1637 | static void * __init m2v(phys_addr_t maddr) |
1638 | { |
1639 | return __ka(m2p(maddr)); |
1640 | } |
1641 | |
1642 | /* Set the page permissions on identity-mapped pages */ |
1643 | static void __init set_page_prot_flags(void *addr, pgprot_t prot, |
1644 | unsigned long flags) |
1645 | { |
1646 | unsigned long pfn = __pa(addr) >> PAGE_SHIFT; |
1647 | pte_t pte = pfn_pte(pfn, prot); |
1648 | |
1649 | if (HYPERVISOR_update_va_mapping((unsigned long)addr, pte, flags)) |
1650 | BUG(); |
1651 | } |
1652 | static void __init set_page_prot(void *addr, pgprot_t prot) |
1653 | { |
1654 | return set_page_prot_flags(addr, prot, UVMF_NONE); |
1655 | } |
1656 | |
1657 | void __init xen_setup_machphys_mapping(void) |
1658 | { |
1659 | struct xen_machphys_mapping mapping; |
1660 | |
1661 | if (HYPERVISOR_memory_op(XENMEM_machphys_mapping, &mapping) == 0) { |
1662 | machine_to_phys_mapping = (unsigned long *)mapping.v_start; |
1663 | machine_to_phys_nr = mapping.max_mfn + 1; |
1664 | } else { |
1665 | machine_to_phys_nr = MACH2PHYS_NR_ENTRIES; |
1666 | } |
1667 | } |
1668 | |
1669 | static void __init convert_pfn_mfn(void *v) |
1670 | { |
1671 | pte_t *pte = v; |
1672 | int i; |
1673 | |
1674 | /* All levels are converted the same way, so just treat them |
1675 | as ptes. */ |
1676 | for (i = 0; i < PTRS_PER_PTE; i++) |
1677 | pte[i] = xen_make_pte(pte[i].pte); |
1678 | } |
1679 | static void __init check_pt_base(unsigned long *pt_base, unsigned long *pt_end, |
1680 | unsigned long addr) |
1681 | { |
1682 | if (*pt_base == PFN_DOWN(__pa(addr))) { |
1683 | set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); |
1684 | clear_page((void *)addr); |
1685 | (*pt_base)++; |
1686 | } |
1687 | if (*pt_end == PFN_DOWN(__pa(addr))) { |
1688 | set_page_prot_flags((void *)addr, PAGE_KERNEL, UVMF_INVLPG); |
1689 | clear_page((void *)addr); |
1690 | (*pt_end)--; |
1691 | } |
1692 | } |
/*
 * Set up the initial kernel pagetable.
 *
 * We can construct this by grafting the Xen-provided pagetable into
 * head_64.S's preconstructed pagetables. We copy the Xen L2s into
 * level2_ident_pgt and level2_kernel_pgt. This means that only the
 * kernel has a physical mapping to start with - but that's enough to
 * get __va working. We need to fill in the rest of the physical
 * mapping once some sort of allocator has been set up.
 */
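/*
 * Roughly, the hierarchy that results from the grafting done below looks
 * like this (indices taken from the comments in the code):
 *
 *   init_top_pgt (L4)
 *     [273] -> level3_ident_pgt  -> [0]   -> level2_ident_pgt  (copy of Xen's L2)
 *     [511] -> level3_kernel_pgt -> [510] -> level2_kernel_pgt (copy of Xen's L2)
 *                                -> [511] -> level2_fixmap_pgt -> level1_fixmap_pgt
 */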
void __init xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn)
{
	pud_t *l3;
	pmd_t *l2;
	unsigned long addr[3];
	unsigned long pt_base, pt_end;
	unsigned i;

	/*
	 * max_pfn_mapped is the last pfn mapped in the initial memory
	 * mappings. Considering that on Xen after the kernel mappings we
	 * have the mappings of some pages that don't exist in pfn space, we
	 * set max_pfn_mapped to the last real pfn mapped.
	 */
	if (xen_start_info->mfn_list < __START_KERNEL_map)
		max_pfn_mapped = xen_start_info->first_p2m_pfn;
	else
		max_pfn_mapped = PFN_DOWN(__pa(xen_start_info->mfn_list));

	pt_base = PFN_DOWN(__pa(xen_start_info->pt_base));
	pt_end = pt_base + xen_start_info->nr_pt_frames;

	/* Zap identity mapping */
	init_top_pgt[0] = __pgd(0);

	/* Pre-constructed entries are in pfn, so convert to mfn */
	/* L4[273] -> level3_ident_pgt  */
	/* L4[511] -> level3_kernel_pgt */
	convert_pfn_mfn(init_top_pgt);

	/* L3_i[0] -> level2_ident_pgt */
	convert_pfn_mfn(level3_ident_pgt);
	/* L3_k[510] -> level2_kernel_pgt */
	/* L3_k[511] -> level2_fixmap_pgt */
	convert_pfn_mfn(level3_kernel_pgt);

	/* L3_k[511][508-FIXMAP_PMD_NUM ... 507] -> level1_fixmap_pgt */
	convert_pfn_mfn(level2_fixmap_pgt);

	/* We get [511][511] and have Xen's version of level2_kernel_pgt */
	l3 = m2v(pgd[pgd_index(__START_KERNEL_map)].pgd);
	l2 = m2v(l3[pud_index(__START_KERNEL_map)].pud);

	addr[0] = (unsigned long)pgd;
	addr[1] = (unsigned long)l3;
	addr[2] = (unsigned long)l2;
	/*
	 * Graft it onto L4[273][0]. Note that we are creating an aliasing
	 * problem: both L4[273][0] and L4[511][510] have entries that point
	 * to the same L2 (PMD) tables. This means that if you modify it in
	 * __va space it will also be modified in the __ka space! (But if
	 * you just modify the PMD table to point to other PTEs or none,
	 * then you are OK - which is what cleanup_highmap does.)
	 */
	copy_page(level2_ident_pgt, l2);
	/* Graft it onto L4[511][510] */
	copy_page(level2_kernel_pgt, l2);

	/*
	 * Zap execute permission from the ident map. Due to the sharing of
	 * L1 entries we need to do this in the L2.
	 */
	if (__supported_pte_mask & _PAGE_NX) {
		for (i = 0; i < PTRS_PER_PMD; ++i) {
			if (pmd_none(level2_ident_pgt[i]))
				continue;
			level2_ident_pgt[i] = pmd_set_flags(level2_ident_pgt[i], _PAGE_NX);
		}
	}

	/* Copy the initial P->M table mappings if necessary. */
	i = pgd_index(xen_start_info->mfn_list);
	if (i && i < pgd_index(__START_KERNEL_map))
		init_top_pgt[i] = ((pgd_t *)xen_start_info->pt_base)[i];

	/* Make pagetable pieces RO */
	set_page_prot(init_top_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_ident_pgt, PAGE_KERNEL_RO);
	set_page_prot(level3_kernel_pgt, PAGE_KERNEL_RO);
	set_page_prot(level2_ident_pgt, PAGE_KERNEL_RO);
	set_page_prot(level2_kernel_pgt, PAGE_KERNEL_RO);
	set_page_prot(level2_fixmap_pgt, PAGE_KERNEL_RO);

	for (i = 0; i < FIXMAP_PMD_NUM; i++) {
		set_page_prot(level1_fixmap_pgt + i * PTRS_PER_PTE,
			      PAGE_KERNEL_RO);
	}

	/* Pin down new L4 */
	pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE,
			  PFN_DOWN(__pa_symbol(init_top_pgt)));

	/* Unpin Xen-provided one */
	pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, PFN_DOWN(__pa(pgd)));

#ifdef CONFIG_X86_VSYSCALL_EMULATION
	/* Pin user vsyscall L3 */
	set_page_prot(level3_user_vsyscall, PAGE_KERNEL_RO);
	pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE,
			  PFN_DOWN(__pa_symbol(level3_user_vsyscall)));
#endif

	/*
	 * At this stage there can be no user pgd, and no page structure to
	 * attach it to, so make sure we just set kernel pgd.
	 */
	xen_mc_batch();
	__xen_write_cr3(true, __pa(init_top_pgt));
	xen_mc_issue(XEN_LAZY_CPU);

	/*
	 * We can't easily rip out the L3 and L2 pages, as the Xen pagetables
	 * are laid out like this: [L4], [L1], [L2], [L3], [L1], [L1] ... for
	 * the initial domain. For guests using the toolstack, they are in
	 * [L4], [L3], [L2], [L1], [L1] order. So for dom0 we can only rip
	 * out the [L4] (pgd), but for domU guests we shave off three pages.
	 */
	for (i = 0; i < ARRAY_SIZE(addr); i++)
		check_pt_base(&pt_base, &pt_end, addr[i]);

	/* Our (by three pages) smaller Xen pagetable that we are now using */
	xen_pt_base = PFN_PHYS(pt_base);
	xen_pt_size = (pt_end - pt_base) * PAGE_SIZE;
	memblock_reserve(xen_pt_base, xen_pt_size);

	/* Revector the xen_start_info */
	xen_start_info = (struct start_info *)__va(__pa(xen_start_info));
}

/*
 * Read a value from a physical address.
 */
static unsigned long __init xen_read_phys_ulong(phys_addr_t addr)
{
	unsigned long *vaddr;
	unsigned long val;

	vaddr = early_memremap_ro(addr, sizeof(val));
	val = *vaddr;
	early_memunmap(vaddr, sizeof(val));
	return val;
}

/*
 * Translate a virtual address to a physical one without relying on mapped
 * page tables. Don't rely on big pages being aligned in (guest) physical
 * space!
 */
static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
{
	phys_addr_t pa;
	pgd_t pgd;
	pud_t pud;
	pmd_t pmd;
	pte_t pte;

	pa = read_cr3_pa();
	pgd = native_make_pgd(xen_read_phys_ulong(pa + pgd_index(vaddr) *
						       sizeof(pgd)));
	if (!pgd_present(pgd))
		return 0;

	pa = pgd_val(pgd) & PTE_PFN_MASK;
	pud = native_make_pud(xen_read_phys_ulong(pa + pud_index(vaddr) *
						       sizeof(pud)));
	if (!pud_present(pud))
		return 0;
	pa = pud_val(pud) & PTE_PFN_MASK;
	if (pud_large(pud))
		return pa + (vaddr & ~PUD_MASK);

	pmd = native_make_pmd(xen_read_phys_ulong(pa + pmd_index(vaddr) *
						       sizeof(pmd)));
	if (!pmd_present(pmd))
		return 0;
	pa = pmd_val(pmd) & PTE_PFN_MASK;
	if (pmd_large(pmd))
		return pa + (vaddr & ~PMD_MASK);

	pte = native_make_pte(xen_read_phys_ulong(pa + pte_index(vaddr) *
						       sizeof(pte)));
	if (!pte_present(pte))
		return 0;
	pa = pte_pfn(pte) << PAGE_SHIFT;

	return pa | (vaddr & ~PAGE_MASK);
}

/*
 * Find a new area for the hypervisor-supplied p2m list and relocate the
 * p2m to this area.
 */
void __init xen_relocate_p2m(void)
{
	phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
	unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
	int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
	pte_t *pt;
	pmd_t *pmd;
	pud_t *pud;
	pgd_t *pgd;
	unsigned long *new_p2m;

	size = PAGE_ALIGN(xen_start_info->nr_pages * sizeof(unsigned long));
	n_pte = roundup(size, PAGE_SIZE) >> PAGE_SHIFT;
	n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
	n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
	n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
	n_frames = n_pte + n_pt + n_pmd + n_pud;

	new_area = xen_find_free_area(PFN_PHYS(n_frames));
	if (!new_area) {
		xen_raw_console_write("Can't find new memory area for p2m needed due to E820 map conflict\n");
		BUG();
	}

	/*
	 * Set up the page tables for addressing the new p2m list.
	 * We have asked the hypervisor to map the p2m list at the user address
	 * PUD_SIZE. It may have done so, or it may have used a kernel space
	 * address depending on the Xen version.
	 * To avoid any possible virtual address collision, just use
	 * 2 * PUD_SIZE for the new area.
	 */
	pud_phys = new_area;
	pmd_phys = pud_phys + PFN_PHYS(n_pud);
	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;

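	/*
	 * The new area is laid out as: [n_pud PUD frames][n_pmd PMD frames]
	 * [n_pt PT frames][the p2m data itself, starting at p2m_pfn].
	 * The loop below wires these frames into the page tables at the
	 * virtual address new_p2m, making each pagetable frame read-only
	 * and pinning it before hooking it into its parent level.
	 */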
	pgd = __va(read_cr3_pa());
	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
	for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
		pud = early_memremap(pud_phys, PAGE_SIZE);
		clear_page(pud);
		for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
		     idx_pmd++) {
			pmd = early_memremap(pmd_phys, PAGE_SIZE);
			clear_page(pmd);
			for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
			     idx_pt++) {
				pt = early_memremap(pt_phys, PAGE_SIZE);
				clear_page(pt);
				for (idx_pte = 0;
				     idx_pte < min(n_pte, PTRS_PER_PTE);
				     idx_pte++) {
					pt[idx_pte] = pfn_pte(p2m_pfn,
							      PAGE_KERNEL);
					p2m_pfn++;
				}
				n_pte -= PTRS_PER_PTE;
				early_memunmap(pt, PAGE_SIZE);
				make_lowmem_page_readonly(__va(pt_phys));
				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
						  PFN_DOWN(pt_phys));
				pmd[idx_pt] = __pmd(_PAGE_TABLE | pt_phys);
				pt_phys += PAGE_SIZE;
			}
			n_pt -= PTRS_PER_PMD;
			early_memunmap(pmd, PAGE_SIZE);
			make_lowmem_page_readonly(__va(pmd_phys));
			pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
					  PFN_DOWN(pmd_phys));
			pud[idx_pmd] = __pud(_PAGE_TABLE | pmd_phys);
			pmd_phys += PAGE_SIZE;
		}
		n_pmd -= PTRS_PER_PUD;
		early_memunmap(pud, PAGE_SIZE);
		make_lowmem_page_readonly(__va(pud_phys));
		pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
		set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
		pud_phys += PAGE_SIZE;
	}

	/* Now copy the old p2m info to the new area. */
	memcpy(new_p2m, xen_p2m_addr, size);
	xen_p2m_addr = new_p2m;

	/* Release the old p2m list and set new list info. */
	p2m_pfn = PFN_DOWN(xen_early_virt_to_phys(xen_start_info->mfn_list));
	BUG_ON(!p2m_pfn);
	p2m_pfn_end = p2m_pfn + PFN_DOWN(size);

	if (xen_start_info->mfn_list < __START_KERNEL_map) {
		pfn = xen_start_info->first_p2m_pfn;
		pfn_end = xen_start_info->first_p2m_pfn +
			  xen_start_info->nr_p2m_frames;
		set_pgd(pgd + 1, __pgd(0));
	} else {
		pfn = p2m_pfn;
		pfn_end = p2m_pfn_end;
	}

	memblock_phys_free(PFN_PHYS(pfn), PAGE_SIZE * (pfn_end - pfn));
	while (pfn < pfn_end) {
		if (pfn == p2m_pfn) {
			pfn = p2m_pfn_end;
			continue;
		}
		make_lowmem_page_readwrite(__va(PFN_PHYS(pfn)));
		pfn++;
	}

	xen_start_info->mfn_list = (unsigned long)xen_p2m_addr;
	xen_start_info->first_p2m_pfn = PFN_DOWN(new_area);
	xen_start_info->nr_p2m_frames = n_frames;
}

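/*
 * Reserve the pages handed to us by the hypervisor that are still needed
 * after boot: the start_info page, the Xenstore ring page (store_mfn)
 * and, for non-initial domains, the console ring page. Reserving them in
 * memblock keeps them away from the page allocator.
 */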
void __init xen_reserve_special_pages(void)
{
	phys_addr_t paddr;

	memblock_reserve(__pa(xen_start_info), PAGE_SIZE);
	if (xen_start_info->store_mfn) {
		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->store_mfn));
		memblock_reserve(paddr, PAGE_SIZE);
	}
	if (!xen_initial_domain()) {
		paddr = PFN_PHYS(mfn_to_pfn(xen_start_info->console.domU.mfn));
		memblock_reserve(paddr, PAGE_SIZE);
	}
}

void __init xen_pt_check_e820(void)
{
	if (xen_is_e820_reserved(xen_pt_base, xen_pt_size)) {
		xen_raw_console_write("Xen hypervisor allocated page table memory conflicts with E820 map\n");
		BUG();
	}
}

static unsigned char dummy_mapping[PAGE_SIZE] __page_aligned_bss;

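/*
 * Fixmap slots normally map hardware (machine) addresses, so the default
 * case below installs an mfn-based pte. A few slots are backed by
 * ordinary guest RAM instead: the boot-time fixmaps and the vsyscall page
 * use the passed-in pfn directly, and the (IO-)APIC slots are pointed at
 * a dummy page since all APIC access goes through hypercalls.
 */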
static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
{
	pte_t pte;
	unsigned long vaddr;

	phys >>= PAGE_SHIFT;

	switch (idx) {
	case FIX_BTMAP_END ... FIX_BTMAP_BEGIN:
#ifdef CONFIG_X86_VSYSCALL_EMULATION
	case VSYSCALL_PAGE:
#endif
		/* All local page mappings */
		pte = pfn_pte(phys, prot);
		break;

#ifdef CONFIG_X86_LOCAL_APIC
	case FIX_APIC_BASE:	/* maps dummy local APIC */
		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
		break;
#endif

#ifdef CONFIG_X86_IO_APIC
	case FIX_IO_APIC_BASE_0 ... FIX_IO_APIC_BASE_END:
		/*
		 * We just don't map the IO APIC - all access is via
		 * hypercalls. Keep the address in the pte for reference.
		 */
		pte = pfn_pte(PFN_DOWN(__pa(dummy_mapping)), PAGE_KERNEL);
		break;
#endif

	case FIX_PARAVIRT_BOOTMAP:
		/*
		 * This is an MFN, but it isn't an IO mapping from the
		 * IO domain.
		 */
		pte = mfn_pte(phys, prot);
		break;

	default:
		/* By default, set_fixmap is used for hardware mappings */
		pte = mfn_pte(phys, prot);
		break;
	}

	vaddr = __fix_to_virt(idx);
	if (HYPERVISOR_update_va_mapping(vaddr, pte, UVMF_INVLPG))
		BUG();

#ifdef CONFIG_X86_VSYSCALL_EMULATION
	/*
	 * Replicate changes to map the vsyscall page into the user
	 * pagetable vsyscall mapping.
	 */
	if (idx == VSYSCALL_PAGE)
		set_pte_vaddr_pud(level3_user_vsyscall, vaddr, pte);
#endif
}

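/*
 * While XEN_LAZY_MMU mode is active, pagetable updates are queued in the
 * multicall buffer and submitted to the hypervisor in batches instead of
 * as individual hypercalls; leaving (or flushing) the mode pushes out
 * whatever is still pending.
 */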
static void xen_enter_lazy_mmu(void)
{
	enter_lazy(XEN_LAZY_MMU);
}

static void xen_flush_lazy_mmu(void)
{
	preempt_disable();

	if (xen_get_lazy_mode() == XEN_LAZY_MMU) {
		arch_leave_lazy_mmu_mode();
		arch_enter_lazy_mmu_mode();
	}

	preempt_enable();
}

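/*
 * Once the boot-time allocator is up, switch from the early "_init"
 * variants of the MMU pv_ops to the final implementations, which batch
 * updates and handle pinning of newly allocated pagetable pages.
 */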
static void __init xen_post_allocator_init(void)
{
	pv_ops.mmu.set_pte = xen_set_pte;
	pv_ops.mmu.set_pmd = xen_set_pmd;
	pv_ops.mmu.set_pud = xen_set_pud;
	pv_ops.mmu.set_p4d = xen_set_p4d;

	/*
	 * This will work as long as patching hasn't happened yet
	 * (which it hasn't).
	 */
	pv_ops.mmu.alloc_pte = xen_alloc_pte;
	pv_ops.mmu.alloc_pmd = xen_alloc_pmd;
	pv_ops.mmu.release_pte = xen_release_pte;
	pv_ops.mmu.release_pmd = xen_release_pmd;
	pv_ops.mmu.alloc_pud = xen_alloc_pud;
	pv_ops.mmu.release_pud = xen_release_pud;
	pv_ops.mmu.make_pte = PV_CALLEE_SAVE(xen_make_pte);

	pv_ops.mmu.write_cr3 = &xen_write_cr3;
}

static void xen_leave_lazy_mmu(void)
{
	preempt_disable();
	xen_mc_flush();
	leave_lazy(XEN_LAZY_MMU);
	preempt_enable();
}

static const typeof(pv_ops) xen_mmu_ops __initconst = {
	.mmu = {
		.read_cr2 = __PV_IS_CALLEE_SAVE(xen_read_cr2),
		.write_cr2 = xen_write_cr2,

		.read_cr3 = xen_read_cr3,
		.write_cr3 = xen_write_cr3_init,

		.flush_tlb_user = xen_flush_tlb,
		.flush_tlb_kernel = xen_flush_tlb,
		.flush_tlb_one_user = xen_flush_tlb_one_user,
		.flush_tlb_multi = xen_flush_tlb_multi,
		.tlb_remove_table = tlb_remove_table,

		.pgd_alloc = xen_pgd_alloc,
		.pgd_free = xen_pgd_free,

		.alloc_pte = xen_alloc_pte_init,
		.release_pte = xen_release_pte_init,
		.alloc_pmd = xen_alloc_pmd_init,
		.release_pmd = xen_release_pmd_init,

		.set_pte = xen_set_pte_init,
		.set_pmd = xen_set_pmd_hyper,

		.ptep_modify_prot_start = xen_ptep_modify_prot_start,
		.ptep_modify_prot_commit = xen_ptep_modify_prot_commit,

		.pte_val = PV_CALLEE_SAVE(xen_pte_val),
		.pgd_val = PV_CALLEE_SAVE(xen_pgd_val),

		.make_pte = PV_CALLEE_SAVE(xen_make_pte_init),
		.make_pgd = PV_CALLEE_SAVE(xen_make_pgd),

		.set_pud = xen_set_pud_hyper,

		.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
		.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),

		.pud_val = PV_CALLEE_SAVE(xen_pud_val),
		.make_pud = PV_CALLEE_SAVE(xen_make_pud),
		.set_p4d = xen_set_p4d_hyper,

		.alloc_pud = xen_alloc_pmd_init,
		.release_pud = xen_release_pmd_init,

#if CONFIG_PGTABLE_LEVELS >= 5
		.p4d_val = PV_CALLEE_SAVE(xen_p4d_val),
		.make_p4d = PV_CALLEE_SAVE(xen_make_p4d),
#endif

		.enter_mmap = xen_enter_mmap,
		.exit_mmap = xen_exit_mmap,

		.lazy_mode = {
			.enter = xen_enter_lazy_mmu,
			.leave = xen_leave_lazy_mmu,
			.flush = xen_flush_lazy_mmu,
		},

		.set_fixmap = xen_set_fixmap,
	},
};

void __init xen_init_mmu_ops(void)
{
	x86_init.paging.pagetable_init = xen_pagetable_init;
	x86_init.hyper.init_after_bootmem = xen_after_bootmem;

	pv_ops.mmu = xen_mmu_ops.mmu;

	memset(dummy_mapping, 0xff, PAGE_SIZE);
}

/* Protected by xen_reservation_lock. */
#define MAX_CONTIG_ORDER 9 /* 2MB */
static unsigned long discontig_frames[1<<MAX_CONTIG_ORDER];

#define VOID_PTE (mfn_pte(0, __pgprot(0)))
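/*
 * Clear every pte in the 1 << @order range starting at @vaddr: each pte is
 * replaced by VOID_PTE via a batched multicall and the pfn's p2m entry is
 * invalidated. The machine frames previously backing the range are
 * optionally recorded in @in_frames, the freed pfns in @out_frames, so
 * that the caller can exchange them with the hypervisor afterwards.
 */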
static void xen_zap_pfn_range(unsigned long vaddr, unsigned int order,
			      unsigned long *in_frames,
			      unsigned long *out_frames)
{
	int i;
	struct multicall_space mcs;

	xen_mc_batch();
	for (i = 0; i < (1UL<<order); i++, vaddr += PAGE_SIZE) {
		mcs = __xen_mc_entry(0);

		if (in_frames)
			in_frames[i] = virt_to_mfn((void *)vaddr);

		MULTI_update_va_mapping(mcs.mc, vaddr, VOID_PTE, 0);
		__set_phys_to_machine(virt_to_pfn((void *)vaddr), INVALID_P2M_ENTRY);

		if (out_frames)
			out_frames[i] = virt_to_pfn((void *)vaddr);
	}
	xen_mc_issue(0);
}

/*
 * Update the pfn-to-mfn mappings for a virtual address range, either to
 * point to an array of mfns, or contiguously from a single starting
 * mfn.
 */
static void xen_remap_exchanged_ptes(unsigned long vaddr, int order,
				     unsigned long *mfns,
				     unsigned long first_mfn)
{
	unsigned i, limit;
	unsigned long mfn;

	xen_mc_batch();

	limit = 1u << order;
	for (i = 0; i < limit; i++, vaddr += PAGE_SIZE) {
		struct multicall_space mcs;
		unsigned flags;

		mcs = __xen_mc_entry(0);
		if (mfns)
			mfn = mfns[i];
		else
			mfn = first_mfn + i;

		if (i < (limit - 1))
			flags = 0;
		else {
			if (order == 0)
				flags = UVMF_INVLPG | UVMF_ALL;
			else
				flags = UVMF_TLB_FLUSH | UVMF_ALL;
		}

		MULTI_update_va_mapping(mcs.mc, vaddr,
					mfn_pte(mfn, PAGE_KERNEL), flags);

		set_phys_to_machine(virt_to_pfn((void *)vaddr), mfn);
	}

	xen_mc_issue(0);
}

/*
 * Perform the hypercall to exchange a region of our pfns to point to
 * memory with the required contiguous alignment. Takes the pfns as
 * input, and populates mfns as output.
 *
 * Returns a success code indicating whether the hypervisor was able to
 * satisfy the request or not.
 */
static int xen_exchange_memory(unsigned long extents_in, unsigned int order_in,
			       unsigned long *pfns_in,
			       unsigned long extents_out,
			       unsigned int order_out,
			       unsigned long *mfns_out,
			       unsigned int address_bits)
{
	long rc;
	int success;

	struct xen_memory_exchange exchange = {
		.in = {
			.nr_extents   = extents_in,
			.extent_order = order_in,
			.extent_start = pfns_in,
			.domid        = DOMID_SELF
		},
		.out = {
			.nr_extents   = extents_out,
			.extent_order = order_out,
			.extent_start = mfns_out,
			.address_bits = address_bits,
			.domid        = DOMID_SELF
		}
	};

	BUG_ON(extents_in << order_in != extents_out << order_out);

	rc = HYPERVISOR_memory_op(XENMEM_exchange, &exchange);
	success = (exchange.nr_exchanged == extents_in);

	BUG_ON(!success && ((exchange.nr_exchanged != 0) || (rc == 0)));
	BUG_ON(success && (rc != 0));

	return success;
}

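/*
 * Make the 1 << @order pages starting at @pstart machine-contiguous and
 * addressable within @address_bits: zap the current ptes, exchange the
 * backing frames for one contiguous extent, map that extent back in place
 * and report its machine address via @dma_handle. Typically used to set
 * up DMA-able buffers (e.g. by swiotlb-xen). On failure the original
 * frames are mapped back and -ENOMEM is returned.
 */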
int xen_create_contiguous_region(phys_addr_t pstart, unsigned int order,
				 unsigned int address_bits,
				 dma_addr_t *dma_handle)
{
	unsigned long *in_frames = discontig_frames, out_frame;
	unsigned long flags;
	int success;
	unsigned long vstart = (unsigned long)phys_to_virt(pstart);

	if (unlikely(order > MAX_CONTIG_ORDER))
		return -ENOMEM;

	memset((void *) vstart, 0, PAGE_SIZE << order);

	spin_lock_irqsave(&xen_reservation_lock, flags);

	/* 1. Zap current PTEs, remembering MFNs. */
	xen_zap_pfn_range(vstart, order, in_frames, NULL);

	/* 2. Get a new contiguous memory extent. */
	out_frame = virt_to_pfn((void *)vstart);
	success = xen_exchange_memory(1UL << order, 0, in_frames,
				      1, order, &out_frame,
				      address_bits);

	/* 3. Map the new extent in place of old pages. */
	if (success)
		xen_remap_exchanged_ptes(vstart, order, NULL, out_frame);
	else
		xen_remap_exchanged_ptes(vstart, order, in_frames, 0);

	spin_unlock_irqrestore(&xen_reservation_lock, flags);

	*dma_handle = virt_to_machine(vstart).maddr;
	return success ? 0 : -ENOMEM;
}

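/*
 * Reverse of xen_create_contiguous_region(): give the machine-contiguous
 * extent backing @pstart back to the hypervisor and replace it with
 * arbitrary single frames again.
 */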
void xen_destroy_contiguous_region(phys_addr_t pstart, unsigned int order)
{
	unsigned long *out_frames = discontig_frames, in_frame;
	unsigned long flags;
	int success;
	unsigned long vstart;

	if (unlikely(order > MAX_CONTIG_ORDER))
		return;

	vstart = (unsigned long)phys_to_virt(pstart);
	memset((void *) vstart, 0, PAGE_SIZE << order);

	spin_lock_irqsave(&xen_reservation_lock, flags);

	/* 1. Find start MFN of contiguous extent. */
	in_frame = virt_to_mfn((void *)vstart);

	/* 2. Zap current PTEs. */
	xen_zap_pfn_range(vstart, order, NULL, out_frames);

	/* 3. Do the exchange for non-contiguous MFNs. */
	success = xen_exchange_memory(1, order, &in_frame, 1UL << order,
				      0, out_frames, 0);

	/* 4. Map new pages in place of old pages. */
	if (success)
		xen_remap_exchanged_ptes(vstart, order, out_frames, 0);
	else
		xen_remap_exchanged_ptes(vstart, order, NULL, in_frame);

	spin_unlock_irqrestore(&xen_reservation_lock, flags);
}

static noinline void xen_flush_tlb_all(void)
{
	struct mmuext_op *op;
	struct multicall_space mcs;

	preempt_disable();

	mcs = xen_mc_entry(sizeof(*op));

	op = mcs.args;
	op->cmd = MMUEXT_TLB_FLUSH_ALL;
	MULTI_mmuext_op(mcs.mc, op, 1, NULL, DOMID_SELF);

	xen_mc_issue(XEN_LAZY_MMU);

	preempt_enable();
}

#define REMAP_BATCH_SIZE 16

struct remap_data {
	xen_pfn_t *pfn;
	bool contiguous;
	bool no_translate;
	pgprot_t prot;
	struct mmu_update *mmu_update;
};

static int remap_area_pfn_pte_fn(pte_t *ptep, unsigned long addr, void *data)
{
	struct remap_data *rmd = data;
	pte_t pte = pte_mkspecial(mfn_pte(*rmd->pfn, rmd->prot));

	/*
	 * If we have a contiguous range, just update the pfn itself,
	 * else update pointer to be "next pfn".
	 */
	if (rmd->contiguous)
		(*rmd->pfn)++;
	else
		rmd->pfn++;

	rmd->mmu_update->ptr = virt_to_machine(ptep).maddr;
	rmd->mmu_update->ptr |= rmd->no_translate ?
		MMU_PT_UPDATE_NO_TRANSLATE :
		MMU_NORMAL_PT_UPDATE;
	rmd->mmu_update->val = pte_val_ma(pte);
	rmd->mmu_update++;

	return 0;
}

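/*
 * Map @nr frames, described by the @pfn array, into @vma starting at @addr
 * on behalf of @domid. The work is done in chunks of REMAP_BATCH_SIZE via
 * HYPERVISOR_mmu_update. When @err_ptr is NULL the frames are treated as
 * one contiguous range starting at *@pfn; otherwise a per-frame error code
 * is stored in @err_ptr and mapping continues past failed frames. Returns
 * the number of successfully mapped frames, or a negative error.
 */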
int xen_remap_pfn(struct vm_area_struct *vma, unsigned long addr,
		  xen_pfn_t *pfn, int nr, int *err_ptr, pgprot_t prot,
		  unsigned int domid, bool no_translate)
{
	int err = 0;
	struct remap_data rmd;
	struct mmu_update mmu_update[REMAP_BATCH_SIZE];
	unsigned long range;
	int mapped = 0;

	BUG_ON(!((vma->vm_flags & (VM_PFNMAP | VM_IO)) == (VM_PFNMAP | VM_IO)));

	rmd.pfn = pfn;
	rmd.prot = prot;
	/*
	 * We use err_ptr to indicate whether we are doing a contiguous
	 * mapping or a discontiguous mapping.
	 */
	rmd.contiguous = !err_ptr;
	rmd.no_translate = no_translate;

	while (nr) {
		int index = 0;
		int done = 0;
		int batch = min(REMAP_BATCH_SIZE, nr);
		int batch_left = batch;

		range = (unsigned long)batch << PAGE_SHIFT;

		rmd.mmu_update = mmu_update;
		err = apply_to_page_range(vma->vm_mm, addr, range,
					  remap_area_pfn_pte_fn, &rmd);
		if (err)
			goto out;

		/*
		 * We record the error for each page that gives an error, but
		 * continue mapping until the whole set is done.
		 */
		do {
			int i;

			err = HYPERVISOR_mmu_update(&mmu_update[index],
						    batch_left, &done, domid);

			/*
			 * @err_ptr may be the same buffer as @gfn, so
			 * only clear it after each chunk of @gfn is
			 * used.
			 */
			if (err_ptr) {
				for (i = index; i < index + done; i++)
					err_ptr[i] = 0;
			}
			if (err < 0) {
				if (!err_ptr)
					goto out;
				err_ptr[i] = err;
				done++; /* Skip failed frame. */
			} else
				mapped += done;
			batch_left -= done;
			index += done;
		} while (batch_left);

		nr -= batch;
		addr += range;
		if (err_ptr)
			err_ptr += batch;
		cond_resched();
	}
out:

	xen_flush_tlb_all();

	return err < 0 ? err : mapped;
}
EXPORT_SYMBOL_GPL(xen_remap_pfn);

#ifdef CONFIG_KEXEC_CORE
phys_addr_t paddr_vmcoreinfo_note(void)
{
	if (xen_pv_domain())
		return virt_to_machine(vmcoreinfo_note).maddr;
	else
		return __pa(vmcoreinfo_note);
}
#endif /* CONFIG_KEXEC_CORE */