mremap.c source code [linux/mm/mremap.c]

1	// SPDX-License-Identifier: GPL-2.0
2	/*
3	* mm/mremap.c
4	*
5	* (C) Copyright 1996 Linus Torvalds
6	*
7	* Address space accounting code <alan@lxorguk.ukuu.org.uk>
8	* (C) Copyright 2002 Red Hat Inc, All Rights Reserved
9	*/
10
11	#include <linux/mm.h>
12	#include <linux/mm_inline.h>
13	#include <linux/hugetlb.h>
14	#include <linux/shm.h>
15	#include <linux/ksm.h>
16	#include <linux/mman.h>
17	#include <linux/swap.h>
18	#include <linux/capability.h>
19	#include <linux/fs.h>
20	#include <linux/swapops.h>
21	#include <linux/highmem.h>
22	#include <linux/security.h>
23	#include <linux/syscalls.h>
24	#include <linux/mmu_notifier.h>
25	#include <linux/uaccess.h>
26	#include <linux/userfaultfd_k.h>
27	#include <linux/mempolicy.h>
28
29	#include <asm/cacheflush.h>
30	#include <asm/tlb.h>
31	#include <asm/pgalloc.h>
32
33	#include "internal.h"
34
35	static pud_t get_old_pud(struct* mm_struct mm, unsigned* long addr)
36	{
37	pgd_t *pgd;
38	p4d_t *p4d;
39	pud_t *pud;
40
41	pgd = pgd_offset(mm, addr);
42	if (pgd_none_or_clear_bad(pgd))
43	return NULL;
44
45	p4d = p4d_offset(pgd, address: addr);
46	if (p4d_none_or_clear_bad(p4d))
47	return NULL;
48
49	pud = pud_offset(p4d, address: addr);
50	if (pud_none_or_clear_bad(pud))
51	return NULL;
52
53	return pud;
54	}
55
56	static pmd_t get_old_pmd(struct* mm_struct mm, unsigned* long addr)
57	{
58	pud_t *pud;
59	pmd_t *pmd;
60
61	pud = get_old_pud(mm, addr);
62	if (!pud)
63	return NULL;
64
65	pmd = pmd_offset(pud, address: addr);
66	if (pmd_none(pmd: *pmd))
67	return NULL;
68
69	return pmd;
70	}
71
72	static pud_t alloc_new_pud(struct* mm_struct mm, struct* vm_area_struct *vma,
73	unsigned long addr)
74	{
75	pgd_t *pgd;
76	p4d_t *p4d;
77
78	pgd = pgd_offset(mm, addr);
79	p4d = p4d_alloc(mm, pgd, address: addr);
80	if (!p4d)
81	return NULL;
82
83	return pud_alloc(mm, p4d, address: addr);
84	}
85
86	static pmd_t alloc_new_pmd(struct* mm_struct mm, struct* vm_area_struct *vma,
87	unsigned long addr)
88	{
89	pud_t *pud;
90	pmd_t *pmd;
91
92	pud = alloc_new_pud(mm, vma, addr);
93	if (!pud)
94	return NULL;
95
96	pmd = pmd_alloc(mm, pud, address: addr);
97	if (!pmd)
98	return NULL;
99
100	VM_BUG_ON(pmd_trans_huge(*pmd));
101
102	return pmd;
103	}
104
105	static void take_rmap_locks(struct vm_area_struct *vma)
106	{
107	if (vma->vm_file)
108	i_mmap_lock_write(mapping: vma->vm_file->f_mapping);
109	if (vma->anon_vma)
110	anon_vma_lock_write(anon_vma: vma->anon_vma);
111	}
112
113	static void drop_rmap_locks(struct vm_area_struct *vma)
114	{
115	if (vma->anon_vma)
116	anon_vma_unlock_write(anon_vma: vma->anon_vma);
117	if (vma->vm_file)
118	i_mmap_unlock_write(mapping: vma->vm_file->f_mapping);
119	}
120
121	static pte_t move_soft_dirty_pte(pte_t pte)
122	{
123	/*
124	* Set soft dirty bit so we can notice
125	* in userspace the ptes were moved.
126	*/
127	#ifdef CONFIG_MEM_SOFT_DIRTY
128	if (pte_present(a: pte))
129	pte = pte_mksoft_dirty(pte);
130	else if (is_swap_pte(pte))
131	pte = pte_swp_mksoft_dirty(pte);
132	#endif
133	return pte;
134	}
135
136	static int move_ptes(struct vm_area_struct vma, pmd_t old_pmd,
137	unsigned long old_addr, unsigned long old_end,
138	struct vm_area_struct new_vma, pmd_t new_pmd,
139	unsigned long new_addr, bool need_rmap_locks)
140	{
141	struct mm_struct *mm = vma->vm_mm;
142	pte_t old_pte, new_pte, pte;
143	spinlock_t old_ptl, new_ptl;
144	bool force_flush = false;
145	unsigned long len = old_end - old_addr;
146	int err = `0`;
147
148	/*
149	* When need_rmap_locks is true, we take the i_mmap_rwsem and anon_vma
150	* locks to ensure that rmap will always observe either the old or the
151	* new ptes. This is the easiest way to avoid races with
152	* truncate_pagecache(), page migration, etc...
153	*
154	* When need_rmap_locks is false, we use other ways to avoid
155	* such races:
156	*
157	* - During exec() shift_arg_pages(), we use a specially tagged vma
158	* which rmap call sites look for using vma_is_temporary_stack().
159	*
160	* - During mremap(), new_vma is often known to be placed after vma
161	* in rmap traversal order. This ensures rmap will always observe
162	* either the old pte, or the new pte, or both (the page table locks
163	* serialize access to individual ptes, but only rmap traversal
164	* order guarantees that we won't miss both the old and new ptes).
165	*/
166	if (need_rmap_locks)
167	take_rmap_locks(vma);
168
169	/*
170	* We don't have to worry about the ordering of src and dst
171	* pte locks because exclusive mmap_lock prevents deadlock.
172	*/
173	old_pte = pte_offset_map_lock(mm, pmd: old_pmd, addr: old_addr, ptlp: &old_ptl);
174	if (!old_pte) {
175	err = -EAGAIN;
176	goto out;
177	}
178	new_pte = pte_offset_map_nolock(mm, pmd: new_pmd, addr: new_addr, ptlp: &new_ptl);
179	if (!new_pte) {
180	pte_unmap_unlock(old_pte, old_ptl);
181	err = -EAGAIN;
182	goto out;
183	}
184	if (new_ptl != old_ptl)
185	spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
186	flush_tlb_batched_pending(mm: vma->vm_mm);
187	arch_enter_lazy_mmu_mode();
188
189	for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
190	new_pte++, new_addr += PAGE_SIZE) {
191	if (pte_none(pte: ptep_get(ptep: old_pte)))
192	continue;
193
194	pte = ptep_get_and_clear(mm, addr: old_addr, ptep: old_pte);
195	/*
196	* If we are remapping a valid PTE, make sure
197	* to flush TLB before we drop the PTL for the
198	* PTE.
199	*
200	* NOTE! Both old and new PTL matter: the old one
201	* for racing with page_mkclean(), the new one to
202	* make sure the physical page stays valid until
203	* the TLB entry for the old mapping has been
204	* flushed.
205	*/
206	if (pte_present(a: pte))
207	force_flush = true;
208	pte = move_pte(pte, new_vma->vm_page_prot, old_addr, new_addr);
209	pte = move_soft_dirty_pte(pte);
210	set_pte_at(mm, new_addr, new_pte, pte);
211	}
212
213	arch_leave_lazy_mmu_mode();
214	if (force_flush)
215	flush_tlb_range(vma, old_end - len, old_end);
216	if (new_ptl != old_ptl)
217	spin_unlock(lock: new_ptl);
218	pte_unmap(pte: new_pte - `1`);
219	pte_unmap_unlock(old_pte - `1`, old_ptl);
220	out:
221	if (need_rmap_locks)
222	drop_rmap_locks(vma);
223	return err;
224	}
225
226	#ifndef arch_supports_page_table_move
227	#define arch_supports_page_table_move arch_supports_page_table_move
228	static inline bool arch_supports_page_table_move(void)
229	{
230	return IS_ENABLED(CONFIG_HAVE_MOVE_PMD) \|\|
231	IS_ENABLED(CONFIG_HAVE_MOVE_PUD);
232	}
233	#endif
234
235	#ifdef CONFIG_HAVE_MOVE_PMD
236	static bool move_normal_pmd(struct vm_area_struct vma, unsigned* long old_addr,
237	unsigned long new_addr, pmd_t old_pmd, pmd_t new_pmd)
238	{
239	spinlock_t old_ptl, new_ptl;
240	struct mm_struct *mm = vma->vm_mm;
241	pmd_t pmd;
242
243	if (!arch_supports_page_table_move())
244	return false;
245	/*
246	* The destination pmd shouldn't be established, free_pgtables()
247	* should have released it.
248	*
249	* However, there's a case during execve() where we use mremap
250	* to move the initial stack, and in that case the target area
251	* may overlap the source area (always moving down).
252	*
253	* If everything is PMD-aligned, that works fine, as moving
254	* each pmd down will clear the source pmd. But if we first
255	* have a few 4kB-only pages that get moved down, and then
256	* hit the "now the rest is PMD-aligned, let's do everything
257	* one pmd at a time", we will still have the old (now empty
258	* of any 4kB pages, but still there) PMD in the page table
259	* tree.
260	*
261	* Warn on it once - because we really should try to figure
262	* out how to do this better - but then say "I won't move
263	* this pmd".
264	*
265	* One alternative might be to just unmap the target pmd at
266	* this point, and verify that it really is empty. We'll see.
267	*/
268	if (WARN_ON_ONCE(!pmd_none(*new_pmd)))
269	return false;
270
271	/*
272	* We don't have to worry about the ordering of src and dst
273	* ptlocks because exclusive mmap_lock prevents deadlock.
274	*/
275	old_ptl = pmd_lock(mm: vma->vm_mm, pmd: old_pmd);
276	new_ptl = pmd_lockptr(mm, pmd: new_pmd);
277	if (new_ptl != old_ptl)
278	spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
279
280	/ Clear the pmd /
281	pmd = *old_pmd;
282	pmd_clear(pmdp: old_pmd);
283
284	VM_BUG_ON(!pmd_none(*new_pmd));
285
286	pmd_populate(mm, pmd: new_pmd, pmd_pgtable(pmd));
287	flush_tlb_range(vma, old_addr, old_addr + PMD_SIZE);
288	if (new_ptl != old_ptl)
289	spin_unlock(lock: new_ptl);
290	spin_unlock(lock: old_ptl);
291
292	return true;
293	}
294	#else
295	static inline bool move_normal_pmd(struct vm_area_struct *vma,
296	unsigned long old_addr, unsigned long new_addr, pmd_t *old_pmd,
297	pmd_t *new_pmd)
298	{
299	return false;
300	}
301	#endif
302
303	#if CONFIG_PGTABLE_LEVELS > 2 && defined(CONFIG_HAVE_MOVE_PUD)
304	static bool move_normal_pud(struct vm_area_struct vma, unsigned* long old_addr,
305	unsigned long new_addr, pud_t old_pud, pud_t new_pud)
306	{
307	spinlock_t old_ptl, new_ptl;
308	struct mm_struct *mm = vma->vm_mm;
309	pud_t pud;
310
311	if (!arch_supports_page_table_move())
312	return false;
313	/*
314	* The destination pud shouldn't be established, free_pgtables()
315	* should have released it.
316	*/
317	if (WARN_ON_ONCE(!pud_none(*new_pud)))
318	return false;
319
320	/*
321	* We don't have to worry about the ordering of src and dst
322	* ptlocks because exclusive mmap_lock prevents deadlock.
323	*/
324	old_ptl = pud_lock(mm: vma->vm_mm, pud: old_pud);
325	new_ptl = pud_lockptr(mm, pud: new_pud);
326	if (new_ptl != old_ptl)
327	spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
328
329	/ Clear the pud /
330	pud = *old_pud;
331	pud_clear(pudp: old_pud);
332
333	VM_BUG_ON(!pud_none(*new_pud));
334
335	pud_populate(mm, pud: new_pud, pmd: pud_pgtable(pud));
336	flush_tlb_range(vma, old_addr, old_addr + PUD_SIZE);
337	if (new_ptl != old_ptl)
338	spin_unlock(lock: new_ptl);
339	spin_unlock(lock: old_ptl);
340
341	return true;
342	}
343	#else
344	static inline bool move_normal_pud(struct vm_area_struct *vma,
345	unsigned long old_addr, unsigned long new_addr, pud_t *old_pud,
346	pud_t *new_pud)
347	{
348	return false;
349	}
350	#endif
351
352	#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && defined(CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
353	static bool move_huge_pud(struct vm_area_struct vma, unsigned* long old_addr,
354	unsigned long new_addr, pud_t old_pud, pud_t new_pud)
355	{
356	spinlock_t old_ptl, new_ptl;
357	struct mm_struct *mm = vma->vm_mm;
358	pud_t pud;
359
360	/*
361	* The destination pud shouldn't be established, free_pgtables()
362	* should have released it.
363	*/
364	if (WARN_ON_ONCE(!pud_none(*new_pud)))
365	return false;
366
367	/*
368	* We don't have to worry about the ordering of src and dst
369	* ptlocks because exclusive mmap_lock prevents deadlock.
370	*/
371	old_ptl = pud_lock(mm: vma->vm_mm, pud: old_pud);
372	new_ptl = pud_lockptr(mm, pud: new_pud);
373	if (new_ptl != old_ptl)
374	spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
375
376	/ Clear the pud /
377	pud = *old_pud;
378	pud_clear(pudp: old_pud);
379
380	VM_BUG_ON(!pud_none(*new_pud));
381
382	/ Set the new pud /
383	/ mark soft_ditry when we add pud level soft dirty support /
384	set_pud_at(mm, addr: new_addr, pudp: new_pud, pud);
385	flush_pud_tlb_range(vma, old_addr, old_addr + HPAGE_PUD_SIZE);
386	if (new_ptl != old_ptl)
387	spin_unlock(lock: new_ptl);
388	spin_unlock(lock: old_ptl);
389
390	return true;
391	}
392	#else
393	static bool move_huge_pud(struct vm_area_struct vma, unsigned* long old_addr,
394	unsigned long new_addr, pud_t old_pud, pud_t new_pud)
395	{
396	WARN_ON_ONCE(`1`);
397	return false;
398
399	}
400	#endif
401
/*
 * Page-table level/kind at which move_pgt_entry() is asked to move an
 * entry: a normal (table-pointing) or huge-page entry, at PMD or PUD level.
 */
enum pgt_entry {
	NORMAL_PMD,
	HPAGE_PMD,
	NORMAL_PUD,
	HPAGE_PUD,
};
408
409	/*
410	* Returns an extent of the corresponding size for the pgt_entry specified if
411	* valid. Else returns a smaller extent bounded by the end of the source and
412	* destination pgt_entry.
413	*/
414	static __always_inline unsigned long get_extent(enum pgt_entry entry,
415	unsigned long old_addr, unsigned long old_end,
416	unsigned long new_addr)
417	{
418	unsigned long next, extent, mask, size;
419
420	switch (entry) {
421	case HPAGE_PMD:
422	case NORMAL_PMD:
423	mask = PMD_MASK;
424	size = PMD_SIZE;
425	break;
426	case HPAGE_PUD:
427	case NORMAL_PUD:
428	mask = PUD_MASK;
429	size = PUD_SIZE;
430	break;
431	default:
432	BUILD_BUG();
433	break;
434	}
435
436	next = (old_addr + size) & mask;
437	/ even if next overflowed, extent below will be ok /
438	extent = next - old_addr;
439	if (extent > old_end - old_addr)
440	extent = old_end - old_addr;
441	next = (new_addr + size) & mask;
442	if (extent > next - new_addr)
443	extent = next - new_addr;
444	return extent;
445	}
446
447	/*
448	* Attempts to speedup the move by moving entry at the level corresponding to
449	* pgt_entry. Returns true if the move was successful, else false.
450	*/
451	static bool move_pgt_entry(enum pgt_entry entry, struct vm_area_struct *vma,
452	unsigned long old_addr, unsigned long new_addr,
453	void old_entry, void* *new_entry, bool need_rmap_locks)
454	{
455	bool moved = false;
456
457	/ See comment in move_ptes() /
458	if (need_rmap_locks)
459	take_rmap_locks(vma);
460
461	switch (entry) {
462	case NORMAL_PMD:
463	moved = move_normal_pmd(vma, old_addr, new_addr, old_pmd: old_entry,
464	new_pmd: new_entry);
465	break;
466	case NORMAL_PUD:
467	moved = move_normal_pud(vma, old_addr, new_addr, old_pud: old_entry,
468	new_pud: new_entry);
469	break;
470	case HPAGE_PMD:
471	moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
472	move_huge_pmd(vma, old_addr, new_addr, old_pmd: old_entry,
473	new_pmd: new_entry);
474	break;
475	case HPAGE_PUD:
476	moved = IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
477	move_huge_pud(vma, old_addr, new_addr, old_pud: old_entry,
478	new_pud: new_entry);
479	break;
480
481	default:
482	WARN_ON_ONCE(`1`);
483	break;
484	}
485
486	if (need_rmap_locks)
487	drop_rmap_locks(vma);
488
489	return moved;
490	}
491
492	/*
493	* A helper to check if aligning down is OK. The aligned address should fall
494	* on no mapping. For the stack moving down, that's a special move within
495	* the VMA that is created to span the source and destination of the move,
496	* so we make an exception for it.
497	*/
498	static bool can_align_down(struct vm_area_struct vma, unsigned* long addr_to_align,
499	unsigned long mask, bool for_stack)
500	{
501	unsigned long addr_masked = addr_to_align & mask;
502
503	/*
504	* If @addr_to_align of either source or destination is not the beginning
505	* of the corresponding VMA, we can't align down or we will destroy part
506	* of the current mapping.
507	*/
508	if (!for_stack && vma->vm_start != addr_to_align)
509	return false;
510
511	/ In the stack case we explicitly permit in-VMA alignment. /
512	if (for_stack && addr_masked >= vma->vm_start)
513	return true;
514
515	/*
516	* Make sure the realignment doesn't cause the address to fall on an
517	* existing mapping.
518	*/
519	return find_vma_intersection(mm: vma->vm_mm, start_addr: addr_masked, end_addr: vma->vm_start) == NULL;
520	}
521
522	/ Opportunistically realign to specified boundary for faster copy. /
523	static void try_realign_addr(unsigned long old_addr, struct* vm_area_struct *old_vma,
524	unsigned long new_addr, struct* vm_area_struct *new_vma,
525	unsigned long mask, bool for_stack)
526	{
527	/ Skip if the addresses are already aligned. /
528	if ((*old_addr & ~mask) == `0`)
529	return;
530
531	/ Only realign if the new and old addresses are mutually aligned. /
532	if ((old_addr & ~mask) != (new_addr & ~mask))
533	return;
534
535	/ Ensure realignment doesn't cause overlap with existing mappings. /
536	if (!can_align_down(vma: old_vma, addr_to_align: *old_addr, mask, for_stack) \|\|
537	!can_align_down(vma: new_vma, addr_to_align: *new_addr, mask, for_stack))
538	return;
539
540	old_addr = old_addr & mask;
541	new_addr = new_addr & mask;
542	}
543
544	unsigned long move_page_tables(struct vm_area_struct *vma,
545	unsigned long old_addr, struct vm_area_struct *new_vma,
546	unsigned long new_addr, unsigned long len,
547	bool need_rmap_locks, bool for_stack)
548	{
549	unsigned long extent, old_end;
550	struct mmu_notifier_range range;
551	pmd_t old_pmd, new_pmd;
552	pud_t old_pud, new_pud;
553
554	if (!len)
555	return `0`;
556
557	old_end = old_addr + len;
558
559	if (is_vm_hugetlb_page(vma))
560	return move_hugetlb_page_tables(vma, new_vma, old_addr,
561	new_addr, len);
562
563	/*
564	* If possible, realign addresses to PMD boundary for faster copy.
565	* Only realign if the mremap copying hits a PMD boundary.
566	*/
567	if (len >= PMD_SIZE - (old_addr & ~PMD_MASK))
568	try_realign_addr(old_addr: &old_addr, old_vma: vma, new_addr: &new_addr, new_vma, PMD_MASK,
569	for_stack);
570
571	flush_cache_range(vma, start: old_addr, end: old_end);
572	mmu_notifier_range_init(range: &range, event: MMU_NOTIFY_UNMAP, flags: `0`, mm: vma->vm_mm,
573	start: old_addr, end: old_end);
574	mmu_notifier_invalidate_range_start(range: &range);
575
576	for (; old_addr < old_end; old_addr += extent, new_addr += extent) {
577	cond_resched();
578	/*
579	* If extent is PUD-sized try to speed up the move by moving at the
580	* PUD level if possible.
581	*/
582	extent = get_extent(entry: NORMAL_PUD, old_addr, old_end, new_addr);
583
584	old_pud = get_old_pud(mm: vma->vm_mm, addr: old_addr);
585	if (!old_pud)
586	continue;
587	new_pud = alloc_new_pud(mm: vma->vm_mm, vma, addr: new_addr);
588	if (!new_pud)
589	break;
590	if (pud_trans_huge(pud: old_pud) \|\| pud_devmap(pud: old_pud)) {
591	if (extent == HPAGE_PUD_SIZE) {
592	move_pgt_entry(entry: HPAGE_PUD, vma, old_addr, new_addr,
593	old_entry: old_pud, new_entry: new_pud, need_rmap_locks);
594	/ We ignore and continue on error? /
595	continue;
596	}
597	} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PUD) && extent == PUD_SIZE) {
598
599	if (move_pgt_entry(entry: NORMAL_PUD, vma, old_addr, new_addr,
600	old_entry: old_pud, new_entry: new_pud, need_rmap_locks: true))
601	continue;
602	}
603
604	extent = get_extent(entry: NORMAL_PMD, old_addr, old_end, new_addr);
605	old_pmd = get_old_pmd(mm: vma->vm_mm, addr: old_addr);
606	if (!old_pmd)
607	continue;
608	new_pmd = alloc_new_pmd(mm: vma->vm_mm, vma, addr: new_addr);
609	if (!new_pmd)
610	break;
611	again:
612	if (is_swap_pmd(pmd: old_pmd) \|\| pmd_trans_huge(pmd: old_pmd) \|\|
613	pmd_devmap(pmd: *old_pmd)) {
614	if (extent == HPAGE_PMD_SIZE &&
615	move_pgt_entry(entry: HPAGE_PMD, vma, old_addr, new_addr,
616	old_entry: old_pmd, new_entry: new_pmd, need_rmap_locks))
617	continue;
618	split_huge_pmd(vma, old_pmd, old_addr);
619	} else if (IS_ENABLED(CONFIG_HAVE_MOVE_PMD) &&
620	extent == PMD_SIZE) {
621	/*
622	* If the extent is PMD-sized, try to speed the move by
623	* moving at the PMD level if possible.
624	*/
625	if (move_pgt_entry(entry: NORMAL_PMD, vma, old_addr, new_addr,
626	old_entry: old_pmd, new_entry: new_pmd, need_rmap_locks: true))
627	continue;
628	}
629	if (pmd_none(pmd: *old_pmd))
630	continue;
631	if (pte_alloc(new_vma->vm_mm, new_pmd))
632	break;
633	if (move_ptes(vma, old_pmd, old_addr, old_end: old_addr + extent,
634	new_vma, new_pmd, new_addr, need_rmap_locks) < `0`)
635	goto again;
636	}
637
638	mmu_notifier_invalidate_range_end(range: &range);
639
640	/*
641	* Prevent negative return values when {old,new}_addr was realigned
642	* but we broke out of the above loop for the first PMD itself.
643	*/
644	if (len + old_addr < old_end)
645	return `0`;
646
647	return len + old_addr - old_end; / how much done /
648	}
649
650	static unsigned long move_vma(struct vm_area_struct *vma,
651	unsigned long old_addr, unsigned long old_len,
652	unsigned long new_len, unsigned long new_addr,
653	bool locked, unsigned* long flags,
654	struct vm_userfaultfd_ctx uf, struct* list_head *uf_unmap)
655	{
656	long to_account = new_len - old_len;
657	struct mm_struct *mm = vma->vm_mm;
658	struct vm_area_struct *new_vma;
659	unsigned long vm_flags = vma->vm_flags;
660	unsigned long new_pgoff;
661	unsigned long moved_len;
662	unsigned long account_start = `0`;
663	unsigned long account_end = `0`;
664	unsigned long hiwater_vm;
665	int err = `0`;
666	bool need_rmap_locks;
667	struct vma_iterator vmi;
668
669	/*
670	* We'd prefer to avoid failure later on in do_munmap:
671	* which may split one vma into three before unmapping.
672	*/
673	if (mm->map_count >= sysctl_max_map_count - `3`)
674	return -ENOMEM;
675
676	if (unlikely(flags & MREMAP_DONTUNMAP))
677	to_account = new_len;
678
679	if (vma->vm_ops && vma->vm_ops->may_split) {
680	if (vma->vm_start != old_addr)
681	err = vma->vm_ops->may_split(vma, old_addr);
682	if (!err && vma->vm_end != old_addr + old_len)
683	err = vma->vm_ops->may_split(vma, old_addr + old_len);
684	if (err)
685	return err;
686	}
687
688	/*
689	* Advise KSM to break any KSM pages in the area to be moved:
690	* it would be confusing if they were to turn up at the new
691	* location, where they happen to coincide with different KSM
692	* pages recently unmapped. But leave vma->vm_flags as it was,
693	* so KSM can come around to merge on vma and new_vma afterwards.
694	*/
695	err = ksm_madvise(vma, start: old_addr, end: old_addr + old_len,
696	MADV_UNMERGEABLE, vm_flags: &vm_flags);
697	if (err)
698	return err;
699
700	if (vm_flags & VM_ACCOUNT) {
701	if (security_vm_enough_memory_mm(mm, pages: to_account >> PAGE_SHIFT))
702	return -ENOMEM;
703	}
704
705	vma_start_write(vma);
706	new_pgoff = vma->vm_pgoff + ((old_addr - vma->vm_start) >> PAGE_SHIFT);
707	new_vma = copy_vma(&vma, addr: new_addr, len: new_len, pgoff: new_pgoff,
708	need_rmap_locks: &need_rmap_locks);
709	if (!new_vma) {
710	if (vm_flags & VM_ACCOUNT)
711	vm_unacct_memory(pages: to_account >> PAGE_SHIFT);
712	return -ENOMEM;
713	}
714
715	moved_len = move_page_tables(vma, old_addr, new_vma, new_addr, len: old_len,
716	need_rmap_locks, for_stack: false);
717	if (moved_len < old_len) {
718	err = -ENOMEM;
719	} else if (vma->vm_ops && vma->vm_ops->mremap) {
720	err = vma->vm_ops->mremap(new_vma);
721	}
722
723	if (unlikely(err)) {
724	/*
725	* On error, move entries back from new area to old,
726	* which will succeed since page tables still there,
727	* and then proceed to unmap new area instead of old.
728	*/
729	move_page_tables(vma: new_vma, old_addr: new_addr, new_vma: vma, new_addr: old_addr, len: moved_len,
730	need_rmap_locks: true, for_stack: false);
731	vma = new_vma;
732	old_len = new_len;
733	old_addr = new_addr;
734	new_addr = err;
735	} else {
736	mremap_userfaultfd_prep(new_vma, uf);
737	}
738
739	if (is_vm_hugetlb_page(vma)) {
740	clear_vma_resv_huge_pages(vma);
741	}
742
743	/ Conceal VM_ACCOUNT so old reservation is not undone /
744	if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP)) {
745	vm_flags_clear(vma, VM_ACCOUNT);
746	if (vma->vm_start < old_addr)
747	account_start = vma->vm_start;
748	if (vma->vm_end > old_addr + old_len)
749	account_end = vma->vm_end;
750	}
751
752	/*
753	* If we failed to move page tables we still do total_vm increment
754	* since do_munmap() will decrement it by old_len == new_len.
755	*
756	* Since total_vm is about to be raised artificially high for a
757	* moment, we need to restore high watermark afterwards: if stats
758	* are taken meanwhile, total_vm and hiwater_vm appear too high.
759	* If this were a serious issue, we'd add a flag to do_munmap().
760	*/
761	hiwater_vm = mm->hiwater_vm;
762	vm_stat_account(mm, vma->vm_flags, npages: new_len >> PAGE_SHIFT);
763
764	/ Tell pfnmap has moved from this vma /
765	if (unlikely(vma->vm_flags & VM_PFNMAP))
766	untrack_pfn_clear(vma);
767
768	if (unlikely(!err && (flags & MREMAP_DONTUNMAP))) {
769	/ We always clear VM_LOCKED[ONFAULT] on the old vma /
770	vm_flags_clear(vma, VM_LOCKED_MASK);
771
772	/*
773	* anon_vma links of the old vma is no longer needed after its page
774	* table has been moved.
775	*/
776	if (new_vma != vma && vma->vm_start == old_addr &&
777	vma->vm_end == (old_addr + old_len))
778	unlink_anon_vmas(vma);
779
780	/ Because we won't unmap we don't need to touch locked_vm /
781	return new_addr;
782	}
783
784	vma_iter_init(vmi: &vmi, mm, addr: old_addr);
785	if (do_vmi_munmap(vmi: &vmi, mm, start: old_addr, len: old_len, uf: uf_unmap, unlock: false) < `0`) {
786	/ OOM: unable to split vma, just get accounts right /
787	if (vm_flags & VM_ACCOUNT && !(flags & MREMAP_DONTUNMAP))
788	vm_acct_memory(pages: old_len >> PAGE_SHIFT);
789	account_start = account_end = `0`;
790	}
791
792	if (vm_flags & VM_LOCKED) {
793	mm->locked_vm += new_len >> PAGE_SHIFT;
794	*locked = true;
795	}
796
797	mm->hiwater_vm = hiwater_vm;
798
799	/ Restore VM_ACCOUNT if one or two pieces of vma left /
800	if (account_start) {
801	vma = vma_prev(vmi: &vmi);
802	vm_flags_set(vma, VM_ACCOUNT);
803	}
804
805	if (account_end) {
806	vma = vma_next(vmi: &vmi);
807	vm_flags_set(vma, VM_ACCOUNT);
808	}
809
810	return new_addr;
811	}
812
813	static struct vm_area_struct vma_to_resize(unsigned* long addr,
814	unsigned long old_len, unsigned long new_len, unsigned long flags)
815	{
816	struct mm_struct *mm = current->mm;
817	struct vm_area_struct *vma;
818	unsigned long pgoff;
819
820	vma = vma_lookup(mm, addr);
821	if (!vma)
822	return ERR_PTR(error: -EFAULT);
823
824	/*
825	* !old_len is a special case where an attempt is made to 'duplicate'
826	* a mapping. This makes no sense for private mappings as it will
827	* instead create a fresh/new mapping unrelated to the original. This
828	* is contrary to the basic idea of mremap which creates new mappings
829	* based on the original. There are no known use cases for this
830	* behavior. As a result, fail such attempts.
831	*/
832	if (!old_len && !(vma->vm_flags & (VM_SHARED \| VM_MAYSHARE))) {
833	pr_warn_once("%s (%d): attempted to duplicate a private mapping with mremap. This is not supported.\n", current->comm, current->pid);
834	return ERR_PTR(error: -EINVAL);
835	}
836
837	if ((flags & MREMAP_DONTUNMAP) &&
838	(vma->vm_flags & (VM_DONTEXPAND \| VM_PFNMAP)))
839	return ERR_PTR(error: -EINVAL);
840
841	/ We can't remap across vm area boundaries /
842	if (old_len > vma->vm_end - addr)
843	return ERR_PTR(error: -EFAULT);
844
845	if (new_len == old_len)
846	return vma;
847
848	/ Need to be careful about a growing mapping /
849	pgoff = (addr - vma->vm_start) >> PAGE_SHIFT;
850	pgoff += vma->vm_pgoff;
851	if (pgoff + (new_len >> PAGE_SHIFT) < pgoff)
852	return ERR_PTR(error: -EINVAL);
853
854	if (vma->vm_flags & (VM_DONTEXPAND \| VM_PFNMAP))
855	return ERR_PTR(error: -EFAULT);
856
857	if (!mlock_future_ok(mm, flags: vma->vm_flags, bytes: new_len - old_len))
858	return ERR_PTR(error: -EAGAIN);
859
860	if (!may_expand_vm(mm, vma->vm_flags,
861	npages: (new_len - old_len) >> PAGE_SHIFT))
862	return ERR_PTR(error: -ENOMEM);
863
864	return vma;
865	}
866
867	static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
868	unsigned long new_addr, unsigned long new_len, bool *locked,
869	unsigned long flags, struct vm_userfaultfd_ctx *uf,
870	struct list_head *uf_unmap_early,
871	struct list_head *uf_unmap)
872	{
873	struct mm_struct *mm = current->mm;
874	struct vm_area_struct *vma;
875	unsigned long ret = -EINVAL;
876	unsigned long map_flags = `0`;
877
878	if (offset_in_page(new_addr))
879	goto out;
880
881	if (new_len > TASK_SIZE \|\| new_addr > TASK_SIZE - new_len)
882	goto out;
883
884	/ Ensure the old/new locations do not overlap /
885	if (addr + old_len > new_addr && new_addr + new_len > addr)
886	goto out;
887
888	/*
889	* move_vma() need us to stay 4 maps below the threshold, otherwise
890	* it will bail out at the very beginning.
891	* That is a problem if we have already unmaped the regions here
892	* (new_addr, and old_addr), because userspace will not know the
893	* state of the vma's after it gets -ENOMEM.
894	* So, to avoid such scenario we can pre-compute if the whole
895	* operation has high chances to success map-wise.
896	* Worst-scenario case is when both vma's (new_addr and old_addr) get
897	* split in 3 before unmapping it.
898	* That means 2 more maps (1 for each) to the ones we already hold.
899	* Check whether current map count plus 2 still leads us to 4 maps below
900	* the threshold, otherwise return -ENOMEM here to be more safe.
901	*/
902	if ((mm->map_count + `2`) >= sysctl_max_map_count - `3`)
903	return -ENOMEM;
904
905	if (flags & MREMAP_FIXED) {
906	ret = do_munmap(mm, new_addr, new_len, uf: uf_unmap_early);
907	if (ret)
908	goto out;
909	}
910
911	if (old_len > new_len) {
912	ret = do_munmap(mm, addr+new_len, old_len - new_len, uf: uf_unmap);
913	if (ret)
914	goto out;
915	old_len = new_len;
916	}
917
918	vma = vma_to_resize(addr, old_len, new_len, flags);
919	if (IS_ERR(ptr: vma)) {
920	ret = PTR_ERR(ptr: vma);
921	goto out;
922	}
923
924	/ MREMAP_DONTUNMAP expands by old_len since old_len == new_len /
925	if (flags & MREMAP_DONTUNMAP &&
926	!may_expand_vm(mm, vma->vm_flags, npages: old_len >> PAGE_SHIFT)) {
927	ret = -ENOMEM;
928	goto out;
929	}
930
931	if (flags & MREMAP_FIXED)
932	map_flags \|= MAP_FIXED;
933
934	if (vma->vm_flags & VM_MAYSHARE)
935	map_flags \|= MAP_SHARED;
936
937	ret = get_unmapped_area(vma->vm_file, new_addr, new_len, vma->vm_pgoff +
938	((addr - vma->vm_start) >> PAGE_SHIFT),
939	map_flags);
940	if (IS_ERR_VALUE(ret))
941	goto out;
942
943	/ We got a new mapping /
944	if (!(flags & MREMAP_FIXED))
945	new_addr = ret;
946
947	ret = move_vma(vma, old_addr: addr, old_len, new_len, new_addr, locked, flags, uf,
948	uf_unmap);
949
950	out:
951	return ret;
952	}
953
954	static int vma_expandable(struct vm_area_struct vma, unsigned* long delta)
955	{
956	unsigned long end = vma->vm_end + delta;
957
958	if (end < vma->vm_end) / overflow /
959	return `0`;
960	if (find_vma_intersection(mm: vma->vm_mm, start_addr: vma->vm_end, end_addr: end))
961	return `0`;
962	if (get_unmapped_area(NULL, vma->vm_start, end - vma->vm_start,
963	`0`, MAP_FIXED) & ~PAGE_MASK)
964	return `0`;
965	return `1`;
966	}
967
968	/*
969	* Expand (or shrink) an existing mapping, potentially moving it at the
970	* same time (controlled by the MREMAP_MAYMOVE flag and available VM space)
971	*
972	* MREMAP_FIXED option added 5-Dec-1999 by Benjamin LaHaise
973	* This option implies MREMAP_MAYMOVE.
974	*/
975	SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
976	unsigned long, new_len, unsigned long, flags,
977	unsigned long, new_addr)
978	{
979	struct mm_struct *mm = current->mm;
980	struct vm_area_struct *vma;
981	unsigned long ret = -EINVAL;
982	bool locked = false;
983	struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
984	LIST_HEAD(uf_unmap_early);
985	LIST_HEAD(uf_unmap);
986
987	/*
988	* There is a deliberate asymmetry here: we strip the pointer tag
989	* from the old address but leave the new address alone. This is
990	* for consistency with mmap(), where we prevent the creation of
991	* aliasing mappings in userspace by leaving the tag bits of the
992	* mapping address intact. A non-zero tag will cause the subsequent
993	* range checks to reject the address as invalid.
994	*
995	* See Documentation/arch/arm64/tagged-address-abi.rst for more
996	* information.
997	*/
998	addr = untagged_addr(addr);
999
1000	if (flags & ~(MREMAP_FIXED \| MREMAP_MAYMOVE \| MREMAP_DONTUNMAP))
1001	return ret;
1002
1003	if (flags & MREMAP_FIXED && !(flags & MREMAP_MAYMOVE))
1004	return ret;
1005
1006	/*
1007	* MREMAP_DONTUNMAP is always a move and it does not allow resizing
1008	* in the process.
1009	*/
1010	if (flags & MREMAP_DONTUNMAP &&
1011	(!(flags & MREMAP_MAYMOVE) \|\| old_len != new_len))
1012	return ret;
1013
1014
1015	if (offset_in_page(addr))
1016	return ret;
1017
1018	old_len = PAGE_ALIGN(old_len);
1019	new_len = PAGE_ALIGN(new_len);
1020
1021	/*
1022	* We allow a zero old-len as a special case
1023	* for DOS-emu "duplicate shm area" thing. But
1024	* a zero new-len is nonsensical.
1025	*/
1026	if (!new_len)
1027	return ret;
1028
1029	if (mmap_write_lock_killable(current->mm))
1030	return -EINTR;
1031	vma = vma_lookup(mm, addr);
1032	if (!vma) {
1033	ret = -EFAULT;
1034	goto out;
1035	}
1036
1037	if (is_vm_hugetlb_page(vma)) {
1038	struct hstate *h __maybe_unused = hstate_vma(vma);
1039
1040	old_len = ALIGN(old_len, huge_page_size(h));
1041	new_len = ALIGN(new_len, huge_page_size(h));
1042
1043	/ addrs must be huge page aligned /
1044	if (addr & ~huge_page_mask(h))
1045	goto out;
1046	if (new_addr & ~huge_page_mask(h))
1047	goto out;
1048
1049	/*
1050	* Don't allow remap expansion, because the underlying hugetlb
1051	* reservation is not yet capable to handle split reservation.
1052	*/
1053	if (new_len > old_len)
1054	goto out;
1055	}
1056
1057	if (flags & (MREMAP_FIXED \| MREMAP_DONTUNMAP)) {
1058	ret = mremap_to(addr, old_len, new_addr, new_len,
1059	locked: &locked, flags, uf: &uf, uf_unmap_early: &uf_unmap_early,
1060	uf_unmap: &uf_unmap);
1061	goto out;
1062	}
1063
1064	/*
1065	* Always allow a shrinking remap: that just unmaps
1066	* the unnecessary pages..
1067	* do_vmi_munmap does all the needed commit accounting, and
1068	* unlocks the mmap_lock if so directed.
1069	*/
1070	if (old_len >= new_len) {
1071	VMA_ITERATOR(vmi, mm, addr + new_len);
1072
1073	if (old_len == new_len) {
1074	ret = addr;
1075	goto out;
1076	}
1077
1078	ret = do_vmi_munmap(vmi: &vmi, mm, start: addr + new_len, len: old_len - new_len,
1079	uf: &uf_unmap, unlock: true);
1080	if (ret)
1081	goto out;
1082
1083	ret = addr;
1084	goto out_unlocked;
1085	}
1086
1087	/*
1088	* Ok, we need to grow..
1089	*/
1090	vma = vma_to_resize(addr, old_len, new_len, flags);
1091	if (IS_ERR(ptr: vma)) {
1092	ret = PTR_ERR(ptr: vma);
1093	goto out;
1094	}
1095
1096	/ old_len exactly to the end of the area..*
1097	*/
1098	if (old_len == vma->vm_end - addr) {
1099	unsigned long delta = new_len - old_len;
1100
1101	/ can we just expand the current mapping? /
1102	if (vma_expandable(vma, delta)) {
1103	long pages = delta >> PAGE_SHIFT;
1104	VMA_ITERATOR(vmi, mm, vma->vm_end);
1105	long charged = `0`;
1106
1107	if (vma->vm_flags & VM_ACCOUNT) {
1108	if (security_vm_enough_memory_mm(mm, pages)) {
1109	ret = -ENOMEM;
1110	goto out;
1111	}
1112	charged = pages;
1113	}
1114
1115	/*
1116	* Function vma_merge_extend() is called on the
1117	* extension we are adding to the already existing vma,
1118	* vma_merge_extend() will merge this extension with the
1119	* already existing vma (expand operation itself) and
1120	* possibly also with the next vma if it becomes
1121	* adjacent to the expanded vma and otherwise
1122	* compatible.
1123	*/
1124	vma = vma_merge_extend(vmi: &vmi, vma, delta);
1125	if (!vma) {
1126	vm_unacct_memory(pages: charged);
1127	ret = -ENOMEM;
1128	goto out;
1129	}
1130
1131	vm_stat_account(mm, vma->vm_flags, npages: pages);
1132	if (vma->vm_flags & VM_LOCKED) {
1133	mm->locked_vm += pages;
1134	locked = true;
1135	new_addr = addr;
1136	}
1137	ret = addr;
1138	goto out;
1139	}
1140	}
1141
1142	/*
1143	* We weren't able to just expand or shrink the area,
1144	* we need to create a new one and move it..
1145	*/
1146	ret = -ENOMEM;
1147	if (flags & MREMAP_MAYMOVE) {
1148	unsigned long map_flags = `0`;
1149	if (vma->vm_flags & VM_MAYSHARE)
1150	map_flags \|= MAP_SHARED;
1151
1152	new_addr = get_unmapped_area(vma->vm_file, `0`, new_len,
1153	vma->vm_pgoff +
1154	((addr - vma->vm_start) >> PAGE_SHIFT),
1155	map_flags);
1156	if (IS_ERR_VALUE(new_addr)) {
1157	ret = new_addr;
1158	goto out;
1159	}
1160
1161	ret = move_vma(vma, old_addr: addr, old_len, new_len, new_addr,
1162	locked: &locked, flags, uf: &uf, uf_unmap: &uf_unmap);
1163	}
1164	out:
1165	if (offset_in_page(ret))
1166	locked = false;
1167	mmap_write_unlock(current->mm);
1168	if (locked && new_len > old_len)
1169	mm_populate(addr: new_addr + old_len, len: new_len - old_len);
1170	out_unlocked:
1171	userfaultfd_unmap_complete(mm, uf: &uf_unmap_early);
1172	mremap_userfaultfd_complete(&uf, from: addr, to: ret, len: old_len);
1173	userfaultfd_unmap_complete(mm, uf: &uf_unmap);
1174	return ret;
1175	}
1176

source code of linux/mm/mremap.c