1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * linux/mm/madvise.c |
4 | * |
5 | * Copyright (C) 1999 Linus Torvalds |
6 | * Copyright (C) 2002 Christoph Hellwig |
7 | */ |
8 | |
9 | #include <linux/mman.h> |
10 | #include <linux/pagemap.h> |
11 | #include <linux/syscalls.h> |
12 | #include <linux/mempolicy.h> |
13 | #include <linux/page-isolation.h> |
14 | #include <linux/page_idle.h> |
15 | #include <linux/userfaultfd_k.h> |
16 | #include <linux/hugetlb.h> |
17 | #include <linux/falloc.h> |
18 | #include <linux/fadvise.h> |
19 | #include <linux/sched.h> |
20 | #include <linux/sched/mm.h> |
21 | #include <linux/mm_inline.h> |
22 | #include <linux/string.h> |
23 | #include <linux/uio.h> |
24 | #include <linux/ksm.h> |
25 | #include <linux/fs.h> |
26 | #include <linux/file.h> |
27 | #include <linux/blkdev.h> |
28 | #include <linux/backing-dev.h> |
29 | #include <linux/pagewalk.h> |
30 | #include <linux/swap.h> |
31 | #include <linux/swapops.h> |
32 | #include <linux/shmem_fs.h> |
33 | #include <linux/mmu_notifier.h> |
34 | |
35 | #include <asm/tlb.h> |
36 | |
37 | #include "internal.h" |
38 | #include "swap.h" |
39 | |
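/*
 * Per-walk state shared by the MADV_COLD and MADV_PAGEOUT page table walks:
 * @tlb is the mmu_gather used to batch TLB flushes for the ptes we mark old,
 * and @pageout selects between reclaiming the pages immediately
 * (MADV_PAGEOUT) and merely deactivating them (MADV_COLD).
 */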
40 | struct madvise_walk_private { |
41 | struct mmu_gather *tlb; |
42 | bool pageout; |
43 | }; |
44 | |
45 | /* |
46 | * Any behaviour which results in changes to the vma->vm_flags needs to |
47 | * take mmap_lock for writing. Others, which simply traverse vmas, need |
48 | * to only take it for reading. |
49 | */ |
50 | static int madvise_need_mmap_write(int behavior) |
51 | { |
52 | switch (behavior) { |
53 | case MADV_REMOVE: |
54 | case MADV_WILLNEED: |
55 | case MADV_DONTNEED: |
56 | case MADV_DONTNEED_LOCKED: |
57 | case MADV_COLD: |
58 | case MADV_PAGEOUT: |
59 | case MADV_FREE: |
60 | case MADV_POPULATE_READ: |
61 | case MADV_POPULATE_WRITE: |
62 | case MADV_COLLAPSE: |
63 | return 0; |
64 | default: |
65 | /* be safe, default to 1. list exceptions explicitly */ |
66 | return 1; |
67 | } |
68 | } |
69 | |
70 | #ifdef CONFIG_ANON_VMA_NAME |
71 | struct anon_vma_name *anon_vma_name_alloc(const char *name) |
72 | { |
73 | struct anon_vma_name *anon_name; |
74 | size_t count; |
75 | |
76 | /* Add 1 for NUL terminator at the end of the anon_name->name */ |
77 | count = strlen(name) + 1; |
78 | anon_name = kmalloc(struct_size(anon_name, name, count), GFP_KERNEL); |
79 | if (anon_name) { |
		kref_init(&anon_name->kref);
81 | memcpy(anon_name->name, name, count); |
82 | } |
83 | |
84 | return anon_name; |
85 | } |
86 | |
87 | void anon_vma_name_free(struct kref *kref) |
88 | { |
89 | struct anon_vma_name *anon_name = |
90 | container_of(kref, struct anon_vma_name, kref); |
	kfree(anon_name);
92 | } |
93 | |
94 | struct anon_vma_name *anon_vma_name(struct vm_area_struct *vma) |
95 | { |
	mmap_assert_locked(vma->vm_mm);
97 | |
98 | return vma->anon_name; |
99 | } |
100 | |
101 | /* mmap_lock should be write-locked */ |
102 | static int replace_anon_vma_name(struct vm_area_struct *vma, |
103 | struct anon_vma_name *anon_name) |
104 | { |
105 | struct anon_vma_name *orig_name = anon_vma_name(vma); |
106 | |
107 | if (!anon_name) { |
108 | vma->anon_name = NULL; |
		anon_vma_name_put(orig_name);
		return 0;
	}

	if (anon_vma_name_eq(orig_name, anon_name))
		return 0;

	vma->anon_name = anon_vma_name_reuse(anon_name);
	anon_vma_name_put(orig_name);
118 | |
119 | return 0; |
120 | } |
121 | #else /* CONFIG_ANON_VMA_NAME */ |
122 | static int replace_anon_vma_name(struct vm_area_struct *vma, |
123 | struct anon_vma_name *anon_name) |
124 | { |
125 | if (anon_name) |
126 | return -EINVAL; |
127 | |
128 | return 0; |
129 | } |
130 | #endif /* CONFIG_ANON_VMA_NAME */ |
/*
 * Update the vm_flags on a region of a vma, splitting it or merging it as
 * necessary.  Must be called with mmap_lock held for writing.
 * The caller should ensure anon_name stability by raising its refcount even
 * when anon_name belongs to a valid vma, because this function might free
 * that vma.
 */
137 | static int madvise_update_vma(struct vm_area_struct *vma, |
138 | struct vm_area_struct **prev, unsigned long start, |
139 | unsigned long end, unsigned long new_flags, |
140 | struct anon_vma_name *anon_name) |
141 | { |
142 | struct mm_struct *mm = vma->vm_mm; |
143 | int error; |
144 | VMA_ITERATOR(vmi, mm, start); |
145 | |
	if (new_flags == vma->vm_flags && anon_vma_name_eq(anon_vma_name(vma), anon_name)) {
		*prev = vma;
		return 0;
	}

	vma = vma_modify_flags_name(&vmi, *prev, vma, start, end, new_flags,
				    anon_name);
	if (IS_ERR(vma))
		return PTR_ERR(vma);

	*prev = vma;

	/* vm_flags is protected by the mmap_lock held in write mode. */
	vma_start_write(vma);
	vm_flags_reset(vma, new_flags);
161 | if (!vma->vm_file || vma_is_anon_shmem(vma)) { |
162 | error = replace_anon_vma_name(vma, anon_name); |
163 | if (error) |
164 | return error; |
165 | } |
166 | |
167 | return 0; |
168 | } |
169 | |
170 | #ifdef CONFIG_SWAP |
171 | static int swapin_walk_pmd_entry(pmd_t *pmd, unsigned long start, |
172 | unsigned long end, struct mm_walk *walk) |
173 | { |
174 | struct vm_area_struct *vma = walk->private; |
175 | struct swap_iocb *splug = NULL; |
176 | pte_t *ptep = NULL; |
177 | spinlock_t *ptl; |
178 | unsigned long addr; |
179 | |
180 | for (addr = start; addr < end; addr += PAGE_SIZE) { |
181 | pte_t pte; |
182 | swp_entry_t entry; |
183 | struct page *page; |
184 | |
185 | if (!ptep++) { |
			ptep = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
187 | if (!ptep) |
188 | break; |
189 | } |
190 | |
191 | pte = ptep_get(ptep); |
192 | if (!is_swap_pte(pte)) |
193 | continue; |
194 | entry = pte_to_swp_entry(pte); |
195 | if (unlikely(non_swap_entry(entry))) |
196 | continue; |
197 | |
198 | pte_unmap_unlock(ptep, ptl); |
199 | ptep = NULL; |
200 | |
		page = read_swap_cache_async(entry, GFP_HIGHUSER_MOVABLE,
					     vma, addr, &splug);
203 | if (page) |
204 | put_page(page); |
205 | } |
206 | |
207 | if (ptep) |
208 | pte_unmap_unlock(ptep, ptl); |
	swap_read_unplug(splug);
210 | cond_resched(); |
211 | |
212 | return 0; |
213 | } |
214 | |
215 | static const struct mm_walk_ops swapin_walk_ops = { |
216 | .pmd_entry = swapin_walk_pmd_entry, |
217 | .walk_lock = PGWALK_RDLOCK, |
218 | }; |
219 | |
220 | static void shmem_swapin_range(struct vm_area_struct *vma, |
221 | unsigned long start, unsigned long end, |
222 | struct address_space *mapping) |
223 | { |
224 | XA_STATE(xas, &mapping->i_pages, linear_page_index(vma, start)); |
	pgoff_t end_index = linear_page_index(vma, end) - 1;
226 | struct page *page; |
227 | struct swap_iocb *splug = NULL; |
228 | |
229 | rcu_read_lock(); |
230 | xas_for_each(&xas, page, end_index) { |
231 | unsigned long addr; |
232 | swp_entry_t entry; |
233 | |
		if (!xa_is_value(page))
			continue;
		entry = radix_to_swp_entry(page);
237 | /* There might be swapin error entries in shmem mapping. */ |
238 | if (non_swap_entry(entry)) |
239 | continue; |
240 | |
241 | addr = vma->vm_start + |
242 | ((xas.xa_index - vma->vm_pgoff) << PAGE_SHIFT); |
243 | xas_pause(&xas); |
244 | rcu_read_unlock(); |
245 | |
		page = read_swap_cache_async(entry, mapping_gfp_mask(mapping),
					     vma, addr, &splug);
248 | if (page) |
249 | put_page(page); |
250 | |
251 | rcu_read_lock(); |
252 | } |
253 | rcu_read_unlock(); |
	swap_read_unplug(splug);
255 | } |
256 | #endif /* CONFIG_SWAP */ |
257 | |
258 | /* |
259 | * Schedule all required I/O operations. Do not wait for completion. |
260 | */ |
261 | static long madvise_willneed(struct vm_area_struct *vma, |
262 | struct vm_area_struct **prev, |
263 | unsigned long start, unsigned long end) |
264 | { |
265 | struct mm_struct *mm = vma->vm_mm; |
266 | struct file *file = vma->vm_file; |
267 | loff_t offset; |
268 | |
269 | *prev = vma; |
270 | #ifdef CONFIG_SWAP |
271 | if (!file) { |
		walk_page_range(vma->vm_mm, start, end, &swapin_walk_ops, vma);
273 | lru_add_drain(); /* Push any new pages onto the LRU now */ |
274 | return 0; |
275 | } |
276 | |
	if (shmem_mapping(file->f_mapping)) {
		shmem_swapin_range(vma, start, end, file->f_mapping);
279 | lru_add_drain(); /* Push any new pages onto the LRU now */ |
280 | return 0; |
281 | } |
282 | #else |
283 | if (!file) |
284 | return -EBADF; |
285 | #endif |
286 | |
287 | if (IS_DAX(file_inode(file))) { |
288 | /* no bad return value, but ignore advice */ |
289 | return 0; |
290 | } |
291 | |
292 | /* |
293 | * Filesystem's fadvise may need to take various locks. We need to |
294 | * explicitly grab a reference because the vma (and hence the |
295 | * vma's reference to the file) can go away as soon as we drop |
296 | * mmap_lock. |
297 | */ |
298 | *prev = NULL; /* tell sys_madvise we drop mmap_lock */ |
	get_file(file);
300 | offset = (loff_t)(start - vma->vm_start) |
301 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
302 | mmap_read_unlock(mm); |
	vfs_fadvise(file, offset, end - start, POSIX_FADV_WILLNEED);
304 | fput(file); |
305 | mmap_read_lock(mm); |
306 | return 0; |
307 | } |
308 | |
309 | static inline bool can_do_file_pageout(struct vm_area_struct *vma) |
310 | { |
311 | if (!vma->vm_file) |
312 | return false; |
	/*
	 * Page out the page cache only for non-anonymous mappings that
	 * correspond to files the calling process could open for writing if
	 * it tried; otherwise we'd be including shared non-exclusive
	 * mappings, which opens a side channel.
	 */
	return inode_owner_or_capable(&nop_mnt_idmap,
				      file_inode(vma->vm_file)) ||
	       file_permission(vma->vm_file, MAY_WRITE) == 0;
322 | } |
323 | |
324 | static int madvise_cold_or_pageout_pte_range(pmd_t *pmd, |
325 | unsigned long addr, unsigned long end, |
326 | struct mm_walk *walk) |
327 | { |
328 | struct madvise_walk_private *private = walk->private; |
329 | struct mmu_gather *tlb = private->tlb; |
330 | bool pageout = private->pageout; |
331 | struct mm_struct *mm = tlb->mm; |
332 | struct vm_area_struct *vma = walk->vma; |
333 | pte_t *start_pte, *pte, ptent; |
334 | spinlock_t *ptl; |
335 | struct folio *folio = NULL; |
336 | LIST_HEAD(folio_list); |
337 | bool pageout_anon_only_filter; |
338 | |
339 | if (fatal_signal_pending(current)) |
340 | return -EINTR; |
341 | |
342 | pageout_anon_only_filter = pageout && !vma_is_anonymous(vma) && |
343 | !can_do_file_pageout(vma); |
344 | |
345 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
	if (pmd_trans_huge(*pmd)) {
347 | pmd_t orig_pmd; |
348 | unsigned long next = pmd_addr_end(addr, end); |
349 | |
350 | tlb_change_page_size(tlb, HPAGE_PMD_SIZE); |
351 | ptl = pmd_trans_huge_lock(pmd, vma); |
352 | if (!ptl) |
353 | return 0; |
354 | |
355 | orig_pmd = *pmd; |
		if (is_huge_zero_pmd(orig_pmd))
357 | goto huge_unlock; |
358 | |
359 | if (unlikely(!pmd_present(orig_pmd))) { |
360 | VM_BUG_ON(thp_migration_supported() && |
361 | !is_pmd_migration_entry(orig_pmd)); |
362 | goto huge_unlock; |
363 | } |
364 | |
		folio = pfn_folio(pmd_pfn(orig_pmd));
366 | |
367 | /* Do not interfere with other mappings of this folio */ |
368 | if (folio_estimated_sharers(folio) != 1) |
369 | goto huge_unlock; |
370 | |
371 | if (pageout_anon_only_filter && !folio_test_anon(folio)) |
372 | goto huge_unlock; |
373 | |
374 | if (next - addr != HPAGE_PMD_SIZE) { |
375 | int err; |
376 | |
377 | folio_get(folio); |
			spin_unlock(ptl);
379 | folio_lock(folio); |
380 | err = split_folio(folio); |
381 | folio_unlock(folio); |
382 | folio_put(folio); |
383 | if (!err) |
384 | goto regular_folio; |
385 | return 0; |
386 | } |
387 | |
		if (pmd_young(orig_pmd)) {
			pmdp_invalidate(vma, addr, pmd);
			orig_pmd = pmd_mkold(orig_pmd);

			set_pmd_at(mm, addr, pmd, orig_pmd);
393 | tlb_remove_pmd_tlb_entry(tlb, pmd, addr); |
394 | } |
395 | |
396 | folio_clear_referenced(folio); |
397 | folio_test_clear_young(folio); |
398 | if (folio_test_active(folio)) |
399 | folio_set_workingset(folio); |
400 | if (pageout) { |
401 | if (folio_isolate_lru(folio)) { |
402 | if (folio_test_unevictable(folio)) |
403 | folio_putback_lru(folio); |
404 | else |
					list_add(&folio->lru, &folio_list);
406 | } |
407 | } else |
408 | folio_deactivate(folio); |
409 | huge_unlock: |
		spin_unlock(ptl);
		if (pageout)
			reclaim_pages(&folio_list);
413 | return 0; |
414 | } |
415 | |
416 | regular_folio: |
417 | #endif |
418 | tlb_change_page_size(tlb, PAGE_SIZE); |
	start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
420 | if (!start_pte) |
421 | return 0; |
422 | flush_tlb_batched_pending(mm); |
423 | arch_enter_lazy_mmu_mode(); |
424 | for (; addr < end; pte++, addr += PAGE_SIZE) { |
		ptent = ptep_get(pte);

		if (pte_none(ptent))
			continue;

		if (!pte_present(ptent))
			continue;

		folio = vm_normal_folio(vma, addr, ptent);
434 | if (!folio || folio_is_zone_device(folio)) |
435 | continue; |
436 | |
437 | /* |
438 | * Creating a THP page is expensive so split it only if we |
		 * are sure it's worth it.  Split it if we are the only owner.
440 | */ |
441 | if (folio_test_large(folio)) { |
442 | int err; |
443 | |
444 | if (folio_estimated_sharers(folio) != 1) |
445 | break; |
446 | if (pageout_anon_only_filter && !folio_test_anon(folio)) |
447 | break; |
448 | if (!folio_trylock(folio)) |
449 | break; |
450 | folio_get(folio); |
451 | arch_leave_lazy_mmu_mode(); |
452 | pte_unmap_unlock(start_pte, ptl); |
453 | start_pte = NULL; |
454 | err = split_folio(folio); |
455 | folio_unlock(folio); |
456 | folio_put(folio); |
457 | if (err) |
458 | break; |
459 | start_pte = pte = |
				pte_offset_map_lock(mm, pmd, addr, &ptl);
461 | if (!start_pte) |
462 | break; |
463 | arch_enter_lazy_mmu_mode(); |
464 | pte--; |
465 | addr -= PAGE_SIZE; |
466 | continue; |
467 | } |
468 | |
469 | /* |
		 * Do not interfere with other mappings of this folio, and
		 * skip non-LRU folios.
472 | */ |
473 | if (!folio_test_lru(folio) || folio_mapcount(folio) != 1) |
474 | continue; |
475 | |
476 | if (pageout_anon_only_filter && !folio_test_anon(folio)) |
477 | continue; |
478 | |
479 | VM_BUG_ON_FOLIO(folio_test_large(folio), folio); |
480 | |
		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
485 | set_pte_at(mm, addr, pte, ptent); |
486 | tlb_remove_tlb_entry(tlb, pte, addr); |
487 | } |
488 | |
		/*
		 * We are deactivating a folio to accelerate its reclaim.
		 * The VM can't reclaim the folio unless we clear PG_young.
		 * As a side effect, this confuses idle-page tracking,
		 * which will miss the recent reference history.
		 */
495 | folio_clear_referenced(folio); |
496 | folio_test_clear_young(folio); |
497 | if (folio_test_active(folio)) |
498 | folio_set_workingset(folio); |
499 | if (pageout) { |
500 | if (folio_isolate_lru(folio)) { |
501 | if (folio_test_unevictable(folio)) |
502 | folio_putback_lru(folio); |
503 | else |
					list_add(&folio->lru, &folio_list);
505 | } |
506 | } else |
507 | folio_deactivate(folio); |
508 | } |
509 | |
510 | if (start_pte) { |
511 | arch_leave_lazy_mmu_mode(); |
512 | pte_unmap_unlock(start_pte, ptl); |
513 | } |
514 | if (pageout) |
		reclaim_pages(&folio_list);
516 | cond_resched(); |
517 | |
518 | return 0; |
519 | } |
520 | |
521 | static const struct mm_walk_ops cold_walk_ops = { |
522 | .pmd_entry = madvise_cold_or_pageout_pte_range, |
523 | .walk_lock = PGWALK_RDLOCK, |
524 | }; |
525 | |
526 | static void madvise_cold_page_range(struct mmu_gather *tlb, |
527 | struct vm_area_struct *vma, |
528 | unsigned long addr, unsigned long end) |
529 | { |
530 | struct madvise_walk_private walk_private = { |
531 | .pageout = false, |
532 | .tlb = tlb, |
533 | }; |
534 | |
535 | tlb_start_vma(tlb, vma); |
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
537 | tlb_end_vma(tlb, vma); |
538 | } |
539 | |
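/*
 * MADV_COLD and MADV_PAGEOUT only make sense for ordinary LRU-managed
 * memory; mlocked, PFN-mapped and hugetlb VMAs are rejected by the callers
 * below.
 */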
540 | static inline bool can_madv_lru_vma(struct vm_area_struct *vma) |
541 | { |
542 | return !(vma->vm_flags & (VM_LOCKED|VM_PFNMAP|VM_HUGETLB)); |
543 | } |
544 | |
545 | static long madvise_cold(struct vm_area_struct *vma, |
546 | struct vm_area_struct **prev, |
547 | unsigned long start_addr, unsigned long end_addr) |
548 | { |
549 | struct mm_struct *mm = vma->vm_mm; |
550 | struct mmu_gather tlb; |
551 | |
552 | *prev = vma; |
553 | if (!can_madv_lru_vma(vma)) |
554 | return -EINVAL; |
555 | |
556 | lru_add_drain(); |
	tlb_gather_mmu(&tlb, mm);
	madvise_cold_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);
560 | |
561 | return 0; |
562 | } |
563 | |
564 | static void madvise_pageout_page_range(struct mmu_gather *tlb, |
565 | struct vm_area_struct *vma, |
566 | unsigned long addr, unsigned long end) |
567 | { |
568 | struct madvise_walk_private walk_private = { |
569 | .pageout = true, |
570 | .tlb = tlb, |
571 | }; |
572 | |
573 | tlb_start_vma(tlb, vma); |
	walk_page_range(vma->vm_mm, addr, end, &cold_walk_ops, &walk_private);
575 | tlb_end_vma(tlb, vma); |
576 | } |
577 | |
578 | static long madvise_pageout(struct vm_area_struct *vma, |
579 | struct vm_area_struct **prev, |
580 | unsigned long start_addr, unsigned long end_addr) |
581 | { |
582 | struct mm_struct *mm = vma->vm_mm; |
583 | struct mmu_gather tlb; |
584 | |
585 | *prev = vma; |
586 | if (!can_madv_lru_vma(vma)) |
587 | return -EINVAL; |
588 | |
589 | /* |
	 * If the VMA belongs to a private file mapping, there can be private
	 * dirty pages which can be paged out even if this process is neither
	 * the owner of nor write-capable on the file.  We additionally allow
	 * private file mappings to page out their dirty anon pages.
594 | */ |
595 | if (!vma_is_anonymous(vma) && (!can_do_file_pageout(vma) && |
596 | (vma->vm_flags & VM_MAYSHARE))) |
597 | return 0; |
598 | |
599 | lru_add_drain(); |
	tlb_gather_mmu(&tlb, mm);
	madvise_pageout_page_range(&tlb, vma, start_addr, end_addr);
	tlb_finish_mmu(&tlb);
603 | |
604 | return 0; |
605 | } |
606 | |
607 | static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr, |
608 | unsigned long end, struct mm_walk *walk) |
610 | { |
611 | struct mmu_gather *tlb = walk->private; |
612 | struct mm_struct *mm = tlb->mm; |
613 | struct vm_area_struct *vma = walk->vma; |
614 | spinlock_t *ptl; |
615 | pte_t *start_pte, *pte, ptent; |
616 | struct folio *folio; |
617 | int nr_swap = 0; |
618 | unsigned long next; |
619 | |
620 | next = pmd_addr_end(addr, end); |
	if (pmd_trans_huge(*pmd))
622 | if (madvise_free_huge_pmd(tlb, vma, pmd, addr, next)) |
623 | return 0; |
624 | |
625 | tlb_change_page_size(tlb, PAGE_SIZE); |
	start_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
627 | if (!start_pte) |
628 | return 0; |
629 | flush_tlb_batched_pending(mm); |
630 | arch_enter_lazy_mmu_mode(); |
631 | for (; addr != end; pte++, addr += PAGE_SIZE) { |
		ptent = ptep_get(pte);

		if (pte_none(ptent))
635 | continue; |
		/*
		 * If the pte holds a swap entry, just clear the page table
		 * entry to prevent a swap-in, which is more expensive than
		 * (page allocation + zeroing).
		 */
		if (!pte_present(ptent)) {
			swp_entry_t entry;

			entry = pte_to_swp_entry(ptent);
			if (!non_swap_entry(entry)) {
				nr_swap--;
				free_swap_and_cache(entry);
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
			} else if (is_hwpoison_entry(entry) ||
				   is_poisoned_swp_entry(entry)) {
				pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
652 | } |
653 | continue; |
654 | } |
655 | |
		folio = vm_normal_folio(vma, addr, ptent);
657 | if (!folio || folio_is_zone_device(folio)) |
658 | continue; |
659 | |
660 | /* |
661 | * If pmd isn't transhuge but the folio is large and |
662 | * is owned by only this process, split it and |
663 | * deactivate all pages. |
664 | */ |
665 | if (folio_test_large(folio)) { |
666 | int err; |
667 | |
668 | if (folio_estimated_sharers(folio) != 1) |
669 | break; |
670 | if (!folio_trylock(folio)) |
671 | break; |
672 | folio_get(folio); |
673 | arch_leave_lazy_mmu_mode(); |
674 | pte_unmap_unlock(start_pte, ptl); |
675 | start_pte = NULL; |
676 | err = split_folio(folio); |
677 | folio_unlock(folio); |
678 | folio_put(folio); |
679 | if (err) |
680 | break; |
681 | start_pte = pte = |
				pte_offset_map_lock(mm, pmd, addr, &ptl);
683 | if (!start_pte) |
684 | break; |
685 | arch_enter_lazy_mmu_mode(); |
686 | pte--; |
687 | addr -= PAGE_SIZE; |
688 | continue; |
689 | } |
690 | |
691 | if (folio_test_swapcache(folio) || folio_test_dirty(folio)) { |
692 | if (!folio_trylock(folio)) |
693 | continue; |
694 | /* |
695 | * If folio is shared with others, we mustn't clear |
696 | * the folio's dirty flag. |
697 | */ |
698 | if (folio_mapcount(folio) != 1) { |
699 | folio_unlock(folio); |
700 | continue; |
701 | } |
702 | |
703 | if (folio_test_swapcache(folio) && |
704 | !folio_free_swap(folio)) { |
705 | folio_unlock(folio); |
706 | continue; |
707 | } |
708 | |
709 | folio_clear_dirty(folio); |
710 | folio_unlock(folio); |
711 | } |
712 | |
		if (pte_young(ptent) || pte_dirty(ptent)) {
			/*
			 * Some architectures (e.g. PPC) don't update the TLB
			 * with set_pte_at() and tlb_remove_tlb_entry(), so
			 * for portability, re-install the pte as old and
			 * clean after clearing it.
			 */
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);

			ptent = pte_mkold(ptent);
			ptent = pte_mkclean(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
727 | } |
728 | folio_mark_lazyfree(folio); |
729 | } |
730 | |
731 | if (nr_swap) |
		add_mm_counter(mm, MM_SWAPENTS, nr_swap);
733 | if (start_pte) { |
734 | arch_leave_lazy_mmu_mode(); |
735 | pte_unmap_unlock(start_pte, ptl); |
736 | } |
737 | cond_resched(); |
738 | |
739 | return 0; |
740 | } |
741 | |
742 | static const struct mm_walk_ops madvise_free_walk_ops = { |
743 | .pmd_entry = madvise_free_pte_range, |
744 | .walk_lock = PGWALK_RDLOCK, |
745 | }; |
746 | |
747 | static int madvise_free_single_vma(struct vm_area_struct *vma, |
748 | unsigned long start_addr, unsigned long end_addr) |
749 | { |
750 | struct mm_struct *mm = vma->vm_mm; |
751 | struct mmu_notifier_range range; |
752 | struct mmu_gather tlb; |
753 | |
754 | /* MADV_FREE works for only anon vma at the moment */ |
755 | if (!vma_is_anonymous(vma)) |
756 | return -EINVAL; |
757 | |
758 | range.start = max(vma->vm_start, start_addr); |
759 | if (range.start >= vma->vm_end) |
760 | return -EINVAL; |
761 | range.end = min(vma->vm_end, end_addr); |
762 | if (range.end <= vma->vm_start) |
763 | return -EINVAL; |
	mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, mm,
				range.start, range.end);

	lru_add_drain();
	tlb_gather_mmu(&tlb, mm);
	update_hiwater_rss(mm);

	mmu_notifier_invalidate_range_start(&range);
	tlb_start_vma(&tlb, vma);
	walk_page_range(vma->vm_mm, range.start, range.end,
			&madvise_free_walk_ops, &tlb);
	tlb_end_vma(&tlb, vma);
	mmu_notifier_invalidate_range_end(&range);
	tlb_finish_mmu(&tlb);
778 | |
779 | return 0; |
780 | } |
781 | |
782 | /* |
783 | * Application no longer needs these pages. If the pages are dirty, |
784 | * it's OK to just throw them away. The app will be more careful about |
785 | * data it wants to keep. Be sure to free swap resources too. The |
786 | * zap_page_range_single call sets things up for shrink_active_list to actually |
787 | * free these pages later if no one else has touched them in the meantime, |
788 | * although we could add these pages to a global reuse list for |
789 | * shrink_active_list to pick up before reclaiming other pages. |
790 | * |
791 | * NB: This interface discards data rather than pushes it out to swap, |
792 | * as some implementations do. This has performance implications for |
793 | * applications like large transactional databases which want to discard |
794 | * pages in anonymous maps after committing to backing store the data |
795 | * that was kept in them. There is no reason to write this data out to |
796 | * the swap area if the application is discarding it. |
797 | * |
798 | * An interface that causes the system to free clean pages and flush |
799 | * dirty pages is already available as msync(MS_INVALIDATE). |
800 | */ |
801 | static long madvise_dontneed_single_vma(struct vm_area_struct *vma, |
802 | unsigned long start, unsigned long end) |
803 | { |
	zap_page_range_single(vma, start, end - start, NULL);
805 | return 0; |
806 | } |
807 | |
808 | static bool madvise_dontneed_free_valid_vma(struct vm_area_struct *vma, |
809 | unsigned long start, |
810 | unsigned long *end, |
811 | int behavior) |
812 | { |
813 | if (!is_vm_hugetlb_page(vma)) { |
814 | unsigned int forbidden = VM_PFNMAP; |
815 | |
816 | if (behavior != MADV_DONTNEED_LOCKED) |
817 | forbidden |= VM_LOCKED; |
818 | |
819 | return !(vma->vm_flags & forbidden); |
820 | } |
821 | |
822 | if (behavior != MADV_DONTNEED && behavior != MADV_DONTNEED_LOCKED) |
823 | return false; |
	if (start & ~huge_page_mask(hstate_vma(vma)))
825 | return false; |
826 | |
827 | /* |
828 | * Madvise callers expect the length to be rounded up to PAGE_SIZE |
829 | * boundaries, and may be unaware that this VMA uses huge pages. |
830 | * Avoid unexpected data loss by rounding down the number of |
831 | * huge pages freed. |
832 | */ |
833 | *end = ALIGN_DOWN(*end, huge_page_size(hstate_vma(vma))); |
834 | |
835 | return true; |
836 | } |
837 | |
838 | static long madvise_dontneed_free(struct vm_area_struct *vma, |
839 | struct vm_area_struct **prev, |
840 | unsigned long start, unsigned long end, |
841 | int behavior) |
842 | { |
843 | struct mm_struct *mm = vma->vm_mm; |
844 | |
845 | *prev = vma; |
	if (!madvise_dontneed_free_valid_vma(vma, start, &end, behavior))
847 | return -EINVAL; |
848 | |
849 | if (start == end) |
850 | return 0; |
851 | |
852 | if (!userfaultfd_remove(vma, start, end)) { |
853 | *prev = NULL; /* mmap_lock has been dropped, prev is stale */ |
854 | |
855 | mmap_read_lock(mm); |
		vma = vma_lookup(mm, start);
857 | if (!vma) |
858 | return -ENOMEM; |
859 | /* |
860 | * Potential end adjustment for hugetlb vma is OK as |
861 | * the check below keeps end within vma. |
862 | */ |
		if (!madvise_dontneed_free_valid_vma(vma, start, &end,
						     behavior))
865 | return -EINVAL; |
866 | if (end > vma->vm_end) { |
867 | /* |
868 | * Don't fail if end > vma->vm_end. If the old |
869 | * vma was split while the mmap_lock was |
870 | * released the effect of the concurrent |
871 | * operation may not cause madvise() to |
872 | * have an undefined result. There may be an |
873 | * adjacent next vma that we'll walk |
874 | * next. userfaultfd_remove() will generate an |
875 | * UFFD_EVENT_REMOVE repetition on the |
876 | * end-vma->vm_end range, but the manager can |
877 | * handle a repetition fine. |
878 | */ |
879 | end = vma->vm_end; |
880 | } |
881 | VM_WARN_ON(start >= end); |
882 | } |
883 | |
884 | if (behavior == MADV_DONTNEED || behavior == MADV_DONTNEED_LOCKED) |
885 | return madvise_dontneed_single_vma(vma, start, end); |
886 | else if (behavior == MADV_FREE) |
		return madvise_free_single_vma(vma, start, end);
888 | else |
889 | return -EINVAL; |
890 | } |
891 | |
892 | static long madvise_populate(struct vm_area_struct *vma, |
893 | struct vm_area_struct **prev, |
894 | unsigned long start, unsigned long end, |
895 | int behavior) |
896 | { |
897 | const bool write = behavior == MADV_POPULATE_WRITE; |
898 | struct mm_struct *mm = vma->vm_mm; |
899 | unsigned long tmp_end; |
900 | int locked = 1; |
901 | long pages; |
902 | |
903 | *prev = vma; |
904 | |
905 | while (start < end) { |
906 | /* |
907 | * We might have temporarily dropped the lock. For example, |
908 | * our VMA might have been split. |
909 | */ |
910 | if (!vma || start >= vma->vm_end) { |
			vma = vma_lookup(mm, start);
912 | if (!vma) |
913 | return -ENOMEM; |
914 | } |
915 | |
916 | tmp_end = min_t(unsigned long, end, vma->vm_end); |
917 | /* Populate (prefault) page tables readable/writable. */ |
		pages = faultin_vma_page_range(vma, start, tmp_end, write,
					       &locked);
920 | if (!locked) { |
921 | mmap_read_lock(mm); |
922 | locked = 1; |
923 | *prev = NULL; |
924 | vma = NULL; |
925 | } |
926 | if (pages < 0) { |
927 | switch (pages) { |
928 | case -EINTR: |
929 | return -EINTR; |
930 | case -EINVAL: /* Incompatible mappings / permissions. */ |
931 | return -EINVAL; |
932 | case -EHWPOISON: |
933 | return -EHWPOISON; |
934 | case -EFAULT: /* VM_FAULT_SIGBUS or VM_FAULT_SIGSEGV */ |
935 | return -EFAULT; |
936 | default: |
				pr_warn_once("%s: unhandled return value: %ld\n",
					     __func__, pages);
939 | fallthrough; |
940 | case -ENOMEM: |
941 | return -ENOMEM; |
942 | } |
943 | } |
944 | start += pages * PAGE_SIZE; |
945 | } |
946 | return 0; |
947 | } |
948 | |
949 | /* |
950 | * Application wants to free up the pages and associated backing store. |
951 | * This is effectively punching a hole into the middle of a file. |
952 | */ |
953 | static long madvise_remove(struct vm_area_struct *vma, |
954 | struct vm_area_struct **prev, |
955 | unsigned long start, unsigned long end) |
956 | { |
957 | loff_t offset; |
958 | int error; |
959 | struct file *f; |
960 | struct mm_struct *mm = vma->vm_mm; |
961 | |
962 | *prev = NULL; /* tell sys_madvise we drop mmap_lock */ |
963 | |
964 | if (vma->vm_flags & VM_LOCKED) |
965 | return -EINVAL; |
966 | |
967 | f = vma->vm_file; |
968 | |
969 | if (!f || !f->f_mapping || !f->f_mapping->host) { |
970 | return -EINVAL; |
971 | } |
972 | |
973 | if (!vma_is_shared_maywrite(vma)) |
974 | return -EACCES; |
975 | |
976 | offset = (loff_t)(start - vma->vm_start) |
977 | + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); |
978 | |
979 | /* |
980 | * Filesystem's fallocate may need to take i_rwsem. We need to |
981 | * explicitly grab a reference because the vma (and hence the |
982 | * vma's reference to the file) can go away as soon as we drop |
983 | * mmap_lock. |
984 | */ |
985 | get_file(f); |
986 | if (userfaultfd_remove(vma, start, end)) { |
987 | /* mmap_lock was not released by userfaultfd_remove() */ |
988 | mmap_read_unlock(mm); |
989 | } |
	error = vfs_fallocate(f,
				FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
				offset, end - start);
993 | fput(f); |
994 | mmap_read_lock(mm); |
995 | return error; |
996 | } |
997 | |
998 | /* |
999 | * Apply an madvise behavior to a region of a vma. madvise_update_vma |
1000 | * will handle splitting a vm area into separate areas, each area with its own |
1001 | * behavior. |
1002 | */ |
1003 | static int madvise_vma_behavior(struct vm_area_struct *vma, |
1004 | struct vm_area_struct **prev, |
1005 | unsigned long start, unsigned long end, |
1006 | unsigned long behavior) |
1007 | { |
1008 | int error; |
1009 | struct anon_vma_name *anon_name; |
1010 | unsigned long new_flags = vma->vm_flags; |
1011 | |
1012 | switch (behavior) { |
1013 | case MADV_REMOVE: |
1014 | return madvise_remove(vma, prev, start, end); |
1015 | case MADV_WILLNEED: |
1016 | return madvise_willneed(vma, prev, start, end); |
1017 | case MADV_COLD: |
		return madvise_cold(vma, prev, start, end);
	case MADV_PAGEOUT:
		return madvise_pageout(vma, prev, start, end);
1021 | case MADV_FREE: |
1022 | case MADV_DONTNEED: |
1023 | case MADV_DONTNEED_LOCKED: |
1024 | return madvise_dontneed_free(vma, prev, start, end, behavior); |
1025 | case MADV_POPULATE_READ: |
1026 | case MADV_POPULATE_WRITE: |
1027 | return madvise_populate(vma, prev, start, end, behavior); |
1028 | case MADV_NORMAL: |
1029 | new_flags = new_flags & ~VM_RAND_READ & ~VM_SEQ_READ; |
1030 | break; |
1031 | case MADV_SEQUENTIAL: |
1032 | new_flags = (new_flags & ~VM_RAND_READ) | VM_SEQ_READ; |
1033 | break; |
1034 | case MADV_RANDOM: |
1035 | new_flags = (new_flags & ~VM_SEQ_READ) | VM_RAND_READ; |
1036 | break; |
1037 | case MADV_DONTFORK: |
1038 | new_flags |= VM_DONTCOPY; |
1039 | break; |
1040 | case MADV_DOFORK: |
1041 | if (vma->vm_flags & VM_IO) |
1042 | return -EINVAL; |
1043 | new_flags &= ~VM_DONTCOPY; |
1044 | break; |
1045 | case MADV_WIPEONFORK: |
1046 | /* MADV_WIPEONFORK is only supported on anonymous memory. */ |
1047 | if (vma->vm_file || vma->vm_flags & VM_SHARED) |
1048 | return -EINVAL; |
1049 | new_flags |= VM_WIPEONFORK; |
1050 | break; |
1051 | case MADV_KEEPONFORK: |
1052 | new_flags &= ~VM_WIPEONFORK; |
1053 | break; |
1054 | case MADV_DONTDUMP: |
1055 | new_flags |= VM_DONTDUMP; |
1056 | break; |
1057 | case MADV_DODUMP: |
1058 | if (!is_vm_hugetlb_page(vma) && new_flags & VM_SPECIAL) |
1059 | return -EINVAL; |
1060 | new_flags &= ~VM_DONTDUMP; |
1061 | break; |
1062 | case MADV_MERGEABLE: |
1063 | case MADV_UNMERGEABLE: |
		error = ksm_madvise(vma, start, end, behavior, &new_flags);
1065 | if (error) |
1066 | goto out; |
1067 | break; |
1068 | case MADV_HUGEPAGE: |
1069 | case MADV_NOHUGEPAGE: |
		error = hugepage_madvise(vma, &new_flags, behavior);
1071 | if (error) |
1072 | goto out; |
1073 | break; |
1074 | case MADV_COLLAPSE: |
1075 | return madvise_collapse(vma, prev, start, end); |
1076 | } |
1077 | |
1078 | anon_name = anon_vma_name(vma); |
1079 | anon_vma_name_get(anon_name); |
1080 | error = madvise_update_vma(vma, prev, start, end, new_flags, |
1081 | anon_name); |
1082 | anon_vma_name_put(anon_name); |
1083 | |
1084 | out: |
1085 | /* |
1086 | * madvise() returns EAGAIN if kernel resources, such as |
1087 | * slab, are temporarily unavailable. |
1088 | */ |
1089 | if (error == -ENOMEM) |
1090 | error = -EAGAIN; |
1091 | return error; |
1092 | } |
1093 | |
1094 | #ifdef CONFIG_MEMORY_FAILURE |
1095 | /* |
1096 | * Error injection support for memory error handling. |
1097 | */ |
1098 | static int madvise_inject_error(int behavior, |
1099 | unsigned long start, unsigned long end) |
1100 | { |
1101 | unsigned long size; |
1102 | |
1103 | if (!capable(CAP_SYS_ADMIN)) |
1104 | return -EPERM; |
1105 | |
1107 | for (; start < end; start += size) { |
1108 | unsigned long pfn; |
1109 | struct page *page; |
1110 | int ret; |
1111 | |
		ret = get_user_pages_fast(start, 1, 0, &page);
1113 | if (ret != 1) |
1114 | return ret; |
1115 | pfn = page_to_pfn(page); |
1116 | |
1117 | /* |
1118 | * When soft offlining hugepages, after migrating the page |
1119 | * we dissolve it, therefore in the second loop "page" will |
1120 | * no longer be a compound page. |
1121 | */ |
1122 | size = page_size(compound_head(page)); |
1123 | |
1124 | if (behavior == MADV_SOFT_OFFLINE) { |
			pr_info("Soft offlining pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = soft_offline_page(pfn, MF_COUNT_INCREASED);
		} else {
			pr_info("Injecting memory failure for pfn %#lx at process virtual address %#lx\n",
				 pfn, start);
			ret = memory_failure(pfn, MF_COUNT_INCREASED | MF_SW_SIMULATED);
1132 | if (ret == -EOPNOTSUPP) |
1133 | ret = 0; |
1134 | } |
1135 | |
1136 | if (ret) |
1137 | return ret; |
1138 | } |
1139 | |
1140 | return 0; |
1141 | } |
1142 | #endif |
1143 | |
1144 | static bool |
1145 | madvise_behavior_valid(int behavior) |
1146 | { |
1147 | switch (behavior) { |
1148 | case MADV_DOFORK: |
1149 | case MADV_DONTFORK: |
1150 | case MADV_NORMAL: |
1151 | case MADV_SEQUENTIAL: |
1152 | case MADV_RANDOM: |
1153 | case MADV_REMOVE: |
1154 | case MADV_WILLNEED: |
1155 | case MADV_DONTNEED: |
1156 | case MADV_DONTNEED_LOCKED: |
1157 | case MADV_FREE: |
1158 | case MADV_COLD: |
1159 | case MADV_PAGEOUT: |
1160 | case MADV_POPULATE_READ: |
1161 | case MADV_POPULATE_WRITE: |
1162 | #ifdef CONFIG_KSM |
1163 | case MADV_MERGEABLE: |
1164 | case MADV_UNMERGEABLE: |
1165 | #endif |
1166 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
1167 | case MADV_HUGEPAGE: |
1168 | case MADV_NOHUGEPAGE: |
1169 | case MADV_COLLAPSE: |
1170 | #endif |
1171 | case MADV_DONTDUMP: |
1172 | case MADV_DODUMP: |
1173 | case MADV_WIPEONFORK: |
1174 | case MADV_KEEPONFORK: |
1175 | #ifdef CONFIG_MEMORY_FAILURE |
1176 | case MADV_SOFT_OFFLINE: |
1177 | case MADV_HWPOISON: |
1178 | #endif |
1179 | return true; |
1180 | |
1181 | default: |
1182 | return false; |
1183 | } |
1184 | } |
1185 | |
1186 | static bool process_madvise_behavior_valid(int behavior) |
1187 | { |
1188 | switch (behavior) { |
1189 | case MADV_COLD: |
1190 | case MADV_PAGEOUT: |
1191 | case MADV_WILLNEED: |
1192 | case MADV_COLLAPSE: |
1193 | return true; |
1194 | default: |
1195 | return false; |
1196 | } |
1197 | } |
1198 | |
1199 | /* |
1200 | * Walk the vmas in range [start,end), and call the visit function on each one. |
1201 | * The visit function will get start and end parameters that cover the overlap |
1202 | * between the current vma and the original range. Any unmapped regions in the |
1203 | * original range will result in this function returning -ENOMEM while still |
1204 | * calling the visit function on all of the existing vmas in the range. |
1205 | * Must be called with the mmap_lock held for reading or writing. |
1206 | */ |
1207 | static |
1208 | int madvise_walk_vmas(struct mm_struct *mm, unsigned long start, |
1209 | unsigned long end, unsigned long arg, |
1210 | int (*visit)(struct vm_area_struct *vma, |
1211 | struct vm_area_struct **prev, unsigned long start, |
1212 | unsigned long end, unsigned long arg)) |
1213 | { |
1214 | struct vm_area_struct *vma; |
1215 | struct vm_area_struct *prev; |
1216 | unsigned long tmp; |
1217 | int unmapped_error = 0; |
1218 | |
1219 | /* |
1220 | * If the interval [start,end) covers some unmapped address |
1221 | * ranges, just ignore them, but return -ENOMEM at the end. |
1222 | * - different from the way of handling in mlock etc. |
1223 | */ |
	vma = find_vma_prev(mm, start, &prev);
1225 | if (vma && start > vma->vm_start) |
1226 | prev = vma; |
1227 | |
1228 | for (;;) { |
1229 | int error; |
1230 | |
1231 | /* Still start < end. */ |
1232 | if (!vma) |
1233 | return -ENOMEM; |
1234 | |
1235 | /* Here start < (end|vma->vm_end). */ |
1236 | if (start < vma->vm_start) { |
1237 | unmapped_error = -ENOMEM; |
1238 | start = vma->vm_start; |
1239 | if (start >= end) |
1240 | break; |
1241 | } |
1242 | |
1243 | /* Here vma->vm_start <= start < (end|vma->vm_end) */ |
1244 | tmp = vma->vm_end; |
1245 | if (end < tmp) |
1246 | tmp = end; |
1247 | |
1248 | /* Here vma->vm_start <= start < tmp <= (end|vma->vm_end). */ |
1249 | error = visit(vma, &prev, start, tmp, arg); |
1250 | if (error) |
1251 | return error; |
1252 | start = tmp; |
1253 | if (prev && start < prev->vm_end) |
1254 | start = prev->vm_end; |
1255 | if (start >= end) |
1256 | break; |
		if (prev)
			vma = find_vma(mm, prev->vm_end);
		else	/* madvise_remove dropped mmap_lock */
			vma = find_vma(mm, start);
1261 | } |
1262 | |
1263 | return unmapped_error; |
1264 | } |
1265 | |
1266 | #ifdef CONFIG_ANON_VMA_NAME |
1267 | static int madvise_vma_anon_name(struct vm_area_struct *vma, |
1268 | struct vm_area_struct **prev, |
1269 | unsigned long start, unsigned long end, |
1270 | unsigned long anon_name) |
1271 | { |
1272 | int error; |
1273 | |
1274 | /* Only anonymous mappings can be named */ |
1275 | if (vma->vm_file && !vma_is_anon_shmem(vma)) |
1276 | return -EBADF; |
1277 | |
	error = madvise_update_vma(vma, prev, start, end, vma->vm_flags,
				   (struct anon_vma_name *)anon_name);
1280 | |
1281 | /* |
1282 | * madvise() returns EAGAIN if kernel resources, such as |
1283 | * slab, are temporarily unavailable. |
1284 | */ |
1285 | if (error == -ENOMEM) |
1286 | error = -EAGAIN; |
1287 | return error; |
1288 | } |
1289 | |
1290 | int madvise_set_anon_name(struct mm_struct *mm, unsigned long start, |
1291 | unsigned long len_in, struct anon_vma_name *anon_name) |
1292 | { |
1293 | unsigned long end; |
1294 | unsigned long len; |
1295 | |
1296 | if (start & ~PAGE_MASK) |
1297 | return -EINVAL; |
1298 | len = (len_in + ~PAGE_MASK) & PAGE_MASK; |
1299 | |
1300 | /* Check to see whether len was rounded up from small -ve to zero */ |
1301 | if (len_in && !len) |
1302 | return -EINVAL; |
1303 | |
1304 | end = start + len; |
1305 | if (end < start) |
1306 | return -EINVAL; |
1307 | |
1308 | if (end == start) |
1309 | return 0; |
1310 | |
	return madvise_walk_vmas(mm, start, end, (unsigned long)anon_name,
			madvise_vma_anon_name);
1313 | } |
1314 | #endif /* CONFIG_ANON_VMA_NAME */ |
1315 | /* |
1316 | * The madvise(2) system call. |
1317 | * |
1318 | * Applications can use madvise() to advise the kernel how it should |
1319 | * handle paging I/O in this VM area. The idea is to help the kernel |
1320 | * use appropriate read-ahead and caching techniques. The information |
1321 | * provided is advisory only, and can be safely disregarded by the |
1322 | * kernel without affecting the correct operation of the application. |
1323 | * |
1324 | * behavior values: |
1325 | * MADV_NORMAL - the default behavior is to read clusters. This |
1326 | * results in some read-ahead and read-behind. |
1327 | * MADV_RANDOM - the system should read the minimum amount of data |
1328 | * on any access, since it is unlikely that the appli- |
1329 | * cation will need more than what it asks for. |
1330 | * MADV_SEQUENTIAL - pages in the given range will probably be accessed |
1331 | * once, so they can be aggressively read ahead, and |
1332 | * can be freed soon after they are accessed. |
1333 | * MADV_WILLNEED - the application is notifying the system to read |
1334 | * some pages ahead. |
1335 | * MADV_DONTNEED - the application is finished with the given range, |
1336 | * so the kernel can free resources associated with it. |
1337 | * MADV_FREE - the application marks pages in the given range as lazy free, |
1338 | * where actual purges are postponed until memory pressure happens. |
1339 | * MADV_REMOVE - the application wants to free up the given range of |
1340 | * pages and associated backing store. |
1341 | * MADV_DONTFORK - omit this area from child's address space when forking: |
1342 | * typically, to avoid COWing pages pinned by get_user_pages(). |
1343 | * MADV_DOFORK - cancel MADV_DONTFORK: no longer omit this area when forking. |
1344 | * MADV_WIPEONFORK - present the child process with zero-filled memory in this |
1345 | * range after a fork. |
1346 | * MADV_KEEPONFORK - undo the effect of MADV_WIPEONFORK |
1347 | * MADV_HWPOISON - trigger memory error handler as if the given memory range |
1348 | * were corrupted by unrecoverable hardware memory failure. |
1349 | * MADV_SOFT_OFFLINE - try to soft-offline the given range of memory. |
1350 | * MADV_MERGEABLE - the application recommends that KSM try to merge pages in |
1351 | * this area with pages of identical content from other such areas. |
1352 | * MADV_UNMERGEABLE- cancel MADV_MERGEABLE: no longer merge pages with others. |
1353 | * MADV_HUGEPAGE - the application wants to back the given range by transparent |
1354 | * huge pages in the future. Existing pages might be coalesced and |
1355 | * new pages might be allocated as THP. |
1356 | * MADV_NOHUGEPAGE - mark the given range as not worth being backed by |
1357 | * transparent huge pages so the existing pages will not be |
1358 | * coalesced into THP and new pages will not be allocated as THP. |
1359 | * MADV_COLLAPSE - synchronously coalesce pages into new THP. |
1360 | * MADV_DONTDUMP - the application wants to prevent pages in the given range |
1361 | * from being included in its core dump. |
1362 | * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. |
1363 | * MADV_COLD - the application is not expected to use this memory soon, |
1364 | * deactivate pages in this range so that they can be reclaimed |
1365 | * easily if memory pressure happens. |
1366 | * MADV_PAGEOUT - the application is not expected to use this memory soon, |
1367 | * page out the pages in this range immediately. |
1368 | * MADV_POPULATE_READ - populate (prefault) page tables readable by |
1369 | * triggering read faults if required |
1370 | * MADV_POPULATE_WRITE - populate (prefault) page tables writable by |
1371 | * triggering write faults if required |
1372 | * |
1373 | * return values: |
1374 | * zero - success |
1375 | * -EINVAL - start + len < 0, start is not page-aligned, |
1376 | * "behavior" is not a valid value, or application |
1377 | * is attempting to release locked or shared pages, |
1378 | * or the specified address range includes file, Huge TLB, |
1379 | * MAP_SHARED or VMPFNMAP range. |
1380 | * -ENOMEM - addresses in the specified range are not currently |
1381 | * mapped, or are outside the AS of the process. |
1382 | * -EIO - an I/O error occurred while paging in data. |
1383 | * -EBADF - map exists, but area maps something that isn't a file. |
1384 | * -EAGAIN - a kernel resource was temporarily unavailable. |
1385 | */ |
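/*
 * Illustrative userspace sketch (not part of this file): the mapping size and
 * the advice value below are arbitrary example choices, shown only to make
 * the behavior values above concrete.
 *
 *	#include <sys/mman.h>
 *
 *	size_t len = 1 << 20;
 *	void *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
 *	// ... use buf ...
 *	madvise(buf, len, MADV_DONTNEED);  // contents may now read back as zero
 */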
1386 | int do_madvise(struct mm_struct *mm, unsigned long start, size_t len_in, int behavior) |
1387 | { |
1388 | unsigned long end; |
1389 | int error; |
1390 | int write; |
1391 | size_t len; |
1392 | struct blk_plug plug; |
1393 | |
1394 | if (!madvise_behavior_valid(behavior)) |
1395 | return -EINVAL; |
1396 | |
1397 | if (!PAGE_ALIGNED(start)) |
1398 | return -EINVAL; |
1399 | len = PAGE_ALIGN(len_in); |
1400 | |
1401 | /* Check to see whether len was rounded up from small -ve to zero */ |
1402 | if (len_in && !len) |
1403 | return -EINVAL; |
1404 | |
1405 | end = start + len; |
1406 | if (end < start) |
1407 | return -EINVAL; |
1408 | |
1409 | if (end == start) |
1410 | return 0; |
1411 | |
1412 | #ifdef CONFIG_MEMORY_FAILURE |
1413 | if (behavior == MADV_HWPOISON || behavior == MADV_SOFT_OFFLINE) |
		return madvise_inject_error(behavior, start, start + len_in);
1415 | #endif |
1416 | |
1417 | write = madvise_need_mmap_write(behavior); |
1418 | if (write) { |
1419 | if (mmap_write_lock_killable(mm)) |
1420 | return -EINTR; |
1421 | } else { |
1422 | mmap_read_lock(mm); |
1423 | } |
1424 | |
1425 | start = untagged_addr_remote(mm, start); |
1426 | end = start + len; |
1427 | |
1428 | blk_start_plug(&plug); |
	error = madvise_walk_vmas(mm, start, end, behavior,
			madvise_vma_behavior);
1431 | blk_finish_plug(&plug); |
1432 | if (write) |
1433 | mmap_write_unlock(mm); |
1434 | else |
1435 | mmap_read_unlock(mm); |
1436 | |
1437 | return error; |
1438 | } |
1439 | |
1440 | SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) |
1441 | { |
1442 | return do_madvise(current->mm, start, len_in, behavior); |
1443 | } |
1444 | |
1445 | SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, |
1446 | size_t, vlen, int, behavior, unsigned int, flags) |
1447 | { |
1448 | ssize_t ret; |
1449 | struct iovec iovstack[UIO_FASTIOV]; |
1450 | struct iovec *iov = iovstack; |
1451 | struct iov_iter iter; |
1452 | struct task_struct *task; |
1453 | struct mm_struct *mm; |
1454 | size_t total_len; |
1455 | unsigned int f_flags; |
1456 | |
1457 | if (flags != 0) { |
1458 | ret = -EINVAL; |
1459 | goto out; |
1460 | } |
1461 | |
	ret = import_iovec(ITER_DEST, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter);
1463 | if (ret < 0) |
1464 | goto out; |
1465 | |
	task = pidfd_get_task(pidfd, &f_flags);
	if (IS_ERR(task)) {
		ret = PTR_ERR(task);
1469 | goto free_iov; |
1470 | } |
1471 | |
1472 | if (!process_madvise_behavior_valid(behavior)) { |
1473 | ret = -EINVAL; |
1474 | goto release_task; |
1475 | } |
1476 | |
1477 | /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ |
1478 | mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); |
	if (IS_ERR_OR_NULL(mm)) {
		ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH;
1481 | goto release_task; |
1482 | } |
1483 | |
1484 | /* |
1485 | * Require CAP_SYS_NICE for influencing process performance. Note that |
1486 | * only non-destructive hints are currently supported. |
1487 | */ |
1488 | if (!capable(CAP_SYS_NICE)) { |
1489 | ret = -EPERM; |
1490 | goto release_mm; |
1491 | } |
1492 | |
	total_len = iov_iter_count(&iter);

	while (iov_iter_count(&iter)) {
		ret = do_madvise(mm, (unsigned long)iter_iov_addr(&iter),
					iter_iov_len(&iter), behavior);
		if (ret < 0)
			break;
		iov_iter_advance(&iter, iter_iov_len(&iter));
	}

	ret = (total_len - iov_iter_count(&iter)) ? : ret;
1504 | |
1505 | release_mm: |
1506 | mmput(mm); |
1507 | release_task: |
	put_task_struct(task);
free_iov:
	kfree(iov);
1511 | out: |
1512 | return ret; |
1513 | } |
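/*
 * Illustrative userspace sketch (not part of this file): target_pid, addr and
 * len are placeholders.  As checked above, the caller needs PTRACE_MODE_READ
 * access to the target plus CAP_SYS_NICE, and only the non-destructive hints
 * accepted by process_madvise_behavior_valid() are allowed.
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <sys/uio.h>
 *
 *	int pidfd = syscall(SYS_pidfd_open, target_pid, 0);
 *	struct iovec iov = { .iov_base = addr, .iov_len = len };
 *
 *	syscall(SYS_process_madvise, pidfd, &iov, 1, MADV_PAGEOUT, 0);
 */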
1514 | |