/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RMAP_H
#define _LINUX_RMAP_H
/*
 * Declarations for Reverse Mapping functions in mm/rmap.c
 */

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/rwsem.h>
#include <linux/memcontrol.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/memremap.h>

/*
 * The anon_vma heads a list of private "related" vmas, to scan if
 * an anonymous page pointing to this anon_vma needs to be unmapped:
 * the vmas on the list will be related by forking, or by splitting.
 *
 * Since vmas come and go as they are split and merged (particularly
 * in mprotect), the mapping field of an anonymous page cannot point
 * directly to a vma: instead it points to an anon_vma, on whose list
 * the related vmas can be easily linked or unlinked.
 *
 * After unlinking the last vma on the list, we must garbage collect
 * the anon_vma object itself: we're guaranteed no page can be
 * pointing to this anon_vma once its vma list is empty.
 */
struct anon_vma {
	struct anon_vma *root;		/* Root of this anon_vma tree */
	struct rw_semaphore rwsem;	/* W: modification, R: walking the list */
	/*
	 * The refcount is taken on an anon_vma when there is no
	 * guarantee that the vma of page tables will exist for
	 * the duration of the operation. A caller that takes
	 * the reference is responsible for clearing up the
	 * anon_vma if they are the last user on release.
	 */
	atomic_t refcount;

	/*
	 * Count of child anon_vmas. Equal to the count of all anon_vmas
	 * that have ->parent pointing to this one, including itself.
	 *
	 * This counter is used when deciding whether to reuse an existing
	 * anon_vma instead of forking a new one. See the comments in
	 * anon_vma_clone().
	 */
	unsigned long num_children;
	/* Count of VMAs whose ->anon_vma pointer points to this object. */
	unsigned long num_active_vmas;

	struct anon_vma *parent;	/* Parent of this anon_vma */

	/*
	 * NOTE: the LSB of the rb_root.rb_node is set by
	 * mm_take_all_locks() _after_ taking the above lock. So the
	 * rb_root must only be read/written after taking the above lock
	 * to be sure to see a valid next pointer. The LSB bit itself
	 * is serialized by a system wide lock only visible to
	 * mm_take_all_locks() (mm_all_locks_mutex).
	 */

	/* Interval tree of private "related" vmas */
	struct rb_root_cached rb_root;
};
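
/*
 * Illustrative sketch (not part of this header): after fork(), the child
 * VMA gets its own anon_vma whose ->parent and ->root tie it into the
 * parent's tree, so locking any member's root rwsem serializes the whole
 * tree:
 *
 *	parent vma --> anon_vma A	(A->root == A, A->parent == A)
 *	child  vma --> anon_vma B	(B->root == A, B->parent == A)
 *
 * Pages faulted in by the child point at B; pages inherited from the
 * parent keep pointing at A, so both anon_vmas must be consulted when
 * walking the rmap.
 */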

/*
 * The copy-on-write semantics of fork mean that an anon_vma
 * can become associated with multiple processes. Furthermore,
 * each child process will have its own anon_vma, where new
 * pages for that process are instantiated.
 *
 * This structure allows us to find the anon_vmas associated
 * with a VMA, or the VMAs associated with an anon_vma.
 * The "same_vma" list contains the anon_vma_chains linking
 * all the anon_vmas associated with this VMA.
 * The "rb" field indexes on an interval tree the anon_vma_chains
 * which link all the VMAs associated with this anon_vma.
 */
struct anon_vma_chain {
	struct vm_area_struct *vma;
	struct anon_vma *anon_vma;
	struct list_head same_vma;	/* locked by mmap_lock & page_table_lock */
	struct rb_node rb;		/* locked by anon_vma->rwsem */
	unsigned long rb_subtree_last;
#ifdef CONFIG_DEBUG_VM_RB
	unsigned long cached_vma_start, cached_vma_last;
#endif
};
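
/*
 * Example (illustrative sketch, not part of this header): the chain can
 * be traversed in both directions; anon_vma_interval_tree_foreach() is
 * part of the kernel's rmap support, while "visit", "avc" and the pgoff
 * bounds are names invented for the example.
 *
 *	struct anon_vma_chain *avc;
 *
 *	// all anon_vmas this VMA is linked into (mmap_lock held)
 *	list_for_each_entry(avc, &vma->anon_vma_chain, same_vma)
 *		visit(avc->anon_vma);
 *
 *	// all VMAs mapping [pgoff_start, pgoff_end] of one anon_vma
 *	// (anon_vma_lock_read() held)
 *	anon_vma_interval_tree_foreach(avc, &anon_vma->rb_root,
 *				       pgoff_start, pgoff_end)
 *		visit_vma(avc->vma);
 */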

enum ttu_flags {
	TTU_SPLIT_HUGE_PMD	= 0x4,	/* split huge PMD if any */
	TTU_IGNORE_MLOCK	= 0x8,	/* ignore mlock */
	TTU_SYNC		= 0x10,	/* avoid racy checks with PVMW_SYNC */
	TTU_HWPOISON		= 0x20,	/* do convert pte to hwpoison entry */
	TTU_BATCH_FLUSH		= 0x40,	/* Batch TLB flushes where possible
					 * and caller guarantees they will
					 * do a final flush if necessary */
	TTU_RMAP_LOCKED		= 0x80,	/* do not grab rmap lock:
					 * caller holds it */
};
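
/*
 * Example (illustrative sketch): callers OR these flags together; a
 * reclaim path that batches TLB flushes might do something like:
 *
 *	enum ttu_flags flags = TTU_BATCH_FLUSH;
 *
 *	if (folio_test_pmd_mappable(folio))
 *		flags |= TTU_SPLIT_HUGE_PMD;
 *	try_to_unmap(folio, flags);
 *	// the caller must later issue the final TLB flush itself
 */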

#ifdef CONFIG_MMU
static inline void get_anon_vma(struct anon_vma *anon_vma)
{
	atomic_inc(&anon_vma->refcount);
}

void __put_anon_vma(struct anon_vma *anon_vma);

static inline void put_anon_vma(struct anon_vma *anon_vma)
{
	if (atomic_dec_and_test(&anon_vma->refcount))
		__put_anon_vma(anon_vma);
}
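
/*
 * Example (illustrative sketch): a typical get/put pairing when the VMAs
 * might go away underneath us, e.g. during migration;
 * folio_get_anon_vma(), declared below, takes the reference for us:
 *
 *	struct anon_vma *anon_vma = folio_get_anon_vma(folio);
 *
 *	if (anon_vma) {
 *		// the anon_vma cannot be freed while we hold the reference
 *		...
 *		put_anon_vma(anon_vma);
 *	}
 */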

static inline void anon_vma_lock_write(struct anon_vma *anon_vma)
{
	down_write(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_write(struct anon_vma *anon_vma)
{
	return down_write_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_write(struct anon_vma *anon_vma)
{
	up_write(&anon_vma->root->rwsem);
}

static inline void anon_vma_lock_read(struct anon_vma *anon_vma)
{
	down_read(&anon_vma->root->rwsem);
}

static inline int anon_vma_trylock_read(struct anon_vma *anon_vma)
{
	return down_read_trylock(&anon_vma->root->rwsem);
}

static inline void anon_vma_unlock_read(struct anon_vma *anon_vma)
{
	up_read(&anon_vma->root->rwsem);
}
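
/*
 * Example (illustrative sketch): all of the helpers above operate on
 * anon_vma->root->rwsem, so taking the lock on any anon_vma serializes
 * against the entire tree it belongs to:
 *
 *	anon_vma_lock_read(anon_vma);
 *	// safe to walk anon_vma->rb_root here
 *	anon_vma_unlock_read(anon_vma);
 */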


/*
 * anon_vma helper functions.
 */
void anon_vma_init(void);	/* create anon_vma_cachep */
int __anon_vma_prepare(struct vm_area_struct *);
void unlink_anon_vmas(struct vm_area_struct *);
int anon_vma_clone(struct vm_area_struct *, struct vm_area_struct *);
int anon_vma_fork(struct vm_area_struct *, struct vm_area_struct *);

static inline int anon_vma_prepare(struct vm_area_struct *vma)
{
	if (likely(vma->anon_vma))
		return 0;

	return __anon_vma_prepare(vma);
}
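
/*
 * Example (illustrative sketch): fault handlers call anon_vma_prepare()
 * before installing a new anonymous page, treating failure as OOM:
 *
 *	if (unlikely(anon_vma_prepare(vma)))
 *		return VM_FAULT_OOM;
 *	// vma->anon_vma is now valid; allocate and map the page
 */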

static inline void anon_vma_merge(struct vm_area_struct *vma,
				  struct vm_area_struct *next)
{
	VM_BUG_ON_VMA(vma->anon_vma != next->anon_vma, vma);
	unlink_anon_vmas(next);
}

struct anon_vma *folio_get_anon_vma(struct folio *folio);

/* RMAP flags, currently only relevant for some anon rmap operations. */
typedef int __bitwise rmap_t;

/*
 * No special request: A mapped anonymous (sub)page is possibly shared between
 * processes.
 */
#define RMAP_NONE		((__force rmap_t)0)

/* The anonymous (sub)page is exclusive to a single process. */
#define RMAP_EXCLUSIVE		((__force rmap_t)BIT(0))

/*
 * Internally, we're using an enum to specify the granularity. We make the
 * compiler emit specialized code for each granularity.
 */
enum rmap_level {
	RMAP_LEVEL_PTE = 0,
	RMAP_LEVEL_PMD,
};

static inline void __folio_rmap_sanity_checks(struct folio *folio,
		struct page *page, int nr_pages, enum rmap_level level)
{
	/* hugetlb folios are handled separately. */
	VM_WARN_ON_FOLIO(folio_test_hugetlb(folio), folio);

	/*
	 * TODO: we get driver-allocated folios that have nothing to do with
	 * the rmap using vm_insert_page(); therefore, we cannot assume that
	 * folio_test_large_rmappable() holds for large folios. We should
	 * handle any desired mapcount+stats accounting for these folios in
	 * VM_MIXEDMAP VMAs separately, and then sanity-check here that
	 * we really only get rmappable folios.
	 */

	VM_WARN_ON_ONCE(nr_pages <= 0);
	VM_WARN_ON_FOLIO(page_folio(page) != folio, folio);
	VM_WARN_ON_FOLIO(page_folio(page + nr_pages - 1) != folio, folio);

	switch (level) {
	case RMAP_LEVEL_PTE:
		break;
	case RMAP_LEVEL_PMD:
		/*
		 * We don't support folios larger than a single PMD yet. So
		 * when RMAP_LEVEL_PMD is set, we assume that we are creating
		 * a single "entire" mapping of the folio.
		 */
		VM_WARN_ON_FOLIO(folio_nr_pages(folio) != HPAGE_PMD_NR, folio);
		VM_WARN_ON_FOLIO(nr_pages != HPAGE_PMD_NR, folio);
		break;
	default:
		VM_WARN_ON_ONCE(true);
	}
}

/*
 * rmap interfaces called when adding or removing pte of page
 */
void folio_move_anon_rmap(struct folio *, struct vm_area_struct *);
void folio_add_anon_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
#define folio_add_anon_rmap_pte(folio, page, vma, address, flags) \
	folio_add_anon_rmap_ptes(folio, page, 1, vma, address, flags)
void folio_add_anon_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *, unsigned long address, rmap_t flags);
void folio_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);
void folio_add_file_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_add_file_rmap_pte(folio, page, vma) \
	folio_add_file_rmap_ptes(folio, page, 1, vma)
void folio_add_file_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);
void folio_remove_rmap_ptes(struct folio *, struct page *, int nr_pages,
		struct vm_area_struct *);
#define folio_remove_rmap_pte(folio, page, vma) \
	folio_remove_rmap_ptes(folio, page, 1, vma)
void folio_remove_rmap_pmd(struct folio *, struct page *,
		struct vm_area_struct *);

void hugetlb_add_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address, rmap_t flags);
void hugetlb_add_new_anon_rmap(struct folio *, struct vm_area_struct *,
		unsigned long address);

/* See folio_try_dup_anon_rmap_*() */
static inline int hugetlb_try_dup_anon_rmap(struct folio *folio,
		struct vm_area_struct *vma)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);

	if (PageAnonExclusive(&folio->page)) {
		if (unlikely(folio_needs_cow_for_dma(vma, folio)))
			return -EBUSY;
		ClearPageAnonExclusive(&folio->page);
	}
	atomic_inc(&folio->_entire_mapcount);
	return 0;
}

/* See folio_try_share_anon_rmap_*() */
static inline int hugetlb_try_share_anon_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(&folio->page), folio);

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(&folio->page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
		smp_mb__after_atomic();
	return 0;
}

static inline void hugetlb_add_file_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);
	VM_WARN_ON_FOLIO(folio_test_anon(folio), folio);

	atomic_inc(&folio->_entire_mapcount);
}

static inline void hugetlb_remove_rmap(struct folio *folio)
{
	VM_WARN_ON_FOLIO(!folio_test_hugetlb(folio), folio);

	atomic_dec(&folio->_entire_mapcount);
}

static __always_inline void __folio_dup_file_rmap(struct folio *folio,
		struct page *page, int nr_pages, enum rmap_level level)
{
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	switch (level) {
	case RMAP_LEVEL_PTE:
		do {
			atomic_inc(&page->_mapcount);
		} while (page++, --nr_pages > 0);
		break;
	case RMAP_LEVEL_PMD:
		atomic_inc(&folio->_entire_mapcount);
		break;
	}
}

/**
 * folio_dup_file_rmap_ptes - duplicate PTE mappings of a page range of a folio
 * @folio:	The folio to duplicate the mappings of
 * @page:	The first page to duplicate the mappings of
 * @nr_pages:	The number of pages of which the mapping will be duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages)
{
	__folio_dup_file_rmap(folio, page, nr_pages, RMAP_LEVEL_PTE);
}
#define folio_dup_file_rmap_pte(folio, page) \
	folio_dup_file_rmap_ptes(folio, page, 1)
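
/*
 * Example (illustrative sketch): fork()'s page table copy duplicates the
 * rmap of a file-mapped page alongside taking a folio reference, before
 * installing the copied PTE in the child:
 *
 *	folio_get(folio);
 *	folio_dup_file_rmap_pte(folio, page);
 *	// now write the PTE into the child's page table
 */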

/**
 * folio_dup_file_rmap_pmd - duplicate a PMD mapping of a page range of a folio
 * @folio:	The folio to duplicate the mapping of
 * @page:	The first page to duplicate the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock.
 */
static inline void folio_dup_file_rmap_pmd(struct folio *folio,
		struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	__folio_dup_file_rmap(folio, page, HPAGE_PMD_NR, RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
#endif
}

static __always_inline int __folio_try_dup_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *src_vma,
		enum rmap_level level)
{
	bool maybe_pinned;
	int i;

	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/*
	 * If this folio may have been pinned by the parent process,
	 * don't allow the mappings to be duplicated; instead require that,
	 * e.g., the subpage is copied immediately for the child, so that
	 * we always guarantee the pinned folio won't be randomly replaced
	 * in the future on write faults.
	 */
	maybe_pinned = likely(!folio_is_device_private(folio)) &&
		       unlikely(folio_needs_cow_for_dma(src_vma, folio));

	/*
	 * No need to check+clear for already shared PTEs/PMDs of the
	 * folio. But if any page is PageAnonExclusive, we must fall back
	 * to copying if the folio may be pinned.
	 */
	switch (level) {
	case RMAP_LEVEL_PTE:
		if (unlikely(maybe_pinned)) {
			for (i = 0; i < nr_pages; i++)
				if (PageAnonExclusive(page + i))
					return -EBUSY;
		}
		do {
			if (PageAnonExclusive(page))
				ClearPageAnonExclusive(page);
			atomic_inc(&page->_mapcount);
		} while (page++, --nr_pages > 0);
		break;
	case RMAP_LEVEL_PMD:
		if (PageAnonExclusive(page)) {
			if (unlikely(maybe_pinned))
				return -EBUSY;
			ClearPageAnonExclusive(page);
		}
		atomic_inc(&folio->_entire_mapcount);
		break;
	}
	return 0;
}

/**
 * folio_try_dup_anon_rmap_ptes - try duplicating PTE mappings of a page range
 *				  of a folio
 * @folio:	The folio to duplicate the mappings of
 * @page:	The first page to duplicate the mappings of
 * @nr_pages:	The number of pages of which the mappings will be duplicated
 * @src_vma:	The vm area from which the mappings are duplicated
 *
 * The page range of the folio is defined by [page, page + nr_pages)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mappings can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mappings succeeded, the duplicated PTEs have to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mappings succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_ptes(struct folio *folio,
		struct page *page, int nr_pages, struct vm_area_struct *src_vma)
{
	return __folio_try_dup_anon_rmap(folio, page, nr_pages, src_vma,
					 RMAP_LEVEL_PTE);
}
#define folio_try_dup_anon_rmap_pte(folio, page, vma) \
	folio_try_dup_anon_rmap_ptes(folio, page, 1, vma)
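
/*
 * Example (illustrative sketch): in fork()'s copy path, failure means the
 * folio may be pinned, so the child must receive its own copy instead;
 * "copy_page_for_child" is a name invented for the example:
 *
 *	if (unlikely(folio_try_dup_anon_rmap_pte(folio, page, src_vma)))
 *		return copy_page_for_child(...);
 *	// mapping duplicated; the PTE must be (or stay) R/O in both
 */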

/**
 * folio_try_dup_anon_rmap_pmd - try duplicating a PMD mapping of a page range
 *				 of a folio
 * @folio:	The folio to duplicate the mapping of
 * @page:	The first page to duplicate the mapping of
 * @src_vma:	The vm area from which the mapping is duplicated
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and the
 * vma->vm_mm->write_protect_seq.
 *
 * Duplicating the mapping can only fail if the folio may be pinned; device
 * private folios cannot get pinned and consequently this function cannot fail
 * for them.
 *
 * If duplicating the mapping succeeds, the duplicated PMD has to be R/O in
 * the parent and the child. They must *not* be writable after this call
 * succeeded.
 *
 * Returns 0 if duplicating the mapping succeeded. Returns -EBUSY otherwise.
 */
static inline int folio_try_dup_anon_rmap_pmd(struct folio *folio,
		struct page *page, struct vm_area_struct *src_vma)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_dup_anon_rmap(folio, page, HPAGE_PMD_NR, src_vma,
					 RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

static __always_inline int __folio_try_share_anon_rmap(struct folio *folio,
		struct page *page, int nr_pages, enum rmap_level level)
{
	VM_WARN_ON_FOLIO(!folio_test_anon(folio), folio);
	VM_WARN_ON_FOLIO(!PageAnonExclusive(page), folio);
	__folio_rmap_sanity_checks(folio, page, nr_pages, level);

	/* device private folios cannot get pinned via GUP. */
	if (unlikely(folio_is_device_private(folio))) {
		ClearPageAnonExclusive(page);
		return 0;
	}

	/*
	 * We have to make sure that when we clear PageAnonExclusive, the
	 * page is not pinned and concurrent GUP-fast won't succeed in
	 * pinning it.
	 *
	 * Conceptually, PageAnonExclusive clearing consists of:
	 * (A1) Clear PTE
	 * (A2) Check if the page is pinned; back off if so.
	 * (A3) Clear PageAnonExclusive
	 * (A4) Restore PTE (optional, but certainly not writable)
	 *
	 * When clearing PageAnonExclusive, we cannot possibly map the page
	 * writable again, because anon pages that may be shared must never
	 * be writable. So in any case, if the PTE was writable it cannot
	 * be writable anymore afterwards and there would be a PTE change. Only
	 * if the PTE wasn't writable, there might not be a PTE change.
	 *
	 * Conceptually, GUP-fast pinning of an anon page consists of:
	 * (B1) Read the PTE
	 * (B2) FOLL_WRITE: check if the PTE is not writable; back off if so.
	 * (B3) Pin the mapped page
	 * (B4) Check if the PTE changed by re-reading it; back off if so.
	 * (B5) If the original PTE is not writable, check if
	 *	PageAnonExclusive is not set; back off if so.
	 *
	 * If the PTE was writable, we only have to make sure that GUP-fast
	 * observes a PTE change and properly backs off.
	 *
	 * If the PTE was not writable, we have to make sure that GUP-fast
	 * either detects a (temporary) PTE change or that PageAnonExclusive
	 * is cleared and properly backs off.
	 *
	 * Consequently, when clearing PageAnonExclusive(), we have to make
	 * sure that (A1), (A2)/(A3) and (A4) happen in the right memory
	 * order. In GUP-fast pinning code, we have to make sure that (B3),(B4)
	 * and (B5) happen in the right memory order.
	 *
	 * We assume that there might not be a memory barrier after
	 * clearing/invalidating the PTE (A1) and before restoring the PTE (A4),
	 * so we use explicit ones here.
	 */

	/* Paired with the memory barrier in try_grab_folio(). */
	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
		smp_mb();

	if (unlikely(folio_maybe_dma_pinned(folio)))
		return -EBUSY;
	ClearPageAnonExclusive(page);

	/*
	 * This is conceptually a smp_wmb() paired with the smp_rmb() in
	 * gup_must_unshare().
	 */
	if (IS_ENABLED(CONFIG_HAVE_FAST_GUP))
		smp_mb__after_atomic();
	return 0;
}

/**
 * folio_try_share_anon_rmap_pte - try marking an exclusive anonymous page
 *				   mapped by a PTE possibly shared to prepare
 *				   for KSM or temporary unmapping
 * @folio:	The folio to share a mapping of
 * @page:	The mapped exclusive page
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pte(), however, not used during
 * fork() to duplicate mappings, but instead to prepare for KSM or temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pte().
 *
 * Marking the mapped page shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function
 * cannot fail.
 *
 * Returns 0 if marking the mapped page possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pte(struct folio *folio,
		struct page *page)
{
	return __folio_try_share_anon_rmap(folio, page, 1, RMAP_LEVEL_PTE);
}
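
/*
 * Example (illustrative sketch): an unmap sequence matching steps
 * (A1)-(A4) documented above; pteval, pvmw, subpage and the abort label
 * are names local to the example:
 *
 *	pteval = ptep_clear_flush(vma, address, pvmw.pte);	// (A1)
 *	if (folio_try_share_anon_rmap_pte(folio, subpage)) {	// (A2)+(A3)
 *		set_pte_at(mm, address, pvmw.pte, pteval);	// restore (A4)
 *		goto abort;
 *	}
 *	// proceed to install a swap/migration entry instead of (A4)
 */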

/**
 * folio_try_share_anon_rmap_pmd - try marking an exclusive anonymous page
 *				   range mapped by a PMD possibly shared to
 *				   prepare for temporary unmapping
 * @folio:	The folio to share the mapping of
 * @page:	The first page to share the mapping of
 *
 * The page range of the folio is defined by [page, page + HPAGE_PMD_NR)
 *
 * The caller needs to hold the page table lock and has to have the page table
 * entries cleared/invalidated.
 *
 * This is similar to folio_try_dup_anon_rmap_pmd(), however, not used during
 * fork() to duplicate a mapping, but instead to prepare for temporarily
 * unmapping parts of a folio (swap, migration) via folio_remove_rmap_pmd().
 *
 * Marking the mapped pages shared can only fail if the folio may be pinned;
 * device private folios cannot get pinned and consequently this function
 * cannot fail.
 *
 * Returns 0 if marking the mapped pages possibly shared succeeded. Returns
 * -EBUSY otherwise.
 */
static inline int folio_try_share_anon_rmap_pmd(struct folio *folio,
		struct page *page)
{
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
	return __folio_try_share_anon_rmap(folio, page, HPAGE_PMD_NR,
					   RMAP_LEVEL_PMD);
#else
	WARN_ON_ONCE(true);
	return -EBUSY;
#endif
}

/*
 * Called from mm/vmscan.c to handle paging out
 */
int folio_referenced(struct folio *, int is_locked,
			struct mem_cgroup *memcg, unsigned long *vm_flags);

void try_to_migrate(struct folio *folio, enum ttu_flags flags);
void try_to_unmap(struct folio *, enum ttu_flags flags);

int make_device_exclusive_range(struct mm_struct *mm, unsigned long start,
				unsigned long end, struct page **pages,
				void *arg);

/* Avoid racy checks */
#define PVMW_SYNC		(1 << 0)
/* Look for migration entries rather than present PTEs */
#define PVMW_MIGRATION		(1 << 1)

struct page_vma_mapped_walk {
	unsigned long pfn;
	unsigned long nr_pages;
	pgoff_t pgoff;
	struct vm_area_struct *vma;
	unsigned long address;
	pmd_t *pmd;
	pte_t *pte;
	spinlock_t *ptl;
	unsigned int flags;
};

#define DEFINE_PAGE_VMA_WALK(name, _page, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = page_to_pfn(_page),				\
		.nr_pages = compound_nr(_page),				\
		.pgoff = page_to_pgoff(_page),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

#define DEFINE_FOLIO_VMA_WALK(name, _folio, _vma, _address, _flags)	\
	struct page_vma_mapped_walk name = {				\
		.pfn = folio_pfn(_folio),				\
		.nr_pages = folio_nr_pages(_folio),			\
		.pgoff = folio_pgoff(_folio),				\
		.vma = _vma,						\
		.address = _address,					\
		.flags = _flags,					\
	}

static inline void page_vma_mapped_walk_done(struct page_vma_mapped_walk *pvmw)
{
	/* HugeTLB pte is set to the relevant page table entry without pte_mapped. */
	if (pvmw->pte && !is_vm_hugetlb_page(pvmw->vma))
		pte_unmap(pvmw->pte);
	if (pvmw->ptl)
		spin_unlock(pvmw->ptl);
}

bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw);
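
/*
 * Example (illustrative sketch): the usual walk loop; each successful
 * iteration leaves one mapping of the folio located and its page table
 * lock held ("need_to_stop" is a placeholder condition):
 *
 *	DEFINE_FOLIO_VMA_WALK(pvmw, folio, vma, address, 0);
 *
 *	while (page_vma_mapped_walk(&pvmw)) {
 *		// pvmw.pte (or pvmw.pmd) is valid, pvmw.ptl is held
 *		if (need_to_stop) {
 *			page_vma_mapped_walk_done(&pvmw);
 *			break;
 *		}
 *	}
 */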

/*
 * Used by swapoff to help locate where page is expected in vma.
 */
unsigned long page_address_in_vma(struct page *, struct vm_area_struct *);

/*
 * Cleans the PTEs of shared mappings.
 * (and since clean PTEs should also be readonly, write protects them too)
 *
 * Returns the number of cleaned PTEs.
 */
int folio_mkclean(struct folio *);

int pfn_mkclean_range(unsigned long pfn, unsigned long nr_pages, pgoff_t pgoff,
		      struct vm_area_struct *vma);

void remove_migration_ptes(struct folio *src, struct folio *dst, bool locked);

int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma);

/*
 * rmap_walk_control: To control rmap traversing for specific needs
 *
 * arg: passed to rmap_one() and invalid_vma()
 * try_lock: bail out if the rmap lock is contended
 * contended: indicate the rmap traversal bailed out due to lock contention
 * rmap_one: executed on each vma where page is mapped
 * done: for checking traversing termination condition
 * anon_lock: for taking the anon_vma lock in an optimized way rather
 *	      than the default
 * invalid_vma: for skipping VMAs that are not of interest
 */
struct rmap_walk_control {
	void *arg;
	bool try_lock;
	bool contended;
	/*
	 * Return false if page table scanning in rmap_walk should be stopped.
	 * Otherwise, return true.
	 */
	bool (*rmap_one)(struct folio *folio, struct vm_area_struct *vma,
			 unsigned long addr, void *arg);
	int (*done)(struct folio *folio);
	struct anon_vma *(*anon_lock)(struct folio *folio,
				      struct rmap_walk_control *rwc);
	bool (*invalid_vma)(struct vm_area_struct *vma, void *arg);
};
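
/*
 * Example (illustrative sketch): a minimal walk visiting every VMA that
 * maps a folio; "my_rmap_one" and "my_state" are names invented for the
 * example:
 *
 *	static bool my_rmap_one(struct folio *folio,
 *			struct vm_area_struct *vma, unsigned long addr,
 *			void *arg)
 *	{
 *		...			// inspect or modify this mapping
 *		return true;		// keep walking
 *	}
 *
 *	struct rmap_walk_control rwc = {
 *		.rmap_one = my_rmap_one,
 *		.arg = &my_state,
 *	};
 *	rmap_walk(folio, &rwc);		// takes the rmap lock internally
 */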

void rmap_walk(struct folio *folio, struct rmap_walk_control *rwc);
void rmap_walk_locked(struct folio *folio, struct rmap_walk_control *rwc);
struct anon_vma *folio_lock_anon_vma_read(struct folio *folio,
					  struct rmap_walk_control *rwc);

#else	/* !CONFIG_MMU */

#define anon_vma_init()		do {} while (0)
#define anon_vma_prepare(vma)	(0)

static inline int folio_referenced(struct folio *folio, int is_locked,
				   struct mem_cgroup *memcg,
				   unsigned long *vm_flags)
{
	*vm_flags = 0;
	return 0;
}

static inline void try_to_unmap(struct folio *folio, enum ttu_flags flags)
{
}

static inline int folio_mkclean(struct folio *folio)
{
	return 0;
}
#endif	/* CONFIG_MMU */

static inline int page_mkclean(struct page *page)
{
	return folio_mkclean(page_folio(page));
}
#endif	/* _LINUX_RMAP_H */