/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_MMU_NOTIFIER_H
#define _LINUX_MMU_NOTIFIER_H

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/mm_types.h>
#include <linux/srcu.h>

struct mmu_notifier;
struct mmu_notifier_ops;

#ifdef CONFIG_MMU_NOTIFIER

/*
 * The mmu_notifier_mm structure is allocated and installed in
 * mm->mmu_notifier_mm inside the mm_take_all_locks() protected
 * critical section and it's released only when mm_count reaches zero
 * in mmdrop().
 */
struct mmu_notifier_mm {
	/* all mmu notifiers registered in this mm are queued in this list */
	struct hlist_head list;
	/* to serialize the list modifications and hlist_unhashed */
	spinlock_t lock;
};

struct mmu_notifier_range {
	struct mm_struct *mm;
	unsigned long start;
	unsigned long end;
	bool blockable;
};

struct mmu_notifier_ops {
	/*
	 * Called either by mmu_notifier_unregister or when the mm is
	 * being destroyed by exit_mmap, always before all pages are
	 * freed. This can run concurrently with other mmu notifier
	 * methods (the ones invoked outside the mm context) and it
	 * should tear down all secondary mmu mappings and freeze the
	 * secondary mmu. If this method isn't implemented you have to
	 * be sure that nothing could possibly write to the pages
	 * through the secondary mmu by the time the last thread with
	 * tsk->mm == mm exits.
	 *
	 * As a side note: the pages freed after ->release returns could
	 * be immediately reallocated by the gart at an alias physical
	 * address with a different cache model, so if ->release isn't
	 * implemented because all _software_ driven memory accesses
	 * through the secondary mmu are terminated by the time the
	 * last thread of this mm quits, you also have to be sure that
	 * speculative _hardware_ operations can't allocate dirty
	 * cachelines in the cpu that could not be snooped and made
	 * coherent with the other read and write operations happening
	 * through the gart alias address, so leading to memory
	 * corruption.
	 */
	void (*release)(struct mmu_notifier *mn,
			struct mm_struct *mm);

	/*
	 * clear_flush_young is called after the VM test-and-clears
	 * the young/accessed bitflag in the pte. This way the VM will
	 * provide proper aging for accesses to the page through the
	 * secondary MMUs and not only for those through the Linux pte.
	 * Start-end is necessary in case the secondary MMU is mapping the page
	 * at a smaller granularity than the primary MMU.
	 */
	int (*clear_flush_young)(struct mmu_notifier *mn,
				 struct mm_struct *mm,
				 unsigned long start,
				 unsigned long end);

	/*
	 * clear_young is a lightweight version of clear_flush_young. Like the
	 * latter, it is supposed to test-and-clear the young/accessed bitflag
	 * in the secondary pte, but it may omit flushing the secondary tlb.
	 */
	int (*clear_young)(struct mmu_notifier *mn,
			   struct mm_struct *mm,
			   unsigned long start,
			   unsigned long end);

	/*
	 * test_young is called to check the young/accessed bitflag in
	 * the secondary pte. This is used to know if the page is
	 * frequently used without actually clearing the flag or tearing
	 * down the secondary mapping on the page.
	 */
	int (*test_young)(struct mmu_notifier *mn,
			  struct mm_struct *mm,
			  unsigned long address);

	/*
	 * change_pte is called in cases where the pte mapping a page is
	 * changed: for example, when ksm remaps the pte to point to a new
	 * shared page.
	 */
	void (*change_pte)(struct mmu_notifier *mn,
			   struct mm_struct *mm,
			   unsigned long address,
			   pte_t pte);

	/*
	 * invalidate_range_start() and invalidate_range_end() must be
	 * paired and are called only when the mmap_sem and/or the
	 * locks protecting the reverse maps are held. If the subsystem
	 * can't guarantee that no additional references are taken to
	 * the pages in the range, it has to implement the
	 * invalidate_range() notifier to remove any references taken
	 * after invalidate_range_start().
	 *
	 * Invalidation of multiple concurrent ranges may optionally be
	 * permitted by the driver. Either way the establishment of
	 * sptes is forbidden in the range passed to
	 * invalidate_range_start/end for the whole duration of the
	 * invalidate_range_start/end critical section.
	 *
	 * invalidate_range_start() is called when all pages in the
	 * range are still mapped and have at least a refcount of one.
	 *
	 * invalidate_range_end() is called when all pages in the
	 * range have been unmapped and the pages have been freed by
	 * the VM.
	 *
	 * The VM will remove the page table entries and potentially
	 * the page between invalidate_range_start() and
	 * invalidate_range_end(). If the page must not be freed
	 * because of pending I/O or other circumstances then the
	 * invalidate_range_start() callback (or the initial mapping
	 * by the driver) must make sure that the refcount is kept
	 * elevated.
	 *
	 * If the driver increases the refcount when the pages are
	 * initially mapped into an address space then either
	 * invalidate_range_start() or invalidate_range_end() may
	 * decrease the refcount. If the refcount is decreased on
	 * invalidate_range_start() then the VM can free pages as page
	 * table entries are removed. If the refcount is only
	 * dropped on invalidate_range_end() then the driver itself
	 * will drop the last refcount but it must take care to flush
	 * any secondary tlb before doing the final free on the
	 * page. Pages will no longer be referenced by the linux
	 * address space but may still be referenced by sptes until
	 * the last refcount is dropped.
	 *
	 * If the blockable argument is set to false then the callback
	 * cannot sleep; it has to return -EAGAIN if it would need to
	 * block, and 0 otherwise. Please note that if
	 * invalidate_range_start() agrees to the non-blocking behavior
	 * then the same applies to invalidate_range_end().
	 */
	int (*invalidate_range_start)(struct mmu_notifier *mn,
				      const struct mmu_notifier_range *range);
	void (*invalidate_range_end)(struct mmu_notifier *mn,
				     const struct mmu_notifier_range *range);

	/*
	 * invalidate_range() is either called between
	 * invalidate_range_start() and invalidate_range_end() when the
	 * VM has to free pages that were unmapped, but before the
	 * pages are actually freed, or outside of _start()/_end() when
	 * a (remote) TLB flush is necessary.
	 *
	 * If invalidate_range() is used to manage a non-CPU TLB with
	 * shared page-tables, it is not necessary to implement the
	 * invalidate_range_start()/end() notifiers, as
	 * invalidate_range() already catches the points in time when an
	 * external TLB range needs to be flushed. For more in depth
	 * discussion on this see Documentation/vm/mmu_notifier.rst
	 *
	 * Note that this function might be called with just a sub-range
	 * of what was passed to invalidate_range_start()/end(), if
	 * called between those functions.
	 */
	void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm,
				 unsigned long start, unsigned long end);
};
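
/*
 * Illustrative sketch only (not part of the kernel API): a hypothetical
 * driver that mirrors CPU page tables into a device could wire up the
 * callbacks above roughly as follows.  "struct my_mirror" and
 * my_mirror_zap() are made-up names for this example; a real driver would
 * also handle ->release and its own device-side synchronization.
 *
 *	struct my_mirror {
 *		struct mmu_notifier mn;
 *		struct mutex lock;
 *	};
 *
 *	static int my_invalidate_range_start(struct mmu_notifier *mn,
 *				const struct mmu_notifier_range *range)
 *	{
 *		struct my_mirror *m = container_of(mn, struct my_mirror, mn);
 *
 *		if (range->blockable)
 *			mutex_lock(&m->lock);
 *		else if (!mutex_trylock(&m->lock))
 *			return -EAGAIN;
 *
 *		my_mirror_zap(m, range->start, range->end);
 *		return 0;
 *	}
 *
 *	static void my_invalidate_range_end(struct mmu_notifier *mn,
 *				const struct mmu_notifier_range *range)
 *	{
 *		struct my_mirror *m = container_of(mn, struct my_mirror, mn);
 *
 *		mutex_unlock(&m->lock);
 *	}
 *
 *	static const struct mmu_notifier_ops my_mirror_ops = {
 *		.invalidate_range_start	= my_invalidate_range_start,
 *		.invalidate_range_end	= my_invalidate_range_end,
 *	};
 *
 * Holding the mirror's mutex across start/end keeps the device from
 * establishing new sptes in the range for the whole critical section, as
 * required by the documentation above.
 */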

/*
 * The notifier chains are protected by mmap_sem and/or the reverse map
 * semaphores. Notifier chains are only changed when all reverse maps and
 * the mmap_sem locks are taken.
 *
 * Therefore notifier chains can only be traversed when either
 *
 * 1. mmap_sem is held.
 * 2. One of the reverse map locks is held (i_mmap_rwsem or anon_vma->rwsem).
 * 3. No other concurrent thread can access the list (release)
 */
struct mmu_notifier {
	struct hlist_node hlist;
	const struct mmu_notifier_ops *ops;
};

static inline int mm_has_notifiers(struct mm_struct *mm)
{
	return unlikely(mm->mmu_notifier_mm);
}

extern int mmu_notifier_register(struct mmu_notifier *mn,
				 struct mm_struct *mm);
extern int __mmu_notifier_register(struct mmu_notifier *mn,
				   struct mm_struct *mm);
extern void mmu_notifier_unregister(struct mmu_notifier *mn,
				    struct mm_struct *mm);
extern void mmu_notifier_unregister_no_release(struct mmu_notifier *mn,
					       struct mm_struct *mm);
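
/*
 * Registration sketch (illustrative; my_mirror_attach() and my_mirror_ops
 * are made-up names).  A driver typically registers from a context that
 * can take mmap_sem, against the mm it wants to track, for example
 * current->mm in an ioctl handler:
 *
 *	static int my_mirror_attach(struct my_mirror *m)
 *	{
 *		m->mn.ops = &my_mirror_ops;
 *		return mmu_notifier_register(&m->mn, current->mm);
 *	}
 *
 * Teardown is the mirror image: mmu_notifier_unregister() when ->release
 * style semantics are still needed, or
 * mmu_notifier_unregister_no_release() when the caller has already torn
 * down all secondary mappings itself.
 */
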
extern void __mmu_notifier_mm_destroy(struct mm_struct *mm);
extern void __mmu_notifier_release(struct mm_struct *mm);
extern int __mmu_notifier_clear_flush_young(struct mm_struct *mm,
					    unsigned long start,
					    unsigned long end);
extern int __mmu_notifier_clear_young(struct mm_struct *mm,
				      unsigned long start,
				      unsigned long end);
extern int __mmu_notifier_test_young(struct mm_struct *mm,
				     unsigned long address);
extern void __mmu_notifier_change_pte(struct mm_struct *mm,
				      unsigned long address, pte_t pte);
extern int __mmu_notifier_invalidate_range_start(struct mmu_notifier_range *r);
extern void __mmu_notifier_invalidate_range_end(struct mmu_notifier_range *r,
						bool only_end);
extern void __mmu_notifier_invalidate_range(struct mm_struct *mm,
					    unsigned long start, unsigned long end);

static inline void mmu_notifier_release(struct mm_struct *mm)
{
	if (mm_has_notifiers(mm))
		__mmu_notifier_release(mm);
}

static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
						 unsigned long start,
						 unsigned long end)
{
	if (mm_has_notifiers(mm))
		return __mmu_notifier_clear_flush_young(mm, start, end);
	return 0;
}

static inline int mmu_notifier_clear_young(struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	if (mm_has_notifiers(mm))
		return __mmu_notifier_clear_young(mm, start, end);
	return 0;
}

static inline int mmu_notifier_test_young(struct mm_struct *mm,
					  unsigned long address)
{
	if (mm_has_notifiers(mm))
		return __mmu_notifier_test_young(mm, address);
	return 0;
}

static inline void mmu_notifier_change_pte(struct mm_struct *mm,
					   unsigned long address, pte_t pte)
{
	if (mm_has_notifiers(mm))
		__mmu_notifier_change_pte(mm, address, pte);
}

static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
	if (mm_has_notifiers(range->mm)) {
		range->blockable = true;
		__mmu_notifier_invalidate_range_start(range);
	}
}

static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
	if (mm_has_notifiers(range->mm)) {
		range->blockable = false;
		return __mmu_notifier_invalidate_range_start(range);
	}
	return 0;
}
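
/*
 * Sketch of a non-blocking caller (illustrative; error handling and the
 * actual page table teardown are elided).  A path that must not sleep,
 * such as an oom-reaper style unmap, initializes the range with
 * mmu_notifier_range_init() (below) and bails out if any notifier would
 * have to block:
 *
 *	struct mmu_notifier_range range;
 *
 *	mmu_notifier_range_init(&range, mm, start, end);
 *	if (mmu_notifier_invalidate_range_start_nonblock(&range))
 *		return false;	(a notifier needed to block, give up)
 *	... zap page table entries ...
 *	mmu_notifier_invalidate_range_end(&range);
 */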

static inline void
mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
	if (mm_has_notifiers(range->mm))
		__mmu_notifier_invalidate_range_end(range, false);
}

static inline void
mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
{
	if (mm_has_notifiers(range->mm))
		__mmu_notifier_invalidate_range_end(range, true);
}

static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
						 unsigned long start, unsigned long end)
{
	if (mm_has_notifiers(mm))
		__mmu_notifier_invalidate_range(mm, start, end);
}

static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
	mm->mmu_notifier_mm = NULL;
}

static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
{
	if (mm_has_notifiers(mm))
		__mmu_notifier_mm_destroy(mm);
}

static inline void mmu_notifier_range_init(struct mmu_notifier_range *range,
					   struct mm_struct *mm,
					   unsigned long start,
					   unsigned long end)
{
	range->mm = mm;
	range->start = start;
	range->end = end;
}
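
/*
 * Typical caller sequence (illustrative; unmap_ptes() is a made-up
 * stand-in for the actual page table manipulation, done under the usual
 * locks):
 *
 *	struct mmu_notifier_range range;
 *
 *	mmu_notifier_range_init(&range, mm, start, end);
 *	mmu_notifier_invalidate_range_start(&range);
 *	unmap_ptes(mm, start, end);
 *	mmu_notifier_invalidate_range_end(&range);
 *
 * No secondary MMU may establish new sptes for [start, end) between the
 * start and end calls; see the mmu_notifier_ops documentation above.
 */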

#define ptep_clear_flush_young_notify(__vma, __address, __ptep)	\
({									\
	int __young;							\
	struct vm_area_struct *___vma = __vma;				\
	unsigned long ___address = __address;				\
	__young = ptep_clear_flush_young(___vma, ___address, __ptep);	\
	__young |= mmu_notifier_clear_flush_young(___vma->vm_mm,	\
						  ___address,		\
						  ___address +		\
							PAGE_SIZE);	\
	__young;							\
})

#define pmdp_clear_flush_young_notify(__vma, __address, __pmdp)	\
({									\
	int __young;							\
	struct vm_area_struct *___vma = __vma;				\
	unsigned long ___address = __address;				\
	__young = pmdp_clear_flush_young(___vma, ___address, __pmdp);	\
	__young |= mmu_notifier_clear_flush_young(___vma->vm_mm,	\
						  ___address,		\
						  ___address +		\
							PMD_SIZE);	\
	__young;							\
})

#define ptep_clear_young_notify(__vma, __address, __ptep)		\
({									\
	int __young;							\
	struct vm_area_struct *___vma = __vma;				\
	unsigned long ___address = __address;				\
	__young = ptep_test_and_clear_young(___vma, ___address, __ptep);\
	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,	\
					    ___address + PAGE_SIZE);	\
	__young;							\
})

#define pmdp_clear_young_notify(__vma, __address, __pmdp)		\
({									\
	int __young;							\
	struct vm_area_struct *___vma = __vma;				\
	unsigned long ___address = __address;				\
	__young = pmdp_test_and_clear_young(___vma, ___address, __pmdp);\
	__young |= mmu_notifier_clear_young(___vma->vm_mm, ___address,	\
					    ___address + PMD_SIZE);	\
	__young;							\
})
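
/*
 * The *_notify variants above are meant as drop-in replacements for the
 * corresponding pte/pmd helpers.  For example, a page-aging check (with
 * the pte lock held, names hypothetical) would simply use:
 *
 *	if (ptep_clear_flush_young_notify(vma, address, ptep))
 *		referenced = true;
 *
 * so that accessed bits set by secondary MMUs are folded into the result.
 */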

#define ptep_clear_flush_notify(__vma, __address, __ptep)		\
({									\
	unsigned long ___addr = __address & PAGE_MASK;			\
	struct mm_struct *___mm = (__vma)->vm_mm;			\
	pte_t ___pte;							\
									\
	___pte = ptep_clear_flush(__vma, __address, __ptep);		\
	mmu_notifier_invalidate_range(___mm, ___addr,			\
				      ___addr + PAGE_SIZE);		\
									\
	___pte;								\
})

#define pmdp_huge_clear_flush_notify(__vma, __haddr, __pmd)		\
({									\
	unsigned long ___haddr = __haddr & HPAGE_PMD_MASK;		\
	struct mm_struct *___mm = (__vma)->vm_mm;			\
	pmd_t ___pmd;							\
									\
	___pmd = pmdp_huge_clear_flush(__vma, __haddr, __pmd);		\
	mmu_notifier_invalidate_range(___mm, ___haddr,			\
				      ___haddr + HPAGE_PMD_SIZE);	\
									\
	___pmd;								\
})

#define pudp_huge_clear_flush_notify(__vma, __haddr, __pud)		\
({									\
	unsigned long ___haddr = __haddr & HPAGE_PUD_MASK;		\
	struct mm_struct *___mm = (__vma)->vm_mm;			\
	pud_t ___pud;							\
									\
	___pud = pudp_huge_clear_flush(__vma, __haddr, __pud);		\
	mmu_notifier_invalidate_range(___mm, ___haddr,			\
				      ___haddr + HPAGE_PUD_SIZE);	\
									\
	___pud;								\
})

/*
 * set_pte_at_notify() sets the pte _after_ running the notifier.
 * This is safe to start by updating the secondary MMUs, because the primary MMU
 * pte invalidate must have already happened with a ptep_clear_flush() before
 * set_pte_at_notify() has been invoked. Updating the secondary MMUs first is
 * required when we change both the protection of the mapping from read-only to
 * read-write and the pfn (like during copy on write page faults). Otherwise the
 * old page would remain mapped readonly in the secondary MMUs after the new
 * page is already writable by some CPU through the primary MMU.
 */
#define set_pte_at_notify(__mm, __address, __ptep, __pte)		\
({									\
	struct mm_struct *___mm = __mm;					\
	unsigned long ___address = __address;				\
	pte_t ___pte = __pte;						\
									\
	mmu_notifier_change_pte(___mm, ___address, ___pte);		\
	set_pte_at(___mm, ___address, __ptep, ___pte);			\
})
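
/*
 * Illustrative write-protect-to-writable replacement (made-up variable
 * names; the surrounding invalidate_range_start/end calls, pte locking and
 * the rest of a real copy-on-write path are omitted):
 *
 *	entry = mk_pte(new_page, vma->vm_page_prot);
 *	entry = pte_mkwrite(pte_mkdirty(entry));
 *	ptep_clear_flush_notify(vma, address, ptep);
 *	set_pte_at_notify(mm, address, ptep, entry);
 *
 * The old pte is cleared and flushed first; set_pte_at_notify() then
 * points the secondary MMUs at the new page via ->change_pte() before
 * installing the primary pte, matching the ordering described above.
 */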

extern void mmu_notifier_call_srcu(struct rcu_head *rcu,
				   void (*func)(struct rcu_head *rcu));

#else /* CONFIG_MMU_NOTIFIER */

struct mmu_notifier_range {
	unsigned long start;
	unsigned long end;
};

static inline void _mmu_notifier_range_init(struct mmu_notifier_range *range,
					    unsigned long start,
					    unsigned long end)
{
	range->start = start;
	range->end = end;
}

#define mmu_notifier_range_init(range, mm, start, end)	\
	_mmu_notifier_range_init(range, start, end)

static inline int mm_has_notifiers(struct mm_struct *mm)
{
	return 0;
}

static inline void mmu_notifier_release(struct mm_struct *mm)
{
}

static inline int mmu_notifier_clear_flush_young(struct mm_struct *mm,
						 unsigned long start,
						 unsigned long end)
{
	return 0;
}

static inline int mmu_notifier_test_young(struct mm_struct *mm,
					  unsigned long address)
{
	return 0;
}

static inline void mmu_notifier_change_pte(struct mm_struct *mm,
					   unsigned long address, pte_t pte)
{
}

static inline void
mmu_notifier_invalidate_range_start(struct mmu_notifier_range *range)
{
}

static inline int
mmu_notifier_invalidate_range_start_nonblock(struct mmu_notifier_range *range)
{
	return 0;
}

static inline
void mmu_notifier_invalidate_range_end(struct mmu_notifier_range *range)
{
}

static inline void
mmu_notifier_invalidate_range_only_end(struct mmu_notifier_range *range)
{
}

static inline void mmu_notifier_invalidate_range(struct mm_struct *mm,
						 unsigned long start, unsigned long end)
{
}

static inline void mmu_notifier_mm_init(struct mm_struct *mm)
{
}

static inline void mmu_notifier_mm_destroy(struct mm_struct *mm)
{
}

#define ptep_clear_flush_young_notify ptep_clear_flush_young
#define pmdp_clear_flush_young_notify pmdp_clear_flush_young
#define ptep_clear_young_notify ptep_test_and_clear_young
#define pmdp_clear_young_notify pmdp_test_and_clear_young
#define ptep_clear_flush_notify ptep_clear_flush
#define pmdp_huge_clear_flush_notify pmdp_huge_clear_flush
#define pudp_huge_clear_flush_notify pudp_huge_clear_flush
#define set_pte_at_notify set_pte_at

#endif /* CONFIG_MMU_NOTIFIER */

#endif /* _LINUX_MMU_NOTIFIER_H */