// SPDX-License-Identifier: GPL-2.0
/*
 * mm/pgtable-generic.c
 *
 * Generic pgtable methods declared in linux/pgtable.h
 *
 * Copyright (C) 2010 Linus Torvalds
 */

#include <linux/pagemap.h>
#include <linux/hugetlb.h>
#include <linux/pgtable.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/mm_inline.h>
#include <asm/pgalloc.h>
#include <asm/tlb.h>

/*
 * If a p?d_bad entry is found while walking page tables, report
 * the error, before resetting entry to p?d_none. Usually (but
 * very seldom) called out from the p?d_none_or_clear_bad macros.
 */

void pgd_clear_bad(pgd_t *pgd)
{
	pgd_ERROR(*pgd);
	pgd_clear(pgd);
}

#ifndef __PAGETABLE_P4D_FOLDED
void p4d_clear_bad(p4d_t *p4d)
{
	p4d_ERROR(*p4d);
	p4d_clear(p4d);
}
#endif

#ifndef __PAGETABLE_PUD_FOLDED
void pud_clear_bad(pud_t *pud)
{
	pud_ERROR(*pud);
	pud_clear(pud);
}
#endif

/*
 * Note that the pmd variant below can't be stubbed out just as for p4d/pud
 * above: pmd folding is special, and the pmd_* macros typically refer to the
 * upper level even when folded.
 */
void pmd_clear_bad(pmd_t *pmd)
{
	pmd_ERROR(*pmd);
	pmd_clear(pmd);
}
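
/*
 * Example (illustrative sketch, not part of this file): a typical page
 * table walker relies on p?d_none_or_clear_bad() both to skip empty
 * entries and to sanitize bad ones, so a corrupted entry is reported
 * once and thereafter treated as empty. The walker name and body here
 * are hypothetical:
 *
 *	static void walk_pmd_range(pud_t *pud, unsigned long addr,
 *				   unsigned long end)
 *	{
 *		pmd_t *pmd = pmd_offset(pud, addr);
 *		unsigned long next;
 *
 *		do {
 *			next = pmd_addr_end(addr, end);
 *			if (pmd_none_or_clear_bad(pmd))
 *				continue;	(calls pmd_clear_bad() if bad)
 *			(... visit the page table at *pmd ...)
 *		} while (pmd++, addr = next, addr != end);
 *	}
 */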

#ifndef __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS
/*
 * Only sets the access flags (dirty, accessed), as well as write
 * permission. Furthermore, we know it always gets set to a "more
 * permissive" setting, which allows most architectures to optimize
 * this. We return whether the PTE actually changed, which in turn
 * instructs the caller to do things like update_mmu_cache. This
 * used to be done in the caller, but sparc needs minor faults to
 * force that call on sun4c, so we changed this macro slightly.
 */
int ptep_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pte_t *ptep,
			  pte_t entry, int dirty)
{
	int changed = !pte_same(ptep_get(ptep), entry);
	if (changed) {
		set_pte_at(vma->vm_mm, address, ptep, entry);
		flush_tlb_fix_spurious_fault(vma, address, ptep);
	}
	return changed;
}
#endif
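
/*
 * Example (hedged sketch; "entry" and "write_fault" are hypothetical
 * locals): a fault handler typically builds the more-permissive entry
 * and uses the return value to decide whether update_mmu_cache() is
 * needed:
 *
 *	entry = pte_mkyoung(entry);
 *	if (write_fault)
 *		entry = pte_mkdirty(entry);
 *	if (ptep_set_access_flags(vma, address, ptep, entry, write_fault))
 *		update_mmu_cache(vma, address, ptep);
 */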

#ifndef __HAVE_ARCH_PTEP_CLEAR_YOUNG_FLUSH
int ptep_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pte_t *ptep)
{
	int young;
	young = ptep_test_and_clear_young(vma, address, ptep);
	if (young)
		flush_tlb_page(vma, address);
	return young;
}
#endif
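
/*
 * Example (simplified sketch of an rmap-style aging loop; "referenced"
 * is a hypothetical counter): the flush variant is used when the
 * cleared young bit must not be hidden by a stale TLB entry:
 *
 *	if (ptep_clear_flush_young(vma, address, ptep))
 *		referenced++;
 */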

#ifndef __HAVE_ARCH_PTEP_CLEAR_FLUSH
pte_t ptep_clear_flush(struct vm_area_struct *vma, unsigned long address,
		       pte_t *ptep)
{
	struct mm_struct *mm = vma->vm_mm;
	pte_t pte;
	pte = ptep_get_and_clear(mm, address, ptep);
	if (pte_accessible(mm, pte))
		flush_tlb_page(vma, address);
	return pte;
}
#endif
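
/*
 * Example (hedged sketch; assumes a "folio" for the mapped page is in
 * scope): unmap paths use the returned pte so that dirty state set by
 * hardware is not lost once the mapping is gone:
 *
 *	pteval = ptep_clear_flush(vma, address, ptep);
 *	if (pte_dirty(pteval))
 *		folio_mark_dirty(folio);
 */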

#ifdef CONFIG_TRANSPARENT_HUGEPAGE

#ifndef __HAVE_ARCH_PMDP_SET_ACCESS_FLAGS
int pmdp_set_access_flags(struct vm_area_struct *vma,
			  unsigned long address, pmd_t *pmdp,
			  pmd_t entry, int dirty)
{
	int changed = !pmd_same(*pmdp, entry);
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	if (changed) {
		set_pmd_at(vma->vm_mm, address, pmdp, entry);
		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	}
	return changed;
}
#endif

#ifndef __HAVE_ARCH_PMDP_CLEAR_YOUNG_FLUSH
int pmdp_clear_flush_young(struct vm_area_struct *vma,
			   unsigned long address, pmd_t *pmdp)
{
	int young;
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	young = pmdp_test_and_clear_young(vma, address, pmdp);
	if (young)
		flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return young;
}
#endif

#ifndef __HAVE_ARCH_PMDP_HUGE_CLEAR_FLUSH
pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
			    pmd_t *pmdp)
{
	pmd_t pmd;
	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) &&
		  !pmd_devmap(*pmdp));
	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);
	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return pmd;
}

#ifdef CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD
pud_t pudp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address,
			    pud_t *pudp)
{
	pud_t pud;

	VM_BUG_ON(address & ~HPAGE_PUD_MASK);
	VM_BUG_ON(!pud_trans_huge(*pudp) && !pud_devmap(*pudp));
	pud = pudp_huge_get_and_clear(vma->vm_mm, address, pudp);
	flush_pud_tlb_range(vma, address, address + HPAGE_PUD_SIZE);
	return pud;
}
#endif
#endif
#ifndef __HAVE_ARCH_PGTABLE_DEPOSIT
void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp,
				pgtable_t pgtable)
{
	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	if (!pmd_huge_pte(mm, pmdp))
		INIT_LIST_HEAD(&pgtable->lru);
	else
		list_add(&pgtable->lru, &pmd_huge_pte(mm, pmdp)->lru);
	pmd_huge_pte(mm, pmdp) = pgtable;
}
#endif

#ifndef __HAVE_ARCH_PGTABLE_WITHDRAW
/* No "address" argument, so this destroys the page coloring of some archs */
pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp)
{
	pgtable_t pgtable;

	assert_spin_locked(pmd_lockptr(mm, pmdp));

	/* FIFO */
	pgtable = pmd_huge_pte(mm, pmdp);
	pmd_huge_pte(mm, pmdp) = list_first_entry_or_null(&pgtable->lru,
							  struct page, lru);
	if (pmd_huge_pte(mm, pmdp))
		list_del(&pgtable->lru);
	return pgtable;
}
#endif
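
/*
 * Example (hedged sketch; "haddr", "huge_pmd" and "pgtable" are
 * hypothetical): deposit and withdraw pair up across the lifetime of a
 * huge pmd, always under the pmd lock. A page table deposited when the
 * huge pmd is installed is withdrawn when it is split or zapped:
 *
 *	ptl = pmd_lock(mm, pmdp);
 *	pgtable_trans_huge_deposit(mm, pmdp, pgtable);
 *	set_pmd_at(mm, haddr, pmdp, huge_pmd);
 *	spin_unlock(ptl);
 *
 *	(... later, when splitting or zapping the huge pmd ...)
 *
 *	ptl = pmd_lock(mm, pmdp);
 *	pgtable = pgtable_trans_huge_withdraw(mm, pmdp);
 *	spin_unlock(ptl);
 */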

#ifndef __HAVE_ARCH_PMDP_INVALIDATE
pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address,
		      pmd_t *pmdp)
{
	pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mkinvalid(*pmdp));
	flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return old;
}
#endif

#ifndef __HAVE_ARCH_PMDP_INVALIDATE_AD
pmd_t pmdp_invalidate_ad(struct vm_area_struct *vma, unsigned long address,
			 pmd_t *pmdp)
{
	return pmdp_invalidate(vma, address, pmdp);
}
#endif
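
/*
 * Example (illustrative sketch of a split-style sequence; details are
 * hypothetical): pmdp_invalidate() lets the huge pmd be replaced by a
 * page table without a window in which hardware could set dirty/young
 * bits that then get lost:
 *
 *	old_pmd = pmdp_invalidate(vma, haddr, pmdp);
 *	if (pmd_dirty(old_pmd))
 *		(... transfer dirty state to the ptes or the folio ...)
 *	pmd_populate(mm, pmdp, pgtable);
 */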

#ifndef pmdp_collapse_flush
pmd_t pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address,
			  pmd_t *pmdp)
{
	/*
	 * The pmd and hugepage pte formats are the same, so we can
	 * use the same function.
	 */
	pmd_t pmd;

	VM_BUG_ON(address & ~HPAGE_PMD_MASK);
	VM_BUG_ON(pmd_trans_huge(*pmdp));
	pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp);

	/* Collapse entails shooting down ptes, not the pmd */
	flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE);
	return pmd;
}
#endif

/* An arch may define pte_free_defer in asm/pgalloc.h for its own implementation */
#ifndef pte_free_defer
static void pte_free_now(struct rcu_head *head)
{
	struct page *page;

	page = container_of(head, struct page, rcu_head);
	pte_free(NULL /* mm not passed and not used */, (pgtable_t)page);
}

void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
	struct page *page;

	page = pgtable;
	call_rcu(&page->rcu_head, pte_free_now);
}
#endif /* pte_free_defer */
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */

#if defined(CONFIG_GUP_GET_PXX_LOW_HIGH) && \
	(defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RCU))
/*
 * See the comment above ptep_get_lockless() in include/linux/pgtable.h:
 * the barriers in pmdp_get_lockless() cannot guarantee that the value in
 * pmd_high actually belongs with the value in pmd_low; but holding interrupts
 * off blocks the TLB flush between present updates, which guarantees that a
 * successful __pte_offset_map() points to a page from matched halves.
 */
static unsigned long pmdp_get_lockless_start(void)
{
	unsigned long irqflags;

	local_irq_save(irqflags);
	return irqflags;
}
static void pmdp_get_lockless_end(unsigned long irqflags)
{
	local_irq_restore(irqflags);
}
#else
static unsigned long pmdp_get_lockless_start(void) { return 0; }
static void pmdp_get_lockless_end(unsigned long irqflags) { }
#endif

pte_t *__pte_offset_map(pmd_t *pmd, unsigned long addr, pmd_t *pmdvalp)
{
	unsigned long irqflags;
	pmd_t pmdval;

	rcu_read_lock();
	irqflags = pmdp_get_lockless_start();
	pmdval = pmdp_get_lockless(pmd);
	pmdp_get_lockless_end(irqflags);

	if (pmdvalp)
		*pmdvalp = pmdval;
	if (unlikely(pmd_none(pmdval) || is_pmd_migration_entry(pmdval)))
		goto nomap;
	if (unlikely(pmd_trans_huge(pmdval) || pmd_devmap(pmdval)))
		goto nomap;
	if (unlikely(pmd_bad(pmdval))) {
		pmd_clear_bad(pmd);
		goto nomap;
	}
	return __pte_map(&pmdval, addr);
nomap:
	rcu_read_unlock();
	return NULL;
}

pte_t *pte_offset_map_nolock(struct mm_struct *mm, pmd_t *pmd,
			     unsigned long addr, spinlock_t **ptlp)
{
	pmd_t pmdval;
	pte_t *pte;

	pte = __pte_offset_map(pmd, addr, &pmdval);
	if (likely(pte))
		*ptlp = pte_lockptr(mm, &pmdval);
	return pte;
}

/*
 * pte_offset_map_lock(mm, pmd, addr, ptlp), and its internal implementation
 * __pte_offset_map_lock() below, is usually called with the pmd pointer for
 * addr, reached by walking down the mm's pgd, p4d, pud for addr: either while
 * holding mmap_lock or vma lock for read or for write; or in truncate or rmap
 * context, while holding file's i_mmap_lock or anon_vma lock for read (or for
 * write). In a few cases, it may be used with pmd pointing to a pmd_t already
 * copied to or constructed on the stack.
 *
 * When successful, it returns the pte pointer for addr, with its page table
 * kmapped if necessary (when CONFIG_HIGHPTE), and locked against concurrent
 * modification by software, with a pointer to that spinlock in ptlp (in some
 * configs mm->page_table_lock, in SPLIT_PTLOCK configs a spinlock in table's
 * struct page). pte_unmap_unlock(pte, ptl) to unlock and unmap afterwards.
 *
 * But it is unsuccessful, returning NULL with *ptlp unchanged, if there is no
 * page table at *pmd: if, for example, the page table has just been removed,
 * or replaced by the huge pmd of a THP. (When successful, *pmd is rechecked
 * after acquiring the ptlock, and retried internally if it changed: so that a
 * page table can be safely removed or replaced by THP while holding its lock.)
 *
 * pte_offset_map(pmd, addr), and its internal helper __pte_offset_map() above,
 * just returns the pte pointer for addr, its page table kmapped if necessary;
 * or NULL if there is no page table at *pmd. It does not attempt to lock the
 * page table, so cannot normally be used when the page table is to be updated,
 * or when entries read must be stable. But it does take rcu_read_lock(): so
 * that even when page table is racily removed, it remains a valid though empty
 * and disconnected table. Until pte_unmap(pte) unmaps and rcu_read_unlock()s
 * afterwards.
 *
 * pte_offset_map_nolock(mm, pmd, addr, ptlp), above, is like pte_offset_map();
 * but when successful, it also outputs a pointer to the spinlock in ptlp - as
 * pte_offset_map_lock() does, but in this case without locking it. This helps
 * the caller to avoid a later pte_lockptr(mm, *pmd), which might by that time
 * act on a changed *pmd: pte_offset_map_nolock() provides the correct spinlock
 * pointer for the page table that it returns. In principle, the caller should
 * recheck *pmd once the lock is taken; in practice, no callsite needs that -
 * either the mmap_lock for write, or pte_same() check on contents, is enough.
 *
 * Note that free_pgtables(), used after unmapping detached vmas, or when
 * exiting the whole mm, does not take page table lock before freeing a page
 * table, and may not use RCU at all: "outsiders" like khugepaged should avoid
 * pte_offset_map() and co once the vma is detached from mm or mm_users is zero.
 */
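
/*
 * Example (illustrative sketch; the loop bounds and per-pte work are
 * hypothetical): the canonical caller pattern, tolerating the page
 * table disappearing under a racing THP collapse or zap:
 *
 *	pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
 *	if (!pte)
 *		return 0;	(no page table here: not an error)
 *	for (; addr < end; pte++, addr += PAGE_SIZE) {
 *		pte_t ptent = ptep_get(pte);
 *		(... examine or modify ptent under ptl ...)
 *	}
 *	pte_unmap_unlock(pte - 1, ptl);
 */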
pte_t *__pte_offset_map_lock(struct mm_struct *mm, pmd_t *pmd,
			     unsigned long addr, spinlock_t **ptlp)
{
	spinlock_t *ptl;
	pmd_t pmdval;
	pte_t *pte;
again:
	pte = __pte_offset_map(pmd, addr, &pmdval);
	if (unlikely(!pte))
		return pte;
	ptl = pte_lockptr(mm, &pmdval);
	spin_lock(ptl);
	if (likely(pmd_same(pmdval, pmdp_get_lockless(pmd)))) {
		*ptlp = ptl;
		return pte;
	}
	pte_unmap_unlock(pte, ptl);
	goto again;
}