1 | // SPDX-License-Identifier: GPL-2.0-or-later |
2 | /* |
3 | * Copyright 2005, Paul Mackerras, IBM Corporation. |
4 | * Copyright 2009, Benjamin Herrenschmidt, IBM Corporation. |
5 | * Copyright 2015-2016, Aneesh Kumar K.V, IBM Corporation. |
6 | */ |
7 | |
8 | #include <linux/sched.h> |
9 | #include <linux/mm_types.h> |
10 | #include <linux/mm.h> |
11 | #include <linux/stop_machine.h> |
12 | |
13 | #include <asm/sections.h> |
14 | #include <asm/mmu.h> |
15 | #include <asm/tlb.h> |
16 | #include <asm/firmware.h> |
17 | |
18 | #include <mm/mmu_decl.h> |
19 | |
20 | #include <trace/events/thp.h> |
21 | |
22 | #if H_PGTABLE_RANGE > (USER_VSID_RANGE * (TASK_SIZE_USER64 / TASK_CONTEXT_SIZE)) |
23 | #warning Limited user VSID range means pagetable space is wasted |
24 | #endif |
25 | |
26 | #ifdef CONFIG_SPARSEMEM_VMEMMAP |
27 | /* |
28 | * vmemmap is the starting address of the virtual address space where |
29 | * struct pages are allocated for all possible PFNs present on the system |
30 | * including holes and bad memory (hence sparse). These virtual struct |
 * pages are stored in sequence in this virtual address space irrespective
 * of whether the corresponding PFN is valid. This achieves a constant
 * relationship between the address of a struct page and its PFN.
34 | * |
35 | * During boot or memory hotplug operation when a new memory section is |
36 | * added, physical memory allocation (including hash table bolting) will |
37 | * be performed for the set of struct pages which are part of the memory |
38 | * section. This saves memory by not allocating struct pages for PFNs |
39 | * which are not valid. |
40 | * |
41 | * ---------------------------------------------- |
42 | * | PHYSICAL ALLOCATION OF VIRTUAL STRUCT PAGES| |
43 | * ---------------------------------------------- |
44 | * |
45 | * f000000000000000 c000000000000000 |
46 | * vmemmap +--------------+ +--------------+ |
47 | * + | page struct | +--------------> | page struct | |
48 | * | +--------------+ +--------------+ |
49 | * | | page struct | +--------------> | page struct | |
50 | * | +--------------+ | +--------------+ |
51 | * | | page struct | + +------> | page struct | |
52 | * | +--------------+ | +--------------+ |
53 | * | | page struct | | +--> | page struct | |
54 | * | +--------------+ | | +--------------+ |
55 | * | | page struct | | | |
56 | * | +--------------+ | | |
57 | * | | page struct | | | |
58 | * | +--------------+ | | |
59 | * | | page struct | | | |
60 | * | +--------------+ | | |
61 | * | | page struct | | | |
62 | * | +--------------+ | | |
63 | * | | page struct | +-------+ | |
64 | * | +--------------+ | |
65 | * | | page struct | +-----------+ |
66 | * | +--------------+ |
67 | * | | page struct | No mapping |
68 | * | +--------------+ |
69 | * | | page struct | No mapping |
70 | * v +--------------+ |
71 | * |
72 | * ----------------------------------------- |
73 | * | RELATION BETWEEN STRUCT PAGES AND PFNS| |
74 | * ----------------------------------------- |
75 | * |
76 | * vmemmap +--------------+ +---------------+ |
77 | * + | page struct | +-------------> | PFN | |
78 | * | +--------------+ +---------------+ |
79 | * | | page struct | +-------------> | PFN | |
80 | * | +--------------+ +---------------+ |
81 | * | | page struct | +-------------> | PFN | |
82 | * | +--------------+ +---------------+ |
83 | * | | page struct | +-------------> | PFN | |
84 | * | +--------------+ +---------------+ |
85 | * | | | |
86 | * | +--------------+ |
87 | * | | | |
88 | * | +--------------+ |
89 | * | | | |
90 | * | +--------------+ +---------------+ |
91 | * | | page struct | +-------------> | PFN | |
92 | * | +--------------+ +---------------+ |
93 | * | | | |
94 | * | +--------------+ |
95 | * | | | |
96 | * | +--------------+ +---------------+ |
97 | * | | page struct | +-------------> | PFN | |
98 | * | +--------------+ +---------------+ |
99 | * | | page struct | +-------------> | PFN | |
100 | * v +--------------+ +---------------+ |
101 | */ |
102 | /* |
103 | * On hash-based CPUs, the vmemmap is bolted in the hash table. |
104 | * |
105 | */ |
106 | int __meminit hash__vmemmap_create_mapping(unsigned long start, |
107 | unsigned long page_size, |
108 | unsigned long phys) |
109 | { |
110 | int rc; |
111 | |
112 | if ((start + page_size) >= H_VMEMMAP_END) { |
113 | pr_warn("Outside the supported range\n" ); |
114 | return -1; |
115 | } |
116 | |
117 | rc = htab_bolt_mapping(start, start + page_size, phys, |
118 | pgprot_val(PAGE_KERNEL), |
119 | mmu_vmemmap_psize, mmu_kernel_ssize); |
120 | if (rc < 0) { |
121 | int rc2 = htab_remove_mapping(start, start + page_size, |
122 | mmu_vmemmap_psize, |
123 | mmu_kernel_ssize); |
124 | BUG_ON(rc2 && (rc2 != -ENOENT)); |
125 | } |
126 | return rc; |
127 | } |
128 | |
129 | #ifdef CONFIG_MEMORY_HOTPLUG |
130 | void hash__vmemmap_remove_mapping(unsigned long start, |
131 | unsigned long page_size) |
132 | { |
133 | int rc = htab_remove_mapping(start, start + page_size, |
134 | mmu_vmemmap_psize, |
135 | mmu_kernel_ssize); |
136 | BUG_ON((rc < 0) && (rc != -ENOENT)); |
137 | WARN_ON(rc == -ENOENT); |
138 | } |
139 | #endif |
140 | #endif /* CONFIG_SPARSEMEM_VMEMMAP */ |
141 | |
142 | /* |
143 | * map_kernel_page currently only called by __ioremap |
144 | * map_kernel_page adds an entry to the ioremap page table |
145 | * and adds an entry to the HPT, possibly bolting it |
146 | */ |
147 | int hash__map_kernel_page(unsigned long ea, unsigned long pa, pgprot_t prot) |
148 | { |
149 | pgd_t *pgdp; |
150 | p4d_t *p4dp; |
151 | pud_t *pudp; |
152 | pmd_t *pmdp; |
153 | pte_t *ptep; |
154 | |
155 | BUILD_BUG_ON(TASK_SIZE_USER64 > H_PGTABLE_RANGE); |
156 | if (slab_is_available()) { |
157 | pgdp = pgd_offset_k(ea); |
		p4dp = p4d_offset(pgdp, ea);
		pudp = pud_alloc(&init_mm, p4dp, ea);
		if (!pudp)
			return -ENOMEM;
		pmdp = pmd_alloc(&init_mm, pudp, ea);
163 | if (!pmdp) |
164 | return -ENOMEM; |
165 | ptep = pte_alloc_kernel(pmdp, ea); |
166 | if (!ptep) |
167 | return -ENOMEM; |
168 | set_pte_at(&init_mm, ea, ptep, pfn_pte(pa >> PAGE_SHIFT, prot)); |
169 | } else { |
170 | /* |
171 | * If the mm subsystem is not fully up, we cannot create a |
172 | * linux page table entry for this mapping. Simply bolt an |
173 | * entry in the hardware page table. |
174 | * |
175 | */ |
176 | if (htab_bolt_mapping(ea, ea + PAGE_SIZE, pa, pgprot_val(prot), |
177 | mmu_io_psize, mmu_kernel_ssize)) { |
178 | printk(KERN_ERR "Failed to do bolted mapping IO " |
179 | "memory at %016lx !\n" , pa); |
180 | return -ENOMEM; |
181 | } |
182 | } |
183 | |
184 | smp_wmb(); |
185 | return 0; |
186 | } |
187 | |
188 | #ifdef CONFIG_TRANSPARENT_HUGEPAGE |
189 | |
190 | unsigned long hash__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, |
191 | pmd_t *pmdp, unsigned long clr, |
192 | unsigned long set) |
193 | { |
194 | __be64 old_be, tmp; |
195 | unsigned long old; |
196 | |
197 | #ifdef CONFIG_DEBUG_VM |
198 | WARN_ON(!hash__pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); |
199 | assert_spin_locked(pmd_lockptr(mm, pmdp)); |
200 | #endif |
201 | |
202 | __asm__ __volatile__( |
203 | "1: ldarx %0,0,%3\n\ |
204 | and. %1,%0,%6\n\ |
205 | bne- 1b \n\ |
206 | andc %1,%0,%4 \n\ |
207 | or %1,%1,%7\n\ |
208 | stdcx. %1,0,%3 \n\ |
209 | bne- 1b" |
210 | : "=&r" (old_be), "=&r" (tmp), "=m" (*pmdp) |
211 | : "r" (pmdp), "r" (cpu_to_be64(clr)), "m" (*pmdp), |
212 | "r" (cpu_to_be64(H_PAGE_BUSY)), "r" (cpu_to_be64(set)) |
213 | : "cc" ); |
214 | |
215 | old = be64_to_cpu(old_be); |
216 | |
	trace_hugepage_update_pmd(addr, old, clr, set);
218 | if (old & H_PAGE_HASHPTE) |
219 | hpte_do_hugepage_flush(mm, addr, pmdp, old); |
220 | return old; |
221 | } |
222 | |
223 | pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, unsigned long address, |
224 | pmd_t *pmdp) |
225 | { |
226 | pmd_t pmd; |
227 | |
228 | VM_BUG_ON(address & ~HPAGE_PMD_MASK); |
229 | VM_BUG_ON(pmd_trans_huge(*pmdp)); |
230 | VM_BUG_ON(pmd_devmap(*pmdp)); |
231 | |
232 | pmd = *pmdp; |
233 | pmd_clear(pmdp); |
234 | /* |
235 | * Wait for all pending hash_page to finish. This is needed |
236 | * in case of subpage collapse. When we collapse normal pages |
237 | * to hugepage, we first clear the pmd, then invalidate all |
238 | * the PTE entries. The assumption here is that any low level |
239 | * page fault will see a none pmd and take the slow path that |
240 | * will wait on mmap_lock. But we could very well be in a |
241 | * hash_page with local ptep pointer value. Such a hash page |
242 | * can result in adding new HPTE entries for normal subpages. |
243 | * That means we could be modifying the page content as we |
244 | * copy them to a huge page. So wait for parallel hash_page |
245 | * to finish before invalidating HPTE entries. We can do this |
246 | * by sending an IPI to all the cpus and executing a dummy |
247 | * function there. |
248 | */ |
249 | serialize_against_pte_lookup(vma->vm_mm); |
250 | /* |
251 | * Now invalidate the hpte entries in the range |
252 | * covered by pmd. This make sure we take a |
253 | * fault and will find the pmd as none, which will |
254 | * result in a major fault which takes mmap_lock and |
255 | * hence wait for collapse to complete. Without this |
256 | * the __collapse_huge_page_copy can result in copying |
257 | * the old content. |
258 | */ |
259 | flush_hash_table_pmd_range(vma->vm_mm, &pmd, address); |
260 | return pmd; |
261 | } |
262 | |
263 | /* |
264 | * We want to put the pgtable in pmd and use pgtable for tracking |
265 | * the base page size hptes |
266 | */ |
267 | void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, |
268 | pgtable_t pgtable) |
269 | { |
270 | pgtable_t *pgtable_slot; |
271 | |
272 | assert_spin_locked(pmd_lockptr(mm, pmdp)); |
273 | /* |
274 | * we store the pgtable in the second half of PMD |
275 | */ |
276 | pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; |
277 | *pgtable_slot = pgtable; |
278 | /* |
279 | * expose the deposited pgtable to other cpus. |
280 | * before we set the hugepage PTE at pmd level |
281 | * hash fault code looks at the deposted pgtable |
282 | * to store hash index values. |
283 | */ |
284 | smp_wmb(); |
285 | } |
286 | |
287 | pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) |
288 | { |
289 | pgtable_t pgtable; |
290 | pgtable_t *pgtable_slot; |
291 | |
292 | assert_spin_locked(pmd_lockptr(mm, pmdp)); |
293 | |
294 | pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; |
295 | pgtable = *pgtable_slot; |
296 | /* |
297 | * Once we withdraw, mark the entry NULL. |
298 | */ |
299 | *pgtable_slot = NULL; |
300 | /* |
301 | * We store HPTE information in the deposited PTE fragment. |
302 | * zero out the content on withdraw. |
303 | */ |
304 | memset(pgtable, 0, PTE_FRAG_SIZE); |
305 | return pgtable; |
306 | } |
307 | |
308 | /* |
309 | * A linux hugepage PMD was changed and the corresponding hash table entries |
310 | * neesd to be flushed. |
311 | */ |
312 | void hpte_do_hugepage_flush(struct mm_struct *mm, unsigned long addr, |
313 | pmd_t *pmdp, unsigned long old_pmd) |
314 | { |
315 | int ssize; |
316 | unsigned int psize; |
317 | unsigned long vsid; |
318 | unsigned long flags = 0; |
319 | |
	/* Get the base page size, vsid and segment size */
321 | #ifdef CONFIG_DEBUG_VM |
322 | psize = get_slice_psize(mm, addr); |
323 | BUG_ON(psize == MMU_PAGE_16M); |
324 | #endif |
325 | if (old_pmd & H_PAGE_COMBO) |
326 | psize = MMU_PAGE_4K; |
327 | else |
328 | psize = MMU_PAGE_64K; |
329 | |
330 | if (!is_kernel_addr(addr)) { |
331 | ssize = user_segment_size(addr); |
332 | vsid = get_user_vsid(&mm->context, addr, ssize); |
333 | WARN_ON(vsid == 0); |
334 | } else { |
335 | vsid = get_kernel_vsid(addr, mmu_kernel_ssize); |
336 | ssize = mmu_kernel_ssize; |
337 | } |
338 | |
339 | if (mm_is_thread_local(mm)) |
340 | flags |= HPTE_LOCAL_UPDATE; |
341 | |
342 | return flush_hash_hugepage(vsid, addr, pmdp, psize, ssize, flags); |
343 | } |
344 | |
345 | pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, |
346 | unsigned long addr, pmd_t *pmdp) |
347 | { |
348 | pmd_t old_pmd; |
349 | pgtable_t pgtable; |
350 | unsigned long old; |
351 | pgtable_t *pgtable_slot; |
352 | |
353 | old = pmd_hugepage_update(mm, addr, pmdp, ~0UL, 0); |
	old_pmd = __pmd(old);
355 | /* |
356 | * We have pmd == none and we are holding page_table_lock. |
357 | * So we can safely go and clear the pgtable hash |
358 | * index info. |
359 | */ |
360 | pgtable_slot = (pgtable_t *)pmdp + PTRS_PER_PMD; |
361 | pgtable = *pgtable_slot; |
362 | /* |
363 | * Let's zero out old valid and hash index details |
364 | * hash fault look at them. |
365 | */ |
366 | memset(pgtable, 0, PTE_FRAG_SIZE); |
367 | return old_pmd; |
368 | } |
369 | |
370 | int hash__has_transparent_hugepage(void) |
371 | { |
372 | |
373 | if (!mmu_has_feature(MMU_FTR_16M_PAGE)) |
374 | return 0; |
375 | /* |
376 | * We support THP only if PMD_SIZE is 16MB. |
377 | */ |
378 | if (mmu_psize_defs[MMU_PAGE_16M].shift != PMD_SHIFT) |
379 | return 0; |
380 | /* |
381 | * We need to make sure that we support 16MB hugepage in a segment |
382 | * with base page size 64K or 4K. We only enable THP with a PAGE_SIZE |
383 | * of 64K. |
384 | */ |
385 | /* |
386 | * If we have 64K HPTE, we will be using that by default |
387 | */ |
388 | if (mmu_psize_defs[MMU_PAGE_64K].shift && |
389 | (mmu_psize_defs[MMU_PAGE_64K].penc[MMU_PAGE_16M] == -1)) |
390 | return 0; |
391 | /* |
392 | * Ok we only have 4K HPTE |
393 | */ |
394 | if (mmu_psize_defs[MMU_PAGE_4K].penc[MMU_PAGE_16M] == -1) |
395 | return 0; |
396 | |
397 | return 1; |
398 | } |
399 | EXPORT_SYMBOL_GPL(hash__has_transparent_hugepage); |
400 | |
401 | #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ |
402 | |
403 | #ifdef CONFIG_STRICT_KERNEL_RWX |
404 | |
405 | struct change_memory_parms { |
406 | unsigned long start, end, newpp; |
407 | unsigned int step, nr_cpus; |
408 | atomic_t master_cpu; |
409 | atomic_t cpu_counter; |
410 | }; |
411 | |
412 | // We'd rather this was on the stack but it has to be in the RMO |
413 | static struct change_memory_parms chmem_parms; |
414 | |
415 | // And therefore we need a lock to protect it from concurrent use |
416 | static DEFINE_MUTEX(chmem_lock); |
417 | |
418 | static void change_memory_range(unsigned long start, unsigned long end, |
419 | unsigned int step, unsigned long newpp) |
420 | { |
421 | unsigned long idx; |
422 | |
423 | pr_debug("Changing page protection on range 0x%lx-0x%lx, to 0x%lx, step 0x%x\n" , |
424 | start, end, newpp, step); |
425 | |
426 | for (idx = start; idx < end; idx += step) |
427 | /* Not sure if we can do much with the return value */ |
428 | mmu_hash_ops.hpte_updateboltedpp(newpp, idx, mmu_linear_psize, |
429 | mmu_kernel_ssize); |
430 | } |
431 | |
432 | static int notrace chmem_secondary_loop(struct change_memory_parms *parms) |
433 | { |
434 | unsigned long msr, tmp, flags; |
435 | int *p; |
436 | |
437 | p = &parms->cpu_counter.counter; |
438 | |
439 | local_irq_save(flags); |
440 | hard_irq_disable(); |
441 | |
442 | asm volatile ( |
443 | // Switch to real mode and leave interrupts off |
444 | "mfmsr %[msr] ;" |
445 | "li %[tmp], %[MSR_IR_DR] ;" |
446 | "andc %[tmp], %[msr], %[tmp] ;" |
447 | "mtmsrd %[tmp] ;" |
448 | |
449 | // Tell the master we are in real mode |
450 | "1: " |
451 | "lwarx %[tmp], 0, %[p] ;" |
452 | "addic %[tmp], %[tmp], -1 ;" |
453 | "stwcx. %[tmp], 0, %[p] ;" |
454 | "bne- 1b ;" |
455 | |
456 | // Spin until the counter goes to zero |
457 | "2: ;" |
458 | "lwz %[tmp], 0(%[p]) ;" |
459 | "cmpwi %[tmp], 0 ;" |
460 | "bne- 2b ;" |
461 | |
462 | // Switch back to virtual mode |
463 | "mtmsrd %[msr] ;" |
464 | |
465 | : // outputs |
466 | [msr] "=&r" (msr), [tmp] "=&b" (tmp), "+m" (*p) |
467 | : // inputs |
468 | [p] "b" (p), [MSR_IR_DR] "i" (MSR_IR | MSR_DR) |
469 | : // clobbers |
470 | "cc" , "xer" |
471 | ); |
472 | |
473 | local_irq_restore(flags); |
474 | |
475 | return 0; |
476 | } |
477 | |
478 | static int change_memory_range_fn(void *data) |
479 | { |
480 | struct change_memory_parms *parms = data; |
481 | |
482 | // First CPU goes through, all others wait. |
	if (atomic_xchg(&parms->master_cpu, 1) == 1)
484 | return chmem_secondary_loop(parms); |
485 | |
486 | // Wait for all but one CPU (this one) to call-in |
	while (atomic_read(&parms->cpu_counter) > 1)
488 | barrier(); |
489 | |
	change_memory_range(parms->start, parms->end, parms->step, parms->newpp);
491 | |
492 | mb(); |
493 | |
494 | // Signal the other CPUs that we're done |
	atomic_dec(&parms->cpu_counter);
496 | |
497 | return 0; |
498 | } |
499 | |
500 | static bool hash__change_memory_range(unsigned long start, unsigned long end, |
501 | unsigned long newpp) |
502 | { |
503 | unsigned int step, shift; |
504 | |
505 | shift = mmu_psize_defs[mmu_linear_psize].shift; |
506 | step = 1 << shift; |
507 | |
508 | start = ALIGN_DOWN(start, step); |
509 | end = ALIGN(end, step); // aligns up |
510 | |
511 | if (start >= end) |
512 | return false; |
513 | |
514 | if (firmware_has_feature(FW_FEATURE_LPAR)) { |
515 | mutex_lock(&chmem_lock); |
516 | |
517 | chmem_parms.start = start; |
518 | chmem_parms.end = end; |
519 | chmem_parms.step = step; |
520 | chmem_parms.newpp = newpp; |
521 | atomic_set(v: &chmem_parms.master_cpu, i: 0); |
522 | |
523 | cpus_read_lock(); |
524 | |
525 | atomic_set(v: &chmem_parms.cpu_counter, i: num_online_cpus()); |
526 | |
527 | // Ensure state is consistent before we call the other CPUs |
528 | mb(); |
529 | |
		stop_machine_cpuslocked(change_memory_range_fn, &chmem_parms,
					cpu_online_mask);
532 | |
533 | cpus_read_unlock(); |
		mutex_unlock(&chmem_lock);
535 | } else |
536 | change_memory_range(start, end, step, newpp); |
537 | |
538 | return true; |
539 | } |
540 | |
541 | void hash__mark_rodata_ro(void) |
542 | { |
543 | unsigned long start, end, pp; |
544 | |
545 | start = (unsigned long)_stext; |
546 | end = (unsigned long)__end_rodata; |
547 | |
548 | pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL_ROX), HPTE_USE_KERNEL_KEY); |
549 | |
550 | WARN_ON(!hash__change_memory_range(start, end, pp)); |
551 | } |
552 | |
553 | void hash__mark_initmem_nx(void) |
554 | { |
555 | unsigned long start, end, pp; |
556 | |
557 | start = (unsigned long)__init_begin; |
558 | end = (unsigned long)__init_end; |
559 | |
560 | pp = htab_convert_pte_flags(pgprot_val(PAGE_KERNEL), HPTE_USE_KERNEL_KEY); |
561 | |
562 | WARN_ON(!hash__change_memory_range(start, end, pp)); |
563 | } |
564 | #endif |
565 | |