// SPDX-License-Identifier: GPL-2.0-only
/*
 * CPU-agnostic AMD IO page table allocator.
 *
 * Copyright (C) 2020 Advanced Micro Devices, Inc.
 * Author: Suravee Suthikulpanit <suravee.suthikulpanit@amd.com>
 */

#define pr_fmt(fmt)	"AMD-Vi: " fmt
#define dev_fmt(fmt)	pr_fmt(fmt)

#include <linux/atomic.h>
#include <linux/bitops.h>
#include <linux/io-pgtable.h>
#include <linux/kernel.h>
#include <linux/sizes.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/dma-mapping.h>

#include <asm/barrier.h>

#include "amd_iommu_types.h"
#include "amd_iommu.h"

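/*
 * Note: the io-pgtable core expects a set of TLB maintenance callbacks, but
 * the AMD driver performs its own invalidation through the domain flush
 * helpers (see the flush in iommu_v1_map_pages() below), so the v1 hooks
 * that follow are deliberately empty.
 */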
static void v1_tlb_flush_all(void *cookie)
{
}

static void v1_tlb_flush_walk(unsigned long iova, size_t size,
			      size_t granule, void *cookie)
{
}

static void v1_tlb_add_page(struct iommu_iotlb_gather *gather,
			    unsigned long iova, size_t granule,
			    void *cookie)
{
}

static const struct iommu_flush_ops v1_flush_ops = {
	.tlb_flush_all	= v1_tlb_flush_all,
	.tlb_flush_walk	= v1_tlb_flush_walk,
	.tlb_add_page	= v1_tlb_add_page,
};

/*
 * Helper function to get the first pte of a large mapping
 */
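/*
 * The replicated PTEs of a large mapping are laid out contiguously, so
 * masking the PTE address down to a (count * 8)-byte boundary yields the
 * first entry of the series. For example, a 32 KiB mapping is backed by
 * 8 replicated entries, so the pointer is aligned down to a 64-byte boundary.
 */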
static u64 *first_pte_l7(u64 *pte, unsigned long *page_size,
			 unsigned long *count)
{
	unsigned long pte_mask, pg_size, cnt;
	u64 *fpte;

	pg_size = PTE_PAGE_SIZE(*pte);
	cnt = PAGE_SIZE_PTE_COUNT(pg_size);
	pte_mask = ~((cnt << 3) - 1);
	fpte = (u64 *)(((unsigned long)pte) & pte_mask);

	if (page_size)
		*page_size = pg_size;

	if (count)
		*count = cnt;

	return fpte;
}

/****************************************************************************
 *
 * The functions below are used to create the page table mappings for
 * unity mapped regions.
 *
 ****************************************************************************/

static void free_pt_page(u64 *pt, struct list_head *freelist)
{
	struct page *p = virt_to_page(pt);

	list_add_tail(&p->lru, freelist);
}

static void free_pt_lvl(u64 *pt, struct list_head *freelist, int lvl)
{
	u64 *p;
	int i;

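	/* Each page-table page holds 512 8-byte entries (9 index bits per level). */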
	for (i = 0; i < 512; ++i) {
		/* PTE present? */
		if (!IOMMU_PTE_PRESENT(pt[i]))
			continue;

		/* Leaf or large PTE? Nothing to descend into. */
		if (PM_PTE_LEVEL(pt[i]) == 0 ||
		    PM_PTE_LEVEL(pt[i]) == 7)
			continue;

		/*
		 * Free the next level. No need to look at l1 tables here since
		 * they can only contain leaf PTEs; just free them directly.
		 */
		p = IOMMU_PTE_PAGE(pt[i]);
		if (lvl > 2)
			free_pt_lvl(p, freelist, lvl - 1);
		else
			free_pt_page(p, freelist);
	}

	free_pt_page(pt, freelist);
}

static void free_sub_pt(u64 *root, int mode, struct list_head *freelist)
{
	switch (mode) {
	case PAGE_MODE_NONE:
	case PAGE_MODE_7_LEVEL:
		break;
	case PAGE_MODE_1_LEVEL:
		free_pt_page(root, freelist);
		break;
	case PAGE_MODE_2_LEVEL:
	case PAGE_MODE_3_LEVEL:
	case PAGE_MODE_4_LEVEL:
	case PAGE_MODE_5_LEVEL:
	case PAGE_MODE_6_LEVEL:
		free_pt_lvl(root, freelist, mode);
		break;
	default:
		BUG();
	}
}

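/*
 * Publish a new page-table root and paging mode for a domain. The root page
 * is page aligned, so its low three bits are free to carry the paging mode;
 * both are stored together as a single 64-bit value.
 */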
void amd_iommu_domain_set_pgtable(struct protection_domain *domain,
				  u64 *root, int mode)
{
	u64 pt_root;

	/* lowest 3 bits encode pgtable mode */
	pt_root = mode & 7;
	pt_root |= (u64)root;

	amd_iommu_domain_set_pt_root(domain, pt_root);
}

/*
 * This function is used to add another level to an IO page table. Adding
 * another level increases the size of the address space by 9 bits to a size up
 * to 64 bits.
 */
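/*
 * For example, a domain in 3-level mode covers a 39-bit IOVA space
 * (3 * 9 translation bits + 12 page-offset bits); growing it to 4-level mode
 * extends that to 48 bits. The new root page holds a single PDE pointing at
 * the old root.
 */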
static bool increase_address_space(struct protection_domain *domain,
				   unsigned long address,
				   gfp_t gfp)
{
	unsigned long flags;
	bool ret = true;
	u64 *pte;

	pte = alloc_pgtable_page(domain->nid, gfp);
	if (!pte)
		return false;

	spin_lock_irqsave(&domain->lock, flags);

	if (address <= PM_LEVEL_SIZE(domain->iop.mode))
		goto out;

	ret = false;
	if (WARN_ON_ONCE(domain->iop.mode == PAGE_MODE_6_LEVEL))
		goto out;

	*pte = PM_LEVEL_PDE(domain->iop.mode, iommu_virt_to_phys(domain->iop.root));

	domain->iop.root = pte;
	domain->iop.mode += 1;
	amd_iommu_update_and_flush_device_table(domain);
	amd_iommu_domain_flush_complete(domain);

	/*
	 * Device Table needs to be updated and flushed before the new root can
	 * be published.
	 */
	amd_iommu_domain_set_pgtable(domain, pte, domain->iop.mode);

	pte = NULL;
	ret = true;

out:
	spin_unlock_irqrestore(&domain->lock, flags);
	free_page((unsigned long)pte);

	return ret;
}

static u64 *alloc_pte(struct protection_domain *domain,
		      unsigned long address,
		      unsigned long page_size,
		      u64 **pte_page,
		      gfp_t gfp,
		      bool *updated)
{
	int level, end_lvl;
	u64 *pte, *page;

	BUG_ON(!is_power_of_2(page_size));

	while (address > PM_LEVEL_SIZE(domain->iop.mode)) {
		/*
		 * Return an error if there is no memory to update the
		 * page-table.
		 */
		if (!increase_address_space(domain, address, gfp))
			return NULL;
	}

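	/*
	 * Walk from the top level down to the level that holds PTEs of the
	 * requested page size, allocating intermediate page-table pages on
	 * the way down.
	 */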
	level = domain->iop.mode - 1;
	pte = &domain->iop.root[PM_LEVEL_INDEX(level, address)];
	address = PAGE_SIZE_ALIGN(address, page_size);
	end_lvl = PAGE_SIZE_LEVEL(page_size);

	while (level > end_lvl) {
		u64 __pte, __npte;
		int pte_level;

		__pte = *pte;
		pte_level = PM_PTE_LEVEL(__pte);

		/*
		 * If we replace a series of large PTEs, we need
		 * to tear down all of them.
		 */
		if (IOMMU_PTE_PRESENT(__pte) &&
		    pte_level == PAGE_MODE_7_LEVEL) {
			unsigned long count, i;
			u64 *lpte;

			lpte = first_pte_l7(pte, NULL, &count);

			/*
			 * Unmap the replicated PTEs that still match the
			 * original large mapping
			 */
			for (i = 0; i < count; ++i)
				cmpxchg64(&lpte[i], __pte, 0ULL);

			*updated = true;
			continue;
		}

		if (!IOMMU_PTE_PRESENT(__pte) ||
		    pte_level == PAGE_MODE_NONE) {
			page = alloc_pgtable_page(domain->nid, gfp);

			if (!page)
				return NULL;

			__npte = PM_LEVEL_PDE(level, iommu_virt_to_phys(page));

			/* pte could have been changed somewhere. */
			if (!try_cmpxchg64(pte, &__pte, __npte))
				free_page((unsigned long)page);
			else if (IOMMU_PTE_PRESENT(__pte))
				*updated = true;

			continue;
		}

		/* No level skipping support yet */
		if (pte_level != level)
			return NULL;

		level -= 1;

		pte = IOMMU_PTE_PAGE(__pte);

		if (pte_page && level == end_lvl)
			*pte_page = pte;

		pte = &pte[PM_LEVEL_INDEX(level, address)];
	}

	return pte;
}

/*
 * This function checks if there is a PTE for a given dma address. If
 * there is one, it returns the pointer to it.
 */
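/*
 * On success *page_size is set to the size of the region the returned PTE
 * covers; for a replicated (level 7) mapping the returned pointer refers to
 * the first PTE of the series.
 */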
static u64 *fetch_pte(struct amd_io_pgtable *pgtable,
		      unsigned long address,
		      unsigned long *page_size)
{
	int level;
	u64 *pte;

	*page_size = 0;

	if (address > PM_LEVEL_SIZE(pgtable->mode))
		return NULL;

	level = pgtable->mode - 1;
	pte = &pgtable->root[PM_LEVEL_INDEX(level, address)];
	*page_size = PTE_LEVEL_PAGE_SIZE(level);

	while (level > 0) {

		/* Not Present */
		if (!IOMMU_PTE_PRESENT(*pte))
			return NULL;

		/* Large PTE */
		if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL ||
		    PM_PTE_LEVEL(*pte) == PAGE_MODE_NONE)
			break;

		/* No level skipping support yet */
		if (PM_PTE_LEVEL(*pte) != level)
			return NULL;

		level -= 1;

		/* Walk to the next level */
		pte = IOMMU_PTE_PAGE(*pte);
		pte = &pte[PM_LEVEL_INDEX(level, address)];
		*page_size = PTE_LEVEL_PAGE_SIZE(level);
	}

	/*
	 * If we have a series of large PTEs, make
	 * sure to return a pointer to the first one.
	 */
	if (PM_PTE_LEVEL(*pte) == PAGE_MODE_7_LEVEL)
		pte = first_pte_l7(pte, page_size, NULL);

	return pte;
}

static void free_clear_pte(u64 *pte, u64 pteval, struct list_head *freelist)
{
	u64 *pt;
	int mode;

	while (!try_cmpxchg64(pte, &pteval, 0))
		pr_warn("AMD-Vi: IOMMU pte changed since we read it\n");

	if (!IOMMU_PTE_PRESENT(pteval))
		return;

	pt = IOMMU_PTE_PAGE(pteval);
	mode = IOMMU_PTE_MODE(pteval);

	free_sub_pt(pt, mode, freelist);
}

/*
 * Generic mapping function. It maps a physical address into a DMA
 * address space. It allocates the page table pages if necessary.
 * In the future it can be extended to support all features of AMD IOMMU
 * page tables like level skipping and full 64 bit address spaces.
 */
static int iommu_v1_map_pages(struct io_pgtable_ops *ops, unsigned long iova,
			      phys_addr_t paddr, size_t pgsize, size_t pgcount,
			      int prot, gfp_t gfp, size_t *mapped)
{
	struct protection_domain *dom = io_pgtable_ops_to_domain(ops);
	LIST_HEAD(freelist);
	bool updated = false;
	u64 __pte, *pte;
	int ret, i, count;
	size_t size = pgcount << __ffs(pgsize);
	unsigned long o_iova = iova;

	BUG_ON(!IS_ALIGNED(iova, pgsize));
	BUG_ON(!IS_ALIGNED(paddr, pgsize));

	ret = -EINVAL;
	if (!(prot & IOMMU_PROT_MASK))
		goto out;

	while (pgcount > 0) {
		count = PAGE_SIZE_PTE_COUNT(pgsize);
		pte = alloc_pte(dom, iova, pgsize, NULL, gfp, &updated);

		ret = -ENOMEM;
		if (!pte)
			goto out;

		for (i = 0; i < count; ++i)
			free_clear_pte(&pte[i], pte[i], &freelist);

		if (!list_empty(&freelist))
			updated = true;

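		/*
		 * A page size that is a power of two but not a native level
		 * size (32 KiB, for instance) is written as
		 * PAGE_SIZE_PTE_COUNT() replicated PTEs carrying the level-7
		 * encoding; a native size ends up as a single PTE in the else
		 * branch.
		 */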
		if (count > 1) {
			__pte = PAGE_SIZE_PTE(__sme_set(paddr), pgsize);
			__pte |= PM_LEVEL_ENC(7) | IOMMU_PTE_PR | IOMMU_PTE_FC;
		} else
			__pte = __sme_set(paddr) | IOMMU_PTE_PR | IOMMU_PTE_FC;

		if (prot & IOMMU_PROT_IR)
			__pte |= IOMMU_PTE_IR;
		if (prot & IOMMU_PROT_IW)
			__pte |= IOMMU_PTE_IW;

		for (i = 0; i < count; ++i)
			pte[i] = __pte;

		iova += pgsize;
		paddr += pgsize;
		pgcount--;
		if (mapped)
			*mapped += pgsize;
	}

	ret = 0;

out:
	if (updated) {
		unsigned long flags;

		spin_lock_irqsave(&dom->lock, flags);
		/*
		 * Flush domain TLB(s) and wait for completion. Any Device-Table
		 * Updates and flushing already happened in
		 * increase_address_space().
		 */
		amd_iommu_domain_flush_pages(dom, o_iova, size);
		spin_unlock_irqrestore(&dom->lock, flags);
	}

	/* Everything flushed out, free pages now */
	put_pages_list(&freelist);

	return ret;
}

static unsigned long iommu_v1_unmap_pages(struct io_pgtable_ops *ops,
					  unsigned long iova,
					  size_t pgsize, size_t pgcount,
					  struct iommu_iotlb_gather *gather)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long long unmapped;
	unsigned long unmap_size;
	u64 *pte;
	size_t size = pgcount << __ffs(pgsize);

	BUG_ON(!is_power_of_2(pgsize));

	unmapped = 0;

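	/*
	 * fetch_pte() reports the size actually covered by the PTE it finds,
	 * so the walk advances by whatever mapping granule is present rather
	 * than strictly by pgsize.
	 */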
	while (unmapped < size) {
		pte = fetch_pte(pgtable, iova, &unmap_size);
		if (pte) {
			int i, count;

			count = PAGE_SIZE_PTE_COUNT(unmap_size);
			for (i = 0; i < count; i++)
				pte[i] = 0ULL;
		} else {
			return unmapped;
		}

		iova = (iova & ~(unmap_size - 1)) + unmap_size;
		unmapped += unmap_size;
	}

	return unmapped;
}

static phys_addr_t iommu_v1_iova_to_phys(struct io_pgtable_ops *ops, unsigned long iova)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long offset_mask, pte_pgsize;
	u64 *pte, __pte;

	pte = fetch_pte(pgtable, iova, &pte_pgsize);

	if (!pte || !IOMMU_PTE_PRESENT(*pte))
		return 0;

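	/*
	 * Example: for a 2 MiB mapping pte_pgsize is 2 MiB, so the low 21 bits
	 * of the IOVA pass straight through into the returned physical
	 * address.
	 */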
	offset_mask = pte_pgsize - 1;
	__pte = __sme_clr(*pte & PM_ADDR_MASK);

	return (__pte & ~offset_mask) | (iova & offset_mask);
}

static bool pte_test_and_clear_dirty(u64 *ptep, unsigned long size,
				     unsigned long flags)
{
	bool test_only = flags & IOMMU_DIRTY_NO_CLEAR;
	bool dirty = false;
	int i, count;

	/*
	 * 2.2.3.2 Host Dirty Support
	 * When a non-default page size is used, software must OR the
	 * Dirty bits in all of the replicated host PTEs used to map
	 * the page. The IOMMU does not guarantee the Dirty bits are
	 * set in all of the replicated PTEs. Any portion of the page
	 * may have been written even if the Dirty bit is set in only
	 * one of the replicated PTEs.
	 */
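	/*
	 * Exactly one of the two loops below runs: the first only tests the
	 * dirty bits (IOMMU_DIRTY_NO_CLEAR), the second tests and clears
	 * them.
	 */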
	count = PAGE_SIZE_PTE_COUNT(size);
	for (i = 0; i < count && test_only; i++) {
		if (test_bit(IOMMU_PTE_HD_BIT, (unsigned long *)&ptep[i])) {
			dirty = true;
			break;
		}
	}

	for (i = 0; i < count && !test_only; i++) {
		if (test_and_clear_bit(IOMMU_PTE_HD_BIT,
				       (unsigned long *)&ptep[i])) {
			dirty = true;
		}
	}

	return dirty;
}

static int iommu_v1_read_and_clear_dirty(struct io_pgtable_ops *ops,
					 unsigned long iova, size_t size,
					 unsigned long flags,
					 struct iommu_dirty_bitmap *dirty)
{
	struct amd_io_pgtable *pgtable = io_pgtable_ops_to_data(ops);
	unsigned long end = iova + size - 1;

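	/*
	 * Walk the range one mapping at a time; fetch_pte() reports the page
	 * size of each mapping, so large and replicated PTEs are stepped over
	 * in a single iteration.
	 */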
	do {
		unsigned long pgsize = 0;
		u64 *ptep, pte;

		ptep = fetch_pte(pgtable, iova, &pgsize);
		if (ptep)
			pte = READ_ONCE(*ptep);
		if (!ptep || !IOMMU_PTE_PRESENT(pte)) {
			pgsize = pgsize ?: PTE_LEVEL_PAGE_SIZE(0);
			iova += pgsize;
			continue;
		}

		/*
		 * Mark the whole IOVA range as dirty even if only one of
		 * the replicated PTEs was marked dirty.
		 */
		if (pte_test_and_clear_dirty(ptep, pgsize, flags))
			iommu_dirty_bitmap_record(dirty, iova, pgsize);
		iova += pgsize;
	} while (iova < end);

	return 0;
}

/*
 * ----------------------------------------------------
 */
static void v1_free_pgtable(struct io_pgtable *iop)
{
	struct amd_io_pgtable *pgtable = container_of(iop, struct amd_io_pgtable, iop);
	struct protection_domain *dom;
	LIST_HEAD(freelist);

	if (pgtable->mode == PAGE_MODE_NONE)
		return;

	dom = container_of(pgtable, struct protection_domain, iop);

	/* Page-table is not visible to IOMMU anymore, so free it */
	BUG_ON(pgtable->mode < PAGE_MODE_NONE ||
	       pgtable->mode > PAGE_MODE_6_LEVEL);

	free_sub_pt(pgtable->root, pgtable->mode, &freelist);

	/* Update data structure */
	amd_iommu_domain_clr_pt_root(dom);

	/* Make changes visible to IOMMUs */
	amd_iommu_domain_update(dom);

	put_pages_list(&freelist);
}

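/*
 * Set up the io_pgtable_cfg and the map/unmap callbacks for a new v1 page
 * table. The init_fns structure at the end of this file hooks these routines
 * into the io-pgtable core (the AMD_IOMMU_V1 format).
 */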
static struct io_pgtable *v1_alloc_pgtable(struct io_pgtable_cfg *cfg, void *cookie)
{
	struct amd_io_pgtable *pgtable = io_pgtable_cfg_to_data(cfg);

	cfg->pgsize_bitmap = AMD_IOMMU_PGSIZES;
	cfg->ias = IOMMU_IN_ADDR_BIT_SIZE;
	cfg->oas = IOMMU_OUT_ADDR_BIT_SIZE;
	cfg->tlb = &v1_flush_ops;

	pgtable->iop.ops.map_pages    = iommu_v1_map_pages;
	pgtable->iop.ops.unmap_pages  = iommu_v1_unmap_pages;
	pgtable->iop.ops.iova_to_phys = iommu_v1_iova_to_phys;
	pgtable->iop.ops.read_and_clear_dirty = iommu_v1_read_and_clear_dirty;

	return &pgtable->iop;
}

struct io_pgtable_init_fns io_pgtable_amd_iommu_v1_init_fns = {
	.alloc	= v1_alloc_pgtable,
	.free	= v1_free_pgtable,
};