/* SPDX-License-Identifier: GPL-2.0 */
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 */
#ifndef __IO_PAGETABLE_H
#define __IO_PAGETABLE_H

#include <linux/interval_tree.h>
#include <linux/mutex.h>
#include <linux/kref.h>
#include <linux/xarray.h>

#include "iommufd_private.h"

struct iommu_domain;

/*
 * Each io_pagetable is composed of intervals of areas which cover regions of
 * the iova that are backed by something. iova not covered by areas is not
 * populated in the page table. Each area is fully populated with pages.
 *
 * iovas are in byte units, but must be iopt->iova_alignment aligned.
 *
 * pages can be NULL, this means some other thread is still working on setting
 * up or tearing down the area. When observed under the write side of the
 * domains_rwsem a NULL pages must mean the area is still being setup and no
 * domains are filled.
 *
 * storage_domain points at an arbitrary iommu_domain that is holding the PFNs
 * for this area. It is locked by the pages->mutex. This simplifies the locking
 * as the pages code can rely on the storage_domain without having to get the
 * iopt->domains_rwsem.
 *
 * The io_pagetable::iova_rwsem protects node
 * The iopt_pages::mutex protects pages_node
 * iopt and iommu_prot are immutable
 * The pages::mutex protects num_accesses
 */
struct iopt_area {
	struct interval_tree_node node;
	struct interval_tree_node pages_node;
	struct io_pagetable *iopt;
	struct iopt_pages *pages;
	struct iommu_domain *storage_domain;
	/* How many bytes into the first page the area starts */
	unsigned int page_offset;
	/* IOMMU_READ, IOMMU_WRITE, etc */
	int iommu_prot;
	bool prevent_access : 1;
	unsigned int num_accesses;
};
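
/*
 * A minimal sketch (illustrative only) of the locking rule stated above:
 * num_accesses is protected by pages->mutex, so a caller bumping it would
 * follow this pattern:
 *
 *	mutex_lock(&area->pages->mutex);
 *	if (!area->prevent_access)
 *		area->num_accesses++;
 *	mutex_unlock(&area->pages->mutex);
 */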

struct iopt_allowed {
	struct interval_tree_node node;
};

struct iopt_reserved {
	struct interval_tree_node node;
	void *owner;
};

int iopt_area_fill_domains(struct iopt_area *area, struct iopt_pages *pages);
void iopt_area_unfill_domains(struct iopt_area *area, struct iopt_pages *pages);

int iopt_area_fill_domain(struct iopt_area *area, struct iommu_domain *domain);
void iopt_area_unfill_domain(struct iopt_area *area, struct iopt_pages *pages,
			     struct iommu_domain *domain);
void iopt_area_unmap_domain(struct iopt_area *area,
			    struct iommu_domain *domain);

static inline unsigned long iopt_area_index(struct iopt_area *area)
{
	return area->pages_node.start;
}

static inline unsigned long iopt_area_last_index(struct iopt_area *area)
{
	return area->pages_node.last;
}

static inline unsigned long iopt_area_iova(struct iopt_area *area)
{
	return area->node.start;
}

static inline unsigned long iopt_area_last_iova(struct iopt_area *area)
{
	return area->node.last;
}

static inline size_t iopt_area_length(struct iopt_area *area)
{
	return (area->node.last - area->node.start) + 1;
}

/*
 * Number of bytes from the start of the iopt_pages that the iova begins.
 * iopt_area_start_byte() / PAGE_SIZE encodes the starting page index
 * iopt_area_start_byte() % PAGE_SIZE encodes the offset within that page
 */
static inline unsigned long iopt_area_start_byte(struct iopt_area *area,
						 unsigned long iova)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(iova < iopt_area_iova(area) ||
			iova > iopt_area_last_iova(area));
	return (iova - iopt_area_iova(area)) + area->page_offset +
	       iopt_area_index(area) * PAGE_SIZE;
}

static inline unsigned long iopt_area_iova_to_index(struct iopt_area *area,
						    unsigned long iova)
{
	return iopt_area_start_byte(area, iova) / PAGE_SIZE;
}
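
/*
 * A worked example of the encoding described above (illustrative values):
 * with PAGE_SIZE = 4096, page_offset = 0x10 and iopt_area_index() = 2, the
 * first iova of the area gives
 *
 *	iopt_area_start_byte(area, iopt_area_iova(area))
 *		== 0 + 0x10 + 2 * 4096 == 0x2010
 *
 * so iopt_area_iova_to_index() returns page index 2 (0x2010 / 4096) and the
 * byte lands at offset 0x10 (0x2010 % 4096) within that page.
 */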

#define __make_iopt_iter(name)                                                 \
	static inline struct iopt_##name *iopt_##name##_iter_first(            \
		struct io_pagetable *iopt, unsigned long start,                \
		unsigned long last)                                            \
	{                                                                      \
		struct interval_tree_node *node;                               \
                                                                               \
		lockdep_assert_held(&iopt->iova_rwsem);                        \
		node = interval_tree_iter_first(&iopt->name##_itree, start,    \
						last);                         \
		if (!node)                                                     \
			return NULL;                                           \
		return container_of(node, struct iopt_##name, node);           \
	}                                                                      \
	static inline struct iopt_##name *iopt_##name##_iter_next(             \
		struct iopt_##name *last_node, unsigned long start,            \
		unsigned long last)                                            \
	{                                                                      \
		struct interval_tree_node *node;                               \
                                                                               \
		node = interval_tree_iter_next(&last_node->node, start, last); \
		if (!node)                                                     \
			return NULL;                                           \
		return container_of(node, struct iopt_##name, node);           \
	}

__make_iopt_iter(area)
__make_iopt_iter(allowed)
__make_iopt_iter(reserved)
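
/*
 * Usage sketch for the generated iterators (illustrative only): walk every
 * area overlapping [start, last] while holding the iova_rwsem, which the
 * lockdep assertion in *_iter_first() requires:
 *
 *	struct iopt_area *area;
 *
 *	down_read(&iopt->iova_rwsem);
 *	for (area = iopt_area_iter_first(iopt, start, last); area;
 *	     area = iopt_area_iter_next(area, start, last))
 *		...;
 *	up_read(&iopt->iova_rwsem);
 */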

struct iopt_area_contig_iter {
	unsigned long cur_iova;
	unsigned long last_iova;
	struct iopt_area *area;
};
struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova);
struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter);

static inline bool iopt_area_contig_done(struct iopt_area_contig_iter *iter)
{
	return iter->area && iter->last_iova <= iopt_area_last_iova(iter->area);
}

/*
 * Iterate over a contiguous list of areas that span the iova,last_iova range.
 * The caller must check iopt_area_contig_done() after the loop to see if
 * contiguous areas existed.
 */
#define iopt_for_each_contig_area(iter, area, iopt, iova, last_iova)          \
	for (area = iopt_area_contig_init(iter, iopt, iova, last_iova); area; \
	     area = iopt_area_contig_next(iter))
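
/*
 * Usage sketch (illustrative only), following the rule in the comment above:
 * iterate, then verify the whole range was covered by contiguous areas:
 *
 *	struct iopt_area_contig_iter iter;
 *	struct iopt_area *area;
 *	int rc = 0;
 *
 *	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova)
 *		...;
 *	if (!iopt_area_contig_done(&iter))
 *		rc = -ENOENT;
 */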

enum {
	IOPT_PAGES_ACCOUNT_NONE = 0,
	IOPT_PAGES_ACCOUNT_USER = 1,
	IOPT_PAGES_ACCOUNT_MM = 2,
};

/*
 * This holds a pinned page list for multiple areas of IO address space. The
 * pages always originate from a linear chunk of userspace VA. Multiple
 * io_pagetable's, through their iopt_area's, can share a single iopt_pages
 * which avoids multi-pinning and double accounting of page consumption.
 *
 * indexes in this structure are measured in PAGE_SIZE units, are 0 based from
 * the start of the uptr and extend to npages. pages are pinned dynamically
 * according to the intervals in the access_itree and domains_itree, npinned
 * records the current number of pages pinned.
 */
struct iopt_pages {
	struct kref kref;
	struct mutex mutex;
	size_t npages;
	size_t npinned;
	size_t last_npinned;
	struct task_struct *source_task;
	struct mm_struct *source_mm;
	struct user_struct *source_user;
	void __user *uptr;
	bool writable:1;
	u8 account_mode;

	struct xarray pinned_pfns;
	/* Of iopt_pages_access::node */
	struct rb_root_cached access_itree;
	/* Of iopt_area::pages_node */
	struct rb_root_cached domains_itree;
};
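
/*
 * Sharing sketch (illustrative only): because iopt_pages is kref'd, a second
 * iopt_area backed by the same user memory takes a reference on the existing
 * iopt_pages rather than pinning and accounting the pages a second time:
 *
 *	kref_get(&pages->kref);
 *	new_area->pages = pages;
 */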

struct iopt_pages *iopt_alloc_pages(void __user *uptr, unsigned long length,
				    bool writable);
void iopt_release_pages(struct kref *kref);
static inline void iopt_put_pages(struct iopt_pages *pages)
{
	kref_put(&pages->kref, iopt_release_pages);
}
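
/*
 * Lifecycle sketch (illustrative only):
 *
 *	struct iopt_pages *pages;
 *
 *	pages = iopt_alloc_pages(uptr, length, true);
 *	if (IS_ERR(pages))
 *		return PTR_ERR(pages);
 *	...
 *	iopt_put_pages(pages);
 *
 * The final iopt_put_pages() drops the kref and releases everything through
 * iopt_release_pages().
 */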

void iopt_pages_fill_from_xarray(struct iopt_pages *pages, unsigned long start,
				 unsigned long last, struct page **out_pages);
int iopt_pages_fill_xarray(struct iopt_pages *pages, unsigned long start,
			   unsigned long last, struct page **out_pages);
void iopt_pages_unfill_xarray(struct iopt_pages *pages, unsigned long start,
			      unsigned long last);

int iopt_area_add_access(struct iopt_area *area, unsigned long start,
			 unsigned long last, struct page **out_pages,
			 unsigned int flags);
void iopt_area_remove_access(struct iopt_area *area, unsigned long start,
			     unsigned long last);
int iopt_pages_rw_access(struct iopt_pages *pages, unsigned long start_byte,
			 void *data, unsigned long length, unsigned int flags);
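
/*
 * Pairing sketch (illustrative only): add/remove access calls must be
 * balanced over the same index range, here with no special flags:
 *
 *	rc = iopt_area_add_access(area, start_index, last_index, out_pages, 0);
 *	if (rc)
 *		return rc;
 *	... use the pinned out_pages ...
 *	iopt_area_remove_access(area, start_index, last_index);
 */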

/*
 * Each interval represents an active iopt_access_pages(), it acts as an
 * interval lock that keeps the PFNs pinned and stored in the xarray.
 */
struct iopt_pages_access {
	struct interval_tree_node node;
	unsigned int users;
};

#endif