// SPDX-License-Identifier: GPL-2.0
/* Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES.
 *
 * The io_pagetable is the top of the data structure that maps IOVAs to PFNs.
 * The PFNs can be placed into an iommu_domain, or returned to the caller as a
 * page list for access by an in-kernel user.
 *
 * The data structure uses the iopt_pages to optimize the storage of the PFNs
 * between the domains and xarray.
 */
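
/*
 * Roughly, the layering looks like this (an illustrative sketch based on the
 * structures used below, not a formal diagram):
 *
 *	io_pagetable (iopt)
 *	   area_itree: interval tree of iopt_area, keyed by IOVA
 *	      iopt_area: one contiguous IOVA slice, referencing an iopt_pages
 *	         iopt_pages: PFN storage shared between iommu_domains and
 *	                     in-kernel accesses
 */
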
#include <linux/iommufd.h>
#include <linux/lockdep.h>
#include <linux/iommu.h>
#include <linux/sched/mm.h>
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/errno.h>
#include <uapi/linux/iommufd.h>

#include "io_pagetable.h"
#include "double_span.h"

struct iopt_pages_list {
	struct iopt_pages *pages;
	struct iopt_area *area;
	struct list_head next;
	unsigned long start_byte;
	unsigned long length;
};

struct iopt_area *iopt_area_contig_init(struct iopt_area_contig_iter *iter,
					struct io_pagetable *iopt,
					unsigned long iova,
					unsigned long last_iova)
{
	lockdep_assert_held(&iopt->iova_rwsem);

	iter->cur_iova = iova;
	iter->last_iova = last_iova;
	iter->area = iopt_area_iter_first(iopt, iova, iova);
	if (!iter->area)
		return NULL;
	if (!iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}

struct iopt_area *iopt_area_contig_next(struct iopt_area_contig_iter *iter)
{
	unsigned long last_iova;

	if (!iter->area)
		return NULL;
	last_iova = iopt_area_last_iova(iter->area);
	if (iter->last_iova <= last_iova)
		return NULL;

	iter->cur_iova = last_iova + 1;
	iter->area = iopt_area_iter_next(iter->area, iter->cur_iova,
					 iter->last_iova);
	if (!iter->area)
		return NULL;
	if (iter->cur_iova != iopt_area_iova(iter->area) ||
	    !iter->area->pages) {
		iter->area = NULL;
		return NULL;
	}
	return iter->area;
}
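
/*
 * The two helpers above are the iteration primitives behind
 * iopt_for_each_contig_area(). A typical caller in this file walks a range
 * and then verifies the walk covered it with no holes, e.g.:
 *
 *	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
 *		unsigned long last = min(last_iova,
 *					 iopt_area_last_iova(area));
 *		// operate on [iter.cur_iova, last]
 *	}
 *	if (!iopt_area_contig_done(&iter))
 *		return -ENOENT;
 */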

static bool __alloc_iova_check_hole(struct interval_tree_double_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_used || span->last_hole - span->start_hole < length - 1)
		return false;

	span->start_hole = ALIGN(span->start_hole, iova_alignment) |
			   page_offset;
	if (span->start_hole > span->last_hole ||
	    span->last_hole - span->start_hole < length - 1)
		return false;
	return true;
}

static bool __alloc_iova_check_used(struct interval_tree_span_iter *span,
				    unsigned long length,
				    unsigned long iova_alignment,
				    unsigned long page_offset)
{
	if (span->is_hole || span->last_used - span->start_used < length - 1)
		return false;

	span->start_used = ALIGN(span->start_used, iova_alignment) |
			   page_offset;
	if (span->start_used > span->last_used ||
	    span->last_used - span->start_used < length - 1)
		return false;
	return true;
}
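
/*
 * Worked example of the ALIGN + OR above (illustrative numbers): for a
 * 64KiB-aligned uptr and length = 0x10000, iopt_alloc_iova() below picks
 * iova_alignment = 0x10000 and page_offset = 0, so a hole at 0x12345000
 * becomes ALIGN(0x12345000, 0x10000) = 0x12350000. For a uptr with page
 * offset 0x800, the alignment drops to 0x800 and the OR keeps the low bits
 * of the candidate IOVA equal to 0x800, matching the uptr's offset.
 */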

/*
 * Automatically find a block of IOVA that is not being used and not reserved.
 * Does not return a 0 IOVA even if it is valid.
 */
static int iopt_alloc_iova(struct io_pagetable *iopt, unsigned long *iova,
			   unsigned long uptr, unsigned long length)
{
	unsigned long page_offset = uptr % PAGE_SIZE;
	struct interval_tree_double_span_iter used_span;
	struct interval_tree_span_iter allowed_span;
	unsigned long iova_alignment;

	lockdep_assert_held(&iopt->iova_rwsem);

	/* Protect roundup_pow_of_two() from overflow */
	if (length == 0 || length >= ULONG_MAX / 2)
		return -EOVERFLOW;

	/*
	 * Keep alignment present in the uptr when building the IOVA; this
	 * increases the chance we can map a THP.
	 */
	if (!uptr)
		iova_alignment = roundup_pow_of_two(length);
	else
		iova_alignment = min_t(unsigned long,
				       roundup_pow_of_two(length),
				       1UL << __ffs64(uptr));

	if (iova_alignment < iopt->iova_alignment)
		return -EINVAL;

	interval_tree_for_each_span(&allowed_span, &iopt->allowed_itree,
				    PAGE_SIZE, ULONG_MAX - PAGE_SIZE) {
		if (RB_EMPTY_ROOT(&iopt->allowed_itree.rb_root)) {
			allowed_span.start_used = PAGE_SIZE;
			allowed_span.last_used = ULONG_MAX - PAGE_SIZE;
			allowed_span.is_hole = false;
		}

		if (!__alloc_iova_check_used(&allowed_span, length,
					     iova_alignment, page_offset))
			continue;

		interval_tree_for_each_double_span(
			&used_span, &iopt->reserved_itree, &iopt->area_itree,
			allowed_span.start_used, allowed_span.last_used) {
			if (!__alloc_iova_check_hole(&used_span, length,
						     iova_alignment,
						     page_offset))
				continue;

			*iova = used_span.start_hole;
			return 0;
		}
	}
	return -ENOSPC;
}

static int iopt_check_iova(struct io_pagetable *iopt, unsigned long iova,
			   unsigned long length)
{
	unsigned long last;

	lockdep_assert_held(&iopt->iova_rwsem);

	if ((iova & (iopt->iova_alignment - 1)))
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &last))
		return -EOVERFLOW;

	/* No reserved IOVA intersects the range */
	if (iopt_reserved_iter_first(iopt, iova, last))
		return -EINVAL;

	/* Check that there is not already a mapping in the range */
	if (iopt_area_iter_first(iopt, iova, last))
		return -EEXIST;
	return 0;
}

/*
 * The area takes a slice of the pages from start_byte to start_byte + length
 */
static int iopt_insert_area(struct io_pagetable *iopt, struct iopt_area *area,
			    struct iopt_pages *pages, unsigned long iova,
			    unsigned long start_byte, unsigned long length,
			    int iommu_prot)
{
	lockdep_assert_held_write(&iopt->iova_rwsem);

	if ((iommu_prot & IOMMU_WRITE) && !pages->writable)
		return -EPERM;

	area->iommu_prot = iommu_prot;
	area->page_offset = start_byte % PAGE_SIZE;
	if (area->page_offset & (iopt->iova_alignment - 1))
		return -EINVAL;

	area->node.start = iova;
	if (check_add_overflow(iova, length - 1, &area->node.last))
		return -EOVERFLOW;

	area->pages_node.start = start_byte / PAGE_SIZE;
	if (check_add_overflow(start_byte, length - 1, &area->pages_node.last))
		return -EOVERFLOW;
	area->pages_node.last = area->pages_node.last / PAGE_SIZE;
	if (WARN_ON(area->pages_node.last >= pages->npages))
		return -EOVERFLOW;

	/*
	 * The area is inserted with a NULL pages indicating it is not fully
	 * initialized yet.
	 */
	area->iopt = iopt;
	interval_tree_insert(&area->node, &iopt->area_itree);
	return 0;
}

static struct iopt_area *iopt_area_alloc(void)
{
	struct iopt_area *area;

	area = kzalloc(sizeof(*area), GFP_KERNEL_ACCOUNT);
	if (!area)
		return NULL;
	RB_CLEAR_NODE(&area->node.rb);
	RB_CLEAR_NODE(&area->pages_node.rb);
	return area;
}

static int iopt_alloc_area_pages(struct io_pagetable *iopt,
				 struct list_head *pages_list,
				 unsigned long length, unsigned long *dst_iova,
				 int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	unsigned long iova;
	int rc = 0;

	list_for_each_entry(elm, pages_list, next) {
		elm->area = iopt_area_alloc();
		if (!elm->area)
			return -ENOMEM;
	}

	down_write(&iopt->iova_rwsem);
	if ((length & (iopt->iova_alignment - 1)) || !length) {
		rc = -EINVAL;
		goto out_unlock;
	}

	if (flags & IOPT_ALLOC_IOVA) {
		/* Use the first entry to guess the ideal IOVA alignment */
		elm = list_first_entry(pages_list, struct iopt_pages_list,
				       next);
		rc = iopt_alloc_iova(
			iopt, dst_iova,
			(uintptr_t)elm->pages->uptr + elm->start_byte, length);
		if (rc)
			goto out_unlock;
		if (IS_ENABLED(CONFIG_IOMMUFD_TEST) &&
		    WARN_ON(iopt_check_iova(iopt, *dst_iova, length))) {
			rc = -EINVAL;
			goto out_unlock;
		}
	} else {
		rc = iopt_check_iova(iopt, *dst_iova, length);
		if (rc)
			goto out_unlock;
	}

	/*
	 * Areas are created with a NULL pages so that the IOVA space is
	 * reserved and we can unlock the iova_rwsem.
	 */
	iova = *dst_iova;
	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_insert_area(iopt, elm->area, elm->pages, iova,
				      elm->start_byte, elm->length, iommu_prot);
		if (rc)
			goto out_unlock;
		iova += elm->length;
	}

out_unlock:
	up_write(&iopt->iova_rwsem);
	return rc;
}

static void iopt_abort_area(struct iopt_area *area)
{
	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		WARN_ON(area->pages);
	if (area->iopt) {
		down_write(&area->iopt->iova_rwsem);
		interval_tree_remove(&area->node, &area->iopt->area_itree);
		up_write(&area->iopt->iova_rwsem);
	}
	kfree(area);
}

void iopt_free_pages_list(struct list_head *pages_list)
{
	struct iopt_pages_list *elm;

	while ((elm = list_first_entry_or_null(pages_list,
					       struct iopt_pages_list, next))) {
		if (elm->area)
			iopt_abort_area(elm->area);
		if (elm->pages)
			iopt_put_pages(elm->pages);
		list_del(&elm->next);
		kfree(elm);
	}
}

static int iopt_fill_domains_pages(struct list_head *pages_list)
{
	struct iopt_pages_list *undo_elm;
	struct iopt_pages_list *elm;
	int rc;

	list_for_each_entry(elm, pages_list, next) {
		rc = iopt_area_fill_domains(elm->area, elm->pages);
		if (rc)
			goto err_undo;
	}
	return 0;

err_undo:
	list_for_each_entry(undo_elm, pages_list, next) {
		if (undo_elm == elm)
			break;
		iopt_area_unfill_domains(undo_elm->area, undo_elm->pages);
	}
	return rc;
}

int iopt_map_pages(struct io_pagetable *iopt, struct list_head *pages_list,
		   unsigned long length, unsigned long *dst_iova,
		   int iommu_prot, unsigned int flags)
{
	struct iopt_pages_list *elm;
	int rc;

	rc = iopt_alloc_area_pages(iopt, pages_list, length, dst_iova,
				   iommu_prot, flags);
	if (rc)
		return rc;

	down_read(&iopt->domains_rwsem);
	rc = iopt_fill_domains_pages(pages_list);
	if (rc)
		goto out_unlock_domains;

	down_write(&iopt->iova_rwsem);
	list_for_each_entry(elm, pages_list, next) {
		/*
		 * area->pages must be set inside the domains_rwsem to ensure
		 * any newly added domains will get filled. Moves the reference
		 * in from the list.
		 */
		elm->area->pages = elm->pages;
		elm->pages = NULL;
		elm->area = NULL;
	}
	up_write(&iopt->iova_rwsem);
out_unlock_domains:
	up_read(&iopt->domains_rwsem);
	return rc;
}

/**
 * iopt_map_user_pages() - Map a user VA to an iova in the io page table
 * @ictx: iommufd_ctx the iopt is part of
 * @iopt: io_pagetable to act on
 * @iova: If IOPT_ALLOC_IOVA is set this is unused on input and contains
 *        the chosen iova on output. Otherwise it is the iova to map to on
 *        input
 * @uptr: User VA to map
 * @length: Number of bytes to map
 * @iommu_prot: Combination of IOMMU_READ/WRITE/etc bits for the mapping
 * @flags: IOPT_ALLOC_IOVA or zero
 *
 * iova, uptr, and length must be aligned to iova_alignment. For domain backed
 * page tables this will pin the pages and load them into the domain at iova.
 * For non-domain page tables this will only set up a lazy reference and the
 * caller must use iopt_access_pages() to touch them.
 *
 * iopt_unmap_iova() must be called to undo this before the io_pagetable can be
 * destroyed.
 */
int iopt_map_user_pages(struct iommufd_ctx *ictx, struct io_pagetable *iopt,
			unsigned long *iova, void __user *uptr,
			unsigned long length, int iommu_prot,
			unsigned int flags)
{
	struct iopt_pages_list elm = {};
	LIST_HEAD(pages_list);
	int rc;

	elm.pages = iopt_alloc_pages(uptr, length, iommu_prot & IOMMU_WRITE);
	if (IS_ERR(elm.pages))
		return PTR_ERR(elm.pages);
	if (ictx->account_mode == IOPT_PAGES_ACCOUNT_MM &&
	    elm.pages->account_mode == IOPT_PAGES_ACCOUNT_USER)
		elm.pages->account_mode = IOPT_PAGES_ACCOUNT_MM;
	elm.start_byte = uptr - elm.pages->uptr;
	elm.length = length;
	list_add(&elm.next, &pages_list);

	rc = iopt_map_pages(iopt, &pages_list, length, iova, iommu_prot, flags);
	if (rc) {
		if (elm.area)
			iopt_abort_area(elm.area);
		if (elm.pages)
			iopt_put_pages(elm.pages);
		return rc;
	}
	return 0;
}
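
/*
 * Illustrative only (not a caller in this file): a user of the API above
 * that lets the allocator pick the IOVA would look roughly like:
 *
 *	unsigned long iova;
 *	int rc = iopt_map_user_pages(ictx, iopt, &iova, uptr, length,
 *				     IOMMU_READ | IOMMU_WRITE,
 *				     IOPT_ALLOC_IOVA);
 *
 * On success @iova holds the chosen address, and the mapping must
 * eventually be undone with iopt_unmap_iova(iopt, iova, length, NULL).
 */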

struct iova_bitmap_fn_arg {
	unsigned long flags;
	struct io_pagetable *iopt;
	struct iommu_domain *domain;
	struct iommu_dirty_bitmap *dirty;
};

static int __iommu_read_and_clear_dirty(struct iova_bitmap *bitmap,
					unsigned long iova, size_t length,
					void *opaque)
{
	struct iopt_area *area;
	struct iopt_area_contig_iter iter;
	struct iova_bitmap_fn_arg *arg = opaque;
	struct iommu_domain *domain = arg->domain;
	struct iommu_dirty_bitmap *dirty = arg->dirty;
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	unsigned long last_iova = iova + length - 1;
	unsigned long flags = arg->flags;
	int ret;

	iopt_for_each_contig_area(&iter, area, arg->iopt, iova, last_iova) {
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		ret = ops->read_and_clear_dirty(domain, iter.cur_iova,
						last - iter.cur_iova + 1, flags,
						dirty);
		if (ret)
			return ret;
	}

	if (!iopt_area_contig_done(&iter))
		return -EINVAL;
	return 0;
}

static int
iommu_read_and_clear_dirty(struct iommu_domain *domain,
			   struct io_pagetable *iopt, unsigned long flags,
			   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iova_bitmap_fn_arg arg;
	struct iova_bitmap *iter;
	int ret = 0;

	if (!ops || !ops->read_and_clear_dirty)
		return -EOPNOTSUPP;

	iter = iova_bitmap_alloc(bitmap->iova, bitmap->length,
				 bitmap->page_size,
				 u64_to_user_ptr(bitmap->data));
	if (IS_ERR(iter))
		return -ENOMEM;

	iommu_dirty_bitmap_init(&dirty, iter, &gather);

	arg.flags = flags;
	arg.iopt = iopt;
	arg.domain = domain;
	arg.dirty = &dirty;
	iova_bitmap_for_each(iter, &arg, __iommu_read_and_clear_dirty);

	if (!(flags & IOMMU_DIRTY_NO_CLEAR))
		iommu_iotlb_sync(domain, &gather);

	iova_bitmap_free(iter);

	return ret;
}

int iommufd_check_iova_range(struct io_pagetable *iopt,
			     struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	size_t iommu_pgsize = iopt->iova_alignment;
	u64 last_iova;

	if (check_add_overflow(bitmap->iova, bitmap->length - 1, &last_iova))
		return -EOVERFLOW;

	if (bitmap->iova > ULONG_MAX || last_iova > ULONG_MAX)
		return -EOVERFLOW;

	if ((bitmap->iova & (iommu_pgsize - 1)) ||
	    ((last_iova + 1) & (iommu_pgsize - 1)))
		return -EINVAL;

	if (!bitmap->page_size)
		return -EINVAL;

	if ((bitmap->iova & (bitmap->page_size - 1)) ||
	    ((last_iova + 1) & (bitmap->page_size - 1)))
		return -EINVAL;

	return 0;
}
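
/*
 * Example of the checks above (illustrative numbers): with
 * iova_alignment = 0x1000 and bitmap->page_size = 0x1000, a request of
 * iova = 0x10000, length = 0x4000 passes, while length = 0x4100 fails
 * because the end of the range is not page aligned.
 */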

int iopt_read_and_clear_dirty_data(struct io_pagetable *iopt,
				   struct iommu_domain *domain,
				   unsigned long flags,
				   struct iommu_hwpt_get_dirty_bitmap *bitmap)
{
	int ret;

	ret = iommufd_check_iova_range(iopt, bitmap);
	if (ret)
		return ret;

	down_read(&iopt->iova_rwsem);
	ret = iommu_read_and_clear_dirty(domain, iopt, flags, bitmap);
	up_read(&iopt->iova_rwsem);

	return ret;
}

static int iopt_clear_dirty_data(struct io_pagetable *iopt,
				 struct iommu_domain *domain)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	struct iommu_iotlb_gather gather;
	struct iommu_dirty_bitmap dirty;
	struct iopt_area *area;
	int ret = 0;

	lockdep_assert_held_read(&iopt->iova_rwsem);

	iommu_dirty_bitmap_init(&dirty, NULL, &gather);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		if (!area->pages)
			continue;

		ret = ops->read_and_clear_dirty(domain, iopt_area_iova(area),
						iopt_area_length(area), 0,
						&dirty);
		if (ret)
			break;
	}

	iommu_iotlb_sync(domain, &gather);
	return ret;
}

int iopt_set_dirty_tracking(struct io_pagetable *iopt,
			    struct iommu_domain *domain, bool enable)
{
	const struct iommu_dirty_ops *ops = domain->dirty_ops;
	int ret = 0;

	if (!ops)
		return -EOPNOTSUPP;

	down_read(&iopt->iova_rwsem);

	/* Clear dirty bits from PTEs to ensure a clean snapshot */
	if (enable) {
		ret = iopt_clear_dirty_data(iopt, domain);
		if (ret)
			goto out_unlock;
	}

	ret = ops->set_dirty_tracking(domain, enable);

out_unlock:
	up_read(&iopt->iova_rwsem);
	return ret;
}

int iopt_get_pages(struct io_pagetable *iopt, unsigned long iova,
		   unsigned long length, struct list_head *pages_list)
{
	struct iopt_area_contig_iter iter;
	unsigned long last_iova;
	struct iopt_area *area;
	int rc;

	if (!length)
		return -EINVAL;
	if (check_add_overflow(iova, length - 1, &last_iova))
		return -EOVERFLOW;

	down_read(&iopt->iova_rwsem);
	iopt_for_each_contig_area(&iter, area, iopt, iova, last_iova) {
		struct iopt_pages_list *elm;
		unsigned long last = min(last_iova, iopt_area_last_iova(area));

		elm = kzalloc(sizeof(*elm), GFP_KERNEL_ACCOUNT);
		if (!elm) {
			rc = -ENOMEM;
			goto err_free;
		}
		elm->start_byte = iopt_area_start_byte(area, iter.cur_iova);
		elm->pages = area->pages;
		elm->length = (last - iter.cur_iova) + 1;
		kref_get(&elm->pages->kref);
		list_add_tail(&elm->next, pages_list);
	}
	if (!iopt_area_contig_done(&iter)) {
		rc = -ENOENT;
		goto err_free;
	}
	up_read(&iopt->iova_rwsem);
	return 0;
err_free:
	up_read(&iopt->iova_rwsem);
	iopt_free_pages_list(pages_list);
	return rc;
}

static int iopt_unmap_iova_range(struct io_pagetable *iopt, unsigned long start,
				 unsigned long last, unsigned long *unmapped)
{
	struct iopt_area *area;
	unsigned long unmapped_bytes = 0;
	unsigned int tries = 0;
	int rc = -ENOENT;

	/*
	 * The domains_rwsem must be held in read mode any time any area->pages
	 * is NULL. This prevents domain attach/detach from running
	 * concurrently with cleaning up the area.
	 */
again:
	down_read(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	while ((area = iopt_area_iter_first(iopt, start, last))) {
		unsigned long area_last = iopt_area_last_iova(area);
		unsigned long area_first = iopt_area_iova(area);
		struct iopt_pages *pages;

		/* Userspace should not race map/unmaps of the same area */
		if (!area->pages) {
			rc = -EBUSY;
			goto out_unlock_iova;
		}

		if (area_first < start || area_last > last) {
			rc = -ENOENT;
			goto out_unlock_iova;
		}

		if (area_first != start)
			tries = 0;

		/*
		 * num_accesses writers must hold the iova_rwsem too, so we can
		 * safely read it under the write side of the iova_rwsem
		 * without the pages->mutex.
		 */
		if (area->num_accesses) {
			size_t length = iopt_area_length(area);

			start = area_first;
			area->prevent_access = true;
			up_write(&iopt->iova_rwsem);
			up_read(&iopt->domains_rwsem);

			iommufd_access_notify_unmap(iopt, area_first, length);
			/* Something is not responding to unmap requests. */
			tries++;
			if (WARN_ON(tries > 100))
				return -EDEADLOCK;
			goto again;
		}

		pages = area->pages;
		area->pages = NULL;
		up_write(&iopt->iova_rwsem);

		iopt_area_unfill_domains(area, pages);
		iopt_abort_area(area);
		iopt_put_pages(pages);

		unmapped_bytes += area_last - area_first + 1;

		down_write(&iopt->iova_rwsem);
	}
	if (unmapped_bytes)
		rc = 0;

out_unlock_iova:
	up_write(&iopt->iova_rwsem);
	up_read(&iopt->domains_rwsem);
	if (unmapped)
		*unmapped = unmapped_bytes;
	return rc;
}

/**
 * iopt_unmap_iova() - Remove a range of iova
 * @iopt: io_pagetable to act on
 * @iova: Starting iova to unmap
 * @length: Number of bytes to unmap
 * @unmapped: Return number of bytes unmapped
 *
 * The requested range must be a superset of existing ranges.
 * Splitting/truncating IOVA mappings is not allowed.
 */
int iopt_unmap_iova(struct io_pagetable *iopt, unsigned long iova,
		    unsigned long length, unsigned long *unmapped)
{
	unsigned long iova_last;

	if (!length)
		return -EINVAL;

	if (check_add_overflow(iova, length - 1, &iova_last))
		return -EOVERFLOW;

	return iopt_unmap_iova_range(iopt, iova, iova_last, unmapped);
}

int iopt_unmap_all(struct io_pagetable *iopt, unsigned long *unmapped)
{
	int rc;

	rc = iopt_unmap_iova_range(iopt, 0, ULONG_MAX, unmapped);
	/* If the IOVAs are empty then unmap all succeeds */
	if (rc == -ENOENT)
		return 0;
	return rc;
}

/* The caller must always free all the nodes in the allowed_iova rb_root. */
int iopt_set_allow_iova(struct io_pagetable *iopt,
			struct rb_root_cached *allowed_iova)
{
	struct iopt_allowed *allowed;

	down_write(&iopt->iova_rwsem);
	swap(*allowed_iova, iopt->allowed_itree);

	for (allowed = iopt_allowed_iter_first(iopt, 0, ULONG_MAX); allowed;
	     allowed = iopt_allowed_iter_next(allowed, 0, ULONG_MAX)) {
		if (iopt_reserved_iter_first(iopt, allowed->node.start,
					     allowed->node.last)) {
			swap(*allowed_iova, iopt->allowed_itree);
			up_write(&iopt->iova_rwsem);
			return -EADDRINUSE;
		}
	}
	up_write(&iopt->iova_rwsem);
	return 0;
}

int iopt_reserve_iova(struct io_pagetable *iopt, unsigned long start,
		      unsigned long last, void *owner)
{
	struct iopt_reserved *reserved;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iopt_area_iter_first(iopt, start, last) ||
	    iopt_allowed_iter_first(iopt, start, last))
		return -EADDRINUSE;

	reserved = kzalloc(sizeof(*reserved), GFP_KERNEL_ACCOUNT);
	if (!reserved)
		return -ENOMEM;
	reserved->node.start = start;
	reserved->node.last = last;
	reserved->owner = owner;
	interval_tree_insert(&reserved->node, &iopt->reserved_itree);
	return 0;
}

static void __iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	struct iopt_reserved *reserved, *next;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	for (reserved = iopt_reserved_iter_first(iopt, 0, ULONG_MAX); reserved;
	     reserved = next) {
		next = iopt_reserved_iter_next(reserved, 0, ULONG_MAX);

		if (reserved->owner == owner) {
			interval_tree_remove(&reserved->node,
					     &iopt->reserved_itree);
			kfree(reserved);
		}
	}
}

void iopt_remove_reserved_iova(struct io_pagetable *iopt, void *owner)
{
	down_write(&iopt->iova_rwsem);
	__iopt_remove_reserved_iova(iopt, owner);
	up_write(&iopt->iova_rwsem);
}

void iopt_init_table(struct io_pagetable *iopt)
{
	init_rwsem(&iopt->iova_rwsem);
	init_rwsem(&iopt->domains_rwsem);
	iopt->area_itree = RB_ROOT_CACHED;
	iopt->allowed_itree = RB_ROOT_CACHED;
	iopt->reserved_itree = RB_ROOT_CACHED;
	xa_init_flags(&iopt->domains, XA_FLAGS_ACCOUNT);
	xa_init_flags(&iopt->access_list, XA_FLAGS_ALLOC);

	/*
	 * iopts start as SW tables that can use the entire size_t IOVA space
	 * due to the use of size_t in the APIs. They have no alignment
	 * restriction.
	 */
	iopt->iova_alignment = 1;
}

void iopt_destroy_table(struct io_pagetable *iopt)
{
	struct interval_tree_node *node;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
		iopt_remove_reserved_iova(iopt, NULL);

	while ((node = interval_tree_iter_first(&iopt->allowed_itree, 0,
						ULONG_MAX))) {
		interval_tree_remove(node, &iopt->allowed_itree);
		kfree(container_of(node, struct iopt_allowed, node));
	}

	WARN_ON(!RB_EMPTY_ROOT(&iopt->reserved_itree.rb_root));
	WARN_ON(!xa_empty(&iopt->domains));
	WARN_ON(!xa_empty(&iopt->access_list));
	WARN_ON(!RB_EMPTY_ROOT(&iopt->area_itree.rb_root));
}

/**
 * iopt_unfill_domain() - Unfill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to unfill
 *
 * This is used when removing a domain from the iopt. Every area in the iopt
 * will be unmapped from the domain. The domain must already be removed from the
 * domains xarray.
 */
static void iopt_unfill_domain(struct io_pagetable *iopt,
			       struct iommu_domain *domain)
{
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	/*
	 * Some other domain is holding all the pfns still, so rapidly unmap
	 * this domain.
	 */
	if (iopt->next_domain_id != 0) {
		/* Pick an arbitrary remaining domain to act as storage */
		struct iommu_domain *storage_domain =
			xa_load(&iopt->domains, 0);

		for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
		     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
			struct iopt_pages *pages = area->pages;

			if (!pages)
				continue;

			mutex_lock(&pages->mutex);
			if (IS_ENABLED(CONFIG_IOMMUFD_TEST))
				WARN_ON(!area->storage_domain);
			if (area->storage_domain == domain)
				area->storage_domain = storage_domain;
			mutex_unlock(&pages->mutex);

			iopt_area_unmap_domain(area, domain);
		}
		return;
	}

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		WARN_ON(area->storage_domain != domain);
		area->storage_domain = NULL;
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
}

/**
 * iopt_fill_domain() - Fill a domain with PFNs
 * @iopt: io_pagetable to act on
 * @domain: domain to fill
 *
 * Fill the domain with PFNs from every area in the iopt. On failure the domain
 * is left unchanged.
 */
static int iopt_fill_domain(struct io_pagetable *iopt,
			    struct iommu_domain *domain)
{
	struct iopt_area *end_area;
	struct iopt_area *area;
	int rc;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held_write(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (!pages)
			continue;

		mutex_lock(&pages->mutex);
		rc = iopt_area_fill_domain(area, domain);
		if (rc) {
			mutex_unlock(&pages->mutex);
			goto out_unfill;
		}
		if (!area->storage_domain) {
			WARN_ON(iopt->next_domain_id != 0);
			area->storage_domain = domain;
			interval_tree_insert(&area->pages_node,
					     &pages->domains_itree);
		}
		mutex_unlock(&pages->mutex);
	}
	return 0;

out_unfill:
	end_area = area;
	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX)) {
		struct iopt_pages *pages = area->pages;

		if (area == end_area)
			break;
		if (!pages)
			continue;
		mutex_lock(&pages->mutex);
		if (iopt->next_domain_id == 0) {
			interval_tree_remove(&area->pages_node,
					     &pages->domains_itree);
			area->storage_domain = NULL;
		}
		iopt_area_unfill_domain(area, pages, domain);
		mutex_unlock(&pages->mutex);
	}
	return rc;
}

/* All existing areas must conform to an increased page size */
static int iopt_check_iova_alignment(struct io_pagetable *iopt,
				     unsigned long new_iova_alignment)
{
	unsigned long align_mask = new_iova_alignment - 1;
	struct iopt_area *area;

	lockdep_assert_held(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	for (area = iopt_area_iter_first(iopt, 0, ULONG_MAX); area;
	     area = iopt_area_iter_next(area, 0, ULONG_MAX))
		if ((iopt_area_iova(area) & align_mask) ||
		    (iopt_area_length(area) & align_mask) ||
		    (area->page_offset & align_mask))
			return -EADDRINUSE;

	if (IS_ENABLED(CONFIG_IOMMUFD_TEST)) {
		struct iommufd_access *access;
		unsigned long index;

		xa_for_each(&iopt->access_list, index, access)
			if (WARN_ON(access->iova_alignment >
				    new_iova_alignment))
				return -EADDRINUSE;
	}
	return 0;
}

int iopt_table_add_domain(struct io_pagetable *iopt,
			  struct iommu_domain *domain)
{
	const struct iommu_domain_geometry *geometry = &domain->geometry;
	struct iommu_domain *iter_domain;
	unsigned int new_iova_alignment;
	unsigned long index;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain) {
		if (WARN_ON(iter_domain == domain)) {
			rc = -EEXIST;
			goto out_unlock;
		}
	}

	/*
	 * The io page size drives the iova_alignment. Internally the iopt_pages
	 * works in PAGE_SIZE units and we adjust when mapping sub-PAGE_SIZE
	 * objects into the iommu_domain.
	 *
	 * An iommu_domain must always be able to accept PAGE_SIZE to be
	 * compatible as we can't guarantee higher contiguity.
	 */
	new_iova_alignment = max_t(unsigned long,
				   1UL << __ffs(domain->pgsize_bitmap),
				   iopt->iova_alignment);
	if (new_iova_alignment > PAGE_SIZE) {
		rc = -EINVAL;
		goto out_unlock;
	}
	if (new_iova_alignment != iopt->iova_alignment) {
		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			goto out_unlock;
	}

	/* No area exists that is outside the allowed domain aperture */
	if (geometry->aperture_start != 0) {
		rc = iopt_reserve_iova(iopt, 0, geometry->aperture_start - 1,
				       domain);
		if (rc)
			goto out_reserved;
	}
	if (geometry->aperture_end != ULONG_MAX) {
		rc = iopt_reserve_iova(iopt, geometry->aperture_end + 1,
				       ULONG_MAX, domain);
		if (rc)
			goto out_reserved;
	}

	rc = xa_reserve(&iopt->domains, iopt->next_domain_id, GFP_KERNEL);
	if (rc)
		goto out_reserved;

	rc = iopt_fill_domain(iopt, domain);
	if (rc)
		goto out_release;

	iopt->iova_alignment = new_iova_alignment;
	xa_store(&iopt->domains, iopt->next_domain_id, domain, GFP_KERNEL);
	iopt->next_domain_id++;
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return 0;
out_release:
	xa_release(&iopt->domains, iopt->next_domain_id);
out_reserved:
	__iopt_remove_reserved_iova(iopt, domain);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

static int iopt_calculate_iova_alignment(struct io_pagetable *iopt)
{
	unsigned long new_iova_alignment;
	struct iommufd_access *access;
	struct iommu_domain *domain;
	unsigned long index;

	lockdep_assert_held_write(&iopt->iova_rwsem);
	lockdep_assert_held(&iopt->domains_rwsem);

	/* See batch_iommu_map_small() */
	if (iopt->disable_large_pages)
		new_iova_alignment = PAGE_SIZE;
	else
		new_iova_alignment = 1;

	xa_for_each(&iopt->domains, index, domain)
		new_iova_alignment = max_t(unsigned long,
					   1UL << __ffs(domain->pgsize_bitmap),
					   new_iova_alignment);
	xa_for_each(&iopt->access_list, index, access)
		new_iova_alignment = max_t(unsigned long,
					   access->iova_alignment,
					   new_iova_alignment);

	if (new_iova_alignment > iopt->iova_alignment) {
		int rc;

		rc = iopt_check_iova_alignment(iopt, new_iova_alignment);
		if (rc)
			return rc;
	}
	iopt->iova_alignment = new_iova_alignment;
	return 0;
}

void iopt_table_remove_domain(struct io_pagetable *iopt,
			      struct iommu_domain *domain)
{
	struct iommu_domain *iter_domain = NULL;
	unsigned long index;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);

	xa_for_each(&iopt->domains, index, iter_domain)
		if (iter_domain == domain)
			break;
	if (WARN_ON(iter_domain != domain) || index >= iopt->next_domain_id)
		goto out_unlock;

	/*
	 * Compress the xarray to keep it linear by swapping the entry to erase
	 * with the tail entry and shrinking the tail.
	 */
	iopt->next_domain_id--;
	iter_domain = xa_erase(&iopt->domains, iopt->next_domain_id);
	if (index != iopt->next_domain_id)
		xa_store(&iopt->domains, index, iter_domain, GFP_KERNEL);

	iopt_unfill_domain(iopt, domain);
	__iopt_remove_reserved_iova(iopt, domain);

	WARN_ON(iopt_calculate_iova_alignment(iopt));
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}
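
/*
 * Worked example of the compression above (illustrative): with domains at
 * indexes {0, 1, 2} and next_domain_id == 3, removing the domain at index 1
 * decrements next_domain_id to 2, erases the tail entry at index 2, and
 * stores it at index 1, leaving a dense {0, 1}.
 */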

/**
 * iopt_area_split() - Split an area into two parts at iova
 * @area: The area to split
 * @iova: Becomes the last of a new area
 *
 * This splits an area into two. It is part of the VFIO compatibility to allow
 * poking a hole in the mapping. The two areas continue to point at the same
 * iopt_pages, just with different starting bytes.
 */
static int iopt_area_split(struct iopt_area *area, unsigned long iova)
{
	unsigned long alignment = area->iopt->iova_alignment;
	unsigned long last_iova = iopt_area_last_iova(area);
	unsigned long start_iova = iopt_area_iova(area);
	unsigned long new_start = iova + 1;
	struct io_pagetable *iopt = area->iopt;
	struct iopt_pages *pages = area->pages;
	struct iopt_area *lhs;
	struct iopt_area *rhs;
	int rc;

	lockdep_assert_held_write(&iopt->iova_rwsem);

	if (iova == start_iova || iova == last_iova)
		return 0;

	if (!pages || area->prevent_access)
		return -EBUSY;

	if (new_start & (alignment - 1) ||
	    iopt_area_start_byte(area, new_start) & (alignment - 1))
		return -EINVAL;

	lhs = iopt_area_alloc();
	if (!lhs)
		return -ENOMEM;

	rhs = iopt_area_alloc();
	if (!rhs) {
		rc = -ENOMEM;
		goto err_free_lhs;
	}

	mutex_lock(&pages->mutex);
	/*
	 * Splitting is not permitted if an access exists; we don't track
	 * enough information to split existing accesses.
	 */
	if (area->num_accesses) {
		rc = -EINVAL;
		goto err_unlock;
	}

	/*
	 * Splitting is not permitted if a domain could have been mapped with
	 * huge pages.
	 */
	if (area->storage_domain && !iopt->disable_large_pages) {
		rc = -EINVAL;
		goto err_unlock;
	}

	interval_tree_remove(&area->node, &iopt->area_itree);
	rc = iopt_insert_area(iopt, lhs, area->pages, start_iova,
			      iopt_area_start_byte(area, start_iova),
			      (new_start - 1) - start_iova + 1,
			      area->iommu_prot);
	if (WARN_ON(rc))
		goto err_insert;

	rc = iopt_insert_area(iopt, rhs, area->pages, new_start,
			      iopt_area_start_byte(area, new_start),
			      last_iova - new_start + 1, area->iommu_prot);
	if (WARN_ON(rc))
		goto err_remove_lhs;

	/*
	 * If the original area has filled a domain, domains_itree has to be
	 * updated.
	 */
	if (area->storage_domain) {
		interval_tree_remove(&area->pages_node, &pages->domains_itree);
		interval_tree_insert(&lhs->pages_node, &pages->domains_itree);
		interval_tree_insert(&rhs->pages_node, &pages->domains_itree);
	}

	lhs->storage_domain = area->storage_domain;
	lhs->pages = area->pages;
	rhs->storage_domain = area->storage_domain;
	rhs->pages = area->pages;
	kref_get(&rhs->pages->kref);
	kfree(area);
	mutex_unlock(&pages->mutex);

	/*
	 * No change to domains or accesses because the underlying iopt_pages
	 * hasn't changed.
	 */
	return 0;

err_remove_lhs:
	interval_tree_remove(&lhs->node, &iopt->area_itree);
err_insert:
	interval_tree_insert(&area->node, &iopt->area_itree);
err_unlock:
	mutex_unlock(&pages->mutex);
	kfree(rhs);
err_free_lhs:
	kfree(lhs);
	return rc;
}
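
/*
 * Sketch of the split (illustrative): an area covering IOVA [A, B] split at
 * @iova becomes lhs covering [A, iova] and rhs covering [iova + 1, B], both
 * referencing the same iopt_pages at shifted start bytes.
 */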

int iopt_cut_iova(struct io_pagetable *iopt, unsigned long *iovas,
		  size_t num_iovas)
{
	int rc = 0;
	int i;

	down_write(&iopt->iova_rwsem);
	for (i = 0; i < num_iovas; i++) {
		struct iopt_area *area;

		area = iopt_area_iter_first(iopt, iovas[i], iovas[i]);
		if (!area)
			continue;
		rc = iopt_area_split(area, iovas[i]);
		if (rc)
			break;
	}
	up_write(&iopt->iova_rwsem);
	return rc;
}

void iopt_enable_large_pages(struct io_pagetable *iopt)
{
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WRITE_ONCE(iopt->disable_large_pages, false);
	rc = iopt_calculate_iova_alignment(iopt);
	WARN_ON(rc);
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

int iopt_disable_large_pages(struct io_pagetable *iopt)
{
	int rc = 0;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	if (iopt->disable_large_pages)
		goto out_unlock;

	/* Won't do it if domains already have pages mapped in them */
	if (!xa_empty(&iopt->domains) &&
	    !RB_EMPTY_ROOT(&iopt->area_itree.rb_root)) {
		rc = -EINVAL;
		goto out_unlock;
	}

	WRITE_ONCE(iopt->disable_large_pages, true);
	rc = iopt_calculate_iova_alignment(iopt);
	if (rc)
		WRITE_ONCE(iopt->disable_large_pages, false);
out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

int iopt_add_access(struct io_pagetable *iopt, struct iommufd_access *access)
{
	u32 new_id;
	int rc;

	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	rc = xa_alloc(&iopt->access_list, &new_id, access, xa_limit_16b,
		      GFP_KERNEL_ACCOUNT);
	if (rc)
		goto out_unlock;

	rc = iopt_calculate_iova_alignment(iopt);
	if (rc) {
		xa_erase(&iopt->access_list, new_id);
		goto out_unlock;
	}
	access->iopt_access_list_id = new_id;

out_unlock:
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
	return rc;
}

void iopt_remove_access(struct io_pagetable *iopt,
			struct iommufd_access *access,
			u32 iopt_access_list_id)
{
	down_write(&iopt->domains_rwsem);
	down_write(&iopt->iova_rwsem);
	WARN_ON(xa_erase(&iopt->access_list, iopt_access_list_id) != access);
	WARN_ON(iopt_calculate_iova_alignment(iopt));
	up_write(&iopt->iova_rwsem);
	up_write(&iopt->domains_rwsem);
}

/* Narrow the valid_iova_itree to include reserved ranges from a device. */
int iopt_table_enforce_dev_resv_regions(struct io_pagetable *iopt,
					struct device *dev,
					phys_addr_t *sw_msi_start)
{
	struct iommu_resv_region *resv;
	LIST_HEAD(resv_regions);
	unsigned int num_hw_msi = 0;
	unsigned int num_sw_msi = 0;
	int rc;

	if (iommufd_should_fail())
		return -EINVAL;

	down_write(&iopt->iova_rwsem);
	/* FIXME: drivers allocate memory but there is no failure propagated */
	iommu_get_resv_regions(dev, &resv_regions);

	list_for_each_entry(resv, &resv_regions, list) {
		if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
			continue;

		if (sw_msi_start && resv->type == IOMMU_RESV_MSI)
			num_hw_msi++;
		if (sw_msi_start && resv->type == IOMMU_RESV_SW_MSI) {
			*sw_msi_start = resv->start;
			num_sw_msi++;
		}

		rc = iopt_reserve_iova(iopt, resv->start,
				       resv->length - 1 + resv->start, dev);
		if (rc)
			goto out_reserved;
	}

	/* Drivers must offer sane combinations of regions */
	if (WARN_ON(num_sw_msi && num_hw_msi) || WARN_ON(num_sw_msi > 1)) {
		rc = -EINVAL;
		goto out_reserved;
	}

	rc = 0;
	goto out_free_resv;

out_reserved:
	__iopt_remove_reserved_iova(iopt, dev);
out_free_resv:
	iommu_put_resv_regions(dev, &resv_regions);
	up_write(&iopt->iova_rwsem);
	return rc;
}