vfio_iommu_type1.c source code [linux/drivers/vfio/vfio_iommu_type1.c]

1	// SPDX-License-Identifier: GPL-2.0-only
2	/*
3	* VFIO: IOMMU DMA mapping support for Type1 IOMMU
4	*
5	* Copyright (C) 2012 Red Hat, Inc. All rights reserved.
6	* Author: Alex Williamson <alex.williamson@redhat.com>
7	*
8	* Derived from original vfio:
9	* Copyright 2010 Cisco Systems, Inc. All rights reserved.
10	* Author: Tom Lyon, pugs@cisco.com
11	*
12	* We arbitrarily define a Type1 IOMMU as one matching the below code.
13	* It could be called the x86 IOMMU as it's designed for AMD-Vi & Intel
14	* VT-d, but that makes it harder to re-use as theoretically anyone
15	* implementing a similar IOMMU could make use of this. We expect the
16	* IOMMU to support the IOMMU API and have few to no restrictions around
17	* the IOVA range that can be mapped. The Type1 IOMMU is currently
18	* optimized for relatively static mappings of a userspace process with
19	* userspace pages pinned into memory. We also assume devices and IOMMU
20	* domains are PCI based as the IOMMU API is still centered around a
21	* device/bus interface rather than a group interface.
22	*/
23
24	#include <linux/compat.h>
25	#include <linux/device.h>
26	#include <linux/fs.h>
27	#include <linux/highmem.h>
28	#include <linux/iommu.h>
29	#include <linux/module.h>
30	#include <linux/mm.h>
31	#include <linux/kthread.h>
32	#include <linux/rbtree.h>
33	#include <linux/sched/signal.h>
34	#include <linux/sched/mm.h>
35	#include <linux/slab.h>
36	#include <linux/uaccess.h>
37	#include <linux/vfio.h>
38	#include <linux/workqueue.h>
39	#include <linux/notifier.h>
40	#include "vfio.h"
41
42	#define DRIVER_VERSION "0.2"
43	#define DRIVER_AUTHOR "Alex Williamson <alex.williamson@redhat.com>"
44	#define DRIVER_DESC "Type1 IOMMU driver for VFIO"
45
46	static bool allow_unsafe_interrupts;
47	module_param_named(allow_unsafe_interrupts,
48	allow_unsafe_interrupts, bool, S_IRUGO \| S_IWUSR);
49	MODULE_PARM_DESC(allow_unsafe_interrupts,
50	"Enable VFIO IOMMU support for on platforms without interrupt remapping support.");
51
52	static bool disable_hugepages;
53	module_param_named(disable_hugepages,
54	disable_hugepages, bool, S_IRUGO \| S_IWUSR);
55	MODULE_PARM_DESC(disable_hugepages,
56	"Disable VFIO IOMMU support for IOMMU hugepages.");
57
58	static unsigned int dma_entry_limit __read_mostly = U16_MAX;
59	module_param_named(dma_entry_limit, dma_entry_limit, uint, `0644`);
60	MODULE_PARM_DESC(dma_entry_limit,
61	"Maximum number of user DMA mappings per container (65535).");
62
63	struct vfio_iommu {
64	struct list_head domain_list;
65	struct list_head iova_list;
66	struct mutex lock;
67	struct rb_root dma_list;
68	struct list_head device_list;
69	struct mutex device_list_lock;
70	unsigned int dma_avail;
71	unsigned int vaddr_invalid_count;
72	uint64_t pgsize_bitmap;
73	uint64_t num_non_pinned_groups;
74	bool v2;
75	bool nesting;
76	bool dirty_page_tracking;
77	struct list_head emulated_iommu_groups;
78	};
79
80	struct vfio_domain {
81	struct iommu_domain *domain;
82	struct list_head next;
83	struct list_head group_list;
84	bool fgsp : `1`; / Fine-grained super pages /
85	bool enforce_cache_coherency : `1`;
86	};
87
88	struct vfio_dma {
89	struct rb_node node;
90	dma_addr_t iova; / Device address /
91	unsigned long vaddr; / Process virtual addr /
92	size_t size; / Map size (bytes) /
93	int prot; / IOMMU_READ/WRITE /
94	bool iommu_mapped;
95	bool lock_cap; / capable(CAP_IPC_LOCK) /
96	bool vaddr_invalid;
97	struct task_struct *task;
98	struct rb_root pfn_list; / Ex-user pinned pfn list /
99	unsigned long *bitmap;
100	struct mm_struct *mm;
101	size_t locked_vm;
102	};
103
/* A batch of pinned pages handed from vaddr_get_pfns() to the pin loop. */
struct vfio_batch {
	struct page		**pages;	/* for pin_user_pages_remote */
	struct page		*fallback_page; /* if pages alloc fails */
	int			capacity;	/* length of pages array */
	int			size;		/* of batch currently */
	int			offset;		/* of next entry in pages */
};
111
112	struct vfio_iommu_group {
113	struct iommu_group *iommu_group;
114	struct list_head next;
115	bool pinned_page_dirty_scope;
116	};
117
118	struct vfio_iova {
119	struct list_head list;
120	dma_addr_t start;
121	dma_addr_t end;
122	};
123
124	/*
125	* Guest RAM pinning working set or DMA target
126	*/
127	struct vfio_pfn {
128	struct rb_node node;
129	dma_addr_t iova; / Device address /
130	unsigned long pfn; / Host pfn /
131	unsigned int ref_count;
132	};
133
134	struct vfio_regions {
135	struct list_head list;
136	dma_addr_t iova;
137	phys_addr_t phys;
138	size_t len;
139	};
140
/* Bytes needed for an n-bit bitmap, rounded up to a whole number of u64s. */
#define DIRTY_BITMAP_BYTES(n)	(ALIGN(n, BITS_PER_TYPE(u64)) / BITS_PER_BYTE)

/*
 * Input argument of number of bits to bitmap_set() is unsigned integer, which
 * further casts to signed integer for unaligned multi-bit operation,
 * __bitmap_set().
 * Then maximum bitmap size supported is 2^31 bits divided by 2^3 bits/byte,
 * that is 2^28 (256 MB) which maps to 2^31 * 2^12 = 2^43 (8TB) on 4K page
 * system.
 */
#define DIRTY_BITMAP_PAGES_MAX	 ((u64)INT_MAX)
#define DIRTY_BITMAP_SIZE_MAX	 DIRTY_BITMAP_BYTES(DIRTY_BITMAP_PAGES_MAX)
153
/* Forward declarations for helpers used before their definitions. */
static int put_pfn(unsigned long pfn, int prot);

static struct vfio_iommu_group*
vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
			    struct iommu_group *iommu_group);
159
160	/*
161	* This code handles mapping and unmapping of user data buffers
162	* into DMA'ble space using the IOMMU
163	*/
164
165	static struct vfio_dma vfio_find_dma(struct* vfio_iommu *iommu,
166	dma_addr_t start, size_t size)
167	{
168	struct rb_node *node = iommu->dma_list.rb_node;
169
170	while (node) {
171	struct vfio_dma dma = rb_entry(node, struct* vfio_dma, node);
172
173	if (start + size <= dma->iova)
174	node = node->rb_left;
175	else if (start >= dma->iova + dma->size)
176	node = node->rb_right;
177	else
178	return dma;
179	}
180
181	return NULL;
182	}
183
184	static struct rb_node vfio_find_dma_first_node(struct* vfio_iommu *iommu,
185	dma_addr_t start, u64 size)
186	{
187	struct rb_node *res = NULL;
188	struct rb_node *node = iommu->dma_list.rb_node;
189	struct vfio_dma *dma_res = NULL;
190
191	while (node) {
192	struct vfio_dma dma = rb_entry(node, struct* vfio_dma, node);
193
194	if (start < dma->iova + dma->size) {
195	res = node;
196	dma_res = dma;
197	if (start >= dma->iova)
198	break;
199	node = node->rb_left;
200	} else {
201	node = node->rb_right;
202	}
203	}
204	if (res && size && dma_res->iova >= start + size)
205	res = NULL;
206	return res;
207	}
208
209	static void vfio_link_dma(struct vfio_iommu iommu, struct* vfio_dma *new)
210	{
211	struct rb_node *link = &iommu->dma_list.rb_node, parent = NULL;
212	struct vfio_dma *dma;
213
214	while (*link) {
215	parent = *link;
216	dma = rb_entry(parent, struct vfio_dma, node);
217
218	if (new->iova + new->size <= dma->iova)
219	link = &(*link)->rb_left;
220	else
221	link = &(*link)->rb_right;
222	}
223
224	rb_link_node(node: &new->node, parent, rb_link: link);
225	rb_insert_color(&new->node, &iommu->dma_list);
226	}
227
228	static void vfio_unlink_dma(struct vfio_iommu iommu, struct* vfio_dma *old)
229	{
230	rb_erase(&old->node, &iommu->dma_list);
231	}
232
233
234	static int vfio_dma_bitmap_alloc(struct vfio_dma *dma, size_t pgsize)
235	{
236	uint64_t npages = dma->size / pgsize;
237
238	if (npages > DIRTY_BITMAP_PAGES_MAX)
239	return -EINVAL;
240
241	/*
242	* Allocate extra 64 bits that are used to calculate shift required for
243	* bitmap_shift_left() to manipulate and club unaligned number of pages
244	* in adjacent vfio_dma ranges.
245	*/
246	dma->bitmap = kvzalloc(DIRTY_BITMAP_BYTES(npages) + sizeof(u64),
247	GFP_KERNEL);
248	if (!dma->bitmap)
249	return -ENOMEM;
250
251	return `0`;
252	}
253
254	static void vfio_dma_bitmap_free(struct vfio_dma *dma)
255	{
256	kvfree(addr: dma->bitmap);
257	dma->bitmap = NULL;
258	}
259
260	static void vfio_dma_populate_bitmap(struct vfio_dma *dma, size_t pgsize)
261	{
262	struct rb_node *p;
263	unsigned long pgshift = __ffs(pgsize);
264
265	for (p = rb_first(&dma->pfn_list); p; p = rb_next(p)) {
266	struct vfio_pfn vpfn = rb_entry(p, struct* vfio_pfn, node);
267
268	bitmap_set(map: dma->bitmap, start: (vpfn->iova - dma->iova) >> pgshift, nbits: `1`);
269	}
270	}
271
272	static void vfio_iommu_populate_bitmap_full(struct vfio_iommu *iommu)
273	{
274	struct rb_node *n;
275	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
276
277	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
278	struct vfio_dma dma = rb_entry(n, struct* vfio_dma, node);
279
280	bitmap_set(map: dma->bitmap, start: `0`, nbits: dma->size >> pgshift);
281	}
282	}
283
284	static int vfio_dma_bitmap_alloc_all(struct vfio_iommu *iommu, size_t pgsize)
285	{
286	struct rb_node *n;
287
288	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
289	struct vfio_dma dma = rb_entry(n, struct* vfio_dma, node);
290	int ret;
291
292	ret = vfio_dma_bitmap_alloc(dma, pgsize);
293	if (ret) {
294	struct rb_node *p;
295
296	for (p = rb_prev(n); p; p = rb_prev(p)) {
297	struct vfio_dma *dma = rb_entry(n,
298	struct vfio_dma, node);
299
300	vfio_dma_bitmap_free(dma);
301	}
302	return ret;
303	}
304	vfio_dma_populate_bitmap(dma, pgsize);
305	}
306	return `0`;
307	}
308
309	static void vfio_dma_bitmap_free_all(struct vfio_iommu *iommu)
310	{
311	struct rb_node *n;
312
313	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
314	struct vfio_dma dma = rb_entry(n, struct* vfio_dma, node);
315
316	vfio_dma_bitmap_free(dma);
317	}
318	}
319
320	/*
321	* Helper Functions for host iova-pfn list
322	*/
323	static struct vfio_pfn vfio_find_vpfn(struct* vfio_dma *dma, dma_addr_t iova)
324	{
325	struct vfio_pfn *vpfn;
326	struct rb_node *node = dma->pfn_list.rb_node;
327
328	while (node) {
329	vpfn = rb_entry(node, struct vfio_pfn, node);
330
331	if (iova < vpfn->iova)
332	node = node->rb_left;
333	else if (iova > vpfn->iova)
334	node = node->rb_right;
335	else
336	return vpfn;
337	}
338	return NULL;
339	}
340
341	static void vfio_link_pfn(struct vfio_dma *dma,
342	struct vfio_pfn *new)
343	{
344	struct rb_node *link, parent = NULL;
345	struct vfio_pfn *vpfn;
346
347	link = &dma->pfn_list.rb_node;
348	while (*link) {
349	parent = *link;
350	vpfn = rb_entry(parent, struct vfio_pfn, node);
351
352	if (new->iova < vpfn->iova)
353	link = &(*link)->rb_left;
354	else
355	link = &(*link)->rb_right;
356	}
357
358	rb_link_node(node: &new->node, parent, rb_link: link);
359	rb_insert_color(&new->node, &dma->pfn_list);
360	}
361
362	static void vfio_unlink_pfn(struct vfio_dma dma, struct* vfio_pfn *old)
363	{
364	rb_erase(&old->node, &dma->pfn_list);
365	}
366
367	static int vfio_add_to_pfn_list(struct vfio_dma *dma, dma_addr_t iova,
368	unsigned long pfn)
369	{
370	struct vfio_pfn *vpfn;
371
372	vpfn = kzalloc(size: sizeof(*vpfn), GFP_KERNEL);
373	if (!vpfn)
374	return -ENOMEM;
375
376	vpfn->iova = iova;
377	vpfn->pfn = pfn;
378	vpfn->ref_count = `1`;
379	vfio_link_pfn(dma, new: vpfn);
380	return `0`;
381	}
382
/* Unlink @vpfn from @dma's pfn_list and free it. */
static void vfio_remove_from_pfn_list(struct vfio_dma *dma,
				      struct vfio_pfn *vpfn)
{
	vfio_unlink_pfn(dma, vpfn);
	kfree(vpfn);
}
389
390	static struct vfio_pfn vfio_iova_get_vfio_pfn(struct* vfio_dma *dma,
391	unsigned long iova)
392	{
393	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
394
395	if (vpfn)
396	vpfn->ref_count++;
397	return vpfn;
398	}
399
400	static int vfio_iova_put_vfio_pfn(struct vfio_dma dma, struct* vfio_pfn *vpfn)
401	{
402	int ret = `0`;
403
404	vpfn->ref_count--;
405	if (!vpfn->ref_count) {
406	ret = put_pfn(pfn: vpfn->pfn, prot: dma->prot);
407	vfio_remove_from_pfn_list(dma, vpfn);
408	}
409	return ret;
410	}
411
412	static int mm_lock_acct(struct task_struct task, struct* mm_struct *mm,
413	bool lock_cap, long npage)
414	{
415	int ret = mmap_write_lock_killable(mm);
416
417	if (ret)
418	return ret;
419
420	ret = __account_locked_vm(mm, abs(npage), inc: npage > `0`, task, bypass_rlim: lock_cap);
421	mmap_write_unlock(mm);
422	return ret;
423	}
424
425	static int vfio_lock_acct(struct vfio_dma dma, long* npage, bool async)
426	{
427	struct mm_struct *mm;
428	int ret;
429
430	if (!npage)
431	return `0`;
432
433	mm = dma->mm;
434	if (async && !mmget_not_zero(mm))
435	return -ESRCH; / process exited /
436
437	ret = mm_lock_acct(task: dma->task, mm, lock_cap: dma->lock_cap, npage);
438	if (!ret)
439	dma->locked_vm += npage;
440
441	if (async)
442	mmput(mm);
443
444	return ret;
445	}
446
447	/*
448	* Some mappings aren't backed by a struct page, for example an mmap'd
449	* MMIO range for our own or another device. These use a different
450	* pfn conversion and shouldn't be tracked as locked pages.
451	* For compound pages, any driver that sets the reserved bit in head
452	* page needs to set the reserved bit in all subpages to be safe.
453	*/
454	static bool is_invalid_reserved_pfn(unsigned long pfn)
455	{
456	if (pfn_valid(pfn))
457	return PageReserved(pfn_to_page(pfn));
458
459	return true;
460	}
461
462	static int put_pfn(unsigned long pfn, int prot)
463	{
464	if (!is_invalid_reserved_pfn(pfn)) {
465	struct page *page = pfn_to_page(pfn);
466
467	unpin_user_pages_dirty_lock(pages: &page, npages: `1`, make_dirty: prot & IOMMU_WRITE);
468	return `1`;
469	}
470	return `0`;
471	}
472
473	#define VFIO_BATCH_MAX_CAPACITY (PAGE_SIZE / sizeof(struct page *))
474
475	static void vfio_batch_init(struct vfio_batch *batch)
476	{
477	batch->size = `0`;
478	batch->offset = `0`;
479
480	if (unlikely(disable_hugepages))
481	goto fallback;
482
483	batch->pages = (struct page **) __get_free_page(GFP_KERNEL);
484	if (!batch->pages)
485	goto fallback;
486
487	batch->capacity = VFIO_BATCH_MAX_CAPACITY;
488	return;
489
490	fallback:
491	batch->pages = &batch->fallback_page;
492	batch->capacity = `1`;
493	}
494
495	static void vfio_batch_unpin(struct vfio_batch batch, struct* vfio_dma *dma)
496	{
497	while (batch->size) {
498	unsigned long pfn = page_to_pfn(batch->pages[batch->offset]);
499
500	put_pfn(pfn, prot: dma->prot);
501	batch->offset++;
502	batch->size--;
503	}
504	}
505
506	static void vfio_batch_fini(struct vfio_batch *batch)
507	{
508	if (batch->capacity == VFIO_BATCH_MAX_CAPACITY)
509	free_page((unsigned long)batch->pages);
510	}
511
512	static int follow_fault_pfn(struct vm_area_struct vma, struct* mm_struct *mm,
513	unsigned long vaddr, unsigned long *pfn,
514	bool write_fault)
515	{
516	pte_t *ptep;
517	pte_t pte;
518	spinlock_t *ptl;
519	int ret;
520
521	ret = follow_pte(mm: vma->vm_mm, address: vaddr, ptepp: &ptep, ptlp: &ptl);
522	if (ret) {
523	bool unlocked = false;
524
525	ret = fixup_user_fault(mm, address: vaddr,
526	fault_flags: FAULT_FLAG_REMOTE \|
527	(write_fault ? FAULT_FLAG_WRITE : `0`),
528	unlocked: &unlocked);
529	if (unlocked)
530	return -EAGAIN;
531
532	if (ret)
533	return ret;
534
535	ret = follow_pte(mm: vma->vm_mm, address: vaddr, ptepp: &ptep, ptlp: &ptl);
536	if (ret)
537	return ret;
538	}
539
540	pte = ptep_get(ptep);
541
542	if (write_fault && !pte_write(pte))
543	ret = -EFAULT;
544	else
545	*pfn = pte_pfn(pte);
546
547	pte_unmap_unlock(ptep, ptl);
548	return ret;
549	}
550
551	/*
552	* Returns the positive number of pfns successfully obtained or a negative
553	* error code.
554	*/
555	static int vaddr_get_pfns(struct mm_struct mm, unsigned* long vaddr,
556	long npages, int prot, unsigned long *pfn,
557	struct page **pages)
558	{
559	struct vm_area_struct *vma;
560	unsigned int flags = `0`;
561	int ret;
562
563	if (prot & IOMMU_WRITE)
564	flags \|= FOLL_WRITE;
565
566	mmap_read_lock(mm);
567	ret = pin_user_pages_remote(mm, start: vaddr, nr_pages: npages, gup_flags: flags \| FOLL_LONGTERM,
568	pages, NULL);
569	if (ret > `0`) {
570	int i;
571
572	/*
573	* The zero page is always resident, we don't need to pin it
574	* and it falls into our invalid/reserved test so we don't
575	* unpin in put_pfn(). Unpin all zero pages in the batch here.
576	*/
577	for (i = `0` ; i < ret; i++) {
578	if (unlikely(is_zero_pfn(page_to_pfn(pages[i]))))
579	unpin_user_page(page: pages[i]);
580	}
581
582	*pfn = page_to_pfn(pages[`0`]);
583	goto done;
584	}
585
586	vaddr = untagged_addr_remote(mm, vaddr);
587
588	retry:
589	vma = vma_lookup(mm, addr: vaddr);
590
591	if (vma && vma->vm_flags & VM_PFNMAP) {
592	ret = follow_fault_pfn(vma, mm, vaddr, pfn, write_fault: prot & IOMMU_WRITE);
593	if (ret == -EAGAIN)
594	goto retry;
595
596	if (!ret) {
597	if (is_invalid_reserved_pfn(pfn: *pfn))
598	ret = `1`;
599	else
600	ret = -EFAULT;
601	}
602	}
603	done:
604	mmap_read_unlock(mm);
605	return ret;
606	}
607
608	/*
609	* Attempt to pin pages. We really don't want to track all the pfns and
610	* the iommu can only map chunks of consecutive pfns anyway, so get the
611	* first page and all consecutive pages with the same locking.
612	*/
613	static long vfio_pin_pages_remote(struct vfio_dma dma, unsigned* long vaddr,
614	long npage, unsigned long *pfn_base,
615	unsigned long limit, struct vfio_batch *batch)
616	{
617	unsigned long pfn;
618	struct mm_struct *mm = current->mm;
619	long ret, pinned = `0`, lock_acct = `0`;
620	bool rsvd;
621	dma_addr_t iova = vaddr - dma->vaddr + dma->iova;
622
623	/ This code path is only user initiated /
624	if (!mm)
625	return -ENODEV;
626
627	if (batch->size) {
628	/ Leftover pages in batch from an earlier call. /
629	*pfn_base = page_to_pfn(batch->pages[batch->offset]);
630	pfn = *pfn_base;
631	rsvd = is_invalid_reserved_pfn(pfn: *pfn_base);
632	} else {
633	*pfn_base = `0`;
634	}
635
636	while (npage) {
637	if (!batch->size) {
638	/ Empty batch, so refill it. /
639	long req_pages = min_t(long, npage, batch->capacity);
640
641	ret = vaddr_get_pfns(mm, vaddr, npages: req_pages, prot: dma->prot,
642	pfn: &pfn, pages: batch->pages);
643	if (ret < `0`)
644	goto unpin_out;
645
646	batch->size = ret;
647	batch->offset = `0`;
648
649	if (!*pfn_base) {
650	*pfn_base = pfn;
651	rsvd = is_invalid_reserved_pfn(pfn: *pfn_base);
652	}
653	}
654
655	/*
656	* pfn is preset for the first iteration of this inner loop and
657	* updated at the end to handle a VM_PFNMAP pfn. In that case,
658	* batch->pages isn't valid (there's no struct page), so allow
659	* batch->pages to be touched only when there's more than one
660	* pfn to check, which guarantees the pfns are from a
661	* !VM_PFNMAP vma.
662	*/
663	while (true) {
664	if (pfn != *pfn_base + pinned \|\|
665	rsvd != is_invalid_reserved_pfn(pfn))
666	goto out;
667
668	/*
669	* Reserved pages aren't counted against the user,
670	* externally pinned pages are already counted against
671	* the user.
672	*/
673	if (!rsvd && !vfio_find_vpfn(dma, iova)) {
674	if (!dma->lock_cap &&
675	mm->locked_vm + lock_acct + `1` > limit) {
676	pr_warn("%s: RLIMIT_MEMLOCK (%ld) exceeded\n",
677	__func__, limit << PAGE_SHIFT);
678	ret = -ENOMEM;
679	goto unpin_out;
680	}
681	lock_acct++;
682	}
683
684	pinned++;
685	npage--;
686	vaddr += PAGE_SIZE;
687	iova += PAGE_SIZE;
688	batch->offset++;
689	batch->size--;
690
691	if (!batch->size)
692	break;
693
694	pfn = page_to_pfn(batch->pages[batch->offset]);
695	}
696
697	if (unlikely(disable_hugepages))
698	break;
699	}
700
701	out:
702	ret = vfio_lock_acct(dma, npage: lock_acct, async: false);
703
704	unpin_out:
705	if (batch->size == `1` && !batch->offset) {
706	/ May be a VM_PFNMAP pfn, which the batch can't remember. /
707	put_pfn(pfn, prot: dma->prot);
708	batch->size = `0`;
709	}
710
711	if (ret < `0`) {
712	if (pinned && !rsvd) {
713	for (pfn = *pfn_base ; pinned ; pfn++, pinned--)
714	put_pfn(pfn, prot: dma->prot);
715	}
716	vfio_batch_unpin(batch, dma);
717
718	return ret;
719	}
720
721	return pinned;
722	}
723
724	static long vfio_unpin_pages_remote(struct vfio_dma *dma, dma_addr_t iova,
725	unsigned long pfn, long npage,
726	bool do_accounting)
727	{
728	long unlocked = `0`, locked = `0`;
729	long i;
730
731	for (i = `0`; i < npage; i++, iova += PAGE_SIZE) {
732	if (put_pfn(pfn: pfn++, prot: dma->prot)) {
733	unlocked++;
734	if (vfio_find_vpfn(dma, iova))
735	locked++;
736	}
737	}
738
739	if (do_accounting)
740	vfio_lock_acct(dma, npage: locked - unlocked, async: true);
741
742	return unlocked;
743	}
744
745	static int vfio_pin_page_external(struct vfio_dma dma, unsigned* long vaddr,
746	unsigned long *pfn_base, bool do_accounting)
747	{
748	struct page *pages[`1`];
749	struct mm_struct *mm;
750	int ret;
751
752	mm = dma->mm;
753	if (!mmget_not_zero(mm))
754	return -ENODEV;
755
756	ret = vaddr_get_pfns(mm, vaddr, npages: `1`, prot: dma->prot, pfn: pfn_base, pages);
757	if (ret != `1`)
758	goto out;
759
760	ret = `0`;
761
762	if (do_accounting && !is_invalid_reserved_pfn(pfn: *pfn_base)) {
763	ret = vfio_lock_acct(dma, npage: `1`, async: false);
764	if (ret) {
765	put_pfn(pfn: *pfn_base, prot: dma->prot);
766	if (ret == -ENOMEM)
767	pr_warn("%s: Task %s (%d) RLIMIT_MEMLOCK "
768	"(%ld) exceeded\n", __func__,
769	dma->task->comm, task_pid_nr(dma->task),
770	task_rlimit(dma->task, RLIMIT_MEMLOCK));
771	}
772	}
773
774	out:
775	mmput(mm);
776	return ret;
777	}
778
779	static int vfio_unpin_page_external(struct vfio_dma *dma, dma_addr_t iova,
780	bool do_accounting)
781	{
782	int unlocked;
783	struct vfio_pfn *vpfn = vfio_find_vpfn(dma, iova);
784
785	if (!vpfn)
786	return `0`;
787
788	unlocked = vfio_iova_put_vfio_pfn(dma, vpfn);
789
790	if (do_accounting)
791	vfio_lock_acct(dma, npage: -unlocked, async: true);
792
793	return unlocked;
794	}
795
796	static int vfio_iommu_type1_pin_pages(void *iommu_data,
797	struct iommu_group *iommu_group,
798	dma_addr_t user_iova,
799	int npage, int prot,
800	struct page **pages)
801	{
802	struct vfio_iommu *iommu = iommu_data;
803	struct vfio_iommu_group *group;
804	int i, j, ret;
805	unsigned long remote_vaddr;
806	struct vfio_dma *dma;
807	bool do_accounting;
808
809	if (!iommu \|\| !pages)
810	return -EINVAL;
811
812	/ Supported for v2 version only /
813	if (!iommu->v2)
814	return -EACCES;
815
816	mutex_lock(&iommu->lock);
817
818	if (WARN_ONCE(iommu->vaddr_invalid_count,
819	"vfio_pin_pages not allowed with VFIO_UPDATE_VADDR\n")) {
820	ret = -EBUSY;
821	goto pin_done;
822	}
823
824	/ Fail if no dma_umap notifier is registered /
825	if (list_empty(head: &iommu->device_list)) {
826	ret = -EINVAL;
827	goto pin_done;
828	}
829
830	/*
831	* If iommu capable domain exist in the container then all pages are
832	* already pinned and accounted. Accounting should be done if there is no
833	* iommu capable domain in the container.
834	*/
835	do_accounting = list_empty(head: &iommu->domain_list);
836
837	for (i = `0`; i < npage; i++) {
838	unsigned long phys_pfn;
839	dma_addr_t iova;
840	struct vfio_pfn *vpfn;
841
842	iova = user_iova + PAGE_SIZE * i;
843	dma = vfio_find_dma(iommu, start: iova, PAGE_SIZE);
844	if (!dma) {
845	ret = -EINVAL;
846	goto pin_unwind;
847	}
848
849	if ((dma->prot & prot) != prot) {
850	ret = -EPERM;
851	goto pin_unwind;
852	}
853
854	vpfn = vfio_iova_get_vfio_pfn(dma, iova);
855	if (vpfn) {
856	pages[i] = pfn_to_page(vpfn->pfn);
857	continue;
858	}
859
860	remote_vaddr = dma->vaddr + (iova - dma->iova);
861	ret = vfio_pin_page_external(dma, vaddr: remote_vaddr, pfn_base: &phys_pfn,
862	do_accounting);
863	if (ret)
864	goto pin_unwind;
865
866	if (!pfn_valid(pfn: phys_pfn)) {
867	ret = -EINVAL;
868	goto pin_unwind;
869	}
870
871	ret = vfio_add_to_pfn_list(dma, iova, pfn: phys_pfn);
872	if (ret) {
873	if (put_pfn(pfn: phys_pfn, prot: dma->prot) && do_accounting)
874	vfio_lock_acct(dma, npage: -`1`, async: true);
875	goto pin_unwind;
876	}
877
878	pages[i] = pfn_to_page(phys_pfn);
879
880	if (iommu->dirty_page_tracking) {
881	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
882
883	/*
884	* Bitmap populated with the smallest supported page
885	* size
886	*/
887	bitmap_set(map: dma->bitmap,
888	start: (iova - dma->iova) >> pgshift, nbits: `1`);
889	}
890	}
891	ret = i;
892
893	group = vfio_iommu_find_iommu_group(iommu, iommu_group);
894	if (!group->pinned_page_dirty_scope) {
895	group->pinned_page_dirty_scope = true;
896	iommu->num_non_pinned_groups--;
897	}
898
899	goto pin_done;
900
901	pin_unwind:
902	pages[i] = NULL;
903	for (j = `0`; j < i; j++) {
904	dma_addr_t iova;
905
906	iova = user_iova + PAGE_SIZE * j;
907	dma = vfio_find_dma(iommu, start: iova, PAGE_SIZE);
908	vfio_unpin_page_external(dma, iova, do_accounting);
909	pages[j] = NULL;
910	}
911	pin_done:
912	mutex_unlock(lock: &iommu->lock);
913	return ret;
914	}
915
916	static void vfio_iommu_type1_unpin_pages(void *iommu_data,
917	dma_addr_t user_iova, int npage)
918	{
919	struct vfio_iommu *iommu = iommu_data;
920	bool do_accounting;
921	int i;
922
923	/ Supported for v2 version only /
924	if (WARN_ON(!iommu->v2))
925	return;
926
927	mutex_lock(&iommu->lock);
928
929	do_accounting = list_empty(head: &iommu->domain_list);
930	for (i = `0`; i < npage; i++) {
931	dma_addr_t iova = user_iova + PAGE_SIZE * i;
932	struct vfio_dma *dma;
933
934	dma = vfio_find_dma(iommu, start: iova, PAGE_SIZE);
935	if (!dma)
936	break;
937
938	vfio_unpin_page_external(dma, iova, do_accounting);
939	}
940
941	mutex_unlock(lock: &iommu->lock);
942
943	WARN_ON(i != npage);
944	}
945
946	static long vfio_sync_unpin(struct vfio_dma dma, struct* vfio_domain *domain,
947	struct list_head *regions,
948	struct iommu_iotlb_gather *iotlb_gather)
949	{
950	long unlocked = `0`;
951	struct vfio_regions entry, next;
952
953	iommu_iotlb_sync(domain: domain->domain, iotlb_gather);
954
955	list_for_each_entry_safe(entry, next, regions, list) {
956	unlocked += vfio_unpin_pages_remote(dma,
957	iova: entry->iova,
958	pfn: entry->phys >> PAGE_SHIFT,
959	npage: entry->len >> PAGE_SHIFT,
960	do_accounting: false);
961	list_del(entry: &entry->list);
962	kfree(objp: entry);
963	}
964
965	cond_resched();
966
967	return unlocked;
968	}
969
970	/*
971	* Generally, VFIO needs to unpin remote pages after each IOTLB flush.
972	* Therefore, when using IOTLB flush sync interface, VFIO need to keep track
973	* of these regions (currently using a list).
974	*
975	* This value specifies maximum number of regions for each IOTLB flush sync.
976	*/
977	#define VFIO_IOMMU_TLB_SYNC_MAX 512
978
979	static size_t unmap_unpin_fast(struct vfio_domain *domain,
980	struct vfio_dma dma, dma_addr_t iova,
981	size_t len, phys_addr_t phys, long *unlocked,
982	struct list_head *unmapped_list,
983	int *unmapped_cnt,
984	struct iommu_iotlb_gather *iotlb_gather)
985	{
986	size_t unmapped = `0`;
987	struct vfio_regions entry = kzalloc(size: sizeof(entry), GFP_KERNEL);
988
989	if (entry) {
990	unmapped = iommu_unmap_fast(domain: domain->domain, iova: *iova, size: len,
991	iotlb_gather);
992
993	if (!unmapped) {
994	kfree(objp: entry);
995	} else {
996	entry->iova = *iova;
997	entry->phys = phys;
998	entry->len = unmapped;
999	list_add_tail(new: &entry->list, head: unmapped_list);
1000
1001	*iova += unmapped;
1002	(*unmapped_cnt)++;
1003	}
1004	}
1005
1006	/*
1007	* Sync if the number of fast-unmap regions hits the limit
1008	* or in case of errors.
1009	*/
1010	if (*unmapped_cnt >= VFIO_IOMMU_TLB_SYNC_MAX \|\| !unmapped) {
1011	*unlocked += vfio_sync_unpin(dma, domain, regions: unmapped_list,
1012	iotlb_gather);
1013	*unmapped_cnt = `0`;
1014	}
1015
1016	return unmapped;
1017	}
1018
1019	static size_t unmap_unpin_slow(struct vfio_domain *domain,
1020	struct vfio_dma dma, dma_addr_t iova,
1021	size_t len, phys_addr_t phys,
1022	long *unlocked)
1023	{
1024	size_t unmapped = iommu_unmap(domain: domain->domain, iova: *iova, size: len);
1025
1026	if (unmapped) {
1027	unlocked += vfio_unpin_pages_remote(dma, iova: iova,
1028	pfn: phys >> PAGE_SHIFT,
1029	npage: unmapped >> PAGE_SHIFT,
1030	do_accounting: false);
1031	*iova += unmapped;
1032	cond_resched();
1033	}
1034	return unmapped;
1035	}
1036
1037	static long vfio_unmap_unpin(struct vfio_iommu iommu, struct* vfio_dma *dma,
1038	bool do_accounting)
1039	{
1040	dma_addr_t iova = dma->iova, end = dma->iova + dma->size;
1041	struct vfio_domain domain, d;
1042	LIST_HEAD(unmapped_region_list);
1043	struct iommu_iotlb_gather iotlb_gather;
1044	int unmapped_region_cnt = `0`;
1045	long unlocked = `0`;
1046
1047	if (!dma->size)
1048	return `0`;
1049
1050	if (list_empty(head: &iommu->domain_list))
1051	return `0`;
1052
1053	/*
1054	* We use the IOMMU to track the physical addresses, otherwise we'd
1055	* need a much more complicated tracking system. Unfortunately that
1056	* means we need to use one of the iommu domains to figure out the
1057	* pfns to unpin. The rest need to be unmapped in advance so we have
1058	* no iommu translations remaining when the pages are unpinned.
1059	*/
1060	domain = d = list_first_entry(&iommu->domain_list,
1061	struct vfio_domain, next);
1062
1063	list_for_each_entry_continue(d, &iommu->domain_list, next) {
1064	iommu_unmap(domain: d->domain, iova: dma->iova, size: dma->size);
1065	cond_resched();
1066	}
1067
1068	iommu_iotlb_gather_init(gather: &iotlb_gather);
1069	while (iova < end) {
1070	size_t unmapped, len;
1071	phys_addr_t phys, next;
1072
1073	phys = iommu_iova_to_phys(domain: domain->domain, iova);
1074	if (WARN_ON(!phys)) {
1075	iova += PAGE_SIZE;
1076	continue;
1077	}
1078
1079	/*
1080	* To optimize for fewer iommu_unmap() calls, each of which
1081	* may require hardware cache flushing, try to find the
1082	* largest contiguous physical memory chunk to unmap.
1083	*/
1084	for (len = PAGE_SIZE;
1085	!domain->fgsp && iova + len < end; len += PAGE_SIZE) {
1086	next = iommu_iova_to_phys(domain: domain->domain, iova: iova + len);
1087	if (next != phys + len)
1088	break;
1089	}
1090
1091	/*
1092	* First, try to use fast unmap/unpin. In case of failure,
1093	* switch to slow unmap/unpin path.
1094	*/
1095	unmapped = unmap_unpin_fast(domain, dma, iova: &iova, len, phys,
1096	unlocked: &unlocked, unmapped_list: &unmapped_region_list,
1097	unmapped_cnt: &unmapped_region_cnt,
1098	iotlb_gather: &iotlb_gather);
1099	if (!unmapped) {
1100	unmapped = unmap_unpin_slow(domain, dma, iova: &iova, len,
1101	phys, unlocked: &unlocked);
1102	if (WARN_ON(!unmapped))
1103	break;
1104	}
1105	}
1106
1107	dma->iommu_mapped = false;
1108
1109	if (unmapped_region_cnt) {
1110	unlocked += vfio_sync_unpin(dma, domain, regions: &unmapped_region_list,
1111	iotlb_gather: &iotlb_gather);
1112	}
1113
1114	if (do_accounting) {
1115	vfio_lock_acct(dma, npage: -unlocked, async: true);
1116	return `0`;
1117	}
1118	return unlocked;
1119	}
1120
1121	static void vfio_remove_dma(struct vfio_iommu iommu, struct* vfio_dma *dma)
1122	{
1123	WARN_ON(!RB_EMPTY_ROOT(&dma->pfn_list));
1124	vfio_unmap_unpin(iommu, dma, do_accounting: true);
1125	vfio_unlink_dma(iommu, old: dma);
1126	put_task_struct(t: dma->task);
1127	mmdrop(mm: dma->mm);
1128	vfio_dma_bitmap_free(dma);
1129	if (dma->vaddr_invalid)
1130	iommu->vaddr_invalid_count--;
1131	kfree(objp: dma);
1132	iommu->dma_avail++;
1133	}
1134
1135	static void vfio_update_pgsize_bitmap(struct vfio_iommu *iommu)
1136	{
1137	struct vfio_domain *domain;
1138
1139	iommu->pgsize_bitmap = ULONG_MAX;
1140
1141	list_for_each_entry(domain, &iommu->domain_list, next)
1142	iommu->pgsize_bitmap &= domain->domain->pgsize_bitmap;
1143
1144	/*
1145	* In case the IOMMU supports page sizes smaller than PAGE_SIZE
1146	* we pretend PAGE_SIZE is supported and hide sub-PAGE_SIZE sizes.
1147	* That way the user will be able to map/unmap buffers whose size/
1148	* start address is aligned with PAGE_SIZE. Pinning code uses that
1149	* granularity while iommu driver can use the sub-PAGE_SIZE size
1150	* to map the buffer.
1151	*/
1152	if (iommu->pgsize_bitmap & ~PAGE_MASK) {
1153	iommu->pgsize_bitmap &= PAGE_MASK;
1154	iommu->pgsize_bitmap \|= PAGE_SIZE;
1155	}
1156	}
1157
1158	static int update_user_bitmap(u64 __user bitmap, struct* vfio_iommu *iommu,
1159	struct vfio_dma *dma, dma_addr_t base_iova,
1160	size_t pgsize)
1161	{
1162	unsigned long pgshift = __ffs(pgsize);
1163	unsigned long nbits = dma->size >> pgshift;
1164	unsigned long bit_offset = (dma->iova - base_iova) >> pgshift;
1165	unsigned long copy_offset = bit_offset / BITS_PER_LONG;
1166	unsigned long shift = bit_offset % BITS_PER_LONG;
1167	unsigned long leftover;
1168
1169	/*
1170	* mark all pages dirty if any IOMMU capable device is not able
1171	* to report dirty pages and all pages are pinned and mapped.
1172	*/
1173	if (iommu->num_non_pinned_groups && dma->iommu_mapped)
1174	bitmap_set(map: dma->bitmap, start: `0`, nbits);
1175
1176	if (shift) {
1177	bitmap_shift_left(dst: dma->bitmap, src: dma->bitmap, shift,
1178	nbits: nbits + shift);
1179
1180	if (copy_from_user(to: &leftover,
1181	from: (void __user *)(bitmap + copy_offset),
1182	n: sizeof(leftover)))
1183	return -EFAULT;
1184
1185	bitmap_or(dst: dma->bitmap, src1: dma->bitmap, src2: &leftover, nbits: shift);
1186	}
1187
1188	if (copy_to_user(to: (void __user *)(bitmap + copy_offset), from: dma->bitmap,
1189	DIRTY_BITMAP_BYTES(nbits + shift)))
1190	return -EFAULT;
1191
1192	return `0`;
1193	}
1194
1195	static int vfio_iova_dirty_bitmap(u64 __user bitmap, struct* vfio_iommu *iommu,
1196	dma_addr_t iova, size_t size, size_t pgsize)
1197	{
1198	struct vfio_dma *dma;
1199	struct rb_node *n;
1200	unsigned long pgshift = __ffs(pgsize);
1201	int ret;
1202
1203	/*
1204	* GET_BITMAP request must fully cover vfio_dma mappings. Multiple
1205	* vfio_dma mappings may be clubbed by specifying large ranges, but
1206	* there must not be any previous mappings bisected by the range.
1207	* An error will be returned if these conditions are not met.
1208	*/
1209	dma = vfio_find_dma(iommu, start: iova, size: `1`);
1210	if (dma && dma->iova != iova)
1211	return -EINVAL;
1212
1213	dma = vfio_find_dma(iommu, start: iova + size - `1`, size: `0`);
1214	if (dma && dma->iova + dma->size != iova + size)
1215	return -EINVAL;
1216
1217	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1218	struct vfio_dma dma = rb_entry(n, struct* vfio_dma, node);
1219
1220	if (dma->iova < iova)
1221	continue;
1222
1223	if (dma->iova > iova + size - `1`)
1224	break;
1225
1226	ret = update_user_bitmap(bitmap, iommu, dma, base_iova: iova, pgsize);
1227	if (ret)
1228	return ret;
1229
1230	/*
1231	* Re-populate bitmap to include all pinned pages which are
1232	* considered as dirty but exclude pages which are unpinned and
1233	* pages which are marked dirty by vfio_dma_rw()
1234	*/
1235	bitmap_clear(map: dma->bitmap, start: `0`, nbits: dma->size >> pgshift);
1236	vfio_dma_populate_bitmap(dma, pgsize);
1237	}
1238	return `0`;
1239	}
1240
1241	static int verify_bitmap_size(uint64_t npages, uint64_t bitmap_size)
1242	{
1243	if (!npages \|\| !bitmap_size \|\| (bitmap_size > DIRTY_BITMAP_SIZE_MAX) \|\|
1244	(bitmap_size < DIRTY_BITMAP_BYTES(npages)))
1245	return -EINVAL;
1246
1247	return `0`;
1248	}
1249
1250	/*
1251	* Notify VFIO drivers using vfio_register_emulated_iommu_dev() to invalidate
1252	* and unmap iovas within the range we're about to unmap. Drivers MUST unpin
1253	* pages in response to an invalidation.
1254	*/
1255	static void vfio_notify_dma_unmap(struct vfio_iommu *iommu,
1256	struct vfio_dma *dma)
1257	{
1258	struct vfio_device *device;
1259
1260	if (list_empty(head: &iommu->device_list))
1261	return;
1262
1263	/*
1264	* The device is expected to call vfio_unpin_pages() for any IOVA it has
1265	* pinned within the range. Since vfio_unpin_pages() will eventually
1266	* call back down to this code and try to obtain the iommu->lock we must
1267	* drop it.
1268	*/
1269	mutex_lock(&iommu->device_list_lock);
1270	mutex_unlock(lock: &iommu->lock);
1271
1272	list_for_each_entry(device, &iommu->device_list, iommu_entry)
1273	device->ops->dma_unmap(device, dma->iova, dma->size);
1274
1275	mutex_unlock(lock: &iommu->device_list_lock);
1276	mutex_lock(&iommu->lock);
1277	}
1278
1279	static int vfio_dma_do_unmap(struct vfio_iommu *iommu,
1280	struct vfio_iommu_type1_dma_unmap *unmap,
1281	struct vfio_bitmap *bitmap)
1282	{
1283	struct vfio_dma dma, dma_last = NULL;
1284	size_t unmapped = `0`, pgsize;
1285	int ret = -EINVAL, retries = `0`;
1286	unsigned long pgshift;
1287	dma_addr_t iova = unmap->iova;
1288	u64 size = unmap->size;
1289	bool unmap_all = unmap->flags & VFIO_DMA_UNMAP_FLAG_ALL;
1290	bool invalidate_vaddr = unmap->flags & VFIO_DMA_UNMAP_FLAG_VADDR;
1291	struct rb_node n, first_n;
1292
1293	mutex_lock(&iommu->lock);
1294
1295	/ Cannot update vaddr if mdev is present. /
1296	if (invalidate_vaddr && !list_empty(head: &iommu->emulated_iommu_groups)) {
1297	ret = -EBUSY;
1298	goto unlock;
1299	}
1300
1301	pgshift = __ffs(iommu->pgsize_bitmap);
1302	pgsize = (size_t)`1` << pgshift;
1303
1304	if (iova & (pgsize - `1`))
1305	goto unlock;
1306
1307	if (unmap_all) {
1308	if (iova \|\| size)
1309	goto unlock;
1310	size = U64_MAX;
1311	} else if (!size \|\| size & (pgsize - `1`) \|\|
1312	iova + size - `1` < iova \|\| size > SIZE_MAX) {
1313	goto unlock;
1314	}
1315
1316	/ When dirty tracking is enabled, allow only min supported pgsize /
1317	if ((unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
1318	(!iommu->dirty_page_tracking \|\| (bitmap->pgsize != pgsize))) {
1319	goto unlock;
1320	}
1321
1322	WARN_ON((pgsize - `1`) & PAGE_MASK);
1323	again:
1324	/*
1325	* vfio-iommu-type1 (v1) - User mappings were coalesced together to
1326	* avoid tracking individual mappings. This means that the granularity
1327	* of the original mapping was lost and the user was allowed to attempt
1328	* to unmap any range. Depending on the contiguousness of physical
1329	* memory and page sizes supported by the IOMMU, arbitrary unmaps may
1330	* or may not have worked. We only guaranteed unmap granularity
1331	* matching the original mapping; even though it was untracked here,
1332	* the original mappings are reflected in IOMMU mappings. This
1333	* resulted in a couple unusual behaviors. First, if a range is not
1334	* able to be unmapped, ex. a set of 4k pages that was mapped as a
1335	* 2M hugepage into the IOMMU, the unmap ioctl returns success but with
1336	* a zero sized unmap. Also, if an unmap request overlaps the first
1337	* address of a hugepage, the IOMMU will unmap the entire hugepage.
1338	* This also returns success and the returned unmap size reflects the
1339	* actual size unmapped.
1340	*
1341	* We attempt to maintain compatibility with this "v1" interface, but
1342	* we take control out of the hands of the IOMMU. Therefore, an unmap
1343	* request offset from the beginning of the original mapping will
1344	* return success with zero sized unmap. And an unmap request covering
1345	* the first iova of mapping will unmap the entire range.
1346	*
1347	* The v2 version of this interface intends to be more deterministic.
1348	* Unmap requests must fully cover previous mappings. Multiple
1349	* mappings may still be unmaped by specifying large ranges, but there
1350	* must not be any previous mappings bisected by the range. An error
1351	* will be returned if these conditions are not met. The v2 interface
1352	* will only return success and a size of zero if there were no
1353	* mappings within the range.
1354	*/
1355	if (iommu->v2 && !unmap_all) {
1356	dma = vfio_find_dma(iommu, start: iova, size: `1`);
1357	if (dma && dma->iova != iova)
1358	goto unlock;
1359
1360	dma = vfio_find_dma(iommu, start: iova + size - `1`, size: `0`);
1361	if (dma && dma->iova + dma->size != iova + size)
1362	goto unlock;
1363	}
1364
1365	ret = `0`;
1366	n = first_n = vfio_find_dma_first_node(iommu, start: iova, size);
1367
1368	while (n) {
1369	dma = rb_entry(n, struct vfio_dma, node);
1370	if (dma->iova >= iova + size)
1371	break;
1372
1373	if (!iommu->v2 && iova > dma->iova)
1374	break;
1375
1376	if (invalidate_vaddr) {
1377	if (dma->vaddr_invalid) {
1378	struct rb_node *last_n = n;
1379
1380	for (n = first_n; n != last_n; n = rb_next(n)) {
1381	dma = rb_entry(n,
1382	struct vfio_dma, node);
1383	dma->vaddr_invalid = false;
1384	iommu->vaddr_invalid_count--;
1385	}
1386	ret = -EINVAL;
1387	unmapped = `0`;
1388	break;
1389	}
1390	dma->vaddr_invalid = true;
1391	iommu->vaddr_invalid_count++;
1392	unmapped += dma->size;
1393	n = rb_next(n);
1394	continue;
1395	}
1396
1397	if (!RB_EMPTY_ROOT(&dma->pfn_list)) {
1398	if (dma_last == dma) {
1399	BUG_ON(++retries > `10`);
1400	} else {
1401	dma_last = dma;
1402	retries = `0`;
1403	}
1404
1405	vfio_notify_dma_unmap(iommu, dma);
1406	goto again;
1407	}
1408
1409	if (unmap->flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
1410	ret = update_user_bitmap(bitmap: bitmap->data, iommu, dma,
1411	base_iova: iova, pgsize);
1412	if (ret)
1413	break;
1414	}
1415
1416	unmapped += dma->size;
1417	n = rb_next(n);
1418	vfio_remove_dma(iommu, dma);
1419	}
1420
1421	unlock:
1422	mutex_unlock(lock: &iommu->lock);
1423
1424	/ Report how much was unmapped /
1425	unmap->size = unmapped;
1426
1427	return ret;
1428	}
1429
1430	static int vfio_iommu_map(struct vfio_iommu *iommu, dma_addr_t iova,
1431	unsigned long pfn, long npage, int prot)
1432	{
1433	struct vfio_domain *d;
1434	int ret;
1435
1436	list_for_each_entry(d, &iommu->domain_list, next) {
1437	ret = iommu_map(domain: d->domain, iova, paddr: (phys_addr_t)pfn << PAGE_SHIFT,
1438	size: npage << PAGE_SHIFT, prot: prot \| IOMMU_CACHE,
1439	GFP_KERNEL);
1440	if (ret)
1441	goto unwind;
1442
1443	cond_resched();
1444	}
1445
1446	return `0`;
1447
1448	unwind:
1449	list_for_each_entry_continue_reverse(d, &iommu->domain_list, next) {
1450	iommu_unmap(domain: d->domain, iova, size: npage << PAGE_SHIFT);
1451	cond_resched();
1452	}
1453
1454	return ret;
1455	}
1456
1457	static int vfio_pin_map_dma(struct vfio_iommu iommu, struct* vfio_dma *dma,
1458	size_t map_size)
1459	{
1460	dma_addr_t iova = dma->iova;
1461	unsigned long vaddr = dma->vaddr;
1462	struct vfio_batch batch;
1463	size_t size = map_size;
1464	long npage;
1465	unsigned long pfn, limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1466	int ret = `0`;
1467
1468	vfio_batch_init(batch: &batch);
1469
1470	while (size) {
1471	/ Pin a contiguous chunk of memory /
1472	npage = vfio_pin_pages_remote(dma, vaddr: vaddr + dma->size,
1473	npage: size >> PAGE_SHIFT, pfn_base: &pfn, limit,
1474	batch: &batch);
1475	if (npage <= `0`) {
1476	WARN_ON(!npage);
1477	ret = (int)npage;
1478	break;
1479	}
1480
1481	/ Map it! /
1482	ret = vfio_iommu_map(iommu, iova: iova + dma->size, pfn, npage,
1483	prot: dma->prot);
1484	if (ret) {
1485	vfio_unpin_pages_remote(dma, iova: iova + dma->size, pfn,
1486	npage, do_accounting: true);
1487	vfio_batch_unpin(batch: &batch, dma);
1488	break;
1489	}
1490
1491	size -= npage << PAGE_SHIFT;
1492	dma->size += npage << PAGE_SHIFT;
1493	}
1494
1495	vfio_batch_fini(batch: &batch);
1496	dma->iommu_mapped = true;
1497
1498	if (ret)
1499	vfio_remove_dma(iommu, dma);
1500
1501	return ret;
1502	}
1503
1504	/*
1505	* Check dma map request is within a valid iova range
1506	*/
1507	static bool vfio_iommu_iova_dma_valid(struct vfio_iommu *iommu,
1508	dma_addr_t start, dma_addr_t end)
1509	{
1510	struct list_head *iova = &iommu->iova_list;
1511	struct vfio_iova *node;
1512
1513	list_for_each_entry(node, iova, list) {
1514	if (start >= node->start && end <= node->end)
1515	return true;
1516	}
1517
1518	/*
1519	* Check for list_empty() as well since a container with
1520	* a single mdev device will have an empty list.
1521	*/
1522	return list_empty(head: iova);
1523	}
1524
1525	static int vfio_change_dma_owner(struct vfio_dma *dma)
1526	{
1527	struct task_struct *task = current->group_leader;
1528	struct mm_struct *mm = current->mm;
1529	long npage = dma->locked_vm;
1530	bool lock_cap;
1531	int ret;
1532
1533	if (mm == dma->mm)
1534	return `0`;
1535
1536	lock_cap = capable(CAP_IPC_LOCK);
1537	ret = mm_lock_acct(task, mm, lock_cap, npage);
1538	if (ret)
1539	return ret;
1540
1541	if (mmget_not_zero(mm: dma->mm)) {
1542	mm_lock_acct(task: dma->task, mm: dma->mm, lock_cap: dma->lock_cap, npage: -npage);
1543	mmput(dma->mm);
1544	}
1545
1546	if (dma->task != task) {
1547	put_task_struct(t: dma->task);
1548	dma->task = get_task_struct(t: task);
1549	}
1550	mmdrop(mm: dma->mm);
1551	dma->mm = mm;
1552	mmgrab(mm: dma->mm);
1553	dma->lock_cap = lock_cap;
1554	return `0`;
1555	}
1556
1557	static int vfio_dma_do_map(struct vfio_iommu *iommu,
1558	struct vfio_iommu_type1_dma_map *map)
1559	{
1560	bool set_vaddr = map->flags & VFIO_DMA_MAP_FLAG_VADDR;
1561	dma_addr_t iova = map->iova;
1562	unsigned long vaddr = map->vaddr;
1563	size_t size = map->size;
1564	int ret = `0`, prot = `0`;
1565	size_t pgsize;
1566	struct vfio_dma *dma;
1567
1568	/ Verify that none of our __u64 fields overflow /
1569	if (map->size != size \|\| map->vaddr != vaddr \|\| map->iova != iova)
1570	return -EINVAL;
1571
1572	/ READ/WRITE from device perspective /
1573	if (map->flags & VFIO_DMA_MAP_FLAG_WRITE)
1574	prot \|= IOMMU_WRITE;
1575	if (map->flags & VFIO_DMA_MAP_FLAG_READ)
1576	prot \|= IOMMU_READ;
1577
1578	if ((prot && set_vaddr) \|\| (!prot && !set_vaddr))
1579	return -EINVAL;
1580
1581	mutex_lock(&iommu->lock);
1582
1583	pgsize = (size_t)`1` << __ffs(iommu->pgsize_bitmap);
1584
1585	WARN_ON((pgsize - `1`) & PAGE_MASK);
1586
1587	if (!size \|\| (size \| iova \| vaddr) & (pgsize - `1`)) {
1588	ret = -EINVAL;
1589	goto out_unlock;
1590	}
1591
1592	/ Don't allow IOVA or virtual address wrap /
1593	if (iova + size - `1` < iova \|\| vaddr + size - `1` < vaddr) {
1594	ret = -EINVAL;
1595	goto out_unlock;
1596	}
1597
1598	dma = vfio_find_dma(iommu, start: iova, size);
1599	if (set_vaddr) {
1600	if (!dma) {
1601	ret = -ENOENT;
1602	} else if (!dma->vaddr_invalid \|\| dma->iova != iova \|\|
1603	dma->size != size) {
1604	ret = -EINVAL;
1605	} else {
1606	ret = vfio_change_dma_owner(dma);
1607	if (ret)
1608	goto out_unlock;
1609	dma->vaddr = vaddr;
1610	dma->vaddr_invalid = false;
1611	iommu->vaddr_invalid_count--;
1612	}
1613	goto out_unlock;
1614	} else if (dma) {
1615	ret = -EEXIST;
1616	goto out_unlock;
1617	}
1618
1619	if (!iommu->dma_avail) {
1620	ret = -ENOSPC;
1621	goto out_unlock;
1622	}
1623
1624	if (!vfio_iommu_iova_dma_valid(iommu, start: iova, end: iova + size - `1`)) {
1625	ret = -EINVAL;
1626	goto out_unlock;
1627	}
1628
1629	dma = kzalloc(size: sizeof(*dma), GFP_KERNEL);
1630	if (!dma) {
1631	ret = -ENOMEM;
1632	goto out_unlock;
1633	}
1634
1635	iommu->dma_avail--;
1636	dma->iova = iova;
1637	dma->vaddr = vaddr;
1638	dma->prot = prot;
1639
1640	/*
1641	* We need to be able to both add to a task's locked memory and test
1642	* against the locked memory limit and we need to be able to do both
1643	* outside of this call path as pinning can be asynchronous via the
1644	* external interfaces for mdev devices. RLIMIT_MEMLOCK requires a
1645	* task_struct. Save the group_leader so that all DMA tracking uses
1646	* the same task, to make debugging easier. VM locked pages requires
1647	* an mm_struct, so grab the mm in case the task dies.
1648	*/
1649	get_task_struct(current->group_leader);
1650	dma->task = current->group_leader;
1651	dma->lock_cap = capable(CAP_IPC_LOCK);
1652	dma->mm = current->mm;
1653	mmgrab(mm: dma->mm);
1654
1655	dma->pfn_list = RB_ROOT;
1656
1657	/ Insert zero-sized and grow as we map chunks of it /
1658	vfio_link_dma(iommu, new: dma);
1659
1660	/ Don't pin and map if container doesn't contain IOMMU capable domain/
1661	if (list_empty(head: &iommu->domain_list))
1662	dma->size = size;
1663	else
1664	ret = vfio_pin_map_dma(iommu, dma, map_size: size);
1665
1666	if (!ret && iommu->dirty_page_tracking) {
1667	ret = vfio_dma_bitmap_alloc(dma, pgsize);
1668	if (ret)
1669	vfio_remove_dma(iommu, dma);
1670	}
1671
1672	out_unlock:
1673	mutex_unlock(lock: &iommu->lock);
1674	return ret;
1675	}
1676
1677	static int vfio_iommu_replay(struct vfio_iommu *iommu,
1678	struct vfio_domain *domain)
1679	{
1680	struct vfio_batch batch;
1681	struct vfio_domain *d = NULL;
1682	struct rb_node *n;
1683	unsigned long limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
1684	int ret;
1685
1686	/ Arbitrarily pick the first domain in the list for lookups /
1687	if (!list_empty(head: &iommu->domain_list))
1688	d = list_first_entry(&iommu->domain_list,
1689	struct vfio_domain, next);
1690
1691	vfio_batch_init(batch: &batch);
1692
1693	n = rb_first(&iommu->dma_list);
1694
1695	for (; n; n = rb_next(n)) {
1696	struct vfio_dma *dma;
1697	dma_addr_t iova;
1698
1699	dma = rb_entry(n, struct vfio_dma, node);
1700	iova = dma->iova;
1701
1702	while (iova < dma->iova + dma->size) {
1703	phys_addr_t phys;
1704	size_t size;
1705
1706	if (dma->iommu_mapped) {
1707	phys_addr_t p;
1708	dma_addr_t i;
1709
1710	if (WARN_ON(!d)) { / mapped w/o a domain?! /
1711	ret = -EINVAL;
1712	goto unwind;
1713	}
1714
1715	phys = iommu_iova_to_phys(domain: d->domain, iova);
1716
1717	if (WARN_ON(!phys)) {
1718	iova += PAGE_SIZE;
1719	continue;
1720	}
1721
1722	size = PAGE_SIZE;
1723	p = phys + size;
1724	i = iova + size;
1725	while (i < dma->iova + dma->size &&
1726	p == iommu_iova_to_phys(domain: d->domain, iova: i)) {
1727	size += PAGE_SIZE;
1728	p += PAGE_SIZE;
1729	i += PAGE_SIZE;
1730	}
1731	} else {
1732	unsigned long pfn;
1733	unsigned long vaddr = dma->vaddr +
1734	(iova - dma->iova);
1735	size_t n = dma->iova + dma->size - iova;
1736	long npage;
1737
1738	npage = vfio_pin_pages_remote(dma, vaddr,
1739	npage: n >> PAGE_SHIFT,
1740	pfn_base: &pfn, limit,
1741	batch: &batch);
1742	if (npage <= `0`) {
1743	WARN_ON(!npage);
1744	ret = (int)npage;
1745	goto unwind;
1746	}
1747
1748	phys = pfn << PAGE_SHIFT;
1749	size = npage << PAGE_SHIFT;
1750	}
1751
1752	ret = iommu_map(domain: domain->domain, iova, paddr: phys, size,
1753	prot: dma->prot \| IOMMU_CACHE, GFP_KERNEL);
1754	if (ret) {
1755	if (!dma->iommu_mapped) {
1756	vfio_unpin_pages_remote(dma, iova,
1757	pfn: phys >> PAGE_SHIFT,
1758	npage: size >> PAGE_SHIFT,
1759	do_accounting: true);
1760	vfio_batch_unpin(batch: &batch, dma);
1761	}
1762	goto unwind;
1763	}
1764
1765	iova += size;
1766	}
1767	}
1768
1769	/ All dmas are now mapped, defer to second tree walk for unwind /
1770	for (n = rb_first(&iommu->dma_list); n; n = rb_next(n)) {
1771	struct vfio_dma dma = rb_entry(n, struct* vfio_dma, node);
1772
1773	dma->iommu_mapped = true;
1774	}
1775
1776	vfio_batch_fini(batch: &batch);
1777	return `0`;
1778
1779	unwind:
1780	for (; n; n = rb_prev(n)) {
1781	struct vfio_dma dma = rb_entry(n, struct* vfio_dma, node);
1782	dma_addr_t iova;
1783
1784	if (dma->iommu_mapped) {
1785	iommu_unmap(domain: domain->domain, iova: dma->iova, size: dma->size);
1786	continue;
1787	}
1788
1789	iova = dma->iova;
1790	while (iova < dma->iova + dma->size) {
1791	phys_addr_t phys, p;
1792	size_t size;
1793	dma_addr_t i;
1794
1795	phys = iommu_iova_to_phys(domain: domain->domain, iova);
1796	if (!phys) {
1797	iova += PAGE_SIZE;
1798	continue;
1799	}
1800
1801	size = PAGE_SIZE;
1802	p = phys + size;
1803	i = iova + size;
1804	while (i < dma->iova + dma->size &&
1805	p == iommu_iova_to_phys(domain: domain->domain, iova: i)) {
1806	size += PAGE_SIZE;
1807	p += PAGE_SIZE;
1808	i += PAGE_SIZE;
1809	}
1810
1811	iommu_unmap(domain: domain->domain, iova, size);
1812	vfio_unpin_pages_remote(dma, iova, pfn: phys >> PAGE_SHIFT,
1813	npage: size >> PAGE_SHIFT, do_accounting: true);
1814	}
1815	}
1816
1817	vfio_batch_fini(batch: &batch);
1818	return ret;
1819	}
1820
1821	/*
1822	* We change our unmap behavior slightly depending on whether the IOMMU
1823	* supports fine-grained superpages. IOMMUs like AMD-Vi will use a superpage
1824	* for practically any contiguous power-of-two mapping we give it. This means
1825	* we don't need to look for contiguous chunks ourselves to make unmapping
1826	* more efficient. On IOMMUs with coarse-grained super pages, like Intel VT-d
1827	* with discrete 2M/1G/512G/1T superpages, identifying contiguous chunks
1828	* significantly boosts non-hugetlbfs mappings and doesn't seem to hurt when
1829	* hugetlbfs is in use.
1830	*/
1831	static void vfio_test_domain_fgsp(struct vfio_domain domain, struct* list_head *regions)
1832	{
1833	int ret, order = get_order(PAGE_SIZE * `2`);
1834	struct vfio_iova *region;
1835	struct page *pages;
1836	dma_addr_t start;
1837
1838	pages = alloc_pages(GFP_KERNEL \| __GFP_ZERO, order);
1839	if (!pages)
1840	return;
1841
1842	list_for_each_entry(region, regions, list) {
1843	start = ALIGN(region->start, PAGE_SIZE * `2`);
1844	if (start >= region->end \|\| (region->end - start < PAGE_SIZE * `2`))
1845	continue;
1846
1847	ret = iommu_map(domain: domain->domain, iova: start, page_to_phys(pages), PAGE_SIZE * `2`,
1848	IOMMU_READ \| IOMMU_WRITE \| IOMMU_CACHE, GFP_KERNEL);
1849	if (!ret) {
1850	size_t unmapped = iommu_unmap(domain: domain->domain, iova: start, PAGE_SIZE);
1851
1852	if (unmapped == PAGE_SIZE)
1853	iommu_unmap(domain: domain->domain, iova: start + PAGE_SIZE, PAGE_SIZE);
1854	else
1855	domain->fgsp = true;
1856	}
1857	break;
1858	}
1859
1860	__free_pages(page: pages, order);
1861	}
1862
1863	static struct vfio_iommu_group find_iommu_group(struct* vfio_domain *domain,
1864	struct iommu_group *iommu_group)
1865	{
1866	struct vfio_iommu_group *g;
1867
1868	list_for_each_entry(g, &domain->group_list, next) {
1869	if (g->iommu_group == iommu_group)
1870	return g;
1871	}
1872
1873	return NULL;
1874	}
1875
1876	static struct vfio_iommu_group*
1877	vfio_iommu_find_iommu_group(struct vfio_iommu *iommu,
1878	struct iommu_group *iommu_group)
1879	{
1880	struct vfio_iommu_group *group;
1881	struct vfio_domain *domain;
1882
1883	list_for_each_entry(domain, &iommu->domain_list, next) {
1884	group = find_iommu_group(domain, iommu_group);
1885	if (group)
1886	return group;
1887	}
1888
1889	list_for_each_entry(group, &iommu->emulated_iommu_groups, next)
1890	if (group->iommu_group == iommu_group)
1891	return group;
1892	return NULL;
1893	}
1894
1895	static bool vfio_iommu_has_sw_msi(struct list_head *group_resv_regions,
1896	phys_addr_t *base)
1897	{
1898	struct iommu_resv_region *region;
1899	bool ret = false;
1900
1901	list_for_each_entry(region, group_resv_regions, list) {
1902	/*
1903	* The presence of any 'real' MSI regions should take
1904	* precedence over the software-managed one if the
1905	* IOMMU driver happens to advertise both types.
1906	*/
1907	if (region->type == IOMMU_RESV_MSI) {
1908	ret = false;
1909	break;
1910	}
1911
1912	if (region->type == IOMMU_RESV_SW_MSI) {
1913	*base = region->start;
1914	ret = true;
1915	}
1916	}
1917
1918	return ret;
1919	}
1920
1921	/*
1922	* This is a helper function to insert an address range to iova list.
1923	* The list is initially created with a single entry corresponding to
1924	* the IOMMU domain geometry to which the device group is attached.
1925	* The list aperture gets modified when a new domain is added to the
1926	* container if the new aperture doesn't conflict with the current one
1927	* or with any existing dma mappings. The list is also modified to
1928	* exclude any reserved regions associated with the device group.
1929	*/
1930	static int vfio_iommu_iova_insert(struct list_head *head,
1931	dma_addr_t start, dma_addr_t end)
1932	{
1933	struct vfio_iova *region;
1934
1935	region = kmalloc(size: sizeof(*region), GFP_KERNEL);
1936	if (!region)
1937	return -ENOMEM;
1938
1939	INIT_LIST_HEAD(list: &region->list);
1940	region->start = start;
1941	region->end = end;
1942
1943	list_add_tail(new: &region->list, head);
1944	return `0`;
1945	}
1946
1947	/*
1948	* Check the new iommu aperture conflicts with existing aper or with any
1949	* existing dma mappings.
1950	*/
1951	static bool vfio_iommu_aper_conflict(struct vfio_iommu *iommu,
1952	dma_addr_t start, dma_addr_t end)
1953	{
1954	struct vfio_iova first, last;
1955	struct list_head *iova = &iommu->iova_list;
1956
1957	if (list_empty(head: iova))
1958	return false;
1959
1960	/ Disjoint sets, return conflict /
1961	first = list_first_entry(iova, struct vfio_iova, list);
1962	last = list_last_entry(iova, struct vfio_iova, list);
1963	if (start > last->end \|\| end < first->start)
1964	return true;
1965
1966	/ Check for any existing dma mappings below the new start /
1967	if (start > first->start) {
1968	if (vfio_find_dma(iommu, start: first->start, size: start - first->start))
1969	return true;
1970	}
1971
1972	/ Check for any existing dma mappings beyond the new end /
1973	if (end < last->end) {
1974	if (vfio_find_dma(iommu, start: end + `1`, size: last->end - end))
1975	return true;
1976	}
1977
1978	return false;
1979	}
1980
1981	/*
1982	* Resize iommu iova aperture window. This is called only if the new
1983	* aperture has no conflict with existing aperture and dma mappings.
1984	*/
1985	static int vfio_iommu_aper_resize(struct list_head *iova,
1986	dma_addr_t start, dma_addr_t end)
1987	{
1988	struct vfio_iova node, next;
1989
1990	if (list_empty(head: iova))
1991	return vfio_iommu_iova_insert(head: iova, start, end);
1992
1993	/ Adjust iova list start /
1994	list_for_each_entry_safe(node, next, iova, list) {
1995	if (start < node->start)
1996	break;
1997	if (start >= node->start && start < node->end) {
1998	node->start = start;
1999	break;
2000	}
2001	/ Delete nodes before new start /
2002	list_del(entry: &node->list);
2003	kfree(objp: node);
2004	}
2005
2006	/ Adjust iova list end /
2007	list_for_each_entry_safe(node, next, iova, list) {
2008	if (end > node->end)
2009	continue;
2010	if (end > node->start && end <= node->end) {
2011	node->end = end;
2012	continue;
2013	}
2014	/ Delete nodes after new end /
2015	list_del(entry: &node->list);
2016	kfree(objp: node);
2017	}
2018
2019	return `0`;
2020	}
2021
2022	/*
2023	* Check reserved region conflicts with existing dma mappings
2024	*/
2025	static bool vfio_iommu_resv_conflict(struct vfio_iommu *iommu,
2026	struct list_head *resv_regions)
2027	{
2028	struct iommu_resv_region *region;
2029
2030	/ Check for conflict with existing dma mappings /
2031	list_for_each_entry(region, resv_regions, list) {
2032	if (region->type == IOMMU_RESV_DIRECT_RELAXABLE)
2033	continue;
2034
2035	if (vfio_find_dma(iommu, start: region->start, size: region->length))
2036	return true;
2037	}
2038
2039	return false;
2040	}
2041
2042	/*
2043	* Check iova region overlap with reserved regions and
2044	* exclude them from the iommu iova range
2045	*/
2046	static int vfio_iommu_resv_exclude(struct list_head *iova,
2047	struct list_head *resv_regions)
2048	{
2049	struct iommu_resv_region *resv;
2050	struct vfio_iova n, next;
2051
2052	list_for_each_entry(resv, resv_regions, list) {
2053	phys_addr_t start, end;
2054
2055	if (resv->type == IOMMU_RESV_DIRECT_RELAXABLE)
2056	continue;
2057
2058	start = resv->start;
2059	end = resv->start + resv->length - `1`;
2060
2061	list_for_each_entry_safe(n, next, iova, list) {
2062	int ret = `0`;
2063
2064	/ No overlap /
2065	if (start > n->end \|\| end < n->start)
2066	continue;
2067	/*
2068	* Insert a new node if current node overlaps with the
2069	* reserve region to exclude that from valid iova range.
2070	* Note that, new node is inserted before the current
2071	* node and finally the current node is deleted keeping
2072	* the list updated and sorted.
2073	*/
2074	if (start > n->start)
2075	ret = vfio_iommu_iova_insert(head: &n->list, start: n->start,
2076	end: start - `1`);
2077	if (!ret && end < n->end)
2078	ret = vfio_iommu_iova_insert(head: &n->list, start: end + `1`,
2079	end: n->end);
2080	if (ret)
2081	return ret;
2082
2083	list_del(entry: &n->list);
2084	kfree(objp: n);
2085	}
2086	}
2087
2088	if (list_empty(head: iova))
2089	return -EINVAL;
2090
2091	return `0`;
2092	}
2093
2094	static void vfio_iommu_resv_free(struct list_head *resv_regions)
2095	{
2096	struct iommu_resv_region n, next;
2097
2098	list_for_each_entry_safe(n, next, resv_regions, list) {
2099	list_del(entry: &n->list);
2100	kfree(objp: n);
2101	}
2102	}
2103
2104	static void vfio_iommu_iova_free(struct list_head *iova)
2105	{
2106	struct vfio_iova n, next;
2107
2108	list_for_each_entry_safe(n, next, iova, list) {
2109	list_del(entry: &n->list);
2110	kfree(objp: n);
2111	}
2112	}
2113
2114	static int vfio_iommu_iova_get_copy(struct vfio_iommu *iommu,
2115	struct list_head *iova_copy)
2116	{
2117	struct list_head *iova = &iommu->iova_list;
2118	struct vfio_iova *n;
2119	int ret;
2120
2121	list_for_each_entry(n, iova, list) {
2122	ret = vfio_iommu_iova_insert(head: iova_copy, start: n->start, end: n->end);
2123	if (ret)
2124	goto out_free;
2125	}
2126
2127	return `0`;
2128
2129	out_free:
2130	vfio_iommu_iova_free(iova: iova_copy);
2131	return ret;
2132	}
2133
2134	static void vfio_iommu_iova_insert_copy(struct vfio_iommu *iommu,
2135	struct list_head *iova_copy)
2136	{
2137	struct list_head *iova = &iommu->iova_list;
2138
2139	vfio_iommu_iova_free(iova);
2140
2141	list_splice_tail(list: iova_copy, head: iova);
2142	}
2143
2144	static int vfio_iommu_domain_alloc(struct device dev, void* *data)
2145	{
2146	struct iommu_domain **domain = data;
2147
2148	*domain = iommu_domain_alloc(bus: dev->bus);
2149	return `1`; / Don't iterate /
2150	}
2151
2152	static int vfio_iommu_type1_attach_group(void *iommu_data,
2153	struct iommu_group iommu_group, enum* vfio_group_type type)
2154	{
2155	struct vfio_iommu *iommu = iommu_data;
2156	struct vfio_iommu_group *group;
2157	struct vfio_domain domain, d;
2158	bool resv_msi;
2159	phys_addr_t resv_msi_base = `0`;
2160	struct iommu_domain_geometry *geo;
2161	LIST_HEAD(iova_copy);
2162	LIST_HEAD(group_resv_regions);
2163	int ret = -EBUSY;
2164
2165	mutex_lock(&iommu->lock);
2166
2167	/ Attach could require pinning, so disallow while vaddr is invalid. /
2168	if (iommu->vaddr_invalid_count)
2169	goto out_unlock;
2170
2171	/ Check for duplicates /
2172	ret = -EINVAL;
2173	if (vfio_iommu_find_iommu_group(iommu, iommu_group))
2174	goto out_unlock;
2175
2176	ret = -ENOMEM;
2177	group = kzalloc(size: sizeof(*group), GFP_KERNEL);
2178	if (!group)
2179	goto out_unlock;
2180	group->iommu_group = iommu_group;
2181
2182	if (type == VFIO_EMULATED_IOMMU) {
2183	list_add(new: &group->next, head: &iommu->emulated_iommu_groups);
2184	/*
2185	* An emulated IOMMU group cannot dirty memory directly, it can
2186	* only use interfaces that provide dirty tracking.
2187	* The iommu scope can only be promoted with the addition of a
2188	* dirty tracking group.
2189	*/
2190	group->pinned_page_dirty_scope = true;
2191	ret = `0`;
2192	goto out_unlock;
2193	}
2194
2195	ret = -ENOMEM;
2196	domain = kzalloc(size: sizeof(*domain), GFP_KERNEL);
2197	if (!domain)
2198	goto out_free_group;
2199
2200	/*
2201	* Going via the iommu_group iterator avoids races, and trivially gives
2202	* us a representative device for the IOMMU API call. We don't actually
2203	* want to iterate beyond the first device (if any).
2204	*/
2205	ret = -EIO;
2206	iommu_group_for_each_dev(group: iommu_group, data: &domain->domain,
2207	fn: vfio_iommu_domain_alloc);
2208	if (!domain->domain)
2209	goto out_free_domain;
2210
2211	if (iommu->nesting) {
2212	ret = iommu_enable_nesting(domain: domain->domain);
2213	if (ret)
2214	goto out_domain;
2215	}
2216
2217	ret = iommu_attach_group(domain: domain->domain, group: group->iommu_group);
2218	if (ret)
2219	goto out_domain;
2220
2221	/ Get aperture info /
2222	geo = &domain->domain->geometry;
2223	if (vfio_iommu_aper_conflict(iommu, start: geo->aperture_start,
2224	end: geo->aperture_end)) {
2225	ret = -EINVAL;
2226	goto out_detach;
2227	}
2228
2229	ret = iommu_get_group_resv_regions(group: iommu_group, head: &group_resv_regions);
2230	if (ret)
2231	goto out_detach;
2232
2233	if (vfio_iommu_resv_conflict(iommu, resv_regions: &group_resv_regions)) {
2234	ret = -EINVAL;
2235	goto out_detach;
2236	}
2237
2238	/*
2239	* We don't want to work on the original iova list as the list
2240	* gets modified and in case of failure we have to retain the
2241	* original list. Get a copy here.
2242	*/
2243	ret = vfio_iommu_iova_get_copy(iommu, iova_copy: &iova_copy);
2244	if (ret)
2245	goto out_detach;
2246
2247	ret = vfio_iommu_aper_resize(iova: &iova_copy, start: geo->aperture_start,
2248	end: geo->aperture_end);
2249	if (ret)
2250	goto out_detach;
2251
2252	ret = vfio_iommu_resv_exclude(iova: &iova_copy, resv_regions: &group_resv_regions);
2253	if (ret)
2254	goto out_detach;
2255
2256	resv_msi = vfio_iommu_has_sw_msi(group_resv_regions: &group_resv_regions, base: &resv_msi_base);
2257
2258	INIT_LIST_HEAD(list: &domain->group_list);
2259	list_add(new: &group->next, head: &domain->group_list);
2260
2261	if (!allow_unsafe_interrupts &&
2262	!iommu_group_has_isolated_msi(group: iommu_group)) {
2263	pr_warn("%s: No interrupt remapping support. Use the module param \"allow_unsafe_interrupts\" to enable VFIO IOMMU support on this platform\n",
2264	__func__);
2265	ret = -EPERM;
2266	goto out_detach;
2267	}
2268
2269	/*
2270	* If the IOMMU can block non-coherent operations (ie PCIe TLPs with
2271	* no-snoop set) then VFIO always turns this feature on because on Intel
2272	* platforms it optimizes KVM to disable wbinvd emulation.
2273	*/
2274	if (domain->domain->ops->enforce_cache_coherency)
2275	domain->enforce_cache_coherency =
2276	domain->domain->ops->enforce_cache_coherency(
2277	domain->domain);
2278
2279	/*
2280	* Try to match an existing compatible domain. We don't want to
2281	* preclude an IOMMU driver supporting multiple bus_types and being
2282	* able to include different bus_types in the same IOMMU domain, so
2283	* we test whether the domains use the same iommu_ops rather than
2284	* testing if they're on the same bus_type.
2285	*/
2286	list_for_each_entry(d, &iommu->domain_list, next) {
2287	if (d->domain->ops == domain->domain->ops &&
2288	d->enforce_cache_coherency ==
2289	domain->enforce_cache_coherency) {
2290	iommu_detach_group(domain: domain->domain, group: group->iommu_group);
2291	if (!iommu_attach_group(domain: d->domain,
2292	group: group->iommu_group)) {
2293	list_add(new: &group->next, head: &d->group_list);
2294	iommu_domain_free(domain: domain->domain);
2295	kfree(objp: domain);
2296	goto done;
2297	}
2298
2299	ret = iommu_attach_group(domain: domain->domain,
2300	group: group->iommu_group);
2301	if (ret)
2302	goto out_domain;
2303	}
2304	}
2305
2306	vfio_test_domain_fgsp(domain, regions: &iova_copy);
2307
2308	/* replay mappings on new domains */
2309	ret = vfio_iommu_replay(iommu, domain);
2310	if (ret)
2311	goto out_detach;
2312
2313	if (resv_msi) {
2314	ret = iommu_get_msi_cookie(domain: domain->domain, base: resv_msi_base);
2315	if (ret && ret != -ENODEV)
2316	goto out_detach;
2317	}
2318
2319	list_add(new: &domain->next, head: &iommu->domain_list);
2320	vfio_update_pgsize_bitmap(iommu);
2321	done:
2322	/* Delete the old one and insert new iova list */
2323	vfio_iommu_iova_insert_copy(iommu, iova_copy: &iova_copy);
2324
2325	/*
2326	* An iommu backed group can dirty memory directly and therefore
2327	* demotes the iommu scope until it declares itself dirty tracking
2328	* capable via the page pinning interface.
2329	*/
2330	iommu->num_non_pinned_groups++;
2331	mutex_unlock(lock: &iommu->lock);
2332	vfio_iommu_resv_free(resv_regions: &group_resv_regions);
2333
2334	return `0`;
2335
2336	out_detach:
2337	iommu_detach_group(domain: domain->domain, group: group->iommu_group);
2338	out_domain:
2339	iommu_domain_free(domain: domain->domain);
2340	vfio_iommu_iova_free(iova: &iova_copy);
2341	vfio_iommu_resv_free(resv_regions: &group_resv_regions);
2342	out_free_domain:
2343	kfree(objp: domain);
2344	out_free_group:
2345	kfree(objp: group);
2346	out_unlock:
2347	mutex_unlock(lock: &iommu->lock);
2348	return ret;
2349	}
2350
2351	static void vfio_iommu_unmap_unpin_all(struct vfio_iommu *iommu)
2352	{
2353	struct rb_node *node;
2354
2355	while ((node = rb_first(&iommu->dma_list)))
2356	vfio_remove_dma(iommu, rb_entry(node, struct vfio_dma, node));
2357	}
2358
2359	static void vfio_iommu_unmap_unpin_reaccount(struct vfio_iommu *iommu)
2360	{
2361	struct rb_node n, p;
2362
2363	n = rb_first(&iommu->dma_list);
2364	for (; n; n = rb_next(n)) {
2365	struct vfio_dma *dma;
2366	long locked = `0`, unlocked = `0`;
2367
2368	dma = rb_entry(n, struct vfio_dma, node);
2369	unlocked += vfio_unmap_unpin(iommu, dma, do_accounting: false);
2370	p = rb_first(&dma->pfn_list);
2371	for (; p; p = rb_next(p)) {
2372	struct vfio_pfn vpfn = rb_entry(p, struct* vfio_pfn,
2373	node);
2374
2375	if (!is_invalid_reserved_pfn(pfn: vpfn->pfn))
2376	locked++;
2377	}
2378	vfio_lock_acct(dma, npage: locked - unlocked, async: true);
2379	}
2380	}
2381
2382	/*
2383	* Called when a domain is removed in detach. It is possible that
2384	* the removed domain decided the iova aperture window. Modify the
2385	* iova aperture with the smallest window among existing domains.
2386	*/
2387	static void vfio_iommu_aper_expand(struct vfio_iommu *iommu,
2388	struct list_head *iova_copy)
2389	{
2390	struct vfio_domain *domain;
2391	struct vfio_iova *node;
2392	dma_addr_t start = `0`;
2393	dma_addr_t end = (dma_addr_t)~`0`;
2394
2395	if (list_empty(head: iova_copy))
2396	return;
2397
2398	list_for_each_entry(domain, &iommu->domain_list, next) {
2399	struct iommu_domain_geometry *geo = &domain->domain->geometry;
2400
2401	if (geo->aperture_start > start)
2402	start = geo->aperture_start;
2403	if (geo->aperture_end < end)
2404	end = geo->aperture_end;
2405	}
2406
2407	/ Modify aperture limits. The new aper is either same or bigger /
2408	node = list_first_entry(iova_copy, struct vfio_iova, list);
2409	node->start = start;
2410	node = list_last_entry(iova_copy, struct vfio_iova, list);
2411	node->end = end;
2412	}
2413
2414	/*
2415	* Called when a group is detached. The reserved regions for that
2416	* group can be part of valid iova now. But since reserved regions
2417	* may be duplicated among groups, populate the iova valid regions
2418	* list again.
2419	*/
2420	static int vfio_iommu_resv_refresh(struct vfio_iommu *iommu,
2421	struct list_head *iova_copy)
2422	{
2423	struct vfio_domain *d;
2424	struct vfio_iommu_group *g;
2425	struct vfio_iova *node;
2426	dma_addr_t start, end;
2427	LIST_HEAD(resv_regions);
2428	int ret;
2429
2430	if (list_empty(head: iova_copy))
2431	return -EINVAL;
2432
2433	list_for_each_entry(d, &iommu->domain_list, next) {
2434	list_for_each_entry(g, &d->group_list, next) {
2435	ret = iommu_get_group_resv_regions(group: g->iommu_group,
2436	head: &resv_regions);
2437	if (ret)
2438	goto done;
2439	}
2440	}
2441
2442	node = list_first_entry(iova_copy, struct vfio_iova, list);
2443	start = node->start;
2444	node = list_last_entry(iova_copy, struct vfio_iova, list);
2445	end = node->end;
2446
2447	/ purge the iova list and create new one /
2448	vfio_iommu_iova_free(iova: iova_copy);
2449
2450	ret = vfio_iommu_aper_resize(iova: iova_copy, start, end);
2451	if (ret)
2452	goto done;
2453
2454	/ Exclude current reserved regions from iova ranges /
2455	ret = vfio_iommu_resv_exclude(iova: iova_copy, resv_regions: &resv_regions);
2456	done:
2457	vfio_iommu_resv_free(resv_regions: &resv_regions);
2458	return ret;
2459	}
2460
2461	static void vfio_iommu_type1_detach_group(void *iommu_data,
2462	struct iommu_group *iommu_group)
2463	{
2464	struct vfio_iommu *iommu = iommu_data;
2465	struct vfio_domain *domain;
2466	struct vfio_iommu_group *group;
2467	bool update_dirty_scope = false;
2468	LIST_HEAD(iova_copy);
2469
2470	mutex_lock(&iommu->lock);
2471	list_for_each_entry(group, &iommu->emulated_iommu_groups, next) {
2472	if (group->iommu_group != iommu_group)
2473	continue;
2474	update_dirty_scope = !group->pinned_page_dirty_scope;
2475	list_del(entry: &group->next);
2476	kfree(objp: group);
2477
2478	if (list_empty(head: &iommu->emulated_iommu_groups) &&
2479	list_empty(head: &iommu->domain_list)) {
2480	WARN_ON(!list_empty(&iommu->device_list));
2481	vfio_iommu_unmap_unpin_all(iommu);
2482	}
2483	goto detach_group_done;
2484	}
2485
2486	/*
2487	* Get a copy of iova list. This will be used to update
2488	* and to replace the current one later. Please note that
2489	* we will leave the original list as it is if update fails.
2490	*/
2491	vfio_iommu_iova_get_copy(iommu, iova_copy: &iova_copy);
2492
2493	list_for_each_entry(domain, &iommu->domain_list, next) {
2494	group = find_iommu_group(domain, iommu_group);
2495	if (!group)
2496	continue;
2497
2498	iommu_detach_group(domain: domain->domain, group: group->iommu_group);
2499	update_dirty_scope = !group->pinned_page_dirty_scope;
2500	list_del(entry: &group->next);
2501	kfree(objp: group);
2502	/*
2503	* Group ownership provides privilege, if the group list is
2504	* empty, the domain goes away. If it's the last domain with
2505	* iommu and external domain doesn't exist, then all the
2506	* mappings go away too. If it's the last domain with iommu and
2507	* external domain exist, update accounting
2508	*/
2509	if (list_empty(head: &domain->group_list)) {
2510	if (list_is_singular(head: &iommu->domain_list)) {
2511	if (list_empty(head: &iommu->emulated_iommu_groups)) {
2512	WARN_ON(!list_empty(
2513	&iommu->device_list));
2514	vfio_iommu_unmap_unpin_all(iommu);
2515	} else {
2516	vfio_iommu_unmap_unpin_reaccount(iommu);
2517	}
2518	}
2519	iommu_domain_free(domain: domain->domain);
2520	list_del(entry: &domain->next);
2521	kfree(objp: domain);
2522	vfio_iommu_aper_expand(iommu, iova_copy: &iova_copy);
2523	vfio_update_pgsize_bitmap(iommu);
2524	}
2525	break;
2526	}
2527
2528	if (!vfio_iommu_resv_refresh(iommu, iova_copy: &iova_copy))
2529	vfio_iommu_iova_insert_copy(iommu, iova_copy: &iova_copy);
2530	else
2531	vfio_iommu_iova_free(iova: &iova_copy);
2532
2533	detach_group_done:
2534	/*
2535	* Removal of a group without dirty tracking may allow the iommu scope
2536	* to be promoted.
2537	*/
2538	if (update_dirty_scope) {
2539	iommu->num_non_pinned_groups--;
2540	if (iommu->dirty_page_tracking)
2541	vfio_iommu_populate_bitmap_full(iommu);
2542	}
2543	mutex_unlock(lock: &iommu->lock);
2544	}
2545
2546	static void vfio_iommu_type1_open(unsigned* long arg)
2547	{
2548	struct vfio_iommu *iommu;
2549
2550	iommu = kzalloc(size: sizeof(*iommu), GFP_KERNEL);
2551	if (!iommu)
2552	return ERR_PTR(error: -ENOMEM);
2553
2554	switch (arg) {
2555	case VFIO_TYPE1_IOMMU:
2556	break;
2557	case VFIO_TYPE1_NESTING_IOMMU:
2558	iommu->nesting = true;
2559	fallthrough;
2560	case VFIO_TYPE1v2_IOMMU:
2561	iommu->v2 = true;
2562	break;
2563	default:
2564	kfree(objp: iommu);
2565	return ERR_PTR(error: -EINVAL);
2566	}
2567
2568	INIT_LIST_HEAD(list: &iommu->domain_list);
2569	INIT_LIST_HEAD(list: &iommu->iova_list);
2570	iommu->dma_list = RB_ROOT;
2571	iommu->dma_avail = dma_entry_limit;
2572	mutex_init(&iommu->lock);
2573	mutex_init(&iommu->device_list_lock);
2574	INIT_LIST_HEAD(list: &iommu->device_list);
2575	iommu->pgsize_bitmap = PAGE_MASK;
2576	INIT_LIST_HEAD(list: &iommu->emulated_iommu_groups);
2577
2578	return iommu;
2579	}
2580
2581	static void vfio_release_domain(struct vfio_domain *domain)
2582	{
2583	struct vfio_iommu_group group, group_tmp;
2584
2585	list_for_each_entry_safe(group, group_tmp,
2586	&domain->group_list, next) {
2587	iommu_detach_group(domain: domain->domain, group: group->iommu_group);
2588	list_del(entry: &group->next);
2589	kfree(objp: group);
2590	}
2591
2592	iommu_domain_free(domain: domain->domain);
2593	}
2594
2595	static void vfio_iommu_type1_release(void *iommu_data)
2596	{
2597	struct vfio_iommu *iommu = iommu_data;
2598	struct vfio_domain domain, domain_tmp;
2599	struct vfio_iommu_group group, next_group;
2600
2601	list_for_each_entry_safe(group, next_group,
2602	&iommu->emulated_iommu_groups, next) {
2603	list_del(entry: &group->next);
2604	kfree(objp: group);
2605	}
2606
2607	vfio_iommu_unmap_unpin_all(iommu);
2608
2609	list_for_each_entry_safe(domain, domain_tmp,
2610	&iommu->domain_list, next) {
2611	vfio_release_domain(domain);
2612	list_del(entry: &domain->next);
2613	kfree(objp: domain);
2614	}
2615
2616	vfio_iommu_iova_free(iova: &iommu->iova_list);
2617
2618	kfree(objp: iommu);
2619	}
2620
2621	static int vfio_domains_have_enforce_cache_coherency(struct vfio_iommu *iommu)
2622	{
2623	struct vfio_domain *domain;
2624	int ret = `1`;
2625
2626	mutex_lock(&iommu->lock);
2627	list_for_each_entry(domain, &iommu->domain_list, next) {
2628	if (!(domain->enforce_cache_coherency)) {
2629	ret = `0`;
2630	break;
2631	}
2632	}
2633	mutex_unlock(lock: &iommu->lock);
2634
2635	return ret;
2636	}
2637
2638	static bool vfio_iommu_has_emulated(struct vfio_iommu *iommu)
2639	{
2640	bool ret;
2641
2642	mutex_lock(&iommu->lock);
2643	ret = !list_empty(head: &iommu->emulated_iommu_groups);
2644	mutex_unlock(lock: &iommu->lock);
2645	return ret;
2646	}
2647
2648	static int vfio_iommu_type1_check_extension(struct vfio_iommu *iommu,
2649	unsigned long arg)
2650	{
2651	switch (arg) {
2652	case VFIO_TYPE1_IOMMU:
2653	case VFIO_TYPE1v2_IOMMU:
2654	case VFIO_TYPE1_NESTING_IOMMU:
2655	case VFIO_UNMAP_ALL:
2656	return `1`;
2657	case VFIO_UPDATE_VADDR:
2658	/*
2659	* Disable this feature if mdevs are present. They cannot
2660	* safely pin/unpin/rw while vaddrs are being updated.
2661	*/
2662	return iommu && !vfio_iommu_has_emulated(iommu);
2663	case VFIO_DMA_CC_IOMMU:
2664	if (!iommu)
2665	return `0`;
2666	return vfio_domains_have_enforce_cache_coherency(iommu);
2667	default:
2668	return `0`;
2669	}
2670	}
2671
2672	static int vfio_iommu_iova_add_cap(struct vfio_info_cap *caps,
2673	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas,
2674	size_t size)
2675	{
2676	struct vfio_info_cap_header *header;
2677	struct vfio_iommu_type1_info_cap_iova_range *iova_cap;
2678
2679	header = vfio_info_cap_add(caps, size,
2680	VFIO_IOMMU_TYPE1_INFO_CAP_IOVA_RANGE, version: `1`);
2681	if (IS_ERR(ptr: header))
2682	return PTR_ERR(ptr: header);
2683
2684	iova_cap = container_of(header,
2685	struct vfio_iommu_type1_info_cap_iova_range,
2686	header);
2687	iova_cap->nr_iovas = cap_iovas->nr_iovas;
2688	memcpy(iova_cap->iova_ranges, cap_iovas->iova_ranges,
2689	cap_iovas->nr_iovas * sizeof(*cap_iovas->iova_ranges));
2690	return `0`;
2691	}
2692
2693	static int vfio_iommu_iova_build_caps(struct vfio_iommu *iommu,
2694	struct vfio_info_cap *caps)
2695	{
2696	struct vfio_iommu_type1_info_cap_iova_range *cap_iovas;
2697	struct vfio_iova *iova;
2698	size_t size;
2699	int iovas = `0`, i = `0`, ret;
2700
2701	list_for_each_entry(iova, &iommu->iova_list, list)
2702	iovas++;
2703
2704	if (!iovas) {
2705	/*
2706	* Return 0 as a container with a single mdev device
2707	* will have an empty list
2708	*/
2709	return `0`;
2710	}
2711
2712	size = struct_size(cap_iovas, iova_ranges, iovas);
2713
2714	cap_iovas = kzalloc(size, GFP_KERNEL);
2715	if (!cap_iovas)
2716	return -ENOMEM;
2717
2718	cap_iovas->nr_iovas = iovas;
2719
2720	list_for_each_entry(iova, &iommu->iova_list, list) {
2721	cap_iovas->iova_ranges[i].start = iova->start;
2722	cap_iovas->iova_ranges[i].end = iova->end;
2723	i++;
2724	}
2725
2726	ret = vfio_iommu_iova_add_cap(caps, cap_iovas, size);
2727
2728	kfree(objp: cap_iovas);
2729	return ret;
2730	}
2731
2732	static int vfio_iommu_migration_build_caps(struct vfio_iommu *iommu,
2733	struct vfio_info_cap *caps)
2734	{
2735	struct vfio_iommu_type1_info_cap_migration cap_mig = {};
2736
2737	cap_mig.header.id = VFIO_IOMMU_TYPE1_INFO_CAP_MIGRATION;
2738	cap_mig.header.version = `1`;
2739
2740	cap_mig.flags = `0`;
2741	/ support minimum pgsize /
2742	cap_mig.pgsize_bitmap = (size_t)`1` << __ffs(iommu->pgsize_bitmap);
2743	cap_mig.max_dirty_bitmap_size = DIRTY_BITMAP_SIZE_MAX;
2744
2745	return vfio_info_add_capability(caps, cap: &cap_mig.header, size: sizeof(cap_mig));
2746	}
2747
2748	static int vfio_iommu_dma_avail_build_caps(struct vfio_iommu *iommu,
2749	struct vfio_info_cap *caps)
2750	{
2751	struct vfio_iommu_type1_info_dma_avail cap_dma_avail;
2752
2753	cap_dma_avail.header.id = VFIO_IOMMU_TYPE1_INFO_DMA_AVAIL;
2754	cap_dma_avail.header.version = `1`;
2755
2756	cap_dma_avail.avail = iommu->dma_avail;
2757
2758	return vfio_info_add_capability(caps, cap: &cap_dma_avail.header,
2759	size: sizeof(cap_dma_avail));
2760	}
2761
2762	static int vfio_iommu_type1_get_info(struct vfio_iommu *iommu,
2763	unsigned long arg)
2764	{
2765	struct vfio_iommu_type1_info info = {};
2766	unsigned long minsz;
2767	struct vfio_info_cap caps = { .buf = NULL, .size = `0` };
2768	int ret;
2769
2770	minsz = offsetofend(struct vfio_iommu_type1_info, iova_pgsizes);
2771
2772	if (copy_from_user(to: &info, from: (void __user *)arg, n: minsz))
2773	return -EFAULT;
2774
2775	if (info.argsz < minsz)
2776	return -EINVAL;
2777
2778	minsz = min_t(size_t, info.argsz, sizeof(info));
2779
2780	mutex_lock(&iommu->lock);
2781	info.flags = VFIO_IOMMU_INFO_PGSIZES;
2782
2783	info.iova_pgsizes = iommu->pgsize_bitmap;
2784
2785	ret = vfio_iommu_migration_build_caps(iommu, caps: &caps);
2786
2787	if (!ret)
2788	ret = vfio_iommu_dma_avail_build_caps(iommu, caps: &caps);
2789
2790	if (!ret)
2791	ret = vfio_iommu_iova_build_caps(iommu, caps: &caps);
2792
2793	mutex_unlock(lock: &iommu->lock);
2794
2795	if (ret)
2796	return ret;
2797
2798	if (caps.size) {
2799	info.flags \|= VFIO_IOMMU_INFO_CAPS;
2800
2801	if (info.argsz < sizeof(info) + caps.size) {
2802	info.argsz = sizeof(info) + caps.size;
2803	} else {
2804	vfio_info_cap_shift(caps: &caps, offset: sizeof(info));
2805	if (copy_to_user(to: (void __user *)arg +
2806	sizeof(info), from: caps.buf,
2807	n: caps.size)) {
2808	kfree(objp: caps.buf);
2809	return -EFAULT;
2810	}
2811	info.cap_offset = sizeof(info);
2812	}
2813
2814	kfree(objp: caps.buf);
2815	}
2816
2817	return copy_to_user(to: (void __user *)arg, from: &info, n: minsz) ?
2818	-EFAULT : `0`;
2819	}
2820
2821	static int vfio_iommu_type1_map_dma(struct vfio_iommu *iommu,
2822	unsigned long arg)
2823	{
2824	struct vfio_iommu_type1_dma_map map;
2825	unsigned long minsz;
2826	uint32_t mask = VFIO_DMA_MAP_FLAG_READ \| VFIO_DMA_MAP_FLAG_WRITE \|
2827	VFIO_DMA_MAP_FLAG_VADDR;
2828
2829	minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
2830
2831	if (copy_from_user(to: &map, from: (void __user *)arg, n: minsz))
2832	return -EFAULT;
2833
2834	if (map.argsz < minsz \|\| map.flags & ~mask)
2835	return -EINVAL;
2836
2837	return vfio_dma_do_map(iommu, map: &map);
2838	}
2839
2840	static int vfio_iommu_type1_unmap_dma(struct vfio_iommu *iommu,
2841	unsigned long arg)
2842	{
2843	struct vfio_iommu_type1_dma_unmap unmap;
2844	struct vfio_bitmap bitmap = { `0` };
2845	uint32_t mask = VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP \|
2846	VFIO_DMA_UNMAP_FLAG_VADDR \|
2847	VFIO_DMA_UNMAP_FLAG_ALL;
2848	unsigned long minsz;
2849	int ret;
2850
2851	minsz = offsetofend(struct vfio_iommu_type1_dma_unmap, size);
2852
2853	if (copy_from_user(to: &unmap, from: (void __user *)arg, n: minsz))
2854	return -EFAULT;
2855
2856	if (unmap.argsz < minsz \|\| unmap.flags & ~mask)
2857	return -EINVAL;
2858
2859	if ((unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) &&
2860	(unmap.flags & (VFIO_DMA_UNMAP_FLAG_ALL \|
2861	VFIO_DMA_UNMAP_FLAG_VADDR)))
2862	return -EINVAL;
2863
2864	if (unmap.flags & VFIO_DMA_UNMAP_FLAG_GET_DIRTY_BITMAP) {
2865	unsigned long pgshift;
2866
2867	if (unmap.argsz < (minsz + sizeof(bitmap)))
2868	return -EINVAL;
2869
2870	if (copy_from_user(to: &bitmap,
2871	from: (void __user *)(arg + minsz),
2872	n: sizeof(bitmap)))
2873	return -EFAULT;
2874
2875	if (!access_ok((void __user *)bitmap.data, bitmap.size))
2876	return -EINVAL;
2877
2878	pgshift = __ffs(bitmap.pgsize);
2879	ret = verify_bitmap_size(npages: unmap.size >> pgshift,
2880	bitmap_size: bitmap.size);
2881	if (ret)
2882	return ret;
2883	}
2884
2885	ret = vfio_dma_do_unmap(iommu, unmap: &unmap, bitmap: &bitmap);
2886	if (ret)
2887	return ret;
2888
2889	return copy_to_user(to: (void __user *)arg, from: &unmap, n: minsz) ?
2890	-EFAULT : `0`;
2891	}
2892
2893	static int vfio_iommu_type1_dirty_pages(struct vfio_iommu *iommu,
2894	unsigned long arg)
2895	{
2896	struct vfio_iommu_type1_dirty_bitmap dirty;
2897	uint32_t mask = VFIO_IOMMU_DIRTY_PAGES_FLAG_START \|
2898	VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP \|
2899	VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP;
2900	unsigned long minsz;
2901	int ret = `0`;
2902
2903	if (!iommu->v2)
2904	return -EACCES;
2905
2906	minsz = offsetofend(struct vfio_iommu_type1_dirty_bitmap, flags);
2907
2908	if (copy_from_user(to: &dirty, from: (void __user *)arg, n: minsz))
2909	return -EFAULT;
2910
2911	if (dirty.argsz < minsz \|\| dirty.flags & ~mask)
2912	return -EINVAL;
2913
2914	/ only one flag should be set at a time /
2915	if (__ffs(dirty.flags) != __fls(word: dirty.flags))
2916	return -EINVAL;
2917
2918	if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_START) {
2919	size_t pgsize;
2920
2921	mutex_lock(&iommu->lock);
2922	pgsize = `1` << __ffs(iommu->pgsize_bitmap);
2923	if (!iommu->dirty_page_tracking) {
2924	ret = vfio_dma_bitmap_alloc_all(iommu, pgsize);
2925	if (!ret)
2926	iommu->dirty_page_tracking = true;
2927	}
2928	mutex_unlock(lock: &iommu->lock);
2929	return ret;
2930	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_STOP) {
2931	mutex_lock(&iommu->lock);
2932	if (iommu->dirty_page_tracking) {
2933	iommu->dirty_page_tracking = false;
2934	vfio_dma_bitmap_free_all(iommu);
2935	}
2936	mutex_unlock(lock: &iommu->lock);
2937	return `0`;
2938	} else if (dirty.flags & VFIO_IOMMU_DIRTY_PAGES_FLAG_GET_BITMAP) {
2939	struct vfio_iommu_type1_dirty_bitmap_get range;
2940	unsigned long pgshift;
2941	size_t data_size = dirty.argsz - minsz;
2942	size_t iommu_pgsize;
2943
2944	if (!data_size \|\| data_size < sizeof(range))
2945	return -EINVAL;
2946
2947	if (copy_from_user(to: &range, from: (void __user *)(arg + minsz),
2948	n: sizeof(range)))
2949	return -EFAULT;
2950
2951	if (range.iova + range.size < range.iova)
2952	return -EINVAL;
2953	if (!access_ok((void __user *)range.bitmap.data,
2954	range.bitmap.size))
2955	return -EINVAL;
2956
2957	pgshift = __ffs(range.bitmap.pgsize);
2958	ret = verify_bitmap_size(npages: range.size >> pgshift,
2959	bitmap_size: range.bitmap.size);
2960	if (ret)
2961	return ret;
2962
2963	mutex_lock(&iommu->lock);
2964
2965	iommu_pgsize = (size_t)`1` << __ffs(iommu->pgsize_bitmap);
2966
2967	/ allow only smallest supported pgsize /
2968	if (range.bitmap.pgsize != iommu_pgsize) {
2969	ret = -EINVAL;
2970	goto out_unlock;
2971	}
2972	if (range.iova & (iommu_pgsize - `1`)) {
2973	ret = -EINVAL;
2974	goto out_unlock;
2975	}
2976	if (!range.size \|\| range.size & (iommu_pgsize - `1`)) {
2977	ret = -EINVAL;
2978	goto out_unlock;
2979	}
2980
2981	if (iommu->dirty_page_tracking)
2982	ret = vfio_iova_dirty_bitmap(bitmap: range.bitmap.data,
2983	iommu, iova: range.iova,
2984	size: range.size,
2985	pgsize: range.bitmap.pgsize);
2986	else
2987	ret = -EINVAL;
2988	out_unlock:
2989	mutex_unlock(lock: &iommu->lock);
2990
2991	return ret;
2992	}
2993
2994	return -EINVAL;
2995	}
2996
2997	static long vfio_iommu_type1_ioctl(void *iommu_data,
2998	unsigned int cmd, unsigned long arg)
2999	{
3000	struct vfio_iommu *iommu = iommu_data;
3001
3002	switch (cmd) {
3003	case VFIO_CHECK_EXTENSION:
3004	return vfio_iommu_type1_check_extension(iommu, arg);
3005	case VFIO_IOMMU_GET_INFO:
3006	return vfio_iommu_type1_get_info(iommu, arg);
3007	case VFIO_IOMMU_MAP_DMA:
3008	return vfio_iommu_type1_map_dma(iommu, arg);
3009	case VFIO_IOMMU_UNMAP_DMA:
3010	return vfio_iommu_type1_unmap_dma(iommu, arg);
3011	case VFIO_IOMMU_DIRTY_PAGES:
3012	return vfio_iommu_type1_dirty_pages(iommu, arg);
3013	default:
3014	return -ENOTTY;
3015	}
3016	}
3017
3018	static void vfio_iommu_type1_register_device(void *iommu_data,
3019	struct vfio_device *vdev)
3020	{
3021	struct vfio_iommu *iommu = iommu_data;
3022
3023	if (!vdev->ops->dma_unmap)
3024	return;
3025
3026	/*
3027	* list_empty(&iommu->device_list) is tested under the iommu->lock while
3028	* iteration for dma_unmap must be done under the device_list_lock.
3029	* Holding both locks here allows avoiding the device_list_lock in
3030	* several fast paths. See vfio_notify_dma_unmap()
3031	*/
3032	mutex_lock(&iommu->lock);
3033	mutex_lock(&iommu->device_list_lock);
3034	list_add(new: &vdev->iommu_entry, head: &iommu->device_list);
3035	mutex_unlock(lock: &iommu->device_list_lock);
3036	mutex_unlock(lock: &iommu->lock);
3037	}
3038
3039	static void vfio_iommu_type1_unregister_device(void *iommu_data,
3040	struct vfio_device *vdev)
3041	{
3042	struct vfio_iommu *iommu = iommu_data;
3043
3044	if (!vdev->ops->dma_unmap)
3045	return;
3046
3047	mutex_lock(&iommu->lock);
3048	mutex_lock(&iommu->device_list_lock);
3049	list_del(entry: &vdev->iommu_entry);
3050	mutex_unlock(lock: &iommu->device_list_lock);
3051	mutex_unlock(lock: &iommu->lock);
3052	}
3053
3054	static int vfio_iommu_type1_dma_rw_chunk(struct vfio_iommu *iommu,
3055	dma_addr_t user_iova, void *data,
3056	size_t count, bool write,
3057	size_t *copied)
3058	{
3059	struct mm_struct *mm;
3060	unsigned long vaddr;
3061	struct vfio_dma *dma;
3062	bool kthread = current->mm == NULL;
3063	size_t offset;
3064
3065	*copied = `0`;
3066
3067	dma = vfio_find_dma(iommu, start: user_iova, size: `1`);
3068	if (!dma)
3069	return -EINVAL;
3070
3071	if ((write && !(dma->prot & IOMMU_WRITE)) \|\|
3072	!(dma->prot & IOMMU_READ))
3073	return -EPERM;
3074
3075	mm = dma->mm;
3076	if (!mmget_not_zero(mm))
3077	return -EPERM;
3078
3079	if (kthread)
3080	kthread_use_mm(mm);
3081	else if (current->mm != mm)
3082	goto out;
3083
3084	offset = user_iova - dma->iova;
3085
3086	if (count > dma->size - offset)
3087	count = dma->size - offset;
3088
3089	vaddr = dma->vaddr + offset;
3090
3091	if (write) {
3092	copied = copy_to_user(to: (void* __user *)vaddr, from: data,
3093	n: count) ? `0` : count;
3094	if (*copied && iommu->dirty_page_tracking) {
3095	unsigned long pgshift = __ffs(iommu->pgsize_bitmap);
3096	/*
3097	* Bitmap populated with the smallest supported page
3098	* size
3099	*/
3100	bitmap_set(map: dma->bitmap, start: offset >> pgshift,
3101	nbits: ((offset + *copied - `1`) >> pgshift) -
3102	(offset >> pgshift) + `1`);
3103	}
3104	} else
3105	copied = copy_from_user(to: data, from: (void* __user *)vaddr,
3106	n: count) ? `0` : count;
3107	if (kthread)
3108	kthread_unuse_mm(mm);
3109	out:
3110	mmput(mm);
3111	return *copied ? `0` : -EFAULT;
3112	}
3113
3114	static int vfio_iommu_type1_dma_rw(void *iommu_data, dma_addr_t user_iova,
3115	void *data, size_t count, bool write)
3116	{
3117	struct vfio_iommu *iommu = iommu_data;
3118	int ret = `0`;
3119	size_t done;
3120
3121	mutex_lock(&iommu->lock);
3122
3123	if (WARN_ONCE(iommu->vaddr_invalid_count,
3124	"vfio_dma_rw not allowed with VFIO_UPDATE_VADDR\n")) {
3125	ret = -EBUSY;
3126	goto out;
3127	}
3128
3129	while (count > `0`) {
3130	ret = vfio_iommu_type1_dma_rw_chunk(iommu, user_iova, data,
3131	count, write, copied: &done);
3132	if (ret)
3133	break;
3134
3135	count -= done;
3136	data += done;
3137	user_iova += done;
3138	}
3139
3140	out:
3141	mutex_unlock(lock: &iommu->lock);
3142	return ret;
3143	}
3144
3145	static struct iommu_domain *
3146	vfio_iommu_type1_group_iommu_domain(void *iommu_data,
3147	struct iommu_group *iommu_group)
3148	{
3149	struct iommu_domain *domain = ERR_PTR(error: -ENODEV);
3150	struct vfio_iommu *iommu = iommu_data;
3151	struct vfio_domain *d;
3152
3153	if (!iommu \|\| !iommu_group)
3154	return ERR_PTR(error: -EINVAL);
3155
3156	mutex_lock(&iommu->lock);
3157	list_for_each_entry(d, &iommu->domain_list, next) {
3158	if (find_iommu_group(domain: d, iommu_group)) {
3159	domain = d->domain;
3160	break;
3161	}
3162	}
3163	mutex_unlock(lock: &iommu->lock);
3164
3165	return domain;
3166	}
3167
3168	static const struct vfio_iommu_driver_ops vfio_iommu_driver_ops_type1 = {
3169	.name = "vfio-iommu-type1",
3170	.owner = THIS_MODULE,
3171	.open = vfio_iommu_type1_open,
3172	.release = vfio_iommu_type1_release,
3173	.ioctl = vfio_iommu_type1_ioctl,
3174	.attach_group = vfio_iommu_type1_attach_group,
3175	.detach_group = vfio_iommu_type1_detach_group,
3176	.pin_pages = vfio_iommu_type1_pin_pages,
3177	.unpin_pages = vfio_iommu_type1_unpin_pages,
3178	.register_device = vfio_iommu_type1_register_device,
3179	.unregister_device = vfio_iommu_type1_unregister_device,
3180	.dma_rw = vfio_iommu_type1_dma_rw,
3181	.group_iommu_domain = vfio_iommu_type1_group_iommu_domain,
3182	};
3183
3184	static int __init vfio_iommu_type1_init(void)
3185	{
3186	return vfio_register_iommu_driver(ops: &vfio_iommu_driver_ops_type1);
3187	}
3188
3189	static void __exit vfio_iommu_type1_cleanup(void)
3190	{
3191	vfio_unregister_iommu_driver(ops: &vfio_iommu_driver_ops_type1);
3192	}
3193
3194	module_init(vfio_iommu_type1_init);
3195	module_exit(vfio_iommu_type1_cleanup);
3196
3197	MODULE_VERSION(DRIVER_VERSION);
3198	MODULE_LICENSE("GPL v2");
3199	MODULE_AUTHOR(DRIVER_AUTHOR);
3200	MODULE_DESCRIPTION(DRIVER_DESC);
3201

/* source code of linux/drivers/vfio/vfio_iommu_type1.c */