// SPDX-License-Identifier: GPL-2.0-only
/*
 * VFIO core
 *
 * Copyright (C) 2012 Red Hat, Inc. All rights reserved.
 * Author: Alex Williamson <alex.williamson@redhat.com>
 *
 * Derived from original vfio:
 * Copyright 2010 Cisco Systems, Inc. All rights reserved.
 * Author: Tom Lyon, pugs@cisco.com
 */

#include <linux/cdev.h>
#include <linux/compat.h>
#include <linux/device.h>
#include <linux/fs.h>
#include <linux/idr.h>
#include <linux/iommu.h>
#if IS_ENABLED(CONFIG_KVM)
#include <linux/kvm_host.h>
#endif
#include <linux/list.h>
#include <linux/miscdevice.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/pci.h>
#include <linux/rwsem.h>
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/stat.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/wait.h>
#include <linux/sched/signal.h>
#include <linux/pm_runtime.h>
#include <linux/interval_tree.h>
#include <linux/iova_bitmap.h>
#include <linux/iommufd.h>
#include "vfio.h"

#define DRIVER_VERSION	"0.3"
#define DRIVER_AUTHOR	"Alex Williamson <alex.williamson@redhat.com>"
#define DRIVER_DESC	"VFIO - User Level meta-driver"

static struct vfio {
	struct class			*device_class;
	struct ida			device_ida;
} vfio;

#ifdef CONFIG_VFIO_NOIOMMU
bool vfio_noiommu __read_mostly;
module_param_named(enable_unsafe_noiommu_mode,
		   vfio_noiommu, bool, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(enable_unsafe_noiommu_mode, "Enable UNSAFE, no-IOMMU mode. This mode provides no device isolation, no DMA translation, no host kernel protection, cannot be used for device assignment to virtual machines, requires RAWIO permissions, and will taint the kernel. If you do not know what this is for, step away. (default: false)");
#endif

static DEFINE_XARRAY(vfio_device_set_xa);
int vfio_assign_device_set(struct vfio_device *device, void *set_id)
{
	unsigned long idx = (unsigned long)set_id;
	struct vfio_device_set *new_dev_set;
	struct vfio_device_set *dev_set;

	if (WARN_ON(!set_id))
		return -EINVAL;

	/*
	 * Atomically acquire a singleton object in the xarray for this set_id
	 */
	xa_lock(&vfio_device_set_xa);
	dev_set = xa_load(&vfio_device_set_xa, idx);
	if (dev_set)
		goto found_get_ref;
	xa_unlock(&vfio_device_set_xa);

	new_dev_set = kzalloc(sizeof(*new_dev_set), GFP_KERNEL);
	if (!new_dev_set)
		return -ENOMEM;
	mutex_init(&new_dev_set->lock);
	INIT_LIST_HEAD(&new_dev_set->device_list);
	new_dev_set->set_id = set_id;

	xa_lock(&vfio_device_set_xa);
	dev_set = __xa_cmpxchg(&vfio_device_set_xa, idx, NULL, new_dev_set,
			       GFP_KERNEL);
	if (!dev_set) {
		dev_set = new_dev_set;
		goto found_get_ref;
	}

	kfree(new_dev_set);
	if (xa_is_err(dev_set)) {
		xa_unlock(&vfio_device_set_xa);
		return xa_err(dev_set);
	}

found_get_ref:
	dev_set->device_count++;
	xa_unlock(&vfio_device_set_xa);
	mutex_lock(&dev_set->lock);
	device->dev_set = dev_set;
	list_add_tail(&device->dev_set_list, &dev_set->device_list);
	mutex_unlock(&dev_set->lock);
	return 0;
}
EXPORT_SYMBOL_GPL(vfio_assign_device_set);
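
/*
 * Illustrative sketch (hypothetical driver code, not part of this file):
 * devices that must be treated as one reset group can share a set by
 * passing the same set_id, e.g. a bus-level object, from the driver:
 *
 *	static int my_vfio_init(struct vfio_device *vdev)
 *	{
 *		struct pci_dev *pdev = to_pci_dev(vdev->dev);
 *
 *		// Every function on this bus lands in one vfio_device_set.
 *		return vfio_assign_device_set(vdev, pdev->bus);
 *	}
 */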

static void vfio_release_device_set(struct vfio_device *device)
{
	struct vfio_device_set *dev_set = device->dev_set;

	if (!dev_set)
		return;

	mutex_lock(&dev_set->lock);
	list_del(&device->dev_set_list);
	mutex_unlock(&dev_set->lock);

	xa_lock(&vfio_device_set_xa);
	if (!--dev_set->device_count) {
		__xa_erase(&vfio_device_set_xa,
			   (unsigned long)dev_set->set_id);
		mutex_destroy(&dev_set->lock);
		kfree(dev_set);
	}
	xa_unlock(&vfio_device_set_xa);
}

unsigned int vfio_device_set_open_count(struct vfio_device_set *dev_set)
{
	struct vfio_device *cur;
	unsigned int open_count = 0;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		open_count += cur->open_count;
	return open_count;
}
EXPORT_SYMBOL_GPL(vfio_device_set_open_count);
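
/*
 * Illustrative sketch (hypothetical driver helper; my_hw_reset_all() is
 * an assumed name): a set-wide reset may only run while no device in the
 * set is open, which must be checked under dev_set->lock:
 *
 *	static int my_try_set_reset(struct vfio_device_set *dev_set)
 *	{
 *		int ret = -EBUSY;
 *
 *		mutex_lock(&dev_set->lock);
 *		if (!vfio_device_set_open_count(dev_set))
 *			ret = my_hw_reset_all(dev_set);
 *		mutex_unlock(&dev_set->lock);
 *		return ret;
 *	}
 */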

struct vfio_device *
vfio_find_device_in_devset(struct vfio_device_set *dev_set,
			   struct device *dev)
{
	struct vfio_device *cur;

	lockdep_assert_held(&dev_set->lock);

	list_for_each_entry(cur, &dev_set->device_list, dev_set_list)
		if (cur->dev == dev)
			return cur;
	return NULL;
}
EXPORT_SYMBOL_GPL(vfio_find_device_in_devset);

/*
 * Device objects - create, release, get, put, search
 */
/* Device reference always implies a group reference */
void vfio_device_put_registration(struct vfio_device *device)
{
	if (refcount_dec_and_test(&device->refcount))
		complete(&device->comp);
}

bool vfio_device_try_get_registration(struct vfio_device *device)
{
	return refcount_inc_not_zero(&device->refcount);
}

/*
 * VFIO driver API
 */
/* Release helper called by vfio_put_device() */
static void vfio_device_release(struct device *dev)
{
	struct vfio_device *device =
			container_of(dev, struct vfio_device, device);

	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);

	if (device->ops->release)
		device->ops->release(device);

	kvfree(device);
}

static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops);

/*
 * Allocate and initialize a vfio_device so it can be registered to the
 * vfio core.
 *
 * Drivers should use the wrapper vfio_alloc_device() for allocation.
 * @size is the size of the structure to be allocated, including any
 * private data used by the driver.
 *
 * A driver may provide an @init callback to initialize the device
 * private data.
 *
 * Use vfio_put_device() to release the structure after a successful
 * return.
 */
struct vfio_device *_vfio_alloc_device(size_t size, struct device *dev,
				       const struct vfio_device_ops *ops)
{
	struct vfio_device *device;
	int ret;

	if (WARN_ON(size < sizeof(struct vfio_device)))
		return ERR_PTR(-EINVAL);

	device = kvzalloc(size, GFP_KERNEL);
	if (!device)
		return ERR_PTR(-ENOMEM);

	ret = vfio_init_device(device, dev, ops);
	if (ret)
		goto out_free;
	return device;

out_free:
	kvfree(device);
	return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(_vfio_alloc_device);
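
/*
 * Illustrative sketch (hypothetical driver, assumed names): the
 * vfio_alloc_device() wrapper in <linux/vfio.h> sizes the allocation
 * from a driver structure that embeds struct vfio_device:
 *
 *	struct my_device {
 *		struct vfio_device vdev;	// core state
 *		void __iomem *regs;		// driver-private data
 *	};
 *
 *	struct my_device *my;
 *
 *	my = vfio_alloc_device(my_device, vdev, dev, &my_vfio_ops);
 *	if (IS_ERR(my))
 *		return PTR_ERR(my);
 *	...
 *	vfio_put_device(&my->vdev);	// drops the allocation reference
 */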

/*
 * Initialize a vfio_device so it can be registered to vfio core.
 */
static int vfio_init_device(struct vfio_device *device, struct device *dev,
			    const struct vfio_device_ops *ops)
{
	int ret;

	ret = ida_alloc_max(&vfio.device_ida, MINORMASK, GFP_KERNEL);
	if (ret < 0) {
		dev_dbg(dev, "Failed to allocate device index\n");
		return ret;
	}

	device->index = ret;
	init_completion(&device->comp);
	device->dev = dev;
	device->ops = ops;

	if (ops->init) {
		ret = ops->init(device);
		if (ret)
			goto out_uninit;
	}

	device_initialize(&device->device);
	device->device.release = vfio_device_release;
	device->device.class = vfio.device_class;
	device->device.parent = device->dev;
	return 0;

out_uninit:
	vfio_release_device_set(device);
	ida_free(&vfio.device_ida, device->index);
	return ret;
}

static int __vfio_register_dev(struct vfio_device *device,
			       enum vfio_group_type type)
{
	int ret;

	if (WARN_ON(IS_ENABLED(CONFIG_IOMMUFD) &&
		    (!device->ops->bind_iommufd ||
		     !device->ops->unbind_iommufd ||
		     !device->ops->attach_ioas ||
		     !device->ops->detach_ioas)))
		return -EINVAL;

	/*
	 * If the driver doesn't specify a set then the device is added to a
	 * singleton set just for itself.
	 */
	if (!device->dev_set)
		vfio_assign_device_set(device, device);

	ret = dev_set_name(&device->device, "vfio%d", device->index);
	if (ret)
		return ret;

	ret = vfio_device_set_group(device, type);
	if (ret)
		return ret;

	/*
	 * VFIO always sets IOMMU_CACHE because we offer no way for userspace to
	 * restore cache coherency. It has to be checked here because it is only
	 * valid for cases where we are using iommu groups.
	 */
	if (type == VFIO_IOMMU && !vfio_device_is_noiommu(device) &&
	    !device_iommu_capable(device->dev, IOMMU_CAP_CACHE_COHERENCY)) {
		ret = -EINVAL;
		goto err_out;
	}

	ret = vfio_device_add(device);
	if (ret)
		goto err_out;

	/* Refcounting can't start until the driver calls register */
	refcount_set(&device->refcount, 1);

	vfio_device_group_register(device);
	vfio_device_debugfs_init(device);

	return 0;
err_out:
	vfio_device_remove_group(device);
	return ret;
}

int vfio_register_group_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_group_dev);

/*
 * Register a virtual device without IOMMU backing. The user of this
 * device must not be able to directly trigger unmediated DMA.
 */
int vfio_register_emulated_iommu_dev(struct vfio_device *device)
{
	return __vfio_register_dev(device, VFIO_EMULATED_IOMMU);
}
EXPORT_SYMBOL_GPL(vfio_register_emulated_iommu_dev);

/*
 * Decrement the device reference count and wait for the device to be
 * removed. Open file descriptors for the device... */
void vfio_unregister_group_dev(struct vfio_device *device)
{
	unsigned int i = 0;
	bool interrupted = false;
	long rc;

	/*
	 * Prevent new device opened by userspace via the
	 * VFIO_GROUP_GET_DEVICE_FD in the group path.
	 */
	vfio_device_group_unregister(device);

	/*
	 * Balances vfio_device_add() in register path, also prevents
	 * new device opened by userspace in the cdev path.
	 */
	vfio_device_del(device);

	vfio_device_put_registration(device);
	rc = try_wait_for_completion(&device->comp);
	while (rc <= 0) {
		if (device->ops->request)
			device->ops->request(device, i++);

		if (interrupted) {
			rc = wait_for_completion_timeout(&device->comp,
							 HZ * 10);
		} else {
			rc = wait_for_completion_interruptible_timeout(
				&device->comp, HZ * 10);
			if (rc < 0) {
				interrupted = true;
				dev_warn(device->dev,
					 "Device is currently in use, task \"%s\" (%d) blocked until device is released",
					 current->comm, task_pid_nr(current));
			}
		}
	}

	vfio_device_debugfs_exit(device);
	/* Balances vfio_device_set_group in register path */
	vfio_device_remove_group(device);
}
EXPORT_SYMBOL_GPL(vfio_unregister_group_dev);
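
/*
 * Illustrative sketch (hypothetical driver remove path, reusing the
 * struct my_device example above): unregistration pairs with the
 * registration call and is followed by dropping the allocation
 * reference:
 *
 *	static void my_remove(struct my_device *my)
 *	{
 *		vfio_unregister_group_dev(&my->vdev);	// may block on users
 *		vfio_put_device(&my->vdev);
 *	}
 */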

#if IS_ENABLED(CONFIG_KVM)
void vfio_device_get_kvm_safe(struct vfio_device *device, struct kvm *kvm)
{
	void (*pfn)(struct kvm *kvm);
	bool (*fn)(struct kvm *kvm);
	bool ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!kvm)
		return;

	pfn = symbol_get(kvm_put_kvm);
	if (WARN_ON(!pfn))
		return;

	fn = symbol_get(kvm_get_kvm_safe);
	if (WARN_ON(!fn)) {
		symbol_put(kvm_put_kvm);
		return;
	}

	ret = fn(kvm);
	symbol_put(kvm_get_kvm_safe);
	if (!ret) {
		symbol_put(kvm_put_kvm);
		return;
	}

	device->put_kvm = pfn;
	device->kvm = kvm;
}

void vfio_device_put_kvm(struct vfio_device *device)
{
	lockdep_assert_held(&device->dev_set->lock);

	if (!device->kvm)
		return;

	if (WARN_ON(!device->put_kvm))
		goto clear;

	device->put_kvm(device->kvm);
	device->put_kvm = NULL;
	symbol_put(kvm_put_kvm);

clear:
	device->kvm = NULL;
}
#endif

/* true if the vfio_device has open_device() called but not close_device() */
static bool vfio_assert_device_open(struct vfio_device *device)
{
	return !WARN_ON_ONCE(!READ_ONCE(device->open_count));
}

struct vfio_device_file *
vfio_allocate_device_file(struct vfio_device *device)
{
	struct vfio_device_file *df;

	df = kzalloc(sizeof(*df), GFP_KERNEL_ACCOUNT);
	if (!df)
		return ERR_PTR(-ENOMEM);

	df->device = device;
	spin_lock_init(&df->kvm_ref_lock);

	return df;
}

static int vfio_df_device_first_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;
	int ret;

	lockdep_assert_held(&device->dev_set->lock);

	if (!try_module_get(device->dev->driver->owner))
		return -ENODEV;

	if (iommufd)
		ret = vfio_df_iommufd_bind(df);
	else
		ret = vfio_device_group_use_iommu(device);
	if (ret)
		goto err_module_put;

	if (device->ops->open_device) {
		ret = device->ops->open_device(device);
		if (ret)
			goto err_unuse_iommu;
	}
	return 0;

err_unuse_iommu:
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
err_module_put:
	module_put(device->dev->driver->owner);
	return ret;
}

static void vfio_df_device_last_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	struct iommufd_ctx *iommufd = df->iommufd;

	lockdep_assert_held(&device->dev_set->lock);

	if (device->ops->close_device)
		device->ops->close_device(device);
	if (iommufd)
		vfio_df_iommufd_unbind(df);
	else
		vfio_device_group_unuse_iommu(device);
	module_put(device->dev->driver->owner);
}

int vfio_df_open(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;
	int ret = 0;

	lockdep_assert_held(&device->dev_set->lock);

	/*
	 * Only the group path allows the device to be opened multiple
	 * times. The device cdev path doesn't have a secure way for it.
	 */
	if (device->open_count != 0 && !df->group)
		return -EINVAL;

	device->open_count++;
	if (device->open_count == 1) {
		ret = vfio_df_device_first_open(df);
		if (ret)
			device->open_count--;
	}

	return ret;
}

void vfio_df_close(struct vfio_device_file *df)
{
	struct vfio_device *device = df->device;

	lockdep_assert_held(&device->dev_set->lock);

	vfio_assert_device_open(device);
	if (device->open_count == 1)
		vfio_df_device_last_close(df);
	device->open_count--;
}

/*
 * Wrapper around pm_runtime_resume_and_get().
 * Return error code on failure or 0 on success.
 */
static inline int vfio_device_pm_runtime_get(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm) {
		int ret;

		ret = pm_runtime_resume_and_get(dev);
		if (ret) {
			dev_info_ratelimited(dev,
				"vfio: runtime resume failed %d\n", ret);
			return -EIO;
		}
	}

	return 0;
}

/*
 * Wrapper around pm_runtime_put().
 */
static inline void vfio_device_pm_runtime_put(struct vfio_device *device)
{
	struct device *dev = device->dev;

	if (dev->driver && dev->driver->pm)
		pm_runtime_put(dev);
}

/*
 * VFIO Device fd
 */
static int vfio_device_fops_release(struct inode *inode, struct file *filep)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	if (df->group)
		vfio_df_group_close(df);
	else
		vfio_df_unbind_iommufd(df);

	vfio_device_put_registration(device);

	kfree(df);

	return 0;
}

/*
 * vfio_mig_get_next_state - Compute the next step in the FSM
 * @cur_fsm - The current state the device is in
 * @new_fsm - The target state to reach
 * @next_fsm - Pointer to the next step to get to new_fsm
 *
 * Return 0 upon success, otherwise -errno
 * Upon success the next step in the state progression between cur_fsm and
 * new_fsm will be set in next_fsm.
 *
 * This breaks down requests for combination transitions into smaller steps and
 * returns the next step to get to new_fsm. The function may need to be called
 * multiple times before reaching new_fsm.
 */
int vfio_mig_get_next_state(struct vfio_device *device,
			    enum vfio_device_mig_state cur_fsm,
			    enum vfio_device_mig_state new_fsm,
			    enum vfio_device_mig_state *next_fsm)
{
	enum { VFIO_DEVICE_NUM_STATES = VFIO_DEVICE_STATE_PRE_COPY_P2P + 1 };
	/*
	 * The coding in this table requires the driver to implement the
	 * following FSM arcs:
	 *         RESUMING -> STOP
	 *         STOP -> RESUMING
	 *         STOP -> STOP_COPY
	 *         STOP_COPY -> STOP
	 *
	 * If P2P is supported then the driver must also implement these FSM
	 * arcs:
	 *         RUNNING -> RUNNING_P2P
	 *         RUNNING_P2P -> RUNNING
	 *         RUNNING_P2P -> STOP
	 *         STOP -> RUNNING_P2P
	 *
	 * If precopy is supported then the driver must support these additional
	 * FSM arcs:
	 *         RUNNING -> PRE_COPY
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> STOP_COPY
	 * However, if precopy and P2P are supported together then the driver
	 * must support these additional arcs beyond the P2P arcs above:
	 *         PRE_COPY -> RUNNING
	 *         PRE_COPY -> PRE_COPY_P2P
	 *         PRE_COPY_P2P -> PRE_COPY
	 *         PRE_COPY_P2P -> RUNNING_P2P
	 *         PRE_COPY_P2P -> STOP_COPY
	 *         RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> PRE_COPY_P2P
	 *
	 * Without P2P and precopy the driver must implement:
	 *         RUNNING -> STOP
	 *         STOP -> RUNNING
	 *
	 * The coding will step through multiple states for some combination
	 * transitions; if all optional features are supported, this means the
	 * following ones:
	 *         PRE_COPY -> PRE_COPY_P2P -> STOP_COPY
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP
	 *         PRE_COPY -> RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> RUNNING
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP
	 *         PRE_COPY_P2P -> RUNNING_P2P -> STOP -> RESUMING
	 *         RESUMING -> STOP -> RUNNING_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING
	 *         RESUMING -> STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RESUMING -> STOP -> STOP_COPY
	 *         RUNNING -> RUNNING_P2P -> PRE_COPY_P2P
	 *         RUNNING -> RUNNING_P2P -> STOP
	 *         RUNNING -> RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING -> RUNNING_P2P -> STOP -> STOP_COPY
	 *         RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         RUNNING_P2P -> STOP -> RESUMING
	 *         RUNNING_P2P -> STOP -> STOP_COPY
	 *         STOP -> RUNNING_P2P -> PRE_COPY_P2P
	 *         STOP -> RUNNING_P2P -> RUNNING
	 *         STOP -> RUNNING_P2P -> RUNNING -> PRE_COPY
	 *         STOP_COPY -> STOP -> RESUMING
	 *         STOP_COPY -> STOP -> RUNNING_P2P
	 *         STOP_COPY -> STOP -> RUNNING_P2P -> RUNNING
	 *
	 * The following transitions are blocked:
	 *         STOP_COPY -> PRE_COPY
	 *         STOP_COPY -> PRE_COPY_P2P
	 */
	static const u8 vfio_from_fsm_table[VFIO_DEVICE_NUM_STATES][VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_PRE_COPY,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_STOP_COPY] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP_COPY,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RESUMING] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_RESUMING,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_RUNNING_P2P] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_RUNNING,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_PRE_COPY_P2P,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_STOP,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_RUNNING_P2P,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
		[VFIO_DEVICE_STATE_ERROR] = {
			[VFIO_DEVICE_STATE_STOP] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RESUMING] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_RUNNING_P2P] = VFIO_DEVICE_STATE_ERROR,
			[VFIO_DEVICE_STATE_ERROR] = VFIO_DEVICE_STATE_ERROR,
		},
	};

	static const unsigned int state_flags_table[VFIO_DEVICE_NUM_STATES] = {
		[VFIO_DEVICE_STATE_STOP] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_PRE_COPY_P2P] = VFIO_MIGRATION_STOP_COPY |
						   VFIO_MIGRATION_P2P |
						   VFIO_MIGRATION_PRE_COPY,
		[VFIO_DEVICE_STATE_STOP_COPY] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RESUMING] = VFIO_MIGRATION_STOP_COPY,
		[VFIO_DEVICE_STATE_RUNNING_P2P] =
			VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_P2P,
		[VFIO_DEVICE_STATE_ERROR] = ~0U,
	};

	if (WARN_ON(cur_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
		    (state_flags_table[cur_fsm] & device->migration_flags) !=
			state_flags_table[cur_fsm]))
		return -EINVAL;

	if (new_fsm >= ARRAY_SIZE(vfio_from_fsm_table) ||
	    (state_flags_table[new_fsm] & device->migration_flags) !=
			state_flags_table[new_fsm])
		return -EINVAL;

	/*
	 * Arcs touching optional and unsupported states are skipped over. The
	 * driver will instead see an arc from the original state to the next
	 * logical state, as per the above comment.
	 */
	*next_fsm = vfio_from_fsm_table[cur_fsm][new_fsm];
	while ((state_flags_table[*next_fsm] & device->migration_flags) !=
			state_flags_table[*next_fsm])
		*next_fsm = vfio_from_fsm_table[*next_fsm][new_fsm];

	return (*next_fsm != VFIO_DEVICE_STATE_ERROR) ? 0 : -EINVAL;
}
EXPORT_SYMBOL_GPL(vfio_mig_get_next_state);
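
/*
 * Illustrative sketch (hypothetical driver, my_do_arc() is an assumed
 * helper): a driver's migration_set_state op typically walks the FSM one
 * arc at a time until the target state is reached:
 *
 *	enum vfio_device_mig_state next;
 *	int ret;
 *
 *	while (cur != new) {
 *		ret = vfio_mig_get_next_state(vdev, cur, new, &next);
 *		if (ret)
 *			return ERR_PTR(ret);
 *		ret = my_do_arc(vdev, cur, next);	// one supported arc
 *		if (ret)
 *			return ERR_PTR(ret);
 *		cur = next;
 *	}
 */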

/*
 * Convert the driver's struct file into a FD number and return it to userspace
 */
static int vfio_ioct_mig_return_fd(struct file *filp, void __user *arg,
				   struct vfio_device_feature_mig_state *mig)
{
	int ret;
	int fd;

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0) {
		ret = fd;
		goto out_fput;
	}

	mig->data_fd = fd;
	if (copy_to_user(arg, mig, sizeof(*mig))) {
		ret = -EFAULT;
		goto out_put_unused;
	}
	fd_install(fd, filp);
	return 0;

out_put_unused:
	put_unused_fd(fd);
out_fput:
	fput(filp);
	return ret;
}

static int
vfio_ioctl_device_feature_mig_device_state(struct vfio_device *device,
					   u32 flags, void __user *arg,
					   size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_mig_state, data_fd);
	struct vfio_device_feature_mig_state mig;
	struct file *filp = NULL;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET |
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;

	if (copy_from_user(&mig, arg, minsz))
		return -EFAULT;

	if (flags & VFIO_DEVICE_FEATURE_GET) {
		enum vfio_device_mig_state curr_state;

		ret = device->mig_ops->migration_get_state(device,
							   &curr_state);
		if (ret)
			return ret;
		mig.device_state = curr_state;
		goto out_copy;
	}

	/* Handle the VFIO_DEVICE_FEATURE_SET */
	filp = device->mig_ops->migration_set_state(device, mig.device_state);
	if (IS_ERR(filp) || !filp)
		goto out_copy;

	return vfio_ioct_mig_return_fd(filp, arg, &mig);
out_copy:
	mig.data_fd = -1;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	if (IS_ERR(filp))
		return PTR_ERR(filp);
	return 0;
}

static int
vfio_ioctl_device_feature_migration_data_size(struct vfio_device *device,
					      u32 flags, void __user *arg,
					      size_t argsz)
{
	struct vfio_device_feature_mig_data_size data_size = {};
	unsigned long stop_copy_length;
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(data_size));
	if (ret != 1)
		return ret;

	ret = device->mig_ops->migration_get_data_size(device, &stop_copy_length);
	if (ret)
		return ret;

	data_size.stop_copy_length = stop_copy_length;
	if (copy_to_user(arg, &data_size, sizeof(data_size)))
		return -EFAULT;

	return 0;
}

static int vfio_ioctl_device_feature_migration(struct vfio_device *device,
					       u32 flags, void __user *arg,
					       size_t argsz)
{
	struct vfio_device_feature_migration mig = {
		.flags = device->migration_flags,
	};
	int ret;

	if (!device->mig_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz, VFIO_DEVICE_FEATURE_GET,
				 sizeof(mig));
	if (ret != 1)
		return ret;
	if (copy_to_user(arg, &mig, sizeof(mig)))
		return -EFAULT;
	return 0;
}

void vfio_combine_iova_ranges(struct rb_root_cached *root, u32 cur_nodes,
			      u32 req_nodes)
{
	struct interval_tree_node *prev, *curr, *comb_start, *comb_end;
	unsigned long min_gap, curr_gap;

	/* Special shortcut when a single range is required */
	if (req_nodes == 1) {
		unsigned long last;

		comb_start = interval_tree_iter_first(root, 0, ULONG_MAX);

		/* Empty list */
		if (WARN_ON_ONCE(!comb_start))
			return;

		curr = comb_start;
		while (curr) {
			last = curr->last;
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
			if (prev != comb_start)
				interval_tree_remove(prev, root);
		}
		comb_start->last = last;
		return;
	}

	/* Combine ranges which have the smallest gap */
	while (cur_nodes > req_nodes) {
		prev = NULL;
		min_gap = ULONG_MAX;
		curr = interval_tree_iter_first(root, 0, ULONG_MAX);
		while (curr) {
			if (prev) {
				curr_gap = curr->start - prev->last;
				if (curr_gap < min_gap) {
					min_gap = curr_gap;
					comb_start = prev;
					comb_end = curr;
				}
			}
			prev = curr;
			curr = interval_tree_iter_next(curr, 0, ULONG_MAX);
		}

		/* Empty list or no nodes to combine */
		if (WARN_ON_ONCE(min_gap == ULONG_MAX))
			break;

		comb_start->last = comb_end->last;
		interval_tree_remove(comb_end, root);
		cur_nodes--;
	}
}
EXPORT_SYMBOL_GPL(vfio_combine_iova_ranges);
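
/*
 * Illustrative sketch (hypothetical caller; my_hw_max_ranges is an
 * assumed device limit): a driver whose hardware tracks at most N dirty
 * ranges can shrink a user-built interval tree to fit:
 *
 *	if (cur_nodes > my_hw_max_ranges)
 *		vfio_combine_iova_ranges(root, cur_nodes, my_hw_max_ranges);
 */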

/* Ranges should fit into a single kernel page */
#define LOG_MAX_RANGES \
	(PAGE_SIZE / sizeof(struct vfio_device_feature_dma_logging_range))

static int
vfio_ioctl_device_feature_logging_start(struct vfio_device *device,
					u32 flags, void __user *arg,
					size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_control,
			    ranges);
	struct vfio_device_feature_dma_logging_range __user *ranges;
	struct vfio_device_feature_dma_logging_control control;
	struct vfio_device_feature_dma_logging_range range;
	struct rb_root_cached root = RB_ROOT_CACHED;
	struct interval_tree_node *nodes;
	u64 iova_end;
	u32 nnodes;
	int i, ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET,
				 sizeof(control));
	if (ret != 1)
		return ret;

	if (copy_from_user(&control, arg, minsz))
		return -EFAULT;

	nnodes = control.num_ranges;
	if (!nnodes)
		return -EINVAL;

	if (nnodes > LOG_MAX_RANGES)
		return -E2BIG;

	ranges = u64_to_user_ptr(control.ranges);
	nodes = kmalloc_array(nnodes, sizeof(struct interval_tree_node),
			      GFP_KERNEL);
	if (!nodes)
		return -ENOMEM;

	for (i = 0; i < nnodes; i++) {
		if (copy_from_user(&range, &ranges[i], sizeof(range))) {
			ret = -EFAULT;
			goto end;
		}
		if (!IS_ALIGNED(range.iova, control.page_size) ||
		    !IS_ALIGNED(range.length, control.page_size)) {
			ret = -EINVAL;
			goto end;
		}

		if (check_add_overflow(range.iova, range.length, &iova_end) ||
		    iova_end > ULONG_MAX) {
			ret = -EOVERFLOW;
			goto end;
		}

		nodes[i].start = range.iova;
		nodes[i].last = range.iova + range.length - 1;
		if (interval_tree_iter_first(&root, nodes[i].start,
					     nodes[i].last)) {
			/* Range overlapping */
			ret = -EINVAL;
			goto end;
		}
		interval_tree_insert(nodes + i, &root);
	}

	ret = device->log_ops->log_start(device, &root, nnodes,
					 &control.page_size);
	if (ret)
		goto end;

	if (copy_to_user(arg, &control, sizeof(control))) {
		ret = -EFAULT;
		device->log_ops->log_stop(device);
	}

end:
	kfree(nodes);
	return ret;
}

static int
vfio_ioctl_device_feature_logging_stop(struct vfio_device *device,
				       u32 flags, void __user *arg,
				       size_t argsz)
{
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_SET, 0);
	if (ret != 1)
		return ret;

	return device->log_ops->log_stop(device);
}

static int vfio_device_log_read_and_clear(struct iova_bitmap *iter,
					  unsigned long iova, size_t length,
					  void *opaque)
{
	struct vfio_device *device = opaque;

	return device->log_ops->log_read_and_clear(device, iova, length, iter);
}

static int
vfio_ioctl_device_feature_logging_report(struct vfio_device *device,
					 u32 flags, void __user *arg,
					 size_t argsz)
{
	size_t minsz =
		offsetofend(struct vfio_device_feature_dma_logging_report,
			    bitmap);
	struct vfio_device_feature_dma_logging_report report;
	struct iova_bitmap *iter;
	u64 iova_end;
	int ret;

	if (!device->log_ops)
		return -ENOTTY;

	ret = vfio_check_feature(flags, argsz,
				 VFIO_DEVICE_FEATURE_GET,
				 sizeof(report));
	if (ret != 1)
		return ret;

	if (copy_from_user(&report, arg, minsz))
		return -EFAULT;

	if (report.page_size < SZ_4K || !is_power_of_2(report.page_size))
		return -EINVAL;

	if (check_add_overflow(report.iova, report.length, &iova_end) ||
	    iova_end > ULONG_MAX)
		return -EOVERFLOW;

	iter = iova_bitmap_alloc(report.iova, report.length,
				 report.page_size,
				 u64_to_user_ptr(report.bitmap));
	if (IS_ERR(iter))
		return PTR_ERR(iter);

	ret = iova_bitmap_for_each(iter, device,
				   vfio_device_log_read_and_clear);

	iova_bitmap_free(iter);
	return ret;
}

static int vfio_ioctl_device_feature(struct vfio_device *device,
				     struct vfio_device_feature __user *arg)
{
	size_t minsz = offsetofend(struct vfio_device_feature, flags);
	struct vfio_device_feature feature;

	if (copy_from_user(&feature, arg, minsz))
		return -EFAULT;

	if (feature.argsz < minsz)
		return -EINVAL;

	/* Check unknown flags */
	if (feature.flags &
	    ~(VFIO_DEVICE_FEATURE_MASK | VFIO_DEVICE_FEATURE_SET |
	      VFIO_DEVICE_FEATURE_GET | VFIO_DEVICE_FEATURE_PROBE))
		return -EINVAL;

	/* GET & SET are mutually exclusive except with PROBE */
	if (!(feature.flags & VFIO_DEVICE_FEATURE_PROBE) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_SET) &&
	    (feature.flags & VFIO_DEVICE_FEATURE_GET))
		return -EINVAL;

	switch (feature.flags & VFIO_DEVICE_FEATURE_MASK) {
	case VFIO_DEVICE_FEATURE_MIGRATION:
		return vfio_ioctl_device_feature_migration(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DEVICE_STATE:
		return vfio_ioctl_device_feature_mig_device_state(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_START:
		return vfio_ioctl_device_feature_logging_start(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_STOP:
		return vfio_ioctl_device_feature_logging_stop(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_DMA_LOGGING_REPORT:
		return vfio_ioctl_device_feature_logging_report(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	case VFIO_DEVICE_FEATURE_MIG_DATA_SIZE:
		return vfio_ioctl_device_feature_migration_data_size(
			device, feature.flags, arg->data,
			feature.argsz - minsz);
	default:
		if (unlikely(!device->ops->device_feature))
			return -EINVAL;
		return device->ops->device_feature(device, feature.flags,
						   arg->data,
						   feature.argsz - minsz);
	}
}
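
/*
 * Illustrative sketch (hypothetical userspace caller of this ioctl,
 * assuming a valid device_fd): the feature header and payload travel in
 * one buffer, with argsz and flags validated as above:
 *
 *	struct {
 *		struct vfio_device_feature hdr;
 *		struct vfio_device_feature_migration mig;
 *	} buf = {
 *		.hdr.argsz = sizeof(buf),
 *		.hdr.flags = VFIO_DEVICE_FEATURE_GET |
 *			     VFIO_DEVICE_FEATURE_MIGRATION,
 *	};
 *
 *	if (ioctl(device_fd, VFIO_DEVICE_FEATURE, &buf) == 0)
 *		printf("migration flags: 0x%llx\n", buf.mig.flags);
 */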

static long vfio_device_fops_unl_ioctl(struct file *filep,
				       unsigned int cmd, unsigned long arg)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;
	void __user *uptr = (void __user *)arg;
	int ret;

	if (cmd == VFIO_DEVICE_BIND_IOMMUFD)
		return vfio_df_ioctl_bind_iommufd(df, uptr);

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	ret = vfio_device_pm_runtime_get(device);
	if (ret)
		return ret;

	/* cdev only ioctls */
	if (IS_ENABLED(CONFIG_VFIO_DEVICE_CDEV) && !df->group) {
		switch (cmd) {
		case VFIO_DEVICE_ATTACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_attach_pt(df, uptr);
			goto out;

		case VFIO_DEVICE_DETACH_IOMMUFD_PT:
			ret = vfio_df_ioctl_detach_pt(df, uptr);
			goto out;
		}
	}

	switch (cmd) {
	case VFIO_DEVICE_FEATURE:
		ret = vfio_ioctl_device_feature(device, uptr);
		break;

	default:
		if (unlikely(!device->ops->ioctl))
			ret = -EINVAL;
		else
			ret = device->ops->ioctl(device, cmd, arg);
		break;
	}
out:
	vfio_device_pm_runtime_put(device);
	return ret;
}

static ssize_t vfio_device_fops_read(struct file *filep, char __user *buf,
				     size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->read))
		return -EINVAL;

	return device->ops->read(device, buf, count, ppos);
}

static ssize_t vfio_device_fops_write(struct file *filep,
				      const char __user *buf,
				      size_t count, loff_t *ppos)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->write))
		return -EINVAL;

	return device->ops->write(device, buf, count, ppos);
}

static int vfio_device_fops_mmap(struct file *filep, struct vm_area_struct *vma)
{
	struct vfio_device_file *df = filep->private_data;
	struct vfio_device *device = df->device;

	/* Paired with smp_store_release() following vfio_df_open() */
	if (!smp_load_acquire(&df->access_granted))
		return -EINVAL;

	if (unlikely(!device->ops->mmap))
		return -EINVAL;

	return device->ops->mmap(device, vma);
}

const struct file_operations vfio_device_fops = {
	.owner = THIS_MODULE,
	.open = vfio_device_fops_cdev_open,
	.release = vfio_device_fops_release,
	.read = vfio_device_fops_read,
	.write = vfio_device_fops_write,
	.unlocked_ioctl = vfio_device_fops_unl_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.mmap = vfio_device_fops_mmap,
};

static struct vfio_device *vfio_device_from_file(struct file *file)
{
	struct vfio_device_file *df = file->private_data;

	if (file->f_op != &vfio_device_fops)
		return NULL;
	return df->device;
}

/**
 * vfio_file_is_valid - True if the file is a valid vfio file
 * @file: VFIO group file or VFIO device file
 */
bool vfio_file_is_valid(struct file *file)
{
	return vfio_group_from_file(file) ||
	       vfio_device_from_file(file);
}
EXPORT_SYMBOL_GPL(vfio_file_is_valid);

/**
 * vfio_file_enforced_coherent - True if the DMA associated with the VFIO file
 *        is always CPU cache coherent
 * @file: VFIO group file or VFIO device file
 *
 * Enforced coherency means that the IOMMU ignores things like the PCIe no-snoop
 * bit in DMA transactions. A return of false indicates that the user has
 * rights to access additional instructions such as wbinvd on x86.
 */
bool vfio_file_enforced_coherent(struct file *file)
{
	struct vfio_device *device;
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		return vfio_group_enforced_coherent(group);

	device = vfio_device_from_file(file);
	if (device)
		return device_iommu_capable(device->dev,
					    IOMMU_CAP_ENFORCE_CACHE_COHERENCY);

	return true;
}
EXPORT_SYMBOL_GPL(vfio_file_enforced_coherent);

static void vfio_device_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_device_file *df = file->private_data;

	/*
	 * The kvm is first recorded in the vfio_device_file, and will
	 * be propagated to vfio_device::kvm when the file is bound to
	 * iommufd successfully in the vfio device cdev path.
	 */
	spin_lock(&df->kvm_ref_lock);
	df->kvm = kvm;
	spin_unlock(&df->kvm_ref_lock);
}

/**
 * vfio_file_set_kvm - Link a kvm with VFIO drivers
 * @file: VFIO group file or VFIO device file
 * @kvm: KVM to link
 *
 * When a VFIO device is first opened the KVM will be available in
 * device->kvm if one was associated with the file.
 */
void vfio_file_set_kvm(struct file *file, struct kvm *kvm)
{
	struct vfio_group *group;

	group = vfio_group_from_file(file);
	if (group)
		vfio_group_set_kvm(group, kvm);

	if (vfio_device_from_file(file))
		vfio_device_file_set_kvm(file, kvm);
}
EXPORT_SYMBOL_GPL(vfio_file_set_kvm);

/*
 * Sub-module support
 */
/*
 * Helper for managing a buffer of info chain capabilities, allocate or
 * reallocate a buffer with additional @size, filling in @id and @version
 * of the capability. A pointer to the new capability is returned.
 *
 * NB. The chain is based at the head of the buffer, so new entries are
 * added to the tail, vfio_info_cap_shift() should be called to fixup the
 * next offsets prior to copying to the user buffer.
 */
struct vfio_info_cap_header *vfio_info_cap_add(struct vfio_info_cap *caps,
					       size_t size, u16 id, u16 version)
{
	void *buf;
	struct vfio_info_cap_header *header, *tmp;

	/* Ensure that the next capability struct will be aligned */
	size = ALIGN(size, sizeof(u64));

	buf = krealloc(caps->buf, caps->size + size, GFP_KERNEL);
	if (!buf) {
		kfree(caps->buf);
		caps->buf = NULL;
		caps->size = 0;
		return ERR_PTR(-ENOMEM);
	}

	caps->buf = buf;
	header = buf + caps->size;

	/* Eventually copied to user buffer, zero */
	memset(header, 0, size);

	header->id = id;
	header->version = version;

	/* Add to the end of the capability chain */
	for (tmp = buf; tmp->next; tmp = buf + tmp->next)
		; /* nothing */

	tmp->next = caps->size;
	caps->size += size;

	return header;
}
EXPORT_SYMBOL_GPL(vfio_info_cap_add);

void vfio_info_cap_shift(struct vfio_info_cap *caps, size_t offset)
{
	struct vfio_info_cap_header *tmp;
	void *buf = (void *)caps->buf;

	/* Capability structs should start with proper alignment */
	WARN_ON(!IS_ALIGNED(offset, sizeof(u64)));

	for (tmp = buf; tmp->next; tmp = buf + tmp->next - offset)
		tmp->next += offset;
}
EXPORT_SYMBOL(vfio_info_cap_shift);
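
/*
 * Illustrative sketch (hypothetical ioctl handler, following the pattern
 * described in the NB above; 'info', 'cap' and 'arg' are assumed locals):
 * build the chain, then shift the offsets past the fixed-size info struct
 * before copying the chain out behind it:
 *
 *	struct vfio_info_cap caps = { .buf = NULL, .size = 0 };
 *
 *	ret = vfio_info_add_capability(&caps, &cap.header, sizeof(cap));
 *
 *	if (caps.size && info.argsz >= sizeof(info) + caps.size) {
 *		info.flags |= VFIO_REGION_INFO_FLAG_CAPS;
 *		vfio_info_cap_shift(&caps, sizeof(info));
 *		if (copy_to_user(arg + sizeof(info), caps.buf, caps.size))
 *			ret = -EFAULT;
 *		info.cap_offset = sizeof(info);
 *	}
 *	kfree(caps.buf);
 */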

int vfio_info_add_capability(struct vfio_info_cap *caps,
			     struct vfio_info_cap_header *cap, size_t size)
{
	struct vfio_info_cap_header *header;

	header = vfio_info_cap_add(caps, size, cap->id, cap->version);
	if (IS_ERR(header))
		return PTR_ERR(header);

	memcpy(header + 1, cap + 1, size - sizeof(*header));

	return 0;
}
EXPORT_SYMBOL(vfio_info_add_capability);

int vfio_set_irqs_validate_and_prepare(struct vfio_irq_set *hdr, int num_irqs,
				       int max_irq_type, size_t *data_size)
{
	unsigned long minsz;
	size_t size;

	minsz = offsetofend(struct vfio_irq_set, count);

	if ((hdr->argsz < minsz) || (hdr->index >= max_irq_type) ||
	    (hdr->count >= (U32_MAX - hdr->start)) ||
	    (hdr->flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
			    VFIO_IRQ_SET_ACTION_TYPE_MASK)))
		return -EINVAL;

	if (data_size)
		*data_size = 0;

	if (hdr->start >= num_irqs || hdr->start + hdr->count > num_irqs)
		return -EINVAL;

	switch (hdr->flags & VFIO_IRQ_SET_DATA_TYPE_MASK) {
	case VFIO_IRQ_SET_DATA_NONE:
		size = 0;
		break;
	case VFIO_IRQ_SET_DATA_BOOL:
		size = sizeof(uint8_t);
		break;
	case VFIO_IRQ_SET_DATA_EVENTFD:
		size = sizeof(int32_t);
		break;
	default:
		return -EINVAL;
	}

	if (size) {
		if (hdr->argsz - minsz < hdr->count * size)
			return -EINVAL;

		if (!data_size)
			return -EINVAL;

		*data_size = hdr->count * size;
	}

	return 0;
}
EXPORT_SYMBOL(vfio_set_irqs_validate_and_prepare);
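
/*
 * Illustrative sketch (hypothetical VFIO_DEVICE_SET_IRQS handler; 'arg',
 * 'num_irqs' and the error handling around it are assumed context): the
 * helper validates the header and tells the caller how many payload
 * bytes to pull in from userspace:
 *
 *	struct vfio_irq_set hdr;
 *	size_t data_size = 0;
 *	u8 *data = NULL;
 *
 *	if (copy_from_user(&hdr, (void __user *)arg, sizeof(hdr)))
 *		return -EFAULT;
 *	ret = vfio_set_irqs_validate_and_prepare(&hdr, num_irqs,
 *						 VFIO_PCI_NUM_IRQS,
 *						 &data_size);
 *	if (ret)
 *		return ret;
 *	if (data_size) {
 *		data = memdup_user((void __user *)(arg + sizeof(hdr)),
 *				   data_size);
 *		if (IS_ERR(data))
 *			return PTR_ERR(data);
 *	}
 */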

/*
 * Pin contiguous user pages and return their associated host pages for local
 * domain only.
 * @device [in]  : device
 * @iova [in]    : starting IOVA of user pages to be pinned.
 * @npage [in]   : count of pages to be pinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 * @prot [in]    : protection flags
 * @pages[out]   : array of host pages
 * Return error or number of pages pinned.
 *
 * A driver may only call this function if the vfio_device was created
 * by vfio_register_emulated_iommu_dev() due to vfio_device_container_pin_pages().
 */
int vfio_pin_pages(struct vfio_device *device, dma_addr_t iova,
		   int npage, int prot, struct page **pages)
{
	/* group->container cannot change while a vfio device is open */
	if (!pages || !npage || WARN_ON(!vfio_assert_device_open(device)))
		return -EINVAL;
	if (!device->ops->dma_unmap)
		return -EINVAL;
	if (vfio_device_has_container(device))
		return vfio_device_container_pin_pages(device, iova,
						       npage, prot, pages);
	if (device->iommufd_access) {
		int ret;

		if (iova > ULONG_MAX)
			return -EINVAL;
		/*
		 * VFIO ignores the sub page offset, npages is from the start of
		 * a PAGE_SIZE chunk of IOVA. The caller is expected to recover
		 * the sub page offset by doing:
		 * pages[0] + (iova % PAGE_SIZE)
		 */
		ret = iommufd_access_pin_pages(
			device->iommufd_access, ALIGN_DOWN(iova, PAGE_SIZE),
			npage * PAGE_SIZE, pages,
			(prot & IOMMU_WRITE) ? IOMMUFD_ACCESS_RW_WRITE : 0);
		if (ret)
			return ret;
		return npage;
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_pin_pages);
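
/*
 * Illustrative sketch (hypothetical emulated-IOMMU caller; 'buf' and
 * 'len' are assumed, with len kept within the first page): pin one page
 * and recover the sub-page offset as described above:
 *
 *	struct page *pg;
 *	void *va;
 *	int ret;
 *
 *	ret = vfio_pin_pages(vdev, iova, 1, IOMMU_READ | IOMMU_WRITE, &pg);
 *	if (ret != 1)
 *		return ret < 0 ? ret : -EFAULT;
 *	va = kmap_local_page(pg);
 *	memcpy(buf, va + offset_in_page(iova), len);
 *	kunmap_local(va);
 *	vfio_unpin_pages(vdev, iova, 1);
 */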

/*
 * Unpin contiguous host pages for local domain only.
 * @device [in]  : device
 * @iova [in]    : starting address of user pages to be unpinned.
 * @npage [in]   : count of pages to be unpinned. This count should not
 *		   be greater than VFIO_PIN_PAGES_MAX_ENTRIES.
 */
void vfio_unpin_pages(struct vfio_device *device, dma_addr_t iova, int npage)
{
	if (WARN_ON(!vfio_assert_device_open(device)))
		return;
	if (WARN_ON(!device->ops->dma_unmap))
		return;

	if (vfio_device_has_container(device)) {
		vfio_device_container_unpin_pages(device, iova, npage);
		return;
	}
	if (device->iommufd_access) {
		if (WARN_ON(iova > ULONG_MAX))
			return;
		iommufd_access_unpin_pages(device->iommufd_access,
					   ALIGN_DOWN(iova, PAGE_SIZE),
					   npage * PAGE_SIZE);
		return;
	}
}
EXPORT_SYMBOL(vfio_unpin_pages);

/*
 * This interface allows the CPUs to perform some sort of virtual DMA on
 * behalf of the device.
 *
 * CPUs read/write from/into a range of IOVAs pointing to user space memory
 * into/from a kernel buffer.
 *
 * As the read/write of user space memory is conducted via the CPUs and is
 * not a real device DMA, it is not necessary to pin the user space memory.
 *
 * @device [in]	: VFIO device
 * @iova [in]	: base IOVA of a user space buffer
 * @data [in]	: pointer to kernel buffer
 * @len [in]	: kernel buffer length
 * @write	: indicate read or write
 * Return error code on failure or 0 on success.
 */
int vfio_dma_rw(struct vfio_device *device, dma_addr_t iova, void *data,
		size_t len, bool write)
{
	if (!data || len <= 0 || !vfio_assert_device_open(device))
		return -EINVAL;

	if (vfio_device_has_container(device))
		return vfio_device_container_dma_rw(device, iova,
						    data, len, write);

	if (device->iommufd_access) {
		unsigned int flags = 0;

		if (iova > ULONG_MAX)
			return -EINVAL;

		/* VFIO historically tries to auto-detect a kthread */
		if (!current->mm)
			flags |= IOMMUFD_ACCESS_RW_KTHREAD;
		if (write)
			flags |= IOMMUFD_ACCESS_RW_WRITE;
		return iommufd_access_rw(device->iommufd_access, iova, data,
					 len, flags);
	}
	return -EINVAL;
}
EXPORT_SYMBOL(vfio_dma_rw);
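
/*
 * Illustrative sketch (hypothetical caller; struct my_desc and desc_iova
 * are assumed names): read a guest-resident descriptor into a kernel
 * buffer without pinning:
 *
 *	struct my_desc desc;
 *	int ret;
 *
 *	ret = vfio_dma_rw(vdev, desc_iova, &desc, sizeof(desc), false);
 *	if (ret)
 *		return ret;
 */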

/*
 * Module/class support
 */
static int __init vfio_init(void)
{
	int ret;

	ida_init(&vfio.device_ida);

	ret = vfio_group_init();
	if (ret)
		return ret;

	ret = vfio_virqfd_init();
	if (ret)
		goto err_virqfd;

	/* /sys/class/vfio-dev/vfioX */
	vfio.device_class = class_create("vfio-dev");
	if (IS_ERR(vfio.device_class)) {
		ret = PTR_ERR(vfio.device_class);
		goto err_dev_class;
	}

	ret = vfio_cdev_init(vfio.device_class);
	if (ret)
		goto err_alloc_dev_chrdev;

	vfio_debugfs_create_root();
	pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
	return 0;

err_alloc_dev_chrdev:
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
err_dev_class:
	vfio_virqfd_exit();
err_virqfd:
	vfio_group_cleanup();
	return ret;
}

static void __exit vfio_cleanup(void)
{
	vfio_debugfs_remove_root();
	ida_destroy(&vfio.device_ida);
	vfio_cdev_cleanup();
	class_destroy(vfio.device_class);
	vfio.device_class = NULL;
	vfio_virqfd_exit();
	vfio_group_cleanup();
	xa_destroy(&vfio_device_set_xa);
}

module_init(vfio_init);
module_exit(vfio_cleanup);

MODULE_IMPORT_NS(IOMMUFD);
MODULE_VERSION(DRIVER_VERSION);
MODULE_LICENSE("GPL v2");
MODULE_AUTHOR(DRIVER_AUTHOR);
MODULE_DESCRIPTION(DRIVER_DESC);
MODULE_SOFTDEP("post: vfio_iommu_type1 vfio_iommu_spapr_tce");
