1// SPDX-License-Identifier: GPL-2.0-only
2/*
3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4 */
5
6#include <linux/device.h>
7#include <linux/eventfd.h>
8#include <linux/file.h>
9#include <linux/interrupt.h>
10#include <linux/iommu.h>
11#include <linux/module.h>
12#include <linux/mutex.h>
13#include <linux/notifier.h>
14#include <linux/pci.h>
15#include <linux/pm_runtime.h>
16#include <linux/types.h>
17#include <linux/uaccess.h>
18#include <linux/vfio.h>
19#include <linux/sched/mm.h>
20#include <linux/anon_inodes.h>
21
22#include "cmd.h"
23
/*
 * Largest LOAD size permitted by the device specification: the all-ones
 * value of the 'size' field of load_vhca_state_in.
 */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

/* Cap applied to the per-chunk buffer size when chunk mode is in use */
#define MAX_CHUNK_SIZE SZ_8M
28
29static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
30{
31 struct vfio_pci_core_device *core_device = dev_get_drvdata(dev: &pdev->dev);
32
33 return container_of(core_device, struct mlx5vf_pci_core_device,
34 core_device);
35}
36
37struct page *
38mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
39 unsigned long offset)
40{
41 unsigned long cur_offset = 0;
42 struct scatterlist *sg;
43 unsigned int i;
44
45 /* All accesses are sequential */
46 if (offset < buf->last_offset || !buf->last_offset_sg) {
47 buf->last_offset = 0;
48 buf->last_offset_sg = buf->table.sgt.sgl;
49 buf->sg_last_entry = 0;
50 }
51
52 cur_offset = buf->last_offset;
53
54 for_each_sg(buf->last_offset_sg, sg,
55 buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
56 if (offset < sg->length + cur_offset) {
57 buf->last_offset_sg = sg;
58 buf->sg_last_entry += i;
59 buf->last_offset = cur_offset;
60 return nth_page(sg_page(sg),
61 (offset - cur_offset) / PAGE_SIZE);
62 }
63 cur_offset += sg->length;
64 }
65 return NULL;
66}
67
68static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
69{
70 mutex_lock(&migf->lock);
71 migf->state = MLX5_MIGF_STATE_ERROR;
72 migf->filp->f_pos = 0;
73 mutex_unlock(lock: &migf->lock);
74}
75
76static int mlx5vf_release_file(struct inode *inode, struct file *filp)
77{
78 struct mlx5_vf_migration_file *migf = filp->private_data;
79
80 mlx5vf_disable_fd(migf);
81 mutex_destroy(lock: &migf->lock);
82 kfree(objp: migf);
83 return 0;
84}
85
86static struct mlx5_vhca_data_buffer *
87mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
88 bool *end_of_data)
89{
90 struct mlx5_vhca_data_buffer *buf;
91 bool found = false;
92
93 *end_of_data = false;
94 spin_lock_irq(lock: &migf->list_lock);
95 if (list_empty(head: &migf->buf_list)) {
96 *end_of_data = true;
97 goto end;
98 }
99
100 buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
101 buf_elm);
102 if (pos >= buf->start_pos &&
103 pos < buf->start_pos + buf->length) {
104 found = true;
105 goto end;
106 }
107
108 /*
109 * As we use a stream based FD we may expect having the data always
110 * on first chunk
111 */
112 migf->state = MLX5_MIGF_STATE_ERROR;
113
114end:
115 spin_unlock_irq(lock: &migf->list_lock);
116 return found ? buf : NULL;
117}
118
119static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
120{
121 struct mlx5_vf_migration_file *migf = vhca_buf->migf;
122
123 if (vhca_buf->stop_copy_chunk_num) {
124 bool is_header = vhca_buf->dma_dir == DMA_NONE;
125 u8 chunk_num = vhca_buf->stop_copy_chunk_num;
126 size_t next_required_umem_size = 0;
127
128 if (is_header)
129 migf->buf_header[chunk_num - 1] = vhca_buf;
130 else
131 migf->buf[chunk_num - 1] = vhca_buf;
132
133 spin_lock_irq(lock: &migf->list_lock);
134 list_del_init(entry: &vhca_buf->buf_elm);
135 if (!is_header) {
136 next_required_umem_size =
137 migf->next_required_umem_size;
138 migf->next_required_umem_size = 0;
139 migf->num_ready_chunks--;
140 }
141 spin_unlock_irq(lock: &migf->list_lock);
142 if (next_required_umem_size)
143 mlx5vf_mig_file_set_save_work(migf, chunk_num,
144 next_required_umem_size);
145 return;
146 }
147
148 spin_lock_irq(lock: &migf->list_lock);
149 list_del_init(entry: &vhca_buf->buf_elm);
150 list_add_tail(new: &vhca_buf->buf_elm, head: &vhca_buf->migf->avail_list);
151 spin_unlock_irq(lock: &migf->list_lock);
152}
153
154static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
155 char __user **buf, size_t *len, loff_t *pos)
156{
157 unsigned long offset;
158 ssize_t done = 0;
159 size_t copy_len;
160
161 copy_len = min_t(size_t,
162 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
163 while (copy_len) {
164 size_t page_offset;
165 struct page *page;
166 size_t page_len;
167 u8 *from_buff;
168 int ret;
169
170 offset = *pos - vhca_buf->start_pos;
171 page_offset = offset % PAGE_SIZE;
172 offset -= page_offset;
173 page = mlx5vf_get_migration_page(buf: vhca_buf, offset);
174 if (!page)
175 return -EINVAL;
176 page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
177 from_buff = kmap_local_page(page);
178 ret = copy_to_user(to: *buf, from: from_buff + page_offset, n: page_len);
179 kunmap_local(from_buff);
180 if (ret)
181 return -EFAULT;
182 *pos += page_len;
183 *len -= page_len;
184 *buf += page_len;
185 done += page_len;
186 copy_len -= page_len;
187 }
188
189 if (*pos >= vhca_buf->start_pos + vhca_buf->length)
190 mlx5vf_buf_read_done(vhca_buf);
191
192 return done;
193}
194
195static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
196 loff_t *pos)
197{
198 struct mlx5_vf_migration_file *migf = filp->private_data;
199 struct mlx5_vhca_data_buffer *vhca_buf;
200 bool first_loop_call = true;
201 bool end_of_data;
202 ssize_t done = 0;
203
204 if (pos)
205 return -ESPIPE;
206 pos = &filp->f_pos;
207
208 if (!(filp->f_flags & O_NONBLOCK)) {
209 if (wait_event_interruptible(migf->poll_wait,
210 !list_empty(&migf->buf_list) ||
211 migf->state == MLX5_MIGF_STATE_ERROR ||
212 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
213 migf->state == MLX5_MIGF_STATE_PRE_COPY ||
214 migf->state == MLX5_MIGF_STATE_COMPLETE))
215 return -ERESTARTSYS;
216 }
217
218 mutex_lock(&migf->lock);
219 if (migf->state == MLX5_MIGF_STATE_ERROR) {
220 done = -ENODEV;
221 goto out_unlock;
222 }
223
224 while (len) {
225 ssize_t count;
226
227 vhca_buf = mlx5vf_get_data_buff_from_pos(migf, pos: *pos,
228 end_of_data: &end_of_data);
229 if (first_loop_call) {
230 first_loop_call = false;
231 /* Temporary end of file as part of PRE_COPY */
232 if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
233 migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
234 done = -ENOMSG;
235 goto out_unlock;
236 }
237
238 if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
239 if (filp->f_flags & O_NONBLOCK) {
240 done = -EAGAIN;
241 goto out_unlock;
242 }
243 }
244 }
245
246 if (end_of_data)
247 goto out_unlock;
248
249 if (!vhca_buf) {
250 done = -EINVAL;
251 goto out_unlock;
252 }
253
254 count = mlx5vf_buf_read(vhca_buf, buf: &buf, len: &len, pos);
255 if (count < 0) {
256 done = count;
257 goto out_unlock;
258 }
259 done += count;
260 }
261
262out_unlock:
263 mutex_unlock(lock: &migf->lock);
264 return done;
265}
266
267static __poll_t mlx5vf_save_poll(struct file *filp,
268 struct poll_table_struct *wait)
269{
270 struct mlx5_vf_migration_file *migf = filp->private_data;
271 __poll_t pollflags = 0;
272
273 poll_wait(filp, wait_address: &migf->poll_wait, p: wait);
274
275 mutex_lock(&migf->lock);
276 if (migf->state == MLX5_MIGF_STATE_ERROR)
277 pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
278 else if (!list_empty(head: &migf->buf_list) ||
279 migf->state == MLX5_MIGF_STATE_COMPLETE)
280 pollflags = EPOLLIN | EPOLLRDNORM;
281 mutex_unlock(lock: &migf->lock);
282
283 return pollflags;
284}
285
286/*
287 * FD is exposed and user can use it after receiving an error.
288 * Mark migf in error, and wake the user.
289 */
290static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
291{
292 migf->state = MLX5_MIGF_STATE_ERROR;
293 wake_up_interruptible(&migf->poll_wait);
294}
295
296void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
297 u8 chunk_num, size_t next_required_umem_size)
298{
299 migf->save_data[chunk_num - 1].next_required_umem_size =
300 next_required_umem_size;
301 migf->save_data[chunk_num - 1].migf = migf;
302 get_file(f: migf->filp);
303 queue_work(wq: migf->mvdev->cb_wq,
304 work: &migf->save_data[chunk_num - 1].work);
305}
306
307static struct mlx5_vhca_data_buffer *
308mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
309 u8 index, size_t required_length)
310{
311 struct mlx5_vhca_data_buffer *buf = migf->buf[index];
312 u8 chunk_num;
313
314 WARN_ON(!buf);
315 chunk_num = buf->stop_copy_chunk_num;
316 buf->migf->buf[index] = NULL;
317 /* Checking whether the pre-allocated buffer can fit */
318 if (buf->allocated_length >= required_length)
319 return buf;
320
321 mlx5vf_put_data_buffer(buf);
322 buf = mlx5vf_get_data_buffer(migf: buf->migf, length: required_length,
323 dma_dir: DMA_FROM_DEVICE);
324 if (IS_ERR(ptr: buf))
325 return buf;
326
327 buf->stop_copy_chunk_num = chunk_num;
328 return buf;
329}
330
331static void mlx5vf_mig_file_save_work(struct work_struct *_work)
332{
333 struct mlx5vf_save_work_data *save_data = container_of(_work,
334 struct mlx5vf_save_work_data, work);
335 struct mlx5_vf_migration_file *migf = save_data->migf;
336 struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
337 struct mlx5_vhca_data_buffer *buf;
338
339 mutex_lock(&mvdev->state_mutex);
340 if (migf->state == MLX5_MIGF_STATE_ERROR)
341 goto end;
342
343 buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
344 index: save_data->chunk_num - 1,
345 required_length: save_data->next_required_umem_size);
346 if (IS_ERR(ptr: buf))
347 goto err;
348
349 if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, inc: true, track: false))
350 goto err_save;
351
352 goto end;
353
354err_save:
355 mlx5vf_put_data_buffer(buf);
356err:
357 mlx5vf_mark_err(migf);
358end:
359 mlx5vf_state_mutex_unlock(mvdev);
360 fput(migf->filp);
361}
362
363static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
364 bool track)
365{
366 size_t size = sizeof(struct mlx5_vf_migration_header) +
367 sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
368 struct mlx5_vf_migration_tag_stop_copy_data data = {};
369 struct mlx5_vhca_data_buffer *header_buf = NULL;
370 struct mlx5_vf_migration_header header = {};
371 unsigned long flags;
372 struct page *page;
373 u8 *to_buff;
374 int ret;
375
376 header_buf = mlx5vf_get_data_buffer(migf, length: size, dma_dir: DMA_NONE);
377 if (IS_ERR(ptr: header_buf))
378 return PTR_ERR(ptr: header_buf);
379
380 header.record_size = cpu_to_le64(sizeof(data));
381 header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
382 header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
383 page = mlx5vf_get_migration_page(buf: header_buf, offset: 0);
384 if (!page) {
385 ret = -EINVAL;
386 goto err;
387 }
388 to_buff = kmap_local_page(page);
389 memcpy(to_buff, &header, sizeof(header));
390 header_buf->length = sizeof(header);
391 data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
392 memcpy(to_buff + sizeof(header), &data, sizeof(data));
393 header_buf->length += sizeof(data);
394 kunmap_local(to_buff);
395 header_buf->start_pos = header_buf->migf->max_pos;
396 migf->max_pos += header_buf->length;
397 spin_lock_irqsave(&migf->list_lock, flags);
398 list_add_tail(new: &header_buf->buf_elm, head: &migf->buf_list);
399 spin_unlock_irqrestore(lock: &migf->list_lock, flags);
400 if (track)
401 migf->pre_copy_initial_bytes = size;
402 return 0;
403err:
404 mlx5vf_put_data_buffer(buf: header_buf);
405 return ret;
406}
407
408static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
409 struct mlx5_vf_migration_file *migf,
410 size_t state_size, u64 full_size,
411 bool track)
412{
413 struct mlx5_vhca_data_buffer *buf;
414 size_t inc_state_size;
415 int num_chunks;
416 int ret;
417 int i;
418
419 if (mvdev->chunk_mode) {
420 size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);
421
422 /* from firmware perspective at least 'state_size' buffer should be set */
423 inc_state_size = max(state_size, chunk_size);
424 } else {
425 if (track) {
426 /* let's be ready for stop_copy size that might grow by 10 percents */
427 if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
428 inc_state_size = state_size;
429 } else {
430 inc_state_size = state_size;
431 }
432 }
433
434 /* let's not overflow the device specification max SAVE size */
435 inc_state_size = min_t(size_t, inc_state_size,
436 (BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));
437
438 num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
439 for (i = 0; i < num_chunks; i++) {
440 buf = mlx5vf_get_data_buffer(migf, length: inc_state_size, dma_dir: DMA_FROM_DEVICE);
441 if (IS_ERR(ptr: buf)) {
442 ret = PTR_ERR(ptr: buf);
443 goto err;
444 }
445
446 migf->buf[i] = buf;
447 buf = mlx5vf_get_data_buffer(migf,
448 length: sizeof(struct mlx5_vf_migration_header), dma_dir: DMA_NONE);
449 if (IS_ERR(ptr: buf)) {
450 ret = PTR_ERR(ptr: buf);
451 goto err;
452 }
453 migf->buf_header[i] = buf;
454 if (mvdev->chunk_mode) {
455 migf->buf[i]->stop_copy_chunk_num = i + 1;
456 migf->buf_header[i]->stop_copy_chunk_num = i + 1;
457 INIT_WORK(&migf->save_data[i].work,
458 mlx5vf_mig_file_save_work);
459 migf->save_data[i].chunk_num = i + 1;
460 }
461 }
462
463 ret = mlx5vf_add_stop_copy_header(migf, track);
464 if (ret)
465 goto err;
466 return 0;
467
468err:
469 for (i = 0; i < num_chunks; i++) {
470 if (migf->buf[i]) {
471 mlx5vf_put_data_buffer(buf: migf->buf[i]);
472 migf->buf[i] = NULL;
473 }
474 if (migf->buf_header[i]) {
475 mlx5vf_put_data_buffer(buf: migf->buf_header[i]);
476 migf->buf_header[i] = NULL;
477 }
478 }
479
480 return ret;
481}
482
483static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
484 unsigned long arg)
485{
486 struct mlx5_vf_migration_file *migf = filp->private_data;
487 struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
488 struct mlx5_vhca_data_buffer *buf;
489 struct vfio_precopy_info info = {};
490 loff_t *pos = &filp->f_pos;
491 unsigned long minsz;
492 size_t inc_length = 0;
493 bool end_of_data = false;
494 int ret;
495
496 if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
497 return -ENOTTY;
498
499 minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);
500
501 if (copy_from_user(to: &info, from: (void __user *)arg, n: minsz))
502 return -EFAULT;
503
504 if (info.argsz < minsz)
505 return -EINVAL;
506
507 mutex_lock(&mvdev->state_mutex);
508 if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
509 mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
510 ret = -EINVAL;
511 goto err_state_unlock;
512 }
513
514 /*
515 * We can't issue a SAVE command when the device is suspended, so as
516 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
517 * bytes that can't be read.
518 */
519 if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
520 /*
521 * Once the query returns it's guaranteed that there is no
522 * active SAVE command.
523 * As so, the other code below is safe with the proper locks.
524 */
525 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, state_size: &inc_length,
526 NULL, query_flags: MLX5VF_QUERY_INC);
527 if (ret)
528 goto err_state_unlock;
529 }
530
531 mutex_lock(&migf->lock);
532 if (migf->state == MLX5_MIGF_STATE_ERROR) {
533 ret = -ENODEV;
534 goto err_migf_unlock;
535 }
536
537 if (migf->pre_copy_initial_bytes > *pos) {
538 info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
539 } else {
540 info.dirty_bytes = migf->max_pos - *pos;
541 if (!info.dirty_bytes)
542 end_of_data = true;
543 info.dirty_bytes += inc_length;
544 }
545
546 if (!end_of_data || !inc_length) {
547 mutex_unlock(lock: &migf->lock);
548 goto done;
549 }
550
551 mutex_unlock(lock: &migf->lock);
552 /*
553 * We finished transferring the current state and the device has a
554 * dirty state, save a new state to be ready for.
555 */
556 buf = mlx5vf_get_data_buffer(migf, length: inc_length, dma_dir: DMA_FROM_DEVICE);
557 if (IS_ERR(ptr: buf)) {
558 ret = PTR_ERR(ptr: buf);
559 mlx5vf_mark_err(migf);
560 goto err_state_unlock;
561 }
562
563 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, inc: true, track: true);
564 if (ret) {
565 mlx5vf_mark_err(migf);
566 mlx5vf_put_data_buffer(buf);
567 goto err_state_unlock;
568 }
569
570done:
571 mlx5vf_state_mutex_unlock(mvdev);
572 if (copy_to_user(to: (void __user *)arg, from: &info, n: minsz))
573 return -EFAULT;
574 return 0;
575
576err_migf_unlock:
577 mutex_unlock(lock: &migf->lock);
578err_state_unlock:
579 mlx5vf_state_mutex_unlock(mvdev);
580 return ret;
581}
582
583static const struct file_operations mlx5vf_save_fops = {
584 .owner = THIS_MODULE,
585 .read = mlx5vf_save_read,
586 .poll = mlx5vf_save_poll,
587 .unlocked_ioctl = mlx5vf_precopy_ioctl,
588 .compat_ioctl = compat_ptr_ioctl,
589 .release = mlx5vf_release_file,
590 .llseek = no_llseek,
591};
592
593static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
594{
595 struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
596 struct mlx5_vhca_data_buffer *buf;
597 size_t length;
598 int ret;
599
600 if (migf->state == MLX5_MIGF_STATE_ERROR)
601 return -ENODEV;
602
603 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, state_size: &length, NULL,
604 query_flags: MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
605 if (ret)
606 goto err;
607
608 buf = mlx5vf_mig_file_get_stop_copy_buf(migf, index: 0, required_length: length);
609 if (IS_ERR(ptr: buf)) {
610 ret = PTR_ERR(ptr: buf);
611 goto err;
612 }
613
614 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, inc: true, track: false);
615 if (ret)
616 goto err_save;
617
618 return 0;
619
620err_save:
621 mlx5vf_put_data_buffer(buf);
622err:
623 mlx5vf_mark_err(migf);
624 return ret;
625}
626
627static struct mlx5_vf_migration_file *
628mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
629{
630 struct mlx5_vf_migration_file *migf;
631 struct mlx5_vhca_data_buffer *buf;
632 size_t length;
633 u64 full_size;
634 int ret;
635
636 migf = kzalloc(size: sizeof(*migf), GFP_KERNEL_ACCOUNT);
637 if (!migf)
638 return ERR_PTR(error: -ENOMEM);
639
640 migf->filp = anon_inode_getfile(name: "mlx5vf_mig", fops: &mlx5vf_save_fops, priv: migf,
641 O_RDONLY);
642 if (IS_ERR(ptr: migf->filp)) {
643 ret = PTR_ERR(ptr: migf->filp);
644 goto end;
645 }
646
647 migf->mvdev = mvdev;
648 ret = mlx5vf_cmd_alloc_pd(migf);
649 if (ret)
650 goto out_free;
651
652 stream_open(inode: migf->filp->f_inode, filp: migf->filp);
653 mutex_init(&migf->lock);
654 init_waitqueue_head(&migf->poll_wait);
655 init_completion(x: &migf->save_comp);
656 /*
657 * save_comp is being used as a binary semaphore built from
658 * a completion. A normal mutex cannot be used because the lock is
659 * passed between kernel threads and lockdep can't model this.
660 */
661 complete(&migf->save_comp);
662 mlx5_cmd_init_async_ctx(dev: mvdev->mdev, ctx: &migf->async_ctx);
663 INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
664 INIT_LIST_HEAD(list: &migf->buf_list);
665 INIT_LIST_HEAD(list: &migf->avail_list);
666 spin_lock_init(&migf->list_lock);
667 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, state_size: &length, total_size: &full_size, query_flags: 0);
668 if (ret)
669 goto out_pd;
670
671 ret = mlx5vf_prep_stop_copy(mvdev, migf, state_size: length, full_size, track);
672 if (ret)
673 goto out_pd;
674
675 if (track) {
676 /* leave the allocated buffer ready for the stop-copy phase */
677 buf = mlx5vf_alloc_data_buffer(migf,
678 length: migf->buf[0]->allocated_length, dma_dir: DMA_FROM_DEVICE);
679 if (IS_ERR(ptr: buf)) {
680 ret = PTR_ERR(ptr: buf);
681 goto out_pd;
682 }
683 } else {
684 buf = migf->buf[0];
685 migf->buf[0] = NULL;
686 }
687
688 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, inc: false, track);
689 if (ret)
690 goto out_save;
691 return migf;
692out_save:
693 mlx5vf_free_data_buffer(buf);
694out_pd:
695 mlx5fv_cmd_clean_migf_resources(migf);
696out_free:
697 fput(migf->filp);
698end:
699 kfree(objp: migf);
700 return ERR_PTR(error: ret);
701}
702
703static int
704mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
705 const char __user **buf, size_t *len,
706 loff_t *pos, ssize_t *done)
707{
708 unsigned long offset;
709 size_t page_offset;
710 struct page *page;
711 size_t page_len;
712 u8 *to_buff;
713 int ret;
714
715 offset = *pos - vhca_buf->start_pos;
716 page_offset = offset % PAGE_SIZE;
717
718 page = mlx5vf_get_migration_page(buf: vhca_buf, offset: offset - page_offset);
719 if (!page)
720 return -EINVAL;
721 page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
722 to_buff = kmap_local_page(page);
723 ret = copy_from_user(to: to_buff + page_offset, from: *buf, n: page_len);
724 kunmap_local(to_buff);
725 if (ret)
726 return -EFAULT;
727
728 *pos += page_len;
729 *done += page_len;
730 *buf += page_len;
731 *len -= page_len;
732 vhca_buf->length += page_len;
733 return 0;
734}
735
736static ssize_t
737mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
738 struct mlx5_vhca_data_buffer *vhca_buf,
739 size_t image_size, const char __user **buf,
740 size_t *len, loff_t *pos, ssize_t *done,
741 bool *has_work)
742{
743 size_t copy_len, to_copy;
744 int ret;
745
746 to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
747 copy_len = to_copy;
748 while (to_copy) {
749 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len: &to_copy, pos,
750 done);
751 if (ret)
752 return ret;
753 }
754
755 *len -= copy_len;
756 if (vhca_buf->length == image_size) {
757 migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
758 migf->max_pos += image_size;
759 *has_work = true;
760 }
761
762 return 0;
763}
764
765static int
766mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
767 struct mlx5_vhca_data_buffer *vhca_buf,
768 const char __user **buf, size_t *len,
769 loff_t *pos, ssize_t *done)
770{
771 size_t copy_len, to_copy;
772 size_t required_data;
773 u8 *to_buff;
774 int ret;
775
776 required_data = migf->record_size - vhca_buf->length;
777 to_copy = min_t(size_t, *len, required_data);
778 copy_len = to_copy;
779 while (to_copy) {
780 ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, len: &to_copy, pos,
781 done);
782 if (ret)
783 return ret;
784 }
785
786 *len -= copy_len;
787 if (vhca_buf->length == migf->record_size) {
788 switch (migf->record_tag) {
789 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
790 {
791 struct page *page;
792
793 page = mlx5vf_get_migration_page(buf: vhca_buf, offset: 0);
794 if (!page)
795 return -EINVAL;
796 to_buff = kmap_local_page(page);
797 migf->stop_copy_prep_size = min_t(u64,
798 le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
799 kunmap_local(to_buff);
800 break;
801 }
802 default:
803 /* Optional tag */
804 break;
805 }
806
807 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
808 migf->max_pos += migf->record_size;
809 vhca_buf->length = 0;
810 }
811
812 return 0;
813}
814
815static int
816mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
817 struct mlx5_vhca_data_buffer *vhca_buf,
818 const char __user **buf,
819 size_t *len, loff_t *pos,
820 ssize_t *done, bool *has_work)
821{
822 struct page *page;
823 size_t copy_len;
824 u8 *to_buff;
825 int ret;
826
827 copy_len = min_t(size_t, *len,
828 sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
829 page = mlx5vf_get_migration_page(buf: vhca_buf, offset: 0);
830 if (!page)
831 return -EINVAL;
832 to_buff = kmap_local_page(page);
833 ret = copy_from_user(to: to_buff + vhca_buf->length, from: *buf, n: copy_len);
834 if (ret) {
835 ret = -EFAULT;
836 goto end;
837 }
838
839 *buf += copy_len;
840 *pos += copy_len;
841 *done += copy_len;
842 *len -= copy_len;
843 vhca_buf->length += copy_len;
844 if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
845 u64 record_size;
846 u32 flags;
847
848 record_size = le64_to_cpup(p: (__le64 *)to_buff);
849 if (record_size > MAX_LOAD_SIZE) {
850 ret = -ENOMEM;
851 goto end;
852 }
853
854 migf->record_size = record_size;
855 flags = le32_to_cpup(p: (__le32 *)(to_buff +
856 offsetof(struct mlx5_vf_migration_header, flags)));
857 migf->record_tag = le32_to_cpup(p: (__le32 *)(to_buff +
858 offsetof(struct mlx5_vf_migration_header, tag)));
859 switch (migf->record_tag) {
860 case MLX5_MIGF_HEADER_TAG_FW_DATA:
861 migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
862 break;
863 case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
864 migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
865 break;
866 default:
867 if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
868 ret = -EOPNOTSUPP;
869 goto end;
870 }
871 /* We may read and skip this optional record data */
872 migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
873 }
874
875 migf->max_pos += vhca_buf->length;
876 vhca_buf->length = 0;
877 *has_work = true;
878 }
879end:
880 kunmap_local(to_buff);
881 return ret;
882}
883
884static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
885 size_t len, loff_t *pos)
886{
887 struct mlx5_vf_migration_file *migf = filp->private_data;
888 struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
889 struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
890 loff_t requested_length;
891 bool has_work = false;
892 ssize_t done = 0;
893 int ret = 0;
894
895 if (pos)
896 return -ESPIPE;
897 pos = &filp->f_pos;
898
899 if (*pos < 0 ||
900 check_add_overflow((loff_t)len, *pos, &requested_length))
901 return -EINVAL;
902
903 mutex_lock(&migf->mvdev->state_mutex);
904 mutex_lock(&migf->lock);
905 if (migf->state == MLX5_MIGF_STATE_ERROR) {
906 ret = -ENODEV;
907 goto out_unlock;
908 }
909
910 while (len || has_work) {
911 has_work = false;
912 switch (migf->load_state) {
913 case MLX5_VF_LOAD_STATE_READ_HEADER:
914 ret = mlx5vf_resume_read_header(migf, vhca_buf: vhca_buf_header,
915 buf: &buf, len: &len, pos,
916 done: &done, has_work: &has_work);
917 if (ret)
918 goto out_unlock;
919 break;
920 case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
921 if (vhca_buf_header->allocated_length < migf->record_size) {
922 mlx5vf_free_data_buffer(buf: vhca_buf_header);
923
924 migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
925 length: migf->record_size, dma_dir: DMA_NONE);
926 if (IS_ERR(ptr: migf->buf_header[0])) {
927 ret = PTR_ERR(ptr: migf->buf_header[0]);
928 migf->buf_header[0] = NULL;
929 goto out_unlock;
930 }
931
932 vhca_buf_header = migf->buf_header[0];
933 }
934
935 vhca_buf_header->start_pos = migf->max_pos;
936 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
937 break;
938 case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
939 ret = mlx5vf_resume_read_header_data(migf, vhca_buf: vhca_buf_header,
940 buf: &buf, len: &len, pos, done: &done);
941 if (ret)
942 goto out_unlock;
943 break;
944 case MLX5_VF_LOAD_STATE_PREP_IMAGE:
945 {
946 u64 size = max(migf->record_size,
947 migf->stop_copy_prep_size);
948
949 if (vhca_buf->allocated_length < size) {
950 mlx5vf_free_data_buffer(buf: vhca_buf);
951
952 migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
953 length: size, dma_dir: DMA_TO_DEVICE);
954 if (IS_ERR(ptr: migf->buf[0])) {
955 ret = PTR_ERR(ptr: migf->buf[0]);
956 migf->buf[0] = NULL;
957 goto out_unlock;
958 }
959
960 vhca_buf = migf->buf[0];
961 }
962
963 vhca_buf->start_pos = migf->max_pos;
964 migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
965 break;
966 }
967 case MLX5_VF_LOAD_STATE_READ_IMAGE:
968 ret = mlx5vf_resume_read_image(migf, vhca_buf,
969 image_size: migf->record_size,
970 buf: &buf, len: &len, pos, done: &done, has_work: &has_work);
971 if (ret)
972 goto out_unlock;
973 break;
974 case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
975 ret = mlx5vf_cmd_load_vhca_state(mvdev: migf->mvdev, migf, buf: vhca_buf);
976 if (ret)
977 goto out_unlock;
978 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
979
980 /* prep header buf for next image */
981 vhca_buf_header->length = 0;
982 /* prep data buf for next image */
983 vhca_buf->length = 0;
984
985 break;
986 default:
987 break;
988 }
989 }
990
991out_unlock:
992 if (ret)
993 migf->state = MLX5_MIGF_STATE_ERROR;
994 mutex_unlock(lock: &migf->lock);
995 mlx5vf_state_mutex_unlock(mvdev: migf->mvdev);
996 return ret ? ret : done;
997}
998
999static const struct file_operations mlx5vf_resume_fops = {
1000 .owner = THIS_MODULE,
1001 .write = mlx5vf_resume_write,
1002 .release = mlx5vf_release_file,
1003 .llseek = no_llseek,
1004};
1005
1006static struct mlx5_vf_migration_file *
1007mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
1008{
1009 struct mlx5_vf_migration_file *migf;
1010 struct mlx5_vhca_data_buffer *buf;
1011 int ret;
1012
1013 migf = kzalloc(size: sizeof(*migf), GFP_KERNEL_ACCOUNT);
1014 if (!migf)
1015 return ERR_PTR(error: -ENOMEM);
1016
1017 migf->filp = anon_inode_getfile(name: "mlx5vf_mig", fops: &mlx5vf_resume_fops, priv: migf,
1018 O_WRONLY);
1019 if (IS_ERR(ptr: migf->filp)) {
1020 ret = PTR_ERR(ptr: migf->filp);
1021 goto end;
1022 }
1023
1024 migf->mvdev = mvdev;
1025 ret = mlx5vf_cmd_alloc_pd(migf);
1026 if (ret)
1027 goto out_free;
1028
1029 buf = mlx5vf_alloc_data_buffer(migf, length: 0, dma_dir: DMA_TO_DEVICE);
1030 if (IS_ERR(ptr: buf)) {
1031 ret = PTR_ERR(ptr: buf);
1032 goto out_pd;
1033 }
1034
1035 migf->buf[0] = buf;
1036 buf = mlx5vf_alloc_data_buffer(migf,
1037 length: sizeof(struct mlx5_vf_migration_header), dma_dir: DMA_NONE);
1038 if (IS_ERR(ptr: buf)) {
1039 ret = PTR_ERR(ptr: buf);
1040 goto out_buf;
1041 }
1042
1043 migf->buf_header[0] = buf;
1044 migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
1045
1046 stream_open(inode: migf->filp->f_inode, filp: migf->filp);
1047 mutex_init(&migf->lock);
1048 INIT_LIST_HEAD(list: &migf->buf_list);
1049 INIT_LIST_HEAD(list: &migf->avail_list);
1050 spin_lock_init(&migf->list_lock);
1051 return migf;
1052out_buf:
1053 mlx5vf_free_data_buffer(buf: migf->buf[0]);
1054out_pd:
1055 mlx5vf_cmd_dealloc_pd(migf);
1056out_free:
1057 fput(migf->filp);
1058end:
1059 kfree(objp: migf);
1060 return ERR_PTR(error: ret);
1061}
1062
1063void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
1064 enum mlx5_vf_migf_state *last_save_state)
1065{
1066 if (mvdev->resuming_migf) {
1067 mlx5vf_disable_fd(migf: mvdev->resuming_migf);
1068 mlx5fv_cmd_clean_migf_resources(migf: mvdev->resuming_migf);
1069 fput(mvdev->resuming_migf->filp);
1070 mvdev->resuming_migf = NULL;
1071 }
1072 if (mvdev->saving_migf) {
1073 mlx5_cmd_cleanup_async_ctx(ctx: &mvdev->saving_migf->async_ctx);
1074 cancel_work_sync(work: &mvdev->saving_migf->async_data.work);
1075 if (last_save_state)
1076 *last_save_state = mvdev->saving_migf->state;
1077 mlx5vf_disable_fd(migf: mvdev->saving_migf);
1078 wake_up_interruptible(&mvdev->saving_migf->poll_wait);
1079 mlx5fv_cmd_clean_migf_resources(migf: mvdev->saving_migf);
1080 fput(mvdev->saving_migf->filp);
1081 mvdev->saving_migf = NULL;
1082 }
1083}
1084
1085static struct file *
1086mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
1087 u32 new)
1088{
1089 u32 cur = mvdev->mig_state;
1090 int ret;
1091
1092 if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
1093 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1094 op_mod: MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1095 if (ret)
1096 return ERR_PTR(error: ret);
1097 return NULL;
1098 }
1099
1100 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
1101 ret = mlx5vf_cmd_resume_vhca(mvdev,
1102 op_mod: MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
1103 if (ret)
1104 return ERR_PTR(error: ret);
1105 return NULL;
1106 }
1107
1108 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
1109 (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1110 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1111 op_mod: MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
1112 if (ret)
1113 return ERR_PTR(error: ret);
1114 return NULL;
1115 }
1116
1117 if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
1118 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
1119 ret = mlx5vf_cmd_resume_vhca(mvdev,
1120 op_mod: MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
1121 if (ret)
1122 return ERR_PTR(error: ret);
1123 return NULL;
1124 }
1125
1126 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
1127 struct mlx5_vf_migration_file *migf;
1128
1129 migf = mlx5vf_pci_save_device_data(mvdev, track: false);
1130 if (IS_ERR(ptr: migf))
1131 return ERR_CAST(ptr: migf);
1132 get_file(f: migf->filp);
1133 mvdev->saving_migf = migf;
1134 return migf->filp;
1135 }
1136
1137 if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
1138 mlx5vf_disable_fds(mvdev, NULL);
1139 return NULL;
1140 }
1141
1142 if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
1143 (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
1144 new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
1145 struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
1146 struct mlx5_vhca_data_buffer *buf;
1147 enum mlx5_vf_migf_state state;
1148 size_t size;
1149
1150 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, state_size: &size, NULL,
1151 query_flags: MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
1152 if (ret)
1153 return ERR_PTR(error: ret);
1154 buf = mlx5vf_get_data_buffer(migf, length: size, dma_dir: DMA_FROM_DEVICE);
1155 if (IS_ERR(ptr: buf))
1156 return ERR_CAST(ptr: buf);
1157 /* pre_copy cleanup */
1158 ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, inc: false, track: false);
1159 if (ret) {
1160 mlx5vf_put_data_buffer(buf);
1161 return ERR_PTR(error: ret);
1162 }
1163 mlx5vf_disable_fds(mvdev, last_save_state: &state);
1164 return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(error: -EIO);
1165 }
1166
1167 if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
1168 struct mlx5_vf_migration_file *migf;
1169
1170 migf = mlx5vf_pci_resume_device_data(mvdev);
1171 if (IS_ERR(ptr: migf))
1172 return ERR_CAST(ptr: migf);
1173 get_file(f: migf->filp);
1174 mvdev->resuming_migf = migf;
1175 return migf->filp;
1176 }
1177
1178 if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
1179 mlx5vf_disable_fds(mvdev, NULL);
1180 return NULL;
1181 }
1182
1183 if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
1184 (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
1185 new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
1186 struct mlx5_vf_migration_file *migf;
1187
1188 migf = mlx5vf_pci_save_device_data(mvdev, track: true);
1189 if (IS_ERR(ptr: migf))
1190 return ERR_CAST(ptr: migf);
1191 get_file(f: migf->filp);
1192 mvdev->saving_migf = migf;
1193 return migf->filp;
1194 }
1195
1196 if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
1197 ret = mlx5vf_cmd_suspend_vhca(mvdev,
1198 op_mod: MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
1199 if (ret)
1200 return ERR_PTR(error: ret);
1201 ret = mlx5vf_pci_save_device_inc_data(mvdev);
1202 return ret ? ERR_PTR(error: ret) : NULL;
1203 }
1204
1205 /*
1206 * vfio_mig_get_next_state() does not use arcs other than the above
1207 */
1208 WARN_ON(true);
1209 return ERR_PTR(error: -EINVAL);
1210}
1211
1212/*
1213 * This function is called in all state_mutex unlock cases to
1214 * handle a 'deferred_reset' if exists.
1215 */
1216void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
1217{
1218again:
1219 spin_lock(lock: &mvdev->reset_lock);
1220 if (mvdev->deferred_reset) {
1221 mvdev->deferred_reset = false;
1222 spin_unlock(lock: &mvdev->reset_lock);
1223 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1224 mlx5vf_disable_fds(mvdev, NULL);
1225 goto again;
1226 }
1227 mutex_unlock(lock: &mvdev->state_mutex);
1228 spin_unlock(lock: &mvdev->reset_lock);
1229}
1230
1231static struct file *
1232mlx5vf_pci_set_device_state(struct vfio_device *vdev,
1233 enum vfio_device_mig_state new_state)
1234{
1235 struct mlx5vf_pci_core_device *mvdev = container_of(
1236 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1237 enum vfio_device_mig_state next_state;
1238 struct file *res = NULL;
1239 int ret;
1240
1241 mutex_lock(&mvdev->state_mutex);
1242 while (new_state != mvdev->mig_state) {
1243 ret = vfio_mig_get_next_state(device: vdev, cur_fsm: mvdev->mig_state,
1244 new_fsm: new_state, next_fsm: &next_state);
1245 if (ret) {
1246 res = ERR_PTR(error: ret);
1247 break;
1248 }
1249 res = mlx5vf_pci_step_device_state_locked(mvdev, new: next_state);
1250 if (IS_ERR(ptr: res))
1251 break;
1252 mvdev->mig_state = next_state;
1253 if (WARN_ON(res && new_state != mvdev->mig_state)) {
1254 fput(res);
1255 res = ERR_PTR(error: -EINVAL);
1256 break;
1257 }
1258 }
1259 mlx5vf_state_mutex_unlock(mvdev);
1260 return res;
1261}
1262
1263static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
1264 unsigned long *stop_copy_length)
1265{
1266 struct mlx5vf_pci_core_device *mvdev = container_of(
1267 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1268 size_t state_size;
1269 u64 total_size;
1270 int ret;
1271
1272 mutex_lock(&mvdev->state_mutex);
1273 ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, state_size: &state_size,
1274 total_size: &total_size, query_flags: 0);
1275 if (!ret)
1276 *stop_copy_length = total_size;
1277 mlx5vf_state_mutex_unlock(mvdev);
1278 return ret;
1279}
1280
1281static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
1282 enum vfio_device_mig_state *curr_state)
1283{
1284 struct mlx5vf_pci_core_device *mvdev = container_of(
1285 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1286
1287 mutex_lock(&mvdev->state_mutex);
1288 *curr_state = mvdev->mig_state;
1289 mlx5vf_state_mutex_unlock(mvdev);
1290 return 0;
1291}
1292
1293static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
1294{
1295 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1296
1297 if (!mvdev->migrate_cap)
1298 return;
1299
1300 /*
1301 * As the higher VFIO layers are holding locks across reset and using
1302 * those same locks with the mm_lock we need to prevent ABBA deadlock
1303 * with the state_mutex and mm_lock.
1304 * In case the state_mutex was taken already we defer the cleanup work
1305 * to the unlock flow of the other running context.
1306 */
1307 spin_lock(lock: &mvdev->reset_lock);
1308 mvdev->deferred_reset = true;
1309 if (!mutex_trylock(lock: &mvdev->state_mutex)) {
1310 spin_unlock(lock: &mvdev->reset_lock);
1311 return;
1312 }
1313 spin_unlock(lock: &mvdev->reset_lock);
1314 mlx5vf_state_mutex_unlock(mvdev);
1315}
1316
1317static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
1318{
1319 struct mlx5vf_pci_core_device *mvdev = container_of(
1320 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1321 struct vfio_pci_core_device *vdev = &mvdev->core_device;
1322 int ret;
1323
1324 ret = vfio_pci_core_enable(vdev);
1325 if (ret)
1326 return ret;
1327
1328 if (mvdev->migrate_cap)
1329 mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
1330 vfio_pci_core_finish_enable(vdev);
1331 return 0;
1332}
1333
1334static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
1335{
1336 struct mlx5vf_pci_core_device *mvdev = container_of(
1337 core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1338
1339 mlx5vf_cmd_close_migratable(mvdev);
1340 vfio_pci_core_close_device(core_vdev);
1341}
1342
1343static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
1344 .migration_set_state = mlx5vf_pci_set_device_state,
1345 .migration_get_state = mlx5vf_pci_get_device_state,
1346 .migration_get_data_size = mlx5vf_pci_get_data_size,
1347};
1348
1349static const struct vfio_log_ops mlx5vf_pci_log_ops = {
1350 .log_start = mlx5vf_start_page_tracker,
1351 .log_stop = mlx5vf_stop_page_tracker,
1352 .log_read_and_clear = mlx5vf_tracker_read_and_clear,
1353};
1354
1355static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
1356{
1357 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1358 struct mlx5vf_pci_core_device, core_device.vdev);
1359 int ret;
1360
1361 ret = vfio_pci_core_init_dev(core_vdev);
1362 if (ret)
1363 return ret;
1364
1365 mlx5vf_cmd_set_migratable(mvdev, mig_ops: &mlx5vf_pci_mig_ops,
1366 log_ops: &mlx5vf_pci_log_ops);
1367
1368 return 0;
1369}
1370
1371static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
1372{
1373 struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
1374 struct mlx5vf_pci_core_device, core_device.vdev);
1375
1376 mlx5vf_cmd_remove_migratable(mvdev);
1377 vfio_pci_core_release_dev(core_vdev);
1378}
1379
1380static const struct vfio_device_ops mlx5vf_pci_ops = {
1381 .name = "mlx5-vfio-pci",
1382 .init = mlx5vf_pci_init_dev,
1383 .release = mlx5vf_pci_release_dev,
1384 .open_device = mlx5vf_pci_open_device,
1385 .close_device = mlx5vf_pci_close_device,
1386 .ioctl = vfio_pci_core_ioctl,
1387 .device_feature = vfio_pci_core_ioctl_feature,
1388 .read = vfio_pci_core_read,
1389 .write = vfio_pci_core_write,
1390 .mmap = vfio_pci_core_mmap,
1391 .request = vfio_pci_core_request,
1392 .match = vfio_pci_core_match,
1393 .bind_iommufd = vfio_iommufd_physical_bind,
1394 .unbind_iommufd = vfio_iommufd_physical_unbind,
1395 .attach_ioas = vfio_iommufd_physical_attach_ioas,
1396 .detach_ioas = vfio_iommufd_physical_detach_ioas,
1397};
1398
1399static int mlx5vf_pci_probe(struct pci_dev *pdev,
1400 const struct pci_device_id *id)
1401{
1402 struct mlx5vf_pci_core_device *mvdev;
1403 int ret;
1404
1405 mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
1406 &pdev->dev, &mlx5vf_pci_ops);
1407 if (IS_ERR(ptr: mvdev))
1408 return PTR_ERR(ptr: mvdev);
1409
1410 dev_set_drvdata(dev: &pdev->dev, data: &mvdev->core_device);
1411 ret = vfio_pci_core_register_device(vdev: &mvdev->core_device);
1412 if (ret)
1413 goto out_put_vdev;
1414 return 0;
1415
1416out_put_vdev:
1417 vfio_put_device(device: &mvdev->core_device.vdev);
1418 return ret;
1419}
1420
1421static void mlx5vf_pci_remove(struct pci_dev *pdev)
1422{
1423 struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);
1424
1425 vfio_pci_core_unregister_device(vdev: &mvdev->core_device);
1426 vfio_put_device(device: &mvdev->core_device.vdev);
1427}
1428
1429static const struct pci_device_id mlx5vf_pci_table[] = {
1430 { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
1431 {}
1432};
1433
1434MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);
1435
1436static const struct pci_error_handlers mlx5vf_err_handlers = {
1437 .reset_done = mlx5vf_pci_aer_reset_done,
1438 .error_detected = vfio_pci_core_aer_err_detected,
1439};
1440
1441static struct pci_driver mlx5vf_pci_driver = {
1442 .name = KBUILD_MODNAME,
1443 .id_table = mlx5vf_pci_table,
1444 .probe = mlx5vf_pci_probe,
1445 .remove = mlx5vf_pci_remove,
1446 .err_handler = &mlx5vf_err_handlers,
1447 .driver_managed_dma = true,
1448};
1449
1450module_pci_driver(mlx5vf_pci_driver);
1451
1452MODULE_IMPORT_NS(IOMMUFD);
1453MODULE_LICENSE("GPL");
1454MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
1455MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
1456MODULE_DESCRIPTION(
1457 "MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");
1458

source code of linux/drivers/vfio/pci/mlx5/main.c