// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
 */

#include <linux/device.h>
#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/interrupt.h>
#include <linux/iommu.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/notifier.h>
#include <linux/pci.h>
#include <linux/pm_runtime.h>
#include <linux/types.h>
#include <linux/uaccess.h>
#include <linux/vfio.h>
#include <linux/sched/mm.h>
#include <linux/anon_inodes.h>

#include "cmd.h"

/* Device specification max LOAD size */
#define MAX_LOAD_SIZE (BIT_ULL(__mlx5_bit_sz(load_vhca_state_in, size)) - 1)

#define MAX_CHUNK_SIZE SZ_8M

static struct mlx5vf_pci_core_device *mlx5vf_drvdata(struct pci_dev *pdev)
{
	struct vfio_pci_core_device *core_device = dev_get_drvdata(&pdev->dev);

	return container_of(core_device, struct mlx5vf_pci_core_device,
			    core_device);
}

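/*
 * Resolve a byte offset within the migration data buffer to its backing
 * page. Accesses are sequential, so the last scatterlist entry and offset
 * are cached in the buffer and the walk resumes from there instead of
 * restarting from the head of the table on every call.
 */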
struct page *
mlx5vf_get_migration_page(struct mlx5_vhca_data_buffer *buf,
			  unsigned long offset)
{
	unsigned long cur_offset = 0;
	struct scatterlist *sg;
	unsigned int i;

	/* All accesses are sequential */
	if (offset < buf->last_offset || !buf->last_offset_sg) {
		buf->last_offset = 0;
		buf->last_offset_sg = buf->table.sgt.sgl;
		buf->sg_last_entry = 0;
	}

	cur_offset = buf->last_offset;

	for_each_sg(buf->last_offset_sg, sg,
			buf->table.sgt.orig_nents - buf->sg_last_entry, i) {
		if (offset < sg->length + cur_offset) {
			buf->last_offset_sg = sg;
			buf->sg_last_entry += i;
			buf->last_offset = cur_offset;
			return nth_page(sg_page(sg),
					(offset - cur_offset) / PAGE_SIZE);
		}
		cur_offset += sg->length;
	}
	return NULL;
}

static void mlx5vf_disable_fd(struct mlx5_vf_migration_file *migf)
{
	mutex_lock(&migf->lock);
	migf->state = MLX5_MIGF_STATE_ERROR;
	migf->filp->f_pos = 0;
	mutex_unlock(&migf->lock);
}

static int mlx5vf_release_file(struct inode *inode, struct file *filp)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;

	mlx5vf_disable_fd(migf);
	mutex_destroy(&migf->lock);
	kfree(migf);
	return 0;
}

static struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buff_from_pos(struct mlx5_vf_migration_file *migf, loff_t pos,
			      bool *end_of_data)
{
	struct mlx5_vhca_data_buffer *buf;
	bool found = false;

	*end_of_data = false;
	spin_lock_irq(&migf->list_lock);
	if (list_empty(&migf->buf_list)) {
		*end_of_data = true;
		goto end;
	}

	buf = list_first_entry(&migf->buf_list, struct mlx5_vhca_data_buffer,
			       buf_elm);
	if (pos >= buf->start_pos &&
	    pos < buf->start_pos + buf->length) {
		found = true;
		goto end;
	}

	/*
	 * As we use a stream based FD we may expect having the data always
	 * on first chunk
	 */
	migf->state = MLX5_MIGF_STATE_ERROR;

end:
	spin_unlock_irq(&migf->list_lock);
	return found ? buf : NULL;
}

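/*
 * A fully consumed stop-copy chunk is parked back in migf->buf[] /
 * migf->buf_header[] so its slot can be reused for a later save. If the
 * device already reported the size of the next chunk, the deferred SAVE
 * work is queued from here; non-chunk buffers simply move to the
 * avail_list for recycling.
 */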
static void mlx5vf_buf_read_done(struct mlx5_vhca_data_buffer *vhca_buf)
{
	struct mlx5_vf_migration_file *migf = vhca_buf->migf;

	if (vhca_buf->stop_copy_chunk_num) {
		bool is_header = vhca_buf->dma_dir == DMA_NONE;
		u8 chunk_num = vhca_buf->stop_copy_chunk_num;
		size_t next_required_umem_size = 0;

		if (is_header)
			migf->buf_header[chunk_num - 1] = vhca_buf;
		else
			migf->buf[chunk_num - 1] = vhca_buf;

		spin_lock_irq(&migf->list_lock);
		list_del_init(&vhca_buf->buf_elm);
		if (!is_header) {
			next_required_umem_size =
				migf->next_required_umem_size;
			migf->next_required_umem_size = 0;
			migf->num_ready_chunks--;
		}
		spin_unlock_irq(&migf->list_lock);
		if (next_required_umem_size)
			mlx5vf_mig_file_set_save_work(migf, chunk_num,
						      next_required_umem_size);
		return;
	}

	spin_lock_irq(&migf->list_lock);
	list_del_init(&vhca_buf->buf_elm);
	list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list);
	spin_unlock_irq(&migf->list_lock);
}

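/*
 * Copy up to *len bytes from the vhca buffer out to user space, one page
 * at a time via kmap_local_page(). Advances *pos/*buf and shrinks *len as
 * it goes; once the buffer is fully drained it is handed back for reuse.
 */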
static ssize_t mlx5vf_buf_read(struct mlx5_vhca_data_buffer *vhca_buf,
			       char __user **buf, size_t *len, loff_t *pos)
{
	unsigned long offset;
	ssize_t done = 0;
	size_t copy_len;

	copy_len = min_t(size_t,
			 vhca_buf->start_pos + vhca_buf->length - *pos, *len);
	while (copy_len) {
		size_t page_offset;
		struct page *page;
		size_t page_len;
		u8 *from_buff;
		int ret;

		offset = *pos - vhca_buf->start_pos;
		page_offset = offset % PAGE_SIZE;
		offset -= page_offset;
		page = mlx5vf_get_migration_page(vhca_buf, offset);
		if (!page)
			return -EINVAL;
		page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset);
		from_buff = kmap_local_page(page);
		ret = copy_to_user(*buf, from_buff + page_offset, page_len);
		kunmap_local(from_buff);
		if (ret)
			return -EFAULT;
		*pos += page_len;
		*len -= page_len;
		*buf += page_len;
		done += page_len;
		copy_len -= page_len;
	}

	if (*pos >= vhca_buf->start_pos + vhca_buf->length)
		mlx5vf_buf_read_done(vhca_buf);

	return done;
}

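/*
 * read() handler for the save FD. The FD is a stream: blocking readers
 * wait until data is queued, the stream completes, or an error occurs.
 * -ENOMSG marks a temporary end of stream during PRE_COPY.
 */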
static ssize_t mlx5vf_save_read(struct file *filp, char __user *buf, size_t len,
				loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf;
	bool first_loop_call = true;
	bool end_of_data;
	ssize_t done = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (!(filp->f_flags & O_NONBLOCK)) {
		if (wait_event_interruptible(migf->poll_wait,
				!list_empty(&migf->buf_list) ||
				migf->state == MLX5_MIGF_STATE_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_COMPLETE))
			return -ERESTARTSYS;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		done = -ENODEV;
		goto out_unlock;
	}

	while (len) {
		ssize_t count;

		vhca_buf = mlx5vf_get_data_buff_from_pos(migf, *pos,
							 &end_of_data);
		if (first_loop_call) {
			first_loop_call = false;
			/* Temporary end of file as part of PRE_COPY */
			if (end_of_data && (migf->state == MLX5_MIGF_STATE_PRE_COPY ||
				migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)) {
				done = -ENOMSG;
				goto out_unlock;
			}

			if (end_of_data && migf->state != MLX5_MIGF_STATE_COMPLETE) {
				if (filp->f_flags & O_NONBLOCK) {
					done = -EAGAIN;
					goto out_unlock;
				}
			}
		}

		if (end_of_data)
			goto out_unlock;

		if (!vhca_buf) {
			done = -EINVAL;
			goto out_unlock;
		}

		count = mlx5vf_buf_read(vhca_buf, &buf, &len, pos);
		if (count < 0) {
			done = count;
			goto out_unlock;
		}
		done += count;
	}

out_unlock:
	mutex_unlock(&migf->lock);
	return done;
}

static __poll_t mlx5vf_save_poll(struct file *filp,
				 struct poll_table_struct *wait)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	__poll_t pollflags = 0;

	poll_wait(filp, &migf->poll_wait, wait);

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		pollflags = EPOLLIN | EPOLLRDNORM | EPOLLRDHUP;
	else if (!list_empty(&migf->buf_list) ||
		 migf->state == MLX5_MIGF_STATE_COMPLETE)
		pollflags = EPOLLIN | EPOLLRDNORM;
	mutex_unlock(&migf->lock);

	return pollflags;
}

/*
 * FD is exposed and user can use it after receiving an error.
 * Mark migf in error, and wake the user.
 */
static void mlx5vf_mark_err(struct mlx5_vf_migration_file *migf)
{
	migf->state = MLX5_MIGF_STATE_ERROR;
	wake_up_interruptible(&migf->poll_wait);
}

void mlx5vf_mig_file_set_save_work(struct mlx5_vf_migration_file *migf,
				   u8 chunk_num, size_t next_required_umem_size)
{
	migf->save_data[chunk_num - 1].next_required_umem_size =
		next_required_umem_size;
	migf->save_data[chunk_num - 1].migf = migf;
	get_file(migf->filp);
	queue_work(migf->mvdev->cb_wq,
		   &migf->save_data[chunk_num - 1].work);
}

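/*
 * Take the pre-allocated buffer for this chunk slot; if it is too small
 * for the requested length, release it and allocate a larger one while
 * preserving the chunk number.
 */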
static struct mlx5_vhca_data_buffer *
mlx5vf_mig_file_get_stop_copy_buf(struct mlx5_vf_migration_file *migf,
				  u8 index, size_t required_length)
{
	struct mlx5_vhca_data_buffer *buf = migf->buf[index];
	u8 chunk_num;

	WARN_ON(!buf);
	chunk_num = buf->stop_copy_chunk_num;
	buf->migf->buf[index] = NULL;
	/* Checking whether the pre-allocated buffer can fit */
	if (buf->allocated_length >= required_length)
		return buf;

	mlx5vf_put_data_buffer(buf);
	buf = mlx5vf_get_data_buffer(buf->migf, required_length,
				     DMA_FROM_DEVICE);
	if (IS_ERR(buf))
		return buf;

	buf->stop_copy_chunk_num = chunk_num;
	return buf;
}

static void mlx5vf_mig_file_save_work(struct work_struct *_work)
{
	struct mlx5vf_save_work_data *save_data = container_of(_work,
		struct mlx5vf_save_work_data, work);
	struct mlx5_vf_migration_file *migf = save_data->migf;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;

	mutex_lock(&mvdev->state_mutex);
	if (migf->state == MLX5_MIGF_STATE_ERROR)
		goto end;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf,
						save_data->chunk_num - 1,
						save_data->next_required_umem_size);
	if (IS_ERR(buf))
		goto err;

	if (mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false))
		goto err_save;

	goto end;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
end:
	mlx5vf_state_mutex_unlock(mvdev);
	fput(migf->filp);
}

static int mlx5vf_add_stop_copy_header(struct mlx5_vf_migration_file *migf,
				       bool track)
{
	size_t size = sizeof(struct mlx5_vf_migration_header) +
		sizeof(struct mlx5_vf_migration_tag_stop_copy_data);
	struct mlx5_vf_migration_tag_stop_copy_data data = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;
	int ret;

	header_buf = mlx5vf_get_data_buffer(migf, size, DMA_NONE);
	if (IS_ERR(header_buf))
		return PTR_ERR(header_buf);

	header.record_size = cpu_to_le64(sizeof(data));
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page) {
		ret = -EINVAL;
		goto err;
	}
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	header_buf->length = sizeof(header);
	data.stop_copy_size = cpu_to_le64(migf->buf[0]->allocated_length);
	memcpy(to_buff + sizeof(header), &data, sizeof(data));
	header_buf->length += sizeof(data);
	kunmap_local(to_buff);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (track)
		migf->pre_copy_initial_bytes = size;
	return 0;
err:
	mlx5vf_put_data_buffer(header_buf);
	return ret;
}

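/*
 * Pre-allocate the data and header buffers for the stop-copy flow. In
 * chunk mode one pair is allocated per chunk slot and the deferred save
 * work is initialized; otherwise a single pair sized from the queried
 * state size (with 10% headroom when tracking) is prepared.
 */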
static int mlx5vf_prep_stop_copy(struct mlx5vf_pci_core_device *mvdev,
				 struct mlx5_vf_migration_file *migf,
				 size_t state_size, u64 full_size,
				 bool track)
{
	struct mlx5_vhca_data_buffer *buf;
	size_t inc_state_size;
	int num_chunks;
	int ret;
	int i;

	if (mvdev->chunk_mode) {
		size_t chunk_size = min_t(size_t, MAX_CHUNK_SIZE, full_size);

		/* from firmware perspective at least 'state_size' buffer should be set */
		inc_state_size = max(state_size, chunk_size);
	} else {
		if (track) {
			/* let's be ready for stop_copy size that might grow by 10 percents */
			if (check_add_overflow(state_size, state_size / 10, &inc_state_size))
				inc_state_size = state_size;
		} else {
			inc_state_size = state_size;
		}
	}

	/* let's not overflow the device specification max SAVE size */
	inc_state_size = min_t(size_t, inc_state_size,
		(BIT_ULL(__mlx5_bit_sz(save_vhca_state_in, size)) - PAGE_SIZE));

	num_chunks = mvdev->chunk_mode ? MAX_NUM_CHUNKS : 1;
	for (i = 0; i < num_chunks; i++) {
		buf = mlx5vf_get_data_buffer(migf, inc_state_size, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}

		migf->buf[i] = buf;
		buf = mlx5vf_get_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto err;
		}
		migf->buf_header[i] = buf;
		if (mvdev->chunk_mode) {
			migf->buf[i]->stop_copy_chunk_num = i + 1;
			migf->buf_header[i]->stop_copy_chunk_num = i + 1;
			INIT_WORK(&migf->save_data[i].work,
				  mlx5vf_mig_file_save_work);
			migf->save_data[i].chunk_num = i + 1;
		}
	}

	ret = mlx5vf_add_stop_copy_header(migf, track);
	if (ret)
		goto err;
	return 0;

err:
	for (i = 0; i < num_chunks; i++) {
		if (migf->buf[i]) {
			mlx5vf_put_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}
		if (migf->buf_header[i]) {
			mlx5vf_put_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	return ret;
}

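/*
 * VFIO_MIG_GET_PRECOPY_INFO: report how many initial/dirty bytes remain to
 * be read from the save FD. When the current state has been fully consumed
 * and the device reports more dirty state, a new incremental SAVE is
 * triggered so that the data becomes readable.
 */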
static long mlx5vf_precopy_ioctl(struct file *filp, unsigned int cmd,
				 unsigned long arg)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5vf_pci_core_device *mvdev = migf->mvdev;
	struct mlx5_vhca_data_buffer *buf;
	struct vfio_precopy_info info = {};
	loff_t *pos = &filp->f_pos;
	unsigned long minsz;
	size_t inc_length = 0;
	bool end_of_data = false;
	int ret;

	if (cmd != VFIO_MIG_GET_PRECOPY_INFO)
		return -ENOTTY;

	minsz = offsetofend(struct vfio_precopy_info, dirty_bytes);

	if (copy_from_user(&info, (void __user *)arg, minsz))
		return -EFAULT;

	if (info.argsz < minsz)
		return -EINVAL;

	mutex_lock(&mvdev->state_mutex);
	if (mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY &&
	    mvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) {
		ret = -EINVAL;
		goto err_state_unlock;
	}

	/*
	 * We can't issue a SAVE command when the device is suspended, so as
	 * part of VFIO_DEVICE_STATE_PRE_COPY_P2P no reason to query for extra
	 * bytes that can't be read.
	 */
	if (mvdev->mig_state == VFIO_DEVICE_STATE_PRE_COPY) {
		/*
		 * Once the query returns it's guaranteed that there is no
		 * active SAVE command.
		 * As so, the other code below is safe with the proper locks.
		 */
		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &inc_length,
							    NULL, MLX5VF_QUERY_INC);
		if (ret)
			goto err_state_unlock;
	}

	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto err_migf_unlock;
	}

	if (migf->pre_copy_initial_bytes > *pos) {
		info.initial_bytes = migf->pre_copy_initial_bytes - *pos;
	} else {
		info.dirty_bytes = migf->max_pos - *pos;
		if (!info.dirty_bytes)
			end_of_data = true;
		info.dirty_bytes += inc_length;
	}

	if (!end_of_data || !inc_length) {
		mutex_unlock(&migf->lock);
		goto done;
	}

	mutex_unlock(&migf->lock);
	/*
	 * We finished transferring the current state and the device has a
	 * dirty state, save a new state to be ready for.
	 */
	buf = mlx5vf_get_data_buffer(migf, inc_length, DMA_FROM_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		mlx5vf_mark_err(migf);
		goto err_state_unlock;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, true);
	if (ret) {
		mlx5vf_mark_err(migf);
		mlx5vf_put_data_buffer(buf);
		goto err_state_unlock;
	}

done:
	mlx5vf_state_mutex_unlock(mvdev);
	if (copy_to_user((void __user *)arg, &info, minsz))
		return -EFAULT;
	return 0;

err_migf_unlock:
	mutex_unlock(&migf->lock);
err_state_unlock:
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static const struct file_operations mlx5vf_save_fops = {
	.owner = THIS_MODULE,
	.read = mlx5vf_save_read,
	.poll = mlx5vf_save_poll,
	.unlocked_ioctl = mlx5vf_precopy_ioctl,
	.compat_ioctl = compat_ptr_ioctl,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static int mlx5vf_pci_save_device_inc_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	int ret;

	if (migf->state == MLX5_MIGF_STATE_ERROR)
		return -ENODEV;

	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, NULL,
				MLX5VF_QUERY_INC | MLX5VF_QUERY_FINAL);
	if (ret)
		goto err;

	buf = mlx5vf_mig_file_get_stop_copy_buf(migf, 0, length);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto err;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, true, false);
	if (ret)
		goto err_save;

	return 0;

err_save:
	mlx5vf_put_data_buffer(buf);
err:
	mlx5vf_mark_err(migf);
	return ret;
}

static struct mlx5_vf_migration_file *
mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	size_t length;
	u64 full_size;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_save_fops, migf,
					O_RDONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	init_waitqueue_head(&migf->poll_wait);
	init_completion(&migf->save_comp);
	/*
	 * save_comp is being used as a binary semaphore built from
	 * a completion. A normal mutex cannot be used because the lock is
	 * passed between kernel threads and lockdep can't model this.
	 */
	complete(&migf->save_comp);
	mlx5_cmd_init_async_ctx(mvdev->mdev, &migf->async_ctx);
	INIT_WORK(&migf->async_data.work, mlx5vf_mig_file_cleanup_cb);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0);
	if (ret)
		goto out_pd;

	ret = mlx5vf_prep_stop_copy(mvdev, migf, length, full_size, track);
	if (ret)
		goto out_pd;

	if (track) {
		/* leave the allocated buffer ready for the stop-copy phase */
		buf = mlx5vf_alloc_data_buffer(migf,
			migf->buf[0]->allocated_length, DMA_FROM_DEVICE);
		if (IS_ERR(buf)) {
			ret = PTR_ERR(buf);
			goto out_pd;
		}
	} else {
		buf = migf->buf[0];
		migf->buf[0] = NULL;
	}

	ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, track);
	if (ret)
		goto out_save;
	return migf;
out_save:
	mlx5vf_free_data_buffer(buf);
out_pd:
	mlx5fv_cmd_clean_migf_resources(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

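/*
 * Copy at most one page worth of user data into the migration buffer at
 * the current stream position, updating the caller's cursors.
 */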
static int
mlx5vf_append_page_to_mig_buf(struct mlx5_vhca_data_buffer *vhca_buf,
			      const char __user **buf, size_t *len,
			      loff_t *pos, ssize_t *done)
{
	unsigned long offset;
	size_t page_offset;
	struct page *page;
	size_t page_len;
	u8 *to_buff;
	int ret;

	offset = *pos - vhca_buf->start_pos;
	page_offset = offset % PAGE_SIZE;

	page = mlx5vf_get_migration_page(vhca_buf, offset - page_offset);
	if (!page)
		return -EINVAL;
	page_len = min_t(size_t, *len, PAGE_SIZE - page_offset);
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + page_offset, *buf, page_len);
	kunmap_local(to_buff);
	if (ret)
		return -EFAULT;

	*pos += page_len;
	*done += page_len;
	*buf += page_len;
	*len -= page_len;
	vhca_buf->length += page_len;
	return 0;
}

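/*
 * Accumulate user data into the image buffer; once a complete image of
 * image_size bytes has arrived, flag the state machine to load it into
 * the device.
 */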
static ssize_t
mlx5vf_resume_read_image(struct mlx5_vf_migration_file *migf,
			 struct mlx5_vhca_data_buffer *vhca_buf,
			 size_t image_size, const char __user **buf,
			 size_t *len, loff_t *pos, ssize_t *done,
			 bool *has_work)
{
	size_t copy_len, to_copy;
	int ret;

	to_copy = min_t(size_t, *len, image_size - vhca_buf->length);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == image_size) {
		migf->load_state = MLX5_VF_LOAD_STATE_LOAD_IMAGE;
		migf->max_pos += image_size;
		*has_work = true;
	}

	return 0;
}

static int
mlx5vf_resume_read_header_data(struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *vhca_buf,
			       const char __user **buf, size_t *len,
			       loff_t *pos, ssize_t *done)
{
	size_t copy_len, to_copy;
	size_t required_data;
	u8 *to_buff;
	int ret;

	required_data = migf->record_size - vhca_buf->length;
	to_copy = min_t(size_t, *len, required_data);
	copy_len = to_copy;
	while (to_copy) {
		ret = mlx5vf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, pos,
						    done);
		if (ret)
			return ret;
	}

	*len -= copy_len;
	if (vhca_buf->length == migf->record_size) {
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
		{
			struct page *page;

			page = mlx5vf_get_migration_page(vhca_buf, 0);
			if (!page)
				return -EINVAL;
			to_buff = kmap_local_page(page);
			migf->stop_copy_prep_size = min_t(u64,
				le64_to_cpup((__le64 *)to_buff), MAX_LOAD_SIZE);
			kunmap_local(to_buff);
			break;
		}
		default:
			/* Optional tag */
			break;
		}

		migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;
		migf->max_pos += migf->record_size;
		vhca_buf->length = 0;
	}

	return 0;
}

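/*
 * Parse the fixed-size record header of the incoming stream: record_size,
 * flags and tag. Known tags steer the state machine toward image or
 * header-data processing; unknown tags are accepted only when marked
 * optional, in which case their payload is read and skipped.
 */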
static int
mlx5vf_resume_read_header(struct mlx5_vf_migration_file *migf,
			  struct mlx5_vhca_data_buffer *vhca_buf,
			  const char __user **buf,
			  size_t *len, loff_t *pos,
			  ssize_t *done, bool *has_work)
{
	struct page *page;
	size_t copy_len;
	u8 *to_buff;
	int ret;

	copy_len = min_t(size_t, *len,
		sizeof(struct mlx5_vf_migration_header) - vhca_buf->length);
	page = mlx5vf_get_migration_page(vhca_buf, 0);
	if (!page)
		return -EINVAL;
	to_buff = kmap_local_page(page);
	ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len);
	if (ret) {
		ret = -EFAULT;
		goto end;
	}

	*buf += copy_len;
	*pos += copy_len;
	*done += copy_len;
	*len -= copy_len;
	vhca_buf->length += copy_len;
	if (vhca_buf->length == sizeof(struct mlx5_vf_migration_header)) {
		u64 record_size;
		u32 flags;

		record_size = le64_to_cpup((__le64 *)to_buff);
		if (record_size > MAX_LOAD_SIZE) {
			ret = -ENOMEM;
			goto end;
		}

		migf->record_size = record_size;
		flags = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, flags)));
		migf->record_tag = le32_to_cpup((__le32 *)(to_buff +
			    offsetof(struct mlx5_vf_migration_header, tag)));
		switch (migf->record_tag) {
		case MLX5_MIGF_HEADER_TAG_FW_DATA:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_IMAGE;
			break;
		case MLX5_MIGF_HEADER_TAG_STOP_COPY_SIZE:
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
			break;
		default:
			if (!(flags & MLX5_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) {
				ret = -EOPNOTSUPP;
				goto end;
			}
			/* We may read and skip this optional record data */
			migf->load_state = MLX5_VF_LOAD_STATE_PREP_HEADER_DATA;
		}

		migf->max_pos += vhca_buf->length;
		vhca_buf->length = 0;
		*has_work = true;
	}
end:
	kunmap_local(to_buff);
	return ret;
}

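/*
 * write() handler for the resume FD. Incoming data is fed through a small
 * state machine: READ_HEADER parses each record, PREP_* (re)sizes the
 * target buffer, READ_* accumulates the payload, and LOAD_IMAGE pushes a
 * completed device image to the firmware before the next record.
 */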
static ssize_t mlx5vf_resume_write(struct file *filp, const char __user *buf,
				   size_t len, loff_t *pos)
{
	struct mlx5_vf_migration_file *migf = filp->private_data;
	struct mlx5_vhca_data_buffer *vhca_buf = migf->buf[0];
	struct mlx5_vhca_data_buffer *vhca_buf_header = migf->buf_header[0];
	loff_t requested_length;
	bool has_work = false;
	ssize_t done = 0;
	int ret = 0;

	if (pos)
		return -ESPIPE;
	pos = &filp->f_pos;

	if (*pos < 0 ||
	    check_add_overflow((loff_t)len, *pos, &requested_length))
		return -EINVAL;

	mutex_lock(&migf->mvdev->state_mutex);
	mutex_lock(&migf->lock);
	if (migf->state == MLX5_MIGF_STATE_ERROR) {
		ret = -ENODEV;
		goto out_unlock;
	}

	while (len || has_work) {
		has_work = false;
		switch (migf->load_state) {
		case MLX5_VF_LOAD_STATE_READ_HEADER:
			ret = mlx5vf_resume_read_header(migf, vhca_buf_header,
							&buf, &len, pos,
							&done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_HEADER_DATA:
			if (vhca_buf_header->allocated_length < migf->record_size) {
				mlx5vf_free_data_buffer(vhca_buf_header);

				migf->buf_header[0] = mlx5vf_alloc_data_buffer(migf,
						migf->record_size, DMA_NONE);
				if (IS_ERR(migf->buf_header[0])) {
					ret = PTR_ERR(migf->buf_header[0]);
					migf->buf_header[0] = NULL;
					goto out_unlock;
				}

				vhca_buf_header = migf->buf_header[0];
			}

			vhca_buf_header->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER_DATA;
			break;
		case MLX5_VF_LOAD_STATE_READ_HEADER_DATA:
			ret = mlx5vf_resume_read_header_data(migf, vhca_buf_header,
							&buf, &len, pos, &done);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_PREP_IMAGE:
		{
			u64 size = max(migf->record_size,
				       migf->stop_copy_prep_size);

			if (vhca_buf->allocated_length < size) {
				mlx5vf_free_data_buffer(vhca_buf);

				migf->buf[0] = mlx5vf_alloc_data_buffer(migf,
							size, DMA_TO_DEVICE);
				if (IS_ERR(migf->buf[0])) {
					ret = PTR_ERR(migf->buf[0]);
					migf->buf[0] = NULL;
					goto out_unlock;
				}

				vhca_buf = migf->buf[0];
			}

			vhca_buf->start_pos = migf->max_pos;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_IMAGE;
			break;
		}
		case MLX5_VF_LOAD_STATE_READ_IMAGE:
			ret = mlx5vf_resume_read_image(migf, vhca_buf,
						migf->record_size,
						&buf, &len, pos, &done, &has_work);
			if (ret)
				goto out_unlock;
			break;
		case MLX5_VF_LOAD_STATE_LOAD_IMAGE:
			ret = mlx5vf_cmd_load_vhca_state(migf->mvdev, migf, vhca_buf);
			if (ret)
				goto out_unlock;
			migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

			/* prep header buf for next image */
			vhca_buf_header->length = 0;
			/* prep data buf for next image */
			vhca_buf->length = 0;

			break;
		default:
			break;
		}
	}

out_unlock:
	if (ret)
		migf->state = MLX5_MIGF_STATE_ERROR;
	mutex_unlock(&migf->lock);
	mlx5vf_state_mutex_unlock(migf->mvdev);
	return ret ? ret : done;
}

static const struct file_operations mlx5vf_resume_fops = {
	.owner = THIS_MODULE,
	.write = mlx5vf_resume_write,
	.release = mlx5vf_release_file,
	.llseek = no_llseek,
};

static struct mlx5_vf_migration_file *
mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev)
{
	struct mlx5_vf_migration_file *migf;
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT);
	if (!migf)
		return ERR_PTR(-ENOMEM);

	migf->filp = anon_inode_getfile("mlx5vf_mig", &mlx5vf_resume_fops, migf,
					O_WRONLY);
	if (IS_ERR(migf->filp)) {
		ret = PTR_ERR(migf->filp);
		goto end;
	}

	migf->mvdev = mvdev;
	ret = mlx5vf_cmd_alloc_pd(migf);
	if (ret)
		goto out_free;

	buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_pd;
	}

	migf->buf[0] = buf;
	buf = mlx5vf_alloc_data_buffer(migf,
		sizeof(struct mlx5_vf_migration_header), DMA_NONE);
	if (IS_ERR(buf)) {
		ret = PTR_ERR(buf);
		goto out_buf;
	}

	migf->buf_header[0] = buf;
	migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER;

	stream_open(migf->filp->f_inode, migf->filp);
	mutex_init(&migf->lock);
	INIT_LIST_HEAD(&migf->buf_list);
	INIT_LIST_HEAD(&migf->avail_list);
	spin_lock_init(&migf->list_lock);
	return migf;
out_buf:
	mlx5vf_free_data_buffer(migf->buf[0]);
out_pd:
	mlx5vf_cmd_dealloc_pd(migf);
out_free:
	fput(migf->filp);
end:
	kfree(migf);
	return ERR_PTR(ret);
}

void mlx5vf_disable_fds(struct mlx5vf_pci_core_device *mvdev,
			enum mlx5_vf_migf_state *last_save_state)
{
	if (mvdev->resuming_migf) {
		mlx5vf_disable_fd(mvdev->resuming_migf);
		mlx5fv_cmd_clean_migf_resources(mvdev->resuming_migf);
		fput(mvdev->resuming_migf->filp);
		mvdev->resuming_migf = NULL;
	}
	if (mvdev->saving_migf) {
		mlx5_cmd_cleanup_async_ctx(&mvdev->saving_migf->async_ctx);
		cancel_work_sync(&mvdev->saving_migf->async_data.work);
		if (last_save_state)
			*last_save_state = mvdev->saving_migf->state;
		mlx5vf_disable_fd(mvdev->saving_migf);
		wake_up_interruptible(&mvdev->saving_migf->poll_wait);
		mlx5fv_cmd_clean_migf_resources(mvdev->saving_migf);
		fput(mvdev->saving_migf->filp);
		mvdev->saving_migf = NULL;
	}
}

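/*
 * Execute a single FSM arc between two adjacent migration states. Arcs
 * that start a save or resume flow return the new file to hand to user
 * space; pure device-control arcs (suspend/resume VHCA) return NULL.
 */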
static struct file *
mlx5vf_pci_step_device_state_locked(struct mlx5vf_pci_core_device *mvdev,
				    u32 new)
{
	u32 cur = mvdev->mig_state;
	int ret;

	if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) {
		ret = mlx5vf_cmd_resume_vhca(mvdev,
			MLX5_RESUME_VHCA_IN_OP_MOD_RESUME_INITIATOR);
		if (ret)
			return ERR_PTR(ret);
		return NULL;
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, false);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) {
		mlx5vf_disable_fds(mvdev, NULL);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) ||
	    (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P &&
	     new == VFIO_DEVICE_STATE_RUNNING_P2P)) {
		struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
		struct mlx5_vhca_data_buffer *buf;
		enum mlx5_vf_migf_state state;
		size_t size;

		ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &size, NULL,
					MLX5VF_QUERY_INC | MLX5VF_QUERY_CLEANUP);
		if (ret)
			return ERR_PTR(ret);
		buf = mlx5vf_get_data_buffer(migf, size, DMA_FROM_DEVICE);
		if (IS_ERR(buf))
			return ERR_CAST(buf);
		/* pre_copy cleanup */
		ret = mlx5vf_cmd_save_vhca_state(mvdev, migf, buf, false, false);
		if (ret) {
			mlx5vf_put_data_buffer(buf);
			return ERR_PTR(ret);
		}
		mlx5vf_disable_fds(mvdev, &state);
		return (state != MLX5_MIGF_STATE_ERROR) ? NULL : ERR_PTR(-EIO);
	}

	if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_resume_device_data(mvdev);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->resuming_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) {
		mlx5vf_disable_fds(mvdev, NULL);
		return NULL;
	}

	if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) ||
	    (cur == VFIO_DEVICE_STATE_RUNNING_P2P &&
	     new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) {
		struct mlx5_vf_migration_file *migf;

		migf = mlx5vf_pci_save_device_data(mvdev, true);
		if (IS_ERR(migf))
			return ERR_CAST(migf);
		get_file(migf->filp);
		mvdev->saving_migf = migf;
		return migf->filp;
	}

	if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) {
		ret = mlx5vf_cmd_suspend_vhca(mvdev,
			MLX5_SUSPEND_VHCA_IN_OP_MOD_SUSPEND_RESPONDER);
		if (ret)
			return ERR_PTR(ret);
		ret = mlx5vf_pci_save_device_inc_data(mvdev);
		return ret ? ERR_PTR(ret) : NULL;
	}

	/*
	 * vfio_mig_get_next_state() does not use arcs other than the above
	 */
	WARN_ON(true);
	return ERR_PTR(-EINVAL);
}

/*
 * This function is called in all state_mutex unlock cases to
 * handle a 'deferred_reset' if exists.
 */
void mlx5vf_state_mutex_unlock(struct mlx5vf_pci_core_device *mvdev)
{
again:
	spin_lock(&mvdev->reset_lock);
	if (mvdev->deferred_reset) {
		mvdev->deferred_reset = false;
		spin_unlock(&mvdev->reset_lock);
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
		mlx5vf_disable_fds(mvdev, NULL);
		goto again;
	}
	mutex_unlock(&mvdev->state_mutex);
	spin_unlock(&mvdev->reset_lock);
}

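/*
 * Walk the FSM from the current state to the requested one, one arc at a
 * time as computed by vfio_mig_get_next_state(). Only the final arc may
 * legitimately return a file descriptor to user space; the WARN_ON below
 * catches a file returned mid-walk.
 */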
static struct file *
mlx5vf_pci_set_device_state(struct vfio_device *vdev,
			    enum vfio_device_mig_state new_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	enum vfio_device_mig_state next_state;
	struct file *res = NULL;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	while (new_state != mvdev->mig_state) {
		ret = vfio_mig_get_next_state(vdev, mvdev->mig_state,
					      new_state, &next_state);
		if (ret) {
			res = ERR_PTR(ret);
			break;
		}
		res = mlx5vf_pci_step_device_state_locked(mvdev, next_state);
		if (IS_ERR(res))
			break;
		mvdev->mig_state = next_state;
		if (WARN_ON(res && new_state != mvdev->mig_state)) {
			fput(res);
			res = ERR_PTR(-EINVAL);
			break;
		}
	}
	mlx5vf_state_mutex_unlock(mvdev);
	return res;
}

static int mlx5vf_pci_get_data_size(struct vfio_device *vdev,
				    unsigned long *stop_copy_length)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	size_t state_size;
	u64 total_size;
	int ret;

	mutex_lock(&mvdev->state_mutex);
	ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &state_size,
						    &total_size, 0);
	if (!ret)
		*stop_copy_length = total_size;
	mlx5vf_state_mutex_unlock(mvdev);
	return ret;
}

static int mlx5vf_pci_get_device_state(struct vfio_device *vdev,
				       enum vfio_device_mig_state *curr_state)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mutex_lock(&mvdev->state_mutex);
	*curr_state = mvdev->mig_state;
	mlx5vf_state_mutex_unlock(mvdev);
	return 0;
}

static void mlx5vf_pci_aer_reset_done(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	if (!mvdev->migrate_cap)
		return;

	/*
	 * As the higher VFIO layers are holding locks across reset and using
	 * those same locks with the mm_lock we need to prevent ABBA deadlock
	 * with the state_mutex and mm_lock.
	 * In case the state_mutex was taken already we defer the cleanup work
	 * to the unlock flow of the other running context.
	 */
	spin_lock(&mvdev->reset_lock);
	mvdev->deferred_reset = true;
	if (!mutex_trylock(&mvdev->state_mutex)) {
		spin_unlock(&mvdev->reset_lock);
		return;
	}
	spin_unlock(&mvdev->reset_lock);
	mlx5vf_state_mutex_unlock(mvdev);
}

static int mlx5vf_pci_open_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);
	struct vfio_pci_core_device *vdev = &mvdev->core_device;
	int ret;

	ret = vfio_pci_core_enable(vdev);
	if (ret)
		return ret;

	if (mvdev->migrate_cap)
		mvdev->mig_state = VFIO_DEVICE_STATE_RUNNING;
	vfio_pci_core_finish_enable(vdev);
	return 0;
}

static void mlx5vf_pci_close_device(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(
		core_vdev, struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_close_migratable(mvdev);
	vfio_pci_core_close_device(core_vdev);
}

static const struct vfio_migration_ops mlx5vf_pci_mig_ops = {
	.migration_set_state = mlx5vf_pci_set_device_state,
	.migration_get_state = mlx5vf_pci_get_device_state,
	.migration_get_data_size = mlx5vf_pci_get_data_size,
};

static const struct vfio_log_ops mlx5vf_pci_log_ops = {
	.log_start = mlx5vf_start_page_tracker,
	.log_stop = mlx5vf_stop_page_tracker,
	.log_read_and_clear = mlx5vf_tracker_read_and_clear,
};

static int mlx5vf_pci_init_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);
	int ret;

	ret = vfio_pci_core_init_dev(core_vdev);
	if (ret)
		return ret;

	mlx5vf_cmd_set_migratable(mvdev, &mlx5vf_pci_mig_ops,
				  &mlx5vf_pci_log_ops);

	return 0;
}

static void mlx5vf_pci_release_dev(struct vfio_device *core_vdev)
{
	struct mlx5vf_pci_core_device *mvdev = container_of(core_vdev,
			struct mlx5vf_pci_core_device, core_device.vdev);

	mlx5vf_cmd_remove_migratable(mvdev);
	vfio_pci_core_release_dev(core_vdev);
}

static const struct vfio_device_ops mlx5vf_pci_ops = {
	.name = "mlx5-vfio-pci",
	.init = mlx5vf_pci_init_dev,
	.release = mlx5vf_pci_release_dev,
	.open_device = mlx5vf_pci_open_device,
	.close_device = mlx5vf_pci_close_device,
	.ioctl = vfio_pci_core_ioctl,
	.device_feature = vfio_pci_core_ioctl_feature,
	.read = vfio_pci_core_read,
	.write = vfio_pci_core_write,
	.mmap = vfio_pci_core_mmap,
	.request = vfio_pci_core_request,
	.match = vfio_pci_core_match,
	.bind_iommufd = vfio_iommufd_physical_bind,
	.unbind_iommufd = vfio_iommufd_physical_unbind,
	.attach_ioas = vfio_iommufd_physical_attach_ioas,
	.detach_ioas = vfio_iommufd_physical_detach_ioas,
};

static int mlx5vf_pci_probe(struct pci_dev *pdev,
			    const struct pci_device_id *id)
{
	struct mlx5vf_pci_core_device *mvdev;
	int ret;

	mvdev = vfio_alloc_device(mlx5vf_pci_core_device, core_device.vdev,
				  &pdev->dev, &mlx5vf_pci_ops);
	if (IS_ERR(mvdev))
		return PTR_ERR(mvdev);

	dev_set_drvdata(&pdev->dev, &mvdev->core_device);
	ret = vfio_pci_core_register_device(&mvdev->core_device);
	if (ret)
		goto out_put_vdev;
	return 0;

out_put_vdev:
	vfio_put_device(&mvdev->core_device.vdev);
	return ret;
}

static void mlx5vf_pci_remove(struct pci_dev *pdev)
{
	struct mlx5vf_pci_core_device *mvdev = mlx5vf_drvdata(pdev);

	vfio_pci_core_unregister_device(&mvdev->core_device);
	vfio_put_device(&mvdev->core_device.vdev);
}

static const struct pci_device_id mlx5vf_pci_table[] = {
	{ PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_MELLANOX, 0x101e) }, /* ConnectX Family mlx5Gen Virtual Function */
	{}
};

MODULE_DEVICE_TABLE(pci, mlx5vf_pci_table);

static const struct pci_error_handlers mlx5vf_err_handlers = {
	.reset_done = mlx5vf_pci_aer_reset_done,
	.error_detected = vfio_pci_core_aer_err_detected,
};

static struct pci_driver mlx5vf_pci_driver = {
	.name = KBUILD_MODNAME,
	.id_table = mlx5vf_pci_table,
	.probe = mlx5vf_pci_probe,
	.remove = mlx5vf_pci_remove,
	.err_handler = &mlx5vf_err_handlers,
	.driver_managed_dma = true,
};

module_pci_driver(mlx5vf_pci_driver);

MODULE_IMPORT_NS(IOMMUFD);
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Max Gurtovoy <mgurtovoy@nvidia.com>");
MODULE_AUTHOR("Yishai Hadas <yishaih@nvidia.com>");
MODULE_DESCRIPTION(
	"MLX5 VFIO PCI - User Level meta-driver for MLX5 device family");