1// SPDX-License-Identifier: GPL-2.0 OR Linux-OpenIB
2/*
3 * Copyright (c) 2021-2022, NVIDIA CORPORATION & AFFILIATES. All rights reserved
4 */
5
6#include "cmd.h"
7
/* Return codes for internal completion-queue polling helpers */
enum { CQ_OK = 0, CQ_EMPTY = -1, CQ_POLL_ERR = -2 };
9
10static int mlx5vf_is_migratable(struct mlx5_core_dev *mdev, u16 func_id)
11{
12 int query_sz = MLX5_ST_SZ_BYTES(query_hca_cap_out);
13 void *query_cap = NULL, *cap;
14 int ret;
15
16 query_cap = kzalloc(size: query_sz, GFP_KERNEL);
17 if (!query_cap)
18 return -ENOMEM;
19
20 ret = mlx5_vport_get_other_func_cap(dev: mdev, vport: func_id, out: query_cap,
21 opmod: MLX5_CAP_GENERAL_2);
22 if (ret)
23 goto out;
24
25 cap = MLX5_ADDR_OF(query_hca_cap_out, query_cap, capability);
26 if (!MLX5_GET(cmd_hca_cap_2, cap, migratable))
27 ret = -EOPNOTSUPP;
28out:
29 kfree(objp: query_cap);
30 return ret;
31}
32
33static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
34 u16 *vhca_id);
35static void
36_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev);
37
/*
 * Issue SUSPEND_VHCA for this VF. @op_mod selects the device-defined
 * suspend sub-operation. Must be called with state_mutex held.
 * Returns 0 on success or a negative errno.
 */
int mlx5vf_cmd_suspend_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
{
	struct mlx5_vf_migration_file *migf = mvdev->saving_migf;
	u32 out[MLX5_ST_SZ_DW(suspend_vhca_out)] = {};
	u32 in[MLX5_ST_SZ_DW(suspend_vhca_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while the device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel, might end-up with a failure in the save
	 * command once it will try to turn on 'tracking' on a suspended device.
	 */
	if (migf) {
		err = wait_for_completion_interruptible(&migf->save_comp);
		if (err)
			return err;
	}

	MLX5_SET(suspend_vhca_in, in, opcode, MLX5_CMD_OP_SUSPEND_VHCA);
	MLX5_SET(suspend_vhca_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(suspend_vhca_in, in, op_mod, op_mod);

	err = mlx5_cmd_exec_inout(mvdev->mdev, suspend_vhca, in, out);
	/* Re-allow SAVE commands regardless of the suspend result */
	if (migf)
		complete(&migf->save_comp);

	return err;
}
71
72int mlx5vf_cmd_resume_vhca(struct mlx5vf_pci_core_device *mvdev, u16 op_mod)
73{
74 u32 out[MLX5_ST_SZ_DW(resume_vhca_out)] = {};
75 u32 in[MLX5_ST_SZ_DW(resume_vhca_in)] = {};
76
77 lockdep_assert_held(&mvdev->state_mutex);
78 if (mvdev->mdev_detach)
79 return -ENOTCONN;
80
81 MLX5_SET(resume_vhca_in, in, opcode, MLX5_CMD_OP_RESUME_VHCA);
82 MLX5_SET(resume_vhca_in, in, vhca_id, mvdev->vhca_id);
83 MLX5_SET(resume_vhca_in, in, op_mod, op_mod);
84
85 return mlx5_cmd_exec_inout(mvdev->mdev, resume_vhca, in, out);
86}
87
/*
 * Query how much migration data the device has pending for this VHCA.
 *
 * @state_size: set to the umem size required for the next SAVE command.
 * @total_size: optional; in chunk mode set to the device-reported remaining
 *	total size, otherwise mirrors *state_size.
 * @query_flags: MLX5VF_QUERY_* bits (incremental / final / cleanup).
 *
 * Must be called with state_mutex held. Returns 0 on success or a
 * negative errno.
 */
int mlx5vf_cmd_query_vhca_migration_state(struct mlx5vf_pci_core_device *mvdev,
					  size_t *state_size, u64 *total_size,
					  u8 query_flags)
{
	u32 out[MLX5_ST_SZ_DW(query_vhca_migration_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(query_vhca_migration_state_in)] = {};
	bool inc = query_flags & MLX5VF_QUERY_INC;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/*
	 * In case PRE_COPY is used, saving_migf is exposed while device is
	 * running. Make sure to run only once there is no active save command.
	 * Running both in parallel, might end-up with a failure in the
	 * incremental query command on un-tracked vhca.
	 */
	if (inc) {
		ret = wait_for_completion_interruptible(&mvdev->saving_migf->save_comp);
		if (ret)
			return ret;
		/* Upon cleanup, ignore previous pre_copy error state */
		if (mvdev->saving_migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR &&
		    !(query_flags & MLX5VF_QUERY_CLEANUP)) {
			/*
			 * In case we had a PRE_COPY error, only query full
			 * image for final image
			 */
			if (!(query_flags & MLX5VF_QUERY_FINAL)) {
				*state_size = 0;
				complete(&mvdev->saving_migf->save_comp);
				return 0;
			}
			query_flags &= ~MLX5VF_QUERY_INC;
		}
		/* Block incremental query which is state-dependent */
		if (mvdev->saving_migf->state == MLX5_MIGF_STATE_ERROR) {
			complete(&mvdev->saving_migf->save_comp);
			return -ENODEV;
		}
	}

	MLX5_SET(query_vhca_migration_state_in, in, opcode,
		 MLX5_CMD_OP_QUERY_VHCA_MIGRATION_STATE);
	MLX5_SET(query_vhca_migration_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(query_vhca_migration_state_in, in, op_mod, 0);
	MLX5_SET(query_vhca_migration_state_in, in, incremental,
		 query_flags & MLX5VF_QUERY_INC);
	MLX5_SET(query_vhca_migration_state_in, in, chunk, mvdev->chunk_mode);

	ret = mlx5_cmd_exec_inout(mvdev->mdev, query_vhca_migration_state, in,
				  out);
	/* Re-allow SAVE commands regardless of the query result */
	if (inc)
		complete(&mvdev->saving_migf->save_comp);

	if (ret)
		return ret;

	*state_size = MLX5_GET(query_vhca_migration_state_out, out,
			       required_umem_size);
	if (total_size)
		*total_size = mvdev->chunk_mode ?
			MLX5_GET64(query_vhca_migration_state_out, out,
				   remaining_total_size) : *state_size;

	return 0;
}
157
/* Flag a tracker object change and wake the tracker if it is waiting */
static void set_tracker_change_event(struct mlx5vf_pci_core_device *mvdev)
{
	mvdev->tracker.object_changed = true;
	complete(&mvdev->tracker_comp);
}
163
static void set_tracker_error(struct mlx5vf_pci_core_device *mvdev)
{
	/* Mark the tracker under an error and wake it up if it's running */
	mvdev->tracker.is_err = true;
	complete(&mvdev->tracker_comp);
}
170
171static int mlx5fv_vf_event(struct notifier_block *nb,
172 unsigned long event, void *data)
173{
174 struct mlx5vf_pci_core_device *mvdev =
175 container_of(nb, struct mlx5vf_pci_core_device, nb);
176
177 switch (event) {
178 case MLX5_PF_NOTIFY_ENABLE_VF:
179 mutex_lock(&mvdev->state_mutex);
180 mvdev->mdev_detach = false;
181 mlx5vf_state_mutex_unlock(mvdev);
182 break;
183 case MLX5_PF_NOTIFY_DISABLE_VF:
184 mlx5vf_cmd_close_migratable(mvdev);
185 mutex_lock(&mvdev->state_mutex);
186 mvdev->mdev_detach = true;
187 mlx5vf_state_mutex_unlock(mvdev);
188 break;
189 default:
190 break;
191 }
192
193 return 0;
194}
195
/*
 * Tear down active migration state: fail the dirty-page tracker, close any
 * open migration FDs and free tracker resources. No-op when the device was
 * never marked migratable.
 */
void mlx5vf_cmd_close_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	/* Must be done outside the lock to let it progress */
	set_tracker_error(mvdev);
	mutex_lock(&mvdev->state_mutex);
	mlx5vf_disable_fds(mvdev, NULL);
	_mlx5vf_free_page_tracker_resources(mvdev);
	mlx5vf_state_mutex_unlock(mvdev);
}
208
/*
 * Undo mlx5vf_cmd_set_migratable(): unregister the SR-IOV notifier and
 * destroy the callback workqueue. No-op when migration was never enabled.
 */
void mlx5vf_cmd_remove_migratable(struct mlx5vf_pci_core_device *mvdev)
{
	if (!mvdev->migrate_cap)
		return;

	mlx5_sriov_blocking_notifier_unregister(mvdev->mdev, mvdev->vf_id,
						&mvdev->nb);
	destroy_workqueue(mvdev->cb_wq);
}
218
/*
 * Probe the VF for migration support and enable it when every prerequisite
 * holds: the device is a VF with a reachable PF core device, the migration /
 * multi-load / tracking-state caps are set, the VF id resolves, the
 * per-function 'migratable' cap is present and a vhca_id can be read. Only
 * then are the ordered workqueue, SR-IOV notifier and @mig_ops installed and
 * migrate_cap set. @log_ops is installed only with adv_virtualization.
 * Any failure leaves the device silently non-migratable.
 */
void mlx5vf_cmd_set_migratable(struct mlx5vf_pci_core_device *mvdev,
			       const struct vfio_migration_ops *mig_ops,
			       const struct vfio_log_ops *log_ops)
{
	struct pci_dev *pdev = mvdev->core_device.pdev;
	int ret;

	if (!pdev->is_virtfn)
		return;

	mvdev->mdev = mlx5_vf_get_core_dev(pdev);
	if (!mvdev->mdev)
		return;

	if (!MLX5_CAP_GEN(mvdev->mdev, migration))
		goto end;

	if (!(MLX5_CAP_GEN_2(mvdev->mdev, migration_multi_load) &&
	      MLX5_CAP_GEN_2(mvdev->mdev, migration_tracking_state)))
		goto end;

	mvdev->vf_id = pci_iov_vf_id(pdev);
	if (mvdev->vf_id < 0)
		goto end;

	/* Other-function queries address this VF as vf_id + 1 */
	ret = mlx5vf_is_migratable(mvdev->mdev, mvdev->vf_id + 1);
	if (ret)
		goto end;

	if (mlx5vf_cmd_get_vhca_id(mvdev->mdev, mvdev->vf_id + 1,
				   &mvdev->vhca_id))
		goto end;

	mvdev->cb_wq = alloc_ordered_workqueue("mlx5vf_wq", 0);
	if (!mvdev->cb_wq)
		goto end;

	mutex_init(&mvdev->state_mutex);
	spin_lock_init(&mvdev->reset_lock);
	mvdev->nb.notifier_call = mlx5fv_vf_event;
	ret = mlx5_sriov_blocking_notifier_register(mvdev->mdev, mvdev->vf_id,
						    &mvdev->nb);
	if (ret) {
		destroy_workqueue(mvdev->cb_wq);
		goto end;
	}

	mvdev->migrate_cap = 1;
	mvdev->core_device.vdev.migration_flags =
		VFIO_MIGRATION_STOP_COPY |
		VFIO_MIGRATION_P2P |
		VFIO_MIGRATION_PRE_COPY;

	mvdev->core_device.vdev.mig_ops = mig_ops;
	init_completion(&mvdev->tracker_comp);
	if (MLX5_CAP_GEN(mvdev->mdev, adv_virtualization))
		mvdev->core_device.vdev.log_ops = log_ops;

	if (MLX5_CAP_GEN_2(mvdev->mdev, migration_in_chunks))
		mvdev->chunk_mode = 1;

end:
	/* Drop the reference taken by mlx5_vf_get_core_dev() on all paths */
	mlx5_vf_put_core_dev(mvdev->mdev);
}
283
284static int mlx5vf_cmd_get_vhca_id(struct mlx5_core_dev *mdev, u16 function_id,
285 u16 *vhca_id)
286{
287 u32 in[MLX5_ST_SZ_DW(query_hca_cap_in)] = {};
288 int out_size;
289 void *out;
290 int ret;
291
292 out_size = MLX5_ST_SZ_BYTES(query_hca_cap_out);
293 out = kzalloc(size: out_size, GFP_KERNEL);
294 if (!out)
295 return -ENOMEM;
296
297 MLX5_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP);
298 MLX5_SET(query_hca_cap_in, in, other_function, 1);
299 MLX5_SET(query_hca_cap_in, in, function_id, function_id);
300 MLX5_SET(query_hca_cap_in, in, op_mod,
301 MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE << 1 |
302 HCA_CAP_OPMOD_GET_CUR);
303
304 ret = mlx5_cmd_exec_inout(mdev, query_hca_cap, in, out);
305 if (ret)
306 goto err_exec;
307
308 *vhca_id = MLX5_GET(query_hca_cap_out, out,
309 capability.cmd_hca_cap.vhca_id);
310
311err_exec:
312 kfree(objp: out);
313 return ret;
314}
315
/*
 * Create an MTT-based mkey covering either a migration data buffer (@buf)
 * or a page-tracker receive buffer (@recv_buf); exactly one of the two is
 * expected to be non-NULL (only @buf is tested). The key is created on
 * protection domain @pdn and returned through @mkey.
 * Returns 0 or a negative errno.
 */
static int _create_mkey(struct mlx5_core_dev *mdev, u32 pdn,
			struct mlx5_vhca_data_buffer *buf,
			struct mlx5_vhca_recv_buf *recv_buf,
			u32 *mkey)
{
	size_t npages = buf ? DIV_ROUND_UP(buf->allocated_length, PAGE_SIZE) :
				recv_buf->npages;
	int err = 0, inlen;
	__be64 *mtt;
	void *mkc;
	u32 *in;

	/* MTT entries are consumed in octword pairs, hence round_up(, 2) */
	inlen = MLX5_ST_SZ_BYTES(create_mkey_in) +
		sizeof(*mtt) * round_up(npages, 2);

	in = kvzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	MLX5_SET(create_mkey_in, in, translations_octword_actual_size,
		 DIV_ROUND_UP(npages, 2));
	mtt = (__be64 *)MLX5_ADDR_OF(create_mkey_in, in, klm_pas_mtt);

	/* Fill the MTT list with the DMA address of every page */
	if (buf) {
		struct sg_dma_page_iter dma_iter;

		for_each_sgtable_dma_page(&buf->table.sgt, &dma_iter, 0)
			*mtt++ = cpu_to_be64(sg_page_iter_dma_address(&dma_iter));
	} else {
		int i;

		for (i = 0; i < npages; i++)
			*mtt++ = cpu_to_be64(recv_buf->dma_addrs[i]);
	}

	mkc = MLX5_ADDR_OF(create_mkey_in, in, memory_key_mkey_entry);
	MLX5_SET(mkc, mkc, access_mode_1_0, MLX5_MKC_ACCESS_MODE_MTT);
	MLX5_SET(mkc, mkc, lr, 1);
	MLX5_SET(mkc, mkc, lw, 1);
	MLX5_SET(mkc, mkc, rr, 1);
	MLX5_SET(mkc, mkc, rw, 1);
	MLX5_SET(mkc, mkc, pd, pdn);
	MLX5_SET(mkc, mkc, bsf_octword_size, 0);
	MLX5_SET(mkc, mkc, qpn, 0xffffff);
	MLX5_SET(mkc, mkc, log_page_size, PAGE_SHIFT);
	MLX5_SET(mkc, mkc, translations_octword_size, DIV_ROUND_UP(npages, 2));
	MLX5_SET64(mkc, mkc, len, npages * PAGE_SIZE);
	err = mlx5_core_create_mkey(mdev, mkey, in, inlen);
	kvfree(in);
	return err;
}
367
/*
 * DMA-map @buf's scatter list and create its mkey. Marks buf->dmaed on
 * success so it is only done once per buffer. Must be called with
 * state_mutex held. Returns 0 or a negative errno (-EINVAL if already
 * mapped or the buffer is empty).
 */
static int mlx5vf_dma_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5vf_pci_core_device *mvdev = buf->migf->mvdev;
	struct mlx5_core_dev *mdev = mvdev->mdev;
	int ret;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	if (buf->dmaed || !buf->allocated_length)
		return -EINVAL;

	ret = dma_map_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	if (ret)
		return ret;

	ret = _create_mkey(mdev, buf->migf->pdn, buf, NULL, &buf->mkey);
	if (ret)
		goto err;

	buf->dmaed = true;

	return 0;
err:
	/* mkey creation failed — undo the DMA mapping */
	dma_unmap_sgtable(mdev->device, &buf->table.sgt, buf->dma_dir, 0);
	return ret;
}
396
/*
 * Fully release a data buffer: destroy its mkey and DMA mapping (when
 * mapped), free every page behind its append-table, the table itself and
 * finally the buffer struct. Must be called with state_mutex held.
 */
void mlx5vf_free_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	struct mlx5_vf_migration_file *migf = buf->migf;
	struct sg_page_iter sg_iter;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	if (buf->dmaed) {
		mlx5_core_destroy_mkey(migf->mvdev->mdev, buf->mkey);
		dma_unmap_sgtable(migf->mvdev->mdev->device, &buf->table.sgt,
				  buf->dma_dir, 0);
	}

	/* Undo alloc_pages_bulk_array() */
	for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0)
		__free_page(sg_page_iter_page(&sg_iter));
	sg_free_append_table(&buf->table);
	kfree(buf);
}
417
/*
 * Grow @buf by @npages pages, bulk-allocating up to one PAGE_SIZE worth of
 * page pointers at a time and appending them to the buffer's scatter
 * append-table. allocated_length is advanced as pages land in the table.
 * Returns 0 or a negative errno; on failure, pages already appended remain
 * owned by @buf and are released by mlx5vf_free_data_buffer().
 *
 * NOTE(review): if sg_alloc_append_table_from_pages() fails, the pages just
 * allocated into page_list are not in the table and do not appear to be
 * freed here — verify against the sg append-table error contract.
 */
static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf,
				      unsigned int npages)
{
	unsigned int to_alloc = npages;
	struct page **page_list;
	unsigned long filled;
	unsigned int to_fill;
	int ret;

	/* Cap the pointer array at one page to bound the allocation */
	to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list));
	page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT);
	if (!page_list)
		return -ENOMEM;

	do {
		filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill,
						page_list);
		if (!filled) {
			ret = -ENOMEM;
			goto err;
		}
		to_alloc -= filled;
		ret = sg_alloc_append_table_from_pages(
			&buf->table, page_list, filled, 0,
			filled << PAGE_SHIFT, UINT_MAX, SG_MAX_SINGLE_ALLOC,
			GFP_KERNEL_ACCOUNT);

		if (ret)
			goto err;
		buf->allocated_length += filled * PAGE_SIZE;
		/* clean input for another bulk allocation */
		memset(page_list, 0, filled * sizeof(*page_list));
		to_fill = min_t(unsigned int, to_alloc,
				PAGE_SIZE / sizeof(*page_list));
	} while (to_alloc > 0);

	kvfree(page_list);
	return 0;

err:
	kvfree(page_list);
	return ret;
}
461
/*
 * Allocate a new data buffer of at least @length bytes for @migf.
 * When @dma_dir is not DMA_NONE the buffer is also DMA-mapped and given an
 * mkey. Returns the buffer or an ERR_PTR; partial allocations are cleaned
 * up via mlx5vf_free_data_buffer().
 */
struct mlx5_vhca_data_buffer *
mlx5vf_alloc_data_buffer(struct mlx5_vf_migration_file *migf,
			 size_t length,
			 enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf;
	int ret;

	buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT);
	if (!buf)
		return ERR_PTR(-ENOMEM);

	buf->dma_dir = dma_dir;
	buf->migf = migf;
	if (length) {
		ret = mlx5vf_add_migration_pages(buf,
				DIV_ROUND_UP_ULL(length, PAGE_SIZE));
		if (ret)
			goto end;

		if (dma_dir != DMA_NONE) {
			ret = mlx5vf_dma_data_buffer(buf);
			if (ret)
				goto end;
		}
	}

	return buf;
end:
	mlx5vf_free_data_buffer(buf);
	return ERR_PTR(ret);
}
494
/*
 * Return @buf to its migration file's avail_list for later reuse by
 * mlx5vf_get_data_buffer(), resetting its chunk number.
 */
void mlx5vf_put_data_buffer(struct mlx5_vhca_data_buffer *buf)
{
	spin_lock_irq(&buf->migf->list_lock);
	buf->stop_copy_chunk_num = 0;
	list_add_tail(&buf->buf_elm, &buf->migf->avail_list);
	spin_unlock_irq(&buf->migf->list_lock);
}
502
/*
 * Get a data buffer of at least @length bytes with direction @dma_dir,
 * preferring a reusable one from migf->avail_list and allocating a fresh
 * one otherwise. Too-small candidates of the same direction are moved to a
 * local list and freed after the spinlock is dropped (freeing may sleep).
 * Must be called with state_mutex held. Returns the buffer or an ERR_PTR.
 */
struct mlx5_vhca_data_buffer *
mlx5vf_get_data_buffer(struct mlx5_vf_migration_file *migf,
		       size_t length, enum dma_data_direction dma_dir)
{
	struct mlx5_vhca_data_buffer *buf, *temp_buf;
	struct list_head free_list;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return ERR_PTR(-ENOTCONN);

	INIT_LIST_HEAD(&free_list);

	spin_lock_irq(&migf->list_lock);
	list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) {
		if (buf->dma_dir == dma_dir) {
			list_del_init(&buf->buf_elm);
			if (buf->allocated_length >= length) {
				spin_unlock_irq(&migf->list_lock);
				goto found;
			}
			/*
			 * Prevent holding redundant buffers. Put in a free
			 * list and call at the end not under the spin lock
			 * (&migf->list_lock) to mlx5vf_free_data_buffer which
			 * might sleep.
			 */
			list_add(&buf->buf_elm, &free_list);
		}
	}
	spin_unlock_irq(&migf->list_lock);
	/* No reusable buffer found — allocate; falls through to cleanup */
	buf = mlx5vf_alloc_data_buffer(migf, length, dma_dir);

found:
	while ((temp_buf = list_first_entry_or_null(&free_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&temp_buf->buf_elm);
		mlx5vf_free_data_buffer(temp_buf);
	}

	return buf;
}
545
/*
 * Common tail of the SAVE completion paths: free the command output,
 * release save_comp (re-allowing suspend/query/save) and drop the file
 * reference taken when the command was issued.
 */
static void
mlx5vf_save_callback_complete(struct mlx5_vf_migration_file *migf,
			      struct mlx5vf_async_data *async_data)
{
	kvfree(async_data->out);
	complete(&migf->save_comp);
	fput(migf->filp);
}
554
/*
 * Workqueue handler that finishes a failed SAVE command in sleepable
 * context (queued from mlx5vf_save_callback()): recycle the data/header
 * buffers, move the file to an error state and wake pollers.
 */
void mlx5vf_mig_file_cleanup_cb(struct work_struct *_work)
{
	struct mlx5vf_async_data *async_data = container_of(_work,
		struct mlx5vf_async_data, work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);

	mutex_lock(&migf->lock);
	if (async_data->status) {
		mlx5vf_put_data_buffer(async_data->buf);
		if (async_data->header_buf)
			mlx5vf_put_data_buffer(async_data->header_buf);
		/* BAD_RES_STATE during pre-copy is recoverable via final SAVE */
		if (!async_data->stop_copy_chunk &&
		    async_data->status == MLX5_CMD_STAT_BAD_RES_STATE_ERR)
			migf->state = MLX5_MIGF_STATE_PRE_COPY_ERROR;
		else
			migf->state = MLX5_MIGF_STATE_ERROR;
		wake_up_interruptible(&migf->poll_wait);
	}
	mutex_unlock(&migf->lock);
	mlx5vf_save_callback_complete(migf, async_data);
}
577
/*
 * Write a migration stream header describing an FW_DATA record of
 * @image_size bytes into @header_buf and queue it on migf->buf_list so the
 * reader consumes it before the image data. @initial_pre_copy adds the
 * header length to pre_copy_initial_bytes accounting.
 * Returns 0 or -EINVAL if the header page cannot be resolved.
 */
static int add_buf_header(struct mlx5_vhca_data_buffer *header_buf,
			  size_t image_size, bool initial_pre_copy)
{
	struct mlx5_vf_migration_file *migf = header_buf->migf;
	struct mlx5_vf_migration_header header = {};
	unsigned long flags;
	struct page *page;
	u8 *to_buff;

	header.record_size = cpu_to_le64(image_size);
	header.flags = cpu_to_le32(MLX5_MIGF_HEADER_FLAGS_TAG_MANDATORY);
	header.tag = cpu_to_le32(MLX5_MIGF_HEADER_TAG_FW_DATA);
	page = mlx5vf_get_migration_page(header_buf, 0);
	if (!page)
		return -EINVAL;
	/* Copy the header into the first page of the buffer */
	to_buff = kmap_local_page(page);
	memcpy(to_buff, &header, sizeof(header));
	kunmap_local(to_buff);
	header_buf->length = sizeof(header);
	header_buf->start_pos = header_buf->migf->max_pos;
	migf->max_pos += header_buf->length;
	spin_lock_irqsave(&migf->list_lock, flags);
	list_add_tail(&header_buf->buf_elm, &migf->buf_list);
	spin_unlock_irqrestore(&migf->list_lock, flags);
	if (initial_pre_copy)
		migf->pre_copy_initial_bytes += sizeof(header);
	return 0;
}
606
/*
 * Async completion handler for SAVE_VHCA_STATE. On success: publish the
 * header (if any) and data buffer on migf->buf_list, update chunk and
 * pre-copy accounting, advance migf->state and, in chunk mode, schedule
 * the SAVE of the next chunk. On failure: record the status and defer the
 * (sleepable) error handling to mlx5vf_mig_file_cleanup_cb() on the
 * ordered workqueue, since this callback may not sleep.
 */
static void mlx5vf_save_callback(int status, struct mlx5_async_work *context)
{
	struct mlx5vf_async_data *async_data = container_of(context,
		struct mlx5vf_async_data, cb_work);
	struct mlx5_vf_migration_file *migf = container_of(async_data,
		struct mlx5_vf_migration_file, async_data);

	if (!status) {
		size_t next_required_umem_size = 0;
		bool stop_copy_last_chunk;
		size_t image_size;
		unsigned long flags;
		/* First pre-copy image: not yet in PRE_COPY and not stop-copy */
		bool initial_pre_copy = migf->state != MLX5_MIGF_STATE_PRE_COPY &&
				!async_data->stop_copy_chunk;

		image_size = MLX5_GET(save_vhca_state_out, async_data->out,
				      actual_image_size);
		if (async_data->buf->stop_copy_chunk_num)
			next_required_umem_size = MLX5_GET(save_vhca_state_out,
					async_data->out, next_required_umem_size);
		stop_copy_last_chunk = async_data->stop_copy_chunk &&
				!next_required_umem_size;
		if (async_data->header_buf) {
			status = add_buf_header(async_data->header_buf, image_size,
						initial_pre_copy);
			if (status)
				goto err;
		}
		async_data->buf->length = image_size;
		async_data->buf->start_pos = migf->max_pos;
		migf->max_pos += async_data->buf->length;
		spin_lock_irqsave(&migf->list_lock, flags);
		list_add_tail(&async_data->buf->buf_elm, &migf->buf_list);
		if (async_data->buf->stop_copy_chunk_num) {
			migf->num_ready_chunks++;
			if (next_required_umem_size &&
			    migf->num_ready_chunks >= MAX_NUM_CHUNKS) {
				/* Delay the next SAVE till one chunk be consumed */
				migf->next_required_umem_size = next_required_umem_size;
				next_required_umem_size = 0;
			}
		}
		spin_unlock_irqrestore(&migf->list_lock, flags);
		if (initial_pre_copy) {
			migf->pre_copy_initial_bytes += image_size;
			migf->state = MLX5_MIGF_STATE_PRE_COPY;
		}
		if (stop_copy_last_chunk)
			migf->state = MLX5_MIGF_STATE_COMPLETE;
		wake_up_interruptible(&migf->poll_wait);
		if (next_required_umem_size)
			mlx5vf_mig_file_set_save_work(migf,
				/* Picking up the next chunk num */
				(async_data->buf->stop_copy_chunk_num % MAX_NUM_CHUNKS) + 1,
				next_required_umem_size);
		mlx5vf_save_callback_complete(migf, async_data);
		return;
	}

err:
	/* The error flow can't run from an interrupt context */
	if (status == -EREMOTEIO) {
		status = MLX5_GET(save_vhca_state_out, async_data->out, status);
		/* Failed in FW, print cmd out failure details */
		mlx5_cmd_out_err(migf->mvdev->mdev, MLX5_CMD_OP_SAVE_VHCA_STATE, 0,
				 async_data->out);
	}

	async_data->status = status;
	queue_work(migf->mvdev->cb_wq, &async_data->work);
}
678
/*
 * Issue an asynchronous SAVE_VHCA_STATE into @buf.
 *
 * @inc: save only the data changed since the previous save.
 * @track: ask the device to enable dirty tracking as part of the save.
 *
 * Takes migf->save_comp for the duration of the command; it is released by
 * mlx5vf_save_callback_complete() on completion, or here on a setup error.
 * Returns 0 once the command is in flight (result delivered via
 * mlx5vf_save_callback()), or a negative errno.
 */
int mlx5vf_cmd_save_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf, bool inc,
			       bool track)
{
	u32 out_size = MLX5_ST_SZ_BYTES(save_vhca_state_out);
	u32 in[MLX5_ST_SZ_DW(save_vhca_state_in)] = {};
	struct mlx5_vhca_data_buffer *header_buf = NULL;
	struct mlx5vf_async_data *async_data;
	bool pre_copy_cleanup = false;
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/* Serialize against any in-flight SAVE */
	err = wait_for_completion_interruptible(&migf->save_comp);
	if (err)
		return err;

	/* Final SAVE after a pre-copy phase, with no tracking/increment asked */
	if ((migf->state == MLX5_MIGF_STATE_PRE_COPY ||
	     migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR) && !track && !inc)
		pre_copy_cleanup = true;

	if (migf->state == MLX5_MIGF_STATE_PRE_COPY_ERROR)
		/*
		 * In case we had a PRE_COPY error, SAVE is triggered only for
		 * the final image, read device full image.
		 */
		inc = false;

	MLX5_SET(save_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_SAVE_VHCA_STATE);
	MLX5_SET(save_vhca_state_in, in, op_mod, 0);
	MLX5_SET(save_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(save_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(save_vhca_state_in, in, size, buf->allocated_length);
	MLX5_SET(save_vhca_state_in, in, incremental, inc);
	MLX5_SET(save_vhca_state_in, in, set_track, track);

	async_data = &migf->async_data;
	async_data->buf = buf;
	async_data->stop_copy_chunk = (!track && !pre_copy_cleanup);
	async_data->out = kvzalloc(out_size, GFP_KERNEL);
	if (!async_data->out) {
		err = -ENOMEM;
		goto err_out;
	}

	/* Reuse a pre-allocated header buffer for this chunk when available */
	if (async_data->stop_copy_chunk) {
		u8 header_idx = buf->stop_copy_chunk_num ?
			buf->stop_copy_chunk_num - 1 : 0;

		header_buf = migf->buf_header[header_idx];
		migf->buf_header[header_idx] = NULL;
	}

	if (!header_buf) {
		header_buf = mlx5vf_get_data_buffer(migf,
			sizeof(struct mlx5_vf_migration_header), DMA_NONE);
		if (IS_ERR(header_buf)) {
			err = PTR_ERR(header_buf);
			goto err_free;
		}
	}

	if (async_data->stop_copy_chunk)
		migf->state = MLX5_MIGF_STATE_SAVE_STOP_COPY_CHUNK;

	async_data->header_buf = header_buf;
	/* Hold the file alive until the async callback completes */
	get_file(migf->filp);
	err = mlx5_cmd_exec_cb(&migf->async_ctx, in, sizeof(in),
			       async_data->out,
			       out_size, mlx5vf_save_callback,
			       &async_data->cb_work);
	if (err)
		goto err_exec;

	return 0;

err_exec:
	if (header_buf)
		mlx5vf_put_data_buffer(header_buf);
	fput(migf->filp);
err_free:
	kvfree(async_data->out);
err_out:
	complete(&migf->save_comp);
	return err;
}
769
/*
 * Load device state from @buf via LOAD_VHCA_STATE. The buffer is DMA-mapped
 * on first use. Must be called with state_mutex held.
 * Returns 0 or a negative errno.
 */
int mlx5vf_cmd_load_vhca_state(struct mlx5vf_pci_core_device *mvdev,
			       struct mlx5_vf_migration_file *migf,
			       struct mlx5_vhca_data_buffer *buf)
{
	u32 out[MLX5_ST_SZ_DW(load_vhca_state_out)] = {};
	u32 in[MLX5_ST_SZ_DW(load_vhca_state_in)] = {};
	int err;

	lockdep_assert_held(&mvdev->state_mutex);
	if (mvdev->mdev_detach)
		return -ENOTCONN;

	/* Map lazily — resume buffers are filled before being loaded */
	if (!buf->dmaed) {
		err = mlx5vf_dma_data_buffer(buf);
		if (err)
			return err;
	}

	MLX5_SET(load_vhca_state_in, in, opcode,
		 MLX5_CMD_OP_LOAD_VHCA_STATE);
	MLX5_SET(load_vhca_state_in, in, op_mod, 0);
	MLX5_SET(load_vhca_state_in, in, vhca_id, mvdev->vhca_id);
	MLX5_SET(load_vhca_state_in, in, mkey, buf->mkey);
	MLX5_SET(load_vhca_state_in, in, size, buf->length);
	return mlx5_cmd_exec_inout(mvdev->mdev, load_vhca_state, in, out);
}
796
797int mlx5vf_cmd_alloc_pd(struct mlx5_vf_migration_file *migf)
798{
799 int err;
800
801 lockdep_assert_held(&migf->mvdev->state_mutex);
802 if (migf->mvdev->mdev_detach)
803 return -ENOTCONN;
804
805 err = mlx5_core_alloc_pd(dev: migf->mvdev->mdev, pdn: &migf->pdn);
806 return err;
807}
808
/*
 * Release the protection domain allocated by mlx5vf_cmd_alloc_pd().
 * Silently skipped when the core device has detached.
 */
void mlx5vf_cmd_dealloc_pd(struct mlx5_vf_migration_file *migf)
{
	lockdep_assert_held(&migf->mvdev->state_mutex);
	if (migf->mvdev->mdev_detach)
		return;

	mlx5_core_dealloc_pd(migf->mvdev->mdev, migf->pdn);
}
817
/*
 * Free every buffer associated with @migf — per-chunk data and header
 * slots, then everything on the avail and buf lists — and finally release
 * its protection domain. Must be called with state_mutex held.
 */
void mlx5fv_cmd_clean_migf_resources(struct mlx5_vf_migration_file *migf)
{
	struct mlx5_vhca_data_buffer *entry;
	int i;

	lockdep_assert_held(&migf->mvdev->state_mutex);
	WARN_ON(migf->mvdev->mdev_detach);

	for (i = 0; i < MAX_NUM_CHUNKS; i++) {
		if (migf->buf[i]) {
			mlx5vf_free_data_buffer(migf->buf[i]);
			migf->buf[i] = NULL;
		}

		if (migf->buf_header[i]) {
			mlx5vf_free_data_buffer(migf->buf_header[i]);
			migf->buf_header[i] = NULL;
		}
	}

	/* Drain the avail list through buf_list so one loop frees both */
	list_splice(&migf->avail_list, &migf->buf_list);

	while ((entry = list_first_entry_or_null(&migf->buf_list,
				struct mlx5_vhca_data_buffer, buf_elm))) {
		list_del(&entry->buf_elm);
		mlx5vf_free_data_buffer(entry);
	}

	mlx5vf_cmd_dealloc_pd(migf);
}
848
/*
 * Create the device page-tracker object covering the IOVA ranges in
 * @ranges (@nnodes entries). When more ranges are supplied than the device
 * supports they are first combined down to pg_track_max_num_range.
 * On success the object id is stored in tracker->id. Returns 0,
 * -EOPNOTSUPP when the combined address-space size is outside the device's
 * supported range, or another negative errno.
 */
static int mlx5vf_create_tracker(struct mlx5_core_dev *mdev,
				 struct mlx5vf_pci_core_device *mvdev,
				 struct rb_root_cached *ranges, u32 nnodes)
{
	int max_num_range =
		MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_max_num_range);
	struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
	int record_size = MLX5_ST_SZ_BYTES(page_track_range);
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	struct interval_tree_node *node = NULL;
	u64 total_ranges_len = 0;
	u32 num_ranges = nnodes;
	u8 log_addr_space_size;
	void *range_list_ptr;
	void *obj_context;
	void *cmd_hdr;
	int inlen;
	void *in;
	int err;
	int i;

	/* Merge ranges down to what the device can track */
	if (num_ranges > max_num_range) {
		vfio_combine_iova_ranges(ranges, nnodes, max_num_range);
		num_ranges = max_num_range;
	}

	inlen = MLX5_ST_SZ_BYTES(create_page_track_obj_in) +
		record_size * num_ranges;
	in = kzalloc(inlen, GFP_KERNEL);
	if (!in)
		return -ENOMEM;

	cmd_hdr = MLX5_ADDR_OF(create_page_track_obj_in, in,
			       general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode,
		 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type,
		 MLX5_OBJ_TYPE_PAGE_TRACK);
	obj_context = MLX5_ADDR_OF(create_page_track_obj_in, in, obj_context);
	MLX5_SET(page_track, obj_context, vhca_id, mvdev->vhca_id);
	MLX5_SET(page_track, obj_context, track_type, 1);
	MLX5_SET(page_track, obj_context, log_page_size,
		 ilog2(tracker->host_qp->tracked_page_size));
	MLX5_SET(page_track, obj_context, log_msg_size,
		 ilog2(tracker->host_qp->max_msg_size));
	MLX5_SET(page_track, obj_context, reporting_qpn, tracker->fw_qp->qpn);
	MLX5_SET(page_track, obj_context, num_ranges, num_ranges);

	/* Serialize the (combined) interval tree into the range records */
	range_list_ptr = MLX5_ADDR_OF(page_track, obj_context, track_range);
	node = interval_tree_iter_first(ranges, 0, ULONG_MAX);
	for (i = 0; i < num_ranges; i++) {
		void *addr_range_i_base = range_list_ptr + record_size * i;
		unsigned long length = node->last - node->start + 1;

		MLX5_SET64(page_track_range, addr_range_i_base, start_address,
			   node->start);
		MLX5_SET64(page_track_range, addr_range_i_base, length, length);
		total_ranges_len += length;
		node = interval_tree_iter_next(node, 0, ULONG_MAX);
	}

	WARN_ON(node);
	log_addr_space_size = ilog2(roundup_pow_of_two(total_ranges_len));
	if (log_addr_space_size <
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_min_addr_space)) ||
	    log_addr_space_size >
	    (MLX5_CAP_ADV_VIRTUALIZATION(mdev, pg_track_log_max_addr_space))) {
		err = -EOPNOTSUPP;
		goto out;
	}

	MLX5_SET(page_track, obj_context, log_addr_space_size,
		 log_addr_space_size);
	err = mlx5_cmd_exec(mdev, in, inlen, out, sizeof(out));
	if (err)
		goto out;

	tracker->id = MLX5_GET(general_obj_out_cmd_hdr, out, obj_id);
out:
	kfree(in);
	return err;
}
931
932static int mlx5vf_cmd_destroy_tracker(struct mlx5_core_dev *mdev,
933 u32 tracker_id)
934{
935 u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
936 u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
937
938 MLX5_SET(general_obj_in_cmd_hdr, in, opcode, MLX5_CMD_OP_DESTROY_GENERAL_OBJECT);
939 MLX5_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
940 MLX5_SET(general_obj_in_cmd_hdr, in, obj_id, tracker_id);
941
942 return mlx5_cmd_exec(dev: mdev, in, in_size: sizeof(in), out, out_size: sizeof(out));
943}
944
/*
 * Modify the page-tracker object @tracker_id: set its state to
 * @tracker_state over the range [@iova, @iova + @length). The 0x3
 * modify_field_select mask selects the fields being updated.
 * Returns 0 or a negative errno.
 */
static int mlx5vf_cmd_modify_tracker(struct mlx5_core_dev *mdev,
				     u32 tracker_id, unsigned long iova,
				     unsigned long length, u32 tracker_state)
{
	u32 in[MLX5_ST_SZ_DW(modify_page_track_obj_in)] = {};
	u32 out[MLX5_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
	void *obj_context;
	void *cmd_hdr;

	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_MODIFY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker_id);

	obj_context = MLX5_ADDR_OF(modify_page_track_obj_in, in, obj_context);
	MLX5_SET64(page_track, obj_context, modify_field_select, 0x3);
	MLX5_SET64(page_track, obj_context, range_start_address, iova);
	MLX5_SET64(page_track, obj_context, length, length);
	MLX5_SET(page_track, obj_context, state, tracker_state);

	return mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
}
967
/*
 * Query the page-tracker object and store its current state in
 * tracker->status. Returns 0 or a negative errno.
 *
 * NOTE(review): the input header is built through the
 * modify_page_track_obj_in layout; presumably the general_obj header is at
 * the same offset in both layouts — confirm against the mlx5_ifc structs.
 */
static int mlx5vf_cmd_query_tracker(struct mlx5_core_dev *mdev,
				    struct mlx5_vhca_page_tracker *tracker)
{
	u32 out[MLX5_ST_SZ_DW(query_page_track_obj_out)] = {};
	u32 in[MLX5_ST_SZ_DW(general_obj_in_cmd_hdr)] = {};
	void *obj_context;
	void *cmd_hdr;
	int err;

	cmd_hdr = MLX5_ADDR_OF(modify_page_track_obj_in, in, general_obj_in_cmd_hdr);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, opcode, MLX5_CMD_OP_QUERY_GENERAL_OBJECT);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_type, MLX5_OBJ_TYPE_PAGE_TRACK);
	MLX5_SET(general_obj_in_cmd_hdr, cmd_hdr, obj_id, tracker->id);

	err = mlx5_cmd_exec(mdev, in, sizeof(in), out, sizeof(out));
	if (err)
		return err;

	obj_context = MLX5_ADDR_OF(query_page_track_obj_out, out, obj_context);
	tracker->status = MLX5_GET(page_track, obj_context, state);
	return 0;
}
990
/*
 * Allocate a fragmented buffer for a CQ of @nent entries of @cqe_size
 * bytes (64 or 128) on the device's NUMA node and initialize its
 * frag-buffer control structure. Returns 0 or a negative errno.
 */
static int alloc_cq_frag_buf(struct mlx5_core_dev *mdev,
			     struct mlx5_vhca_cq_buf *buf, int nent,
			     int cqe_size)
{
	struct mlx5_frag_buf *frag_buf = &buf->frag_buf;
	/* Stride is 64B (2^6) per CQE, doubled for 128B CQEs */
	u8 log_wq_stride = 6 + (cqe_size == 128 ? 1 : 0);
	u8 log_wq_sz = ilog2(cqe_size);
	int err;

	err = mlx5_frag_buf_alloc_node(mdev, nent * cqe_size, frag_buf,
				       mdev->priv.numa_node);
	if (err)
		return err;

	mlx5_init_fbc(frag_buf->frags, log_wq_stride, log_wq_sz, &buf->fbc);
	buf->cqe_size = cqe_size;
	buf->nent = nent;
	return 0;
}
1010
1011static void init_cq_frag_buf(struct mlx5_vhca_cq_buf *buf)
1012{
1013 struct mlx5_cqe64 *cqe64;
1014 void *cqe;
1015 int i;
1016
1017 for (i = 0; i < buf->nent; i++) {
1018 cqe = mlx5_frag_buf_get_wqe(fbc: &buf->fbc, ix: i);
1019 cqe64 = buf->cqe_size == 64 ? cqe : cqe + 64;
1020 cqe64->op_own = MLX5_CQE_INVALID << 4;
1021 }
1022}
1023
/*
 * Tear down a tracker CQ: destroy the HW CQ object first, then free its
 * backing buffer and doorbell record.
 */
static void mlx5vf_destroy_cq(struct mlx5_core_dev *mdev,
 struct mlx5_vhca_cq *cq)
{
 mlx5_core_destroy_cq(dev: mdev, cq: &cq->mcq);
 mlx5_frag_buf_free(dev: mdev, buf: &cq->buf.frag_buf);
 mlx5_db_free(dev: mdev, db: &cq->db);
}
1031
1032static void mlx5vf_cq_event(struct mlx5_core_cq *mcq, enum mlx5_event type)
1033{
1034 if (type != MLX5_EVENT_TYPE_CQ_ERROR)
1035 return;
1036
1037 set_tracker_error(container_of(mcq, struct mlx5vf_pci_core_device,
1038 tracker.cq.mcq));
1039}
1040
/*
 * Async EQ notifier for the page tracker.
 *
 * Two classes of events are handled:
 *  - WQ catastrophic/access/invalid-request errors: if they target one of
 *    the tracker's QPs, latch a fatal tracker error.
 *  - Object-change events: if they refer to the tracker object itself,
 *    record that the tracker state must be re-queried.
 * Always returns NOTIFY_OK so other notifiers keep running.
 */
static int mlx5vf_event_notifier(struct notifier_block *nb, unsigned long type,
 void *data)
{
 struct mlx5_vhca_page_tracker *tracker =
 mlx5_nb_cof(nb, struct mlx5_vhca_page_tracker, nb);
 struct mlx5vf_pci_core_device *mvdev = container_of(
 tracker, struct mlx5vf_pci_core_device, tracker);
 struct mlx5_eqe_obj_change *object;
 struct mlx5_eqe *eqe = data;
 u8 event_type = (u8)type;
 u8 queue_type;
 u32 obj_id;
 int qp_num;

 switch (event_type) {
 case MLX5_EVENT_TYPE_WQ_CATAS_ERROR:
 case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR:
 case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR:
 queue_type = eqe->data.qp_srq.type;
 if (queue_type != MLX5_EVENT_QUEUE_TYPE_QP)
 break;
 /* QP number occupies the low 24 bits of qp_srq_n */
 qp_num = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff;
 if (qp_num != tracker->host_qp->qpn &&
 qp_num != tracker->fw_qp->qpn)
 break;
 set_tracker_error(mvdev);
 break;
 case MLX5_EVENT_TYPE_OBJECT_CHANGE:
 object = &eqe->data.obj_change;
 obj_id = be32_to_cpu(object->obj_id);
 if (obj_id == tracker->id)
 set_tracker_change_event(mvdev);
 break;
 default:
 break;
 }

 return NOTIFY_OK;
}
1080
/*
 * CQ completion callback: wake up the reader blocked in
 * mlx5vf_tracker_read_and_clear() waiting for new dirty-page reports.
 */
static void mlx5vf_cq_complete(struct mlx5_core_cq *mcq,
 struct mlx5_eqe *eqe)
{
 struct mlx5vf_pci_core_device *mvdev =
 container_of(mcq, struct mlx5vf_pci_core_device,
 tracker.cq.mcq);

 complete(&mvdev->tracker_comp);
}
1090
/*
 * Create the completion queue used to receive dirty-page reports.
 *
 * Allocates the doorbell record and fragmented CQE buffer, builds the
 * CREATE_CQ command (binding the CQ to a completion EQ and the tracker's
 * UAR), installs the completion/event callbacks and arms the CQ.
 * @ncqe is rounded up to a power of two.
 *
 * Returns 0 on success or a negative errno; all partially acquired
 * resources are released on failure.
 */
static int mlx5vf_create_cq(struct mlx5_core_dev *mdev,
 struct mlx5_vhca_page_tracker *tracker,
 size_t ncqe)
{
 /* Use 128B CQEs on 128B cache-line systems to avoid line sharing */
 int cqe_size = cache_line_size() == 128 ? 128 : 64;
 u32 out[MLX5_ST_SZ_DW(create_cq_out)];
 struct mlx5_vhca_cq *cq;
 int inlen, err, eqn;
 void *cqc, *in;
 __be64 *pas;
 int vector;

 cq = &tracker->cq;
 ncqe = roundup_pow_of_two(ncqe);
 err = mlx5_db_alloc_node(dev: mdev, db: &cq->db, node: mdev->priv.numa_node);
 if (err)
 return err;

 cq->ncqe = ncqe;
 cq->mcq.set_ci_db = cq->db.db;
 cq->mcq.arm_db = cq->db.db + 1;
 cq->mcq.cqe_sz = cqe_size;
 err = alloc_cq_frag_buf(mdev, buf: &cq->buf, nent: ncqe, cqe_size);
 if (err)
 goto err_db_free;

 init_cq_frag_buf(buf: &cq->buf);
 /* Command length: base struct plus one PAS entry per buffer page */
 inlen = MLX5_ST_SZ_BYTES(create_cq_in) +
 MLX5_FLD_SZ_BYTES(create_cq_in, pas[0]) *
 cq->buf.frag_buf.npages;
 in = kvzalloc(size: inlen, GFP_KERNEL);
 if (!in) {
 err = -ENOMEM;
 goto err_buff;
 }

 /* Spread CQs over completion vectors based on the current CPU */
 vector = raw_smp_processor_id() % mlx5_comp_vectors_max(dev: mdev);
 err = mlx5_comp_eqn_get(dev: mdev, vecidx: vector, eqn: &eqn);
 if (err)
 goto err_vec;

 cqc = MLX5_ADDR_OF(create_cq_in, in, cq_context);
 MLX5_SET(cqc, cqc, log_cq_size, ilog2(ncqe));
 MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
 MLX5_SET(cqc, cqc, uar_page, tracker->uar->index);
 MLX5_SET(cqc, cqc, log_page_size, cq->buf.frag_buf.page_shift -
 MLX5_ADAPTER_PAGE_SHIFT);
 MLX5_SET64(cqc, cqc, dbr_addr, cq->db.dma);
 pas = (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas);
 mlx5_fill_page_frag_array(frag_buf: &cq->buf.frag_buf, pas);
 cq->mcq.comp = mlx5vf_cq_complete;
 cq->mcq.event = mlx5vf_cq_event;
 err = mlx5_core_create_cq(dev: mdev, cq: &cq->mcq, in, inlen, out, outlen: sizeof(out));
 if (err)
 goto err_vec;

 /* Request a completion notification for the first CQE */
 mlx5_cq_arm(cq: &cq->mcq, cmd: MLX5_CQ_DB_REQ_NOT, uar_page: tracker->uar->map,
 cons_index: cq->mcq.cons_index);
 kvfree(addr: in);
 return 0;

err_vec:
 kvfree(addr: in);
err_buff:
 mlx5_frag_buf_free(dev: mdev, buf: &cq->buf.frag_buf);
err_db_free:
 mlx5_db_free(dev: mdev, db: &cq->db);
 return err;
}
1160
/*
 * Create an RC QP for the page tracker.
 *
 * With @max_recv_wr > 0 (host-side QP) a receive ring of that many WQEs
 * (rounded up to a power of two) is allocated; with @max_recv_wr == 0
 * (firmware-side QP) a zero-length RQ is used and no buffer is needed.
 * The QP has no send queue in either case.
 *
 * Returns the new QP on success or an ERR_PTR() on failure.
 */
static struct mlx5_vhca_qp *
mlx5vf_create_rc_qp(struct mlx5_core_dev *mdev,
 struct mlx5_vhca_page_tracker *tracker, u32 max_recv_wr)
{
 u32 out[MLX5_ST_SZ_DW(create_qp_out)] = {};
 struct mlx5_vhca_qp *qp;
 u8 log_rq_stride;
 u8 log_rq_sz;
 void *qpc;
 int inlen;
 void *in;
 int err;

 qp = kzalloc(size: sizeof(*qp), GFP_KERNEL_ACCOUNT);
 if (!qp)
 return ERR_PTR(error: -ENOMEM);

 err = mlx5_db_alloc_node(dev: mdev, db: &qp->db, node: mdev->priv.numa_node);
 if (err)
 goto err_free;

 if (max_recv_wr) {
 qp->rq.wqe_cnt = roundup_pow_of_two(max_recv_wr);
 log_rq_stride = ilog2(MLX5_SEND_WQE_DS);
 log_rq_sz = ilog2(qp->rq.wqe_cnt);
 err = mlx5_frag_buf_alloc_node(dev: mdev,
 size: wq_get_byte_sz(log_sz: log_rq_sz, log_stride: log_rq_stride),
 buf: &qp->buf, node: mdev->priv.numa_node);
 if (err)
 goto err_db_free;
 mlx5_init_fbc(frags: qp->buf.frags, log_stride: log_rq_stride, log_sz: log_rq_sz, fbc: &qp->rq.fbc);
 }

 qp->rq.db = &qp->db.db[MLX5_RCV_DBR];
 /* qp->buf.npages is 0 when no RQ buffer was allocated (qp is zeroed) */
 inlen = MLX5_ST_SZ_BYTES(create_qp_in) +
 MLX5_FLD_SZ_BYTES(create_qp_in, pas[0]) *
 qp->buf.npages;
 in = kvzalloc(size: inlen, GFP_KERNEL);
 if (!in) {
 err = -ENOMEM;
 goto err_in;
 }

 qpc = MLX5_ADDR_OF(create_qp_in, in, qpc);
 MLX5_SET(qpc, qpc, st, MLX5_QP_ST_RC);
 MLX5_SET(qpc, qpc, pm_state, MLX5_QP_PM_MIGRATED);
 MLX5_SET(qpc, qpc, pd, tracker->pdn);
 MLX5_SET(qpc, qpc, uar_page, tracker->uar->index);
 MLX5_SET(qpc, qpc, log_page_size,
 qp->buf.page_shift - MLX5_ADAPTER_PAGE_SHIFT);
 MLX5_SET(qpc, qpc, ts_format, mlx5_get_qp_default_ts(mdev));
 if (MLX5_CAP_GEN(mdev, cqe_version) == 1)
 MLX5_SET(qpc, qpc, user_index, 0xFFFFFF);
 MLX5_SET(qpc, qpc, no_sq, 1);
 if (max_recv_wr) {
 MLX5_SET(qpc, qpc, cqn_rcv, tracker->cq.mcq.cqn);
 MLX5_SET(qpc, qpc, log_rq_stride, log_rq_stride - 4);
 MLX5_SET(qpc, qpc, log_rq_size, log_rq_sz);
 MLX5_SET(qpc, qpc, rq_type, MLX5_NON_ZERO_RQ);
 MLX5_SET64(qpc, qpc, dbr_addr, qp->db.dma);
 mlx5_fill_page_frag_array(frag_buf: &qp->buf,
 pas: (__be64 *)MLX5_ADDR_OF(create_qp_in,
 in, pas));
 } else {
 MLX5_SET(qpc, qpc, rq_type, MLX5_ZERO_LEN_RQ);
 }

 MLX5_SET(create_qp_in, in, opcode, MLX5_CMD_OP_CREATE_QP);
 err = mlx5_cmd_exec(dev: mdev, in, in_size: inlen, out, out_size: sizeof(out));
 kvfree(addr: in);
 if (err)
 goto err_in;

 qp->qpn = MLX5_GET(create_qp_out, out, qpn);
 return qp;

err_in:
 /* Only free the RQ buffer if one was actually allocated above */
 if (max_recv_wr)
 mlx5_frag_buf_free(dev: mdev, buf: &qp->buf);
err_db_free:
 mlx5_db_free(dev: mdev, db: &qp->db);
err_free:
 kfree(objp: qp);
 return ERR_PTR(error: err);
}
1246
/*
 * Post one receive WQE pointing at the current recv_buf offset, then ring
 * the RQ doorbell. The caller must ensure there is a free slot (the
 * WARN_ON guards against ring overflow).
 */
static void mlx5vf_post_recv(struct mlx5_vhca_qp *qp)
{
 struct mlx5_wqe_data_seg *data;
 unsigned int ix;

 WARN_ON(qp->rq.pc - qp->rq.cc >= qp->rq.wqe_cnt);
 ix = qp->rq.pc & (qp->rq.wqe_cnt - 1);
 data = mlx5_frag_buf_get_wqe(fbc: &qp->rq.fbc, ix);
 data->byte_count = cpu_to_be32(qp->max_msg_size);
 data->lkey = cpu_to_be32(qp->recv_buf.mkey);
 data->addr = cpu_to_be64(qp->recv_buf.next_rq_offset);
 qp->rq.pc++;
 /* Make sure that descriptors are written before doorbell record. */
 dma_wmb();
 *qp->rq.db = cpu_to_be32(qp->rq.pc & 0xffff);
}
1263
1264static int mlx5vf_activate_qp(struct mlx5_core_dev *mdev,
1265 struct mlx5_vhca_qp *qp, u32 remote_qpn,
1266 bool host_qp)
1267{
1268 u32 init_in[MLX5_ST_SZ_DW(rst2init_qp_in)] = {};
1269 u32 rtr_in[MLX5_ST_SZ_DW(init2rtr_qp_in)] = {};
1270 u32 rts_in[MLX5_ST_SZ_DW(rtr2rts_qp_in)] = {};
1271 void *qpc;
1272 int ret;
1273
1274 /* Init */
1275 qpc = MLX5_ADDR_OF(rst2init_qp_in, init_in, qpc);
1276 MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1277 MLX5_SET(qpc, qpc, pm_state, MLX5_QPC_PM_STATE_MIGRATED);
1278 MLX5_SET(qpc, qpc, rre, 1);
1279 MLX5_SET(qpc, qpc, rwe, 1);
1280 MLX5_SET(rst2init_qp_in, init_in, opcode, MLX5_CMD_OP_RST2INIT_QP);
1281 MLX5_SET(rst2init_qp_in, init_in, qpn, qp->qpn);
1282 ret = mlx5_cmd_exec_in(mdev, rst2init_qp, init_in);
1283 if (ret)
1284 return ret;
1285
1286 if (host_qp) {
1287 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
1288 int i;
1289
1290 for (i = 0; i < qp->rq.wqe_cnt; i++) {
1291 mlx5vf_post_recv(qp);
1292 recv_buf->next_rq_offset += qp->max_msg_size;
1293 }
1294 }
1295
1296 /* RTR */
1297 qpc = MLX5_ADDR_OF(init2rtr_qp_in, rtr_in, qpc);
1298 MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1299 MLX5_SET(qpc, qpc, mtu, IB_MTU_4096);
1300 MLX5_SET(qpc, qpc, log_msg_max, MLX5_CAP_GEN(mdev, log_max_msg));
1301 MLX5_SET(qpc, qpc, remote_qpn, remote_qpn);
1302 MLX5_SET(qpc, qpc, primary_address_path.vhca_port_num, 1);
1303 MLX5_SET(qpc, qpc, primary_address_path.fl, 1);
1304 MLX5_SET(qpc, qpc, min_rnr_nak, 1);
1305 MLX5_SET(init2rtr_qp_in, rtr_in, opcode, MLX5_CMD_OP_INIT2RTR_QP);
1306 MLX5_SET(init2rtr_qp_in, rtr_in, qpn, qp->qpn);
1307 ret = mlx5_cmd_exec_in(mdev, init2rtr_qp, rtr_in);
1308 if (ret || host_qp)
1309 return ret;
1310
1311 /* RTS */
1312 qpc = MLX5_ADDR_OF(rtr2rts_qp_in, rts_in, qpc);
1313 MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1314 MLX5_SET(qpc, qpc, retry_count, 7);
1315 MLX5_SET(qpc, qpc, rnr_retry, 7); /* Infinite retry if RNR NACK */
1316 MLX5_SET(qpc, qpc, primary_address_path.ack_timeout, 0x8); /* ~1ms */
1317 MLX5_SET(rtr2rts_qp_in, rts_in, opcode, MLX5_CMD_OP_RTR2RTS_QP);
1318 MLX5_SET(rtr2rts_qp_in, rts_in, qpn, qp->qpn);
1319
1320 return mlx5_cmd_exec_in(mdev, rtr2rts_qp, rts_in);
1321}
1322
/*
 * Destroy a tracker QP: issue DESTROY_QP to firmware, then free the
 * (possibly zero-length) RQ buffer, the doorbell record and the QP
 * structure itself.
 */
static void mlx5vf_destroy_qp(struct mlx5_core_dev *mdev,
 struct mlx5_vhca_qp *qp)
{
 u32 in[MLX5_ST_SZ_DW(destroy_qp_in)] = {};

 MLX5_SET(destroy_qp_in, in, opcode, MLX5_CMD_OP_DESTROY_QP);
 MLX5_SET(destroy_qp_in, in, qpn, qp->qpn);
 mlx5_cmd_exec_in(mdev, destroy_qp, in);

 mlx5_frag_buf_free(dev: mdev, buf: &qp->buf);
 mlx5_db_free(dev: mdev, db: &qp->db);
 kfree(objp: qp);
}
1336
1337static void free_recv_pages(struct mlx5_vhca_recv_buf *recv_buf)
1338{
1339 int i;
1340
1341 /* Undo alloc_pages_bulk_array() */
1342 for (i = 0; i < recv_buf->npages; i++)
1343 __free_page(recv_buf->page_list[i]);
1344
1345 kvfree(addr: recv_buf->page_list);
1346}
1347
1348static int alloc_recv_pages(struct mlx5_vhca_recv_buf *recv_buf,
1349 unsigned int npages)
1350{
1351 unsigned int filled = 0, done = 0;
1352 int i;
1353
1354 recv_buf->page_list = kvcalloc(n: npages, size: sizeof(*recv_buf->page_list),
1355 GFP_KERNEL_ACCOUNT);
1356 if (!recv_buf->page_list)
1357 return -ENOMEM;
1358
1359 for (;;) {
1360 filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT,
1361 nr_pages: npages - done,
1362 page_array: recv_buf->page_list + done);
1363 if (!filled)
1364 goto err;
1365
1366 done += filled;
1367 if (done == npages)
1368 break;
1369 }
1370
1371 recv_buf->npages = npages;
1372 return 0;
1373
1374err:
1375 for (i = 0; i < npages; i++) {
1376 if (recv_buf->page_list[i])
1377 __free_page(recv_buf->page_list[i]);
1378 }
1379
1380 kvfree(addr: recv_buf->page_list);
1381 return -ENOMEM;
1382}
1383
1384static int register_dma_recv_pages(struct mlx5_core_dev *mdev,
1385 struct mlx5_vhca_recv_buf *recv_buf)
1386{
1387 int i, j;
1388
1389 recv_buf->dma_addrs = kvcalloc(n: recv_buf->npages,
1390 size: sizeof(*recv_buf->dma_addrs),
1391 GFP_KERNEL_ACCOUNT);
1392 if (!recv_buf->dma_addrs)
1393 return -ENOMEM;
1394
1395 for (i = 0; i < recv_buf->npages; i++) {
1396 recv_buf->dma_addrs[i] = dma_map_page(mdev->device,
1397 recv_buf->page_list[i],
1398 0, PAGE_SIZE,
1399 DMA_FROM_DEVICE);
1400 if (dma_mapping_error(dev: mdev->device, dma_addr: recv_buf->dma_addrs[i]))
1401 goto error;
1402 }
1403 return 0;
1404
1405error:
1406 for (j = 0; j < i; j++)
1407 dma_unmap_single(mdev->device, recv_buf->dma_addrs[j],
1408 PAGE_SIZE, DMA_FROM_DEVICE);
1409
1410 kvfree(addr: recv_buf->dma_addrs);
1411 return -ENOMEM;
1412}
1413
1414static void unregister_dma_recv_pages(struct mlx5_core_dev *mdev,
1415 struct mlx5_vhca_recv_buf *recv_buf)
1416{
1417 int i;
1418
1419 for (i = 0; i < recv_buf->npages; i++)
1420 dma_unmap_single(mdev->device, recv_buf->dma_addrs[i],
1421 PAGE_SIZE, DMA_FROM_DEVICE);
1422
1423 kvfree(addr: recv_buf->dma_addrs);
1424}
1425
/*
 * Release the host QP's receive resources in reverse order of creation:
 * destroy the mkey, unmap the DMA addresses, then free the pages.
 */
static void mlx5vf_free_qp_recv_resources(struct mlx5_core_dev *mdev,
 struct mlx5_vhca_qp *qp)
{
 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;

 mlx5_core_destroy_mkey(dev: mdev, mkey: recv_buf->mkey);
 unregister_dma_recv_pages(mdev, recv_buf);
 free_recv_pages(recv_buf: &qp->recv_buf);
}
1435
/*
 * Set up the host QP's receive resources for an @rq_size-byte ring:
 * allocate the pages, DMA-map them and create an mkey covering them.
 *
 * Returns 0 on success or a negative errno; partial allocations are
 * rolled back on failure.
 */
static int mlx5vf_alloc_qp_recv_resources(struct mlx5_core_dev *mdev,
 struct mlx5_vhca_qp *qp, u32 pdn,
 u64 rq_size)
{
 unsigned int npages = DIV_ROUND_UP_ULL(rq_size, PAGE_SIZE);
 struct mlx5_vhca_recv_buf *recv_buf = &qp->recv_buf;
 int err;

 err = alloc_recv_pages(recv_buf, npages);
 if (err < 0)
 return err;

 err = register_dma_recv_pages(mdev, recv_buf);
 if (err)
 goto end;

 err = _create_mkey(mdev, pdn, NULL, recv_buf, mkey: &recv_buf->mkey);
 if (err)
 goto err_create_mkey;

 return 0;

err_create_mkey:
 unregister_dma_recv_pages(mdev, recv_buf);
end:
 free_recv_pages(recv_buf);
 return err;
}
1464
/*
 * Tear down all page-tracker resources in reverse order of creation:
 * EQ notifier, tracker object, firmware QP, host QP receive resources,
 * host QP, CQ, PD and finally the UAR reference. Clears log_active.
 *
 * Must be called with state_mutex held; a no-op if tracking is inactive.
 */
static void
_mlx5vf_free_page_tracker_resources(struct mlx5vf_pci_core_device *mvdev)
{
 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
 struct mlx5_core_dev *mdev = mvdev->mdev;

 lockdep_assert_held(&mvdev->state_mutex);

 if (!mvdev->log_active)
 return;

 /* Callers must not reach here after the mdev has been detached */
 WARN_ON(mvdev->mdev_detach);

 mlx5_eq_notifier_unregister(dev: mdev, nb: &tracker->nb);
 mlx5vf_cmd_destroy_tracker(mdev, tracker_id: tracker->id);
 mlx5vf_destroy_qp(mdev, qp: tracker->fw_qp);
 mlx5vf_free_qp_recv_resources(mdev, qp: tracker->host_qp);
 mlx5vf_destroy_qp(mdev, qp: tracker->host_qp);
 mlx5vf_destroy_cq(mdev, cq: &tracker->cq);
 mlx5_core_dealloc_pd(dev: mdev, pdn: tracker->pdn);
 mlx5_put_uars_page(mdev, up: tracker->uar);
 mvdev->log_active = false;
}
1488
1489int mlx5vf_stop_page_tracker(struct vfio_device *vdev)
1490{
1491 struct mlx5vf_pci_core_device *mvdev = container_of(
1492 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
1493
1494 mutex_lock(&mvdev->state_mutex);
1495 if (!mvdev->log_active)
1496 goto end;
1497
1498 _mlx5vf_free_page_tracker_resources(mvdev);
1499 mvdev->log_active = false;
1500end:
1501 mlx5vf_state_mutex_unlock(mvdev);
1502 return 0;
1503}
1504
/*
 * VFIO log_start callback: build the full dirty-page-tracking pipeline
 * (UAR, PD, CQ, host/firmware QP pair, receive resources, FW tracker
 * object, EQ notifier) for the given IOVA @ranges.
 *
 * *page_size is used as the requested tracked-page size and updated on
 * return to the size actually used, clamped to the device's supported
 * min/max. Returns 0 on success or a negative errno; on failure every
 * partially created resource is released in reverse order.
 */
int mlx5vf_start_page_tracker(struct vfio_device *vdev,
 struct rb_root_cached *ranges, u32 nnodes,
 u64 *page_size)
{
 struct mlx5vf_pci_core_device *mvdev = container_of(
 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
 u8 log_tracked_page = ilog2(*page_size);
 struct mlx5_vhca_qp *host_qp;
 struct mlx5_vhca_qp *fw_qp;
 struct mlx5_core_dev *mdev;
 u32 max_msg_size = PAGE_SIZE;
 u64 rq_size = SZ_2M;
 u32 max_recv_wr;
 int err;

 mutex_lock(&mvdev->state_mutex);
 if (mvdev->mdev_detach) {
 err = -ENOTCONN;
 goto end;
 }

 /* Only one active tracking session is allowed at a time */
 if (mvdev->log_active) {
 err = -EINVAL;
 goto end;
 }

 mdev = mvdev->mdev;
 memset(tracker, 0, sizeof(*tracker));
 tracker->uar = mlx5_get_uars_page(mdev);
 if (IS_ERR(ptr: tracker->uar)) {
 err = PTR_ERR(ptr: tracker->uar);
 goto end;
 }

 err = mlx5_core_alloc_pd(dev: mdev, pdn: &tracker->pdn);
 if (err)
 goto err_uar;

 /* One receive WR (and CQE) per max_msg_size chunk of the RQ */
 max_recv_wr = DIV_ROUND_UP_ULL(rq_size, max_msg_size);
 err = mlx5vf_create_cq(mdev, tracker, ncqe: max_recv_wr);
 if (err)
 goto err_dealloc_pd;

 host_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr);
 if (IS_ERR(ptr: host_qp)) {
 err = PTR_ERR(ptr: host_qp);
 goto err_cq;
 }

 host_qp->max_msg_size = max_msg_size;
 /* Clamp the tracked page size to the device-supported range */
 if (log_tracked_page < MLX5_CAP_ADV_VIRTUALIZATION(mdev,
 pg_track_log_min_page_size)) {
 log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
 pg_track_log_min_page_size);
 } else if (log_tracked_page > MLX5_CAP_ADV_VIRTUALIZATION(mdev,
 pg_track_log_max_page_size)) {
 log_tracked_page = MLX5_CAP_ADV_VIRTUALIZATION(mdev,
 pg_track_log_max_page_size);
 }

 host_qp->tracked_page_size = (1ULL << log_tracked_page);
 err = mlx5vf_alloc_qp_recv_resources(mdev, qp: host_qp, pdn: tracker->pdn,
 rq_size);
 if (err)
 goto err_host_qp;

 /* The firmware-side peer QP needs no receive ring */
 fw_qp = mlx5vf_create_rc_qp(mdev, tracker, max_recv_wr: 0);
 if (IS_ERR(ptr: fw_qp)) {
 err = PTR_ERR(ptr: fw_qp);
 goto err_recv_resources;
 }

 /* Connect the two QPs to each other */
 err = mlx5vf_activate_qp(mdev, qp: host_qp, remote_qpn: fw_qp->qpn, host_qp: true);
 if (err)
 goto err_activate;

 err = mlx5vf_activate_qp(mdev, qp: fw_qp, remote_qpn: host_qp->qpn, host_qp: false);
 if (err)
 goto err_activate;

 tracker->host_qp = host_qp;
 tracker->fw_qp = fw_qp;
 err = mlx5vf_create_tracker(mdev, mvdev, ranges, nnodes);
 if (err)
 goto err_activate;

 MLX5_NB_INIT(&tracker->nb, mlx5vf_event_notifier, NOTIFY_ANY);
 mlx5_eq_notifier_register(dev: mdev, nb: &tracker->nb);
 *page_size = host_qp->tracked_page_size;
 mvdev->log_active = true;
 mlx5vf_state_mutex_unlock(mvdev);
 return 0;

err_activate:
 mlx5vf_destroy_qp(mdev, qp: fw_qp);
err_recv_resources:
 mlx5vf_free_qp_recv_resources(mdev, qp: host_qp);
err_host_qp:
 mlx5vf_destroy_qp(mdev, qp: host_qp);
err_cq:
 mlx5vf_destroy_cq(mdev, cq: &tracker->cq);
err_dealloc_pd:
 mlx5_core_dealloc_pd(dev: mdev, pdn: tracker->pdn);
err_uar:
 mlx5_put_uars_page(mdev, up: tracker->uar);
end:
 mlx5vf_state_mutex_unlock(mvdev);
 return err;
}
1615
/*
 * Parse one received dirty-page report message of @size bytes located in
 * receive page @index and mark each reported address in the @dirty IOVA
 * bitmap. Malformed sizes/indices trigger a WARN and are dropped.
 */
static void
set_report_output(u32 size, int index, struct mlx5_vhca_qp *qp,
 struct iova_bitmap *dirty)
{
 u32 entry_size = MLX5_ST_SZ_BYTES(page_track_report_entry);
 u32 nent = size / entry_size;
 struct page *page;
 u64 addr;
 u64 *buf;
 int i;

 if (WARN_ON(index >= qp->recv_buf.npages ||
 (nent > qp->max_msg_size / entry_size)))
 return;

 page = qp->recv_buf.page_list[index];
 buf = kmap_local_page(page);
 for (i = 0; i < nent; i++) {
 /* Each entry carries a 64-bit dirty IOVA split into two fields */
 addr = MLX5_GET(page_track_report_entry, buf + i,
 dirty_address_low);
 addr |= (u64)MLX5_GET(page_track_report_entry, buf + i,
 dirty_address_high) << 32;
 iova_bitmap_set(bitmap: dirty, iova: addr, length: qp->tracked_page_size);
 }
 kunmap_local(buf);
}
1642
/*
 * Handle one receive completion: extract the tracker status from the
 * immediate data, feed the report payload (if any) into the dirty
 * bitmap, and re-post the consumed receive WQE at the same ring slot.
 */
static void
mlx5vf_rq_cqe(struct mlx5_vhca_qp *qp, struct mlx5_cqe64 *cqe,
 struct iova_bitmap *dirty, int *tracker_status)
{
 u32 size;
 int ix;

 qp->rq.cc++;
 /* Tracker status is carried in the top 4 bits of the immediate data */
 *tracker_status = be32_to_cpu(cqe->immediate) >> 28;
 size = be32_to_cpu(cqe->byte_cnt);
 ix = be16_to_cpu(cqe->wqe_counter) & (qp->rq.wqe_cnt - 1);

 /* zero length CQE, no data */
 WARN_ON(!size && *tracker_status == MLX5_PAGE_TRACK_STATE_REPORTING);
 if (size)
 set_report_output(size, index: ix, qp, dirty);

 /* Recycle the same buffer slot for the next receive */
 qp->recv_buf.next_rq_offset = ix * qp->max_msg_size;
 mlx5vf_post_recv(qp);
}
1663
/* Return a pointer to CQE slot @n in the CQ's fragmented buffer. */
static void *get_cqe(struct mlx5_vhca_cq *cq, int n)
{
 return mlx5_frag_buf_get_wqe(fbc: &cq->buf.fbc, ix: n);
}
1668
/*
 * Return the 64-byte CQE at consumer index @n if it is owned by software
 * (valid opcode and ownership bit matching the current ring pass), or
 * NULL if the CQ has no new entry at that index.
 */
static struct mlx5_cqe64 *get_sw_cqe(struct mlx5_vhca_cq *cq, int n)
{
 void *cqe = get_cqe(cq, n: n & (cq->ncqe - 1));
 struct mlx5_cqe64 *cqe64;

 /* With 128B CQEs the 64B HW CQE is in the upper half of the slot */
 cqe64 = (cq->mcq.cqe_sz == 64) ? cqe : cqe + 64;

 /* Ownership flips each time the consumer index wraps the ring */
 if (likely(get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) &&
 !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ncqe)))) {
 return cqe64;
 } else {
 return NULL;
 }
}
1683
/*
 * Poll a single CQE from the tracker CQ.
 *
 * Returns CQ_EMPTY when no new CQE is available, CQ_OK after consuming a
 * receive-with-immediate completion (dirty data fed into @dirty and
 * @tracker_status updated), or CQ_POLL_ERR on any other opcode.
 */
static int
mlx5vf_cq_poll_one(struct mlx5_vhca_cq *cq, struct mlx5_vhca_qp *qp,
 struct iova_bitmap *dirty, int *tracker_status)
{
 struct mlx5_cqe64 *cqe;
 u8 opcode;

 cqe = get_sw_cqe(cq, n: cq->mcq.cons_index);
 if (!cqe)
 return CQ_EMPTY;

 ++cq->mcq.cons_index;
 /*
  * Make sure we read CQ entry contents after we've checked the
  * ownership bit.
  */
 rmb();
 opcode = get_cqe_opcode(cqe);
 switch (opcode) {
 case MLX5_CQE_RESP_SEND_IMM:
 mlx5vf_rq_cqe(qp, cqe, dirty, tracker_status);
 return CQ_OK;
 default:
 return CQ_POLL_ERR;
 }
}
1710
/*
 * VFIO log_read_and_clear callback: ask the firmware tracker to report
 * dirty pages in [iova, iova + length) and consume the reports into
 * @dirty, blocking until the tracker leaves the REPORTING state.
 *
 * Returns 0 on success; -EINVAL if tracking is inactive, -ENOTCONN if
 * the device is detached, -EIO on tracker/CQ errors, or another negative
 * errno from the firmware commands.
 */
int mlx5vf_tracker_read_and_clear(struct vfio_device *vdev, unsigned long iova,
 unsigned long length,
 struct iova_bitmap *dirty)
{
 struct mlx5vf_pci_core_device *mvdev = container_of(
 vdev, struct mlx5vf_pci_core_device, core_device.vdev);
 struct mlx5_vhca_page_tracker *tracker = &mvdev->tracker;
 struct mlx5_vhca_cq *cq = &tracker->cq;
 struct mlx5_core_dev *mdev;
 int poll_err, err;

 mutex_lock(&mvdev->state_mutex);
 if (!mvdev->log_active) {
 err = -EINVAL;
 goto end;
 }

 if (mvdev->mdev_detach) {
 err = -ENOTCONN;
 goto end;
 }

 if (tracker->is_err) {
 err = -EIO;
 goto end;
 }

 mdev = mvdev->mdev;
 /* Kick the tracker into reporting mode for the requested range */
 err = mlx5vf_cmd_modify_tracker(mdev, tracker_id: tracker->id, iova, length,
 tracker_state: MLX5_PAGE_TRACK_STATE_REPORTING);
 if (err)
 goto end;

 tracker->status = MLX5_PAGE_TRACK_STATE_REPORTING;
 while (tracker->status == MLX5_PAGE_TRACK_STATE_REPORTING &&
 !tracker->is_err) {
 poll_err = mlx5vf_cq_poll_one(cq, qp: tracker->host_qp, dirty,
 tracker_status: &tracker->status);
 if (poll_err == CQ_EMPTY) {
 /* Arm the CQ, then re-poll to close the race with a
  * CQE that arrived between poll and arm.
  */
 mlx5_cq_arm(cq: &cq->mcq, cmd: MLX5_CQ_DB_REQ_NOT, uar_page: tracker->uar->map,
 cons_index: cq->mcq.cons_index);
 poll_err = mlx5vf_cq_poll_one(cq, qp: tracker->host_qp,
 dirty, tracker_status: &tracker->status);
 if (poll_err == CQ_EMPTY) {
 /* Sleep until the CQ completion or an object-change
  * event wakes us up.
  */
 wait_for_completion(&mvdev->tracker_comp);
 if (tracker->object_changed) {
 tracker->object_changed = false;
 err = mlx5vf_cmd_query_tracker(mdev, tracker);
 if (err)
 goto end;
 }
 continue;
 }
 }
 if (poll_err == CQ_POLL_ERR) {
 err = -EIO;
 goto end;
 }
 mlx5_cq_set_ci(cq: &cq->mcq);
 }

 if (tracker->status == MLX5_PAGE_TRACK_STATE_ERROR)
 tracker->is_err = true;

 if (tracker->is_err)
 err = -EIO;
end:
 mlx5vf_state_mutex_unlock(mvdev);
 return err;
}
1781

/* source: linux/drivers/vfio/pci/mlx5/cmd.c */