1 | // SPDX-License-Identifier: GPL-2.0 |
2 | /* |
3 | * Shared Memory Communications over RDMA (SMC-R) and RoCE |
4 | * |
5 | * Work Requests exploiting Infiniband API |
6 | * |
7 | * Work requests (WR) of type ib_post_send or ib_post_recv respectively |
8 | * are submitted to either RC SQ or RC RQ respectively |
9 | * (reliably connected send/receive queue) |
10 | * and become work queue entries (WQEs). |
11 | * While an SQ WR/WQE is pending, we track it until transmission completion. |
12 | * Through a send or receive completion queue (CQ) respectively, |
13 | * we get completion queue entries (CQEs) [aka work completions (WCs)]. |
14 | * Since the CQ callback is called from IRQ context, we split work by using |
15 | * bottom halves implemented by tasklets. |
16 | * |
17 | * SMC uses this to exchange LLC (link layer control) |
18 | * and CDC (connection data control) messages. |
19 | * |
20 | * Copyright IBM Corp. 2016 |
21 | * |
22 | * Author(s): Steffen Maier <maier@linux.vnet.ibm.com> |
23 | */ |
24 | |
25 | #include <linux/atomic.h> |
26 | #include <linux/hashtable.h> |
27 | #include <linux/wait.h> |
28 | #include <rdma/ib_verbs.h> |
29 | #include <asm/div64.h> |
30 | |
31 | #include "smc.h" |
32 | #include "smc_wr.h" |
33 | |
34 | #define SMC_WR_MAX_POLL_CQE 10 /* max. # of compl. queue elements in 1 poll */ |
35 | |
36 | #define SMC_WR_RX_HASH_BITS 4 |
37 | static DEFINE_HASHTABLE(smc_wr_rx_hash, SMC_WR_RX_HASH_BITS); |
38 | static DEFINE_SPINLOCK(smc_wr_rx_hash_lock); |
39 | |
40 | struct smc_wr_tx_pend { /* control data for a pending send request */ |
41 | u64 wr_id; /* work request id sent */ |
42 | smc_wr_tx_handler handler; |
43 | enum ib_wc_status wc_status; /* CQE status */ |
44 | struct smc_link *link; |
45 | u32 idx; |
46 | struct smc_wr_tx_pend_priv priv; |
47 | u8 compl_requested; |
48 | }; |
49 | |
50 | /******************************** send queue *********************************/ |
51 | |
52 | /*------------------------------- completion --------------------------------*/ |
53 | |
54 | /* returns true if at least one tx work request is pending on the given link */ |
55 | static inline bool smc_wr_is_tx_pend(struct smc_link *link) |
56 | { |
57 | return !bitmap_empty(src: link->wr_tx_mask, nbits: link->wr_tx_cnt); |
58 | } |
59 | |
60 | /* wait till all pending tx work requests on the given link are completed */ |
61 | void smc_wr_tx_wait_no_pending_sends(struct smc_link *link) |
62 | { |
63 | wait_event(link->wr_tx_wait, !smc_wr_is_tx_pend(link)); |
64 | } |
65 | |
66 | static inline int smc_wr_tx_find_pending_index(struct smc_link *link, u64 wr_id) |
67 | { |
68 | u32 i; |
69 | |
70 | for (i = 0; i < link->wr_tx_cnt; i++) { |
71 | if (link->wr_tx_pends[i].wr_id == wr_id) |
72 | return i; |
73 | } |
74 | return link->wr_tx_cnt; |
75 | } |
76 | |
77 | static inline void smc_wr_tx_process_cqe(struct ib_wc *wc) |
78 | { |
79 | struct smc_wr_tx_pend pnd_snd; |
80 | struct smc_link *link; |
81 | u32 pnd_snd_idx; |
82 | |
83 | link = wc->qp->qp_context; |
84 | |
85 | if (wc->opcode == IB_WC_REG_MR) { |
86 | if (wc->status) |
87 | link->wr_reg_state = FAILED; |
88 | else |
89 | link->wr_reg_state = CONFIRMED; |
90 | smc_wr_wakeup_reg_wait(lnk: link); |
91 | return; |
92 | } |
93 | |
94 | pnd_snd_idx = smc_wr_tx_find_pending_index(link, wr_id: wc->wr_id); |
95 | if (pnd_snd_idx == link->wr_tx_cnt) { |
96 | if (link->lgr->smc_version != SMC_V2 || |
97 | link->wr_tx_v2_pend->wr_id != wc->wr_id) |
98 | return; |
99 | link->wr_tx_v2_pend->wc_status = wc->status; |
100 | memcpy(&pnd_snd, link->wr_tx_v2_pend, sizeof(pnd_snd)); |
101 | /* clear the full struct smc_wr_tx_pend including .priv */ |
102 | memset(link->wr_tx_v2_pend, 0, |
103 | sizeof(*link->wr_tx_v2_pend)); |
104 | memset(link->lgr->wr_tx_buf_v2, 0, |
105 | sizeof(*link->lgr->wr_tx_buf_v2)); |
106 | } else { |
107 | link->wr_tx_pends[pnd_snd_idx].wc_status = wc->status; |
108 | if (link->wr_tx_pends[pnd_snd_idx].compl_requested) |
109 | complete(&link->wr_tx_compl[pnd_snd_idx]); |
110 | memcpy(&pnd_snd, &link->wr_tx_pends[pnd_snd_idx], |
111 | sizeof(pnd_snd)); |
112 | /* clear the full struct smc_wr_tx_pend including .priv */ |
113 | memset(&link->wr_tx_pends[pnd_snd_idx], 0, |
114 | sizeof(link->wr_tx_pends[pnd_snd_idx])); |
115 | memset(&link->wr_tx_bufs[pnd_snd_idx], 0, |
116 | sizeof(link->wr_tx_bufs[pnd_snd_idx])); |
117 | if (!test_and_clear_bit(nr: pnd_snd_idx, addr: link->wr_tx_mask)) |
118 | return; |
119 | } |
120 | |
121 | if (wc->status) { |
122 | if (link->lgr->smc_version == SMC_V2) { |
123 | memset(link->wr_tx_v2_pend, 0, |
124 | sizeof(*link->wr_tx_v2_pend)); |
125 | memset(link->lgr->wr_tx_buf_v2, 0, |
126 | sizeof(*link->lgr->wr_tx_buf_v2)); |
127 | } |
128 | /* terminate link */ |
129 | smcr_link_down_cond_sched(lnk: link); |
130 | } |
131 | if (pnd_snd.handler) |
132 | pnd_snd.handler(&pnd_snd.priv, link, wc->status); |
133 | wake_up(&link->wr_tx_wait); |
134 | } |
135 | |
136 | static void smc_wr_tx_tasklet_fn(struct tasklet_struct *t) |
137 | { |
138 | struct smc_ib_device *dev = from_tasklet(dev, t, send_tasklet); |
139 | struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; |
140 | int i = 0, rc; |
141 | int polled = 0; |
142 | |
143 | again: |
144 | polled++; |
145 | do { |
146 | memset(&wc, 0, sizeof(wc)); |
147 | rc = ib_poll_cq(cq: dev->roce_cq_send, SMC_WR_MAX_POLL_CQE, wc); |
148 | if (polled == 1) { |
149 | ib_req_notify_cq(cq: dev->roce_cq_send, |
150 | flags: IB_CQ_NEXT_COMP | |
151 | IB_CQ_REPORT_MISSED_EVENTS); |
152 | } |
153 | if (!rc) |
154 | break; |
155 | for (i = 0; i < rc; i++) |
156 | smc_wr_tx_process_cqe(wc: &wc[i]); |
157 | } while (rc > 0); |
158 | if (polled == 1) |
159 | goto again; |
160 | } |
161 | |
162 | void smc_wr_tx_cq_handler(struct ib_cq *ib_cq, void *cq_context) |
163 | { |
164 | struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; |
165 | |
166 | tasklet_schedule(t: &dev->send_tasklet); |
167 | } |
168 | |
169 | /*---------------------------- request submission ---------------------------*/ |
170 | |
171 | static inline int smc_wr_tx_get_free_slot_index(struct smc_link *link, u32 *idx) |
172 | { |
173 | *idx = link->wr_tx_cnt; |
174 | if (!smc_link_sendable(lnk: link)) |
175 | return -ENOLINK; |
176 | for_each_clear_bit(*idx, link->wr_tx_mask, link->wr_tx_cnt) { |
177 | if (!test_and_set_bit(nr: *idx, addr: link->wr_tx_mask)) |
178 | return 0; |
179 | } |
180 | *idx = link->wr_tx_cnt; |
181 | return -EBUSY; |
182 | } |
183 | |
184 | /** |
185 | * smc_wr_tx_get_free_slot() - returns buffer for message assembly, |
186 | * and sets info for pending transmit tracking |
187 | * @link: Pointer to smc_link used to later send the message. |
188 | * @handler: Send completion handler function pointer. |
189 | * @wr_buf: Out value returns pointer to message buffer. |
190 | * @wr_rdma_buf: Out value returns pointer to rdma work request. |
191 | * @wr_pend_priv: Out value returns pointer serving as handler context. |
192 | * |
193 | * Return: 0 on success, or -errno on error. |
194 | */ |
195 | int smc_wr_tx_get_free_slot(struct smc_link *link, |
196 | smc_wr_tx_handler handler, |
197 | struct smc_wr_buf **wr_buf, |
198 | struct smc_rdma_wr **wr_rdma_buf, |
199 | struct smc_wr_tx_pend_priv **wr_pend_priv) |
200 | { |
201 | struct smc_link_group *lgr = smc_get_lgr(link); |
202 | struct smc_wr_tx_pend *wr_pend; |
203 | u32 idx = link->wr_tx_cnt; |
204 | struct ib_send_wr *wr_ib; |
205 | u64 wr_id; |
206 | int rc; |
207 | |
208 | *wr_buf = NULL; |
209 | *wr_pend_priv = NULL; |
210 | if (in_softirq() || lgr->terminating) { |
211 | rc = smc_wr_tx_get_free_slot_index(link, idx: &idx); |
212 | if (rc) |
213 | return rc; |
214 | } else { |
215 | rc = wait_event_interruptible_timeout( |
216 | link->wr_tx_wait, |
217 | !smc_link_sendable(link) || |
218 | lgr->terminating || |
219 | (smc_wr_tx_get_free_slot_index(link, &idx) != -EBUSY), |
220 | SMC_WR_TX_WAIT_FREE_SLOT_TIME); |
221 | if (!rc) { |
222 | /* timeout - terminate link */ |
223 | smcr_link_down_cond_sched(lnk: link); |
224 | return -EPIPE; |
225 | } |
226 | if (idx == link->wr_tx_cnt) |
227 | return -EPIPE; |
228 | } |
229 | wr_id = smc_wr_tx_get_next_wr_id(link); |
230 | wr_pend = &link->wr_tx_pends[idx]; |
231 | wr_pend->wr_id = wr_id; |
232 | wr_pend->handler = handler; |
233 | wr_pend->link = link; |
234 | wr_pend->idx = idx; |
235 | wr_ib = &link->wr_tx_ibs[idx]; |
236 | wr_ib->wr_id = wr_id; |
237 | *wr_buf = &link->wr_tx_bufs[idx]; |
238 | if (wr_rdma_buf) |
239 | *wr_rdma_buf = &link->wr_tx_rdmas[idx]; |
240 | *wr_pend_priv = &wr_pend->priv; |
241 | return 0; |
242 | } |
243 | |
244 | int smc_wr_tx_get_v2_slot(struct smc_link *link, |
245 | smc_wr_tx_handler handler, |
246 | struct smc_wr_v2_buf **wr_buf, |
247 | struct smc_wr_tx_pend_priv **wr_pend_priv) |
248 | { |
249 | struct smc_wr_tx_pend *wr_pend; |
250 | struct ib_send_wr *wr_ib; |
251 | u64 wr_id; |
252 | |
253 | if (link->wr_tx_v2_pend->idx == link->wr_tx_cnt) |
254 | return -EBUSY; |
255 | |
256 | *wr_buf = NULL; |
257 | *wr_pend_priv = NULL; |
258 | wr_id = smc_wr_tx_get_next_wr_id(link); |
259 | wr_pend = link->wr_tx_v2_pend; |
260 | wr_pend->wr_id = wr_id; |
261 | wr_pend->handler = handler; |
262 | wr_pend->link = link; |
263 | wr_pend->idx = link->wr_tx_cnt; |
264 | wr_ib = link->wr_tx_v2_ib; |
265 | wr_ib->wr_id = wr_id; |
266 | *wr_buf = link->lgr->wr_tx_buf_v2; |
267 | *wr_pend_priv = &wr_pend->priv; |
268 | return 0; |
269 | } |
270 | |
271 | int smc_wr_tx_put_slot(struct smc_link *link, |
272 | struct smc_wr_tx_pend_priv *wr_pend_priv) |
273 | { |
274 | struct smc_wr_tx_pend *pend; |
275 | |
276 | pend = container_of(wr_pend_priv, struct smc_wr_tx_pend, priv); |
277 | if (pend->idx < link->wr_tx_cnt) { |
278 | u32 idx = pend->idx; |
279 | |
280 | /* clear the full struct smc_wr_tx_pend including .priv */ |
281 | memset(&link->wr_tx_pends[idx], 0, |
282 | sizeof(link->wr_tx_pends[idx])); |
283 | memset(&link->wr_tx_bufs[idx], 0, |
284 | sizeof(link->wr_tx_bufs[idx])); |
285 | test_and_clear_bit(nr: idx, addr: link->wr_tx_mask); |
286 | wake_up(&link->wr_tx_wait); |
287 | return 1; |
288 | } else if (link->lgr->smc_version == SMC_V2 && |
289 | pend->idx == link->wr_tx_cnt) { |
290 | /* Large v2 buffer */ |
291 | memset(&link->wr_tx_v2_pend, 0, |
292 | sizeof(link->wr_tx_v2_pend)); |
293 | memset(&link->lgr->wr_tx_buf_v2, 0, |
294 | sizeof(link->lgr->wr_tx_buf_v2)); |
295 | return 1; |
296 | } |
297 | |
298 | return 0; |
299 | } |
300 | |
301 | /* Send prepared WR slot via ib_post_send. |
302 | * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer |
303 | */ |
304 | int smc_wr_tx_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv) |
305 | { |
306 | struct smc_wr_tx_pend *pend; |
307 | int rc; |
308 | |
309 | ib_req_notify_cq(cq: link->smcibdev->roce_cq_send, |
310 | flags: IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); |
311 | pend = container_of(priv, struct smc_wr_tx_pend, priv); |
312 | rc = ib_post_send(qp: link->roce_qp, send_wr: &link->wr_tx_ibs[pend->idx], NULL); |
313 | if (rc) { |
314 | smc_wr_tx_put_slot(link, wr_pend_priv: priv); |
315 | smcr_link_down_cond_sched(lnk: link); |
316 | } |
317 | return rc; |
318 | } |
319 | |
320 | int smc_wr_tx_v2_send(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, |
321 | int len) |
322 | { |
323 | int rc; |
324 | |
325 | link->wr_tx_v2_ib->sg_list[0].length = len; |
326 | ib_req_notify_cq(cq: link->smcibdev->roce_cq_send, |
327 | flags: IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); |
328 | rc = ib_post_send(qp: link->roce_qp, send_wr: link->wr_tx_v2_ib, NULL); |
329 | if (rc) { |
330 | smc_wr_tx_put_slot(link, wr_pend_priv: priv); |
331 | smcr_link_down_cond_sched(lnk: link); |
332 | } |
333 | return rc; |
334 | } |
335 | |
336 | /* Send prepared WR slot via ib_post_send and wait for send completion |
337 | * notification. |
338 | * @priv: pointer to smc_wr_tx_pend_priv identifying prepared message buffer |
339 | */ |
340 | int smc_wr_tx_send_wait(struct smc_link *link, struct smc_wr_tx_pend_priv *priv, |
341 | unsigned long timeout) |
342 | { |
343 | struct smc_wr_tx_pend *pend; |
344 | u32 pnd_idx; |
345 | int rc; |
346 | |
347 | pend = container_of(priv, struct smc_wr_tx_pend, priv); |
348 | pend->compl_requested = 1; |
349 | pnd_idx = pend->idx; |
350 | init_completion(x: &link->wr_tx_compl[pnd_idx]); |
351 | |
352 | rc = smc_wr_tx_send(link, priv); |
353 | if (rc) |
354 | return rc; |
355 | /* wait for completion by smc_wr_tx_process_cqe() */ |
356 | rc = wait_for_completion_interruptible_timeout( |
357 | x: &link->wr_tx_compl[pnd_idx], timeout); |
358 | if (rc <= 0) |
359 | rc = -ENODATA; |
360 | if (rc > 0) |
361 | rc = 0; |
362 | return rc; |
363 | } |
364 | |
365 | /* Register a memory region and wait for result. */ |
366 | int smc_wr_reg_send(struct smc_link *link, struct ib_mr *mr) |
367 | { |
368 | int rc; |
369 | |
370 | ib_req_notify_cq(cq: link->smcibdev->roce_cq_send, |
371 | flags: IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); |
372 | link->wr_reg_state = POSTED; |
373 | link->wr_reg.wr.wr_id = (u64)(uintptr_t)mr; |
374 | link->wr_reg.mr = mr; |
375 | link->wr_reg.key = mr->rkey; |
376 | rc = ib_post_send(qp: link->roce_qp, send_wr: &link->wr_reg.wr, NULL); |
377 | if (rc) |
378 | return rc; |
379 | |
380 | percpu_ref_get(ref: &link->wr_reg_refs); |
381 | rc = wait_event_interruptible_timeout(link->wr_reg_wait, |
382 | (link->wr_reg_state != POSTED), |
383 | SMC_WR_REG_MR_WAIT_TIME); |
384 | percpu_ref_put(ref: &link->wr_reg_refs); |
385 | if (!rc) { |
386 | /* timeout - terminate link */ |
387 | smcr_link_down_cond_sched(lnk: link); |
388 | return -EPIPE; |
389 | } |
390 | if (rc == -ERESTARTSYS) |
391 | return -EINTR; |
392 | switch (link->wr_reg_state) { |
393 | case CONFIRMED: |
394 | rc = 0; |
395 | break; |
396 | case FAILED: |
397 | rc = -EIO; |
398 | break; |
399 | case POSTED: |
400 | rc = -EPIPE; |
401 | break; |
402 | } |
403 | return rc; |
404 | } |
405 | |
406 | /****************************** receive queue ********************************/ |
407 | |
408 | int smc_wr_rx_register_handler(struct smc_wr_rx_handler *handler) |
409 | { |
410 | struct smc_wr_rx_handler *h_iter; |
411 | int rc = 0; |
412 | |
413 | spin_lock(lock: &smc_wr_rx_hash_lock); |
414 | hash_for_each_possible(smc_wr_rx_hash, h_iter, list, handler->type) { |
415 | if (h_iter->type == handler->type) { |
416 | rc = -EEXIST; |
417 | goto out_unlock; |
418 | } |
419 | } |
420 | hash_add(smc_wr_rx_hash, &handler->list, handler->type); |
421 | out_unlock: |
422 | spin_unlock(lock: &smc_wr_rx_hash_lock); |
423 | return rc; |
424 | } |
425 | |
426 | /* Demultiplex a received work request based on the message type to its handler. |
427 | * Relies on smc_wr_rx_hash having been completely filled before any IB WRs, |
428 | * and not being modified any more afterwards so we don't need to lock it. |
429 | */ |
430 | static inline void smc_wr_rx_demultiplex(struct ib_wc *wc) |
431 | { |
432 | struct smc_link *link = (struct smc_link *)wc->qp->qp_context; |
433 | struct smc_wr_rx_handler *handler; |
434 | struct smc_wr_rx_hdr *wr_rx; |
435 | u64 temp_wr_id; |
436 | u32 index; |
437 | |
438 | if (wc->byte_len < sizeof(*wr_rx)) |
439 | return; /* short message */ |
440 | temp_wr_id = wc->wr_id; |
441 | index = do_div(temp_wr_id, link->wr_rx_cnt); |
442 | wr_rx = (struct smc_wr_rx_hdr *)&link->wr_rx_bufs[index]; |
443 | hash_for_each_possible(smc_wr_rx_hash, handler, list, wr_rx->type) { |
444 | if (handler->type == wr_rx->type) |
445 | handler->handler(wc, wr_rx); |
446 | } |
447 | } |
448 | |
449 | static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) |
450 | { |
451 | struct smc_link *link; |
452 | int i; |
453 | |
454 | for (i = 0; i < num; i++) { |
455 | link = wc[i].qp->qp_context; |
456 | link->wr_rx_id_compl = wc[i].wr_id; |
457 | if (wc[i].status == IB_WC_SUCCESS) { |
458 | link->wr_rx_tstamp = jiffies; |
459 | smc_wr_rx_demultiplex(wc: &wc[i]); |
460 | smc_wr_rx_post(link); /* refill WR RX */ |
461 | } else { |
462 | /* handle status errors */ |
463 | switch (wc[i].status) { |
464 | case IB_WC_RETRY_EXC_ERR: |
465 | case IB_WC_RNR_RETRY_EXC_ERR: |
466 | case IB_WC_WR_FLUSH_ERR: |
467 | smcr_link_down_cond_sched(lnk: link); |
468 | if (link->wr_rx_id_compl == link->wr_rx_id) |
469 | wake_up(&link->wr_rx_empty_wait); |
470 | break; |
471 | default: |
472 | smc_wr_rx_post(link); /* refill WR RX */ |
473 | break; |
474 | } |
475 | } |
476 | } |
477 | } |
478 | |
479 | static void smc_wr_rx_tasklet_fn(struct tasklet_struct *t) |
480 | { |
481 | struct smc_ib_device *dev = from_tasklet(dev, t, recv_tasklet); |
482 | struct ib_wc wc[SMC_WR_MAX_POLL_CQE]; |
483 | int polled = 0; |
484 | int rc; |
485 | |
486 | again: |
487 | polled++; |
488 | do { |
489 | memset(&wc, 0, sizeof(wc)); |
490 | rc = ib_poll_cq(cq: dev->roce_cq_recv, SMC_WR_MAX_POLL_CQE, wc); |
491 | if (polled == 1) { |
492 | ib_req_notify_cq(cq: dev->roce_cq_recv, |
493 | flags: IB_CQ_SOLICITED_MASK |
494 | | IB_CQ_REPORT_MISSED_EVENTS); |
495 | } |
496 | if (!rc) |
497 | break; |
498 | smc_wr_rx_process_cqes(wc: &wc[0], num: rc); |
499 | } while (rc > 0); |
500 | if (polled == 1) |
501 | goto again; |
502 | } |
503 | |
504 | void smc_wr_rx_cq_handler(struct ib_cq *ib_cq, void *cq_context) |
505 | { |
506 | struct smc_ib_device *dev = (struct smc_ib_device *)cq_context; |
507 | |
508 | tasklet_schedule(t: &dev->recv_tasklet); |
509 | } |
510 | |
511 | int smc_wr_rx_post_init(struct smc_link *link) |
512 | { |
513 | u32 i; |
514 | int rc = 0; |
515 | |
516 | for (i = 0; i < link->wr_rx_cnt; i++) |
517 | rc = smc_wr_rx_post(link); |
518 | return rc; |
519 | } |
520 | |
521 | /***************************** init, exit, misc ******************************/ |
522 | |
523 | void smc_wr_remember_qp_attr(struct smc_link *lnk) |
524 | { |
525 | struct ib_qp_attr *attr = &lnk->qp_attr; |
526 | struct ib_qp_init_attr init_attr; |
527 | |
528 | memset(attr, 0, sizeof(*attr)); |
529 | memset(&init_attr, 0, sizeof(init_attr)); |
530 | ib_query_qp(qp: lnk->roce_qp, qp_attr: attr, |
531 | qp_attr_mask: IB_QP_STATE | |
532 | IB_QP_CUR_STATE | |
533 | IB_QP_PKEY_INDEX | |
534 | IB_QP_PORT | |
535 | IB_QP_QKEY | |
536 | IB_QP_AV | |
537 | IB_QP_PATH_MTU | |
538 | IB_QP_TIMEOUT | |
539 | IB_QP_RETRY_CNT | |
540 | IB_QP_RNR_RETRY | |
541 | IB_QP_RQ_PSN | |
542 | IB_QP_ALT_PATH | |
543 | IB_QP_MIN_RNR_TIMER | |
544 | IB_QP_SQ_PSN | |
545 | IB_QP_PATH_MIG_STATE | |
546 | IB_QP_CAP | |
547 | IB_QP_DEST_QPN, |
548 | qp_init_attr: &init_attr); |
549 | |
550 | lnk->wr_tx_cnt = min_t(size_t, SMC_WR_BUF_CNT, |
551 | lnk->qp_attr.cap.max_send_wr); |
552 | lnk->wr_rx_cnt = min_t(size_t, SMC_WR_BUF_CNT * 3, |
553 | lnk->qp_attr.cap.max_recv_wr); |
554 | } |
555 | |
556 | static void smc_wr_init_sge(struct smc_link *lnk) |
557 | { |
558 | int sges_per_buf = (lnk->lgr->smc_version == SMC_V2) ? 2 : 1; |
559 | bool send_inline = (lnk->qp_attr.cap.max_inline_data > SMC_WR_TX_SIZE); |
560 | u32 i; |
561 | |
562 | for (i = 0; i < lnk->wr_tx_cnt; i++) { |
563 | lnk->wr_tx_sges[i].addr = send_inline ? (uintptr_t)(&lnk->wr_tx_bufs[i]) : |
564 | lnk->wr_tx_dma_addr + i * SMC_WR_BUF_SIZE; |
565 | lnk->wr_tx_sges[i].length = SMC_WR_TX_SIZE; |
566 | lnk->wr_tx_sges[i].lkey = lnk->roce_pd->local_dma_lkey; |
567 | lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[0].lkey = |
568 | lnk->roce_pd->local_dma_lkey; |
569 | lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge[1].lkey = |
570 | lnk->roce_pd->local_dma_lkey; |
571 | lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[0].lkey = |
572 | lnk->roce_pd->local_dma_lkey; |
573 | lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge[1].lkey = |
574 | lnk->roce_pd->local_dma_lkey; |
575 | lnk->wr_tx_ibs[i].next = NULL; |
576 | lnk->wr_tx_ibs[i].sg_list = &lnk->wr_tx_sges[i]; |
577 | lnk->wr_tx_ibs[i].num_sge = 1; |
578 | lnk->wr_tx_ibs[i].opcode = IB_WR_SEND; |
579 | lnk->wr_tx_ibs[i].send_flags = |
580 | IB_SEND_SIGNALED | IB_SEND_SOLICITED; |
581 | if (send_inline) |
582 | lnk->wr_tx_ibs[i].send_flags |= IB_SEND_INLINE; |
583 | lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.opcode = IB_WR_RDMA_WRITE; |
584 | lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.opcode = IB_WR_RDMA_WRITE; |
585 | lnk->wr_tx_rdmas[i].wr_tx_rdma[0].wr.sg_list = |
586 | lnk->wr_tx_rdma_sges[i].tx_rdma_sge[0].wr_tx_rdma_sge; |
587 | lnk->wr_tx_rdmas[i].wr_tx_rdma[1].wr.sg_list = |
588 | lnk->wr_tx_rdma_sges[i].tx_rdma_sge[1].wr_tx_rdma_sge; |
589 | } |
590 | |
591 | if (lnk->lgr->smc_version == SMC_V2) { |
592 | lnk->wr_tx_v2_sge->addr = lnk->wr_tx_v2_dma_addr; |
593 | lnk->wr_tx_v2_sge->length = SMC_WR_BUF_V2_SIZE; |
594 | lnk->wr_tx_v2_sge->lkey = lnk->roce_pd->local_dma_lkey; |
595 | |
596 | lnk->wr_tx_v2_ib->next = NULL; |
597 | lnk->wr_tx_v2_ib->sg_list = lnk->wr_tx_v2_sge; |
598 | lnk->wr_tx_v2_ib->num_sge = 1; |
599 | lnk->wr_tx_v2_ib->opcode = IB_WR_SEND; |
600 | lnk->wr_tx_v2_ib->send_flags = |
601 | IB_SEND_SIGNALED | IB_SEND_SOLICITED; |
602 | } |
603 | |
604 | /* With SMC-Rv2 there can be messages larger than SMC_WR_TX_SIZE. |
605 | * Each ib_recv_wr gets 2 sges, the second one is a spillover buffer |
606 | * and the same buffer for all sges. When a larger message arrived then |
607 | * the content of the first small sge is copied to the beginning of |
608 | * the larger spillover buffer, allowing easy data mapping. |
609 | */ |
610 | for (i = 0; i < lnk->wr_rx_cnt; i++) { |
611 | int x = i * sges_per_buf; |
612 | |
613 | lnk->wr_rx_sges[x].addr = |
614 | lnk->wr_rx_dma_addr + i * SMC_WR_BUF_SIZE; |
615 | lnk->wr_rx_sges[x].length = SMC_WR_TX_SIZE; |
616 | lnk->wr_rx_sges[x].lkey = lnk->roce_pd->local_dma_lkey; |
617 | if (lnk->lgr->smc_version == SMC_V2) { |
618 | lnk->wr_rx_sges[x + 1].addr = |
619 | lnk->wr_rx_v2_dma_addr + SMC_WR_TX_SIZE; |
620 | lnk->wr_rx_sges[x + 1].length = |
621 | SMC_WR_BUF_V2_SIZE - SMC_WR_TX_SIZE; |
622 | lnk->wr_rx_sges[x + 1].lkey = |
623 | lnk->roce_pd->local_dma_lkey; |
624 | } |
625 | lnk->wr_rx_ibs[i].next = NULL; |
626 | lnk->wr_rx_ibs[i].sg_list = &lnk->wr_rx_sges[x]; |
627 | lnk->wr_rx_ibs[i].num_sge = sges_per_buf; |
628 | } |
629 | lnk->wr_reg.wr.next = NULL; |
630 | lnk->wr_reg.wr.num_sge = 0; |
631 | lnk->wr_reg.wr.send_flags = IB_SEND_SIGNALED; |
632 | lnk->wr_reg.wr.opcode = IB_WR_REG_MR; |
633 | lnk->wr_reg.access = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_WRITE; |
634 | } |
635 | |
636 | void smc_wr_free_link(struct smc_link *lnk) |
637 | { |
638 | struct ib_device *ibdev; |
639 | |
640 | if (!lnk->smcibdev) |
641 | return; |
642 | ibdev = lnk->smcibdev->ibdev; |
643 | |
644 | smc_wr_drain_cq(lnk); |
645 | smc_wr_wakeup_reg_wait(lnk); |
646 | smc_wr_wakeup_tx_wait(lnk); |
647 | |
648 | smc_wr_tx_wait_no_pending_sends(link: lnk); |
649 | percpu_ref_kill(ref: &lnk->wr_reg_refs); |
650 | wait_for_completion(&lnk->reg_ref_comp); |
651 | percpu_ref_kill(ref: &lnk->wr_tx_refs); |
652 | wait_for_completion(&lnk->tx_ref_comp); |
653 | |
654 | if (lnk->wr_rx_dma_addr) { |
655 | ib_dma_unmap_single(dev: ibdev, addr: lnk->wr_rx_dma_addr, |
656 | SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, |
657 | direction: DMA_FROM_DEVICE); |
658 | lnk->wr_rx_dma_addr = 0; |
659 | } |
660 | if (lnk->wr_rx_v2_dma_addr) { |
661 | ib_dma_unmap_single(dev: ibdev, addr: lnk->wr_rx_v2_dma_addr, |
662 | SMC_WR_BUF_V2_SIZE, |
663 | direction: DMA_FROM_DEVICE); |
664 | lnk->wr_rx_v2_dma_addr = 0; |
665 | } |
666 | if (lnk->wr_tx_dma_addr) { |
667 | ib_dma_unmap_single(dev: ibdev, addr: lnk->wr_tx_dma_addr, |
668 | SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, |
669 | direction: DMA_TO_DEVICE); |
670 | lnk->wr_tx_dma_addr = 0; |
671 | } |
672 | if (lnk->wr_tx_v2_dma_addr) { |
673 | ib_dma_unmap_single(dev: ibdev, addr: lnk->wr_tx_v2_dma_addr, |
674 | SMC_WR_BUF_V2_SIZE, |
675 | direction: DMA_TO_DEVICE); |
676 | lnk->wr_tx_v2_dma_addr = 0; |
677 | } |
678 | } |
679 | |
680 | void smc_wr_free_lgr_mem(struct smc_link_group *lgr) |
681 | { |
682 | if (lgr->smc_version < SMC_V2) |
683 | return; |
684 | |
685 | kfree(objp: lgr->wr_rx_buf_v2); |
686 | lgr->wr_rx_buf_v2 = NULL; |
687 | kfree(objp: lgr->wr_tx_buf_v2); |
688 | lgr->wr_tx_buf_v2 = NULL; |
689 | } |
690 | |
691 | void smc_wr_free_link_mem(struct smc_link *lnk) |
692 | { |
693 | kfree(objp: lnk->wr_tx_v2_ib); |
694 | lnk->wr_tx_v2_ib = NULL; |
695 | kfree(objp: lnk->wr_tx_v2_sge); |
696 | lnk->wr_tx_v2_sge = NULL; |
697 | kfree(objp: lnk->wr_tx_v2_pend); |
698 | lnk->wr_tx_v2_pend = NULL; |
699 | kfree(objp: lnk->wr_tx_compl); |
700 | lnk->wr_tx_compl = NULL; |
701 | kfree(objp: lnk->wr_tx_pends); |
702 | lnk->wr_tx_pends = NULL; |
703 | bitmap_free(bitmap: lnk->wr_tx_mask); |
704 | lnk->wr_tx_mask = NULL; |
705 | kfree(objp: lnk->wr_tx_sges); |
706 | lnk->wr_tx_sges = NULL; |
707 | kfree(objp: lnk->wr_tx_rdma_sges); |
708 | lnk->wr_tx_rdma_sges = NULL; |
709 | kfree(objp: lnk->wr_rx_sges); |
710 | lnk->wr_rx_sges = NULL; |
711 | kfree(objp: lnk->wr_tx_rdmas); |
712 | lnk->wr_tx_rdmas = NULL; |
713 | kfree(objp: lnk->wr_rx_ibs); |
714 | lnk->wr_rx_ibs = NULL; |
715 | kfree(objp: lnk->wr_tx_ibs); |
716 | lnk->wr_tx_ibs = NULL; |
717 | kfree(objp: lnk->wr_tx_bufs); |
718 | lnk->wr_tx_bufs = NULL; |
719 | kfree(objp: lnk->wr_rx_bufs); |
720 | lnk->wr_rx_bufs = NULL; |
721 | } |
722 | |
723 | int smc_wr_alloc_lgr_mem(struct smc_link_group *lgr) |
724 | { |
725 | if (lgr->smc_version < SMC_V2) |
726 | return 0; |
727 | |
728 | lgr->wr_rx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); |
729 | if (!lgr->wr_rx_buf_v2) |
730 | return -ENOMEM; |
731 | lgr->wr_tx_buf_v2 = kzalloc(SMC_WR_BUF_V2_SIZE, GFP_KERNEL); |
732 | if (!lgr->wr_tx_buf_v2) { |
733 | kfree(objp: lgr->wr_rx_buf_v2); |
734 | return -ENOMEM; |
735 | } |
736 | return 0; |
737 | } |
738 | |
739 | int smc_wr_alloc_link_mem(struct smc_link *link) |
740 | { |
741 | int sges_per_buf = link->lgr->smc_version == SMC_V2 ? 2 : 1; |
742 | |
743 | /* allocate link related memory */ |
744 | link->wr_tx_bufs = kcalloc(SMC_WR_BUF_CNT, SMC_WR_BUF_SIZE, GFP_KERNEL); |
745 | if (!link->wr_tx_bufs) |
746 | goto no_mem; |
747 | link->wr_rx_bufs = kcalloc(SMC_WR_BUF_CNT * 3, SMC_WR_BUF_SIZE, |
748 | GFP_KERNEL); |
749 | if (!link->wr_rx_bufs) |
750 | goto no_mem_wr_tx_bufs; |
751 | link->wr_tx_ibs = kcalloc(SMC_WR_BUF_CNT, size: sizeof(link->wr_tx_ibs[0]), |
752 | GFP_KERNEL); |
753 | if (!link->wr_tx_ibs) |
754 | goto no_mem_wr_rx_bufs; |
755 | link->wr_rx_ibs = kcalloc(SMC_WR_BUF_CNT * 3, |
756 | size: sizeof(link->wr_rx_ibs[0]), |
757 | GFP_KERNEL); |
758 | if (!link->wr_rx_ibs) |
759 | goto no_mem_wr_tx_ibs; |
760 | link->wr_tx_rdmas = kcalloc(SMC_WR_BUF_CNT, |
761 | size: sizeof(link->wr_tx_rdmas[0]), |
762 | GFP_KERNEL); |
763 | if (!link->wr_tx_rdmas) |
764 | goto no_mem_wr_rx_ibs; |
765 | link->wr_tx_rdma_sges = kcalloc(SMC_WR_BUF_CNT, |
766 | size: sizeof(link->wr_tx_rdma_sges[0]), |
767 | GFP_KERNEL); |
768 | if (!link->wr_tx_rdma_sges) |
769 | goto no_mem_wr_tx_rdmas; |
770 | link->wr_tx_sges = kcalloc(SMC_WR_BUF_CNT, size: sizeof(link->wr_tx_sges[0]), |
771 | GFP_KERNEL); |
772 | if (!link->wr_tx_sges) |
773 | goto no_mem_wr_tx_rdma_sges; |
774 | link->wr_rx_sges = kcalloc(SMC_WR_BUF_CNT * 3, |
775 | size: sizeof(link->wr_rx_sges[0]) * sges_per_buf, |
776 | GFP_KERNEL); |
777 | if (!link->wr_rx_sges) |
778 | goto no_mem_wr_tx_sges; |
779 | link->wr_tx_mask = bitmap_zalloc(SMC_WR_BUF_CNT, GFP_KERNEL); |
780 | if (!link->wr_tx_mask) |
781 | goto no_mem_wr_rx_sges; |
782 | link->wr_tx_pends = kcalloc(SMC_WR_BUF_CNT, |
783 | size: sizeof(link->wr_tx_pends[0]), |
784 | GFP_KERNEL); |
785 | if (!link->wr_tx_pends) |
786 | goto no_mem_wr_tx_mask; |
787 | link->wr_tx_compl = kcalloc(SMC_WR_BUF_CNT, |
788 | size: sizeof(link->wr_tx_compl[0]), |
789 | GFP_KERNEL); |
790 | if (!link->wr_tx_compl) |
791 | goto no_mem_wr_tx_pends; |
792 | |
793 | if (link->lgr->smc_version == SMC_V2) { |
794 | link->wr_tx_v2_ib = kzalloc(size: sizeof(*link->wr_tx_v2_ib), |
795 | GFP_KERNEL); |
796 | if (!link->wr_tx_v2_ib) |
797 | goto no_mem_tx_compl; |
798 | link->wr_tx_v2_sge = kzalloc(size: sizeof(*link->wr_tx_v2_sge), |
799 | GFP_KERNEL); |
800 | if (!link->wr_tx_v2_sge) |
801 | goto no_mem_v2_ib; |
802 | link->wr_tx_v2_pend = kzalloc(size: sizeof(*link->wr_tx_v2_pend), |
803 | GFP_KERNEL); |
804 | if (!link->wr_tx_v2_pend) |
805 | goto no_mem_v2_sge; |
806 | } |
807 | return 0; |
808 | |
809 | no_mem_v2_sge: |
810 | kfree(objp: link->wr_tx_v2_sge); |
811 | no_mem_v2_ib: |
812 | kfree(objp: link->wr_tx_v2_ib); |
813 | no_mem_tx_compl: |
814 | kfree(objp: link->wr_tx_compl); |
815 | no_mem_wr_tx_pends: |
816 | kfree(objp: link->wr_tx_pends); |
817 | no_mem_wr_tx_mask: |
818 | kfree(objp: link->wr_tx_mask); |
819 | no_mem_wr_rx_sges: |
820 | kfree(objp: link->wr_rx_sges); |
821 | no_mem_wr_tx_sges: |
822 | kfree(objp: link->wr_tx_sges); |
823 | no_mem_wr_tx_rdma_sges: |
824 | kfree(objp: link->wr_tx_rdma_sges); |
825 | no_mem_wr_tx_rdmas: |
826 | kfree(objp: link->wr_tx_rdmas); |
827 | no_mem_wr_rx_ibs: |
828 | kfree(objp: link->wr_rx_ibs); |
829 | no_mem_wr_tx_ibs: |
830 | kfree(objp: link->wr_tx_ibs); |
831 | no_mem_wr_rx_bufs: |
832 | kfree(objp: link->wr_rx_bufs); |
833 | no_mem_wr_tx_bufs: |
834 | kfree(objp: link->wr_tx_bufs); |
835 | no_mem: |
836 | return -ENOMEM; |
837 | } |
838 | |
839 | void smc_wr_remove_dev(struct smc_ib_device *smcibdev) |
840 | { |
841 | tasklet_kill(t: &smcibdev->recv_tasklet); |
842 | tasklet_kill(t: &smcibdev->send_tasklet); |
843 | } |
844 | |
845 | void smc_wr_add_dev(struct smc_ib_device *smcibdev) |
846 | { |
847 | tasklet_setup(t: &smcibdev->recv_tasklet, callback: smc_wr_rx_tasklet_fn); |
848 | tasklet_setup(t: &smcibdev->send_tasklet, callback: smc_wr_tx_tasklet_fn); |
849 | } |
850 | |
851 | static void smcr_wr_tx_refs_free(struct percpu_ref *ref) |
852 | { |
853 | struct smc_link *lnk = container_of(ref, struct smc_link, wr_tx_refs); |
854 | |
855 | complete(&lnk->tx_ref_comp); |
856 | } |
857 | |
858 | static void smcr_wr_reg_refs_free(struct percpu_ref *ref) |
859 | { |
860 | struct smc_link *lnk = container_of(ref, struct smc_link, wr_reg_refs); |
861 | |
862 | complete(&lnk->reg_ref_comp); |
863 | } |
864 | |
865 | int smc_wr_create_link(struct smc_link *lnk) |
866 | { |
867 | struct ib_device *ibdev = lnk->smcibdev->ibdev; |
868 | int rc = 0; |
869 | |
870 | smc_wr_tx_set_wr_id(wr_tx_id: &lnk->wr_tx_id, val: 0); |
871 | lnk->wr_rx_id = 0; |
872 | lnk->wr_rx_dma_addr = ib_dma_map_single( |
873 | dev: ibdev, cpu_addr: lnk->wr_rx_bufs, SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, |
874 | direction: DMA_FROM_DEVICE); |
875 | if (ib_dma_mapping_error(dev: ibdev, dma_addr: lnk->wr_rx_dma_addr)) { |
876 | lnk->wr_rx_dma_addr = 0; |
877 | rc = -EIO; |
878 | goto out; |
879 | } |
880 | if (lnk->lgr->smc_version == SMC_V2) { |
881 | lnk->wr_rx_v2_dma_addr = ib_dma_map_single(dev: ibdev, |
882 | cpu_addr: lnk->lgr->wr_rx_buf_v2, SMC_WR_BUF_V2_SIZE, |
883 | direction: DMA_FROM_DEVICE); |
884 | if (ib_dma_mapping_error(dev: ibdev, dma_addr: lnk->wr_rx_v2_dma_addr)) { |
885 | lnk->wr_rx_v2_dma_addr = 0; |
886 | rc = -EIO; |
887 | goto dma_unmap; |
888 | } |
889 | lnk->wr_tx_v2_dma_addr = ib_dma_map_single(dev: ibdev, |
890 | cpu_addr: lnk->lgr->wr_tx_buf_v2, SMC_WR_BUF_V2_SIZE, |
891 | direction: DMA_TO_DEVICE); |
892 | if (ib_dma_mapping_error(dev: ibdev, dma_addr: lnk->wr_tx_v2_dma_addr)) { |
893 | lnk->wr_tx_v2_dma_addr = 0; |
894 | rc = -EIO; |
895 | goto dma_unmap; |
896 | } |
897 | } |
898 | lnk->wr_tx_dma_addr = ib_dma_map_single( |
899 | dev: ibdev, cpu_addr: lnk->wr_tx_bufs, SMC_WR_BUF_SIZE * lnk->wr_tx_cnt, |
900 | direction: DMA_TO_DEVICE); |
901 | if (ib_dma_mapping_error(dev: ibdev, dma_addr: lnk->wr_tx_dma_addr)) { |
902 | rc = -EIO; |
903 | goto dma_unmap; |
904 | } |
905 | smc_wr_init_sge(lnk); |
906 | bitmap_zero(dst: lnk->wr_tx_mask, SMC_WR_BUF_CNT); |
907 | init_waitqueue_head(&lnk->wr_tx_wait); |
908 | rc = percpu_ref_init(ref: &lnk->wr_tx_refs, release: smcr_wr_tx_refs_free, flags: 0, GFP_KERNEL); |
909 | if (rc) |
910 | goto dma_unmap; |
911 | init_completion(x: &lnk->tx_ref_comp); |
912 | init_waitqueue_head(&lnk->wr_reg_wait); |
913 | rc = percpu_ref_init(ref: &lnk->wr_reg_refs, release: smcr_wr_reg_refs_free, flags: 0, GFP_KERNEL); |
914 | if (rc) |
915 | goto dma_unmap; |
916 | init_completion(x: &lnk->reg_ref_comp); |
917 | init_waitqueue_head(&lnk->wr_rx_empty_wait); |
918 | return rc; |
919 | |
920 | dma_unmap: |
921 | if (lnk->wr_rx_v2_dma_addr) { |
922 | ib_dma_unmap_single(dev: ibdev, addr: lnk->wr_rx_v2_dma_addr, |
923 | SMC_WR_BUF_V2_SIZE, |
924 | direction: DMA_FROM_DEVICE); |
925 | lnk->wr_rx_v2_dma_addr = 0; |
926 | } |
927 | if (lnk->wr_tx_v2_dma_addr) { |
928 | ib_dma_unmap_single(dev: ibdev, addr: lnk->wr_tx_v2_dma_addr, |
929 | SMC_WR_BUF_V2_SIZE, |
930 | direction: DMA_TO_DEVICE); |
931 | lnk->wr_tx_v2_dma_addr = 0; |
932 | } |
933 | ib_dma_unmap_single(dev: ibdev, addr: lnk->wr_rx_dma_addr, |
934 | SMC_WR_BUF_SIZE * lnk->wr_rx_cnt, |
935 | direction: DMA_FROM_DEVICE); |
936 | lnk->wr_rx_dma_addr = 0; |
937 | out: |
938 | return rc; |
939 | } |
940 | |